1 //Written in the D programming language
2 /*
3     Regular expression pattern parser.
4 */
5 module std.regex.internal.parser;
6 
7 import std.regex.internal.ir;
8 import std.range.primitives, std.uni, std.meta,
9     std.traits, std.typecons, std.exception;
10 static import std.ascii;
11 
12 // package relevant info from parser into a regex object
makeRegex(S,CG)13 auto makeRegex(S, CG)(Parser!(S, CG) p)
14 {
15     import std.regex.internal.backtracking : BacktrackingMatcher;
16     import std.regex.internal.thompson : ThompsonMatcher;
17     import std.algorithm.searching : canFind;
18     alias Char = BasicElementOf!S;
19     Regex!Char re;
20     auto g = p.g;
21     with(re)
22     {
23         ir = g.ir;
24         dict = g.dict;
25         ngroup = g.ngroup;
26         maxCounterDepth = g.counterDepth;
27         flags = p.re_flags;
28         charsets = g.charsets;
29         matchers = g.matchers;
30         backrefed = g.backrefed;
31         re.pattern = p.origin.idup;
32         re.postprocess();
33         // check if we have backreferences, if so - use backtracking
34         if (__ctfe) factory = null; // allows us to use the awful enum re = regex(...);
35         else if (re.backrefed.canFind!"a != 0")
36             factory =  new RuntimeFactory!(BacktrackingMatcher, Char);
37         else
38             factory = new RuntimeFactory!(ThompsonMatcher, Char);
39         debug(std_regex_parser)
40         {
41             __ctfe || print();
42         }
43         //@@@BUG@@@ (not reduced)
44         //somehow just using validate _collides_ with std.utf.validate (!)
45         version (assert) re.validateRe();
46     }
47     return re;
48 }
49 
50 // helper for unittest
51 auto makeRegex(S)(S arg)
52 if (isSomeString!S)
53 {
54     return makeRegex(Parser!(S, CodeGen)(arg, ""));
55 }
56 
57 @system unittest
58 {
59     import std.algorithm.comparison : equal;
60     auto re = makeRegex(`(?P<name>\w+) = (?P<var>\d+)`);
61     auto nc = re.namedCaptures;
62     static assert(isRandomAccessRange!(typeof(nc)));
63     assert(!nc.empty);
64     assert(nc.length == 2);
65     assert(nc.equal(["name", "var"]));
66     assert(nc[0] == "name");
67     assert(nc[1..$].equal(["var"]));
68 
69     re = makeRegex(`(\w+) (?P<named>\w+) (\w+)`);
70     nc = re.namedCaptures;
71     assert(nc.length == 1);
72     assert(nc[0] == "named");
73     assert(nc.front == "named");
74     assert(nc.back == "named");
75 
76     re = makeRegex(`(\w+) (\w+)`);
77     nc = re.namedCaptures;
78     assert(nc.empty);
79 
80     re = makeRegex(`(?P<year>\d{4})/(?P<month>\d{2})/(?P<day>\d{2})/`);
81     nc = re.namedCaptures;
82     auto cp = nc.save;
83     assert(nc.equal(cp));
84     nc.popFront();
85     assert(nc.equal(cp[1..$]));
86     nc.popBack();
87     assert(nc.equal(cp[1 .. $ - 1]));
88 }
89 
90 
reverseBytecode()91 @trusted void reverseBytecode()(Bytecode[] code)
92 {
93     Bytecode[] rev = new Bytecode[code.length];
94     uint revPc = cast(uint) rev.length;
95     Stack!(Tuple!(uint, uint, uint)) stack;
96     uint start = 0;
97     uint end = cast(uint) code.length;
98     for (;;)
99     {
100         for (uint pc = start; pc < end; )
101         {
102             immutable len = code[pc].length;
103             if (code[pc].code == IR.GotoEndOr)
104                 break; //pick next alternation branch
105             if (code[pc].isAtom)
106             {
107                 rev[revPc - len .. revPc] = code[pc .. pc + len];
108                 revPc -= len;
109                 pc += len;
110             }
111             else if (code[pc].isStart || code[pc].isEnd)
112             {
113                 //skip over other embedded lookbehinds they are reversed
114                 if (code[pc].code == IR.LookbehindStart
115                     || code[pc].code == IR.NeglookbehindStart)
116                 {
117                     immutable blockLen = len + code[pc].data
118                          + code[pc].pairedLength;
119                     rev[revPc - blockLen .. revPc] = code[pc .. pc + blockLen];
120                     pc += blockLen;
121                     revPc -= blockLen;
122                     continue;
123                 }
124                 immutable second = code[pc].indexOfPair(pc);
125                 immutable secLen = code[second].length;
126                 rev[revPc - secLen .. revPc] = code[second .. second + secLen];
127                 revPc -= secLen;
128                 if (code[pc].code == IR.OrStart)
129                 {
130                     //we pass len bytes forward, but secLen in reverse
131                     immutable revStart = revPc - (second + len - secLen - pc);
132                     uint r = revStart;
133                     uint i = pc + IRL!(IR.OrStart);
134                     while (code[i].code == IR.Option)
135                     {
136                         if (code[i - 1].code != IR.OrStart)
137                         {
138                             assert(code[i - 1].code == IR.GotoEndOr);
139                             rev[r - 1] = code[i - 1];
140                         }
141                         rev[r] = code[i];
142                         auto newStart = i + IRL!(IR.Option);
143                         auto newEnd = newStart + code[i].data;
144                         auto newRpc = r + code[i].data + IRL!(IR.Option);
145                         if (code[newEnd].code != IR.OrEnd)
146                         {
147                             newRpc--;
148                         }
149                         stack.push(tuple(newStart, newEnd, newRpc));
150                         r += code[i].data + IRL!(IR.Option);
151                         i += code[i].data + IRL!(IR.Option);
152                     }
153                     pc = i;
154                     revPc = revStart;
155                     assert(code[pc].code == IR.OrEnd);
156                 }
157                 else
158                     pc += len;
159             }
160         }
161         if (stack.empty)
162             break;
163         start = stack.top[0];
164         end = stack.top[1];
165         revPc = stack.top[2];
166         stack.pop();
167     }
168     code[] = rev[];
169 }
170 
171 struct CodeGen
172 {
173     Bytecode[] ir;                 // resulting bytecode
174     Stack!(uint) fixupStack;       // stack of opened start instructions
175     NamedGroup[] dict;             // maps name -> user group number
176     Stack!(uint) groupStack;       // stack of current number of group
177     uint nesting = 0;              // group nesting level and repetitions step
178     uint lookaroundNest = 0;       // nesting of lookaround
179     uint counterDepth = 0;         // current depth of nested counted repetitions
180     CodepointSet[] charsets;       // sets for char classes
181     const(CharMatcher)[] matchers; // matchers for char classes
182     uint[] backrefed;              // bitarray for groups refered by backref
183     uint ngroup;                   // final number of groups (of all patterns)
184 
startCodeGen185     void start(uint length)
186     {
187         if (!__ctfe)
188             ir.reserve((length*5+2)/4);
189         fixupStack.push(0);
190         groupStack.push(1);//0 - whole match
191     }
192 
193     //mark referenced groups for latter processing
markBackrefCodeGen194     void markBackref(uint n)
195     {
196         if (n/32 >= backrefed.length)
197             backrefed.length = n/32 + 1;
198         backrefed[n / 32] |= 1 << (n & 31);
199     }
200 
isOpenGroupCodeGen201     bool isOpenGroup(uint n)
202     {
203         import std.algorithm.searching : canFind;
204         // walk the fixup stack and see if there are groups labeled 'n'
205         // fixup '0' is reserved for alternations
206         return fixupStack.data[1..$].
207             canFind!(fix => ir[fix].code == IR.GroupStart && ir[fix].data == n)();
208     }
209 
putCodeGen210     void put(Bytecode code)
211     {
212         enforce(ir.length < maxCompiledLength,
213             "maximum compiled pattern length is exceeded");
214         ir ~= code;
215     }
216 
putRawCodeGen217     void putRaw(uint number)
218     {
219         enforce(ir.length < maxCompiledLength,
220             "maximum compiled pattern length is exceeded");
221         ir ~= Bytecode.fromRaw(number);
222     }
223 
224     //try to generate optimal IR code for this CodepointSet
charsetToIrCodeGen225     @trusted void charsetToIr(CodepointSet set)
226     {//@@@BUG@@@ writeln is @system
227         uint chars = cast(uint) set.length;
228         if (chars < Bytecode.maxSequence)
229         {
230             switch (chars)
231             {
232                 case 1:
233                     put(Bytecode(IR.Char, set.byCodepoint.front));
234                     break;
235                 case 0:
236                     throw new RegexException("empty CodepointSet not allowed");
237                 default:
238                     foreach (ch; set.byCodepoint)
239                         put(Bytecode(IR.OrChar, ch, chars));
240             }
241         }
242         else
243         {
244             import std.algorithm.searching : countUntil;
245             const ivals = set.byInterval;
246             immutable n = charsets.countUntil(set);
247             if (n >= 0)
248             {
249                 if (ivals.length*2 > maxCharsetUsed)
250                     put(Bytecode(IR.Trie, cast(uint) n));
251                 else
252                     put(Bytecode(IR.CodepointSet, cast(uint) n));
253                 return;
254             }
255             if (ivals.length*2 > maxCharsetUsed)
256             {
257                 auto t  = getMatcher(set);
258                 put(Bytecode(IR.Trie, cast(uint) matchers.length));
259                 matchers ~= t;
260                 debug(std_regex_allocation) writeln("Trie generated");
261             }
262             else
263             {
264                 put(Bytecode(IR.CodepointSet, cast(uint) charsets.length));
265                 matchers ~= CharMatcher.init;
266             }
267             charsets ~= set;
268             assert(charsets.length == matchers.length);
269         }
270     }
271 
genLogicGroupCodeGen272     void genLogicGroup()
273     {
274         nesting++;
275         pushFixup(length);
276         put(Bytecode(IR.Nop, 0));
277     }
278 
genGroupCodeGen279     void genGroup()
280     {
281         nesting++;
282         pushFixup(length);
283         immutable nglob = groupStack.top++;
284         enforce(groupStack.top <= maxGroupNumber, "limit on number of submatches is exceeded");
285         put(Bytecode(IR.GroupStart, nglob));
286     }
287 
genNamedGroupCodeGen288     void genNamedGroup(string name)
289     {
290         import std.array : insertInPlace;
291         import std.range : assumeSorted;
292         nesting++;
293         pushFixup(length);
294         immutable nglob = groupStack.top++;
295         enforce(groupStack.top <= maxGroupNumber, "limit on submatches is exceeded");
296         auto t = NamedGroup(name, nglob);
297         auto d = assumeSorted!"a.name < b.name"(dict);
298         immutable ind = d.lowerBound(t).length;
299         insertInPlace(dict, ind, t);
300         put(Bytecode(IR.GroupStart, nglob));
301     }
302 
303         //generate code for start of lookaround: (?= (?! (?<= (?<!
genLookaroundCodeGen304     void genLookaround(IR opcode)
305     {
306         nesting++;
307         pushFixup(length);
308         put(Bytecode(opcode, 0));
309         put(Bytecode.fromRaw(0));
310         put(Bytecode.fromRaw(0));
311         groupStack.push(0);
312         lookaroundNest++;
313         enforce(lookaroundNest <= maxLookaroundDepth,
314             "maximum lookaround depth is exceeded");
315     }
316 
endPatternCodeGen317     void endPattern(uint num)
318     {
319         import std.algorithm.comparison : max;
320         put(Bytecode(IR.End, num));
321         ngroup = max(ngroup, groupStack.top);
322         groupStack.top = 1; // reset group counter
323     }
324 
325     //fixup lookaround with start at offset fix and append a proper *-End opcode
fixLookaroundCodeGen326     void fixLookaround(uint fix)
327     {
328         lookaroundNest--;
329         ir[fix] = Bytecode(ir[fix].code,
330             cast(uint) ir.length - fix - IRL!(IR.LookaheadStart));
331         auto g = groupStack.pop();
332         assert(!groupStack.empty);
333         ir[fix+1] = Bytecode.fromRaw(groupStack.top);
334         //groups are cumulative across lookarounds
335         ir[fix+2] = Bytecode.fromRaw(groupStack.top+g);
336         groupStack.top += g;
337         if (ir[fix].code == IR.LookbehindStart || ir[fix].code == IR.NeglookbehindStart)
338         {
339             reverseBytecode(ir[fix + IRL!(IR.LookbehindStart) .. $]);
340         }
341         put(ir[fix].paired);
342     }
343 
344     // repetition of {1,1}
fixRepetitionCodeGen345     void fixRepetition(uint offset)
346     {
347         import std.algorithm.mutation : copy;
348         immutable replace = ir[offset].code == IR.Nop;
349         if (replace)
350         {
351             copy(ir[offset + 1 .. $], ir[offset .. $ - 1]);
352             ir.length -= 1;
353         }
354     }
355 
356     // repetition of {x,y}
fixRepetitionCodeGen357     void fixRepetition(uint offset, uint min, uint max, bool greedy)
358     {
359         static import std.algorithm.comparison;
360         import std.algorithm.mutation : copy;
361         import std.array : insertInPlace;
362         immutable replace = ir[offset].code == IR.Nop;
363         immutable len = cast(uint) ir.length - offset - replace;
364         if (max != infinite)
365         {
366             if (min != 1 || max != 1)
367             {
368                 Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len);
369                 if (replace)
370                     ir[offset] = op;
371                 else
372                     insertInPlace(ir, offset, op);
373                 put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len));
374                 put(Bytecode.init); //hotspot
375                 putRaw(1);
376                 putRaw(min);
377                 putRaw(max);
378                 counterDepth = std.algorithm.comparison.max(counterDepth, nesting+1);
379             }
380         }
381         else if (min) //&& max is infinite
382         {
383             if (min != 1)
384             {
385                 Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len);
386                 if (replace)
387                     ir[offset] = op;
388                 else
389                     insertInPlace(ir, offset, op);
390                 offset += 1;//so it still points to the repeated block
391                 put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len));
392                 put(Bytecode.init); //hotspot
393                 putRaw(1);
394                 putRaw(min);
395                 putRaw(min);
396                 counterDepth = std.algorithm.comparison.max(counterDepth, nesting+1);
397             }
398             else if (replace)
399             {
400                 copy(ir[offset+1 .. $], ir[offset .. $-1]);
401                 ir.length -= 1;
402             }
403             put(Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len));
404             enforce(ir.length + len < maxCompiledLength,  "maximum compiled pattern length is exceeded");
405             ir ~= ir[offset .. offset+len];
406             //IR.InfinteX is always a hotspot
407             put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len));
408             put(Bytecode.init); //merge index
409         }
410         else//vanila {0,inf}
411         {
412             Bytecode op = Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len);
413             if (replace)
414                 ir[offset] = op;
415             else
416                 insertInPlace(ir, offset, op);
417             //IR.InfinteX is always a hotspot
418             put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len));
419             put(Bytecode.init); //merge index
420         }
421     }
422 
fixAlternationCodeGen423     void fixAlternation()
424     {
425         import std.array : insertInPlace;
426         uint fix = fixupStack.top;
427         if (ir.length > fix && ir[fix].code == IR.Option)
428         {
429             ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix);
430             put(Bytecode(IR.GotoEndOr, 0));
431             fixupStack.top = cast(uint) ir.length; //replace latest fixup for Option
432             put(Bytecode(IR.Option, 0));
433             return;
434         }
435         uint len, orStart;
436         //start a new option
437         if (fixupStack.length == 1)
438         {//only root entry, effectively no fixup
439             len = cast(uint) ir.length + IRL!(IR.GotoEndOr);
440             orStart = 0;
441         }
442         else
443         {//IR.lookahead, etc. fixups that have length > 1, thus check ir[x].length
444             len = cast(uint) ir.length - fix - (ir[fix].length - 1);
445             orStart = fix + ir[fix].length;
446         }
447         insertInPlace(ir, orStart, Bytecode(IR.OrStart, 0), Bytecode(IR.Option, len));
448         assert(ir[orStart].code == IR.OrStart);
449         put(Bytecode(IR.GotoEndOr, 0));
450         fixupStack.push(orStart); //fixup for StartOR
451         fixupStack.push(cast(uint) ir.length); //for second Option
452         put(Bytecode(IR.Option, 0));
453     }
454 
455     // finalizes IR.Option, fix points to the first option of sequence
finishAlternationCodeGen456     void finishAlternation(uint fix)
457     {
458         enforce(ir[fix].code == IR.Option, "no matching ')'");
459         ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix - IRL!(IR.OrStart));
460         fix = fixupStack.pop();
461         enforce(ir[fix].code == IR.OrStart, "no matching ')'");
462         ir[fix] = Bytecode(IR.OrStart, cast(uint) ir.length - fix - IRL!(IR.OrStart));
463         put(Bytecode(IR.OrEnd, cast(uint) ir.length - fix - IRL!(IR.OrStart)));
464         uint pc = fix + IRL!(IR.OrStart);
465         while (ir[pc].code == IR.Option)
466         {
467             pc = pc + ir[pc].data;
468             if (ir[pc].code != IR.GotoEndOr)
469                 break;
470             ir[pc] = Bytecode(IR.GotoEndOr, cast(uint)(ir.length - pc - IRL!(IR.OrEnd)));
471             pc += IRL!(IR.GotoEndOr);
472         }
473         put(Bytecode.fromRaw(0));
474     }
475 
476     // returns: (flag - repetition possible?, fixup of the start of this "group")
477     Tuple!(bool, uint) onClose()
478     {
479         nesting--;
480         uint fix = popFixup();
481         switch (ir[fix].code)
482         {
483         case IR.GroupStart:
484             put(Bytecode(IR.GroupEnd, ir[fix].data));
485             return tuple(true, fix);
486         case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
487             assert(lookaroundNest);
488             fixLookaround(fix);
489             return tuple(false, 0u);
490         case IR.Option: //| xxx )
491             //two fixups: last option + full OR
492             finishAlternation(fix);
493             fix = topFixup;
494             switch (ir[fix].code)
495             {
496             case IR.GroupStart:
497                 popFixup();
498                 put(Bytecode(IR.GroupEnd, ir[fix].data));
499                 return tuple(true, fix);
500             case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
501                 assert(lookaroundNest);
502                 fix = popFixup();
503                 fixLookaround(fix);
504                 return tuple(false, 0u);
505             default://(?:xxx)
506                 popFixup();
507                 return tuple(true, fix);
508             }
509         default://(?:xxx)
510             return tuple(true, fix);
511         }
512     }
513 
popFixupCodeGen514     uint popFixup(){ return fixupStack.pop(); }
515 
pushFixupCodeGen516     void pushFixup(uint val){ return fixupStack.push(val); }
517 
topFixupCodeGen518     @property uint topFixup(){ return fixupStack.top; }
519 
fixupLengthCodeGen520     @property size_t fixupLength(){ return fixupStack.data.length; }
521 
lengthCodeGen522     @property uint length(){ return cast(uint) ir.length; }
523 }
524 
525 // safety limits
526 enum maxGroupNumber = 2^^19;
527 enum maxLookaroundDepth = 16;
528 // *Bytecode.sizeof, i.e. 1Mb of bytecode alone
529 enum maxCompiledLength = 2^^18;
530 // amounts to up to 4 Mb of auxilary table for matching
531 enum maxCumulativeRepetitionLength = 2^^20;
532 // marker to indicate infinite repetition
533 enum infinite = ~0u;
534 
535 struct Parser(R, Generator)
536 if (isForwardRange!R && is(ElementType!R : dchar))
537 {
538     dchar front;
539     bool empty;
540     R pat, origin;       //keep full pattern for pretty printing error messages
541     uint re_flags = 0;   //global flags e.g. multiline + internal ones
542     Generator g;
543 
544     @trusted this(S)(R pattern, S flags)
545         if (isSomeString!S)
546     {
547         pat = origin = pattern;
548         //reserve slightly more then avg as sampled from unittests
549         parseFlags(flags);
550         front = ' ';//a safe default for freeform parsing
551         popFront();
552         g.start(cast(uint) pat.length);
553         try
554         {
555             parseRegex();
556         }
catch(Exception e)557         catch (Exception e)
558         {
559             error(e.msg);//also adds pattern location
560         }
561         g.endPattern(1);
562     }
563 
_popFront()564     void _popFront()
565     {
566         if (pat.empty)
567         {
568             empty =  true;
569         }
570         else
571         {
572             front = pat.front;
573             pat.popFront();
574         }
575     }
576 
skipSpace()577     void skipSpace()
578     {
579         while (!empty && isWhite(front)) _popFront();
580     }
581 
popFront()582     void popFront()
583     {
584         _popFront();
585         if (re_flags & RegexOption.freeform) skipSpace();
586     }
587 
save()588     auto save(){ return this; }
589 
590     //parsing number with basic overflow check
parseDecimal()591     uint parseDecimal()
592     {
593         uint r = 0;
594         while (std.ascii.isDigit(front))
595         {
596             if (r >= (uint.max/10))
597                 error("Overflow in decimal number");
598             r = 10*r + cast(uint)(front-'0');
599             popFront();
600             if (empty) break;
601         }
602         return r;
603     }
604 
605     //
parseFlags(S)606     @trusted void parseFlags(S)(S flags)
607     {//@@@BUG@@@ text is @system
608         import std.conv : text;
609         foreach (ch; flags)//flags are ASCII anyway
610         {
611         L_FlagSwitch:
612             switch (ch)
613             {
614 
615                 foreach (i, op; __traits(allMembers, RegexOption))
616                 {
617                     case RegexOptionNames[i]:
618                             if (re_flags & mixin("RegexOption."~op))
619                                 throw new RegexException(text("redundant flag specified: ",ch));
620                             re_flags |= mixin("RegexOption."~op);
621                             break L_FlagSwitch;
622                 }
623                 default:
624                     throw new RegexException(text("unknown regex flag '",ch,"'"));
625             }
626         }
627     }
628 
629     //parse and store IR for regex pattern
parseRegex()630     @trusted void parseRegex()
631     {
632         uint fix;//fixup pointer
633 
634         while (!empty)
635         {
636             debug(std_regex_parser)
637                 __ctfe || writeln("*LR*\nSource: ", pat, "\nStack: ",fixupStack.data);
638             switch (front)
639             {
640             case '(':
641                 popFront();
642                 if (front == '?')
643                 {
644                     popFront();
645                     switch (front)
646                     {
647                     case '#':
648                         for (;;)
649                         {
650                             popFront();
651                             enforce(!empty, "Unexpected end of pattern");
652                             if (front == ')')
653                             {
654                                 popFront();
655                                 break;
656                             }
657                         }
658                         break;
659                     case ':':
660                         g.genLogicGroup();
661                         popFront();
662                         break;
663                     case '=':
664                         g.genLookaround(IR.LookaheadStart);
665                         popFront();
666                         break;
667                     case '!':
668                         g.genLookaround(IR.NeglookaheadStart);
669                         popFront();
670                         break;
671                     case 'P':
672                         popFront();
673                         enforce(front == '<', "Expected '<' in named group");
674                         string name;
675                         popFront();
676                         if (empty || !(isAlpha(front) || front == '_'))
677                             error("Expected alpha starting a named group");
678                         name ~= front;
679                         popFront();
680                         while (!empty && (isAlpha(front) ||
681                             front == '_' || std.ascii.isDigit(front)))
682                         {
683                             name ~= front;
684                             popFront();
685                         }
686                         enforce(front == '>', "Expected '>' closing named group");
687                         popFront();
688                         g.genNamedGroup(name);
689                         break;
690                     case '<':
691                         popFront();
692                         if (front == '=')
693                             g.genLookaround(IR.LookbehindStart);
694                         else if (front == '!')
695                             g.genLookaround(IR.NeglookbehindStart);
696                         else
697                             error("'!' or '=' expected after '<'");
698                         popFront();
699                         break;
700                     default:
701                         uint enableFlags, disableFlags;
702                         bool enable = true;
703                         do
704                         {
705                             switch (front)
706                             {
707                             case 's':
708                                 if (enable)
709                                     enableFlags |= RegexOption.singleline;
710                                 else
711                                     disableFlags |= RegexOption.singleline;
712                                 break;
713                             case 'x':
714                                 if (enable)
715                                     enableFlags |= RegexOption.freeform;
716                                 else
717                                     disableFlags |= RegexOption.freeform;
718                                 break;
719                             case 'i':
720                                 if (enable)
721                                     enableFlags |= RegexOption.casefold;
722                                 else
723                                     disableFlags |= RegexOption.casefold;
724                                 break;
725                             case 'm':
726                                 if (enable)
727                                     enableFlags |= RegexOption.multiline;
728                                 else
729                                     disableFlags |= RegexOption.multiline;
730                                 break;
731                             case '-':
732                                 if (!enable)
733                                     error(" unexpected second '-' in flags");
734                                 enable = false;
735                                 break;
736                             default:
737                                 error(" 's', 'x', 'i', 'm' or '-' expected after '(?' ");
738                             }
739                             popFront();
740                         }while (front != ')');
741                         popFront();
742                         re_flags |= enableFlags;
743                         re_flags &= ~disableFlags;
744                     }
745                 }
746                 else
747                 {
748                     g.genGroup();
749                 }
750                 break;
751             case ')':
752                 enforce(g.nesting, "Unmatched ')'");
753                 popFront();
754                 auto pair = g.onClose();
755                 if (pair[0])
756                     parseQuantifier(pair[1]);
757                 break;
758             case '|':
759                 popFront();
760                 g.fixAlternation();
761                 break;
762             default://no groups or whatever
763                 immutable start = g.length;
764                 parseAtom();
765                 parseQuantifier(start);
766             }
767         }
768 
769         if (g.fixupLength != 1)
770         {
771             fix = g.popFixup();
772             g.finishAlternation(fix);
773             enforce(g.fixupLength == 1, "no matching ')'");
774         }
775     }
776 
777 
778     //parse and store IR for atom-quantifier pair
parseQuantifier(uint offset)779     @trusted void parseQuantifier(uint offset)
780     {//copy is @system
781         if (empty)
782             return g.fixRepetition(offset);
783         uint min, max;
784         switch (front)
785         {
786         case '*':
787             min = 0;
788             max = infinite;
789             break;
790         case '?':
791             min = 0;
792             max = 1;
793             break;
794         case '+':
795             min = 1;
796             max = infinite;
797             break;
798         case '{':
799             popFront();
800             enforce(!empty, "Unexpected end of regex pattern");
801             enforce(std.ascii.isDigit(front), "First number required in repetition");
802             min = parseDecimal();
803             if (front == '}')
804                 max = min;
805             else if (front == ',')
806             {
807                 popFront();
808                 if (std.ascii.isDigit(front))
809                     max = parseDecimal();
810                 else if (front == '}')
811                     max = infinite;
812                 else
813                     error("Unexpected symbol in regex pattern");
814                 skipSpace();
815                 enforce(front == '}', "Unmatched '{' in regex pattern");
816             }
817             else
818                 error("Unexpected symbol in regex pattern");
819             enforce(min <= max, "Illegal {n,m} quantifier");
820             break;
821         default:
822             g.fixRepetition(offset);
823             return;
824         }
825         bool greedy = true;
826         //check only if we managed to get new symbol
827         popFront();
828         if (!empty && front == '?')
829         {
830             greedy = false;
831             popFront();
832         }
833         g.fixRepetition(offset, min, max, greedy);
834     }
835 
836     //parse and store IR for atom
parseAtom()837     void parseAtom()
838     {
839         if (empty)
840             return;
841         switch (front)
842         {
843         case '*', '?', '+', '|', '{', '}':
844             error("'*', '+', '?', '{', '}' not allowed in atom");
845         case '.':
846             if (re_flags & RegexOption.singleline)
847                 g.put(Bytecode(IR.Any, 0));
848             else
849             {
850                 CodepointSet set;
851                 g.charsetToIr(set.add('\n','\n'+1).add('\r', '\r'+1).inverted);
852             }
853             popFront();
854             break;
855         case '[':
856             parseCharset();
857             break;
858         case '\\':
859             _popFront();
860             enforce(!empty, "Unfinished escape sequence");
861             parseEscape();
862             break;
863         case '^':
864             if (re_flags & RegexOption.multiline)
865                 g.put(Bytecode(IR.Bol, 0));
866             else
867                 g.put(Bytecode(IR.Bof, 0));
868             popFront();
869             break;
870         case '$':
871             if (re_flags & RegexOption.multiline)
872                 g.put(Bytecode(IR.Eol, 0));
873             else
874                 g.put(Bytecode(IR.Eof, 0));
875             popFront();
876             break;
877         default:
878             if (re_flags & RegexOption.casefold)
879             {
880                 auto range = simpleCaseFoldings(front);
881                 assert(range.length <= 5);
882                 if (range.length == 1)
883                     g.put(Bytecode(IR.Char, range.front));
884                 else
885                     foreach (v; range)
886                         g.put(Bytecode(IR.OrChar, v, cast(uint) range.length));
887             }
888             else
889                 g.put(Bytecode(IR.Char, front));
890             popFront();
891         }
892     }
893 
894     //parse and store IR for CodepointSet
parseCharset()895     void parseCharset()
896     {
897         const save = re_flags;
898         re_flags &= ~RegexOption.freeform; // stop ignoring whitespace if we did
899         bool casefold = cast(bool)(re_flags & RegexOption.casefold);
900         g.charsetToIr(unicode.parseSet(this, casefold));
901         re_flags = save;
902         // Last next() in parseCharset is executed w/o freeform flag
903         if (re_flags & RegexOption.freeform) skipSpace();
904     }
905 
906     //parse and generate IR for escape stand alone escape sequence
parseEscape()907     @trusted void parseEscape()
908     {//accesses array of appender
909         import std.algorithm.iteration : sum;
910         switch (front)
911         {
912         case 'f':   popFront(); g.put(Bytecode(IR.Char, '\f')); break;
913         case 'n':   popFront(); g.put(Bytecode(IR.Char, '\n')); break;
914         case 'r':   popFront(); g.put(Bytecode(IR.Char, '\r')); break;
915         case 't':   popFront(); g.put(Bytecode(IR.Char, '\t')); break;
916         case 'v':   popFront(); g.put(Bytecode(IR.Char, '\v')); break;
917 
918         case 'd':
919             popFront();
920             g.charsetToIr(unicode.Nd);
921             break;
922         case 'D':
923             popFront();
924             g.charsetToIr(unicode.Nd.inverted);
925             break;
926         case 'b':   popFront(); g.put(Bytecode(IR.Wordboundary, 0)); break;
927         case 'B':   popFront(); g.put(Bytecode(IR.Notwordboundary, 0)); break;
928         case 's':
929             popFront();
930             g.charsetToIr(unicode.White_Space);
931             break;
932         case 'S':
933             popFront();
934             g.charsetToIr(unicode.White_Space.inverted);
935             break;
936         case 'w':
937             popFront();
938             g.charsetToIr(wordCharacter);
939             break;
940         case 'W':
941             popFront();
942             g.charsetToIr(wordCharacter.inverted);
943             break;
944         case 'p': case 'P':
945             bool casefold = cast(bool)(re_flags & RegexOption.casefold);
946             auto set = unicode.parsePropertySpec(this, front == 'P', casefold);
947             g.charsetToIr(set);
948             break;
949         case 'x':
950             immutable code = parseUniHex(pat, 2);
951             popFront();
952             g.put(Bytecode(IR.Char,code));
953             break;
954         case 'u': case 'U':
955             immutable code = parseUniHex(pat, front == 'u' ? 4 : 8);
956             popFront();
957             g.put(Bytecode(IR.Char, code));
958             break;
959         case 'c': //control codes
960             Bytecode code = Bytecode(IR.Char, unicode.parseControlCode(this));
961             popFront();
962             g.put(code);
963             break;
964         case '0':
965             popFront();
966             g.put(Bytecode(IR.Char, 0));//NUL character
967             break;
968         case '1': .. case '9':
969             uint nref = cast(uint) front - '0';
970             immutable maxBackref = sum(g.groupStack.data);
971             enforce(nref < maxBackref, "Backref to unseen group");
972             //perl's disambiguation rule i.e.
973             //get next digit only if there is such group number
974             popFront();
975             while (nref < maxBackref && !empty && std.ascii.isDigit(front))
976             {
977                 nref = nref * 10 + front - '0';
978                 popFront();
979             }
980             if (nref >= maxBackref)
981                 nref /= 10;
982             enforce(!g.isOpenGroup(nref), "Backref to open group");
983             uint localLimit = maxBackref - g.groupStack.top;
984             if (nref >= localLimit)
985             {
986                 g.put(Bytecode(IR.Backref, nref-localLimit));
987                 g.ir[$-1].setLocalRef();
988             }
989             else
990                 g.put(Bytecode(IR.Backref, nref));
991             g.markBackref(nref);
992             break;
993         default:
994             if (front == '\\' && !pat.empty)
995             {
996                 if (pat.front >= privateUseStart && pat.front <= privateUseEnd)
997                     enforce(false, "invalid escape sequence");
998             }
999             if (front >= privateUseStart && front <= privateUseEnd)
1000             {
1001                 g.endPattern(front - privateUseStart + 1);
1002                 break;
1003             }
1004             auto op = Bytecode(IR.Char, front);
1005             popFront();
1006             g.put(op);
1007         }
1008     }
1009 
1010     //
error(string msg)1011     @trusted void error(string msg)
1012     {
1013         import std.array : appender;
1014         import std.format.write : formattedWrite;
1015         auto app = appender!string();
1016         formattedWrite(app, "%s\nPattern with error: `%s` <--HERE-- `%s`",
1017                        msg, origin[0..$-pat.length], pat);
1018         throw new RegexException(app.data);
1019     }
1020 
1021     alias Char = BasicElementOf!R;
1022 
program()1023     @property program()
1024     {
1025         return makeRegex(this);
1026     }
1027 }
1028 
1029 /+
1030     Postproces the IR, then optimize.
1031 +/
postprocess(Char)1032 @trusted void postprocess(Char)(ref Regex!Char zis)
1033 {//@@@BUG@@@ write is @system
1034     with(zis)
1035     {
1036         struct FixedStack(T)
1037         {
1038             T[] arr;
1039             uint _top;
1040             //this(T[] storage){   arr = storage; _top = -1; }
1041             @property ref T top(){  assert(!empty); return arr[_top]; }
1042             void push(T x){  arr[++_top] = x; }
1043             T pop() { assert(!empty);   return arr[_top--]; }
1044             @property bool empty(){   return _top == -1; }
1045         }
1046         auto counterRange = FixedStack!uint(new uint[maxCounterDepth+1], -1);
1047         counterRange.push(1);
1048         ulong cumRange = 0;
1049         for (uint i = 0; i < ir.length; i += ir[i].length)
1050         {
1051             if (ir[i].hotspot)
1052             {
1053                 assert(i + 1 < ir.length,
1054                     "unexpected end of IR while looking for hotspot");
1055                 ir[i+1] = Bytecode.fromRaw(hotspotTableSize);
1056                 hotspotTableSize += counterRange.top;
1057             }
1058             switch (ir[i].code)
1059             {
1060             case IR.RepeatStart, IR.RepeatQStart:
1061                 uint repEnd = cast(uint)(i + ir[i].data + IRL!(IR.RepeatStart));
1062                 assert(ir[repEnd].code == ir[i].paired.code);
1063                 immutable max = ir[repEnd + 4].raw;
1064                 ir[repEnd+2].raw = counterRange.top;
1065                 ir[repEnd+3].raw *= counterRange.top;
1066                 ir[repEnd+4].raw *= counterRange.top;
1067                 ulong cntRange = cast(ulong)(max)*counterRange.top;
1068                 cumRange += cntRange;
1069                 enforce(cumRange < maxCumulativeRepetitionLength,
1070                     "repetition length limit is exceeded");
1071                 counterRange.push(cast(uint) cntRange + counterRange.top);
1072                 threadCount += counterRange.top;
1073                 break;
1074             case IR.RepeatEnd, IR.RepeatQEnd:
1075                 threadCount += counterRange.top;
1076                 counterRange.pop();
1077                 break;
1078             case IR.GroupStart:
1079                 if (isBackref(ir[i].data))
1080                     ir[i].setBackrefence();
1081                 threadCount += counterRange.top;
1082                 break;
1083             case IR.GroupEnd:
1084                 if (isBackref(ir[i].data))
1085                     ir[i].setBackrefence();
1086                 threadCount += counterRange.top;
1087                 break;
1088             default:
1089                 threadCount += counterRange.top;
1090             }
1091         }
1092         checkIfOneShot();
1093         if (!(flags & RegexInfo.oneShot))
1094             kickstart = Kickstart!Char(zis, new uint[](256));
1095         debug(std_regex_allocation) writefln("IR processed, max threads: %d", threadCount);
1096         optimize(zis);
1097     }
1098 }
1099 
fixupBytecode()1100 void fixupBytecode()(Bytecode[] ir)
1101 {
1102     Stack!uint fixups;
1103 
1104     with(IR) for (uint i=0; i<ir.length; i+= ir[i].length)
1105     {
1106         if (ir[i].isStart || ir[i].code == Option)
1107             fixups.push(i);
1108         else if (ir[i].code == OrEnd)
1109         {
1110             // Alternatives need more care
1111             auto j = fixups.pop(); // last Option
1112             ir[j].data = i -  j - ir[j].length;
1113             j = fixups.pop(); // OrStart
1114             ir[j].data = i - j - ir[j].length;
1115             ir[i].data = ir[j].data;
1116 
1117             // fixup all GotoEndOrs
1118             j = j + IRL!(OrStart);
1119             assert(ir[j].code == Option);
1120             for (;;)
1121             {
1122                 auto next = j + ir[j].data + IRL!(Option);
1123                 if (ir[next].code == IR.OrEnd)
1124                     break;
1125                 ir[next - IRL!(GotoEndOr)].data = i - next;
1126                 j = next;
1127             }
1128         }
1129         else if (ir[i].code == GotoEndOr)
1130         {
1131             auto j = fixups.pop(); // Option
1132             ir[j].data = i - j + IRL!(GotoEndOr)- IRL!(Option); // to the next option
1133         }
1134         else if (ir[i].isEnd)
1135         {
1136             auto j = fixups.pop();
1137             ir[i].data = i - j - ir[j].length;
1138             ir[j].data = ir[i].data;
1139         }
1140     }
1141     assert(fixups.empty);
1142 }
1143 
optimize(Char)1144 void optimize(Char)(ref Regex!Char zis)
1145 {
1146     import std.array : insertInPlace;
1147     CodepointSet nextSet(uint idx)
1148     {
1149         CodepointSet set;
1150         with(zis) with(IR)
1151     Outer:
1152         for (uint i = idx; i < ir.length; i += ir[i].length)
1153         {
1154             switch (ir[i].code)
1155             {
1156                 case Char:
1157                     set.add(ir[i].data, ir[i].data+1);
1158                     goto default;
1159                 //TODO: OrChar
1160                 case Trie, CodepointSet:
1161                     set = zis.charsets[ir[i].data];
1162                     goto default;
1163                 case GroupStart,GroupEnd:
1164                     break;
1165                 default:
1166                     break Outer;
1167             }
1168         }
1169         return set;
1170     }
1171 
1172     with(zis) with(IR) for (uint i = 0; i < ir.length; i += ir[i].length)
1173     {
1174         if (ir[i].code == InfiniteEnd)
1175         {
1176             auto set = nextSet(i+IRL!(InfiniteEnd));
1177             if (!set.empty && set.length < 10_000)
1178             {
1179                 ir[i] = Bytecode(InfiniteBloomEnd, ir[i].data);
1180                 ir[i - ir[i].data - IRL!(InfiniteStart)] =
1181                     Bytecode(InfiniteBloomStart, ir[i].data);
1182                 ir.insertInPlace(i+IRL!(InfiniteEnd),
1183                     Bytecode.fromRaw(cast(uint) zis.filters.length));
1184                 zis.filters ~= BitTable(set);
1185                 fixupBytecode(ir);
1186             }
1187         }
1188     }
1189 }
1190 
1191 //IR code validator - proper nesting, illegal instructions, etc.
validateRe(Char)1192 @trusted void validateRe(Char)(ref Regex!Char zis)
1193 {//@@@BUG@@@ text is @system
1194     import std.conv : text;
1195     with(zis)
1196     {
1197         for (uint pc = 0; pc < ir.length; pc += ir[pc].length)
1198         {
1199             if (ir[pc].isStart || ir[pc].isEnd)
1200             {
1201                 immutable dest = ir[pc].indexOfPair(pc);
1202                 assert(dest < ir.length, text("Wrong length in opcode at pc=",
1203                     pc, " ", dest, " vs ", ir.length));
1204                 assert(ir[dest].paired ==  ir[pc],
1205                     text("Wrong pairing of opcodes at pc=", pc, "and pc=", dest));
1206             }
1207             else if (ir[pc].isAtom)
1208             {
1209 
1210             }
1211             else
1212                assert(0, text("Unknown type of instruction at pc=", pc));
1213         }
1214     }
1215 }
1216