1 //Written in the D programming language
2 /*
3     Regular expression pattern parser.
4 */
5 module std.regex.internal.parser;
6 
7 static import std.ascii;
8 import std.range.primitives, std.uni, std.meta,
9     std.traits, std.typecons, std.exception;
10 import std.regex.internal.ir;
11 
12 // package relevant info from parser into a regex object
makeRegex(S,CG)13 auto makeRegex(S, CG)(Parser!(S, CG) p)
14 {
15     Regex!(BasicElementOf!S) re;
16     auto g = p.g;
17     with(re)
18     {
19         ir = g.ir;
20         dict = g.dict;
21         ngroup = g.ngroup;
22         maxCounterDepth = g.counterDepth;
23         flags = p.re_flags;
24         charsets = g.charsets;
25         matchers = g.matchers;
26         backrefed = g.backrefed;
27         re.postprocess();
28         debug(std_regex_parser)
29         {
30             __ctfe || print();
31         }
32         //@@@BUG@@@ (not reduced)
33         //somehow just using validate _collides_ with std.utf.validate (!)
34         version (assert) re.validateRe();
35     }
36     return re;
37 }
38 
39 // helper for unittest
40 auto makeRegex(S)(S arg)
41 if (isSomeString!S)
42 {
43     return makeRegex(Parser!(S, CodeGen)(arg, ""));
44 }
45 
46 @system unittest
47 {
48     import std.algorithm.comparison : equal;
49     auto re = makeRegex(`(?P<name>\w+) = (?P<var>\d+)`);
50     auto nc = re.namedCaptures;
51     static assert(isRandomAccessRange!(typeof(nc)));
52     assert(!nc.empty);
53     assert(nc.length == 2);
54     assert(nc.equal(["name", "var"]));
55     assert(nc[0] == "name");
56     assert(nc[1..$].equal(["var"]));
57 
58     re = makeRegex(`(\w+) (?P<named>\w+) (\w+)`);
59     nc = re.namedCaptures;
60     assert(nc.length == 1);
61     assert(nc[0] == "named");
62     assert(nc.front == "named");
63     assert(nc.back == "named");
64 
65     re = makeRegex(`(\w+) (\w+)`);
66     nc = re.namedCaptures;
67     assert(nc.empty);
68 
69     re = makeRegex(`(?P<year>\d{4})/(?P<month>\d{2})/(?P<day>\d{2})/`);
70     nc = re.namedCaptures;
71     auto cp = nc.save;
72     assert(nc.equal(cp));
73     nc.popFront();
74     assert(nc.equal(cp[1..$]));
75     nc.popBack();
76     assert(nc.equal(cp[1 .. $ - 1]));
77 }
78 
79 
reverseBytecode()80 @trusted void reverseBytecode()(Bytecode[] code)
81 {
82     Bytecode[] rev = new Bytecode[code.length];
83     uint revPc = cast(uint) rev.length;
84     Stack!(Tuple!(uint, uint, uint)) stack;
85     uint start = 0;
86     uint end = cast(uint) code.length;
87     for (;;)
88     {
89         for (uint pc = start; pc < end; )
90         {
91             immutable len = code[pc].length;
92             if (code[pc].code == IR.GotoEndOr)
93                 break; //pick next alternation branch
94             if (code[pc].isAtom)
95             {
96                 rev[revPc - len .. revPc] = code[pc .. pc + len];
97                 revPc -= len;
98                 pc += len;
99             }
100             else if (code[pc].isStart || code[pc].isEnd)
101             {
102                 //skip over other embedded lookbehinds they are reversed
103                 if (code[pc].code == IR.LookbehindStart
104                     || code[pc].code == IR.NeglookbehindStart)
105                 {
106                     immutable blockLen = len + code[pc].data
107                          + code[pc].pairedLength;
108                     rev[revPc - blockLen .. revPc] = code[pc .. pc + blockLen];
109                     pc += blockLen;
110                     revPc -= blockLen;
111                     continue;
112                 }
113                 immutable second = code[pc].indexOfPair(pc);
114                 immutable secLen = code[second].length;
115                 rev[revPc - secLen .. revPc] = code[second .. second + secLen];
116                 revPc -= secLen;
117                 if (code[pc].code == IR.OrStart)
118                 {
119                     //we pass len bytes forward, but secLen in reverse
120                     immutable revStart = revPc - (second + len - secLen - pc);
121                     uint r = revStart;
122                     uint i = pc + IRL!(IR.OrStart);
123                     while (code[i].code == IR.Option)
124                     {
125                         if (code[i - 1].code != IR.OrStart)
126                         {
127                             assert(code[i - 1].code == IR.GotoEndOr);
128                             rev[r - 1] = code[i - 1];
129                         }
130                         rev[r] = code[i];
131                         auto newStart = i + IRL!(IR.Option);
132                         auto newEnd = newStart + code[i].data;
133                         auto newRpc = r + code[i].data + IRL!(IR.Option);
134                         if (code[newEnd].code != IR.OrEnd)
135                         {
136                             newRpc--;
137                         }
138                         stack.push(tuple(newStart, newEnd, newRpc));
139                         r += code[i].data + IRL!(IR.Option);
140                         i += code[i].data + IRL!(IR.Option);
141                     }
142                     pc = i;
143                     revPc = revStart;
144                     assert(code[pc].code == IR.OrEnd);
145                 }
146                 else
147                     pc += len;
148             }
149         }
150         if (stack.empty)
151             break;
152         start = stack.top[0];
153         end = stack.top[1];
154         revPc = stack.top[2];
155         stack.pop();
156     }
157     code[] = rev[];
158 }
159 
160 //test if a given string starts with hex number of maxDigit that's a valid codepoint
161 //returns it's value and skips these maxDigit chars on success, throws on failure
parseUniHex(Char)162 dchar parseUniHex(Char)(ref Char[] str, size_t maxDigit)
163 {
164     //std.conv.parse is both @system and bogus
165     enforce(str.length >= maxDigit,"incomplete escape sequence");
166     uint val;
167     for (int k = 0; k < maxDigit; k++)
168     {
169         immutable current = str[k];//accepts ascii only, so it's OK to index directly
170         if ('0' <= current && current <= '9')
171             val = val * 16 + current - '0';
172         else if ('a' <= current && current <= 'f')
173             val = val * 16 + current -'a' + 10;
174         else if ('A' <= current && current <= 'F')
175             val = val * 16 + current - 'A' + 10;
176         else
177             throw new Exception("invalid escape sequence");
178     }
179     enforce(val <= 0x10FFFF, "invalid codepoint");
180     str = str[maxDigit..$];
181     return val;
182 }
183 
184 @system unittest //BUG canFind is system
185 {
186     import std.algorithm.searching : canFind;
187     string[] non_hex = [ "000j", "000z", "FffG", "0Z"];
188     string[] hex = [ "01", "ff", "00af", "10FFFF" ];
189     int[] value = [ 1, 0xFF, 0xAF, 0x10FFFF ];
190     foreach (v; non_hex)
191         assert(collectException(parseUniHex(v, v.length)).msg
192           .canFind("invalid escape sequence"));
193     foreach (i, v; hex)
194         assert(parseUniHex(v, v.length) == value[i]);
195     string over = "0011FFFF";
196     assert(collectException(parseUniHex(over, over.length)).msg
197       .canFind("invalid codepoint"));
198 }
199 
caseEnclose(CodepointSet set)200 auto caseEnclose(CodepointSet set)
201 {
202     auto cased = set & unicode.LC;
203     foreach (dchar ch; cased.byCodepoint)
204     {
205         foreach (c; simpleCaseFoldings(ch))
206             set |= c;
207     }
208     return set;
209 }
210 
211 /+
212     fetch codepoint set corresponding to a name (InBlock or binary property)
213 +/
getUnicodeSet(in char[]name,bool negated,bool casefold)214 @trusted CodepointSet getUnicodeSet(in char[] name, bool negated,  bool casefold)
215 {
216     CodepointSet s = unicode(name);
217     //FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC)
218     if (casefold)
219        s = caseEnclose(s);
220     if (negated)
221         s = s.inverted;
222     return s;
223 }
224 
225 //basic stack, just in case it gets used anywhere else then Parser
Stack(T)226 @trusted struct Stack(T)
227 {
228     T[] data;
229     @property bool empty(){ return data.empty; }
230 
231     @property size_t length(){ return data.length; }
232 
233     void push(T val){ data ~= val;  }
234 
235     T pop()
236     {
237         assert(!empty);
238         auto val = data[$ - 1];
239         data = data[0 .. $ - 1];
240         if (!__ctfe)
241             cast(void) data.assumeSafeAppend();
242         return val;
243     }
244 
245     @property ref T top()
246     {
247         assert(!empty);
248         return data[$ - 1];
249     }
250 }
251 
252 struct CodeGen
253 {
254     Bytecode[] ir;                 // resulting bytecode
255     Stack!(uint) fixupStack;       // stack of opened start instructions
256     NamedGroup[] dict;             // maps name -> user group number
257     Stack!(uint) groupStack;       // stack of current number of group
258     uint nesting = 0;              // group nesting level and repetitions step
259     uint lookaroundNest = 0;       // nesting of lookaround
260     uint counterDepth = 0;         // current depth of nested counted repetitions
261     CodepointSet[] charsets;       // sets for char classes
262     const(CharMatcher)[] matchers; // matchers for char classes
263     uint[] backrefed;              // bitarray for groups refered by backref
264     uint ngroup;                   // final number of groups (of all patterns)
265 
startCodeGen266     void start(uint length)
267     {
268         if (!__ctfe)
269             ir.reserve((length*5+2)/4);
270         fixupStack.push(0);
271         groupStack.push(1);//0 - whole match
272     }
273 
274     //mark referenced groups for latter processing
markBackrefCodeGen275     void markBackref(uint n)
276     {
277         if (n/32 >= backrefed.length)
278             backrefed.length = n/32 + 1;
279         backrefed[n / 32] |= 1 << (n & 31);
280     }
281 
isOpenGroupCodeGen282     bool isOpenGroup(uint n)
283     {
284         import std.algorithm.searching : canFind;
285         // walk the fixup stack and see if there are groups labeled 'n'
286         // fixup '0' is reserved for alternations
287         return fixupStack.data[1..$].
288             canFind!(fix => ir[fix].code == IR.GroupStart && ir[fix].data == n)();
289     }
290 
putCodeGen291     void put(Bytecode code)
292     {
293         enforce(ir.length < maxCompiledLength,
294             "maximum compiled pattern length is exceeded");
295         ir ~= code;
296     }
297 
putRawCodeGen298     void putRaw(uint number)
299     {
300         enforce(ir.length < maxCompiledLength,
301             "maximum compiled pattern length is exceeded");
302         ir ~= Bytecode.fromRaw(number);
303     }
304 
305     //try to generate optimal IR code for this CodepointSet
charsetToIrCodeGen306     @trusted void charsetToIr(CodepointSet set)
307     {//@@@BUG@@@ writeln is @system
308         uint chars = cast(uint) set.length;
309         if (chars < Bytecode.maxSequence)
310         {
311             switch (chars)
312             {
313                 case 1:
314                     put(Bytecode(IR.Char, set.byCodepoint.front));
315                     break;
316                 case 0:
317                     throw new RegexException("empty CodepointSet not allowed");
318                 default:
319                     foreach (ch; set.byCodepoint)
320                         put(Bytecode(IR.OrChar, ch, chars));
321             }
322         }
323         else
324         {
325             import std.algorithm.searching : countUntil;
326             const ivals = set.byInterval;
327             immutable n = charsets.countUntil(set);
328             if (n >= 0)
329             {
330                 if (ivals.length*2 > maxCharsetUsed)
331                     put(Bytecode(IR.Trie, cast(uint) n));
332                 else
333                     put(Bytecode(IR.CodepointSet, cast(uint) n));
334                 return;
335             }
336             if (ivals.length*2 > maxCharsetUsed)
337             {
338                 auto t  = getMatcher(set);
339                 put(Bytecode(IR.Trie, cast(uint) matchers.length));
340                 matchers ~= t;
341                 debug(std_regex_allocation) writeln("Trie generated");
342             }
343             else
344             {
345                 put(Bytecode(IR.CodepointSet, cast(uint) charsets.length));
346                 matchers ~= CharMatcher.init;
347             }
348             charsets ~= set;
349             assert(charsets.length == matchers.length);
350         }
351     }
352 
genLogicGroupCodeGen353     void genLogicGroup()
354     {
355         nesting++;
356         pushFixup(length);
357         put(Bytecode(IR.Nop, 0));
358     }
359 
genGroupCodeGen360     void genGroup()
361     {
362         nesting++;
363         pushFixup(length);
364         immutable nglob = groupStack.top++;
365         enforce(groupStack.top <= maxGroupNumber, "limit on number of submatches is exceeded");
366         put(Bytecode(IR.GroupStart, nglob));
367     }
368 
genNamedGroupCodeGen369     void genNamedGroup(string name)
370     {
371         import std.array : insertInPlace;
372         import std.range : assumeSorted;
373         nesting++;
374         pushFixup(length);
375         immutable nglob = groupStack.top++;
376         enforce(groupStack.top <= maxGroupNumber, "limit on submatches is exceeded");
377         auto t = NamedGroup(name, nglob);
378         auto d = assumeSorted!"a.name < b.name"(dict);
379         immutable ind = d.lowerBound(t).length;
380         insertInPlace(dict, ind, t);
381         put(Bytecode(IR.GroupStart, nglob));
382     }
383 
384         //generate code for start of lookaround: (?= (?! (?<= (?<!
genLookaroundCodeGen385     void genLookaround(IR opcode)
386     {
387         nesting++;
388         pushFixup(length);
389         put(Bytecode(opcode, 0));
390         put(Bytecode.fromRaw(0));
391         put(Bytecode.fromRaw(0));
392         groupStack.push(0);
393         lookaroundNest++;
394         enforce(lookaroundNest <= maxLookaroundDepth,
395             "maximum lookaround depth is exceeded");
396     }
397 
endPatternCodeGen398     void endPattern(uint num)
399     {
400         import std.algorithm.comparison : max;
401         put(Bytecode(IR.End, num));
402         ngroup = max(ngroup, groupStack.top);
403         groupStack.top = 1; // reset group counter
404     }
405 
406     //fixup lookaround with start at offset fix and append a proper *-End opcode
fixLookaroundCodeGen407     void fixLookaround(uint fix)
408     {
409         lookaroundNest--;
410         ir[fix] = Bytecode(ir[fix].code,
411             cast(uint) ir.length - fix - IRL!(IR.LookaheadStart));
412         auto g = groupStack.pop();
413         assert(!groupStack.empty);
414         ir[fix+1] = Bytecode.fromRaw(groupStack.top);
415         //groups are cumulative across lookarounds
416         ir[fix+2] = Bytecode.fromRaw(groupStack.top+g);
417         groupStack.top += g;
418         if (ir[fix].code == IR.LookbehindStart || ir[fix].code == IR.NeglookbehindStart)
419         {
420             reverseBytecode(ir[fix + IRL!(IR.LookbehindStart) .. $]);
421         }
422         put(ir[fix].paired);
423     }
424 
425     // repetition of {1,1}
fixRepetitionCodeGen426     void fixRepetition(uint offset)
427     {
428         import std.algorithm.mutation : copy;
429         immutable replace = ir[offset].code == IR.Nop;
430         if (replace)
431         {
432             copy(ir[offset + 1 .. $], ir[offset .. $ - 1]);
433             ir.length -= 1;
434         }
435     }
436 
437     // repetition of {x,y}
fixRepetitionCodeGen438     void fixRepetition(uint offset, uint min, uint max, bool greedy)
439     {
440         static import std.algorithm.comparison;
441         import std.algorithm.mutation : copy;
442         import std.array : insertInPlace;
443         immutable replace = ir[offset].code == IR.Nop;
444         immutable len = cast(uint) ir.length - offset - replace;
445         if (max != infinite)
446         {
447             if (min != 1 || max != 1)
448             {
449                 Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len);
450                 if (replace)
451                     ir[offset] = op;
452                 else
453                     insertInPlace(ir, offset, op);
454                 put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len));
455                 put(Bytecode.init); //hotspot
456                 putRaw(1);
457                 putRaw(min);
458                 putRaw(max);
459                 counterDepth = std.algorithm.comparison.max(counterDepth, nesting+1);
460             }
461         }
462         else if (min) //&& max is infinite
463         {
464             if (min != 1)
465             {
466                 Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len);
467                 if (replace)
468                     ir[offset] = op;
469                 else
470                     insertInPlace(ir, offset, op);
471                 offset += 1;//so it still points to the repeated block
472                 put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len));
473                 put(Bytecode.init); //hotspot
474                 putRaw(1);
475                 putRaw(min);
476                 putRaw(min);
477                 counterDepth = std.algorithm.comparison.max(counterDepth, nesting+1);
478             }
479             else if (replace)
480             {
481                 copy(ir[offset+1 .. $], ir[offset .. $-1]);
482                 ir.length -= 1;
483             }
484             put(Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len));
485             enforce(ir.length + len < maxCompiledLength,  "maximum compiled pattern length is exceeded");
486             ir ~= ir[offset .. offset+len];
487             //IR.InfinteX is always a hotspot
488             put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len));
489             put(Bytecode.init); //merge index
490         }
491         else//vanila {0,inf}
492         {
493             Bytecode op = Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len);
494             if (replace)
495                 ir[offset] = op;
496             else
497                 insertInPlace(ir, offset, op);
498             //IR.InfinteX is always a hotspot
499             put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len));
500             put(Bytecode.init); //merge index
501         }
502     }
503 
fixAlternationCodeGen504     void fixAlternation()
505     {
506         import std.array : insertInPlace;
507         uint fix = fixupStack.top;
508         if (ir.length > fix && ir[fix].code == IR.Option)
509         {
510             ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix);
511             put(Bytecode(IR.GotoEndOr, 0));
512             fixupStack.top = cast(uint) ir.length; //replace latest fixup for Option
513             put(Bytecode(IR.Option, 0));
514             return;
515         }
516         uint len, orStart;
517         //start a new option
518         if (fixupStack.length == 1)
519         {//only root entry, effectively no fixup
520             len = cast(uint) ir.length + IRL!(IR.GotoEndOr);
521             orStart = 0;
522         }
523         else
524         {//IR.lookahead, etc. fixups that have length > 1, thus check ir[x].length
525             len = cast(uint) ir.length - fix - (ir[fix].length - 1);
526             orStart = fix + ir[fix].length;
527         }
528         insertInPlace(ir, orStart, Bytecode(IR.OrStart, 0), Bytecode(IR.Option, len));
529         assert(ir[orStart].code == IR.OrStart);
530         put(Bytecode(IR.GotoEndOr, 0));
531         fixupStack.push(orStart); //fixup for StartOR
532         fixupStack.push(cast(uint) ir.length); //for second Option
533         put(Bytecode(IR.Option, 0));
534     }
535 
536     // finalizes IR.Option, fix points to the first option of sequence
finishAlternationCodeGen537     void finishAlternation(uint fix)
538     {
539         enforce(ir[fix].code == IR.Option, "no matching ')'");
540         ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix - IRL!(IR.OrStart));
541         fix = fixupStack.pop();
542         enforce(ir[fix].code == IR.OrStart, "no matching ')'");
543         ir[fix] = Bytecode(IR.OrStart, cast(uint) ir.length - fix - IRL!(IR.OrStart));
544         put(Bytecode(IR.OrEnd, cast(uint) ir.length - fix - IRL!(IR.OrStart)));
545         uint pc = fix + IRL!(IR.OrStart);
546         while (ir[pc].code == IR.Option)
547         {
548             pc = pc + ir[pc].data;
549             if (ir[pc].code != IR.GotoEndOr)
550                 break;
551             ir[pc] = Bytecode(IR.GotoEndOr, cast(uint)(ir.length - pc - IRL!(IR.OrEnd)));
552             pc += IRL!(IR.GotoEndOr);
553         }
554         put(Bytecode.fromRaw(0));
555     }
556 
557     // returns: (flag - repetition possible?, fixup of the start of this "group")
558     Tuple!(bool, uint) onClose()
559     {
560         nesting--;
561         uint fix = popFixup();
562         switch (ir[fix].code)
563         {
564         case IR.GroupStart:
565             put(Bytecode(IR.GroupEnd, ir[fix].data));
566             return tuple(true, fix);
567         case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
568             assert(lookaroundNest);
569             fixLookaround(fix);
570             return tuple(false, 0u);
571         case IR.Option: //| xxx )
572             //two fixups: last option + full OR
573             finishAlternation(fix);
574             fix = topFixup;
575             switch (ir[fix].code)
576             {
577             case IR.GroupStart:
578                 popFixup();
579                 put(Bytecode(IR.GroupEnd, ir[fix].data));
580                 return tuple(true, fix);
581             case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
582                 assert(lookaroundNest);
583                 fix = popFixup();
584                 fixLookaround(fix);
585                 return tuple(false, 0u);
586             default://(?:xxx)
587                 popFixup();
588                 return tuple(true, fix);
589             }
590         default://(?:xxx)
591             return tuple(true, fix);
592         }
593     }
594 
popFixupCodeGen595     uint popFixup(){ return fixupStack.pop(); }
596 
pushFixupCodeGen597     void pushFixup(uint val){ return fixupStack.push(val); }
598 
topFixupCodeGen599     @property uint topFixup(){ return fixupStack.top; }
600 
fixupLengthCodeGen601     @property size_t fixupLength(){ return fixupStack.data.length; }
602 
lengthCodeGen603     @property uint length(){ return cast(uint) ir.length; }
604 }
605 
606 // safety limits
607 enum maxGroupNumber = 2^^19;
608 enum maxLookaroundDepth = 16;
609 // *Bytecode.sizeof, i.e. 1Mb of bytecode alone
610 enum maxCompiledLength = 2^^18;
611 // amounts to up to 4 Mb of auxilary table for matching
612 enum maxCumulativeRepetitionLength = 2^^20;
613 // marker to indicate infinite repetition
614 enum infinite = ~0u;
615 
616 struct Parser(R, Generator)
617 if (isForwardRange!R && is(ElementType!R : dchar))
618 {
619     dchar _current;
620     bool empty;
621     R pat, origin;       //keep full pattern for pretty printing error messages
622     uint re_flags = 0;   //global flags e.g. multiline + internal ones
623     Generator g;
624 
625     @trusted this(S)(R pattern, S flags)
626         if (isSomeString!S)
627     {
628         pat = origin = pattern;
629         //reserve slightly more then avg as sampled from unittests
630         parseFlags(flags);
631         _current = ' ';//a safe default for freeform parsing
632         next();
633         g.start(cast(uint) pat.length);
634         try
635         {
636             parseRegex();
637         }
catch(Exception e)638         catch (Exception e)
639         {
640             error(e.msg);//also adds pattern location
641         }
642         g.endPattern(1);
643     }
644 
current()645     @property dchar current(){ return _current; }
646 
_next()647     bool _next()
648     {
649         if (pat.empty)
650         {
651             empty =  true;
652             return false;
653         }
654         _current = pat.front;
655         pat.popFront();
656         return true;
657     }
658 
skipSpace()659     void skipSpace()
660     {
661         while (isWhite(current) && _next()){ }
662     }
663 
next()664     bool next()
665     {
666         if (re_flags & RegexOption.freeform)
667         {
668             immutable r = _next();
669             skipSpace();
670             return r;
671         }
672         else
673             return _next();
674     }
675 
676     //parsing number with basic overflow check
parseDecimal()677     uint parseDecimal()
678     {
679         uint r = 0;
680         while (std.ascii.isDigit(current))
681         {
682             if (r >= (uint.max/10))
683                 error("Overflow in decimal number");
684             r = 10*r + cast(uint)(current-'0');
685             if (!next())
686                 break;
687         }
688         return r;
689     }
690 
691     //parse control code of form \cXXX, c assumed to be the current symbol
parseControlCode()692     dchar parseControlCode()
693     {
694         enforce(next(), "Unfinished escape sequence");
695         enforce(('a' <= current && current <= 'z') || ('A' <= current && current <= 'Z'),
696             "Only letters are allowed after \\c");
697         return current & 0x1f;
698     }
699 
700     //
parseFlags(S)701     @trusted void parseFlags(S)(S flags)
702     {//@@@BUG@@@ text is @system
703         import std.conv : text;
704         foreach (ch; flags)//flags are ASCII anyway
705         {
706         L_FlagSwitch:
707             switch (ch)
708             {
709 
710                 foreach (i, op; __traits(allMembers, RegexOption))
711                 {
712                     case RegexOptionNames[i]:
713                             if (re_flags & mixin("RegexOption."~op))
714                                 throw new RegexException(text("redundant flag specified: ",ch));
715                             re_flags |= mixin("RegexOption."~op);
716                             break L_FlagSwitch;
717                 }
718                 default:
719                     throw new RegexException(text("unknown regex flag '",ch,"'"));
720             }
721         }
722     }
723 
724     //parse and store IR for regex pattern
parseRegex()725     @trusted void parseRegex()
726     {
727         uint fix;//fixup pointer
728 
729         while (!empty)
730         {
731             debug(std_regex_parser)
732                 __ctfe || writeln("*LR*\nSource: ", pat, "\nStack: ",fixupStack.data);
733             switch (current)
734             {
735             case '(':
736                 next();
737                 if (current == '?')
738                 {
739                     next();
740                     switch (current)
741                     {
742                     case '#':
743                         for (;;)
744                         {
745                             if (!next())
746                                 error("Unexpected end of pattern");
747                             if (current == ')')
748                             {
749                                 next();
750                                 break;
751                             }
752                         }
753                         break;
754                     case ':':
755                         g.genLogicGroup();
756                         next();
757                         break;
758                     case '=':
759                         g.genLookaround(IR.LookaheadStart);
760                         next();
761                         break;
762                     case '!':
763                         g.genLookaround(IR.NeglookaheadStart);
764                         next();
765                         break;
766                     case 'P':
767                         next();
768                         if (current != '<')
769                             error("Expected '<' in named group");
770                         string name;
771                         if (!next() || !(isAlpha(current) || current == '_'))
772                             error("Expected alpha starting a named group");
773                         name ~= current;
774                         while (next() && (isAlpha(current) ||
775                             current == '_' || std.ascii.isDigit(current)))
776                         {
777                             name ~= current;
778                         }
779                         if (current != '>')
780                             error("Expected '>' closing named group");
781                         next();
782                         g.genNamedGroup(name);
783                         break;
784                     case '<':
785                         next();
786                         if (current == '=')
787                             g.genLookaround(IR.LookbehindStart);
788                         else if (current == '!')
789                             g.genLookaround(IR.NeglookbehindStart);
790                         else
791                             error("'!' or '=' expected after '<'");
792                         next();
793                         break;
794                     default:
795                         uint enableFlags, disableFlags;
796                         bool enable = true;
797                         do
798                         {
799                             switch (current)
800                             {
801                             case 's':
802                                 if (enable)
803                                     enableFlags |= RegexOption.singleline;
804                                 else
805                                     disableFlags |= RegexOption.singleline;
806                                 break;
807                             case 'x':
808                                 if (enable)
809                                     enableFlags |= RegexOption.freeform;
810                                 else
811                                     disableFlags |= RegexOption.freeform;
812                                 break;
813                             case 'i':
814                                 if (enable)
815                                     enableFlags |= RegexOption.casefold;
816                                 else
817                                     disableFlags |= RegexOption.casefold;
818                                 break;
819                             case 'm':
820                                 if (enable)
821                                     enableFlags |= RegexOption.multiline;
822                                 else
823                                     disableFlags |= RegexOption.multiline;
824                                 break;
825                             case '-':
826                                 if (!enable)
827                                     error(" unexpected second '-' in flags");
828                                 enable = false;
829                                 break;
830                             default:
831                                 error(" 's', 'x', 'i', 'm' or '-' expected after '(?' ");
832                             }
833                             next();
834                         }while (current != ')');
835                         next();
836                         re_flags |= enableFlags;
837                         re_flags &= ~disableFlags;
838                     }
839                 }
840                 else
841                 {
842                     g.genGroup();
843                 }
844                 break;
845             case ')':
846                 enforce(g.nesting, "Unmatched ')'");
847                 next();
848                 auto pair = g.onClose();
849                 if (pair[0])
850                     parseQuantifier(pair[1]);
851                 break;
852             case '|':
853                 next();
854                 g.fixAlternation();
855                 break;
856             default://no groups or whatever
857                 immutable start = g.length;
858                 parseAtom();
859                 parseQuantifier(start);
860             }
861         }
862 
863         if (g.fixupLength != 1)
864         {
865             fix = g.popFixup();
866             g.finishAlternation(fix);
867             enforce(g.fixupLength == 1, "no matching ')'");
868         }
869     }
870 
871 
872     //parse and store IR for atom-quantifier pair
parseQuantifier(uint offset)873     @trusted void parseQuantifier(uint offset)
874     {//copy is @system
875         if (empty)
876             return g.fixRepetition(offset);
877         uint min, max;
878         switch (current)
879         {
880         case '*':
881             min = 0;
882             max = infinite;
883             break;
884         case '?':
885             min = 0;
886             max = 1;
887             break;
888         case '+':
889             min = 1;
890             max = infinite;
891             break;
892         case '{':
893             enforce(next(), "Unexpected end of regex pattern");
894             enforce(std.ascii.isDigit(current), "First number required in repetition");
895             min = parseDecimal();
896             if (current == '}')
897                 max = min;
898             else if (current == ',')
899             {
900                 next();
901                 if (std.ascii.isDigit(current))
902                     max = parseDecimal();
903                 else if (current == '}')
904                     max = infinite;
905                 else
906                     error("Unexpected symbol in regex pattern");
907                 skipSpace();
908                 if (current != '}')
909                     error("Unmatched '{' in regex pattern");
910             }
911             else
912                 error("Unexpected symbol in regex pattern");
913             if (min > max)
914                 error("Illegal {n,m} quantifier");
915             break;
916         default:
917             g.fixRepetition(offset);
918             return;
919         }
920         bool greedy = true;
921         //check only if we managed to get new symbol
922         if (next() && current == '?')
923         {
924             greedy = false;
925             next();
926         }
927         g.fixRepetition(offset, min, max, greedy);
928     }
929 
930     //parse and store IR for atom
parseAtom()931     void parseAtom()
932     {
933         if (empty)
934             return;
935         switch (current)
936         {
937         case '*', '?', '+', '|', '{', '}':
938             error("'*', '+', '?', '{', '}' not allowed in atom");
939             break;
940         case '.':
941             if (re_flags & RegexOption.singleline)
942                 g.put(Bytecode(IR.Any, 0));
943             else
944             {
945                 CodepointSet set;
946                 g.charsetToIr(set.add('\n','\n'+1).add('\r', '\r'+1).inverted);
947             }
948             next();
949             break;
950         case '[':
951             parseCharset();
952             break;
953         case '\\':
954             enforce(_next(), "Unfinished escape sequence");
955             parseEscape();
956             break;
957         case '^':
958             if (re_flags & RegexOption.multiline)
959                 g.put(Bytecode(IR.Bol, 0));
960             else
961                 g.put(Bytecode(IR.Bof, 0));
962             next();
963             break;
964         case '$':
965             if (re_flags & RegexOption.multiline)
966                 g.put(Bytecode(IR.Eol, 0));
967             else
968                 g.put(Bytecode(IR.Eof, 0));
969             next();
970             break;
971         default:
972             //FIXME: getCommonCasing in new std uni
973             if (re_flags & RegexOption.casefold)
974             {
975                 auto range = simpleCaseFoldings(current);
976                 assert(range.length <= 5);
977                 if (range.length == 1)
978                     g.put(Bytecode(IR.Char, range.front));
979                 else
980                     foreach (v; range)
981                         g.put(Bytecode(IR.OrChar, v, cast(uint) range.length));
982             }
983             else
984                 g.put(Bytecode(IR.Char, current));
985             next();
986         }
987     }
988 
989 
990 
991     //CodepointSet operations relatively in order of priority
992     enum Operator:uint {
993         Open = 0, Negate,  Difference, SymDifference, Intersection, Union, None
994     }
995 
996     //parse unit of CodepointSet spec, most notably escape sequences and char ranges
997     //also fetches next set operation
998     Tuple!(CodepointSet,Operator) parseCharTerm()
999     {
1000         enum State{ Start, Char, Escape, CharDash, CharDashEscape,
1001             PotentialTwinSymbolOperator }
1002         Operator op = Operator.None;
1003         dchar last;
1004         CodepointSet set;
1005         State state = State.Start;
1006 
addWithFlags(ref CodepointSet set,uint ch,uint re_flags)1007         static void addWithFlags(ref CodepointSet set, uint ch, uint re_flags)
1008         {
1009             if (re_flags & RegexOption.casefold)
1010             {
1011                 auto range = simpleCaseFoldings(ch);
1012                 foreach (v; range)
1013                     set |= v;
1014             }
1015             else
1016                 set |= ch;
1017         }
1018 
twinSymbolOperator(dchar symbol)1019         static Operator twinSymbolOperator(dchar symbol)
1020         {
1021             switch (symbol)
1022             {
1023             case '|':
1024                 return Operator.Union;
1025             case '-':
1026                 return Operator.Difference;
1027             case '~':
1028                 return Operator.SymDifference;
1029             case '&':
1030                 return Operator.Intersection;
1031             default:
1032                 assert(false);
1033             }
1034         }
1035 
1036         L_CharTermLoop:
1037         for (;;)
1038         {
1039             final switch (state)
1040             {
1041             case State.Start:
1042                 switch (current)
1043                 {
1044                 case '|':
1045                 case '-':
1046                 case '~':
1047                 case '&':
1048                     state = State.PotentialTwinSymbolOperator;
1049                     last = current;
1050                     break;
1051                 case '[':
1052                     op = Operator.Union;
1053                     goto case;
1054                 case ']':
1055                     break L_CharTermLoop;
1056                 case '\\':
1057                     state = State.Escape;
1058                     break;
1059                 default:
1060                     state = State.Char;
1061                     last = current;
1062                 }
1063                 break;
1064             case State.Char:
1065                 // xxx last current xxx
1066                 switch (current)
1067                 {
1068                 case '|':
1069                 case '~':
1070                 case '&':
1071                     // then last is treated as normal char and added as implicit union
1072                     state = State.PotentialTwinSymbolOperator;
1073                     addWithFlags(set, last, re_flags);
1074                     last = current;
1075                     break;
1076                 case '-': // still need more info
1077                     state = State.CharDash;
1078                     break;
1079                 case '\\':
1080                     set |= last;
1081                     state = State.Escape;
1082                     break;
1083                 case '[':
1084                     op = Operator.Union;
1085                     goto case;
1086                 case ']':
1087                     addWithFlags(set, last, re_flags);
1088                     break L_CharTermLoop;
1089                 default:
1090                     state = State.Char;
1091                     addWithFlags(set, last, re_flags);
1092                     last = current;
1093                 }
1094                 break;
1095             case State.PotentialTwinSymbolOperator:
1096                 // xxx last current xxxx
1097                 // where last = [|-&~]
1098                 if (current == last)
1099                 {
1100                     op = twinSymbolOperator(last);
1101                     next();//skip second twin char
1102                     break L_CharTermLoop;
1103                 }
1104                 goto case State.Char;
1105             case State.Escape:
1106                 // xxx \ current xxx
1107                 switch (current)
1108                 {
1109                 case 'f':
1110                     last = '\f';
1111                     state = State.Char;
1112                     break;
1113                 case 'n':
1114                     last = '\n';
1115                     state = State.Char;
1116                     break;
1117                 case 'r':
1118                     last = '\r';
1119                     state = State.Char;
1120                     break;
1121                 case 't':
1122                     last = '\t';
1123                     state = State.Char;
1124                     break;
1125                 case 'v':
1126                     last = '\v';
1127                     state = State.Char;
1128                     break;
1129                 case 'c':
1130                     last = parseControlCode();
1131                     state = State.Char;
1132                     break;
foreach(val;Escapables)1133                 foreach (val; Escapables)
1134                 {
1135                 case val:
1136                 }
1137                     last = current;
1138                     state = State.Char;
1139                     break;
1140                 case 'p':
1141                     set.add(parseUnicodePropertySpec(false));
1142                     state = State.Start;
1143                     continue L_CharTermLoop; //next char already fetched
1144                 case 'P':
1145                     set.add(parseUnicodePropertySpec(true));
1146                     state = State.Start;
1147                     continue L_CharTermLoop; //next char already fetched
1148                 case 'x':
1149                     last = parseUniHex(pat, 2);
1150                     state = State.Char;
1151                     break;
1152                 case 'u':
1153                     last = parseUniHex(pat, 4);
1154                     state = State.Char;
1155                     break;
1156                 case 'U':
1157                     last = parseUniHex(pat, 8);
1158                     state = State.Char;
1159                     break;
1160                 case 'd':
1161                     set.add(unicode.Nd);
1162                     state = State.Start;
1163                     break;
1164                 case 'D':
1165                     set.add(unicode.Nd.inverted);
1166                     state = State.Start;
1167                     break;
1168                 case 's':
1169                     set.add(unicode.White_Space);
1170                     state = State.Start;
1171                     break;
1172                 case 'S':
1173                     set.add(unicode.White_Space.inverted);
1174                     state = State.Start;
1175                     break;
1176                 case 'w':
1177                     set.add(wordCharacter);
1178                     state = State.Start;
1179                     break;
1180                 case 'W':
1181                     set.add(wordCharacter.inverted);
1182                     state = State.Start;
1183                     break;
1184                 default:
1185                     if (current >= privateUseStart && current <= privateUseEnd)
1186                        enforce(false, "no matching ']' found while parsing character class");
1187                     enforce(false, "invalid escape sequence");
1188                 }
1189                 break;
1190             case State.CharDash:
1191                 // xxx last - current xxx
1192                 switch (current)
1193                 {
1194                 case '[':
1195                     op = Operator.Union;
1196                     goto case;
1197                 case ']':
1198                     //means dash is a single char not an interval specifier
1199                     addWithFlags(set, last, re_flags);
1200                     addWithFlags(set, '-', re_flags);
1201                     break L_CharTermLoop;
1202                  case '-'://set Difference again
1203                     addWithFlags(set, last, re_flags);
1204                     op = Operator.Difference;
1205                     next();//skip '-'
1206                     break L_CharTermLoop;
1207                 case '\\':
1208                     state = State.CharDashEscape;
1209                     break;
1210                 default:
1211                     enforce(last <= current, "inverted range");
1212                     if (re_flags & RegexOption.casefold)
1213                     {
1214                         for (uint ch = last; ch <= current; ch++)
1215                             addWithFlags(set, ch, re_flags);
1216                     }
1217                     else
1218                         set.add(last, current + 1);
1219                     state = State.Start;
1220                 }
1221                 break;
1222             case State.CharDashEscape:
1223             //xxx last - \ current xxx
1224                 uint end;
1225                 switch (current)
1226                 {
1227                 case 'f':
1228                     end = '\f';
1229                     break;
1230                 case 'n':
1231                     end = '\n';
1232                     break;
1233                 case 'r':
1234                     end = '\r';
1235                     break;
1236                 case 't':
1237                     end = '\t';
1238                     break;
1239                 case 'v':
1240                     end = '\v';
1241                     break;
foreach(val;Escapables)1242                 foreach (val; Escapables)
1243                 {
1244                 case val:
1245                 }
1246                     end = current;
1247                     break;
1248                 case 'c':
1249                     end = parseControlCode();
1250                     break;
1251                 case 'x':
1252                     end = parseUniHex(pat, 2);
1253                     break;
1254                 case 'u':
1255                     end = parseUniHex(pat, 4);
1256                     break;
1257                 case 'U':
1258                     end = parseUniHex(pat, 8);
1259                     break;
1260                 default:
1261                     if (current >= privateUseStart && current <= privateUseEnd)
1262                        enforce(false, "no matching ']' found while parsing character class");
1263                     error("invalid escape sequence");
1264                 }
1265                 // Lookahead to check if it's a \T
1266                 // where T is sub-pattern terminator in multi-pattern scheme
1267                 if (end == '\\' && !pat.empty)
1268                 {
1269                     if (pat.front >= privateUseStart && pat.front <= privateUseEnd)
1270                         enforce(false, "invalid escape sequence");
1271                 }
1272                 enforce(last <= end,"inverted range");
1273                 set.add(last, end + 1);
1274                 state = State.Start;
1275                 break;
1276             }
1277             enforce(next(), "unexpected end of CodepointSet");
1278         }
1279         return tuple(set, op);
1280     }
1281 
1282     alias ValStack = Stack!(CodepointSet);
1283     alias OpStack = Stack!(Operator);
1284 
1285     //parse and store IR for CodepointSet
parseCharset()1286     void parseCharset()
1287     {
1288         const save = re_flags;
1289         re_flags &= ~RegexOption.freeform; // stop ignoring whitespace if we did
1290         parseCharsetImpl();
1291         re_flags = save;
1292         // Last next() in parseCharsetImp is executed w/o freeform flag
1293         if (re_flags & RegexOption.freeform) skipSpace();
1294     }
1295 
parseCharsetImpl()1296     void parseCharsetImpl()
1297     {
1298         ValStack vstack;
1299         OpStack opstack;
1300         import std.functional : unaryFun;
1301         //
1302         static bool apply(Operator op, ref ValStack stack)
1303         {
1304             switch (op)
1305             {
1306             case Operator.Negate:
1307                 enforce(!stack.empty, "no operand for '^'");
1308                 stack.top = stack.top.inverted;
1309                 break;
1310             case Operator.Union:
1311                 auto s = stack.pop();//2nd operand
1312                 enforce(!stack.empty, "no operand for '||'");
1313                 stack.top.add(s);
1314                 break;
1315             case Operator.Difference:
1316                 auto s = stack.pop();//2nd operand
1317                 enforce(!stack.empty, "no operand for '--'");
1318                 stack.top.sub(s);
1319                 break;
1320             case Operator.SymDifference:
1321                 auto s = stack.pop();//2nd operand
1322                 enforce(!stack.empty, "no operand for '~~'");
1323                 stack.top ~= s;
1324                 break;
1325             case Operator.Intersection:
1326                 auto s = stack.pop();//2nd operand
1327                 enforce(!stack.empty, "no operand for '&&'");
1328                 stack.top.intersect(s);
1329                 break;
1330             default:
1331                 return false;
1332             }
1333             return true;
1334         }
1335         static bool unrollWhile(alias cond)(ref ValStack vstack, ref OpStack opstack)
1336         {
1337             while (cond(opstack.top))
1338             {
1339                 if (!apply(opstack.pop(),vstack))
1340                     return false;//syntax error
1341                 if (opstack.empty)
1342                     return false;
1343             }
1344             return true;
1345         }
1346 
1347         L_CharsetLoop:
1348         do
1349         {
1350             switch (current)
1351             {
1352             case '[':
1353                 opstack.push(Operator.Open);
1354                 enforce(next(), "unexpected end of character class");
1355                 if (current == '^')
1356                 {
1357                     opstack.push(Operator.Negate);
1358                     enforce(next(), "unexpected end of character class");
1359                 }
1360                 else if (current == ']') // []...] is special cased
1361                 {
1362                     enforce(next(), "wrong character set");
1363                     auto pair = parseCharTerm();
1364                     pair[0].add(']', ']'+1);
1365                     if (pair[1] != Operator.None)
1366                     {
1367                         if (opstack.top == Operator.Union)
1368                             unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
1369                         opstack.push(pair[1]);
1370                     }
1371                     vstack.push(pair[0]);
1372                 }
1373                 break;
1374             case ']':
1375                 enforce(unrollWhile!(unaryFun!"a != a.Open")(vstack, opstack),
1376                     "character class syntax error");
1377                 enforce(!opstack.empty, "unmatched ']'");
1378                 opstack.pop();
1379                 if (!next() || opstack.empty)
1380                     break L_CharsetLoop;
1381                 auto pair  = parseCharTerm();
1382                 if (!pair[0].empty)//not only operator e.g. -- or ~~
1383                 {
1384                     vstack.top.add(pair[0]);//apply union
1385                 }
1386                 if (pair[1] != Operator.None)
1387                 {
1388                     if (opstack.top == Operator.Union)
1389                         unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
1390                     opstack.push(pair[1]);
1391                 }
1392                 break;
1393             //
1394             default://yet another pair of term(op)?
1395                 auto pair = parseCharTerm();
1396                 if (pair[1] != Operator.None)
1397                 {
1398                     if (opstack.top == Operator.Union)
1399                         unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
1400                     opstack.push(pair[1]);
1401                 }
1402                 vstack.push(pair[0]);
1403             }
1404         }while (!empty || !opstack.empty);
1405         while (!opstack.empty)
1406         {
1407             enforce(opstack.top != Operator.Open,
1408                 "no matching ']' found while parsing character class");
1409             apply(opstack.pop(), vstack);
1410         }
1411         assert(vstack.length == 1);
1412         g.charsetToIr(vstack.top);
1413     }
1414 
1415     //parse and generate IR for escape stand alone escape sequence
parseEscape()1416     @trusted void parseEscape()
1417     {//accesses array of appender
1418         import std.algorithm.iteration : sum;
1419         switch (current)
1420         {
1421         case 'f':   next(); g.put(Bytecode(IR.Char, '\f')); break;
1422         case 'n':   next(); g.put(Bytecode(IR.Char, '\n')); break;
1423         case 'r':   next(); g.put(Bytecode(IR.Char, '\r')); break;
1424         case 't':   next(); g.put(Bytecode(IR.Char, '\t')); break;
1425         case 'v':   next(); g.put(Bytecode(IR.Char, '\v')); break;
1426 
1427         case 'd':
1428             next();
1429             g.charsetToIr(unicode.Nd);
1430             break;
1431         case 'D':
1432             next();
1433             g.charsetToIr(unicode.Nd.inverted);
1434             break;
1435         case 'b':   next(); g.put(Bytecode(IR.Wordboundary, 0)); break;
1436         case 'B':   next(); g.put(Bytecode(IR.Notwordboundary, 0)); break;
1437         case 's':
1438             next();
1439             g.charsetToIr(unicode.White_Space);
1440             break;
1441         case 'S':
1442             next();
1443             g.charsetToIr(unicode.White_Space.inverted);
1444             break;
1445         case 'w':
1446             next();
1447             g.charsetToIr(wordCharacter);
1448             break;
1449         case 'W':
1450             next();
1451             g.charsetToIr(wordCharacter.inverted);
1452             break;
1453         case 'p': case 'P':
1454             auto CodepointSet = parseUnicodePropertySpec(current == 'P');
1455             g.charsetToIr(CodepointSet);
1456             break;
1457         case 'x':
1458             immutable code = parseUniHex(pat, 2);
1459             next();
1460             g.put(Bytecode(IR.Char,code));
1461             break;
1462         case 'u': case 'U':
1463             immutable code = parseUniHex(pat, current == 'u' ? 4 : 8);
1464             next();
1465             g.put(Bytecode(IR.Char, code));
1466             break;
1467         case 'c': //control codes
1468             Bytecode code = Bytecode(IR.Char, parseControlCode());
1469             next();
1470             g.put(code);
1471             break;
1472         case '0':
1473             next();
1474             g.put(Bytecode(IR.Char, 0));//NUL character
1475             break;
1476         case '1': .. case '9':
1477             uint nref = cast(uint) current - '0';
1478             immutable maxBackref = sum(g.groupStack.data);
1479             enforce(nref < maxBackref, "Backref to unseen group");
1480             //perl's disambiguation rule i.e.
1481             //get next digit only if there is such group number
1482             while (nref < maxBackref && next() && std.ascii.isDigit(current))
1483             {
1484                 nref = nref * 10 + current - '0';
1485             }
1486             if (nref >= maxBackref)
1487                 nref /= 10;
1488             enforce(!g.isOpenGroup(nref), "Backref to open group");
1489             uint localLimit = maxBackref - g.groupStack.top;
1490             if (nref >= localLimit)
1491             {
1492                 g.put(Bytecode(IR.Backref, nref-localLimit));
1493                 g.ir[$-1].setLocalRef();
1494             }
1495             else
1496                 g.put(Bytecode(IR.Backref, nref));
1497             g.markBackref(nref);
1498             break;
1499         default:
1500             // Lookahead to check if it's a \T
1501             // where T is sub-pattern terminator in multi-pattern scheme
1502             if (current == '\\' && !pat.empty)
1503             {
1504                 if (pat.front >= privateUseStart && current <= privateUseEnd)
1505                     enforce(false, "invalid escape sequence");
1506             }
1507             if (current >= privateUseStart && current <= privateUseEnd)
1508             {
1509                 g.endPattern(current - privateUseStart + 1);
1510                 break;
1511             }
1512             auto op = Bytecode(IR.Char, current);
1513             next();
1514             g.put(op);
1515         }
1516     }
1517 
1518     //parse and return a CodepointSet for \p{...Property...} and \P{...Property..},
1519     //\ - assumed to be processed, p - is current
parseUnicodePropertySpec(bool negated)1520     CodepointSet parseUnicodePropertySpec(bool negated)
1521     {
1522         enum MAX_PROPERTY = 128;
1523         char[MAX_PROPERTY] result;
1524         uint k = 0;
1525         enforce(next(), "eof parsing unicode property spec");
1526         if (current == '{')
1527         {
1528             while (k < MAX_PROPERTY && next() && current !='}' && current !=':')
1529                 if (current != '-' && current != ' ' && current != '_')
1530                     result[k++] = cast(char) std.ascii.toLower(current);
1531             enforce(k != MAX_PROPERTY, "invalid property name");
1532             enforce(current == '}', "} expected ");
1533         }
1534         else
1535         {//single char properties e.g.: \pL, \pN ...
1536             enforce(current < 0x80, "invalid property name");
1537             result[k++] = cast(char) current;
1538         }
1539         auto s = getUnicodeSet(result[0 .. k], negated,
1540             cast(bool)(re_flags & RegexOption.casefold));
1541         enforce(!s.empty, "unrecognized unicode property spec");
1542         next();
1543         return s;
1544     }
1545 
1546     //
error(string msg)1547     @trusted void error(string msg)
1548     {
1549         import std.array : appender;
1550         import std.format : formattedWrite;
1551         auto app = appender!string();
1552         formattedWrite(app, "%s\nPattern with error: `%s` <--HERE-- `%s`",
1553                        msg, origin[0..$-pat.length], pat);
1554         throw new RegexException(app.data);
1555     }
1556 
1557     alias Char = BasicElementOf!R;
1558 
program()1559     @property program()
1560     {
1561         return makeRegex(this);
1562     }
1563 }
1564 
1565 /+
1566     Postproces the IR, then optimize.
1567 +/
postprocess(Char)1568 @trusted void postprocess(Char)(ref Regex!Char zis)
1569 {//@@@BUG@@@ write is @system
1570     with(zis)
1571     {
1572         struct FixedStack(T)
1573         {
1574             T[] arr;
1575             uint _top;
1576             //this(T[] storage){   arr = storage; _top = -1; }
1577             @property ref T top(){  assert(!empty); return arr[_top]; }
1578             void push(T x){  arr[++_top] = x; }
1579             T pop() { assert(!empty);   return arr[_top--]; }
1580             @property bool empty(){   return _top == -1; }
1581         }
1582         auto counterRange = FixedStack!uint(new uint[maxCounterDepth+1], -1);
1583         counterRange.push(1);
1584         ulong cumRange = 0;
1585         for (uint i = 0; i < ir.length; i += ir[i].length)
1586         {
1587             if (ir[i].hotspot)
1588             {
1589                 assert(i + 1 < ir.length,
1590                     "unexpected end of IR while looking for hotspot");
1591                 ir[i+1] = Bytecode.fromRaw(hotspotTableSize);
1592                 hotspotTableSize += counterRange.top;
1593             }
1594             switch (ir[i].code)
1595             {
1596             case IR.RepeatStart, IR.RepeatQStart:
1597                 uint repEnd = cast(uint)(i + ir[i].data + IRL!(IR.RepeatStart));
1598                 assert(ir[repEnd].code == ir[i].paired.code);
1599                 immutable max = ir[repEnd + 4].raw;
1600                 ir[repEnd+2].raw = counterRange.top;
1601                 ir[repEnd+3].raw *= counterRange.top;
1602                 ir[repEnd+4].raw *= counterRange.top;
1603                 ulong cntRange = cast(ulong)(max)*counterRange.top;
1604                 cumRange += cntRange;
1605                 enforce(cumRange < maxCumulativeRepetitionLength,
1606                     "repetition length limit is exceeded");
1607                 counterRange.push(cast(uint) cntRange + counterRange.top);
1608                 threadCount += counterRange.top;
1609                 break;
1610             case IR.RepeatEnd, IR.RepeatQEnd:
1611                 threadCount += counterRange.top;
1612                 counterRange.pop();
1613                 break;
1614             case IR.GroupStart:
1615                 if (isBackref(ir[i].data))
1616                     ir[i].setBackrefence();
1617                 threadCount += counterRange.top;
1618                 break;
1619             case IR.GroupEnd:
1620                 if (isBackref(ir[i].data))
1621                     ir[i].setBackrefence();
1622                 threadCount += counterRange.top;
1623                 break;
1624             default:
1625                 threadCount += counterRange.top;
1626             }
1627         }
1628         checkIfOneShot();
1629         if (!(flags & RegexInfo.oneShot))
1630             kickstart = Kickstart!Char(zis, new uint[](256));
1631         debug(std_regex_allocation) writefln("IR processed, max threads: %d", threadCount);
1632         optimize(zis);
1633     }
1634 }
1635 
fixupBytecode()1636 void fixupBytecode()(Bytecode[] ir)
1637 {
1638     Stack!uint fixups;
1639 
1640     with(IR) for (uint i=0; i<ir.length; i+= ir[i].length)
1641     {
1642         if (ir[i].isStart || ir[i].code == Option)
1643             fixups.push(i);
1644         else if (ir[i].code == OrEnd)
1645         {
1646             // Alternatives need more care
1647             auto j = fixups.pop(); // last Option
1648             ir[j].data = i -  j - ir[j].length;
1649             j = fixups.pop(); // OrStart
1650             ir[j].data = i - j - ir[j].length;
1651             ir[i].data = ir[j].data;
1652 
1653             // fixup all GotoEndOrs
1654             j = j + IRL!(OrStart);
1655             assert(ir[j].code == Option);
1656             for (;;)
1657             {
1658                 auto next = j + ir[j].data + IRL!(Option);
1659                 if (ir[next].code == IR.OrEnd)
1660                     break;
1661                 ir[next - IRL!(GotoEndOr)].data = i - next;
1662                 j = next;
1663             }
1664         }
1665         else if (ir[i].code == GotoEndOr)
1666         {
1667             auto j = fixups.pop(); // Option
1668             ir[j].data = i - j + IRL!(GotoEndOr)- IRL!(Option); // to the next option
1669         }
1670         else if (ir[i].isEnd)
1671         {
1672             auto j = fixups.pop();
1673             ir[i].data = i - j - ir[j].length;
1674             ir[j].data = ir[i].data;
1675         }
1676     }
1677     assert(fixups.empty);
1678 }
1679 
optimize(Char)1680 void optimize(Char)(ref Regex!Char zis)
1681 {
1682     import std.array : insertInPlace;
1683     CodepointSet nextSet(uint idx)
1684     {
1685         CodepointSet set;
1686         with(zis) with(IR)
1687     Outer:
1688         for (uint i = idx; i < ir.length; i += ir[i].length)
1689         {
1690             switch (ir[i].code)
1691             {
1692                 case Char:
1693                     set.add(ir[i].data, ir[i].data+1);
1694                     goto default;
1695                 //TODO: OrChar
1696                 case Trie, CodepointSet:
1697                     set = zis.charsets[ir[i].data];
1698                     goto default;
1699                 case GroupStart,GroupEnd:
1700                     break;
1701                 default:
1702                     break Outer;
1703             }
1704         }
1705         return set;
1706     }
1707 
1708     with(zis) with(IR) for (uint i = 0; i < ir.length; i += ir[i].length)
1709     {
1710         if (ir[i].code == InfiniteEnd)
1711         {
1712             auto set = nextSet(i+IRL!(InfiniteEnd));
1713             if (!set.empty && set.length < 10_000)
1714             {
1715                 ir[i] = Bytecode(InfiniteBloomEnd, ir[i].data);
1716                 ir[i - ir[i].data - IRL!(InfiniteStart)] =
1717                     Bytecode(InfiniteBloomStart, ir[i].data);
1718                 ir.insertInPlace(i+IRL!(InfiniteEnd),
1719                     Bytecode.fromRaw(cast(uint) zis.filters.length));
1720                 zis.filters ~= BitTable(set);
1721                 fixupBytecode(ir);
1722             }
1723         }
1724     }
1725 }
1726 
1727 //IR code validator - proper nesting, illegal instructions, etc.
validateRe(Char)1728 @trusted void validateRe(Char)(ref Regex!Char zis)
1729 {//@@@BUG@@@ text is @system
1730     import std.conv : text;
1731     with(zis)
1732     {
1733         for (uint pc = 0; pc < ir.length; pc += ir[pc].length)
1734         {
1735             if (ir[pc].isStart || ir[pc].isEnd)
1736             {
1737                 immutable dest = ir[pc].indexOfPair(pc);
1738                 assert(dest < ir.length, text("Wrong length in opcode at pc=",
1739                     pc, " ", dest, " vs ", ir.length));
1740                 assert(ir[dest].paired ==  ir[pc],
1741                     text("Wrong pairing of opcodes at pc=", pc, "and pc=", dest));
1742             }
1743             else if (ir[pc].isAtom)
1744             {
1745 
1746             }
1747             else
1748                assert(0, text("Unknown type of instruction at pc=", pc));
1749         }
1750     }
1751 }
1752