1 /*
2     Implementation of std.regex IR, an intermediate representation
3     of a regular expression pattern.
4 
5     This is a common ground between frontend regex component (parser)
6     and backend components - generators, matchers and other "filters".
7 */
8 module std.regex.internal.ir;
9 
10 package(std.regex):
11 
12 import std.exception, std.meta, std.range.primitives, std.traits, std.uni;
13 
14 debug(std_regex_parser) import std.stdio;
15 // just a common trait, may be moved elsewhere
16 alias BasicElementOf(Range) = Unqual!(ElementEncodingType!Range);
17 
18 enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD';
19 
20 // heuristic value determines maximum CodepointSet length suitable for linear search
21 enum maxCharsetUsed = 6;
22 
23 // another variable to tweak behavior of caching generated Tries for character classes
24 enum maxCachedMatchers = 8;
25 
26 alias Trie = CodepointSetTrie!(13, 8);
27 alias makeTrie = codepointSetTrie!(13, 8);
28 
29 CharMatcher[CodepointSet] matcherCache;
30 
31 //accessor with caching
getMatcher(CodepointSet set)32 @trusted CharMatcher getMatcher(CodepointSet set)
33 {// @@@BUG@@@ 6357 almost all properties of AA are not @safe
34     if (__ctfe || maxCachedMatchers == 0)
35         return CharMatcher(set);
36     else
37     {
38         auto p = set in matcherCache;
39         if (p)
40             return *p;
41         if (matcherCache.length == maxCachedMatchers)
42         {
43             // flush enmatchers in trieCache
44             matcherCache = null;
45         }
46         return (matcherCache[set] = CharMatcher(set));
47     }
48 }
49 
memoizeExpr(string expr)50 @trusted auto memoizeExpr(string expr)()
51 {
52     if (__ctfe)
53         return mixin(expr);
54     alias T = typeof(mixin(expr));
55     static T slot;
56     static bool initialized;
57     if (!initialized)
58     {
59         slot =  mixin(expr);
60         initialized = true;
61     }
62     return slot;
63 }
64 
65 //property for \w character class
wordCharacter()66 @property CodepointSet wordCharacter()
67 {
68     return memoizeExpr!("unicode.Alphabetic | unicode.Mn | unicode.Mc
69         | unicode.Me | unicode.Nd | unicode.Pc")();
70 }
71 
wordMatcher()72 @property CharMatcher wordMatcher()
73 {
74     return memoizeExpr!("CharMatcher(wordCharacter)")();
75 }
76 
77 // some special Unicode white space characters
78 private enum NEL = '\u0085', LS = '\u2028', PS = '\u2029';
79 
80 // Characters that need escaping in string posed as regular expressions
81 alias Escapables = AliasSeq!('[', ']', '\\', '^', '$', '.', '|', '?', ',', '-',
82     ';', ':', '#', '&', '%', '/', '<', '>', '`',  '*', '+', '(', ')', '{', '}',  '~');
83 
84 //Regular expression engine/parser options:
85 // global - search  all nonoverlapping matches in input
86 // casefold - case insensitive matching, do casefolding on match in unicode mode
87 // freeform - ignore whitespace in pattern, to match space use [ ] or \s
88 // multiline - switch  ^, $ detect start and end of linesinstead of just start and end of input
89 enum RegexOption: uint {
90     global = 0x1,
91     casefold = 0x2,
92     freeform = 0x4,
93     nonunicode = 0x8,
94     multiline = 0x10,
95     singleline = 0x20
96 }
97 //do not reorder this list
98 alias RegexOptionNames = AliasSeq!('g', 'i', 'x', 'U', 'm', 's');
99 static assert( RegexOption.max < 0x80);
100 // flags that allow guide execution of engine
101 enum RegexInfo : uint { oneShot = 0x80 }
102 
103 // IR bit pattern: 0b1_xxxxx_yy
104 // where yy indicates class of instruction, xxxxx for actual operation code
105 //     00: atom, a normal instruction
106 //     01: open, opening of a group, has length of contained IR in the low bits
107 //     10: close, closing of a group, has length of contained IR in the low bits
108 //     11 unused
109 //
110 // Loops with Q (non-greedy, with ? mark) must have the same size / other properties as non Q version
111 // Possible changes:
112 //* merge group, option, infinite/repeat start (to never copy during parsing of (a|b){1,2})
113 //* reorganize groups to make n args easier to find, or simplify the check for groups of similar ops
114 //  (like lookaround), or make it easier to identify hotspots.
115 
116 enum IR:uint {
117     Char              = 0b1_00000_00, //a character
118     Any               = 0b1_00001_00, //any character
119     CodepointSet      = 0b1_00010_00, //a most generic CodepointSet [...]
120     Trie              = 0b1_00011_00, //CodepointSet implemented as Trie
121     //match with any of a consecutive OrChar's in this sequence
122     //(used for case insensitive match)
123     //OrChar holds in upper two bits of data total number of OrChars in this _sequence_
124     //the drawback of this representation is that it is difficult
125     // to detect a jump in the middle of it
126     OrChar             = 0b1_00100_00,
127     Nop                = 0b1_00101_00, //no operation (padding)
128     End                = 0b1_00110_00, //end of program
129     Bol                = 0b1_00111_00, //beginning of a line ^
130     Eol                = 0b1_01000_00, //end of a line $
131     Wordboundary       = 0b1_01001_00, //boundary of a word
132     Notwordboundary    = 0b1_01010_00, //not a word boundary
133     Backref            = 0b1_01011_00, //backreference to a group (that has to be pinned, i.e. locally unique) (group index)
134     GroupStart         = 0b1_01100_00, //start of a group (x) (groupIndex+groupPinning(1bit))
135     GroupEnd           = 0b1_01101_00, //end of a group (x) (groupIndex+groupPinning(1bit))
136     Option             = 0b1_01110_00, //start of an option within an alternation x | y (length)
137     GotoEndOr          = 0b1_01111_00, //end of an option (length of the rest)
138     Bof                = 0b1_10000_00, //begining of "file" (string) ^
139     Eof                = 0b1_10001_00, //end of "file" (string) $
140     //... any additional atoms here
141 
142     OrStart            = 0b1_00000_01, //start of alternation group  (length)
143     OrEnd              = 0b1_00000_10, //end of the or group (length,mergeIndex)
144     //with this instruction order
145     //bit mask 0b1_00001_00 could be used to test/set greediness
146     InfiniteStart      = 0b1_00001_01, //start of an infinite repetition x* (length)
147     InfiniteEnd        = 0b1_00001_10, //end of infinite repetition x* (length,mergeIndex)
148     InfiniteQStart     = 0b1_00010_01, //start of a non eager infinite repetition x*? (length)
149     InfiniteQEnd       = 0b1_00010_10, //end of non eager infinite repetition x*? (length,mergeIndex)
150     InfiniteBloomStart = 0b1_00011_01, //start of an filtered infinite repetition x* (length)
151     InfiniteBloomEnd   = 0b1_00011_10, //end of filtered infinite repetition x* (length,mergeIndex)
152     RepeatStart        = 0b1_00100_01, //start of a {n,m} repetition (length)
153     RepeatEnd          = 0b1_00100_10, //end of x{n,m} repetition (length,step,minRep,maxRep)
154     RepeatQStart       = 0b1_00101_01, //start of a non eager x{n,m}? repetition (length)
155     RepeatQEnd         = 0b1_00101_10, //end of non eager x{n,m}? repetition (length,step,minRep,maxRep)
156 
157     //
158     LookaheadStart     = 0b1_00110_01, //begin of the lookahead group (length)
159     LookaheadEnd       = 0b1_00110_10, //end of a lookahead group (length)
160     NeglookaheadStart  = 0b1_00111_01, //start of a negative lookahead (length)
161     NeglookaheadEnd    = 0b1_00111_10, //end of a negative lookahead (length)
162     LookbehindStart    = 0b1_01000_01, //start of a lookbehind (length)
163     LookbehindEnd      = 0b1_01000_10, //end of a lookbehind (length)
164     NeglookbehindStart = 0b1_01001_01, //start of a negative lookbehind (length)
165     NeglookbehindEnd   = 0b1_01001_10, //end of negative lookbehind (length)
166 }
167 
168 //a shorthand for IR length - full length of specific opcode evaluated at compile time
IRL(IR code)169 template IRL(IR code)
170 {
171     enum uint IRL =  lengthOfIR(code);
172 }
173 static assert(IRL!(IR.LookaheadStart) == 3);
174 
175 //how many parameters follow the IR, should be optimized fixing some IR bits
immediateParamsIR(IR i)176 int immediateParamsIR(IR i){
177     switch (i)
178     {
179     case IR.OrEnd,IR.InfiniteEnd,IR.InfiniteQEnd:
180         return 1;  // merge table index
181     case IR.InfiniteBloomEnd:
182         return 2;  // bloom filter index + merge table index
183     case IR.RepeatEnd, IR.RepeatQEnd:
184         return 4;
185     case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
186         return 2;  // start-end of captures used
187     default:
188         return 0;
189     }
190 }
191 
192 //full length of IR instruction inlcuding all parameters that might follow it
lengthOfIR(IR i)193 int lengthOfIR(IR i)
194 {
195     return 1 + immediateParamsIR(i);
196 }
197 
198 //full length of the paired IR instruction inlcuding all parameters that might follow it
lengthOfPairedIR(IR i)199 int lengthOfPairedIR(IR i)
200 {
201     return 1 + immediateParamsIR(pairedIR(i));
202 }
203 
204 //if the operation has a merge point (this relies on the order of the ops)
hasMerge(IR i)205 bool hasMerge(IR i)
206 {
207     return (i&0b11)==0b10 && i <= IR.RepeatQEnd;
208 }
209 
210 //is an IR that opens a "group"
isStartIR(IR i)211 bool isStartIR(IR i)
212 {
213     return (i&0b11)==0b01;
214 }
215 
216 //is an IR that ends a "group"
isEndIR(IR i)217 bool isEndIR(IR i)
218 {
219     return (i&0b11)==0b10;
220 }
221 
222 //is a standalone IR
isAtomIR(IR i)223 bool isAtomIR(IR i)
224 {
225     return (i&0b11)==0b00;
226 }
227 
228 //makes respective pair out of IR i, swapping start/end bits of instruction
pairedIR(IR i)229 IR pairedIR(IR i)
230 {
231     assert(isStartIR(i) || isEndIR(i));
232     return cast(IR)(i ^ 0b11);
233 }
234 
235 //encoded IR instruction
236 struct Bytecode
237 {
238     uint raw;
239     //natural constraints
240     enum maxSequence = 2+4;
241     enum maxData = 1 << 22;
242     enum maxRaw = 1 << 31;
243 
thisBytecode244     this(IR code, uint data)
245     {
246         assert(data < (1 << 22) && code < 256);
247         raw = code << 24 | data;
248     }
249 
thisBytecode250     this(IR code, uint data, uint seq)
251     {
252         assert(data < (1 << 22) && code < 256 );
253         assert(seq >= 2 && seq < maxSequence);
254         raw = code << 24 | (seq - 2)<<22 | data;
255     }
256 
257     //store raw data
fromRawBytecode258     static Bytecode fromRaw(uint data)
259     {
260         Bytecode t;
261         t.raw = data;
262         return t;
263     }
264 
265     //bit twiddling helpers
266     //0-arg template due to @@@BUG@@@ 10985
dataBytecode267     @property uint data()() const { return raw & 0x003f_ffff; }
268 
dataBytecode269     @property void data()(uint val)
270     {
271         raw = (raw & ~0x003f_ffff) | (val & 0x003f_ffff);
272     }
273 
274     //ditto
275     //0-arg template due to @@@BUG@@@ 10985
sequenceBytecode276     @property uint sequence()() const { return 2 + (raw >> 22 & 0x3); }
277 
278     //ditto
279     //0-arg template due to @@@BUG@@@ 10985
codeBytecode280     @property IR code()() const { return cast(IR)(raw >> 24); }
281 
282     //ditto
hotspotBytecode283     @property bool hotspot() const { return hasMerge(code); }
284 
285     //test the class of this instruction
isAtomBytecode286     @property bool isAtom() const { return isAtomIR(code); }
287 
288     //ditto
isStartBytecode289     @property bool isStart() const { return isStartIR(code); }
290 
291     //ditto
isEndBytecode292     @property bool isEnd() const { return isEndIR(code); }
293 
294     //number of arguments for this instruction
argsBytecode295     @property int args() const { return immediateParamsIR(code); }
296 
297     //mark this GroupStart or GroupEnd as referenced in backreference
setBackrefenceBytecode298     void setBackrefence()
299     {
300         assert(code == IR.GroupStart || code == IR.GroupEnd);
301         raw = raw | 1 << 23;
302     }
303 
304     //is referenced
backreferenceBytecode305     @property bool backreference() const
306     {
307         assert(code == IR.GroupStart || code == IR.GroupEnd);
308         return cast(bool)(raw & 1 << 23);
309     }
310 
311     //mark as local reference (for backrefs in lookarounds)
setLocalRefBytecode312     void setLocalRef()
313     {
314         assert(code == IR.Backref);
315         raw = raw | 1 << 23;
316     }
317 
318     //is a local ref
localRefBytecode319     @property bool localRef() const
320     {
321         assert(code == IR.Backref);
322         return cast(bool)(raw & 1 << 23);
323     }
324 
325     //human readable name of instruction
mnemonicBytecode326     @trusted @property string mnemonic()() const
327     {//@@@BUG@@@ to is @system
328         import std.conv : to;
329         return to!string(code);
330     }
331 
332     //full length of instruction
lengthBytecode333     @property uint length() const
334     {
335         return lengthOfIR(code);
336     }
337 
338     //full length of respective start/end of this instruction
pairedLengthBytecode339     @property uint pairedLength() const
340     {
341         return lengthOfPairedIR(code);
342     }
343 
344     //returns bytecode of paired instruction (assuming this one is start or end)
pairedBytecode345     @property Bytecode paired() const
346     {//depends on bit and struct layout order
347         assert(isStart || isEnd);
348         return Bytecode.fromRaw(raw ^ 0b11 << 24);
349     }
350 
351     //gets an index into IR block of the respective pair
indexOfPairBytecode352     uint indexOfPair(uint pc) const
353     {
354         assert(isStart || isEnd);
355         return isStart ? pc + data + length  : pc - data - lengthOfPairedIR(code);
356     }
357 }
358 
359 static assert(Bytecode.sizeof == 4);
360 
361 
362 //index entry structure for name --> number of submatch
363 struct NamedGroup
364 {
365     string name;
366     uint group;
367 }
368 
369 //holds pair of start-end markers for a submatch
Group(DataIndex)370 struct Group(DataIndex)
371 {
372     DataIndex begin, end;
373     @trusted string toString()() const
374     {
375         import std.array : appender;
376         import std.format : formattedWrite;
377         auto a = appender!string();
378         formattedWrite(a, "%s..%s", begin, end);
379         return a.data;
380     }
381 }
382 
383 //debugging tool, prints out instruction along with opcodes
384 @trusted string disassemble(in Bytecode[] irb, uint pc, in NamedGroup[] dict=[])
385 {
386     import std.array : appender;
387     import std.format : formattedWrite;
388     auto output = appender!string();
389     formattedWrite(output,"%s", irb[pc].mnemonic);
390     switch (irb[pc].code)
391     {
392     case IR.Char:
393         formattedWrite(output, " %s (0x%x)",cast(dchar) irb[pc].data, irb[pc].data);
394         break;
395     case IR.OrChar:
396         formattedWrite(output, " %s (0x%x) seq=%d", cast(dchar) irb[pc].data, irb[pc].data, irb[pc].sequence);
397         break;
398     case IR.RepeatStart, IR.InfiniteStart, IR.InfiniteBloomStart,
399     IR.Option, IR.GotoEndOr, IR.OrStart:
400         //forward-jump instructions
401         uint len = irb[pc].data;
402         formattedWrite(output, " pc=>%u", pc+len+IRL!(IR.RepeatStart));
403         break;
404     case IR.RepeatEnd, IR.RepeatQEnd: //backward-jump instructions
405         uint len = irb[pc].data;
406         formattedWrite(output, " pc=>%u min=%u max=%u step=%u",
407             pc - len, irb[pc + 3].raw, irb[pc + 4].raw, irb[pc + 2].raw);
408         break;
409     case IR.InfiniteEnd, IR.InfiniteQEnd, IR.InfiniteBloomEnd, IR.OrEnd: //ditto
410         uint len = irb[pc].data;
411         formattedWrite(output, " pc=>%u", pc-len);
412         break;
413     case  IR.LookaheadEnd, IR.NeglookaheadEnd: //ditto
414         uint len = irb[pc].data;
415         formattedWrite(output, " pc=>%u", pc-len);
416         break;
417     case IR.GroupStart, IR.GroupEnd:
418         uint n = irb[pc].data;
419         string name;
420         foreach (v;dict)
421             if (v.group == n)
422             {
423                 name = "'"~v.name~"'";
424                 break;
425             }
426         formattedWrite(output, " %s #%u " ~ (irb[pc].backreference ? "referenced" : ""),
427                 name, n);
428         break;
429     case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
430         uint len = irb[pc].data;
431         uint start = irb[pc+1].raw, end = irb[pc+2].raw;
432         formattedWrite(output, " pc=>%u [%u..%u]", pc + len + IRL!(IR.LookaheadStart), start, end);
433         break;
434     case IR.Backref: case IR.CodepointSet: case IR.Trie:
435         uint n = irb[pc].data;
436         formattedWrite(output, " %u",  n);
437         if (irb[pc].code == IR.Backref)
438             formattedWrite(output, " %s", irb[pc].localRef ? "local" : "global");
439         break;
440     default://all data-free instructions
441     }
442     if (irb[pc].hotspot)
443         formattedWrite(output, " Hotspot %u", irb[pc+1].raw);
444     return output.data;
445 }
446 
447 //disassemble the whole chunk
printBytecode()448 @trusted void printBytecode()(in Bytecode[] slice, in NamedGroup[] dict=[])
449 {
450     import std.stdio : writeln;
451     for (uint pc=0; pc<slice.length; pc += slice[pc].length)
452         writeln("\t", disassemble(slice, pc, dict));
453 }
454 
455 /++
456     $(D Regex) object holds regular expression pattern in compiled form.
457     Instances of this object are constructed via calls to $(D regex).
458     This is an intended form for caching and storage of frequently
459     used regular expressions.
460 +/
Regex(Char)461 struct Regex(Char)
462 {
463     //temporary workaround for identifier lookup
464     CodepointSet[] charsets; //
465     Bytecode[] ir;      //compiled bytecode of pattern
466 
467 
468     @safe @property bool empty() const nothrow {  return ir is null; }
469 
470     @safe @property auto namedCaptures()
471     {
472         static struct NamedGroupRange
473         {
474         private:
475             NamedGroup[] groups;
476             size_t start;
477             size_t end;
478         public:
479             this(NamedGroup[] g, size_t s, size_t e)
480             {
481                 assert(s <= e);
482                 assert(e <= g.length);
483                 groups = g;
484                 start = s;
485                 end = e;
486             }
487 
488             @property string front() { return groups[start].name; }
489             @property string back() { return groups[end-1].name; }
490             @property bool empty() { return start >= end; }
491             @property size_t length() { return end - start; }
492             alias opDollar = length;
493             @property NamedGroupRange save()
494             {
495                 return NamedGroupRange(groups, start, end);
496             }
497             void popFront() { assert(!empty); start++; }
498             void popBack() { assert(!empty); end--; }
499             string opIndex()(size_t i)
500             {
501                 assert(start + i < end,
502                        "Requested named group is out of range.");
503                 return groups[start+i].name;
504             }
505             NamedGroupRange opSlice(size_t low, size_t high) {
506                 assert(low <= high);
507                 assert(start + high <= end);
508                 return NamedGroupRange(groups, start + low, start + high);
509             }
510             NamedGroupRange opSlice() { return this.save; }
511         }
512         return NamedGroupRange(dict, 0, dict.length);
513     }
514 
515 package(std.regex):
516     import std.regex.internal.kickstart : Kickstart; //TODO: get rid of this dependency
517     NamedGroup[] dict;                     // maps name -> user group number
518     uint ngroup;                           // number of internal groups
519     uint maxCounterDepth;                  // max depth of nested {n,m} repetitions
520     uint hotspotTableSize;                 // number of entries in merge table
521     uint threadCount;                      // upper bound on number of Thompson VM threads
522     uint flags;                            // global regex flags
523     public const(CharMatcher)[]  matchers; // tables that represent character sets
524     public const(BitTable)[] filters;      // bloom filters for conditional loops
525     uint[] backrefed;                      // bit array of backreferenced submatches
526     Kickstart!Char kickstart;
527 
528     //bit access helper
529     uint isBackref(uint n)
530     {
531         if (n/32 >= backrefed.length)
532             return 0;
533         return backrefed[n / 32] & (1 << (n & 31));
534     }
535 
536     //check if searching is not needed
537     void checkIfOneShot()
538     {
539     L_CheckLoop:
540         for (uint i = 0; i < ir.length; i += ir[i].length)
541         {
542             switch (ir[i].code)
543             {
544                 case IR.Bof:
545                     flags |= RegexInfo.oneShot;
546                     break L_CheckLoop;
547                 case IR.GroupStart, IR.GroupEnd, IR.Bol, IR.Eol, IR.Eof,
548                 IR.Wordboundary, IR.Notwordboundary:
549                     break;
550                 default:
551                     break L_CheckLoop;
552             }
553         }
554     }
555 
556     //print out disassembly a program's IR
557     @trusted debug(std_regex_parser) void print() const
558     {//@@@BUG@@@ write is system
559         for (uint i = 0; i < ir.length; i += ir[i].length)
560         {
561             writefln("%d\t%s ", i, disassemble(ir, i, dict));
562         }
563         writeln("Total merge table size: ", hotspotTableSize);
564         writeln("Max counter nesting depth: ", maxCounterDepth);
565     }
566 
567 }
568 
569 //@@@BUG@@@ (unreduced) - public makes it inaccessible in std.regex.package (!)
StaticRegex(Char)570 /*public*/ struct StaticRegex(Char)
571 {
572 package(std.regex):
573     import std.regex.internal.backtracking : BacktrackingMatcher;
574     alias Matcher = BacktrackingMatcher!(true);
575     alias MatchFn = bool function(ref Matcher!Char) @trusted;
576     MatchFn nativeFn;
577 public:
578     Regex!Char _regex;
579     alias _regex this;
580     this(Regex!Char re, MatchFn fn)
581     {
582         _regex = re;
583         nativeFn = fn;
584 
585     }
586 
587 }
588 
589 // The stuff below this point is temporarrily part of IR module
590 // but may need better place in the future (all internals)
591 package(std.regex):
592 
593 //Simple UTF-string abstraction compatible with stream interface
594 struct Input(Char)
595 if (is(Char :dchar))
596 {
597     import std.utf : decode;
598     alias DataIndex = size_t;
599     enum bool isLoopback = false;
600     alias String = const(Char)[];
601     String _origin;
602     size_t _index;
603 
604     //constructs Input object out of plain string
605     this(String input, size_t idx = 0)
606     {
607         _origin = input;
608         _index = idx;
609     }
610 
611     //codepoint at current stream position
pragma(inline,true)612     pragma(inline, true) bool nextChar(ref dchar res, ref size_t pos)
613     {
614         pos = _index;
615         // DMD's inliner hates multiple return functions
616         // but can live with single statement if/else bodies
617         bool n = !(_index == _origin.length);
618         if (n)
619             res = decode(_origin, _index);
620         return n;
621     }
atEnd()622     @property bool atEnd(){
623         return _index == _origin.length;
624     }
search(Kickstart)625     bool search(Kickstart)(ref Kickstart kick, ref dchar res, ref size_t pos)
626     {
627         size_t idx = kick.search(_origin, _index);
628         _index = idx;
629         return nextChar(res, pos);
630     }
631 
632     //index of at End position
lastIndex()633     @property size_t lastIndex(){   return _origin.length; }
634 
635     //support for backtracker engine, might not be present
reset(size_t index)636     void reset(size_t index){   _index = index;  }
637 
opSlice(size_t start,size_t end)638     String opSlice(size_t start, size_t end){   return _origin[start .. end]; }
639 
loopBack(size_t index)640     auto loopBack(size_t index){   return BackLooper!Input(this, index); }
641 }
642 
BackLooperImpl(Input)643 struct BackLooperImpl(Input)
644 {
645     import std.utf : strideBack;
646     alias DataIndex = size_t;
647     alias String = Input.String;
648     enum bool isLoopback = true;
649     String _origin;
650     size_t _index;
651     this(Input input, size_t index)
652     {
653         _origin = input._origin;
654         _index = index;
655     }
656     @trusted bool nextChar(ref dchar res,ref size_t pos)
657     {
658         pos = _index;
659         if (_index == 0)
660             return false;
661 
662         res = _origin[0.._index].back;
663         _index -= strideBack(_origin, _index);
664 
665         return true;
666     }
667     @property atEnd(){ return _index == 0 || _index == strideBack(_origin, _index); }
668     auto loopBack(size_t index){   return Input(_origin, index); }
669 
670     //support for backtracker engine, might not be present
671     //void reset(size_t index){   _index = index ? index-std.utf.strideBack(_origin, index) : 0;  }
672     void reset(size_t index){   _index = index;  }
673 
674     String opSlice(size_t start, size_t end){   return _origin[end .. start]; }
675     //index of at End position
676     @property size_t lastIndex(){   return 0; }
677 }
678 
BackLooper(E)679 template BackLooper(E)
680 {
681     static if (is(E : BackLooperImpl!U, U))
682     {
683         alias BackLooper = U;
684     }
685     else
686     {
687         alias BackLooper = BackLooperImpl!E;
688     }
689 }
690 
691 //both helpers below are internal, on its own are quite "explosive"
692 //unsafe, no initialization of elements
mallocArray(T)693 @system T[] mallocArray(T)(size_t len)
694 {
695     import core.stdc.stdlib : malloc;
696     return (cast(T*) malloc(len * T.sizeof))[0 .. len];
697 }
698 
699 //very unsafe, no initialization
arrayInChunk(T)700 @system T[] arrayInChunk(T)(size_t len, ref void[] chunk)
701 {
702     auto ret = (cast(T*) chunk.ptr)[0 .. len];
703     chunk = chunk[len * T.sizeof .. $];
704     return ret;
705 }
706 
707 //
lookupNamedGroup(String)708 @trusted uint lookupNamedGroup(String)(NamedGroup[] dict, String name)
709 {//equal is @system?
710     import std.algorithm.comparison : equal;
711     import std.algorithm.iteration : map;
712     import std.conv : text;
713     import std.range : assumeSorted;
714 
715     auto fnd = assumeSorted!"cmp(a,b) < 0"(map!"a.name"(dict)).lowerBound(name).length;
716     enforce(fnd < dict.length && equal(dict[fnd].name, name),
717         text("no submatch named ", name));
718     return dict[fnd].group;
719 }
720 
721 //whether ch is one of unicode newline sequences
722 //0-arg template due to @@@BUG@@@ 10985
endOfLine()723 bool endOfLine()(dchar front, bool seenCr)
724 {
725     return ((front == '\n') ^ seenCr) || front == '\r'
726     || front == NEL || front == LS || front == PS;
727 }
728 
729 //
730 //0-arg template due to @@@BUG@@@ 10985
startOfLine()731 bool startOfLine()(dchar back, bool seenNl)
732 {
733     return ((back == '\r') ^ seenNl) || back == '\n'
734     || back == NEL || back == LS || back == PS;
735 }
736 
737 ///Exception object thrown in case of errors during regex compilation.
738 public class RegexException : Exception
739 {
740     mixin basicExceptionCtors;
741 }
742 
743 // simple 128-entry bit-table used with a hash function
744 struct BitTable {
745     uint[4] filter;
746 
thisBitTable747     this(CodepointSet set){
748         foreach (iv; set.byInterval)
749         {
750             foreach (v; iv.a .. iv.b)
751                 add(v);
752         }
753     }
754 
addBitTable755     void add()(dchar ch){
756         immutable i = index(ch);
757         filter[i >> 5]  |=  1<<(i & 31);
758     }
759     // non-zero -> might be present, 0 -> absent
opIndexBitTable760     bool opIndex()(dchar ch) const{
761         immutable i = index(ch);
762         return (filter[i >> 5]>>(i & 31)) & 1;
763     }
764 
indexBitTable765     static uint index()(dchar ch){
766         return ((ch >> 7) ^ ch) & 0x7F;
767     }
768 }
769 
770 struct CharMatcher {
771     BitTable ascii; // fast path for ASCII
772     Trie trie;      // slow path for Unicode
773 
this(CodepointSet set)774     this(CodepointSet set)
775     {
776         auto asciiSet = set & unicode.ASCII;
777         ascii = BitTable(asciiSet);
778         trie = makeTrie(set);
779     }
780 
opIndex()781     bool opIndex()(dchar ch) const
782     {
783         if (ch < 0x80)
784             return ascii[ch];
785         else
786             return trie[ch];
787     }
788 }
789