1 /++
2 $(LINK2 https://en.wikipedia.org/wiki/Regular_expression, Regular expressions)
3 are a commonly used method of pattern matching
4 on strings, with $(I regex) being a catchy word for a pattern in this domain
5 specific language. Typical problems usually solved by regular expressions
6 include validation of user input and the ubiquitous find $(AMP) replace
7 in text processing utilities.
8
9 $(SCRIPT inhibitQuickIndex = 1;)
10 $(BOOKTABLE,
11 $(TR $(TH Category) $(TH Functions))
12 $(TR $(TD Matching) $(TD
13 $(LREF bmatch)
14 $(LREF match)
15 $(LREF matchAll)
16 $(LREF matchFirst)
17 ))
18 $(TR $(TD Building) $(TD
19 $(LREF ctRegex)
20 $(LREF escaper)
21 $(LREF _regex)
22 ))
23 $(TR $(TD Replace) $(TD
24 $(LREF replace)
25 $(LREF replaceAll)
26 $(LREF replaceAllInto)
27 $(LREF replaceFirst)
28 $(LREF replaceFirstInto)
29 ))
30 $(TR $(TD Split) $(TD
31 $(LREF split)
32 $(LREF splitter)
33 ))
34 $(TR $(TD Objects) $(TD
35 $(LREF Captures)
36 $(LREF Regex)
37 $(LREF RegexException)
38 $(LREF RegexMatch)
39 $(LREF Splitter)
40 $(LREF StaticRegex)
41 ))
42 )
43
44 $(SECTION Synopsis)
45 ---
46 import std.regex;
47 import std.stdio;
48 void main()
49 {
50 // Print out all possible dd/mm/yy(yy) dates found in user input.
51 auto r = regex(r"\b[0-9][0-9]?/[0-9][0-9]?/[0-9][0-9](?:[0-9][0-9])?\b");
52 foreach (line; stdin.byLine)
53 {
54 // matchAll() returns a range that can be iterated
55 // to get all subsequent matches.
56 foreach (c; matchAll(line, r))
57 writeln(c.hit);
58 }
59 }
60 ...
61
62 // Create a static regex at compile-time, which contains fast native code.
63 auto ctr = ctRegex!(`^.*/([^/]+)/?$`);
64
65 // It works just like a normal regex:
66 auto c2 = matchFirst("foo/bar", ctr); // First match found here, if any
67 assert(!c2.empty); // Be sure to check if there is a match before examining contents!
68 assert(c2[1] == "bar"); // Captures is a range of submatches: 0 = full match.
69
70 ...
71 // multi-pattern regex
auto multi = regex([`(\d+),\d+`,`([a-z]+):(\d+)`]);
73 auto m = "abc:43 12,34".matchAll(multi);
74 assert(m.front.whichPattern == 2);
75 assert(m.front[1] == "abc");
76 assert(m.front[2] == "43");
77 m.popFront();
78 assert(m.front.whichPattern == 1);
79 assert(m.front[1] == "12");
80 ...
81
82 // The result of the `matchAll/matchFirst` is directly testable with if/assert/while.
83 // e.g. test if a string consists of letters:
84 assert(matchFirst("Letter", `^\p{L}+$`));
85 ---
86
87 $(SECTION Syntax and general information)
88 The general usage guideline is to keep regex complexity on the side of simplicity,
89 as its capabilities reside in purely character-level manipulation.
90 As such it's ill-suited for tasks involving higher level invariants
91 like matching an integer number $(U bounded) in an [a,b] interval.
Checks of this sort are better addressed by additional post-processing.
93
94 The basic syntax shouldn't surprise experienced users of regular expressions.
95 For an introduction to $(D std.regex) see a
96 $(HTTP dlang.org/regular-expression.html, short tour) of the module API
97 and its abilities.
98
99 There are other web resources on regular expressions to help newcomers,
100 and a good $(HTTP www.regular-expressions.info, reference with tutorial)
101 can easily be found.
102
103 This library uses a remarkably common ECMAScript syntax flavor
104 with the following extensions:
105 $(UL
106 $(LI Named subexpressions, with Python syntax. )
$(LI Unicode properties such as Scripts, Blocks and common binary properties e.g. Alphabetic, White_Space, Hex_Digit etc.)
$(LI Arbitrary length and complexity lookbehind, including lookahead in lookbehind and vice versa.)
109 )
110
111 $(REG_START Pattern syntax )
112 $(I std.regex operates on codepoint level,
113 'character' in this table denotes a single Unicode codepoint.)
114 $(REG_TABLE
115 $(REG_TITLE Pattern element, Semantics )
116 $(REG_TITLE Atoms, Match single characters )
117 $(REG_ROW any character except [{|*+?()^$, Matches the character itself. )
118 $(REG_ROW ., In single line mode matches any character.
119 Otherwise it matches any character except '\n' and '\r'. )
120 $(REG_ROW [class], Matches a single character
121 that belongs to this character class. )
122 $(REG_ROW [^class], Matches a single character that
123 does $(U not) belong to this character class.)
124 $(REG_ROW \cC, Matches the control character corresponding to letter C)
125 $(REG_ROW \xXX, Matches a character with hexadecimal value of XX. )
126 $(REG_ROW \uXXXX, Matches a character with hexadecimal value of XXXX. )
127 $(REG_ROW \U00YYYYYY, Matches a character with hexadecimal value of YYYYYY. )
128 $(REG_ROW \f, Matches a formfeed character. )
129 $(REG_ROW \n, Matches a linefeed character. )
130 $(REG_ROW \r, Matches a carriage return character. )
131 $(REG_ROW \t, Matches a tab character. )
132 $(REG_ROW \v, Matches a vertical tab character. )
133 $(REG_ROW \d, Matches any Unicode digit. )
134 $(REG_ROW \D, Matches any character except Unicode digits. )
135 $(REG_ROW \w, Matches any word character (note: this includes numbers).)
136 $(REG_ROW \W, Matches any non-word character.)
137 $(REG_ROW \s, Matches whitespace, same as \p{White_Space}.)
138 $(REG_ROW \S, Matches any character except those recognized as $(I \s ). )
139 $(REG_ROW \\, Matches \ character. )
140 $(REG_ROW \c where c is one of [|*+?(), Matches the character c itself. )
141 $(REG_ROW \p{PropertyName}, Matches a character that belongs
142 to the Unicode PropertyName set.
143 Single letter abbreviations can be used without surrounding {,}. )
144 $(REG_ROW \P{PropertyName}, Matches a character that does not belong
145 to the Unicode PropertyName set.
146 Single letter abbreviations can be used without surrounding {,}. )
147 $(REG_ROW \p{InBasicLatin}, Matches any character that is part of
148 the BasicLatin Unicode $(U block).)
149 $(REG_ROW \P{InBasicLatin}, Matches any character except ones in
150 the BasicLatin Unicode $(U block).)
151 $(REG_ROW \p{Cyrillic}, Matches any character that is part of
152 Cyrillic $(U script).)
153 $(REG_ROW \P{Cyrillic}, Matches any character except ones in
154 Cyrillic $(U script).)
155 $(REG_TITLE Quantifiers, Specify repetition of other elements)
156 $(REG_ROW *, Matches previous character/subexpression 0 or more times.
157 Greedy version - tries as many times as possible.)
158 $(REG_ROW *?, Matches previous character/subexpression 0 or more times.
159 Lazy version - stops as early as possible.)
160 $(REG_ROW +, Matches previous character/subexpression 1 or more times.
161 Greedy version - tries as many times as possible.)
162 $(REG_ROW +?, Matches previous character/subexpression 1 or more times.
163 Lazy version - stops as early as possible.)
164 $(REG_ROW {n}, Matches previous character/subexpression exactly n times. )
165 $(REG_ROW {n$(COMMA)}, Matches previous character/subexpression n times or more.
166 Greedy version - tries as many times as possible. )
167 $(REG_ROW {n$(COMMA)}?, Matches previous character/subexpression n times or more.
168 Lazy version - stops as early as possible.)
169 $(REG_ROW {n$(COMMA)m}, Matches previous character/subexpression n to m times.
170 Greedy version - tries as many times as possible, but no more than m times. )
171 $(REG_ROW {n$(COMMA)m}?, Matches previous character/subexpression n to m times.
Lazy version - stops as early as possible, but no less than n times.)
173 $(REG_TITLE Other, Subexpressions $(AMP) alternations )
174 $(REG_ROW (regex), Matches subexpression regex,
175 saving matched portion of text for later retrieval. )
176 $(REG_ROW (?#comment), An inline comment that is ignored while matching.)
177 $(REG_ROW (?:regex), Matches subexpression regex,
178 $(U not) saving matched portion of text. Useful to speed up matching. )
179 $(REG_ROW A|B, Matches subexpression A, or failing that, matches B. )
180 $(REG_ROW (?P$(LT)name$(GT)regex), Matches named subexpression
181 regex labeling it with name 'name'.
182 When referring to a matched portion of text,
183 names work like aliases in addition to direct numbers.
184 )
185 $(REG_TITLE Assertions, Match position rather than character )
$(REG_ROW ^, Matches at the beginning of input or line (in multiline mode).)
187 $(REG_ROW $, Matches at the end of input or line (in multiline mode). )
188 $(REG_ROW \b, Matches at word boundary. )
189 $(REG_ROW \B, Matches when $(U not) at word boundary. )
190 $(REG_ROW (?=regex), Zero-width lookahead assertion.
191 Matches at a point where the subexpression
192 regex could be matched starting from the current position.
193 )
194 $(REG_ROW (?!regex), Zero-width negative lookahead assertion.
195 Matches at a point where the subexpression
196 regex could $(U not) be matched starting from the current position.
197 )
198 $(REG_ROW (?<=regex), Zero-width lookbehind assertion. Matches at a point
199 where the subexpression regex could be matched ending
200 at the current position (matching goes backwards).
201 )
202 $(REG_ROW (?<!regex), Zero-width negative lookbehind assertion.
203 Matches at a point where the subexpression regex could $(U not)
204 be matched ending at the current position (matching goes backwards).
205 )
206 )
207
208 $(REG_START Character classes )
209 $(REG_TABLE
210 $(REG_TITLE Pattern element, Semantics )
211 $(REG_ROW Any atom, Has the same meaning as outside of a character class.)
212 $(REG_ROW a-z, Includes characters a, b, c, ..., z. )
213 $(REG_ROW [a||b]$(COMMA) [a--b]$(COMMA) [a~~b]$(COMMA) [a$(AMP)$(AMP)b],
214 Where a, b are arbitrary classes, means union, set difference,
215 symmetric set difference, and intersection respectively.
216 $(I Any sequence of character class elements implicitly forms a union.) )
217 )
218
219 $(REG_START Regex flags )
220 $(REG_TABLE
221 $(REG_TITLE Flag, Semantics )
222 $(REG_ROW g, Global regex, repeat over the whole input. )
223 $(REG_ROW i, Case insensitive matching. )
224 $(REG_ROW m, Multi-line mode, match ^, $ on start and end line separators
225 as well as start and end of input.)
226 $(REG_ROW s, Single-line mode, makes . match '\n' and '\r' as well. )
227 $(REG_ROW x, Free-form syntax, ignores whitespace in pattern,
228 useful for formatting complex regular expressions. )
229 )
230
231 $(SECTION Unicode support)
232
233 This library provides full Level 1 support* according to
234 $(HTTP unicode.org/reports/tr18/, UTS 18). Specifically:
235 $(UL
236 $(LI 1.1 Hex notation via any of \uxxxx, \U00YYYYYY, \xZZ.)
237 $(LI 1.2 Unicode properties.)
238 $(LI 1.3 Character classes with set operations.)
239 $(LI 1.4 Word boundaries use the full set of "word" characters.)
240 $(LI 1.5 Using simple casefolding to match case
241 insensitively across the full range of codepoints.)
242 $(LI 1.6 Respecting line breaks as any of
243 \u000A | \u000B | \u000C | \u000D | \u0085 | \u2028 | \u2029 | \u000D\u000A.)
244 $(LI 1.7 Operating on codepoint level.)
245 )
246 *With exception of point 1.1.1, as of yet, normalization of input
247 is expected to be enforced by user.
248
249 $(SECTION Replace format string)
250
251 A set of functions in this module that do the substitution rely
252 on a simple format to guide the process. In particular the table below
253 applies to the $(D format) argument of
254 $(LREF replaceFirst) and $(LREF replaceAll).
255
256 The format string can reference parts of match using the following notation.
257 $(REG_TABLE
258 $(REG_TITLE Format specifier, Replaced by )
259 $(REG_ROW $$(AMP), the whole match. )
260 $(REG_ROW $(DOLLAR)$(BACKTICK), part of input $(I preceding) the match. )
261 $(REG_ROW $', part of input $(I following) the match. )
262 $(REG_ROW $$, '$' character. )
263 $(REG_ROW \c $(COMMA) where c is any character, the character c itself. )
264 $(REG_ROW \\, '\' character. )
265 $(REG_ROW $(DOLLAR)1 .. $(DOLLAR)99, submatch number 1 to 99 respectively. )
266 )
267
268 $(SECTION Slicing and zero memory allocations orientation)
269
270 All matches returned by pattern matching functionality in this library
271 are slices of the original input. The notable exception is the $(D replace)
272 family of functions that generate a new string from the input.
273
274 In cases where producing the replacement is the ultimate goal
275 $(LREF replaceFirstInto) and $(LREF replaceAllInto) could come in handy
276 as functions that avoid allocations even for replacement.
277
278 Copyright: Copyright Dmitry Olshansky, 2011-
279
280 License: $(HTTP boost.org/LICENSE_1_0.txt, Boost License 1.0).
281
282 Authors: Dmitry Olshansky,
283
284 API and utility constructs are modeled after the original $(D std.regex)
285 by Walter Bright and Andrei Alexandrescu.
286
287 Source: $(PHOBOSSRC std/_regex/_package.d)
288
289 Macros:
290 REG_ROW = $(TR $(TD $(I $1 )) $(TD $+) )
291 REG_TITLE = $(TR $(TD $(B $1)) $(TD $(B $2)) )
292 REG_TABLE = <table border="1" cellspacing="0" cellpadding="5" > $0 </table>
293 REG_START = <h3><div align="center"> $0 </div></h3>
294 SECTION = <h3><a id="$1" href="#$1" class="anchor">$0</a></h3>
295 S_LINK = <a href="#$1">$+</a>
296 +/
297 module std.regex;
298
299 import std.range.primitives, std.traits;
300 import std.regex.internal.ir;
301 import std.regex.internal.thompson; //TODO: get rid of this dependency
302 import std.typecons; // : Flag, Yes, No;
303
304 /++
305 $(D Regex) object holds regular expression pattern in compiled form.
306
307 Instances of this object are constructed via calls to $(D regex).
308 This is an intended form for caching and storage of frequently
309 used regular expressions.
310
311 Example:
312
313 Test if this object doesn't contain any compiled pattern.
314 ---
315 Regex!char r;
316 assert(r.empty);
317 r = regex(""); // Note: "" is a valid regex pattern.
318 assert(!r.empty);
319 ---
320
321 Getting a range of all the named captures in the regex.
322 ----
323 import std.range;
324 import std.algorithm;
325
326 auto re = regex(`(?P<name>\w+) = (?P<var>\d+)`);
327 auto nc = re.namedCaptures;
328 static assert(isRandomAccessRange!(typeof(nc)));
329 assert(!nc.empty);
330 assert(nc.length == 2);
331 assert(nc.equal(["name", "var"]));
332 assert(nc[0] == "name");
333 assert(nc[1..$].equal(["var"]));
334 ----
335 +/
// Public name for the compiled-pattern type implemented in std.regex.internal.ir.
public template Regex(Char)
{
    alias Regex = std.regex.internal.ir.Regex!(Char);
}
337
338 /++
339 A $(D StaticRegex) is $(D Regex) object that contains D code specially
340 generated at compile-time to speed up matching.
341
342 Implicitly convertible to normal $(D Regex),
343 however doing so will result in losing this additional capability.
344 +/
// Public name for the compile-time generated regex type implemented in std.regex.internal.ir.
public template StaticRegex(Char)
{
    alias StaticRegex = std.regex.internal.ir.StaticRegex!(Char);
}
346
347 /++
348 Compile regular expression pattern for the later execution.
349 Returns: $(D Regex) object that works on inputs having
350 the same character width as $(D pattern).
351
352 Params:
353 pattern = A single regular expression to match.
354 patterns = An array of regular expression strings.
355 The resulting `Regex` object will match any expression;
356 use $(LREF whichPattern) to know which.
357 flags = The _attributes (g, i, m and x accepted)
358
359 Throws: $(D RegexException) if there were any errors during compilation.
360 +/
@trusted public auto regex(S)(S[] patterns, const(char)[] flags="")
    if (isSomeString!(S))
{
    import std.array : appender;
    import std.functional : memoize;
    enum cacheSize = 8; //TODO: invent nice interface to control regex caching

    // Reject an empty pattern list up front. Without this check the
    // `patterns[0]` below is an out-of-bounds read, and because this
    // function is @trusted (not @safe) the bounds check is dropped
    // in -release builds.
    if (patterns.length == 0)
        throw new RegexException("regex: at least one pattern is required");

    S pat;
    if (patterns.length > 1)
    {
        // Glue all patterns into a single alternation. Each alternative is
        // tagged with a character offset from privateUseStart by its index.
        auto app = appender!S();
        foreach (i, p; patterns)
        {
            if (i != 0)
                app.put("|");
            app.put("(?:");
            app.put(p);
            // terminator for the pattern
            // to detect if the pattern unexpectedly ends
            app.put("\\");
            app.put(cast(dchar)(privateUseStart+i));
            app.put(")");
            // another one to return correct whichPattern
            // for all of potential alternatives in the patterns[i]
            app.put("\\");
            app.put(cast(dchar)(privateUseStart+i));
        }
        pat = app.data;
    }
    else
        pat = patterns[0];

    // memoize cannot be used during CTFE, so compile directly there.
    if (__ctfe)
        return regexImpl(pat, flags);
    return memoize!(regexImpl!S, cacheSize)(pat, flags);
}
396
397 ///ditto
@trusted public auto regex(S)(S pattern, const(char)[] flags="")
    if (isSomeString!(S))
{
    // Single-pattern convenience overload: forward as a one-element list.
    S[] patterns = [pattern];
    return regex(patterns, flags);
}
403
404 ///
@system unittest
{
    // multi-pattern regex example
    auto multi = regex([`([a-z]+):(\d+)`, `(\d+),\d+`]); // multi regex
    auto m = matchAll("abc:43 12,34", multi);

    // first hit comes from the first pattern
    auto first = m.front;
    assert(first.whichPattern == 1);
    assert(first[1] == "abc");
    assert(first[2] == "43");

    // second hit comes from the second pattern
    m.popFront();
    auto second = m.front;
    assert(second.whichPattern == 2);
    assert(second[1] == "12");
}
417
// Runs the parser over `pattern` with the given `flags` and returns
// the program it produced.
public auto regexImpl(S)(S pattern, const(char)[] flags="")
    if (isSomeString!(S))
{
    import std.regex.internal.parser : Parser, CodeGen;
    alias PatternParser = Parser!(Unqual!S, CodeGen);
    auto p = PatternParser(pattern, flags);
    return p.program;
}
426
427
// Implementation detail of ctRegex: compiles `pattern` during CTFE,
// generates matcher source for it and wraps the result in a StaticRegex.
// Names inside are left untouched since the mixed-in source may refer to them.
template ctRegexImpl(alias pattern, string flags=[])
{
    import std.regex.internal.backtracking, std.regex.internal.parser;

    alias Char = BasicElementOf!(typeof(pattern));
    alias Matcher = BacktrackingMatcher!(true);

    // pattern compiled at compile time
    enum r = regex(pattern, flags);
    // D source generated from the compiled pattern
    enum source = ctGenRegExCode(r);

    @trusted bool func(ref Matcher!Char matcher)
    {
        debug(std_regex_ctr) pragma(msg, source);
        mixin(source);
    }

    enum nr = StaticRegex!Char(r, &func);
}
442
443 /++
444 Compile regular expression using CTFE
445 and generate optimized native machine code for matching it.
446
447 Returns: StaticRegex object for faster matching.
448
449 Params:
450 pattern = Regular expression
451 flags = The _attributes (g, i, m and x accepted)
452 +/
public template ctRegex(alias pattern, alias flags=[])
{
    // Value is the StaticRegex built by ctRegexImpl.
    enum ctRegex = ctRegexImpl!(pattern, flags).nr;
}
454
// True when RegEx is exactly Regex or StaticRegex instantiated with the
// basic element type of R.
template isRegexFor(RegEx, R)
{
    private alias Elem = BasicElementOf!R;
    enum isRegexFor = is(RegEx == Regex!Elem) || is(RegEx == StaticRegex!Elem);
}
457
458
459 /++
460 $(D Captures) object contains submatches captured during a call
461 to $(D match) or iteration over $(D RegexMatch) range.
462
463 First element of range is the whole match.
464 +/
@trusted public struct Captures(R, DIndex = size_t)
    if (isSomeString!R)
{//@trusted because of union inside
    alias DataIndex = DIndex;
    alias String = R;
private:
    import std.conv : text;
    R _input;
    int _nMatch;
    enum smallString = 3;
    enum SMALL_MASK = 0x8000_0000, REF_MASK= 0x1FFF_FFFF;
    // Up to `smallString` groups are stored inline; more than that go into
    // a calloc'ed block shared between copies of this struct.
    union
    {
        Group!DataIndex[] big_matches;
        Group!DataIndex[smallString] small_matches;
    }
    uint _f, _b;
    // SMALL_MASK + number of groups for inline storage, 0 for heap storage.
    // The reference count of heap storage is NOT kept here: a per-copy
    // counter cannot be observed by the other copies, which made the first
    // copy to die free memory still used by the others. It lives in a
    // size_t header placed directly in front of the shared group array.
    uint _refcount;
    NamedGroup[] _names;

    // The header must not break the alignment of the groups that follow it.
    static assert(Group!DataIndex.alignof <= size_t.alignof,
        "reference count header would misalign the group array");

    this()(R input, uint n, NamedGroup[] named)
    {
        _input = input;
        _names = named;
        newMatches(n);
        _b = n;
        _f = 0;
    }

    this(alias Engine)(ref RegexMatch!(R,Engine) rmatch)
    {
        _input = rmatch._input;
        _names = rmatch._engine.re.dict;
        immutable n = rmatch._engine.re.ngroup;
        newMatches(n);
        _b = n;
        _f = 0;
    }

    // Storage for the groups: the inline array (low byte of _refcount holds
    // the count) or the shared heap array.
    @property inout(Group!DataIndex[]) matches() inout
    {
        return (_refcount & SMALL_MASK) ? small_matches[0 .. _refcount & 0xFF] : big_matches;
    }

    // Pointer to the shared reference counter of the heap block,
    // or null when this object holds no heap block.
    size_t* sharedCount()
    {
        if ((_refcount & SMALL_MASK) || big_matches.ptr is null)
            return null;
        return (cast(size_t*) big_matches.ptr) - 1;
    }

    // Drops this object's reference to the heap block (if any),
    // freeing the block when it was the last reference.
    void releaseShared()
    {
        import core.stdc.stdlib : free;
        if (auto rc = sharedCount())
        {
            if (--*rc == 0)
                free(rc);
            big_matches = null;
        }
    }

    // (Re)allocates storage for n groups. Any heap block referenced before
    // the call is released first, so callers may use this to detach from
    // storage shared with other copies.
    void newMatches(uint n)
    {
        import core.stdc.stdlib : calloc;
        import std.exception : enforce;
        releaseShared();
        if (n > smallString)
        {
            // layout: [size_t reference count][n groups], zero-initialized
            immutable bytes = size_t.sizeof + Group!DataIndex.sizeof * n;
            auto header = cast(size_t*) enforce(
                calloc(1, bytes),
                "Failed to allocate Captures struct"
            );
            *header = 1;
            big_matches = (cast(Group!DataIndex*)(header + 1))[0 .. n];
            _refcount = 0;
        }
        else
        {
            _refcount = SMALL_MASK | n;
        }
    }

    // True when no other Captures object shares this object's storage.
    bool unique()
    {
        if (_refcount & SMALL_MASK)
            return true;
        auto rc = sharedCount();
        return rc !is null && *rc == 1;
    }

public:
    this(this)
    {
        // a copy shares the heap block - register it in the shared counter
        if (auto rc = sharedCount())
            ++*rc;
    }
    ~this()
    {
        releaseShared();
    }
    ///Slice of input prior to the match.
    @property R pre()
    {
        return _nMatch == 0 ? _input[] : _input[0 .. matches[0].begin];
    }

    ///Slice of input immediately after the match.
    @property R post()
    {
        return _nMatch == 0 ? _input[] : _input[matches[0].end .. $];
    }

    ///Slice of matched portion of input.
    @property R hit()
    {
        assert(_nMatch, "attempted to get hit of an empty match");
        return _input[matches[0].begin .. matches[0].end];
    }

    ///Range interface.
    @property R front()
    {
        assert(_nMatch, "attempted to get front of an empty match");
        return _input[matches[_f].begin .. matches[_f].end];
    }

    ///ditto
    @property R back()
    {
        assert(_nMatch, "attempted to get back of an empty match");
        return _input[matches[_b - 1].begin .. matches[_b - 1].end];
    }

    ///ditto
    void popFront()
    {
        assert(!empty);
        ++_f;
    }

    ///ditto
    void popBack()
    {
        assert(!empty);
        --_b;
    }

    ///ditto
    @property bool empty() const { return _nMatch == 0 || _f >= _b; }

    ///ditto
    inout(R) opIndex()(size_t i) inout
    {
        assert(_f + i < _b,text("requested submatch number ", i," is out of range"));
        assert(matches[_f + i].begin <= matches[_f + i].end,
            text("wrong match: ", matches[_f + i].begin, "..", matches[_f + i].end));
        return _input[matches[_f + i].begin .. matches[_f + i].end];
    }

    /++
        Explicit cast to bool.
        Useful as a shorthand for !(x.empty) in if and assert statements.

        ---
        import std.regex;

        assert(!matchFirst("nothing", "something"));
        ---
    +/

    @safe bool opCast(T:bool)() const nothrow { return _nMatch != 0; }

    /++
        Number of the pattern that matched, counting from 1 for the first pattern.
        Returns 0 on no match.
    +/

    @safe @property int whichPattern() const nothrow { return _nMatch; }

    ///
    @system unittest
    {
        import std.regex;
        assert(matchFirst("abc", "[0-9]+", "[a-z]+").whichPattern == 2);
    }

    /++
        Lookup named submatch.

        ---
        import std.regex;
        import std.range;

        auto c = matchFirst("a = 42;", regex(`(?P<var>\w+)\s*=\s*(?P<value>\d+);`));
        assert(c["var"] == "a");
        assert(c["value"] == "42");
        popFrontN(c, 2);
        //named groups are unaffected by range primitives
        assert(c["var"] =="a");
        assert(c.front == "42");
        ----
    +/
    R opIndex(String)(String i) /*const*/ //@@@BUG@@@
        if (isSomeString!String)
    {
        // the index is absolute: it is not shifted by _f
        size_t index = lookupNamedGroup(_names, i);
        return _input[matches[index].begin .. matches[index].end];
    }

    ///Number of matches in this object.
    @property size_t length() const { return _nMatch == 0 ? 0 : _b - _f; }

    ///A hook for compatibility with original std.regex.
    @property ref captures(){ return this; }
}
668
669 ///
@system unittest
{
    import std.range.primitives : popFrontN;

    auto re = regex(`(\w)(\w)(\w)`);
    auto c = "@abc#".matchFirst(re);

    // surroundings of the match and the match itself
    assert(c.pre == "@"); // Part of input preceding match
    assert(c.post == "#"); // Immediately after match
    assert(c.hit == c[0] && c.hit == "abc"); // The whole match
    assert(c[2] == "b");

    // range primitives walk over the submatches
    assert(c.front == "abc");
    c.popFront();
    assert(c.front == "a");
    assert(c.back == "c");
    c.popBack();
    assert(c.back == "b");
    popFrontN(c, 2);
    assert(c.empty);

    // a failed match converts to false
    assert(!matchFirst("nothing", "something"));
}
690
691 /++
692 A regex engine state, as returned by $(D match) family of functions.
693
694 Effectively it's a forward range of Captures!R, produced
695 by lazily searching for matches in a given input.
696
697 $(D alias Engine) specifies an engine type to use during matching,
698 and is automatically deduced in a call to $(D match)/$(D bmatch).
699 +/
@trusted public struct RegexMatch(R, alias Engine = ThompsonMatcher)
    if (isSomeString!R)
{
private:
    import core.stdc.stdlib : malloc, free;
    alias Char = BasicElementOf!R;
    alias EngineType = Engine!Char;
    EngineType _engine;
    R _input;
    Captures!(R,EngineType.DataIndex) _captures;
    // malloc'ed chunk: a size_t reference counter followed by engine memory
    void[] _memory;//is ref-counted

    this(RegEx)(R input, RegEx prog)
    {
        import std.exception : enforce;
        _input = input;
        immutable size = EngineType.initialMemory(prog)+size_t.sizeof;
        _memory = (enforce(malloc(size), "malloc failed")[0 .. size]);
        scope(failure) free(_memory.ptr);
        *cast(size_t*)_memory.ptr = 1;
        _engine = EngineType(prog, Input!Char(input), _memory[size_t.sizeof..$]);
        static if (is(RegEx == StaticRegex!(BasicElementOf!R)))
            _engine.nativeFn = prog.nativeFn;
        _captures = Captures!(R,EngineType.DataIndex)(this);
        // run the engine once so that front is available right away
        _captures._nMatch = _engine.match(_captures.matches);
        debug(std_regex_allocation) writefln("RefCount (ctor): %x %d", _memory.ptr, counter);
    }

    // Reference counter stored at the very start of _memory.
    @property ref size_t counter(){ return *cast(size_t*)_memory.ptr; }
public:
    this(this)
    {
        if (_memory.ptr)
        {
            ++counter;
            debug(std_regex_allocation) writefln("RefCount (postblit): %x %d",
                _memory.ptr, *cast(size_t*)_memory.ptr);
        }
    }

    ~this()
    {
        if (_memory.ptr && --*cast(size_t*)_memory.ptr == 0)
        {
            debug(std_regex_allocation) writefln("RefCount (dtor): %x %d",
                _memory.ptr, *cast(size_t*)_memory.ptr);
            free(cast(void*)_memory.ptr);
        }
    }

    ///Shorthands for front.pre, front.post, front.hit.
    @property R pre()
    {
        return _captures.pre;
    }

    ///ditto
    @property R post()
    {
        return _captures.post;
    }

    ///ditto
    @property R hit()
    {
        return _captures.hit;
    }

    /++
        Functionality for processing subsequent matches of global regexes via range interface:
        ---
        import std.regex;
        auto m = matchAll("Hello, world!", regex(`\w+`));
        assert(m.front.hit == "Hello");
        m.popFront();
        assert(m.front.hit == "world");
        m.popFront();
        assert(m.empty);
        ---
    +/
    @property auto front()
    {
        return _captures;
    }

    ///ditto
    void popFront()
    {
        import std.exception : enforce;
        if (counter != 1)
        {//do cow magic first
            // Build the private copy completely BEFORE giving up the shared
            // chunk. If malloc or dupTo throws, this object must still own
            // exactly one reference to the old chunk, otherwise the
            // destructor would decrement the shared counter a second time.
            immutable size = EngineType.initialMemory(_engine.re)+size_t.sizeof;
            void[] fresh = enforce(malloc(size), "malloc failed")[0 .. size];
            scope(failure) free(fresh.ptr);
            _engine = _engine.dupTo(fresh[size_t.sizeof .. size]);
            counter--;//we abandon this reference
            _memory = fresh;
            counter = 1;//points to new chunk
        }

        if (!_captures.unique)
        {
            // has external references - allocate new space
            _captures.newMatches(_engine.re.ngroup);
        }
        _captures._nMatch = _engine.match(_captures.matches);
    }

    ///ditto
    auto save(){ return this; }

    ///Test if this match object is empty.
    @property bool empty() const { return _captures._nMatch == 0; }

    ///Same as !(x.empty), provided for its convenience  in conditional statements.
    T opCast(T:bool)(){ return !empty; }

    /// Same as .front, provided for compatibility with original std.regex.
    @property auto captures() inout { return _captures; }

}
819
// Runs a single match of `re` against `input` with the given engine and
// returns the resulting Captures. The engine's scratch memory lives only
// for the duration of the call.
private @trusted auto matchOnce(alias Engine, RegEx, R)(R input, RegEx re)
{
    import core.stdc.stdlib : malloc, free;
    import std.exception : enforce;
    alias Char = BasicElementOf!R;
    alias EngineType = Engine!Char;
    alias Result = Captures!(R, EngineType.DataIndex);

    // scratch memory for the engine, released on every exit path
    size_t size = EngineType.initialMemory(re);
    void[] memory = enforce(malloc(size), "malloc failed")[0 .. size];
    scope(exit) free(memory.ptr);

    auto captures = Result(input, re.ngroup, re.dict);
    auto engine = EngineType(re, Input!Char(input), memory);
    static if (is(RegEx == StaticRegex!(BasicElementOf!R)))
    {
        // compile-time regexes carry their own native matching routine
        engine.nativeFn = re.nativeFn;
    }
    captures._nMatch = engine.match(captures.matches);
    return captures;
}
837
// Sets the global option on the (by-value) regex and returns a
// RegexMatch over `input` for it.
private auto matchMany(alias Engine, RegEx, R)(R input, RegEx re)
{
    alias Matches = RegexMatch!(R, Engine);
    re.flags |= RegexOption.global;
    return Matches(input, re);
}
843
@system unittest
{
    //sanity checks for new API
    auto re = regex("abc");
    auto found = matchOnce!(ThompsonMatcher)("abc", re);
    assert(!found.empty);
    assert(matchOnce!(ThompsonMatcher)("abc", re)[0] == "abc");
}
851
852
// True when `fun` can be called with a single Captures!R argument.
private template isReplaceFunctor(alias fun, R)
{
    enum isReplaceFunctor = __traits(compiles, (Captures!R c) { fun(c); });
}
855
856 // the lowest level - just stuff replacements into the sink
// the lowest level - just stuff replacements into the sink
// Writes: text before the match, the replacement produced by `output`,
// text after the match. Without a match the input is copied unchanged.
private @trusted void replaceCapturesInto(alias output, Sink, R, T)
        (ref Sink sink, R input, T captures)
    if (isOutputRange!(Sink, dchar) && isSomeString!R)
{
    if (!captures.empty)
    {
        sink.put(captures.pre);
        // a hack to get around bogus errors, should be simply output(captures, sink)
        // "is a nested function and cannot be accessed from"
        static if (isReplaceFunctor!(output, R))
        {
            //"mutator" type of function
            sink.put(output(captures));
        }
        else
        {
            //"output" type of function
            output(captures, sink);
        }
        sink.put(captures.post);
    }
    else
    {
        sink.put(input);
    }
}
875
876 // ditto for a range of captures
// ditto for a range of captures
// Copies the text between consecutive matches verbatim and lets `output`
// produce the replacement for every match.
private void replaceMatchesInto(alias output, Sink, R, T)
        (ref Sink sink, R input, T matches)
    if (isOutputRange!(Sink, dchar) && isSomeString!R)
{
    // position in input just past the previously handled match
    size_t done = 0;
    foreach (m; matches)
    {
        // untouched text between the previous match and this one
        sink.put(m.pre[done .. $]);
        // same hack, see replaceCapturesInto
        static if (isReplaceFunctor!(output, R))
        {
            //"mutator" type of function
            sink.put(output(m));
        }
        else
        {
            //"output" type of function
            output(m, sink);
        }
        done = m.pre.length + m.hit.length;
    }
    // the tail after the last match
    sink.put(input[done .. $]);
}
894
895 // a general skeleton of replaceFirst
// a general skeleton of replaceFirst
// Returns input itself when nothing matches, otherwise a newly built
// string with the first match replaced through `output`.
private R replaceFirstWith(alias output, R, RegEx)(R input, RegEx re)
    if (isSomeString!R && isRegexFor!(RegEx, R))
{
    import std.array : appender;
    auto found = matchFirst(input, re);
    if (!found.empty)
    {
        auto result = appender!(R)();
        replaceCapturesInto!output(result, input, found);
        return result.data;
    }
    return input;
}
907
908 // ditto for replaceAll
909 // the method parameter allows old API to ride on the back of the new one
// Same skeleton as replaceFirstWith, but for every match found by `method`.
// Returns input itself when nothing matches.
private R replaceAllWith(alias output,
        alias method=matchAll, R, RegEx)(R input, RegEx re)
    if (isSomeString!R && isRegexFor!(RegEx, R))
{
    import std.array : appender;
    auto found = method(input, re); //inout(C)[] fails
    if (!found.empty)
    {
        auto result = appender!(R)();
        replaceMatchesInto!output(result, input, found);
        return result.data;
    }
    return input;
}
922
923
924 /++
925 Start matching $(D input) to regex pattern $(D re),
926 using Thompson NFA matching scheme.
927
928 The use of this function is $(RED discouraged) - use either of
929 $(LREF matchAll) or $(LREF matchFirst).
930
931 Delegating the kind of operation
932 to "g" flag is soon to be phased out along with the
933 ability to choose the exact matching scheme. The choice of
934 matching scheme to use depends highly on the pattern kind and
can be done automatically on case by case basis.
936
937 Returns: a $(D RegexMatch) object holding engine state after first match.
938 +/
939
public auto match(R, RegEx)(R input, RegEx re)
    if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R)))
{
    import std.regex.internal.thompson : ThompsonMatcher;
    // run-time compiled regex: matched with the Thompson engine
    alias Result = RegexMatch!(Unqual!R, ThompsonMatcher);
    return Result(input, re);
}
946
947 ///ditto
public auto match(R, String)(R input, String re)
    if (isSomeString!R && isSomeString!String)
{
    import std.regex.internal.thompson : ThompsonMatcher;
    // pattern given as text: compile it, then match with the Thompson engine
    alias Result = RegexMatch!(Unqual!R, ThompsonMatcher);
    auto compiled = regex(re);
    return Result(input, compiled);
}
954
public auto match(R, RegEx)(R input, RegEx re)
    if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R)))
{
    import std.regex.internal.backtracking : BacktrackingMatcher;
    // compile-time regex: matched with the backtracking engine
    alias Result = RegexMatch!(Unqual!R, BacktrackingMatcher!true);
    return Result(input, re);
}
961
/++
    Locate the first (leftmost) slice of $(D input) that matches
    the pattern $(D re). The regular expression engine is selected
    by the library, not by the caller.

    The $(D re) parameter can be one of three types:
    $(UL
      $(LI Plain string(s), which are compiled to bytecode before matching. )
      $(LI Regex!char (wchar/dchar), holding a pattern already compiled
      to bytecode. )
      $(LI StaticRegex!char (wchar/dchar), holding a pattern compiled
      to native machine code. )
    )

    Returns:
    $(LREF Captures) containing the extent of a match together with all submatches
    if there was a match, otherwise an empty $(LREF Captures) object.
+/
public auto matchFirst(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R)))
{
    import std.regex.internal.thompson : ThompsonMatcher;
    auto firstHit = matchOnce!ThompsonMatcher(input, re);
    return firstHit;
}
986
///ditto
public auto matchFirst(R, String)(R input, String re)
if (isSomeString!R && isSomeString!String)
{
    import std.regex.internal.thompson : ThompsonMatcher;
    // textual pattern: compile it, then search for a single match
    auto compiled = regex(re);
    return matchOnce!ThompsonMatcher(input, compiled);
}
994
///ditto
public auto matchFirst(R, String)(R input, String[] re...)
if (isSomeString!R && isSomeString!String)
{
    import std.regex.internal.thompson : ThompsonMatcher;
    // several textual patterns: build one multi-pattern regex out of them
    auto compiled = regex(re);
    return matchOnce!ThompsonMatcher(input, compiled);
}
1002
// Overload for StaticRegex patterns, matched with BacktrackingMatcher!true.
public auto matchFirst(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R)))
{
    import std.regex.internal.backtracking : BacktrackingMatcher;
    auto firstHit = matchOnce!(BacktrackingMatcher!true)(input, re);
    return firstHit;
}
1009
/++
    Start a search for all non-overlapping matches of the pattern $(D re)
    in the given $(D input). The result is a lazy range of matches, produced
    as they are encountered while scanning the input from left to right.

    The regular expression engine is selected by the library,
    not by the caller.

    The $(D re) parameter can be one of three types:
    $(UL
      $(LI Plain string(s), which are compiled to bytecode before matching. )
      $(LI Regex!char (wchar/dchar), holding a pattern already compiled
      to bytecode. )
      $(LI StaticRegex!char (wchar/dchar), holding a pattern compiled
      to native machine code. )
    )

    Returns:
    $(LREF RegexMatch) object that represents matcher state
    after the first match was found or an empty one if not present.
+/
public auto matchAll(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R)))
{
    import std.regex.internal.thompson : ThompsonMatcher;
    auto allHits = matchMany!ThompsonMatcher(input, re);
    return allHits;
}
1037
///ditto
public auto matchAll(R, String)(R input, String re)
if (isSomeString!R && isSomeString!String)
{
    import std.regex.internal.thompson : ThompsonMatcher;
    // textual pattern: compile it, then iterate over every match
    auto compiled = regex(re);
    return matchMany!ThompsonMatcher(input, compiled);
}
1045
///ditto
public auto matchAll(R, String)(R input, String[] re...)
if (isSomeString!R && isSomeString!String)
{
    import std.regex.internal.thompson : ThompsonMatcher;
    // several textual patterns: build one multi-pattern regex out of them
    auto compiled = regex(re);
    return matchMany!ThompsonMatcher(input, compiled);
}
1053
// Overload for StaticRegex patterns, matched with BacktrackingMatcher!true.
public auto matchAll(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R)))
{
    import std.regex.internal.backtracking : BacktrackingMatcher;
    auto allHits = matchMany!(BacktrackingMatcher!true)(input, re);
    return allHits;
}
1060
// another set of tests just to cover the new API
// (matchFirst / matchAll with string, Regex and ctRegex patterns)
@system unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : map;
    import std.conv : to;

    // repeat every check for UTF-8, UTF-16 and const UTF-32 inputs
    foreach (String; AliasSeq!(string, wstring, const(dchar)[]))
    {
        // pattern given as a plain string: matchFirst sees only the leftmost
        // hit, matchAll yields each hit in order
        auto str1 = "blah-bleh".to!String();
        auto pat1 = "bl[ae]h".to!String();
        auto mf = matchFirst(str1, pat1);
        assert(mf.equal(["blah".to!String()]));
        auto mAll = matchAll(str1, pat1);
        assert(mAll.equal!((a,b) => a.equal(b))
            ([["blah".to!String()], ["bleh".to!String()]]));

        // multi-pattern Regex with capture groups
        auto str2 = "1/03/12 - 3/03/12".to!String();
        auto pat2 = regex([r"(\d+)/(\d+)/(\d+)".to!String(), "abc".to!String]);
        auto mf2 = matchFirst(str2, pat2);
        assert(mf2.equal(["1/03/12", "1", "03", "12"].map!(to!String)()));
        auto mAll2 = matchAll(str2, pat2);
        assert(mAll2.front.equal(mf2));
        mAll2.popFront();
        assert(mAll2.front.equal(["3/03/12", "3", "03", "12"].map!(to!String)()));
        // Captures is itself a range: dropping 3 elements leaves the last group
        mf2.popFrontN(3);
        assert(mf2.equal(["12".to!String()]));

        // compile-time regex with named groups
        auto ctPat = ctRegex!(`(?P<Quot>\d+)/(?P<Denom>\d+)`.to!String());
        auto str = "2 + 34/56 - 6/1".to!String();
        auto cmf = matchFirst(str, ctPat);
        assert(cmf.equal(["34/56", "34", "56"].map!(to!String)()));
        assert(cmf["Quot"] == "34".to!String());
        assert(cmf["Denom"] == "56".to!String());

        auto cmAll = matchAll(str, ctPat);
        assert(cmAll.front.equal(cmf));
        cmAll.popFront();
        assert(cmAll.front.equal(["6/1", "6", "1"].map!(to!String)()));
    }
}
1102
/++
    Begin matching $(D input) against the regex pattern $(D re)
    with the traditional $(LINK2 https://en.wikipedia.org/wiki/Backtracking,
    backtracking) matching scheme.

    The use of this function is $(RED discouraged) - prefer
    $(LREF matchAll) or $(LREF matchFirst).

    Selecting the kind of operation through the "g" flag is soon
    to be phased out, together with the ability to pick the exact
    matching scheme. Which matching scheme works best depends highly
    on the kind of pattern and can be decided automatically
    on a case by case basis.

    Returns: a $(D RegexMatch) object holding engine
    state after first match.
+/
public auto bmatch(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R)))
{
    import std.regex.internal.backtracking : BacktrackingMatcher;
    // the match object is parameterized on the unqualified input type
    alias Matcher = RegexMatch!(Unqual!(typeof(input)), BacktrackingMatcher!false);
    return Matcher(input, re);
}
1127
///ditto
public auto bmatch(R, String)(R input, String re)
if (isSomeString!R && isSomeString!String)
{
    import std.regex.internal.backtracking : BacktrackingMatcher;
    // the pattern is given as text: compile it before starting the match
    auto compiled = regex(re);
    alias Matcher = RegexMatch!(Unqual!(typeof(input)), BacktrackingMatcher!false);
    return Matcher(input, compiled);
}
1135
// Overload for StaticRegex patterns, matched with BacktrackingMatcher!true.
public auto bmatch(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R)))
{
    import std.regex.internal.backtracking : BacktrackingMatcher;
    alias Matcher = RegexMatch!(Unqual!(typeof(input)), BacktrackingMatcher!true);
    return Matcher(input, re);
}
1142
// Produces the replacement text described by `format`, substituting data
// taken from `captures`, and writes it piece by piece to `sink`.
//
// Substitutions recognized after a '$':
//   $N       submatch number N (decimal digits)
//   ${name}  named submatch; the name may contain ASCII letters,
//            digits and '_'
//   $&       the whole match (captures[0])
//   $`       the input preceding the match (captures.pre)
//   $'       the input following the match (captures.post)
//   $$       a literal '$'
// A '$' followed by any other character is dropped and that character
// is then copied like ordinary text.
//
// Params:
//   format        = the replacement format string
//   captures      = captures of the match being replaced
//   sink          = output range receiving the generated text
//   ignoreBadSubs = when true, an out-of-range $N is skipped silently
//                   instead of raising an error
//
// Throws: Exception (through enforce) for an out-of-range $N unless
// ignoreBadSubs is set, for a malformed ${...} and for a format string
// that ends with a lone '$'.
package void replaceFmt(R, Capt, OutR)
    (R format, Capt captures, OutR sink, bool ignoreBadSubs = false)
if (isOutputRange!(OutR, ElementEncodingType!R[]) &&
    isOutputRange!(OutR, ElementEncodingType!(Capt.String)[]))
{
    import std.algorithm.searching : find;
    import std.ascii : isDigit, isAlphaNum;
    import std.conv : text, parse;
    import std.exception : enforce;
    enum State { Normal, Dollar }
    auto state = State.Normal;
    size_t offset;
L_Replace_Loop:
    while (!format.empty)
        final switch (state)
        {
        case State.Normal:
            // copy everything up to the next '$' verbatim
            for (offset = 0; offset < format.length; offset++)//no decoding
            {
                if (format[offset] == '$')
                {
                    state = State.Dollar;
                    sink.put(format[0 .. offset]);
                    format = format[offset+1 .. $];//ditto
                    continue L_Replace_Loop;
                }
            }
            // no '$' left: flush the remainder
            sink.put(format[0 .. offset]);
            format = format[offset .. $];
            break;
        case State.Dollar:
            if (isDigit(format[0]))
            {
                // parse consumes the digits from the front of format
                uint digit = parse!uint(format);
                enforce(ignoreBadSubs || digit < captures.length, text("invalid submatch number ", digit));
                if (digit < captures.length)
                    sink.put(captures[digit]);
            }
            else if (format[0] == '{')
            {
                // group names may contain letters, digits and underscores,
                // so scan for the first character outside of that set
                auto x = find!(a => !(isAlphaNum(a) || a == '_'))(format[1..$]);
                enforce(!x.empty && x[0] == '}', "no matching '}' in replacement format");
                auto name = format[1 .. $ - x.length];
                format = x[1..$];
                enforce(!name.empty, "invalid name in ${...} replacement format");
                sink.put(captures[name]);
            }
            else if (format[0] == '&')
            {
                sink.put(captures[0]);
                format = format[1 .. $];
            }
            else if (format[0] == '`')
            {
                sink.put(captures.pre);
                format = format[1 .. $];
            }
            else if (format[0] == '\'')
            {
                sink.put(captures.post);
                format = format[1 .. $];
            }
            else if (format[0] == '$')
            {
                sink.put(format[0 .. 1]);
                format = format[1 .. $];
            }
            state = State.Normal;
            break;
        }
    // still in Dollar state here means the format ended right after a '$'
    enforce(state == State.Normal, "invalid format string in regex replace");
}
1216
/++
    Build a new string from $(D input) in which the first match is replaced
    by a string generated from it according to the $(D format) specifier.

    To replace all matches use $(LREF replaceAll).

    Params:
    input = string to search
    re = compiled regular expression to use
    format = _format string to generate replacements from,
    see $(S_LINK Replace _format string, the _format string).

    Returns:
    A string of the same type with the first match (if any) replaced.
    If no match is found returns the input string itself.
+/
public R replaceFirst(R, C, RegEx)(R input, RegEx re, const(C)[] format)
if (isSomeString!R && is(C : dchar) && isRegexFor!(RegEx, R))
{
    // the replacement for a match is produced by expanding the format string
    return input.replaceFirstWith!(
        (captures, output) => replaceFmt(format, captures, output))(re);
}
1238
///
@system unittest
{
    // only the first 'n' is wrapped in brackets; $& stands for the whole match
    assert(replaceFirst("noon", regex("n"), "[$&]") == "[n]oon");
}
1244
/++
    This is a general replacement tool that constructs a new string by
    replacing a match of pattern $(D re) in the $(D input). Unlike the other
    overload there is no format string; instead the captures are passed
    to a user-defined functor $(D fun) that returns a new string
    to use as replacement.

    This version replaces the first match in $(D input),
    see $(LREF replaceAll) to replace all of the matches.

    Returns:
    A new string of the same type as $(D input) with the first match
    replaced by the return value of $(D fun). If no match is found
    returns the $(D input) itself.
+/
public R replaceFirst(alias fun, R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    // feed whatever fun returns for the match straight into the output
    return input.replaceFirstWith!(
        (captures, output) => output.put(fun(captures)))(re);
}
1265
///
@system unittest
{
    import std.conv : to;
    string list = "#21 out of 46";
    // increment the first number only; the second one stays as it is
    string newList = replaceFirst!(cap => to!string(to!int(cap.hit)+1))
        (list, regex(`[0-9]+`));
    assert(newList == "#22 out of 46");
}
1275
/++
    A variation on $(LREF replaceFirst) that, rather than allocating a new
    string on each call, writes the result piece-wise to the $(D sink).
    In particular this enables efficient incremental construction
    of a final output.

    As in the $(LREF replaceFirst) family of functions there is an overload
    for substitution guided by the $(D format) string
    and one that takes a user defined callback.
+/
public @trusted void replaceFirstInto(Sink, R, C, RegEx)
        (ref Sink sink, R input, RegEx re, const(C)[] format)
if (isOutputRange!(Sink, dchar) && isSomeString!R
    && is(C : dchar) && isRegexFor!(RegEx, R))
{
    // find the first match, then emit the input with that match expanded
    auto firstHit = matchFirst(input, re);
    replaceCapturesInto!((captures, output) => replaceFmt(format, captures, output))
        (sink, input, firstHit);
}
1293
///ditto
public @trusted void replaceFirstInto(alias fun, Sink, R, RegEx)
    (Sink sink, R input, RegEx re)
if (isOutputRange!(Sink, dchar) && isSomeString!R && isRegexFor!(RegEx, R))
{
    // find the first match, then let fun decide what replaces it
    auto firstHit = matchFirst(input, re);
    replaceCapturesInto!fun(sink, input, firstHit);
}
1301
///
@system unittest
{
    import std.array;
    string m1 = "first message\n";
    string m2 = "second message\n";
    // both calls append to the same sink
    auto result = appender!string();
    replaceFirstInto(result, m1, regex(`([a-z]+) message`), "$1");
    //equivalent of the above with user-defined callback
    replaceFirstInto!(cap=>cap[1])(result, m2, regex(`([a-z]+) message`));
    assert(result.data == "first\nsecond\n");
}
1314
//examples for replaceFirst
@system unittest
{
    import std.conv;
    // functor overload: increment the first number only
    string list = "#21 out of 46";
    string newList = replaceFirst!(cap => to!string(to!int(cap.hit)+1))
        (list, regex(`[0-9]+`));
    assert(newList == "#22 out of 46");
    import std.array;
    // sink overloads: both calls append to the same appender
    string m1 = "first message\n";
    string m2 = "second message\n";
    auto result = appender!string();
    replaceFirstInto(result, m1, regex(`([a-z]+) message`), "$1");
    //equivalent of the above with user-defined callback
    replaceFirstInto!(cap=>cap[1])(result, m2, regex(`([a-z]+) message`));
    assert(result.data == "first\nsecond\n");
}
1332
/++
    Build a new string from $(D input) in which every fragment
    matching the pattern $(D re) is replaced by a string generated
    from the match according to the $(D format) specifier.

    To replace only the first match use $(LREF replaceFirst).

    Params:
    input = string to search
    re = compiled regular expression to use
    format = _format string to generate replacements from,
    see $(S_LINK Replace _format string, the _format string).

    Returns:
    A string of the same type as $(D input) with all
    of the matches (if any) replaced.
    If no match is found returns the input string itself.
+/
public @trusted R replaceAll(R, C, RegEx)(R input, RegEx re, const(C)[] format)
if (isSomeString!R && is(C : dchar) && isRegexFor!(RegEx, R))
{
    // each match is replaced by the expansion of the format string
    return input.replaceAllWith!(
        (captures, output) => replaceFmt(format, captures, output))(re);
}
1356
///
@system unittest
{
    // insert comma as thousands delimiter
    // (the pattern consists of lookarounds only, so every match is empty
    // and the comma is inserted without consuming any digit)
    auto re = regex(r"(?<=\d)(?=(\d\d\d)+\b)","g");
    assert(replaceAll("12000 + 42100 = 54100", re, ",") == "12,000 + 42,100 = 54,100");
}
1364
/++
    This is a general replacement tool that constructs a new string by
    replacing matches of pattern $(D re) in the $(D input). Unlike the other
    overload there is no format string; instead the captures are passed
    to a user-defined functor $(D fun) that returns a new string
    to use as replacement.

    This version replaces all of the matches found in $(D input),
    see $(LREF replaceFirst) to replace the first match only.

    Returns:
    A new string of the same type as $(D input) with all matches
    replaced by return values of $(D fun). If no matches found
    returns the $(D input) itself.

    Params:
    input = string to search
    re = compiled regular expression
    fun = delegate to use
+/
public @trusted R replaceAll(alias fun, R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    // feed whatever fun returns for each match straight into the output
    return input.replaceAllWith!(
        (captures, output) => output.put(fun(captures)))(re);
}
1390
///
@system unittest
{
    // callback invoked once per match; its result replaces the match
    string baz(Captures!(string) m)
    {
        import std.string : toUpper;
        return toUpper(m.hit);
    }
    // Capitalize the letters 'a' and 'r':
    auto s = replaceAll!(baz)("Strap a rocket engine on a chicken.",
            regex("[ar]"));
    assert(s == "StRAp A Rocket engine on A chicken.");
}
1404
/++
    A variation on $(LREF replaceAll) that, rather than allocating a new
    string on each call, writes the result piece-wise to the $(D sink).
    In particular this enables efficient incremental construction
    of a final output.

    As with $(LREF replaceAll) there are 2 overloads - one taking a format
    string, the other one taking a user defined functor.
+/
public @trusted void replaceAllInto(Sink, R, C, RegEx)
        (Sink sink, R input, RegEx re, const(C)[] format)
if (isOutputRange!(Sink, dchar) && isSomeString!R
    && is(C : dchar) && isRegexFor!(RegEx, R))
{
    // collect the matches lazily, then emit the input with each one expanded
    auto allHits = matchAll(input, re);
    replaceMatchesInto!((captures, output) => replaceFmt(format, captures, output))
        (sink, input, allHits);
}
1421
///ditto
public @trusted void replaceAllInto(alias fun, Sink, R, RegEx)
        (Sink sink, R input, RegEx re)
if (isOutputRange!(Sink, dchar) && isSomeString!R && isRegexFor!(RegEx, R))
{
    // collect the matches lazily, then let fun decide what replaces each one
    auto allHits = matchAll(input, re);
    replaceMatchesInto!fun(sink, input, allHits);
}
1429
///
@system unittest
{
    // insert comma as thousands delimiter in fifty randomly produced big numbers
    import std.array, std.conv, std.random, std.range;
    static re = regex(`(?<=\d)(?=(\d\d\d)+\b)`, "g");
    auto sink = appender!(char [])();
    enum ulong min = 10UL ^^ 10, max = 10UL ^^ 19;
    foreach (i; 0 .. 50)
    {
        // reuse the same buffer for every number
        sink.clear();
        replaceAllInto(sink, text(uniform(min, max)), re, ",");
        // walking backwards from the end, every 4th character must be a comma
        foreach (pos; iota(sink.data.length - 4, 0, -4))
            assert(sink.data[pos] == ',');
    }
}
1446
// exercise all of the replace APIs
@system unittest
{
    import std.array : appender;
    import std.conv;
    // try and check first/all simple substitution
    foreach (S; AliasSeq!(string, wstring, dstring, char[], wchar[], dchar[]))
    {
        // s1 contains a single match, s2 contains two:
        // tNF = expected result of replaceFirst, tNA = expected result of replaceAll
        S s1 = "curt trial".to!S();
        S s2 = "round dome".to!S();
        S t1F = "court trial".to!S();
        S t2F = "hound dome".to!S();
        S t1A = "court trial".to!S();
        S t2A = "hound home".to!S();
        auto re1 = regex("curt".to!S());
        auto re2 = regex("[dr]o".to!S());

        // format string overloads
        assert(replaceFirst(s1, re1, "court") == t1F);
        assert(replaceFirst(s2, re2, "ho") == t2F);
        assert(replaceAll(s1, re1, "court") == t1A);
        assert(replaceAll(s2, re2, "ho") == t2A);

        // functor overloads
        auto rep1 = replaceFirst!(cap => cap[0][0]~"o".to!S()~cap[0][1..$])(s1, re1);
        assert(rep1 == t1F);
        assert(replaceFirst!(cap => "ho".to!S())(s2, re2) == t2F);
        auto rep1A = replaceAll!(cap => cap[0][0]~"o".to!S()~cap[0][1..$])(s1, re1);
        assert(rep1A == t1A);
        assert(replaceAll!(cap => "ho".to!S())(s2, re2) == t2A);

        // sink overloads: the output accumulates in the same appender
        auto sink = appender!S();
        replaceFirstInto(sink, s1, re1, "court");
        assert(sink.data == t1F);
        replaceFirstInto(sink, s2, re2, "ho");
        assert(sink.data == t1F~t2F);
        replaceAllInto(sink, s1, re1, "court");
        assert(sink.data == t1F~t2F~t1A);
        replaceAllInto(sink, s2, re2, "ho");
        assert(sink.data == t1F~t2F~t1A~t2A);
    }
}
1487
/++
    Old API for replacement, operation depends on flags of pattern $(D re).
    With "g" flag it performs the equivalent of $(LREF replaceAll) otherwise it
    works the same as $(LREF replaceFirst).

    The use of this function is $(RED discouraged), please use $(LREF replaceAll)
    or $(LREF replaceFirst) explicitly.

    Params:
    scheme = match function used to locate the fragments to replace,
    $(LREF match) by default; $(LREF bmatch) may be passed instead
    input = string to search
    re = compiled regular expression to use
    format = _format string to generate replacements from

    Returns:
    A string of the same type as $(D input) with the matches replaced.
    If no match is found returns the input string itself.
+/
public R replace(alias scheme = match, R, C, RegEx)(R input, RegEx re, const(C)[] format)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    // honour the caller's choice of match function instead of always
    // falling back to match
    return replaceAllWith!((m, sink) => replaceFmt(format, m, sink), scheme)(input, re);
}
1501
///ditto
public R replace(alias fun, R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    // old API: matches are located with match, so the "g" flag of re applies
    return input.replaceAllWith!(fun, match)(re);
}
1508
/**
Splits a string `r` using a regular expression `pat` as a separator.

Params:
    keepSeparators = flag to specify if the matches should be in the resulting range
    r = the string to split
    pat = the pattern to split on
Returns:
    A lazy range of strings
*/
public struct Splitter(Flag!"keepSeparators" keepSeparators = No.keepSeparators, Range, alias RegEx = Regex)
if (isSomeString!Range && isRegexFor!(RegEx, Range))
{
private:
    // the string being split
    Range _input;
    // index into _input where the current front piece starts;
    // a value past _input.length marks the range as exhausted (see empty)
    size_t _offset;
    alias Rx = typeof(match(Range.init,RegEx.init));
    // matcher state for the separator following the current piece
    Rx _match;

    // when separators are kept: true while front is a separator (the hit
    // of _match), false while front is the text between separators
    static if (keepSeparators) bool onMatch = false;

    @trusted this(Range input, RegEx separator)
    {//@@@BUG@@@ generated opAssign of RegexMatch is not @trusted
        _input = input;
        // set the global flag on the by-value separator parameter
        separator.flags |= RegexOption.global;
        if (_input.empty)
        {
            //there is nothing to match at all, make _offset > 0
            _offset = 1;
        }
        else
        {
            _match = Rx(_input, separator);

            // nothing precedes the first separator: step over the empty
            // leading piece so that the separator itself comes first
            static if (keepSeparators)
                if (_match.pre.empty)
                    popFront();
        }
    }

public:
    // slicing without arguments yields a copy of the range (see save)
    auto ref opSlice()
    {
        return this.save;
    }

    ///Forward range primitives.
    @property Range front()
    {
        import std.algorithm.comparison : min;

        assert(!empty && _offset <= _match.pre.length
                && _match.pre.length <= _input.length);

        static if (keepSeparators)
        {
            if (!onMatch)
                // text between separators; min clamps the end of the slice
                // to the length of the input
                return _input[_offset .. min($, _match.pre.length)];
            else
                // the separator itself
                return _match.hit();
        }
        else
        {
            // text up to the next separator; min clamps the end of the slice
            // to the length of the input
            return _input[_offset .. min($, _match.pre.length)];
        }
    }

    ///ditto
    @property bool empty()
    {
        // the two modes use different end conditions: with kept separators
        // the range ends as soon as _offset reaches the end of the input,
        // otherwise only once _offset has moved past it
        static if (keepSeparators)
            return _offset >= _input.length;
        else
            return _offset > _input.length;
    }

    ///ditto
    void popFront()
    {
        assert(!empty);
        if (_match.empty)
        {
            //No more separators, work is done here
            _offset = _input.length + 1;
        }
        else
        {
            static if (keepSeparators)
            {
                if (!onMatch)
                {
                    //skip past the separator
                    _offset = _match.pre.length;
                }
                else
                {
                    // the separator has been consumed: move behind it
                    // and advance the matcher to the next separator
                    _offset += _match.hit.length;
                    _match.popFront();
                }

                // alternate between text pieces and separators
                onMatch = !onMatch;
            }
            else
            {
                //skip past the separator
                _offset = _match.pre.length + _match.hit.length;
                _match.popFront();
            }
        }
    }

    ///ditto
    @property auto save()
    {
        return this;
    }
}
1626
/// ditto
public Splitter!(keepSeparators, Range, RegEx) splitter(
    Flag!"keepSeparators" keepSeparators = No.keepSeparators, Range, RegEx)(Range r, RegEx pat)
if (
    is(BasicElementOf!Range : dchar) && isRegexFor!(RegEx, Range))
{
    // convenience function: deduces the template arguments of Splitter
    alias Result = Splitter!(keepSeparators, Range, RegEx);
    return Result(r, pat);
}
1635
///
@system unittest
{
    import std.algorithm.comparison : equal;
    auto s1 = ", abc, de, fg, hi, ";
    // separators at both ends of the input produce empty leading
    // and trailing pieces
    assert(equal(splitter(s1, regex(", *")),
        ["", "abc", "de", "fg", "hi", ""]));
}
1644
/// Split on a pattern, but keep the matches in the resulting range
@system unittest
{
    import std.algorithm.comparison : equal;
    import std.typecons : Yes;

    auto pattern = regex(`([\.,])`);

    // separators show up between the pieces
    assert("2003.04.05"
        .splitter!(Yes.keepSeparators)(pattern)
        .equal(["2003", ".", "04", ".", "05"]));

    // a leading separator comes first, with no empty piece before it
    assert(",1,2,3"
        .splitter!(Yes.keepSeparators)(pattern)
        .equal([",", "1", ",", "2", ",", "3"]));
}
1661
///An eager version of $(D splitter) that creates an array with the split slices of $(D input).
public @trusted String[] split(String, RegEx)(String input, RegEx rx)
if (isSomeString!String && isRegexFor!(RegEx, String))
{
    import std.array : appender;
    auto collected = appender!(String[])();
    // drain the lazy splitter, storing every piece it yields
    for (auto pieces = splitter(input, rx); !pieces.empty; pieces.popFront())
        collected.put(pieces.front);
    return collected.data;
}
1672
///Exception object thrown in case of errors during regex compilation
///(public alias of the type declared in std.regex.internal.ir).
public alias RegexException = std.regex.internal.ir.RegexException;
1675
/++
    A range that lazily produces a string output escaped
    to be used inside of a regular expression.

    Every element of $(D r) that is one of the regex special characters
    is preceded by a backslash in the output. The result is an input range;
    it is also a forward range whenever $(D r) is one.

    Params:
    r = range of characters to escape

    Returns:
    A lazy range over the escaped characters.
+/
auto escaper(Range)(Range r)
{
    import std.algorithm.searching : find;
    import std.range.primitives : isForwardRange;
    static immutable escapables = [Escapables];
    static struct Escaper // template to deduce attributes
    {
        Range r;
        // true while the backslash for the current element of r
        // is still to be emitted
        bool escaped;

        @property ElementType!Range front(){
          if (escaped)
              return '\\';
          else
              return r.front;
        }

        @property bool empty(){ return r.empty; }

        void popFront(){
          // the backslash has been consumed, the escaped character comes next
          if (escaped) escaped = false;
          else
          {
              r.popFront();
              // look ahead: does the next element need a backslash?
              if (!r.empty && !escapables.find(r.front).empty)
                  escaped = true;
          }
        }

        // save is only available when the underlying range can be saved,
        // which keeps escaper usable with plain input ranges
        static if (isForwardRange!Range)
            @property auto save(){ return Escaper(r.save, escaped); }
    }

    bool escaped = !r.empty && !escapables.find(r.front).empty;
    return Escaper(r, escaped);
}
1714
///
@system unittest
{
    import std.algorithm.comparison;
    import std.regex;
    string s = `This is {unfriendly} to *regex*`;
    // braces and asterisks each get a backslash in front of them
    assert(s.escaper.equal(`This is \{unfriendly\} to \*regex\*`));
}
1723
// escaper on strings of every character width, including the empty string
@system unittest
{
    import std.algorithm.comparison;
    import std.conv;
    foreach (S; AliasSeq!(string, wstring, dstring))
    {
        // input made of a single escapable character
        auto s = "^".to!S;
        assert(s.escaper.equal(`\^`));
        // empty input, converted to S so that each string type is covered
        auto s2 = "".to!S;
        assert(s2.escaper.equal(""));
    }
}
1736