1 // Written in the D programming language.
2 
3 /++
4     Encode and decode UTF-8, UTF-16 and UTF-32 strings.
5 
6     UTF character support is restricted to
7     $(D '\u0000' <= character <= '\U0010FFFF').
8 
9 $(SCRIPT inhibitQuickIndex = 1;)
10 $(BOOKTABLE,
11 $(TR $(TH Category) $(TH Functions))
12 $(TR $(TD Decode) $(TD
13     $(LREF decode)
14     $(LREF decodeFront)
15 ))
16 $(TR $(TD Lazy decode) $(TD
17     $(LREF byCodeUnit)
18     $(LREF byChar)
19     $(LREF byWchar)
20     $(LREF byDchar)
21     $(LREF byUTF)
22 ))
23 $(TR $(TD Encode) $(TD
24     $(LREF encode)
25     $(LREF toUTF8)
26     $(LREF toUTF16)
27     $(LREF toUTF32)
28     $(LREF toUTFz)
29     $(LREF toUTF16z)
30 ))
31 $(TR $(TD Length) $(TD
32     $(LREF codeLength)
33     $(LREF count)
34     $(LREF stride)
35     $(LREF strideBack)
36 ))
37 $(TR $(TD Index) $(TD
38     $(LREF toUCSindex)
39     $(LREF toUTFindex)
40 ))
41 $(TR $(TD Validation) $(TD
42     $(LREF isValidDchar)
43     $(LREF validate)
44 ))
45 $(TR $(TD Miscellaneous) $(TD
46     $(LREF replacementDchar)
47     $(LREF UseReplacementDchar)
48     $(LREF UTFException)
49 ))
50 )
51     See_Also:
52         $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
53         $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
54         $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
55     Copyright: Copyright Digital Mars 2000 - 2012.
56     License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
57     Authors:   $(HTTP digitalmars.com, Walter Bright) and Jonathan M Davis
58     Source:    $(PHOBOSSRC std/_utf.d)
59    +/
60 module std.utf;
61 
62 import std.exception;  // basicExceptionCtors
63 import std.meta;       // AliasSeq
64 import std.range.primitives;
65 import std.traits;     // isSomeChar, isSomeString
66 import std.typecons;   // Flag, Yes, No
67 
68 
69 /++
70     Exception thrown on errors in std.utf functions.
71   +/
class UTFException : Exception
{
    import core.internal.string : unsignedToTempString, UnsignedStringBuf;

    // Code units of the offending sequence; only the first `len` entries are set.
    uint[4] sequence;
    // Number of entries of `sequence` that are in use.
    size_t  len;

    // Stores up to four code units of the invalid sequence and returns this
    // exception so the call can be chained onto `new UTFException(...)`.
    @safe pure nothrow @nogc
    UTFException setSequence(scope uint[] data...)
    {
        assert(data.length <= 4);

        // Clamp to the capacity of `sequence` for builds where the assert is gone.
        if (data.length < 4)
            len = data.length;
        else
            len = 4;

        foreach (pos; 0 .. len)
            sequence[pos] = data[pos];

        return this;
    }

    // FIXME: Use std.exception.basicExceptionCtors here once bug #11500 is fixed

    // Plain constructor: forwards everything to Exception.
    this(string msg, string file = __FILE__, size_t line = __LINE__,
         Throwable next = null) @nogc @safe pure nothrow
    {
        super(msg, file, line, next);
    }

    // Constructor that appends the offending index to the message.
    this(string msg, size_t index, string file = __FILE__,
         size_t line = __LINE__, Throwable next = null) @safe pure nothrow
    {
        UnsignedStringBuf digits = void;
        msg ~= " (at index ";
        msg ~= unsignedToTempString(index, digits, 10);
        msg ~= ")";
        super(msg, file, line, next);
    }


    // Renders the recorded code units in hexadecimal followed by the message;
    // falls back to Exception.toString() when no sequence was recorded.
    override string toString() const
    {
        if (len != 0)
        {
            string text = "Invalid UTF sequence:";

            for (size_t k = 0; k < len; ++k)
            {
                UnsignedStringBuf scratch = void;
                auto hex = unsignedToTempString(sequence[k], scratch, 16);

                text ~= ' ';
                // Pad single-digit values to two digits.
                if (hex.length == 1)
                    text ~= '0';
                text ~= hex;
                text ~= 'x';
            }

            if (super.msg.length > 0)
            {
                text ~= " - ";
                text ~= super.msg;
            }

            return text;
        }

        /* Exception.toString() is not marked as const, although
         * it is const-compatible, hence the cast.
         */
        auto base = () @trusted { return cast(Exception) super; } ();
        return base.toString();
    }
}
141 
142 /*
143    Provide array of invalidly encoded UTF strings. Useful for testing.
144 
145    Params:
146         Char = char, wchar, or dchar
147 
148    Returns:
149         an array of invalidly encoded UTF strings
150  */
151 
package auto invalidUTFstrings(Char)() @safe pure @nogc nothrow
if (isSomeChar!Char)
{
    static if (is(Char == char))
    {
        enum loneSurrogate = 0xDC00;   // invalid surrogate value
        enum beyondMax     = 0x110000; // out of range

        static immutable string[8] result =
        [
            "\x80",             // not a start byte
            "\xC0",             // truncated
            "\xC0\xC0",         // invalid continuation
            "\xF0\x82\x82\xAC", // overlong
            [                   // surrogate forced into a 3 byte encoding
              cast(char)(0xE0 | (loneSurrogate >> 12)),
              cast(char)(0x80 | ((loneSurrogate >> 6) & 0x3F)),
              cast(char)(0x80 | (loneSurrogate & 0x3F))
            ],
            [                   // out-of-range value forced into a 4 byte encoding
              cast(char)(0xF0 | (beyondMax >> 18)),
              cast(char)(0x80 | ((beyondMax >> 12) & 0x3F)),
              cast(char)(0x80 | ((beyondMax >> 6) & 0x3F)),
              cast(char)(0x80 | (beyondMax & 0x3F))
            ],
            "\xFB\x83\x83\x83\x83",     // 5 byte encoding: 0xF8 | 3, then 0x80 | 3 four times
            "\xFF\x83\x83\x83\x83\x83", // 6 byte encoding: 0xFC | 3, then 0x80 | 3 five times
        ];

        return result[];
    }
    else static if (is(Char == wchar))
    {
        static immutable wstring[5] result =
        [
            [ cast(wchar) 0xDC00 ],
            [ cast(wchar) 0xDFFF ],
            [ cast(wchar) 0xDBFF, cast(wchar) 0xDBFF ],
            [ cast(wchar) 0xDBFF, cast(wchar) 0xE000 ],
            [ cast(wchar) 0xD800 ],
        ];

        return result[];
    }
    else static if (is(Char == dchar))
    {
        static immutable dstring[3] result =
        [
            [ cast(dchar) 0x110000 ],
            [ cast(dchar) 0x00D800 ],
            [ cast(dchar) 0x00DFFF ],
        ];

        return result;
    }
    else
        static assert(0);
}
235 
236 /++
237     Check whether the given Unicode code point is valid.
238 
239     Params:
240         c = code point to check
241 
242     Returns:
243         $(D true) iff $(D c) is a valid Unicode code point
244 
245     Note:
246     $(D '\uFFFE') and $(D '\uFFFF') are considered valid by $(D isValidDchar),
247     as they are permitted for internal use by an application, but they are
248     not allowed for interchange by the Unicode standard.
249   +/
bool isValidDchar(dchar c) pure nothrow @safe @nogc
{
    // The surrogate block U+D800 .. U+DFFF is rejected.
    if (c >= 0xD800 && c <= 0xDFFF)
        return false;

    // Anything else is accepted up to and including U+10FFFF.
    return c <= 0x10FFFF;
}
254 
pure nothrow @safe @nogc unittest
{
    import std.exception;

    // Code points isValidDchar must accept.
    static immutable dchar[] accepted =
    [
        cast(dchar) 'a',
        cast(dchar) 0x00FFFE,
        cast(dchar) 0x00FFFF,
        cast(dchar) 0x01FFFF,
        cast(dchar) 0x10FFFF,
    ];

    // Code points isValidDchar must reject: out of range values and surrogates.
    static immutable dchar[] rejected =
    [
        cast(dchar) 0x1FFFFF,
        cast(dchar) 0x00D800,
        cast(dchar) 0x00DBFF,
        cast(dchar) 0x00DC00,
        cast(dchar) 0x00DFFF,
        cast(dchar) 0x110000,
    ];

    assertCTFEable!(
    {
    foreach (cp; accepted)
        assert( isValidDchar(cp));

    foreach (cp; rejected)
        assert(!isValidDchar(cp));
    });
}
275 
276 
277 /++
278     Calculate the length of the UTF sequence starting at $(D index)
279     in $(D str).
280 
281     Params:
282         str = input range of UTF code units. Must be random access if
283         $(D index) is passed
284         index = starting index of UTF sequence (default: $(D 0))
285 
286     Returns:
287         The number of code units in the UTF sequence. For UTF-8, this is a
288         value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
289         For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.
290 
291     Throws:
292         May throw a $(D UTFException) if $(D str[index]) is not the start of a
293         valid UTF sequence.
294 
295     Note:
296         $(D stride) will only analyze the first $(D str[index]) element. It
297         will not fully verify the validity of the UTF sequence, nor even verify
298         the presence of the sequence: it will not actually guarantee that
299         $(D index + stride(str, index) <= str.length).
300   +/
uint stride(S)(auto ref S str, size_t index)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && is(Unqual!(ElementType!S) == char)))
{
    // Only sources that expose a length can be bounds-checked here.
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-8 sequence");

    immutable lead = str[index];

    // ASCII is a single code unit; everything else is classified by its lead byte.
    return lead < 0x80 ? 1 : strideImpl(lead, index);
}
314 
315 /// Ditto
uint stride(S)(auto ref S str)
if (is(S : const char[]) ||
    (isInputRange!S && is(Unqual!(ElementType!S) == char)))
{
    // Arrays are indexed directly; other ranges are read through `front`.
    static if (is(S : const char[]))
        immutable lead = str[0];
    else
        immutable lead = str.front;

    // ASCII is a single code unit; everything else is classified by its lead byte.
    return lead < 0x80 ? 1 : strideImpl(lead, 0);
}
330 
// Computes the length of a multi-byte UTF-8 sequence from its lead byte.
//
// Params:
//     c     = lead byte; must have its top bit set (checked by the in-contract)
//     index = position of `c` in the source, used only for the error message
//
// Returns: the sequence length, which is 2, 3 or 4.
//
// Throws: UTFException if `c` is a continuation byte or announces a sequence
//         longer than four code units.
private uint strideImpl(char c, size_t index) @trusted pure
in { assert(c & 0x80); }
body
{
    import core.bitop : bsr;

    // 0xFF has no zero bit, so the masked complement would be 0 and the result
    // of bsr(0) is undefined. Reject it before calling bsr.
    if (c != 0xFF)
    {
        // Number of leading one bits in `c` = 7 - position of its highest zero bit.
        immutable msbs = 7 - bsr((~uint(c)) & 0xFF);
        if (msbs >= 2 && msbs <= 4)
            return msbs;
    }

    throw new UTFException("Invalid UTF-8 sequence", index);
}
341 
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;

    // Checks that stride reports codeLength!char(c) for the sequence starting at
    // code unit i of s, for a string and for each of the test range wrappers.
    static void test(string s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        // Fails with an AssertError attributed to the calling line.
        void require(bool ok, string what)
        {
            if (!ok)
                throw new AssertError(format("Unit test failure " ~ what ~ ": %s", s), __FILE__, line);
        }

        require(stride(s, i) == codeLength!char(c), "string");
        require(stride(RandomCU!char(s), i) == codeLength!char(c), "range");

        // A reference range must not be consumed by stride.
        auto refRandom = new RefRandomCU!char(s);
        immutable randLen = refRandom.length;
        require(stride(refRandom, i) == codeLength!char(c), "rand ref range");
        require(refRandom.length == randLen, "rand ref range length");

        // The overloads without an index can only be checked at the front.
        if (i != 0)
            return;

        require(stride(s) == codeLength!char(c), "string 0");
        require(stride(InputCU!char(s)) == codeLength!char(c), "range 0");

        auto refBidir = new RefBidirCU!char(s);
        immutable bidirLen = refBidir.length;
        require(stride(refBidir) == codeLength!char(c), "bidir ref range code length");
        require(refBidir.length == bidirLen, "bidir ref range length");
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'a');
    test("hello\U00010143\u0100\U00010143", 'h', 0);
    test("hello\U00010143\u0100\U00010143", 'e', 1);
    test("hello\U00010143\u0100\U00010143", 'l', 2);
    test("hello\U00010143\u0100\U00010143", 'l', 3);
    test("hello\U00010143\u0100\U00010143", 'o', 4);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
    test("hello\U00010143\u0100\U00010143", '\u0100', 9);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 11);

    // Both overloads must be inferred @safe and pure for every array flavour.
    foreach (S; AliasSeq!(char[], const char[], string))
    {
        enum str = to!S("hello world");
        static assert(isSafe!({ stride(str, 0); }));
        static assert(isSafe!({ stride(str);    }));
        static assert((functionAttributes!({ stride(str, 0); }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({ stride(str);    }) & FunctionAttribute.pure_) != 0);
    }
    });
}
408 
@safe unittest // invalid start bytes
{
    import std.exception : assertThrown;

    // None of these bytes may begin a UTF-8 sequence.
    immutable char[] invalidStartBytes = [
        0xF8, // 0b1111_1000: would indicate a sequence length of 5
        0xFC, // 0b1111_1100: 6
        0xFE, // 0b1111_1110: 7
        0xFF, // 0b1111_1111: 8
        0x80, // 0b1000_0000: continuation byte
    ];

    foreach (lead; invalidStartBytes)
    {
        assertThrown!UTFException(stride([lead]));
    }
}
422 
423 /// Ditto
uint stride(S)(auto ref S str, size_t index)
if (is(S : const wchar[]) ||
    (isRandomAccessRange!S && is(Unqual!(ElementType!S) == wchar)))
{
    // Only sources that expose a length can be bounds-checked here.
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-16 sequence");

    immutable uint unit = str[index];

    // A unit in 0xD800 .. 0xDBFF opens a two-unit surrogate pair.
    immutable bool opensPair = unit >= 0xD800 && unit <= 0xDBFF;
    return opensPair ? 2 : 1;
}
433 
434 /// Ditto
uint stride(S)(auto ref S str) @safe pure
if (is(S : const wchar[]))
{
    // Same check and classification as the indexed overload, fixed at index 0.
    assert(0 < str.length, "Past the end of the UTF-16 sequence");

    immutable uint unit = str[0];

    // A unit in 0xD800 .. 0xDBFF opens a two-unit surrogate pair.
    return 1 + (unit >= 0xD800 && unit <= 0xDBFF);
}
440 
441 /// Ditto
uint stride(S)(auto ref S str)
if (isInputRange!S && is(Unqual!(ElementType!S) == wchar))
{
    assert(!str.empty, "UTF-16 sequence is empty");

    immutable uint unit = str.front;

    // A unit in 0xD800 .. 0xDBFF opens a two-unit surrogate pair.
    if (unit >= 0xD800 && unit <= 0xDBFF)
        return 2;
    return 1;
}
449 
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;

    // Checks that stride reports codeLength!wchar(c) for the sequence starting at
    // code unit i of s, for a wstring and for each of the test range wrappers.
    static void test(wstring s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        // Fails with an AssertError attributed to the calling line.
        void require(bool ok, string what)
        {
            if (!ok)
                throw new AssertError(format("Unit test failure " ~ what ~ ": %s", s), __FILE__, line);
        }

        require(stride(s, i) == codeLength!wchar(c), "string");
        require(stride(RandomCU!wchar(s), i) == codeLength!wchar(c), "range");

        // A reference range must not be consumed by stride.
        auto refRandom = new RefRandomCU!wchar(s);
        immutable randLen = refRandom.length;
        require(stride(refRandom, i) == codeLength!wchar(c), "rand ref range");
        require(refRandom.length == randLen, "rand ref range length");

        // The overloads without an index can only be checked at the front.
        if (i != 0)
            return;

        require(stride(s) == codeLength!wchar(c), "string 0");
        require(stride(InputCU!wchar(s)) == codeLength!wchar(c), "range 0");

        auto refBidir = new RefBidirCU!wchar(s);
        immutable bidirLen = refBidir.length;
        require(stride(refBidir) == codeLength!wchar(c), "bidir ref range code length");
        require(refBidir.length == bidirLen, "bidir ref range length");
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'a');
    test("hello\U00010143\u0100\U00010143", 'h', 0);
    test("hello\U00010143\u0100\U00010143", 'e', 1);
    test("hello\U00010143\u0100\U00010143", 'l', 2);
    test("hello\U00010143\u0100\U00010143", 'l', 3);
    test("hello\U00010143\u0100\U00010143", 'o', 4);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
    test("hello\U00010143\u0100\U00010143", '\u0100', 7);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 8);

    // Both overloads must be inferred @safe and pure for every array flavour.
    foreach (S; AliasSeq!(wchar[], const wchar[], wstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => stride(str, 0)));
        static assert(isSafe!(() => stride(str)   ));
        static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
}
516 
517 /// Ditto
uint stride(S)(auto ref S str, size_t index = 0)
if (is(S : const dchar[]) ||
    (isInputRange!S && is(Unqual!(ElementEncodingType!S) == dchar)))
{
    // Bounds-check against the length when there is one, otherwise just
    // require a non-empty source.
    enum bool knowsLength = is(typeof(str.length) : ulong);

    static if (knowsLength)
        assert(index < str.length, "Past the end of the UTF-32 sequence");
    else
        assert(!str.empty, "UTF-32 sequence is empty.");

    // UTF-32 is fixed width: one code unit per code point.
    return 1;
}
528 
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;

    // Checks that stride reports codeLength!dchar(c) for the sequence starting at
    // code unit i of s, for a dstring and for each of the test range wrappers.
    static void test(dstring s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        // Fails with an AssertError attributed to the calling line.
        void require(bool ok, string what)
        {
            if (!ok)
                throw new AssertError(format("Unit test failure " ~ what ~ ": %s", s), __FILE__, line);
        }

        require(stride(s, i) == codeLength!dchar(c), "string");
        require(stride(RandomCU!dchar(s), i) == codeLength!dchar(c), "range");

        // A reference range must not be consumed by stride.
        auto refRandom = new RefRandomCU!dchar(s);
        immutable randLen = refRandom.length;
        require(stride(refRandom, i) == codeLength!dchar(c), "rand ref range");
        require(refRandom.length == randLen, "rand ref range length");

        // The overloads without an index can only be checked at the front.
        if (i != 0)
            return;

        require(stride(s) == codeLength!dchar(c), "string 0");
        require(stride(InputCU!dchar(s)) == codeLength!dchar(c), "range 0");

        auto refBidir = new RefBidirCU!dchar(s);
        immutable bidirLen = refBidir.length;
        require(stride(refBidir) == codeLength!dchar(c), "bidir ref range code length");
        require(refBidir.length == bidirLen, "bidir ref range length");
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'a');
    test("hello\U00010143\u0100\U00010143", 'h', 0);
    test("hello\U00010143\u0100\U00010143", 'e', 1);
    test("hello\U00010143\u0100\U00010143", 'l', 2);
    test("hello\U00010143\u0100\U00010143", 'l', 3);
    test("hello\U00010143\u0100\U00010143", 'o', 4);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
    test("hello\U00010143\u0100\U00010143", '\u0100', 6);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 7);

    // Both call forms must be inferred @safe and pure for every array flavour.
    foreach (S; AliasSeq!(dchar[], const dchar[], dstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => stride(str, 0)));
        static assert(isSafe!(() => stride(str)   ));
        static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
}
595 
596 /++
597     Calculate the length of the UTF sequence ending one code unit before
598     $(D index) in $(D str).
599 
600     Params:
601         str = bidirectional range of UTF code units. Must be random access if
602         $(D index) is passed
603         index = index one past end of UTF sequence (default: $(D str.length))
604 
605     Returns:
606         The number of code units in the UTF sequence. For UTF-8, this is a
607         value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
608         For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.
609 
610     Throws:
611         May throw a $(D UTFException) if $(D str[index]) is not one past the
612         end of a valid UTF sequence.
613 
614     Note:
        For UTF-8, $(D strideBack) inspects at most the four code units
        preceding $(D index); for UTF-16, only $(D str[index - 1]). It will not
        fully verify the validity of the UTF sequence, nor even verify the
        presence of the sequence: it will not actually guarantee that
        $(D strideBack(str, index) <= index).
619   +/
uint strideBack(S)(auto ref S str, size_t index)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && is(Unqual!(ElementType!S) == char)))
{
    // Only sources that expose a length can be bounds-checked here.
    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-8 sequence");
    assert(index > 0, "Not the end of the UTF-8 sequence");

    // A last unit that is not a continuation byte (10xx_xxxx) stands alone.
    if ((str[index - 1] & 0b1100_0000) != 0b1000_0000)
        return 1;

    // Walk further back, never before the start of the source and never more
    // than four units in total, until a non-continuation byte is found.
    immutable size_t furthest = index < 4 ? index : 4;
    for (uint back = 2; back <= furthest; ++back)
    {
        if ((str[index - back] & 0b1100_0000) != 0b1000_0000)
            return back;
    }

    throw new UTFException("Not the end of the UTF sequence", index);
}
649 
650 /// Ditto
uint strideBack(S)(auto ref S str)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && hasLength!S && is(Unqual!(ElementType!S) == char)))
{
    // The sequence of interest ends at the very end of the source.
    auto onePastEnd = str.length;
    return strideBack(str, onePastEnd);
}
657 
658 /// Ditto
uint strideBack(S)(auto ref S str)
if (isBidirectionalRange!S && is(Unqual!(ElementType!S) == char) && !isRandomAccessRange!S)
{
    assert(!str.empty, "Past the end of the UTF-8 sequence");

    // Work on a saved copy so the caller's range is not popped.
    auto cursor = str.save;

    // Step back over continuation bytes (10xx_xxxx), at most four units.
    for (uint seen = 1; seen <= 4; ++seen)
    {
        if ((cursor.back & 0b1100_0000) != 0b1000_0000)
            return seen;

        cursor.popBack();
        if (cursor.empty)
            break;
    }

    throw new UTFException("The last code unit is not the end of the UTF-8 sequence");
}
674 
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;

    // Checks that strideBack reports codeLength!char(c) for the sequence ending
    // just before code unit i of s (or at the end of s when i is size_t.max),
    // for a string and for each of the test range wrappers.
    static void test(string s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        // Fails with an AssertError attributed to the calling line.
        void require(bool ok, string what)
        {
            if (!ok)
                throw new AssertError(format("Unit test failure " ~ what ~ ": %s", s), __FILE__, line);
        }

        // size_t.max stands for "one past the last code unit".
        immutable size_t end = i == size_t.max ? s.length : i;

        require(strideBack(s, end) == codeLength!char(c), "string");
        require(strideBack(RandomCU!char(s), end) == codeLength!char(c), "range");

        // A reference range must not be consumed by strideBack.
        auto refRandom = new RefRandomCU!char(s);
        immutable randLen = refRandom.length;
        require(strideBack(refRandom, end) == codeLength!char(c), "rand ref range");
        require(refRandom.length == randLen, "rand ref range length");

        // The overloads without an index can only be checked at the back.
        if (i != size_t.max)
            return;

        require(strideBack(s) == codeLength!char(c), "string code length");
        require(strideBack(BidirCU!char(s)) == codeLength!char(c), "range code length");

        auto refBidir = new RefBidirCU!char(s);
        immutable bidirLen = refBidir.length;
        require(strideBack(refBidir) == codeLength!char(c), "bidir ref range code length");
        require(refBidir.length == bidirLen, "bidir ref range length");
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'f');
    test("\U00010143\u0100\U00010143hello", 'o', 15);
    test("\U00010143\u0100\U00010143hello", 'l', 14);
    test("\U00010143\u0100\U00010143hello", 'l', 13);
    test("\U00010143\u0100\U00010143hello", 'e', 12);
    test("\U00010143\u0100\U00010143hello", 'h', 11);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 10);
    test("\U00010143\u0100\U00010143hello", '\u0100', 6);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 4);

    // Both overloads must be inferred @safe and pure for every array flavour.
    foreach (S; AliasSeq!(char[], const char[], string))
    {
        enum str = to!S("hello world");
        static assert(isSafe!({ strideBack(str, 0); }));
        static assert(isSafe!({ strideBack(str);    }));
        static assert((functionAttributes!({ strideBack(str, 0); }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({ strideBack(str);    }) & FunctionAttribute.pure_) != 0);
    }
    });
}
741 
742 //UTF-16 is self synchronizing: The length of strideBack can be found from
743 //the value of a single wchar
744 /// Ditto
uint strideBack(S)(auto ref S str, size_t index)
if (is(S : const wchar[]) ||
    (isRandomAccessRange!S && is(Unqual!(ElementType!S) == wchar)))
{
    // Only sources that expose a length can be bounds-checked here.
    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-16 sequence");
    assert(index > 0, "Not the end of a UTF-16 sequence");

    immutable last = str[index - 1];

    // A unit in 0xDC00 .. 0xDFFF closes a two-unit surrogate pair.
    immutable bool closesPair = last >= 0xDC00 && last < 0xE000;
    return closesPair ? 2 : 1;
}
756 
757 /// Ditto
uint strideBack(S)(auto ref S str)
if (is(S : const wchar[]) ||
    (isBidirectionalRange!S && is(Unqual!(ElementType!S) == wchar)))
{
    assert(!str.empty, "UTF-16 sequence is empty");

    // Arrays are indexed directly; other ranges are read through `back`.
    static if (is(S : const(wchar)[]))
        immutable c2 = str[$ - 1];
    else
        immutable c2 = str.back;

    // A unit in 0xDC00 .. 0xDFFF closes a two-unit surrogate pair. The upper
    // bound is exclusive: U+E000 is an ordinary BMP code point, so it must
    // yield 1, matching the indexed overload.
    return 1 + (0xDC00 <= c2 && c2 < 0xE000);
}
771 
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;

    // Checks that strideBack reports codeLength!wchar(c) for the sequence ending
    // just before code unit i of s (or at the end of s when i is size_t.max),
    // for a wstring and for each of the test range wrappers.
    static void test(wstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        // Fails with an AssertError attributed to the calling line.
        void require(bool ok, string what)
        {
            if (!ok)
                throw new AssertError(format("Unit test failure " ~ what ~ ": %s", s), __FILE__, line);
        }

        // size_t.max stands for "one past the last code unit".
        immutable size_t end = i == size_t.max ? s.length : i;

        require(strideBack(s, end) == codeLength!wchar(c), "string");
        require(strideBack(RandomCU!wchar(s), end) == codeLength!wchar(c), "range");

        // A reference range must not be consumed by strideBack.
        auto refRandom = new RefRandomCU!wchar(s);
        immutable randLen = refRandom.length;
        require(strideBack(refRandom, end) == codeLength!wchar(c), "rand ref range");
        require(refRandom.length == randLen, "rand ref range length");

        // The overloads without an index can only be checked at the back.
        if (i != size_t.max)
            return;

        require(strideBack(s) == codeLength!wchar(c), "string code length");
        require(strideBack(BidirCU!wchar(s)) == codeLength!wchar(c), "range code length");

        auto refBidir = new RefBidirCU!wchar(s);
        immutable bidirLen = refBidir.length;
        require(strideBack(refBidir) == codeLength!wchar(c), "bidir ref range code length");
        require(refBidir.length == bidirLen, "bidir ref range length");
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'f');
    test("\U00010143\u0100\U00010143hello", 'o', 10);
    test("\U00010143\u0100\U00010143hello", 'l', 9);
    test("\U00010143\u0100\U00010143hello", 'l', 8);
    test("\U00010143\u0100\U00010143hello", 'e', 7);
    test("\U00010143\u0100\U00010143hello", 'h', 6);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 5);
    test("\U00010143\u0100\U00010143hello", '\u0100', 3);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 2);

    // Both overloads must be inferred @safe and pure for every array flavour.
    foreach (S; AliasSeq!(wchar[], const wchar[], wstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => strideBack(str, 0)));
        static assert(isSafe!(() => strideBack(str)   ));
        static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => strideBack(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
}
838 
/// Ditto
uint strideBack(S)(auto ref S str, size_t index)
if (isRandomAccessRange!S && is(Unqual!(ElementEncodingType!S) == dchar))
{
    // A UTF-32 code unit is always a whole code point, so the result is
    // fixed; the asserts below only validate the caller's index.
    enum uint utf32Stride = 1;

    // The upper bound can only be verified when the range exposes a length.
    static if (is(typeof(str.length) : ulong))
    {
        assert(index <= str.length, "Past the end of the UTF-32 sequence");
    }
    assert(index != 0, "Not the end of the UTF-32 sequence");

    return utf32Stride;
}
848 
/// Ditto
uint strideBack(S)(auto ref S str)
if (isBidirectionalRange!S && is(Unqual!(ElementEncodingType!S) == dchar))
{
    // The last code point of a UTF-32 range always occupies a single code
    // unit; only emptiness has to be ruled out.
    enum uint utf32Stride = 1;

    assert(!str.empty, "Empty UTF-32 sequence");

    return utf32Stride;
}
856 
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;

    // Checks that every dchar overload of strideBack reports the code length
    // of `c` for the code point ending at index `i` of `s`. When `i` is left
    // at size_t.max the end of `s` is used and the index-less overloads are
    // exercised too. Reference ranges must keep their length afterwards.
    static void test(dstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        immutable atEnd = i == size_t.max;
        immutable pos = atEnd ? s.length : i;
        immutable expected = codeLength!dchar(c);

        enforce(strideBack(s, pos) == expected,
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));

        enforce(strideBack(RandomCU!dchar(s), pos) == expected,
                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));

        auto randomRef = new RefRandomCU!dchar(s);
        immutable randomLenBefore = randomRef.length;
        enforce(strideBack(randomRef, pos) == expected,
                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
        enforce(randomRef.length == randomLenBefore,
                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));

        // The index-less overloads only make sense when testing the very end.
        if (!atEnd)
            return;

        enforce(strideBack(s) == expected,
                new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line));

        enforce(strideBack(BidirCU!dchar(s)) == expected,
                new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line));

        auto bidirRef = new RefBidirCU!dchar(s);
        immutable bidirLenBefore = bidirRef.length;
        enforce(strideBack(bidirRef) == expected,
                new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
        enforce(bidirRef.length == bidirLenBefore,
                new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
    }

    assertCTFEable!(
    {
    // Single code points, checked from the end of the string.
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'f');

    // Explicit indices: in UTF-32 every code point is one code unit wide.
    test("\U00010143\u0100\U00010143hello", 'o', 8);
    test("\U00010143\u0100\U00010143hello", 'l', 7);
    test("\U00010143\u0100\U00010143hello", 'l', 6);
    test("\U00010143\u0100\U00010143hello", 'e', 5);
    test("\U00010143\u0100\U00010143hello", 'h', 4);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 3);
    test("\U00010143\u0100\U00010143hello", '\u0100', 2);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 1);

    // Attribute inference: both overloads must be @safe and pure for arrays.
    foreach (S; AliasSeq!(dchar[], const dchar[], dstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => strideBack(str, 0)));
        static assert(isSafe!(() => strideBack(str)   ));
        static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => strideBack(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
}
923 
924 
/++
    Converts a code unit index into a code point index.

    Given $(D index) into $(D str) and assuming that $(D index) is at the start
    of a UTF sequence, $(D toUCSindex) determines the number of UCS characters
    up to $(D index). So, $(D index) is the index of a code unit at the
    beginning of a code point, and the return value is how many code points into
    the string that that code point is.

    Params:
        str = the string to measure
        index = index of a code unit that starts a code point in $(D str)

    Returns:
        The number of code points that precede $(D index). For UTF-32 this is
        $(D index) itself.

    Throws:
        $(LREF UTFException) if walking $(D str) code point by code point
        steps over $(D index) instead of landing on it.
  +/
size_t toUCSindex(C)(const(C)[] str, size_t index) @safe pure
if (isSomeChar!C)
{
    static if (is(Unqual!C == dchar))
    {
        // One code unit per code point: the two indices coincide.
        return index;
    }
    else
    {
        size_t codePoints = 0;
        size_t pos = 0;

        // Hop from code point to code point until index is reached or passed.
        while (pos < index)
        {
            pos += stride(str, pos);
            ++codePoints;
        }

        if (pos == index)
            return codePoints;

        // index pointed into the middle of a multi-unit sequence.
        enum message = is(Unqual!C == char) ? "Invalid UTF-8 sequence"
                                            : "Invalid UTF-16 sequence";
        throw new UTFException(message, index);
    }
}
956 
///
@safe unittest
{
    // ASCII only: code unit and code point indices agree in every encoding.
    assert(toUCSindex(`hello world`, 7) == 7);
    assert(toUCSindex(`hello world`w, 7) == 7);
    assert(toUCSindex(`hello world`d, 7) == 7);

    // 'é' occupies two UTF-8 code units, so UTF-8 index 7 is code point 6.
    assert(toUCSindex(`Ma Chérie`, 7) == 6);
    assert(toUCSindex(`Ma Chérie`w, 7) == 7);
    assert(toUCSindex(`Ma Chérie`d, 7) == 7);

    // Each of the first three characters takes three UTF-8 code units.
    assert(toUCSindex(`さいごの果実 / ミツバチと科学者`, 9) == 3);
    assert(toUCSindex(`さいごの果実 / ミツバチと科学者`w, 9) == 9);
    assert(toUCSindex(`さいごの果実 / ミツバチと科学者`d, 9) == 9);
}
972 
973 
/++
    Converts a code point index into a code unit index.

    Given a UCS index $(D n) into $(D str), returns the UTF index.
    So, $(D n) is how many code points into the string the code point is, and
    the array index of the code unit is returned.

    Params:
        str = the string to index into
        n = number of code points to skip from the start of $(D str)

    Returns:
        The index of the first code unit of the $(D n)th code point. For
        UTF-32 this is $(D n) itself.
  +/
size_t toUTFindex(C)(const(C)[] str, size_t n) @safe pure
if (isSomeChar!C)
{
    static if (is(Unqual!C == dchar))
        return n; // one code unit per code point
    else
    {
        size_t offset = 0;

        // Skip n code points, accumulating the width of each one.
        for (size_t remaining = n; remaining != 0; --remaining)
            offset += stride(str, offset);

        return offset;
    }
}
996 
///
@safe unittest
{
    // ASCII only: code point and code unit indices agree in every encoding.
    assert(toUTFindex(`hello world`, 7) == 7);
    assert(toUTFindex(`hello world`w, 7) == 7);
    assert(toUTFindex(`hello world`d, 7) == 7);

    // 'é' occupies two UTF-8 code units, so code point 6 starts at index 7.
    assert(toUTFindex(`Ma Chérie`, 6) == 7);
    assert(toUTFindex(`Ma Chérie`w, 7) == 7);
    assert(toUTFindex(`Ma Chérie`d, 7) == 7);

    // Each of the first three characters takes three UTF-8 code units.
    assert(toUTFindex(`さいごの果実 / ミツバチと科学者`, 3) == 9);
    assert(toUTFindex(`さいごの果実 / ミツバチと科学者`w, 9) == 9);
    assert(toUTFindex(`さいごの果実 / ミツバチと科学者`d, 9) == 9);
}
1012 
1013 
1014 /* =================== Decode ======================= */
1015 
/++
    Whether or not to replace invalid UTF with $(LREF replacementDchar).

    Passed as a template argument to the decoding functions:
    $(D Yes.useReplacementDchar) makes them return $(LREF replacementDchar)
    for invalid input, $(D No.useReplacementDchar) (their default) makes them
    throw a $(LREF UTFException) instead.
  +/
alias UseReplacementDchar = Flag!"useReplacementDchar";
1018 
/++
    Decodes and returns the code point starting at $(D str[index]). $(D index)
    is advanced to one past the decoded code point. If the code point is not
    well-formed, then a $(D UTFException) is thrown and $(D index) remains
    unchanged.

    decode will only work with strings and random access ranges of code units
    with length and slicing, whereas $(LREF decodeFront) will work with any
    input range of code units.

    Params:
        useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
        str = input string or indexable Range
        index = starting index into $(D str); incremented by number of code units processed

    Returns:
        decoded character

    Throws:
        $(LREF UTFException) if $(D str[index]) is not the start of a valid UTF
        sequence and useReplacementDchar is $(D No.useReplacementDchar)
  +/
dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(auto ref S str, ref size_t index)
if (!isSomeString!S &&
    isRandomAccessRange!S && hasSlicing!S && hasLength!S && isSomeChar!(ElementType!S))
in
{
    assert(index < str.length, "Attempted to decode past the end of a string");
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    // A code unit at or above the limit cannot stand on its own as a code
    // point, so the full decoder has to look at it.
    if (str[index] >= codeUnitLimit!S)
        return decodeImpl!(true, useReplacementDchar)(str, index);

    // Fast path: the code unit is the code point.
    return str[index++];
}
1059 
// String overload of decode. Same contract as the range overload above: the
// code point starting at str[index] is returned and index is moved past it.
dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
auto ref S str, ref size_t index) @trusted pure
if (isSomeString!S)
in
{
    assert(index < str.length, "Attempted to decode past the end of a string");
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    // A code unit at or above the limit cannot stand on its own as a code
    // point, so the full decoder has to look at it.
    if (str[index] >= codeUnitLimit!S)
        return decodeImpl!(true, useReplacementDchar)(str, index);

    // Fast path: the code unit is the code point.
    return str[index++];
}
1078 
/++
    $(D decodeFront) is a variant of $(LREF decode) which specifically decodes
    the first code point. Unlike $(LREF decode), $(D decodeFront) accepts any
    input range of code units (rather than just a string or random access
    range). It also takes the range by $(D ref) and pops off the elements as it
    decodes them. If $(D numCodeUnits) is passed in, it gets set to the number
    of code units which were in the code point which was decoded.

    Params:
        useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
        str = input string or input range of code units
        numCodeUnits = set to number of code units processed

    Returns:
        decoded character

    Throws:
        $(LREF UTFException) if $(D str.front) is not the start of a valid UTF
        sequence. If an exception is thrown, then there is no guarantee as to
        the number of code units which were popped off, as it depends on the
        type of range being used and how many code units had to be popped off
        before the code point was determined to be invalid.
  +/
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
ref S str, out size_t numCodeUnits)
if (!isSomeString!S && isInputRange!S && isSomeChar!(ElementType!S))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    immutable first = str.front;

    if (first >= codeUnitLimit!S)
    {
        //@@@BUG@@@ 14447 forces canIndex to be done outside of decodeImpl, which
        //is undesirable, since not all overloads of decodeImpl need it. So, it
        //should be moved back into decodeImpl once bug# 8521 has been fixed.
        enum canIndex = isRandomAccessRange!S && hasSlicing!S && hasLength!S;

        // numCodeUnits starts at 0 (it is an out parameter) and doubles as
        // the index that decodeImpl advances.
        immutable decoded = decodeImpl!(canIndex, useReplacementDchar)(str, numCodeUnits);

        // Indexable ranges are only read by decodeImpl and still have to be
        // shortened here; the other range types were already popped by it.
        static if (canIndex)
            str = str[numCodeUnits .. str.length];

        return decoded;
    }

    // Fast path: a single code unit that is a complete code point.
    str.popFront();
    numCodeUnits = 1;
    return first;
}
1138 
// String overload of decodeFront: decodes the first code point of str, stores
// the number of code units it occupied in numCodeUnits and slices them off.
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
ref S str, out size_t numCodeUnits) @trusted pure
if (isSomeString!S)
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    dchar decoded;

    if (str[0] < codeUnitLimit!S)
    {
        // Fast path: a single code unit that is a complete code point.
        numCodeUnits = 1;
        decoded = str[0];
    }
    else
    {
        // numCodeUnits starts at 0 (it is an out parameter) and doubles as
        // the index that decodeImpl advances.
        decoded = decodeImpl!(true, useReplacementDchar)(str, numCodeUnits);
    }

    // Drop exactly the code units that made up the decoded code point.
    str = str[numCodeUnits .. $];
    return decoded;
}
1166 
/++ Ditto +/
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str)
if (isInputRange!S && isSomeChar!(ElementType!S))
{
    // The caller does not want the code unit count; collect it and drop it.
    size_t discardedCount;
    return decodeFront!useReplacementDchar(str, discardedCount);
}
1174 
/++
    $(D decodeBack) is a variant of $(LREF decode) which specifically decodes
    the last code point. Unlike $(LREF decode), $(D decodeBack) accepts any
    bidirectional range of code units (rather than just a string or random access
    range). It also takes the range by $(D ref) and pops off the elements as it
    decodes them. If $(D numCodeUnits) is passed in, it gets set to the number
    of code units which were in the code point which was decoded.

    Params:
        useReplacementDchar = if invalid UTF, return `replacementDchar` rather than throwing
        str = input string or bidirectional Range
        numCodeUnits = gives the number of code units processed

    Returns:
        A decoded UTF character.

    Throws:
        $(LREF UTFException) if $(D str.back) is not the end of a valid UTF
        sequence. If an exception is thrown, the $(D str) itself remains unchanged,
        but there is no guarantee as to the value of $(D numCodeUnits) (when passed).
  +/
dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    ref S str, out size_t numCodeUnits)
if (isSomeString!S)
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    if (str[$ - 1] >= codeUnitLimit!S)
    {
        // Find where the last code point starts, decode it from there, and
        // only then cut it off so that a throwing decode leaves str intact.
        numCodeUnits = strideBack(str);
        immutable start = str.length - numCodeUnits;
        size_t cursor = start;
        immutable decoded = decodeImpl!(true, useReplacementDchar)(str, cursor);
        str = str[0 .. start];
        return decoded;
    }

    // Fast path: the last code unit is a complete code point.
    numCodeUnits = 1;
    immutable last = str[$ - 1];
    str = str[0 .. $ - 1];
    return last;
}
1226 
/++ Ditto +/
dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    ref S str, out size_t numCodeUnits)
if (!isSomeString!S && isSomeChar!(ElementType!S) && isBidirectionalRange!S
    && ((isRandomAccessRange!S && hasLength!S) || !isRandomAccessRange!S))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    if (str.back < codeUnitLimit!S)
    {
        // Fast path: the last code unit is a complete code point.
        numCodeUnits = 1;
        immutable last = str.back;
        str.popBack();
        return last;
    }

    numCodeUnits = strideBack(str);

    static if (isRandomAccessRange!S)
    {
        // Decode in place from the start of the last code point and pop it
        // only after decoding succeeded.
        size_t cursor = str.length - numCodeUnits;
        immutable decoded = decodeImpl!(true, useReplacementDchar)(str, cursor);
        str.popBackExactly(numCodeUnits);
        return decoded;
    }
    else
    {
        // Without indexing, the trailing code units are copied (in their
        // original order) from a saved copy of the range into a small
        // buffer, which is then decoded like a string.
        alias Char = Unqual!(ElementType!S);
        Char[4] buffer;
        S rest = str.save;
        foreach_reverse (slot; 0 .. numCodeUnits)
        {
            buffer[slot] = rest.back;
            rest.popBack();
        }

        const Char[] sequence = buffer[0 .. numCodeUnits];
        size_t cursor = 0;
        immutable decoded = decodeImpl!(true, useReplacementDchar)(sequence, cursor);

        // Commit the shortened range only after decoding succeeded.
        str = rest;
        return decoded;
    }
}
1277 
/++ Ditto +/
dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str)
if (isSomeString!S
    || (isRandomAccessRange!S && hasLength!S && isSomeChar!(ElementType!S))
    || (!isRandomAccessRange!S && isBidirectionalRange!S && isSomeChar!(ElementType!S)))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    // The caller does not want the code unit count; collect it and drop it.
    size_t discardedCount;
    return decodeBack!useReplacementDchar(str, discardedCount);
}
1296 
// Gives the exclusive upper limit for code units of the given range type that
// the decode functions return directly: a code unit below this value is
// treated as a complete code point, anything else is handed to decodeImpl.
package template codeUnitLimit(S)
if (isSomeChar!(ElementEncodingType!S))
{
    alias CodeUnit = Unqual!(ElementEncodingType!S);

    static if (is(CodeUnit == char))
    {
        enum char codeUnitLimit = 0x80;
    }
    else static if (is(CodeUnit == wchar))
    {
        enum wchar codeUnitLimit = 0xD800;
    }
    else
    {
        enum dchar codeUnitLimit = 0xD800;
    }
}
1308 
/*
 * For strings, this function does its own bounds checking to give a
 * more useful error message when attempting to decode past the end of a string.
 * Subsequently it uses a pointer instead of an array to avoid
 * redundant bounds checking.
 *
 * The three overloads of this operate on chars, wchars, and dchars.
 *
 * This overload decodes UTF-8. The first code unit must already be known to
 * be a non-ASCII one (the callers handle code units below 0x80 themselves).
 *
 * Params:
 *      canIndex = if S is indexable
 *      useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
 *      str = input string or Range
 *      index = starting index into str; incremented by number of code units processed
 *
 * Returns:
 *      decoded character
 *
 * Throws:
 *      UTFException if the sequence is invalid or truncated and
 *      useReplacementDchar is No.useReplacementDchar. index is never modified
 *      before an exception is thrown. Ranges that cannot be indexed have
 *      already had the inspected code units popped at that point.
 */
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    auto ref S str, ref size_t index)
if (
    is(S : const char[]) || (isInputRange!S && is(Unqual!(ElementEncodingType!S) == char)))
{
    /* The following encodings are valid, except for the 5 and 6 byte
     * combinations:
     *  0xxxxxxx
     *  110xxxxx 10xxxxxx
     *  1110xxxx 10xxxxxx 10xxxxxx
     *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     */

    /* Dchar bitmask for different numbers of UTF-8 code units.
     */
    alias bitMask = AliasSeq!((1 << 7) - 1, (1 << 11) - 1, (1 << 16) - 1, (1 << 21) - 1);

    static if (is(S : const char[]))
        auto pstr = str.ptr + index;    // this is what makes decodeImpl() @system code
    else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
        auto pstr = str[index .. str.length];
    else
        alias pstr = str;

    //@@@BUG@@@ 14447 forces this to be done outside of decodeImpl
    //enum canIndex = is(S : const char[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);

    static if (canIndex)
    {
        // Number of code units available from index onwards.
        immutable length = str.length - index;
        ubyte fst = pstr[0];
    }
    else
    {
        ubyte fst = pstr.front;
        pstr.popFront();
    }

    static if (!useReplacementDchar)
    {
        static if (canIndex)
        {
            // Builds an exception that carries the lead code unit and up to
            // three following continuation code units.
            static UTFException exception(S)(S str, string msg)
            {
                uint[4] sequence = void;
                size_t i;

                do
                {
                    sequence[i] = str[i];
                } while (++i < str.length && i < 4 && (str[i] & 0xC0) == 0x80);

                return new UTFException(msg, i).setSequence(sequence[0 .. i]);
            }
        }

        UTFException invalidUTF()
        {
            static if (canIndex)
               return exception(pstr[0 .. length], "Invalid UTF-8 sequence");
            else
            {
                //We can't include the invalid sequence with input strings without
                //saving each of the code units along the way, and we can't do it with
                //forward ranges without saving the entire range. Both would incur a
                //cost for the decoding of every character just to provide a better
                //error message for the (hopefully) rare case when an invalid UTF-8
                //sequence is encountered, so we don't bother trying to include the
                //invalid sequence here, unlike with strings and sliceable ranges.
               return new UTFException("Invalid UTF-8 sequence");
            }
        }

        UTFException outOfBounds()
        {
            static if (canIndex)
               return exception(pstr[0 .. length], "Attempted to decode past the end of a string");
            else
               return new UTFException("Attempted to decode past the end of a string");
        }
    }

    if ((fst & 0b1100_0000) != 0b1100_0000)
    {
        static if (useReplacementDchar)
        {
            ++index;            // always consume bad input to avoid infinite loops
            return replacementDchar;
        }
        else
            throw invalidUTF(); // starter must have at least 2 first bits set
    }
    ubyte tmp = void;
    dchar d = fst; // upper control bits are masked out later
    fst <<= 1;     // each shift moves the next length bit of the lead byte into bit 7

    // i is the zero-based position of the continuation byte being read.
    foreach (i; AliasSeq!(1, 2, 3))
    {

        // The sequence announced more bytes than the input still holds.
        static if (canIndex)
        {
            if (i == length)
            {
                static if (useReplacementDchar)
                {
                    index += i;
                    return replacementDchar;
                }
                else
                    throw outOfBounds();
            }
        }
        else
        {
            if (pstr.empty)
            {
                static if (useReplacementDchar)
                {
                    index += i;
                    return replacementDchar;
                }
                else
                    throw outOfBounds();
            }
        }

        static if (canIndex)
            tmp = pstr[i];
        else
        {
            tmp = pstr.front;
            pstr.popFront();
        }

        // Continuation bytes must look like 10xxxxxx.
        if ((tmp & 0xC0) != 0x80)
        {
            static if (useReplacementDchar)
            {
                index += i + 1;
                return replacementDchar;
            }
            else
                throw invalidUTF();
        }

        d = (d << 6) | (tmp & 0x3F);
        fst <<= 1;

        if (!(fst & 0x80)) // no more bytes
        {
            d &= bitMask[i]; // mask out control bits

            // overlong, could have been encoded with i bytes
            if ((d & ~bitMask[i - 1]) == 0)
            {
                static if (useReplacementDchar)
                {
                    index += i + 1;
                    return replacementDchar;
                }
                else
                    throw invalidUTF();
            }

            // check for surrogates only needed for 3 bytes
            static if (i == 2)
            {
                if (!isValidDchar(d))
                {
                    static if (useReplacementDchar)
                    {
                        index += i + 1;
                        return replacementDchar;
                    }
                    else
                        throw invalidUTF();
                }
            }

            // values beyond dchar.max can only be produced by 4 bytes
            static if (i == 3)
            {
                if (d > dchar.max)
                {
                    static if (useReplacementDchar)
                        d = replacementDchar;
                    else
                        throw invalidUTF();
                }
            }

            // Advance only after every check has passed, so that index is
            // left untouched whenever an exception is thrown.
            index += i + 1;
            return d;
        }
    }

    // The lead byte announced a 5 or 6 byte sequence, which is never valid.
    static if (useReplacementDchar)
    {
        index += 4;             // read 4 chars by now
        return replacementDchar;
    }
    else
        throw invalidUTF();
}
1530 
@safe pure @nogc nothrow
unittest
{
    // Tests for the useReplacementDchar == yes path

    // Minimal input range over a string: no indexing, slicing or length.
    static struct R
    {
        size_t pos;
        string data;

      @safe pure @nogc nothrow:
        this(string data) { this.data = data; }
        @property bool empty() { return pos == data.length; }
        @property char front() { return data[pos]; }
        void popFront() { ++pos; }
    }

    foreach (bad; invalidUTFstrings!char())
    {
        auto input = R(bad);
        size_t consumed;
        immutable dchar decoded = decodeImpl!(false, Yes.useReplacementDchar)(input, consumed);

        // Invalid input yields the replacement character and always consumes
        // at least one, but never more than all, of the code units.
        assert(decoded == replacementDchar);
        assert(consumed >= 1 && consumed <= bad.length);
    }
}
1556 
// UTF-16 overload of decodeImpl. The first code unit must already be known to
// be >= 0xD800 (the callers return smaller code units directly). Surrogate
// pairs are combined into one code point; with Yes.useReplacementDchar
// invalid input yields replacementDchar, otherwise a UTFException is thrown
// before index has been touched.
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)
(auto ref S str, ref size_t index)
if (is(S : const wchar[]) || (isInputRange!S && is(Unqual!(ElementEncodingType!S) == wchar)))
{
    static if (is(S : const wchar[]))
        auto units = str.ptr + index;
    else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
        auto units = str[index .. str.length];
    else
        alias units = str;

    //@@@BUG@@@ 14447 forces this to be done outside of decodeImpl
    //enum canIndex = is(S : const wchar[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);

    static if (canIndex)
    {
        // Number of code units available from index onwards.
        immutable remaining = str.length - index;
        uint lead = units[0];
    }
    else
    {
        uint lead = units.front;
        units.popFront();
    }

    static if (!useReplacementDchar)
    {
        // Attaches the offending lead code unit when it can still be read.
        UTFException makeError(string msg)
        {
            static if (canIndex)
                return new UTFException(msg).setSequence(units[0]);
            else
                return new UTFException(msg);
        }
    }

    // The < case must be taken care of before decodeImpl is called.
    assert(lead >= 0xD800);

    if (lead <= 0xDBFF)
    {
        // High surrogate: a low surrogate has to follow.
        static if (canIndex)
            immutable atEnd = remaining == 1;
        else
            immutable atEnd = units.empty;

        if (atEnd)
        {
            static if (useReplacementDchar)
            {
                ++index;
                return replacementDchar;
            }
            else
                throw makeError("surrogate UTF-16 high value past end of string");
        }

        static if (canIndex)
            immutable uint trail = units[1];
        else
        {
            immutable uint trail = units.front;
            units.popFront();
        }

        if (trail >= 0xDC00 && trail <= 0xDFFF)
        {
            // Combine the two halves into the supplementary code point.
            lead = ((lead - 0xD7C0) << 10) + (trail - 0xDC00);
        }
        else
        {
            static if (useReplacementDchar)
                lead = replacementDchar;
            else
                throw makeError("surrogate UTF-16 low value out of range");
        }

        // Both code units were consumed.
        index += 2;
        return cast(dchar) lead;
    }

    if (lead >= 0xDC00 && lead <= 0xDFFF)
    {
        // A low surrogate without a preceding high surrogate.
        static if (useReplacementDchar)
            lead = replacementDchar;
        else
            throw makeError("unpaired surrogate UTF-16 value");
    }
    ++index;

    // Note: u+FFFE and u+FFFF are specifically permitted by the
    // Unicode standard for application internal use (see isValidDchar)

    return cast(dchar) lead;
}
1647 
@safe pure @nogc nothrow
unittest
{
    // Tests for the useReplacementDchar == true path

    // Minimal input range over a wstring: no indexing, slicing or length.
    static struct R
    {
        size_t pos;
        wstring data;

      @safe pure @nogc nothrow:
        this(wstring data) { this.data = data; }
        @property bool empty() { return pos == data.length; }
        @property wchar front() { return data[pos]; }
        void popFront() { ++pos; }
    }

    foreach (bad; invalidUTFstrings!wchar())
    {
        auto input = R(bad);
        size_t consumed;
        immutable dchar decoded = decodeImpl!(false, Yes.useReplacementDchar)(input, consumed);

        // Invalid input yields the replacement character and always consumes
        // at least one, but never more than all, of the code units.
        assert(decoded == replacementDchar);
        assert(consumed >= 1 && consumed <= bad.length);
    }
}
1673 
// UTF-32 overload of decodeImpl. Every code unit is a code point, so decoding
// amounts to validating one value. Arrays and random access ranges are read
// at str[index]; other ranges are read at their front and popped afterwards.
// On a UTFException neither index nor the range has been modified.
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    auto ref S str, ref size_t index)
if (is(S : const dchar[]) || (isInputRange!S && is(Unqual!(ElementEncodingType!S) == dchar)))
{
    enum readByIndex = is(S : const dchar[]) || isRandomAccessRange!S;

    static if (is(S : const dchar[]))
        auto source = str.ptr;
    else
        alias source = str;

    static if (readByIndex)
        dchar decoded = source[index];
    else
        dchar decoded = source.front;

    if (!isValidDchar(decoded))
    {
        static if (useReplacementDchar)
            decoded = replacementDchar;
        else
            throw new UTFException("Invalid UTF-32 value").setSequence(decoded);
    }

    ++index;
    static if (!readByIndex)
        source.popFront();

    return decoded;
}
1711 
@safe pure @nogc nothrow
unittest
{
    // Tests for the useReplacementDchar == true path

    // Minimal input range over a dstring: no indexing, slicing or length.
    static struct R
    {
        size_t pos;
        dstring data;

      @safe pure @nogc nothrow:
        this(dstring data) { this.data = data; }
        @property bool empty() { return pos == data.length; }
        @property dchar front() { return data[pos]; }
        void popFront() { ++pos; }
    }

    foreach (bad; invalidUTFstrings!dchar())
    {
        auto input = R(bad);
        size_t consumed;
        immutable dchar decoded = decodeImpl!(false, Yes.useReplacementDchar)(input, consumed);

        // Invalid input yields the replacement character and always consumes
        // at least one, but never more than all, of the code units.
        assert(decoded == replacementDchar);
        assert(consumed >= 1 && consumed <= bad.length);
    }
}
1737 
1738 
// Unit test helper: for random access ranges, decodes `range` at `index` and
// checks the decoded character, the advanced index and, when the range has a
// length, that decode left that length untouched. Failures are reported as
// AssertErrors located at the caller's `line`. Ranges without random access
// are accepted but not checked.
version (unittest) private void testDecode(R)(R range,
                                             size_t index,
                                             dchar expectedChar,
                                             size_t expectedIndex,
                                             size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.string : format;

    static if (hasLength!R)
        immutable originalLength = range.length;

    static if (isRandomAccessRange!R)
    {
        immutable decoded = decode(range, index);

        enforce(decoded == expectedChar,
                new AssertError(format("decode: Wrong character: %s", decoded), __FILE__, line));
        enforce(index == expectedIndex,
                new AssertError(format("decode: Wrong index: %s", index), __FILE__, line));

        static if (hasLength!R)
        {
            enforce(range.length == originalLength,
                    new AssertError(format("decode: length changed: %s", range.length), __FILE__, line));
        }
    }
}
1767 
// Test helper: decodes the first code point of `range` with `decodeFront` and
// verifies the decoded character, the reported number of code units and (when
// the range has a length) that exactly that many code units were removed.
// Failures are raised as AssertError attributed to the caller's `line`.
version (unittest) private void testDecodeFront(R)(ref R range,
                                                  dchar expectedChar,
                                                  size_t expectedNumCodeUnits,
                                                  size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.string : format;

    static if (hasLength!R)
        immutable originalLength = range.length;

    size_t consumed;
    immutable decoded = decodeFront(range, consumed);

    if (decoded != expectedChar)
        throw new AssertError(format("decodeFront: Wrong character: %s", decoded), __FILE__, line);

    if (consumed != expectedNumCodeUnits)
        throw new AssertError(format("decodeFront: Wrong numCodeUnits: %s", consumed), __FILE__, line);

    static if (hasLength!R)
    {
        if (range.length != originalLength - consumed)
            throw new AssertError(format("decodeFront: wrong length: %s", range.length), __FILE__, line);
    }
}
1792 
// Test helper: decodes the last code point of `range` with `decodeBack` and
// verifies the decoded character, the reported number of code units and (when
// the range has a length) that exactly that many code units were removed.
// Failures are raised as AssertError attributed to the caller's `line`.
version (unittest) private void testDecodeBack(R)(ref R range,
                                                 dchar expectedChar,
                                                 size_t expectedNumCodeUnits,
                                                 size_t line = __LINE__)
{
    // Ranges that are not bidirectional are accepted and silently ignored so
    // that all `decode` functions can be unit tested together.
    static if (isBidirectionalRange!R)
    {
        import core.exception : AssertError;
        import std.string : format;

        static if (hasLength!R)
            immutable originalLength = range.length;

        size_t consumed;
        immutable decoded = decodeBack(range, consumed);

        if (decoded != expectedChar)
            throw new AssertError(format("decodeBack: Wrong character: %s", decoded), __FILE__, line);

        if (consumed != expectedNumCodeUnits)
            throw new AssertError(format("decodeBack: Wrong numCodeUnits: %s", consumed), __FILE__, line);

        static if (hasLength!R)
        {
            if (range.length != originalLength - consumed)
                throw new AssertError(format("decodeBack: wrong length: %s", range.length), __FILE__, line);
        }
    }
}
1823 
// Test helper: checks that the single code point held by `range` is decoded
// to `expectedChar` using `expectedIndex` code units by testDecode (from
// index 0), by testDecodeBack and by testDecodeFront.
version (unittest) private void testAllDecode(R)(R range,
                                                dchar expectedChar,
                                                size_t expectedIndex,
                                                size_t line = __LINE__)
{
    testDecode(range, 0, expectedChar, expectedIndex, line);

    static if (isBidirectionalRange!R)
    {
        // The back check runs on a saved copy; `range` itself is handed to
        // the front check below.
        auto backView = range.save;
        testDecodeBack(backView, expectedChar, expectedIndex, line);
    }

    testDecodeFront(range, expectedChar, expectedIndex, line);
}
1837 
// Test helper: verifies that decoding `range` fails with a UTFException.
//
// For random-access ranges `decode(range, index)` must throw, must leave
// `index` unchanged and (when the range has a length) must not change the
// range's length. When `index` is 0, `decodeFront` must throw as well.
// Failures are raised as AssertError attributed to the caller's `line`.
version (unittest) private void testBadDecode(R)(R range, size_t index, size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.string : format;

    immutable initialIndex = index;

    static if (hasLength!R)
        immutable lenBefore = range.length;

    static if (isRandomAccessRange!R)
    {
        assertThrown!UTFException(decode(range, index), null, __FILE__, line);
        enforce(index == initialIndex,
                new AssertError(format("decode: Wrong index: %s", index), __FILE__, line));
        static if (hasLength!R)
        {
            // The format string needs a "%s" for range.length; without it
            // format() itself throws (orphan argument) and hides the failure.
            enforce(range.length == lenBefore,
                    new AssertError(format("decode: length changed: %s", range.length), __FILE__, line));
        }
    }

    // decodeFront always starts at the first code unit, so it is only
    // comparable to decode when the caller asked for index 0.
    if (initialIndex == 0)
        assertThrown!UTFException(decodeFront(range, index), null, __FILE__, line);
}
1863 
// Test helper: verifies that `decodeBack(range)` fails with a UTFException
// and, when the range has a length, that the failed call did not change it.
// The check is only performed for random-access ranges; failures are raised
// as AssertError attributed to the caller's `line`.
version (unittest) private void testBadDecodeBack(R)(R range, size_t line = __LINE__)
{
    // This condition is to allow unit testing all `decode` functions together
    static if (!isBidirectionalRange!R)
        return;
    else
    {
        import core.exception : AssertError;
        import std.string : format;

        static if (hasLength!R)
            immutable lenBefore = range.length;

        static if (isRandomAccessRange!R)
        {
            assertThrown!UTFException(decodeBack(range), null, __FILE__, line);
            static if (hasLength!R)
            {
                // The format string needs a "%s" for range.length; without it
                // format() itself throws (orphan argument) and hides the failure.
                enforce(range.length == lenBefore,
                        new AssertError(format("decodeBack: length changed: %s", range.length), __FILE__, line));
            }
        }
    }
}
1888 
// Decoding of UTF-8 code units through decode, decodeFront and decodeBack.
// Every check is repeated for each source type produced by the AliasSeq
// below. assertCTFEable evaluates the whole body during compilation as well
// as at run time.
@system unittest
{
    import std.conv : to;
    import std.exception;

    assertCTFEable!(
    {
    foreach (S; AliasSeq!(to!string, InputCU!char, RandomCU!char,
                          (string s) => new RefBidirCU!char(s),
                          (string s) => new RefRandomCU!char(s)))
    {
        // NOTE(review): sHasLength is not referenced anywhere in this block.
        enum sHasLength = hasLength!(typeof(S("abcd")));

        // ASCII text: one code unit per code point. testDecodeFront advances
        // `range`, so the later calls see the following characters.
        {
            auto range = S("abcd");
            testDecode(range, 0, 'a', 1);
            testDecode(range, 1, 'b', 2);
            testDecodeFront(range, 'a', 1);
            testDecodeFront(range, 'b', 1);
            assert(decodeFront(range) == 'c');
            assert(decodeFront(range) == 'd');
        }

        // Text whose code points each take three UTF-8 code units.
        {
            auto range = S("ウェブサイト");
            testDecode(range, 0, 'ウ', 3);
            testDecode(range, 3, 'ェ', 6);
            testDecodeFront(range, 'ウ', 3);
            testDecodeFront(range, 'ェ', 3);
            assert(decodeFront(range) == 'ブ');
            assert(decodeFront(range) == 'サ');
        }

        // The same two inputs, consumed from the back.
        {
            auto range = S("abcd");
            testDecodeBack(range, 'd', 1);
            testDecodeBack(range, 'c', 1);
            testDecodeBack(range, 'b', 1);
            testDecodeBack(range, 'a', 1);
        }

        {
            auto range = S("ウェブサイト");
            testDecodeBack(range, 'ト', 3);
            testDecodeBack(range, 'イ', 3);
            testDecodeBack(range, 'サ', 3);
            testDecodeBack(range, 'ブ', 3);
        }

        // A two-unit and a three-unit sequence, through all three decoders.
        testAllDecode(S("\xC2\xA9"), '\u00A9', 2);
        testAllDecode(S("\xE2\x89\xA0"), '\u2260', 3);

        // Sequences that must be rejected: a truncated three-unit sequence,
        // followed by encodings of U+000A that use more code units (2 to 6)
        // than necessary. Index 1 starts decoding on a continuation unit.
        foreach (str; ["\xE2\x89", // too short
                       "\xC0\x8A",
                       "\xE0\x80\x8A",
                       "\xF0\x80\x80\x8A",
                       "\xF8\x80\x80\x80\x8A",
                       "\xFC\x80\x80\x80\x80\x8A"])
        {
            testBadDecode(S(str), 0);
            testBadDecode(S(str), 1);
            testBadDecodeBack(S(str));
        }

        // The three-unit encodings of 0xFFFE and 0xFFFF are expected to
        // decode successfully to those values.
        testAllDecode(S("\xEF\xBF\xBE"), cast(dchar) 0xFFFE, 3);
        testAllDecode(S("\xEF\xBF\xBF"), cast(dchar) 0xFFFF, 3);

        // Three-unit sequences whose payload lies in the surrogate range
        // 0xD800 .. 0xDFFF; decoding them must throw.
        foreach (str; ["\xED\xA0\x80",
                       "\xED\xAD\xBF",
                       "\xED\xAE\x80",
                       "\xED\xAF\xBF",
                       "\xED\xB0\x80",
                       "\xED\xBE\x80",
                       "\xED\xBF\xBF"])
        {
            testBadDecode(S(str), 0);
            testBadDecodeBack(S(str));
        }
    }
    });
}
1972 
// Decoding of UTF-16 code units through decode, decodeFront and decodeBack,
// repeated for each source type produced by the AliasSeq lists below.
// assertCTFEable evaluates the whole body during compilation as well as at
// run time.
@system unittest
{
    import std.conv : to;
    import std.exception;
    assertCTFEable!(
    {
    foreach (S; AliasSeq!(to!wstring, InputCU!wchar, RandomCU!wchar,
                          (wstring s) => new RefBidirCU!wchar(s),
                          (wstring s) => new RefRandomCU!wchar(s)))
    {
        // Single code units, and the surrogate pairs encoding the lowest
        // (0x10000) and highest (0x10FFFF) code points that need two units.
        testAllDecode(S([cast(wchar) 0x1111]), cast(dchar) 0x1111, 1);
        testAllDecode(S([cast(wchar) 0xD800, cast(wchar) 0xDC00]), cast(dchar) 0x10000, 2);
        testAllDecode(S([cast(wchar) 0xDBFF, cast(wchar) 0xDFFF]), cast(dchar) 0x10FFFF, 2);
        testAllDecode(S([cast(wchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
        testAllDecode(S([cast(wchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);

        // A surrogate with nothing after it, and a surrogate followed by a
        // unit outside the surrogate range, must be rejected.
        testBadDecode(S([ cast(wchar) 0xD801 ]), 0);
        testBadDecode(S([ cast(wchar) 0xD800, cast(wchar) 0x1200 ]), 0);

        // Same idea from the back: the input ends in an unpaired surrogate.
        testBadDecodeBack(S([ cast(wchar) 0xD801 ]));
        testBadDecodeBack(S([ cast(wchar) 0x0010, cast(wchar) 0xD800 ]));

        // Text whose code points each take one UTF-16 code unit.
        // testDecodeFront advances `range` between calls.
        {
            auto range = S("ウェブサイト");
            testDecode(range, 0, 'ウ', 1);
            testDecode(range, 1, 'ェ', 2);
            testDecodeFront(range, 'ウ', 1);
            testDecodeFront(range, 'ェ', 1);
            assert(decodeFront(range) == 'ブ');
            assert(decodeFront(range) == 'サ');
        }

        {
            auto range = S("ウェブサイト");
            testDecodeBack(range, 'ト', 1);
            testDecodeBack(range, 'イ', 1);
            testDecodeBack(range, 'サ', 1);
            testDecodeBack(range, 'ブ', 1);
        }
    }

    // A surrogate pair, a single unit and another surrogate pair, decoded by
    // index and then consumed from the back.
    foreach (S; AliasSeq!(to!wstring, RandomCU!wchar, (wstring s) => new RefRandomCU!wchar(s)))
    {
        auto str = S([cast(wchar) 0xD800, cast(wchar) 0xDC00,
                      cast(wchar) 0x1400,
                      cast(wchar) 0xDAA7, cast(wchar) 0xDDDE]);
        testDecode(str, 0, cast(dchar) 0x10000, 2);
        testDecode(str, 2, cast(dchar) 0x1400, 3);
        testDecode(str, 3, cast(dchar) 0xB9DDE, 5);
        testDecodeBack(str, cast(dchar) 0xB9DDE, 2);
        testDecodeBack(str, cast(dchar) 0x1400, 1);
        testDecodeBack(str, cast(dchar) 0x10000, 2);
    }
    });
}
2028 
// Decoding of UTF-32 code units through decode, decodeFront and decodeBack,
// repeated for each source type produced by the AliasSeq lists below.
// assertCTFEable evaluates the whole body during compilation as well as at
// run time.
@system unittest
{
    import std.conv : to;
    import std.exception;
    assertCTFEable!(
    {
    foreach (S; AliasSeq!(to!dstring, RandomCU!dchar, InputCU!dchar,
                          (dstring s) => new RefBidirCU!dchar(s),
                          (dstring s) => new RefRandomCU!dchar(s)))
    {
        // Values that must decode to themselves using one code unit.
        testAllDecode(S([cast(dchar) 0x1111]), cast(dchar) 0x1111, 1);
        testAllDecode(S([cast(dchar) 0x10000]), cast(dchar) 0x10000, 1);
        testAllDecode(S([cast(dchar) 0x10FFFF]), cast(dchar) 0x10FFFF, 1);
        testAllDecode(S([cast(dchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
        testAllDecode(S([cast(dchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);

        // Values in the surrogate range and above 0x10FFFF must be rejected.
        testBadDecode(S([cast(dchar) 0xD800]), 0);
        testBadDecode(S([cast(dchar) 0xDFFE]), 0);
        testBadDecode(S([cast(dchar) 0x110000]), 0);

        testBadDecodeBack(S([cast(dchar) 0xD800]));
        testBadDecodeBack(S([cast(dchar) 0xDFFE]));
        testBadDecodeBack(S([cast(dchar) 0x110000]));

        // Ordinary text; testDecodeFront advances `range` between calls.
        {
            auto range = S("ウェブサイト");
            testDecode(range, 0, 'ウ', 1);
            testDecode(range, 1, 'ェ', 2);
            testDecodeFront(range, 'ウ', 1);
            testDecodeFront(range, 'ェ', 1);
            assert(decodeFront(range) == 'ブ');
            assert(decodeFront(range) == 'サ');
        }

        {
            auto range = S("ウェブサイト");
            testDecodeBack(range, 'ト', 1);
            testDecodeBack(range, 'イ', 1);
            testDecodeBack(range, 'サ', 1);
            testDecodeBack(range, 'ブ', 1);
        }
    }

    // Three code points decoded by index and then consumed from the back.
    foreach (S; AliasSeq!(to!dstring, RandomCU!dchar, (dstring s) => new RefRandomCU!dchar(s)))
    {
        auto str = S([cast(dchar) 0x10000, cast(dchar) 0x1400, cast(dchar) 0xB9DDE]);
        testDecode(str, 0, 0x10000, 1);
        testDecode(str, 1, 0x1400, 2);
        testDecode(str, 2, 0xB9DDE, 3);
        testDecodeBack(str, cast(dchar) 0xB9DDE, 1);
        testDecodeBack(str, cast(dchar) 0x1400, 1);
        testDecodeBack(str, cast(dchar) 0x10000, 1);
    }
    });
}
2084 
// Compile-time checks on attribute inference for every built-in string type
// (all three character widths; mutable, const and immutable elements):
// decode and decodeFront must be usable from @safe code, and decode,
// decodeFront and decodeBack must be inferred as pure.
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    foreach (S; AliasSeq!( char[], const( char)[],  string,
                          wchar[], const(wchar)[], wstring,
                          dchar[], const(dchar)[], dstring))
    {
        // @safe-ty of decode and of both decodeFront forms.
        static assert(isSafe!({ S str; size_t i = 0; decode(str, i);      }));
        static assert(isSafe!({ S str; size_t i = 0; decodeFront(str, i); }));
        static assert(isSafe!({ S str; decodeFront(str); }));
        // Purity of decode, decodeFront and decodeBack (with and without the
        // code unit counter).
        static assert((functionAttributes!({ S str; size_t i = 0; decode(str, i); }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({
            S str; size_t i = 0; decodeFront(str, i);
        }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({ S str; decodeFront(str); }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({
            S str; size_t i = 0; decodeBack(str, i);
        }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({ S str; decodeBack(str); }) & FunctionAttribute.pure_) != 0);
    }
    });
}
2109 
@safe unittest
{
    import std.exception;

    // A four-unit sequence with every payload bit set. Its value (0x1FFFFF)
    // lies above 0x10FFFF, so decode has to reject it.
    char[4] units = [0b1111_0111, 0b1011_1111, 0b1011_1111, 0b1011_1111];
    size_t pos = 0;
    assertThrown!UTFException(decode(units[], pos));
}
2121 /* =================== Encode ======================= */
2122 
// Shared failure path of the encode overloads. Depending on the compile-time
// flag it either throws a UTFException carrying `msg` and the offending code
// point `c`, or returns replacementDchar for the caller to encode instead.
private dchar _utfException(UseReplacementDchar useReplacementDchar)(string msg, dchar c)
{
    static if (!useReplacementDchar)
        throw new UTFException(msg).setSequence(c);
    else
        return replacementDchar;
}
2130 
/++
    Writes the encoding of $(D c) into the fixed-size buffer $(D buf) and
    returns how many code units of $(D buf) were used: $(D 1) to $(D 4) for
    a $(D char[4]) buffer, $(D 1) or $(D 2) for a $(D wchar[2]) buffer and
    always $(D 1) for a $(D dchar[1]) buffer.

    Params:
        useReplacementDchar = with $(D Yes.useReplacementDchar) an invalid
            $(D c) is encoded as $(LREF replacementDchar) instead of raising
            an exception
        buf = the buffer that receives the code units
        c = the code point to encode

    Returns:
        The number of code units written to $(D buf).

    Throws:
        $(D UTFException) if $(D c) is not a valid UTF code point and
        $(D useReplacementDchar) is $(D No.useReplacementDchar).
  +/
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out char[4] buf, dchar c) @safe pure
{
    // One code unit: 0xxxxxxx.
    if (c <= 0x7F)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char) c;
        return 1;
    }

    // Two code units: 110xxxxx 10xxxxxx.
    if (c <= 0x7FF)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        return 2;
    }

    // Whatever cannot be encoded either throws here or is swapped for the
    // replacement character, so the code below only sees valid code points.
    if (0xD800 <= c && c <= 0xDFFF)
        c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);
    else if (0x10FFFF < c)
    {
        assert(!isValidDchar(c));
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
    }
    assert(isValidDchar(c));

    // Three code units: 1110xxxx 10xxxxxx 10xxxxxx.
    if (c <= 0xFFFF)
    {
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[2] = cast(char)(0x80 | (c & 0x3F));
        return 3;
    }

    // Four code units: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    buf[0] = cast(char)(0xF0 | (c >> 18));
    buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
    buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
    buf[3] = cast(char)(0x80 | (c & 0x3F));
    return 4;
}
2182 
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    char[4] units;

    // Lowest and highest code point of every sequence length, stepping over
    // the surrogate range between U+D7FF and U+E000.
    assert(encode(units, '\u0000') == 1);
    assert(units[0 .. 1] == "\u0000");
    assert(encode(units, '\u007F') == 1);
    assert(units[0 .. 1] == "\u007F");
    assert(encode(units, '\u0080') == 2);
    assert(units[0 .. 2] == "\u0080");
    assert(encode(units, '\u07FF') == 2);
    assert(units[0 .. 2] == "\u07FF");
    assert(encode(units, '\u0800') == 3);
    assert(units[0 .. 3] == "\u0800");
    assert(encode(units, '\uD7FF') == 3);
    assert(units[0 .. 3] == "\uD7FF");
    assert(encode(units, '\uE000') == 3);
    assert(units[0 .. 3] == "\uE000");
    assert(encode(units, 0xFFFE) == 3);
    assert(units[0 .. 3] == "\xEF\xBF\xBE");
    assert(encode(units, 0xFFFF) == 3);
    assert(units[0 .. 3] == "\xEF\xBF\xBF");
    assert(encode(units, '\U00010000') == 4);
    assert(units[0 .. 4] == "\U00010000");
    assert(encode(units, '\U0010FFFF') == 4);
    assert(units[0 .. 4] == "\U0010FFFF");

    // Surrogates and values above 0x10FFFF are refused by default.
    immutable dchar[] rejected = [cast(dchar) 0xD800, cast(dchar) 0xDBFF,
                                  cast(dchar) 0xDC00, cast(dchar) 0xDFFF,
                                  cast(dchar) 0x110000];
    foreach (bad; rejected)
        assertThrown!UTFException(encode(units, bad));

    // On request the replacement character is encoded instead.
    immutable written = encode!(Yes.useReplacementDchar)(units, cast(dchar) 0x110000);
    assert(written == units.stride);
    assert(units.front == replacementDchar);
    });
}
2212 
2213 
2214 /// Ditto
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out wchar[2] buf, dchar c) @safe pure
{
    // Whatever UTF-16 cannot represent either throws here or is swapped for
    // the replacement character before any code unit is written.
    if (0x10FFFF < c)
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
    else if (0xD800 <= c && c <= 0xDFFF)
        c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);
    assert(isValidDchar(c));

    // Values up to 0xFFFF fit into a single code unit.
    if (c <= 0xFFFF)
    {
        buf[0] = cast(wchar) c;
        return 1;
    }

    // Everything else becomes a surrogate pair: the upper ten bits of
    // (c - 0x10000) go into the first unit, the lower ten into the second.
    immutable offset = c - 0x10000;
    buf[0] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
    buf[1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
    return 2;
}
2239 
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    wchar[2] units;

    // Single-unit values on both sides of the surrogate range, then the
    // lowest and highest code points that need a surrogate pair.
    assert(encode(units, '\u0000') == 1);
    assert(units[0 .. 1] == "\u0000");
    assert(encode(units, '\uD7FF') == 1);
    assert(units[0 .. 1] == "\uD7FF");
    assert(encode(units, '\uE000') == 1);
    assert(units[0 .. 1] == "\uE000");
    assert(encode(units, 0xFFFE) == 1);
    assert(units[0] == 0xFFFE);
    assert(encode(units, 0xFFFF) == 1);
    assert(units[0] == 0xFFFF);
    assert(encode(units, '\U00010000') == 2);
    assert(units[0 .. 2] == "\U00010000");
    assert(encode(units, '\U0010FFFF') == 2);
    assert(units[0 .. 2] == "\U0010FFFF");

    // Surrogates and values above 0x10FFFF are refused by default.
    immutable dchar[] rejected = [cast(dchar) 0xD800, cast(dchar) 0xDBFF,
                                  cast(dchar) 0xDC00, cast(dchar) 0xDFFF,
                                  cast(dchar) 0x110000];
    foreach (bad; rejected)
        assertThrown!UTFException(encode(units, bad));

    // On request the replacement character is encoded instead.
    immutable written = encode!(Yes.useReplacementDchar)(units, cast(dchar) 0x110000);
    assert(written == units.stride);
    assert(units.front == replacementDchar);
    });
}
2265 
2266 
2267 /// Ditto
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out dchar[1] buf, dchar c) @safe pure
{
    // UTF-32 stores the code point as is; the only work is rejecting (or
    // replacing) surrogates and values beyond 0x10FFFF.
    if ((c < 0xD800 || 0xDFFF < c) && c <= 0x10FFFF)
        assert(isValidDchar(c));
    else
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);

    buf[0] = c;
    return 1;
}
2278 
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    dchar[1] unit;

    // Accepted values are stored unchanged.
    immutable dchar[] accepted = ['\u0000', '\uD7FF', '\uE000',
                                  cast(dchar) 0xFFFE, cast(dchar) 0xFFFF,
                                  '\U0010FFFF'];
    foreach (good; accepted)
    {
        encode(unit, good);
        assert(unit[0] == good);
    }

    // Surrogates and values above 0x10FFFF are refused by default.
    immutable dchar[] rejected = [cast(dchar) 0xD800, cast(dchar) 0xDBFF,
                                  cast(dchar) 0xDC00, cast(dchar) 0xDFFF,
                                  cast(dchar) 0x110000];
    foreach (bad; rejected)
        assertThrown!UTFException(encode(unit, bad));

    // On request the replacement character is stored instead.
    immutable written = encode!(Yes.useReplacementDchar)(unit, cast(dchar) 0x110000);
    assert(written == unit.stride);
    assert(unit.front == replacementDchar);
    });
}
2303 
2304 
/++
    Appends the encoding of $(D c) to $(D str), using the encoding that
    belongs to the element type of $(D str).

    Params:
        useReplacementDchar = with $(D Yes.useReplacementDchar) an invalid
            $(D c) is encoded as $(LREF replacementDchar) instead of raising
            an exception
        str = the array that the encoded code units are appended to
        c = the code point to encode

    Throws:
        $(D UTFException) if $(D c) is not a valid UTF code point and
        $(D useReplacementDchar) is $(D No.useReplacementDchar).
  +/
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref char[] str, dchar c) @safe pure
{
    // One code unit: append it directly.
    if (c <= 0x7F)
    {
        assert(isValidDchar(c));
        str ~= cast(char) c;
        return;
    }

    // Longer sequences are assembled in a scratch buffer first.
    char[4] units;
    size_t count;

    if (c <= 0x7FF)
    {
        assert(isValidDchar(c));
        units[0] = cast(char)(0xC0 | (c >> 6));
        units[1] = cast(char)(0x80 | (c & 0x3F));
        count = 2;
    }
    else
    {
        // Whatever cannot be encoded either throws here or is swapped for
        // the replacement character.
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);
        else if (0x10FFFF < c)
        {
            assert(!isValidDchar(c));
            c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
        }
        assert(isValidDchar(c));

        if (c <= 0xFFFF)
        {
            units[0] = cast(char)(0xE0 | (c >> 12));
            units[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            units[2] = cast(char)(0x80 | (c & 0x3F));
            count = 3;
        }
        else
        {
            units[0] = cast(char)(0xF0 | (c >> 18));
            units[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            units[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            units[3] = cast(char)(0x80 | (c & 0x3F));
            count = 4;
        }
    }

    str ~= units[0 .. count];
}
2364 
// Appending to an existing char[]: the array grows by one, two and three
// code units for the three code points encoded below.
@safe unittest
{
    import std.exception;

    assertCTFEable!(
    {
    char[] s = "abcd".dup;
    // One code unit.
    encode(s, cast(dchar)'a');
    assert(s.length == 5);
    assert(s == "abcda");

    // Two code units.
    encode(s, cast(dchar)'\u00A9');
    assert(s.length == 7);
    assert(s == "abcda\xC2\xA9");
    //assert(s == "abcda\u00A9");   // BUG: fix compiler

    // Three code units.
    encode(s, cast(dchar)'\u2260');
    assert(s.length == 10);
    assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
    });
}
2386 
// Appending to a char[] that starts out empty. Every assertion slices from
// the offset at which the code point just encoded begins, so the offsets
// depend on the order of the encode calls.
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    char[] buf;

    encode(buf, '\u0000'); assert(buf[0 .. $] == "\u0000");
    encode(buf, '\u007F'); assert(buf[1 .. $] == "\u007F");
    encode(buf, '\u0080'); assert(buf[2 .. $] == "\u0080");
    encode(buf, '\u07FF'); assert(buf[4 .. $] == "\u07FF");
    encode(buf, '\u0800'); assert(buf[6 .. $] == "\u0800");
    encode(buf, '\uD7FF'); assert(buf[9 .. $] == "\uD7FF");
    encode(buf, '\uE000'); assert(buf[12 .. $] == "\uE000");
    encode(buf, 0xFFFE); assert(buf[15 .. $] == "\xEF\xBF\xBE");
    encode(buf, 0xFFFF); assert(buf[18 .. $] == "\xEF\xBF\xBF");
    encode(buf, '\U00010000'); assert(buf[21 .. $] == "\U00010000");
    encode(buf, '\U0010FFFF'); assert(buf[25 .. $] == "\U0010FFFF");

    // Surrogates and values above 0x10FFFF are refused by default.
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    // The replacement character only shows up at the end of buf once it is
    // requested explicitly.
    assert(buf.back != replacementDchar);
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(buf.back == replacementDchar);
    });
}
2417 
2418 /// ditto
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref wchar[] str, dchar c) @safe pure
{
    // Whatever UTF-16 cannot represent either throws here or is swapped for
    // the replacement character before anything is appended.
    if (0xD800 <= c && c <= 0xDFFF)
        c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);
    else if (0x10FFFF < c)
    {
        assert(!isValidDchar(c));
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
    }
    assert(isValidDchar(c));

    // Values up to 0xFFFF fit into a single code unit.
    if (c <= 0xFFFF)
    {
        str ~= cast(wchar) c;
        return;
    }

    // Everything else becomes a surrogate pair: the upper ten bits of
    // (c - 0x10000) go into the first unit, the lower ten into the second.
    immutable offset = c - 0x10000;
    wchar[2] pair;
    pair[0] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
    pair[1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
    str ~= pair;
}
2451 
// Appending to a wchar[] that starts out empty. The indices and slice
// offsets in the assertions depend on the order of the encode calls: the
// first five code points take one code unit each, the last two take two.
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    wchar[] buf;

    encode(buf, '\u0000'); assert(buf[0] == '\u0000');
    encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF');
    encode(buf, '\uE000'); assert(buf[2] == '\uE000');
    encode(buf, 0xFFFE); assert(buf[3] == 0xFFFE);
    encode(buf, 0xFFFF); assert(buf[4] == 0xFFFF);
    encode(buf, '\U00010000'); assert(buf[5 .. $] == "\U00010000");
    encode(buf, '\U0010FFFF'); assert(buf[7 .. $] == "\U0010FFFF");

    // Surrogates and values above 0x10FFFF are refused by default.
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    // The replacement character only shows up at the end of buf once it is
    // requested explicitly.
    assert(buf.back != replacementDchar);
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(buf.back == replacementDchar);
    });
}
2478 
2479 /// ditto
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref dchar[] str, dchar c) @safe pure
{
    // UTF-32 stores the code point as is; the only work is rejecting (or
    // replacing) surrogates and values beyond 0x10FFFF.
    if ((c < 0xD800 || 0xDFFF < c) && c <= 0x10FFFF)
        assert(isValidDchar(c));
    else
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);

    str ~= c;
}
2489 
// Appending to a dchar[] that starts out empty: every accepted code point
// adds exactly one element, so the n-th encode call is checked at index n.
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    dchar[] buf;

    encode(buf, '\u0000'); assert(buf[0] == '\u0000');
    encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF');
    encode(buf, '\uE000'); assert(buf[2] == '\uE000');
    encode(buf, 0xFFFE ); assert(buf[3] == 0xFFFE);
    encode(buf, 0xFFFF ); assert(buf[4] == 0xFFFF);
    encode(buf, '\U0010FFFF'); assert(buf[5] == '\U0010FFFF');

    // Surrogates and values above 0x10FFFF are refused by default.
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    // The replacement character only shows up at the end of buf once it is
    // requested explicitly.
    assert(buf.back != replacementDchar);
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(buf.back == replacementDchar);
    });
}
2515 
2516 
/++
    Computes how many code units of type $(D C) the code point $(D c)
    occupies once it is encoded.

    Params:
        C = the character type of the target encoding
        c = the code point to measure

    Returns:
        $(D 1) to $(D 4) when $(D C) is a $(D char) type, $(D 1) or $(D 2)
        when it is a $(D wchar) type and $(D 1) when it is a $(D dchar) type.
        For $(D char), a value above $(D 0x10FFFF) is a programming error and
        ends in $(D assert(false)).
  +/
ubyte codeLength(C)(dchar c) @safe pure nothrow @nogc
if (isSomeChar!C)
{
    static if (C.sizeof == 4)
    {
        // UTF-32: every code point is a single code unit.
        return 1;
    }
    else static if (C.sizeof == 2)
    {
        // UTF-16: anything above 0xFFFF needs a surrogate pair.
        return c > 0xFFFF ? 2 : 1;
    }
    else
    {
        static assert(C.sizeof == 1);

        // UTF-8: walk the length boundaries from the top down.
        if (c > 0x10FFFF)
            assert(false);
        if (c > 0xFFFF)
            return 4;
        if (c > 0x7FF)
            return 3;
        if (c > 0x7F)
            return 2;
        return 1;
    }
}
2542 
///
@safe pure nothrow @nogc unittest
{
    // An ASCII character is one code unit in every encoding.
    assert(codeLength!char('a') == 1);
    assert(codeLength!wchar('a') == 1);
    assert(codeLength!dchar('a') == 1);

    // The highest code point needs four UTF-8 code units, two UTF-16 code
    // units and one UTF-32 code unit.
    assert(codeLength!char('\U0010FFFF') == 4);
    assert(codeLength!wchar('\U0010FFFF') == 2);
    assert(codeLength!dchar('\U0010FFFF') == 1);
}
2554 
2555 
/++
    Returns the number of code units that are required to encode $(D input)
    in a string whose character type is $(D C). This is particularly useful
    when one string is sliced with the length of another and the two strings
    use different character types.

    Params:
        C = the character type to get the encoding length for
        input = the input range to calculate the encoding length from
    Returns:
        The number of code units in `input` when encoded to `C`
  +/
size_t codeLength(C, InputRange)(InputRange input)
if (isInputRange!InputRange && !isInfinite!InputRange && is(ElementType!InputRange : dchar))
{
    enum alreadyEncodedAsC = is(Unqual!(ElementEncodingType!InputRange) == C);

    static if (!(isSomeString!InputRange && alreadyEncodedAsC && is(typeof(input.length))))
    {
        // Add up the per-code-point lengths in the target encoding.
        size_t units = 0;

        foreach (dchar codePoint; input)
            units += codeLength!C(codePoint);

        return units;
    }
    else
    {
        // A string that already uses C as its code unit type: its length is
        // the answer.
        return input.length;
    }
}
2584 
///
@safe unittest
{
    import std.conv : to;

    // ASCII text has the same length in code units as in code points.
    enum ascii = "hello world";
    assert(codeLength!char(ascii) == to!string(ascii).length);
    assert(codeLength!wchar(ascii) == to!wstring(ascii).length);
    assert(codeLength!dchar(ascii) == to!dstring(ascii).length);

    // Non-ASCII text: the result matches the length after conversion.
    enum kana = `プログラミング`;
    assert(codeLength!char(kana) == to!string(kana).length);
    assert(codeLength!wchar(kana) == to!wstring(kana).length);
    assert(codeLength!dchar(kana) == to!dstring(kana).length);

    // Slice a UTF-8 string using the length of a UTF-16 prefix.
    string haystack = `Être sans la verité, ça, ce ne serait pas bien.`;
    wstring needle = `Être sans la verité`;
    immutable prefixLength = codeLength!char(needle);
    assert(haystack[prefixLength .. $] == `, ça, ce ne serait pas bien.`);
}
2608 
// codeLength over every string type / target encoding pair, at run time and
// in CTFE, plus one non-string input range.
@safe unittest
{
    import std.algorithm.iteration : filter;
    import std.conv : to;
    import std.exception;

    assertCTFEable!(
    {
    foreach (S; AliasSeq!( char[], const  char[],  string,
                          wchar[], const wchar[], wstring,
                          dchar[], const dchar[], dstring))
    {
        foreach (C; AliasSeq!(char, wchar, dchar))
        {
            // Reference lengths obtained by actually converting to C[].
            immutable asciiLen = to!(C[])("Walter Bright").length;
            immutable kanjiLen = to!(C[])(`言語`).length;
            immutable mixedLen = to!(C[])(`ウェブサイト@La_Verité.com`).length;

            assert(codeLength!C(to!S("Walter Bright")) == asciiLen);
            assert(codeLength!C(to!S(`言語`)) == kanjiLen);
            assert(codeLength!C(to!S(`ウェブサイト@La_Verité.com`)) == mixedLen);

            // Same text, but wrapped so that it is no longer a string type.
            auto wrapped = to!S(`ウェブサイト@La_Verité.com`).filter!(x => true)();
            assert(codeLength!C(wrapped) == mixedLen);
        }
    }
    });
}
2633 
/+
Internal helper function:

Returns true if it is safe to search for the code point $(D c) inside
code units of type $(D C), without decoding.

This is a runtime check that is used as an optimization in various functions,
particularly in $(D std.string).
  +/
package bool canSearchInCodeUnits(C)(dchar c)
if (isSomeChar!C)
{
    static if (C.sizeof == 4)
    {
        return true;
    }
    else static if (C.sizeof == 2)
    {
        // Accept 0x0000-0xFFFF except the 0xD800-0xDFFF gap.
        immutable belowGap = c <= 0xD7FF;
        immutable aboveGap = 0xE000 <= c && c <= 0xFFFF;
        return belowGap || aboveGap;
    }
    else static if (C.sizeof == 1)
    {
        return c <= 0x7F;
    }
    else
    {
        static assert(0);
    }
}
// Checks canSearchInCodeUnits for ASCII, Latin-1, other BMP, surrogate and
// supplementary-plane code points against all three code unit types.
@safe unittest
{
    assert( canSearchInCodeUnits! char('a'));
    assert( canSearchInCodeUnits!wchar('a'));
    assert( canSearchInCodeUnits!dchar('a'));
    assert(!canSearchInCodeUnits! char('ö')); //Important test: ö <= 0xFF
    assert(!canSearchInCodeUnits! char(cast(char)'ö')); //Important test: ö <= 0xFF
    assert( canSearchInCodeUnits!wchar('ö'));
    assert( canSearchInCodeUnits!dchar('ö'));
    // A BMP code point outside Latin-1, written as an escape so the test
    // does not depend on the encoding of the source file.
    assert(!canSearchInCodeUnits! char('\u65E5'));
    assert( canSearchInCodeUnits!wchar('\u65E5'));
    assert( canSearchInCodeUnits!dchar('\u65E5'));
    assert(!canSearchInCodeUnits!wchar(cast(wchar) 0xDA00));
    assert( canSearchInCodeUnits!dchar(cast(dchar) 0xDA00));
    assert(!canSearchInCodeUnits! char('\U00010001'));
    assert(!canSearchInCodeUnits!wchar('\U00010001'));
    assert( canSearchInCodeUnits!dchar('\U00010001'));
}
2673 
2674 /* =================== Validation ======================= */
2675 
/++
    Checks whether $(D str) is well-formed unicode by decoding it from
    beginning to end.

    Params:
        str = the string to check

    Throws:
        $(D UTFException) if $(D str) is not well-formed.
  +/
void validate(S)(in S str) @safe pure
if (isSomeString!S)
{
    size_t pos = 0;
    immutable end = str.length;

    // decode takes pos by reference and moves it forward on every call;
    // only its ability to throw matters here.
    while (pos < end)
        decode(str, pos);
}
2691 
2692 
@safe unittest // bugzilla 12923
{
    import std.exception;

    // Validating these three bytes must throw.
    static void validateBadBytes() @safe
    {
        char[3] a = [167, 133, 175];
        validate(a[]);
    }

    assertThrown(validateBadBytes());
}
2701 
/**
 * Converts the characters of `s` to UTF-8 and returns them as a newly
 * allocated `string`.
 *
 * Params:
 *     s = the range of characters to encode
 * Returns:
 *     A UTF-8 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
string toUTF8(S)(S s)
if (isInputRange!S && !isInfinite!S && isSomeChar!(ElementEncodingType!S))
{
    // All three toUTF* front ends share one implementation that is
    // parameterized on the result type.
    string encoded = toUTFImpl!string(s);
    return encoded;
}
2718 
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // The ø is represented by two UTF-8 code units
    assert("Hellø"w.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));

    // U+10437 is four code units in UTF-8
    assert("\U00010437"d.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7]));
}
2730 
// toUTF8 must also work on a reference-type input range of dchar.
@system pure unittest
{
    import std.algorithm.comparison : equal;
    import std.internal.test.dummyrange : ReferenceInputRange;

    auto r1 = new ReferenceInputRange!dchar("Hellø");
    // U+10437, written as an escape so the test does not depend on the
    // encoding of the source file.
    auto r2 = new ReferenceInputRange!dchar("\U00010437");

    assert(r1.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));
    assert(r2.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7]));
}
2742 
/**
 * Converts the characters of `s` to UTF-16 and returns them as a newly GC
 * allocated `wstring`.
 *
 * Params:
 *     s = the range of characters to encode
 * Returns:
 *     A UTF-16 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
wstring toUTF16(S)(S s)
if (isInputRange!S && !isInfinite!S && isSomeChar!(ElementEncodingType!S))
{
    // All three toUTF* front ends share one implementation that is
    // parameterized on the result type.
    wstring encoded = toUTFImpl!wstring(s);
    return encoded;
}
2759 
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // U+24B62 and U+10437 are two code units in UTF-16 and one in UTF-32
    assert("\U00024B62"d.length == 1);
    assert("\U00010437"d.length == 1);

    assert("\U00024B62"d.toUTF16.equal([0xD852, 0xDF62]));
    assert("\U00010437"d.toUTF16.equal([0xD801, 0xDC37]));
}
2772 
// toUTF16 must also work on a reference-type input range of dchar.
@system pure unittest
{
    import std.algorithm.comparison : equal;
    import std.internal.test.dummyrange : ReferenceInputRange;

    // Both code points lie above 0xFFFF; they are written as escapes so the
    // test does not depend on the encoding of the source file.
    auto r1 = new ReferenceInputRange!dchar("\U00024B62");
    auto r2 = new ReferenceInputRange!dchar("\U00010437");

    assert(r1.toUTF16.equal([0xD852, 0xDF62]));
    assert(r2.toUTF16.equal([0xD801, 0xDC37]));
}
2784 
2785 
/**
 * Converts the characters of `s` to UTF-32 and returns them as a newly GC
 * allocated `dstring`.
 *
 * Params:
 *     s = the range of characters to encode
 * Returns:
 *     A UTF-32 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
dstring toUTF32(S)(S s)
if (isInputRange!S && !isInfinite!S && isSomeChar!(ElementEncodingType!S))
{
    // All three toUTF* front ends share one implementation that is
    // parameterized on the result type.
    dstring encoded = toUTFImpl!dstring(s);
    return encoded;
}
2802 
// Shared implementation of toUTF8/toUTF16/toUTF32.
// T is the string type to produce, S the type of the incoming range.
private T toUTFImpl(T, S)(S s)
{
    static if (!is(S : T))
    {
        import std.array : appender;

        alias TargetUnit = Unqual!(ElementEncodingType!T);
        auto sink = appender!T();

        // The source length is used as the initial capacity whenever it is
        // available.
        static if (isSomeString!S || hasLength!S)
            sink.reserve(s.length);

        // byUTF transcodes lazily; collect its code units one by one.
        foreach (unit; s.byUTF!TargetUnit)
            sink.put(unit);

        return sink.data;
    }
    else
    {
        // Already convertible to the requested string type: hand back an
        // immutable copy.
        return s.idup;
    }
}
2823 
2824 /* =================== toUTFz ======================= */
2825 
2826 /++
2827     Returns a C-style zero-terminated string equivalent to $(D str). $(D str)
2828     must not contain embedded $(D '\0')'s as any C function will treat the first
2829     $(D '\0') that it sees as the end of the string. If $(D str.empty) is
2830     $(D true), then a string containing only $(D '\0') is returned.
2831 
2832     $(D toUTFz) accepts any type of string and is templated on the type of
2833     character pointer that you wish to convert to. It will avoid allocating a
2834     new string if it can, but there's a decent chance that it will end up having
2835     to allocate a new string - particularly when dealing with character types
2836     other than $(D char).
2837 
2838     $(RED Warning 1:) If the result of $(D toUTFz) equals $(D str.ptr), then if
2839     anything alters the character one past the end of $(D str) (which is the
2840     $(D '\0') character terminating the string), then the string won't be
2841     zero-terminated anymore. The most likely scenarios for that are if you
2842     append to $(D str) and no reallocation takes place or when $(D str) is a
2843     slice of a larger array, and you alter the character in the larger array
2844     which is one character past the end of $(D str). Another case where it could
2845     occur would be if you had a mutable character array immediately after
2846     $(D str) in memory (for example, if they're member variables in a
2847     user-defined type with one declared right after the other) and that
2848     character array happened to start with $(D '\0'). Such scenarios will never
2849     occur if you immediately use the zero-terminated string after calling
2850     $(D toUTFz) and the C function using it doesn't keep a reference to it.
2851     Also, they are unlikely to occur even if you save the zero-terminated string
2852     (the cases above would be among the few examples of where it could happen).
    However, if you save the zero-terminated string and want to be absolutely
2854     certain that the string stays zero-terminated, then simply append a
2855     $(D '\0') to the string and use its $(D ptr) property rather than calling
2856     $(D toUTFz).
2857 
2858     $(RED Warning 2:) When passing a character pointer to a C function, and the
2859     C function keeps it around for any reason, make sure that you keep a
2860     reference to it in your D code. Otherwise, it may go away during a garbage
2861     collection cycle and cause a nasty bug when the C code tries to use it.
2862   +/
template toUTFz(P)
{
    // The pointer type P is supplied explicitly by the caller, while the
    // string type S is deduced from the argument.
    P toUTFz(S)(S str) @safe pure
    {
        P terminated = toUTFzImpl!(P, S)(str);
        return terminated;
    }
}
2870 
///
@safe pure unittest
{
    // Same character type, every constness of the target pointer.
    char* p1 = toUTFz!(char*)("hello world");
    const(char)* p2 = toUTFz!(const(char)*)("hello world");
    immutable(char)* p3 = toUTFz!(immutable(char)*)("hello world");

    // The target character type may differ from the source one.
    char* p4 = toUTFz!(char*)("hello world"d);
    const(wchar)* p5 = toUTFz!(const(wchar)*)("hello world");
    immutable(dchar)* p6 = toUTFz!(immutable(dchar)*)("hello world"w);
}
2881 
private P toUTFzImpl(P, S)(S str) @safe pure
if (isSomeString!S && isPointer!P && isSomeChar!(typeof(*P.init)) &&
    is(Unqual!(typeof(*P.init)) == Unqual!(ElementEncodingType!S)) &&
    is(immutable(Unqual!(ElementEncodingType!S)) == ElementEncodingType!S))
//immutable(C)[] -> C*, const(C)*, or immutable(C)*
{
    alias OutChar = typeof(*P.init);
    alias C = Unqual!(ElementEncodingType!S);

    // Empty input: return a pointer to a fresh one-element array holding
    // only the terminator.
    if (str.empty)
    {
        OutChar[] terminator = ['\0'];
        return (() @trusted => terminator.ptr)();
    }

    // A mutable target can never alias the immutable input, so it always
    // takes the copying route at the bottom.
    enum mustCopy = is(Unqual!OutChar == OutChar);

    static if (!mustCopy)
    {
        if (!__ctfe)
        {
            immutable onePastEnd = (() @trusted => str.ptr + str.length)();

            // Peek past end of str, if it's 0, no conversion necessary.
            // Note that the compiler will put a 0 past the end of static
            // strings, and the storage allocator will put a 0 past the end
            // of newly allocated char[]'s.
            // Is the pointer dereferenceable? A simple test: if it points to
            // an address multiple of 4, then conservatively assume it might
            // be pointing to a new block of memory, which might be
            // unreadable. Otherwise, it's definitely pointing to valid
            // memory.
            if (((cast(size_t) onePastEnd) & 3) != 0 && *onePastEnd == '\0')
                return &str[0];
        }
    }

    // Fall back to the const(C)[] overload, which appends the terminator or
    // copies as needed.
    return toUTFzImpl!(P, const(C)[])(cast(const(C)[]) str);
}
2926 
// Produces a zero-terminated copy (or, when possible, an in-place view) of a
// mutable or const string whose character type already matches that of P.
//
// Params:
//     str = the string to terminate
// Returns:
//     A pointer of type P to the first code unit of a zero-terminated
//     sequence with the contents of str.
private P toUTFzImpl(P, S)(S str) @safe pure
if (isSomeString!S && isPointer!P && isSomeChar!(typeof(*P.init)) &&
    is(Unqual!(typeof(*P.init)) == Unqual!(ElementEncodingType!S)) &&
    !is(immutable(Unqual!(ElementEncodingType!S)) == ElementEncodingType!S))
//C[] or const(C)[] -> C*, const(C)*, or immutable(C)*
{
    alias InChar  = ElementEncodingType!S;
    alias OutChar = typeof(*P.init);

    //const(C)[] -> const(C)* or
    //C[] -> C* or const(C)*
    static if (( is(const(Unqual!InChar) == InChar) &&  is(const(Unqual!OutChar) == OutChar)) ||
               (!is(const(Unqual!InChar) == InChar) && !is(immutable(Unqual!OutChar) == OutChar)))
    {
        // The in-place shortcut returns &str[0], which is an out-of-bounds
        // index for an empty slice, so it is only tried for non-empty input.
        // Empty input falls through to the append below.
        if (!__ctfe && str.length != 0)
        {
            auto trustedPtrAdd(S s) @trusted { return s.ptr + s.length; }
            auto p = trustedPtrAdd(str);

            // Peek one past the end; an address that is a multiple of 4 is
            // skipped, anything else is read and compared with '\0'.
            if ((cast(size_t) p & 3) && *p == '\0')
                return &str[0];
        }

        str ~= '\0';
        return &str[0];
    }
    //const(C)[] -> C* or immutable(C)* or
    //C[] -> immutable(C)*
    else
    {
        // The qualifiers of input and output are incompatible, so a private
        // copy with room for the terminator is required.
        import std.array : uninitializedArray;
        auto copy = uninitializedArray!(Unqual!OutChar[])(str.length + 1);
        copy[0 .. $ - 1] = str[];
        copy[$ - 1] = '\0';

        auto trustedCast(typeof(copy) c) @trusted { return cast(P) c.ptr; }
        return trustedCast(copy);
    }
}
2966 
private P toUTFzImpl(P, S)(S str) @safe pure
if (isSomeString!S && isPointer!P && isSomeChar!(typeof(*P.init)) &&
    !is(Unqual!(typeof(*P.init)) == Unqual!(ElementEncodingType!S)))
//C1[], const(C1)[], or immutable(C1)[] -> C2*, const(C2)*, or immutable(C2)*
{
    import std.array : appender;

    alias OutChar = typeof(*P.init);
    auto buffer = appender!(OutChar[])();

    // Decode the source code point by code point; the appender re-encodes
    // each one for the target character type.
    foreach (dchar codePoint; str)
        buffer.put(codePoint);
    buffer.put('\0');

    auto trustedPtr() @trusted { return cast(P) buffer.data.ptr; }
    return trustedPtr();
}
2981 
// Checks toUTFz for combinations of source string type and target pointer
// type. Both halves are wrapped in assertCTFEable.
@safe pure unittest
{
    import core.exception : AssertError;
    import std.algorithm;
    import std.conv : to;
    import std.exception;
    import std.string : format;

    // Part 1: source and target share the same character type.
    assertCTFEable!(
    {
    foreach (S; AliasSeq!(string, wstring, dstring))
    {
        alias C = Unqual!(ElementEncodingType!S);

        auto s1 = to!S("hello\U00010143\u0100\U00010143");
        // s2 has the same contents as s1, but is built so that the element
        // following it in memory is '\n' rather than '\0'.
        auto temp = new C[](s1.length + 1);
        temp[0 .. $ - 1] = s1[0 .. $];
        temp[$ - 1] = '\n';
        --temp.length;
        auto trustedAssumeUnique(T)(T t) @trusted { return assumeUnique(t); }
        auto s2 = trustedAssumeUnique(temp);
        assert(s1 == s2);

        // The result must hold the contents of s followed by a terminator.
        void trustedCStringAssert(P, S)(S s) @trusted
        {
            auto p = toUTFz!P(s);
            assert(p[0 .. s.length] == s);
            assert(p[s.length] == '\0');
        }

        foreach (P; AliasSeq!(C*, const(C)*, immutable(C)*))
        {
            trustedCStringAssert!P(s1);
            trustedCStringAssert!P(s2);
        }
    }
    });

    // Converts s with toUTFz!P and compares the zero-terminated result with s.
    // On mismatch an AssertError carrying the caller's line is thrown.
    static void test(P, S)(S s, size_t line = __LINE__) @trusted
    {
        // Counts code units up to (not including) the first '\0'.
        static size_t zeroLen(C)(const(C)* ptr) @trusted
        {
            size_t len = 0;
            while (*ptr != '\0') { ++ptr; ++len; }
            return len;
        }

        auto p = toUTFz!P(s);
        immutable len = zeroLen(p);
        enforce(cmp(s, p[0 .. len]) == 0,
                new AssertError(format("Unit test failed: %s %s", P.stringof, S.stringof),
                                __FILE__, line));
    }

    // Part 2: conversions between different character types, followed by
    // every mutable/const source type against every target pointer type.
    assertCTFEable!(
    {
    foreach (P; AliasSeq!(wchar*, const(wchar)*, immutable(wchar)*,
                          dchar*, const(dchar)*, immutable(dchar)*))
    {
        test!P("hello\U00010143\u0100\U00010143");
    }
    foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
                          dchar*, const(dchar)*, immutable(dchar)*))
    {
        test!P("hello\U00010143\u0100\U00010143"w);
    }
    foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
                          wchar*, const(wchar)*, immutable(wchar)*))
    {
        test!P("hello\U00010143\u0100\U00010143"d);
    }
    foreach (S; AliasSeq!( char[], const( char)[],
                          wchar[], const(wchar)[],
                          dchar[], const(dchar)[]))
    {
        auto s = to!S("hello\U00010143\u0100\U00010143");

        foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
                              wchar*, const(wchar)*, immutable(wchar)*,
                              dchar*, const(dchar)*, immutable(dchar)*))
        {
            test!P(s);
        }
    }
    });
}
3068 
3069 
/++
    $(D toUTF16z) is a convenience function for $(D toUTFz!(const(wchar)*)).

    Encodes string $(D str) into UTF-16 and returns a pointer to the
    zero-terminated result. $(D toUTF16z) is suitable for calling the 'W'
    functions in the Win32 API that take an $(D LPWSTR) or $(D LPCWSTR)
    argument.

    Params:
        str = the string to encode
    Returns:
        A pointer to the zero-terminated UTF-16 encoding of $(D str).
  +/
const(wchar)* toUTF16z(C)(const(C)[] str) @safe pure
if (isSomeChar!C)
{
    alias toWideZ = toUTFz!(const(wchar)*);
    return toWideZ(str);
}
3082 
@safe pure unittest
{
    import std.conv : to;

    // toUTFz is already thoroughly tested, so this only verifies that
    // toUTF16z compiles properly for the various string types.
    foreach (S; AliasSeq!(string, wstring, dstring))
    {
        auto text = to!S("hello world");
        const(wchar)* wide = toUTF16z(text);
        assert(wide !is null);
    }
}
3091 
3092 
3093 /* ================================ tests ================================== */
3094 
// Round-trips between all three encodings, at run time and in CTFE.
@safe pure unittest
{
    import std.exception;

    assertCTFEable!(
    {
    // ASCII only.
    assert("hello"c.toUTF16 == "hello");
    assert("hello"c.toUTF32 == "hello");
    assert("hello"w.toUTF8  == "hello");
    assert("hello"w.toUTF32 == "hello");
    assert("hello"d.toUTF8  == "hello");
    assert("hello"d.toUTF16 == "hello");

    // A code point that needs three UTF-8 code units.
    assert("hel\u1234o"c.toUTF16 == "hel\u1234o");
    assert("hel\u1234o"c.toUTF32 == "hel\u1234o");
    assert("hel\u1234o"w.toUTF8  == "hel\u1234o");
    assert("hel\u1234o"w.toUTF32 == "hel\u1234o");
    assert("hel\u1234o"d.toUTF8  == "hel\u1234o");
    assert("hel\u1234o"d.toUTF16 == "hel\u1234o");

    // A code point that needs two UTF-16 code units.
    assert("he\U0010AAAAllo"c.toUTF16 == "he\U0010AAAAllo");
    assert("he\U0010AAAAllo"c.toUTF32 == "he\U0010AAAAllo");
    assert("he\U0010AAAAllo"w.toUTF8  == "he\U0010AAAAllo");
    assert("he\U0010AAAAllo"w.toUTF32 == "he\U0010AAAAllo");
    assert("he\U0010AAAAllo"d.toUTF8  == "he\U0010AAAAllo");
    assert("he\U0010AAAAllo"d.toUTF16 == "he\U0010AAAAllo");
    });
}
3123 
3124 
/++
    Returns the total number of code points encoded in $(D str).

    Supersedes: This function supersedes $(LREF toUCSindex).

    Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

    Params:
        str = the string whose code points are counted

    Returns:
        The number of code points found by walking $(D str).

    Note:
        This function is $(D nothrow), so it cannot report malformed input
        through a $(D UTFException). Call $(LREF validate) first if $(D str)
        may not be well-formed.
  +/
size_t count(C)(const(C)[] str) @safe pure nothrow @nogc
if (isSomeChar!C)
{
    // The body needs no unsafe operation, so the function is @safe rather
    // than @trusted and the compiler keeps checking it.
    return walkLength(str);
}
3140 
// count on the empty string, ASCII and one multi-unit code point, at run time
// and in CTFE.
@safe pure nothrow @nogc unittest
{
    import std.exception;

    assertCTFEable!(
    {
    assert("".count == 0);
    assert("a".count == 1);
    assert("abc".count == 3);
    // U+20AC takes three UTF-8 code units but is a single code point.
    assert("\u20AC100".count == 4);
    });
}
3152 
3153 
3154 // Ranges of code units for testing.
version(unittest)3155 version (unittest)
3156 {
3157     struct InputCU(C)
3158     {
3159         import std.conv : to;
3160         @property bool empty() { return _str.empty; }
3161         @property C front() { return _str[0]; }
3162         void popFront() { _str = _str[1 .. $]; }
3163 
3164         this(inout(C)[] str)
3165         {
3166             _str = to!(C[])(str);
3167         }
3168 
3169         C[] _str;
3170     }
3171 
3172     struct BidirCU(C)
3173     {
3174         import std.conv : to;
3175         @property bool empty() { return _str.empty; }
3176         @property C front() { return _str[0]; }
3177         void popFront() { _str = _str[1 .. $]; }
3178         @property C back() { return _str[$ - 1]; }
3179         void popBack() { _str = _str[0 .. $ - 1]; }
3180         @property auto save() { return BidirCU(_str); }
3181         @property size_t length() { return _str.length; }
3182 
3183         this(inout(C)[] str)
3184         {
3185             _str = to!(C[])(str);
3186         }
3187 
3188         C[] _str;
3189     }
3190 
3191     struct RandomCU(C)
3192     {
3193         import std.conv : to;
3194         @property bool empty() { return _str.empty; }
3195         @property C front() { return _str[0]; }
3196         void popFront() { _str = _str[1 .. $]; }
3197         @property C back() { return _str[$ - 1]; }
3198         void popBack() { _str = _str[0 .. $ - 1]; }
3199         @property auto save() { return RandomCU(_str); }
3200         @property size_t length() { return _str.length; }
3201         C opIndex(size_t i) { return _str[i]; }
3202         auto opSlice(size_t i, size_t j) { return RandomCU(_str[i .. j]); }
3203 
3204         this(inout(C)[] str)
3205         {
3206             _str = to!(C[])(str);
3207         }
3208 
3209         C[] _str;
3210     }
3211 
3212     class RefBidirCU(C)
3213     {
3214         import std.conv : to;
3215         @property bool empty() { return _str.empty; }
3216         @property C front() { return _str[0]; }
3217         void popFront() { _str = _str[1 .. $]; }
3218         @property C back() { return _str[$ - 1]; }
3219         void popBack() { _str = _str[0 .. $ - 1]; }
3220         @property auto save() { return new RefBidirCU(_str); }
3221         @property size_t length() { return _str.length; }
3222 
3223         this(inout(C)[] str)
3224         {
3225             _str = to!(C[])(str);
3226         }
3227 
3228         C[] _str;
3229     }
3230 
3231     class RefRandomCU(C)
3232     {
3233         import std.conv : to;
3234         @property bool empty() { return _str.empty; }
3235         @property C front() { return _str[0]; }
3236         void popFront() { _str = _str[1 .. $]; }
3237         @property C back() { return _str[$ - 1]; }
3238         void popBack() { _str = _str[0 .. $ - 1]; }
3239         @property auto save() { return new RefRandomCU(_str); }
3240         @property size_t length() { return _str.length; }
3241         C opIndex(size_t i) { return _str[i]; }
3242         auto opSlice(size_t i, size_t j) { return new RefRandomCU(_str[i .. j]); }
3243 
3244         this(inout(C)[] str)
3245         {
3246             _str = to!(C[])(str);
3247         }
3248 
3249         C[] _str;
3250     }
3251 }
3252 
3253 
/**
 * The code point U+FFFD, inserted in place of invalid UTF sequences.
 *
 * References:
 *      $(LINK http://en.wikipedia.org/wiki/Replacement_character#Replacement_character)
 */
enum dchar replacementDchar = cast(dchar) 0xFFFD;
3261 
3262 /********************************************
3263  * Iterate a range of char, wchar, or dchars by code unit.
3264  *
3265  * The purpose is to bypass the special case decoding that
3266  * $(REF front, std,range,primitives) does to character arrays. As a result,
3267  * using ranges with `byCodeUnit` can be `nothrow` while
3268  * $(REF front, std,range,primitives) throws when it encounters invalid Unicode
3269  * sequences.
3270  *
3271  * A code unit is a building block of the UTF encodings. Generally, an
3272  * individual code unit does not represent what's perceived as a full
3273  * character (a.k.a. a grapheme cluster in Unicode terminology). Many characters
3274  * are encoded with multiple code units. For example, the UTF-8 code units for
3275  * `ø` are `0xC3 0xB8`. That means, an individual element of `byCodeUnit`
3276  * often does not form a character on its own. Attempting to treat it as
3277  * one while iterating over the resulting range will give nonsensical results.
3278  *
3279  * Params:
3280  *      r = an input range of characters (including strings) or a type that
3281  *          implicitly converts to a string type.
3282  * Returns:
3283  *     If `r` is not an auto-decodable string (i.e. a narrow string or a
 *     user-defined type that implicitly converts to a string type), then `r`
3285  *     is returned.
3286  *
3287  *      Otherwise, `r` is converted to its corresponding string type (if it's
3288  *      not already a string) and wrapped in a random-access range where the
3289  *      element encoding type of the string (its code unit) is the element type
 *      of the range, and that range is returned. The range has slicing.
3291  *
3292  *      If `r` is quirky enough to be a struct or class which is an input range
3293  *      of characters on its own (i.e. it has the input range API as member
3294  *      functions), $(I and) it's implicitly convertible to a string type, then
3295  *      `r` is returned, and no implicit conversion takes place.
3296  * See_Also:
3297  *      Refer to the $(MREF std, uni) docs for a reference on Unicode
3298  *      terminology.
3299  *
3300  *      For a range that iterates by grapheme cluster (written character) see
3301  *      $(REF byGrapheme, std,uni).
3302  */
auto byCodeUnit(R)(R r)
if (isAutodecodableString!R ||
    isInputRange!R && isSomeChar!(ElementEncodingType!R) ||
    (is(R : const dchar[]) && !isStaticArray!R))
{
    // True when R itself declares none of the input range primitives as
    // members.
    // This would be cleaner if we had a way to check whether a type
    // was a range without any implicit conversions.
    enum lacksRangeMembers = !__traits(hasMember, R, "empty") &&
                             !__traits(hasMember, R, "front") &&
                             !__traits(hasMember, R, "popFront");

    enum needsWrapper = isNarrowString!R || (isAutodecodableString!R && lacksRangeMembers);
    enum convertsToDstring = is(R : const dchar[]) && lacksRangeMembers;

    static if (needsWrapper)
    {
        // Random-access view of the string's code units.
        static struct ByCodeUnitImpl
        {
        @safe pure nothrow @nogc:

            @property size_t length() const { return str.length; }
            alias opDollar = length;

            @property bool empty() const { return !str.length; }

            @property auto ref front() inout { return str[0]; }
            @property auto ref back() inout  { return str[$ - 1]; }

            void popFront() { str = str[1 .. $]; }
            void popBack()  { str = str[0 .. $-1]; }

            @property auto save() { return ByCodeUnitImpl(str.save); }

            auto ref opIndex(size_t index) inout { return str[index]; }

            auto opSlice(size_t lower, size_t upper)
            {
                return ByCodeUnitImpl(str[lower .. upper]);
            }

          private:
            StringTypeOf!R str;
        }

        static assert(isRandomAccessRange!ByCodeUnitImpl);

        return ByCodeUnitImpl(r);
    }
    else static if (convertsToDstring)
    {
        // Cast to the corresponding string type; no wrapper is created.
        return cast(StringTypeOf!R) r;
    }
    else
    {
        // byCodeUnit for ranges and dchar[] is a no-op
        return r;
    }
}
3352 
///
@safe unittest
{
    import std.range.primitives;

    // byCodeUnit yields a random-access range of code units
    auto r = "Hello, World!".byCodeUnit();
    static assert(hasLength!(typeof(r)));
    static assert(hasSlicing!(typeof(r)));
    static assert(isRandomAccessRange!(typeof(r)));
    static assert(is(ElementType!(typeof(r)) == immutable char));

    // contrast with the range capabilities of standard strings
    auto s = "Hello, World!";
    static assert(isBidirectionalRange!(typeof(s)));
    static assert(is(ElementType!(typeof(s)) == dchar));

    static assert(!isRandomAccessRange!(typeof(s)));
    static assert(!hasSlicing!(typeof(s)));
    static assert(!hasLength!(typeof(s)));
}
3373 
/// `byCodeUnit` does no Unicode decoding
@safe unittest
{
    // noël spelled with a plain e followed by a combining diaeresis
    string decomposed = "noe\u0308l";
    auto decomposedUnits = decomposed.byCodeUnit;
    assert(decomposedUnits[2] == 'e');
    assert(decomposedUnits[2] != 'ë');

    // noël spelled with the precomposed ë character
    string precomposed = "no\u00EBl";
    auto precomposedUnits = precomposed.byCodeUnit;
    // Because string is UTF-8, the code unit at index 2 is just
    // the first of a sequence that encodes 'ë'
    assert(precomposedUnits[2] != 'ë');
}
3386 
// Checks byCodeUnit over string literals of every width, mutable buffers,
// `alias this` string wrappers, user-defined ranges, string-based enums and
// (rejected) static arrays.
@safe pure nothrow @nogc unittest
{
    import std.range;

    // Round trip of UTF-8 code units. The literal mixes supplementary-plane
    // characters (written as escapes so they survive transcoding), ASCII and
    // BMP characters, so 1-, 3- and 4-byte sequences are all copied.
    {
        enum testStr = "\U00010044\U0001008C\U000100EF hello ディラン";
        char[testStr.length] s;
        int i;
        foreach (c; testStr.byCodeUnit().byCodeUnit())
        {
            s[i++] = c;
        }
        assert(s == testStr);
    }
    // Same round trip for UTF-16 (surrogate pairs and single units).
    {
        enum testStr = "\U00010044\U0001008C\U000100EF hello ディラン"w;
        wchar[testStr.length] s;
        int i;
        foreach (c; testStr.byCodeUnit().byCodeUnit())
        {
            s[i++] = c;
        }
        assert(s == testStr);
    }
    // Same round trip for UTF-32.
    {
        enum testStr = "\U00010044\U0001008C\U000100EF hello ディラン"d;
        dchar[testStr.length] s;
        int i;
        foreach (c; testStr.byCodeUnit().byCodeUnit())
        {
            s[i++] = c;
        }
        assert(s == testStr);
    }
    // length, indexing and slicing
    {
        auto bcu = "hello".byCodeUnit();
        assert(bcu.length == 5);
        assert(bcu[3] == 'l');
        assert(bcu[2 .. 4][1] == 'l');
    }
    // Assignment through front and opIndex on a mutable buffer
    {
        char[5] orig = "hello";
        auto bcu = orig[].byCodeUnit();
        bcu.front = 'H';
        assert(bcu.front == 'H');
        bcu[1] = 'E';
        assert(bcu[1] == 'E');
    }
    // save yields an independent copy
    {
        auto bcu = "hello".byCodeUnit().byCodeUnit();
        static assert(isForwardRange!(typeof(bcu)));
        static assert(is(typeof(bcu) == struct));
        auto s = bcu.save;
        bcu.popFront();
        assert(s.front == 'h');
    }
    // Bidirectional iteration over char code units; applying byCodeUnit
    // again does not change the type.
    {
        auto bcu = "hello".byCodeUnit();
        static assert(hasSlicing!(typeof(bcu)));
        static assert(isBidirectionalRange!(typeof(bcu)));
        static assert(is(typeof(bcu) == struct));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        auto ret = bcu.retro;
        assert(ret.front == 'o');
        ret.popFront();
        assert(ret.front == 'l');
    }
    // Bidirectional iteration over wchar code units
    {
        auto bcu = "κόσμε"w.byCodeUnit();
        static assert(hasSlicing!(typeof(bcu)));
        static assert(isBidirectionalRange!(typeof(bcu)));
        static assert(is(typeof(bcu) == struct));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        auto ret = bcu.retro;
        assert(ret.front == 'ε');
        ret.popFront();
        assert(ret.front == 'μ');
    }
    // A struct that converts to string through `alias this` is wrapped in
    // the code-unit struct rather than returned unchanged.
    {
        static struct Stringish
        {
            string s;
            alias s this;
        }

        auto orig = Stringish("\U0010fff8 \U0001004A foo \U00010093");
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == struct));
        static assert(!is(typeof(bcu) == Stringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == immutable char));
        // 244 == 0xF4, the lead byte of U+10FFF8 in UTF-8
        assert(bcu.front == cast(char) 244);
    }
    {
        static struct WStringish
        {
            wstring s;
            alias s this;
        }

        auto orig = WStringish("\U0010fff8 \U0001004A foo \U00010093"w);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == struct));
        static assert(!is(typeof(bcu) == WStringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == immutable wchar));
        // 56319 == 0xDBFF, the high surrogate of U+10FFF8 in UTF-16
        assert(bcu.front == cast(wchar) 56319);
    }
    // For a dstring wrapper the result is the plain dstring.
    {
        static struct DStringish
        {
            dstring s;
            alias s this;
        }

        auto orig = DStringish("\U0010fff8 \U0001004A foo \U00010093"d);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == dstring));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == immutable dchar));
        // 1114104 == 0x10FFF8
        assert(bcu.front == cast(dchar) 1114104);
    }
    // `alias this` to a member function returning string behaves like the
    // field-based wrapper above.
    {
        static struct FuncStringish
        {
            string str;
            string s() pure nothrow @nogc { return str; }
            alias s this;
        }

        auto orig = FuncStringish("\U0010fff8 \U0001004A foo \U00010093");
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == struct));
        static assert(!is(typeof(bcu) == FuncStringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == immutable char));
        assert(bcu.front == cast(char) 244);
    }
    // User-defined input ranges of code units are returned with their own type.
    {
        static struct Range
        {
            string data;
            bool empty() pure nothrow @nogc { return data.empty; }
            char front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
        }

        auto orig = Range("\U0010fff8 \U0001004A foo \U00010093");
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == Range));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == char));
        assert(bcu.front == cast(char) 244);
    }
    {
        static struct WRange
        {
            wstring data;
            bool empty() pure nothrow @nogc { return data.empty; }
            wchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
        }

        auto orig = WRange("\U0010fff8 \U0001004A foo \U00010093"w);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == WRange));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == wchar));
        assert(bcu.front == 56319);
    }
    {
        static struct DRange
        {
            dstring data;
            bool empty() pure nothrow @nogc { return data.empty; }
            dchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
        }

        auto orig = DRange("\U0010fff8 \U0001004A foo \U00010093"d);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == DRange));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == dchar));
        assert(bcu.front == 1114104);
    }
    // A type that is both a range and string-convertible is treated as a
    // range: front comes from `data` ("test.d"), not from the aliased `s`.
    {
        static struct RangeAndStringish
        {
            bool empty() pure nothrow @nogc { return data.empty; }
            char front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }

            string data;
            string s;
            alias s this;
        }

        auto orig = RangeAndStringish("test.d", "other");
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == RangeAndStringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == char));
        assert(bcu.front == 't');
    }
    {
        static struct WRangeAndStringish
        {
            bool empty() pure nothrow @nogc { return data.empty; }
            wchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }

            wstring data;
            wstring s;
            alias s this;
        }

        auto orig = WRangeAndStringish("test.d"w, "other"w);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == WRangeAndStringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == wchar));
        assert(bcu.front == 't');
    }
    {
        static struct DRangeAndStringish
        {
            bool empty() pure nothrow @nogc { return data.empty; }
            dchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }

            dstring data;
            dstring s;
            alias s this;
        }

        auto orig = DRangeAndStringish("test.d"d, "other"d);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == DRangeAndStringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == dchar));
        assert(bcu.front == 't');
    }
    // Enums with a string base type: string/wstring enums are wrapped,
    // a dstring enum yields a plain dstring.
    {
        enum Enum : string { a = "test.d" }

        auto orig = Enum.a;
        auto bcu = orig.byCodeUnit();
        static assert(!is(typeof(bcu) == Enum));
        static assert(is(typeof(bcu) == struct));
        static assert(is(ElementType!(typeof(bcu)) == immutable char));
        assert(bcu.front == 't');
    }
    {
        enum WEnum : wstring { a = "test.d"w }

        auto orig = WEnum.a;
        auto bcu = orig.byCodeUnit();
        static assert(!is(typeof(bcu) == WEnum));
        static assert(is(typeof(bcu) == struct));
        static assert(is(ElementType!(typeof(bcu)) == immutable wchar));
        assert(bcu.front == 't');
    }
    {
        enum DEnum : dstring { a = "test.d"d }

        auto orig = DEnum.a;
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == dstring));
        static assert(is(ElementType!(typeof(bcu)) == immutable dchar));
        assert(bcu.front == 't');
    }

    // Narrow strings never come back as the string type itself; dstrings do.
    static assert(!is(typeof(byCodeUnit("hello")) == string));
    static assert(!is(typeof(byCodeUnit("hello"w)) == wstring));
    static assert(is(typeof(byCodeUnit("hello"d)) == dstring));

    // Static arrays are rejected, directly ...
    static assert(!__traits(compiles, byCodeUnit((char[5]).init)));
    static assert(!__traits(compiles, byCodeUnit((wchar[5]).init)));
    static assert(!__traits(compiles, byCodeUnit((dchar[5]).init)));

    enum SEnum : char[5] { a = "hello" }
    enum WSEnum : wchar[5] { a = "hello"w }
    enum DSEnum : dchar[5] { a = "hello"d }

    // ... and as the base type of an enum.
    static assert(!__traits(compiles, byCodeUnit(SEnum.a)));
    static assert(!__traits(compiles, byCodeUnit(WSEnum.a)));
    static assert(!__traits(compiles, byCodeUnit(DSEnum.a)));
}
3675 
/****************************
 * Convenience names for $(LREF byUTF) instantiated with a fixed target
 * code unit type: `char` for `byChar`, `wchar` for `byWchar` and
 * `dchar` for `byDchar`.
 *
 * Params:
 *      r = input range of characters, or array of characters
 */
alias byChar = byUTF!(char);

/// Ditto
alias byWchar = byUTF!(wchar);

/// Ditto
alias byDchar = byUTF!(dchar);
3691 
// byChar: re-encoding to UTF-8 from char, wchar and dchar sources.
@safe pure nothrow @nogc unittest
{
    // Chaining byChar on a source that is already UTF-8 yields the same units.
    {
        char[5] s;
        int i;
        foreach (c; "hello".byChar.byChar())
        {
            s[i++] = c;
        }
        assert(s == "hello");
    }
    // UTF-32 to UTF-8: the buffer size spells out the expected encoded
    // lengths (5 ASCII, then 2-, 3- and 4-byte sequences, then two U+FFFD
    // replacements of 3 bytes each for the invalid code points).
    {
        char[5+2+3+4+3+3] s;
        int i;
        dchar[10] a;
        a[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d;
        a[8] = 0xD800;   // invalid
        a[9] = cast(dchar) 0x110000; // invalid
        foreach (c; a[].byChar())
        {
            s[i++] = c;
        }
        assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD");
    }
    // popFront / front on a UTF-16 source
    {
        auto r = "hello"w.byChar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    // popFront / front on a UTF-32 source
    {
        auto r = "hello"d.byChar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    // The result is a forward range and save is independent of the original.
    {
        auto r = "hello"d.byChar();
        static assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
}
3738 
// byWchar: re-encoding to UTF-16 from char and dchar sources.
@safe pure nothrow @nogc unittest
{
    // UTF-32 to UTF-16: 5 ASCII units, two single-unit BMP characters, one
    // surrogate pair for U+10FFFF, and one U+FFFD for each of the two
    // invalid code points (11 units in total).
    {
        wchar[11] s;
        int i;
        dchar[10] a;
        a[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d;
        a[8] = 0xD800;   // invalid
        a[9] = cast(dchar) 0x110000; // invalid
        foreach (c; a[].byWchar())
        {
            s[i++] = c;
        }
        assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"w);
    }

    // popFront / front on a UTF-8 source
    {
        auto r = "hello".byWchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    // popFront / front on a UTF-32 source
    {
        auto r = "hello"d.byWchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    // The result is a forward range and save is independent of the original.
    {
        auto r = "hello"d.byWchar();
        static assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
}
3780 
// byDchar: decoding to UTF-32 from char, wchar and dchar sources.
@safe pure nothrow @nogc unittest
{
    // UTF-8 source
    {
        dchar[9] s;
        int i;
        string a = "hello\u07FF\uD7FF\U00010000\U0010FFFF"; // 1,2,3,4 byte sequences
        foreach (c; a.byDchar())
        {
            s[i++] = c;
        }
        assert(s == "hello\u07FF\uD7FF\U00010000\U0010FFFF"d);
    }
    // Invalid UTF-8 decodes to the replacement character, and front is
    // stable when read repeatedly.
    {
        foreach (s; invalidUTFstrings!char())
        {
            auto r = s.byDchar();
            assert(!r.empty);
            assert(r.front == r.front);
            dchar c = r.front;
            assert(c == replacementDchar);
        }
    }
    {
        auto r = "hello".byDchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }

    // UTF-16 source
    {
        dchar[8] s;
        int i;
        wstring a = "hello\u07FF\uD7FF\U0010FFFF"w;
        foreach (c; a.byDchar())
        {
            s[i++] = c;
        }
        assert(s == "hello\u07FF\uD7FF\U0010FFFF"d);
    }
    // Invalid UTF-16 decodes to the replacement character.
    {
        foreach (s; invalidUTFstrings!wchar())
        {
            auto r = s.byDchar();
            assert(!r.empty);
            assert(r.front == r.front);
            dchar c = r.front;
            assert(c == replacementDchar);
        }
    }
    {
        wchar[2] ws;
        ws[0] = 0xD800;
        ws[1] = 0xDD00;             // correct surrogate pair
        auto r = ws[].byDchar();
        assert(!r.empty);
        assert(r.front == r.front);
        dchar c = r.front;
        assert(c == '\U00010100');
    }
    {
        auto r = "hello"w.byDchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }

    // UTF-32 source: chaining byDchar yields the same code points.
    {
        dchar[5] s;
        int i;
        dstring a = "hello"d;
        foreach (c; a.byDchar.byDchar())
        {
            s[i++] = c;
        }
        assert(s == "hello"d);
    }
    // Forward-range behaviour for UTF-8 and UTF-16 sources
    {
        auto r = "hello".byDchar();
        static assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
    {
        auto r = "hello"w.byDchar();
        static assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
}
3874 
// Compile-time check of attribute inference: byChar/byWchar/byDchar must be
// usable from pure, @safe, nothrow, @nogc code when the source range allows
// it (the range types they wrap may or may not carry those attributes).

pure @safe nothrow @nogc unittest
{
    dchar[5] text = "hello"d;

    // The loop bodies are intentionally empty; only instantiation and
    // iteration under the attributes above are being checked.
    foreach (unit; text[].byChar())
    {
    }
    foreach (unit; text[].byWchar())
    {
    }
    foreach (unit; text[].byDchar())
    {
    }
}
3885 
// Module-level mutable state, compiled only in unittest builds. Touching it
// from a function makes that function impure.
version (unittest)
{
    int impureVariable;
}
3888 
// byChar/byWchar/byDchar must also accept a source range whose popFront is
// impure and throws. The range below always reports empty, so popFront is
// never executed; the test passes by compiling and running the empty loops.
@system unittest
{
    static struct ImpureThrowingSystemRange(Char)
    {
        @property bool empty() const
        {
            return true;
        }

        @property Char front() const
        {
            return Char.init;
        }

        void popFront()
        {
            // Writing a module-level variable makes this function impure.
            impureVariable++;
            throw new Exception("only for testing nothrow");
        }
    }

    foreach (CodeUnit; AliasSeq!(char, wchar, dchar))
    {
        ImpureThrowingSystemRange!CodeUnit source;
        foreach (unit; source.byChar())
        {
        }
        foreach (unit; source.byWchar())
        {
        }
        foreach (unit; source.byDchar())
        {
        }
    }
}
3910 
/****************************
 * Lazily re-encode an input range of characters as a range of code units
 * of type `C`.
 *
 * Input that cannot be converted to the requested encoding is replaced by
 * U+FFFD per "5.22 Best Practice for U+FFFD Substitution" of the Unicode
 * Standard 6.2, so a round trip through byUTF is not necessarily lossless.
 * No memory is allocated; at most one encoded character is buffered inside
 * the returned range. `@nogc`, `pure`-ity, `nothrow`, and `@safe`-ty are
 * inferred from the `r` parameter.
 *
 * Params:
 *      C = `char`, `wchar`, or `dchar` (qualifiers on `C` are stripped)
 *      r = input range of characters, or array of characters
 *
 * Returns:
 *      If `R` is auto-decodable, as defined by
 *      $(REF isAutodecodableString, std, traits), `r` is first passed through
 *      $(LREF byCodeUnit) and the rules below are applied to that result.
 *
 *      If the unqualified element encoding type of `R` already is `C`,
 *      the result of `r.byCodeUnit()`.
 *
 *      Otherwise, an input range of `C` that transcodes on demand; it is
 *      also a forward range when `R` is a forward range.
 */
template byUTF(C)
if (isSomeChar!C)
{
    static if (!is(Unqual!C == C))
        alias byUTF = byUTF!(Unqual!C);
    else:

    // Auto-decodable strings: drop to code units, then dispatch again.
    auto ref byUTF(R)(R r)
        if (isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R))
    {
        return byUTF(r.byCodeUnit());
    }

    // Ranges of code units (anything that is not an auto-decodable string).
    auto ref byUTF(R)(R r)
        if (!isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R))
    {
        alias RC = Unqual!(ElementEncodingType!R);

        static if (is(RC == C))
        {
            // Source and target encodings agree: no transcoding required.
            return r.byCodeUnit();
        }
        else
        {
            static struct Result
            {
                @property bool empty()
                {
                    // Buffered code units still have to be handed out first.
                    if (cursor != count)
                        return false;
                    return r.empty;
                }

                @property auto front() scope // 'scope' required by call to decodeFront() below
                {
                    // Fast path: the current character is already encoded.
                    if (cursor != count)
                        return units[cursor];

                    // Buffer exhausted: consume one character from the
                    // source and encode it into `units`.
                    cursor = 0;
                    auto c = r.front;

                    if (c <= 0x7F)
                    {
                        // ASCII is a single code unit in every encoding.
                        count = 1;
                        r.popFront;
                        units[0] = cast(C) c;
                    }
                    else
                    {
                        static if (is(RC == dchar))
                        {
                            // A dchar source needs no decoding.
                            r.popFront;
                            dchar dc = c;
                        }
                        else
                        {
                            // decodeFront consumes the whole sequence from `r`.
                            dchar dc = () @trusted { return decodeFront!(Yes.useReplacementDchar)(r); }();
                        }
                        count = cast(ushort) encode!(Yes.useReplacementDchar)(units, dc);
                    }

                    return units[cursor];
                }

                void popFront()
                {
                    // Make sure a character is buffered before stepping past
                    // one of its code units.
                    if (cursor == count)
                    {
                        front;
                    }
                    cursor++;
                }

                static if (isForwardRange!R)
                {
                    @property auto save() return scope
                    /* `return scope` cannot be inferred because compiler does not
                     * track it backwards from assignment to local `ret`
                     */
                    {
                        auto copy = this;
                        copy.r = r.save;
                        return copy;
                    }
                }

            private:

                R r;                              // remaining source
                C[4 / C.sizeof] units = void;     // code units of the current character
                ushort cursor, count;             // read index into / valid length of `units`
            }

            return Result(r);
        }
    }
}
4026 
///
@safe pure nothrow unittest
{
    import std.algorithm.comparison : equal;

    // hellö as a range of `char`s, which are UTF-8
    assert("hell\u00F6".byUTF!char().equal(['h', 'e', 'l', 'l', 0xC3, 0xB6]));

    // `wchar`s are able to hold the ö in a single element (UTF-16 code unit)
    assert("hell\u00F6".byUTF!wchar().equal(['h', 'e', 'l', 'l', 'ö']));

    // U+10437 is four code units in UTF-8, two in UTF-16, and one in UTF-32
    assert("\U00010437".byUTF!char().equal([0xF0, 0x90, 0x90, 0xB7]));
    assert("\U00010437".byUTF!wchar().equal([0xD801, 0xDC37]));
    assert("\U00010437".byUTF!dchar().equal([0x00010437]));
}
4043