1 // Written in the D programming language.
2
3 /++
4 Encode and decode UTF-8, UTF-16 and UTF-32 strings.
5
6 UTF character support is restricted to
7 $(D '\u0000' <= character <= '\U0010FFFF').
8
9 $(SCRIPT inhibitQuickIndex = 1;)
10 $(BOOKTABLE,
11 $(TR $(TH Category) $(TH Functions))
12 $(TR $(TD Decode) $(TD
13 $(LREF decode)
14 $(LREF decodeFront)
15 ))
16 $(TR $(TD Lazy decode) $(TD
17 $(LREF byCodeUnit)
18 $(LREF byChar)
19 $(LREF byWchar)
20 $(LREF byDchar)
21 $(LREF byUTF)
22 ))
23 $(TR $(TD Encode) $(TD
24 $(LREF encode)
25 $(LREF toUTF8)
26 $(LREF toUTF16)
27 $(LREF toUTF32)
28 $(LREF toUTFz)
29 $(LREF toUTF16z)
30 ))
31 $(TR $(TD Length) $(TD
32 $(LREF codeLength)
33 $(LREF count)
34 $(LREF stride)
35 $(LREF strideBack)
36 ))
37 $(TR $(TD Index) $(TD
38 $(LREF toUCSindex)
39 $(LREF toUTFindex)
40 ))
41 $(TR $(TD Validation) $(TD
42 $(LREF isValidDchar)
43 $(LREF validate)
44 ))
45 $(TR $(TD Miscellaneous) $(TD
46 $(LREF replacementDchar)
47 $(LREF UseReplacementDchar)
48 $(LREF UTFException)
49 ))
50 )
51 See_Also:
52 $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
53 $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
54 $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
55 Copyright: Copyright Digital Mars 2000 - 2012.
56 License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
57 Authors: $(HTTP digitalmars.com, Walter Bright) and Jonathan M Davis
58 Source: $(PHOBOSSRC std/_utf.d)
59 +/
60 module std.utf;
61
62 import std.exception; // basicExceptionCtors
63 import std.meta; // AliasSeq
64 import std.range.primitives;
65 import std.traits; // isSomeChar, isSomeString
66 import std.typecons; // Flag, Yes, No
67
68
69 /++
70 Exception thrown on errors in std.utf functions.
71 +/
class UTFException : Exception
{
    import core.internal.string : unsignedToTempString, UnsignedStringBuf;

    // Code units of the offending sequence; only the first `len` entries
    // are meaningful.
    uint[4] sequence;
    // Number of entries of `sequence` that have been set.
    size_t len;

    // Records up to four code units of the invalid sequence and returns
    // `this` so the call can be chained onto a `new UTFException(...)`.
    @safe pure nothrow @nogc
    UTFException setSequence(scope uint[] data...)
    {
        assert(data.length <= 4);

        // Clamp as well, so builds without asserts cannot overrun `sequence`.
        len = data.length;
        if (len > 4)
            len = 4;

        foreach (idx; 0 .. len)
            sequence[idx] = data[idx];

        return this;
    }

    // FIXME: Use std.exception.basicExceptionCtors here once bug #11500 is fixed

    // Plain constructor: forwards everything to Exception.
    this(string msg, string file = __FILE__, size_t line = __LINE__,
         Throwable next = null) @nogc @safe pure nothrow
    {
        super(msg, file, line, next);
    }

    // Constructor that appends the failing code unit index to the message.
    this(string msg, size_t index, string file = __FILE__,
         size_t line = __LINE__, Throwable next = null) @safe pure nothrow
    {
        UnsignedStringBuf scratch = void;
        msg ~= " (at index ";
        msg ~= unsignedToTempString(index, scratch, 10);
        msg ~= ")";
        super(msg, file, line, next);
    }

    // Without a recorded sequence this defers to Exception.toString();
    // otherwise it lists the recorded code units in hexadecimal, followed
    // by the message when there is one.
    override string toString() const
    {
        if (len == 0)
        {
            /* Exception.toString() is not marked as const, although
             * it is const-compatible, hence the trusted cast.
             */
            auto e = () @trusted { return cast(Exception) super; } ();
            return e.toString();
        }

        string text = "Invalid UTF sequence:";

        foreach (unit; sequence[0 .. len])
        {
            UnsignedStringBuf scratch = void;
            auto hex = unsignedToTempString(unit, scratch, 16);
            // Single-digit values are padded to two digits.
            text ~= (hex.length == 1) ? " 0" : " ";
            text ~= hex;
            text ~= 'x';
        }

        if (super.msg.length > 0)
        {
            text ~= " - ";
            text ~= super.msg;
        }

        return text;
    }
}
141
/*
Provide array of invalidly encoded UTF strings. Useful for testing.

The table is selected at compile time from the code unit type. Each entry
is one malformed sequence; the comment next to it says what is wrong.

Params:
    Char = char, wchar, or dchar

Returns:
    an array of invalidly encoded UTF strings. The char and wchar branches
    return a slice of a static immutable table; the dchar branch returns
    the static array itself.
 */

package auto invalidUTFstrings(Char)() @safe pure @nogc nothrow
if (isSomeChar!Char)
{
    static if (is(Char == char))
    {
        enum x = 0xDC00;         // invalid surrogate value
        enum y = 0x110000;       // out of range

        static immutable string[8] result =
        [
            "\x80",             // not a start byte
            "\xC0",             // truncated
            "\xC0\xC0",         // invalid continuation
            "\xF0\x82\x82\xAC", // overlong
            // x (a surrogate value) written out with the 3-byte bit layout
            [
              0xE0 | (x >> 12),
              0x80 | ((x >> 6) & 0x3F),
              0x80 | (x & 0x3F)
            ],
            // y (beyond 0x10FFFF) written out with the 4-byte bit layout
            [
              cast(char)(0xF0 | (y >> 18)),
              cast(char)(0x80 | ((y >> 12) & 0x3F)),
              cast(char)(0x80 | ((y >> 6) & 0x3F)),
              cast(char)(0x80 | (y & 0x3F))
            ],
            [
              cast(char)(0xF8 | 3),     // 5 byte encoding
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
            ],
            [
              cast(char)(0xFC | 3),     // 6 byte encoding
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
            ],
        ];

        return result[];
    }
    else static if (is(Char == wchar))
    {
        static immutable wstring[5] result =
        [
            // lone code unit at the bottom of the 0xDC00 .. 0xDFFF range
            [
              cast(wchar) 0xDC00,
            ],
            // lone code unit at the top of the 0xDC00 .. 0xDFFF range
            [
              cast(wchar) 0xDFFF,
            ],
            // 0xD800 .. 0xDBFF unit followed by another unit of that range
            [
              cast(wchar) 0xDBFF,
              cast(wchar) 0xDBFF,
            ],
            // 0xD800 .. 0xDBFF unit followed by a non-surrogate unit
            [
              cast(wchar) 0xDBFF,
              cast(wchar) 0xE000,
            ],
            // 0xD800 .. 0xDBFF unit with nothing after it
            [
              cast(wchar) 0xD800,
            ],
        ];

        return result[];
    }
    else static if (is(Char == dchar))
    {
        static immutable dstring[3] result =
        [
            [ cast(dchar) 0x110000 ],   // above 0x10FFFF
            [ cast(dchar) 0x00D800 ],   // first value of the surrogate range
            [ cast(dchar) 0x00DFFF ],   // last value of the surrogate range
        ];

        return result;
    }
    else
        static assert(0);
}
235
236 /++
237 Check whether the given Unicode code point is valid.
238
239 Params:
240 c = code point to check
241
242 Returns:
243 $(D true) iff $(D c) is a valid Unicode code point
244
245 Note:
246 $(D '\uFFFE') and $(D '\uFFFF') are considered valid by $(D isValidDchar),
247 as they are permitted for internal use by an application, but they are
248 not allowed for interchange by the Unicode standard.
249 +/
bool isValidDchar(dchar c) pure nothrow @safe @nogc
{
    // Anything above the last Unicode code point is invalid.
    if (c > 0x10FFFF)
        return false;

    // Within range, only the surrogate block 0xD800 .. 0xDFFF is excluded.
    immutable isSurrogate = 0xD800 <= c && c <= 0xDFFF;
    return !isSurrogate;
}
254
pure nothrow @safe @nogc unittest
{
    import std.exception;

    assertCTFEable!(
    {
        // Ordinary character, and a value far beyond the Unicode range.
        assert( isValidDchar(cast(dchar)'a'));
        assert(!isValidDchar(cast(dchar) 0x1FFFFF));

        // Both ends of both surrogate halves are rejected.
        assert(!isValidDchar(cast(dchar) 0x00D800));
        assert(!isValidDchar(cast(dchar) 0x00DBFF));
        assert(!isValidDchar(cast(dchar) 0x00DC00));
        assert(!isValidDchar(cast(dchar) 0x00DFFF));

        // Noncharacters and the top of the range are accepted.
        assert( isValidDchar(cast(dchar) 0x00FFFE));
        assert( isValidDchar(cast(dchar) 0x00FFFF));
        assert( isValidDchar(cast(dchar) 0x01FFFF));
        assert( isValidDchar(cast(dchar) 0x10FFFF));

        // First value past the range.
        assert(!isValidDchar(cast(dchar) 0x110000));
    });
}
275
276
277 /++
278 Calculate the length of the UTF sequence starting at $(D index)
279 in $(D str).
280
281 Params:
282 str = input range of UTF code units. Must be random access if
283 $(D index) is passed
284 index = starting index of UTF sequence (default: $(D 0))
285
286 Returns:
287 The number of code units in the UTF sequence. For UTF-8, this is a
288 value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
289 For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.
290
291 Throws:
292 May throw a $(D UTFException) if $(D str[index]) is not the start of a
293 valid UTF sequence.
294
295 Note:
296 $(D stride) will only analyze the first $(D str[index]) element. It
297 will not fully verify the validity of the UTF sequence, nor even verify
298 the presence of the sequence: it will not actually guarantee that
299 $(D index + stride(str, index) <= str.length).
300 +/
uint stride(S)(auto ref S str, size_t index)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && is(Unqual!(ElementType!S) == char)))
{
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-8 sequence");

    immutable lead = str[index];

    // ASCII is a single code unit; any other lead byte is classified
    // (and validated) by strideImpl.
    return lead < 0x80 ? 1 : strideImpl(lead, index);
}
314
315 /// Ditto
uint stride(S)(auto ref S str)
if (is(S : const char[]) ||
    (isInputRange!S && is(Unqual!(ElementType!S) == char)))
{
    // Arrays are indexed directly; other ranges are read through front.
    static if (is(S : const char[]))
        immutable lead = str[0];
    else
        immutable lead = str.front;

    // ASCII is a single code unit; any other lead byte goes to strideImpl,
    // reported at index 0.
    return lead < 0x80 ? 1 : strideImpl(lead, 0);
}
330
// Computes the sequence length announced by the non-ASCII lead byte `c`.
// `index` is only used for the exception message.
// Throws UTFException unless `c` announces a length of 2, 3 or 4.
private uint strideImpl(char c, size_t index) @trusted pure
in { assert(c & 0x80); }
body
{
    import core.bitop : bsr;

    // The count of leading 1 bits equals 7 minus the position of the
    // highest 0 bit, i.e. the highest set bit of the inverted byte.
    immutable inverted = (~uint(c)) & 0xFF;
    immutable leadingOnes = 7 - bsr(inverted);

    immutable acceptable = c != 0xFF && leadingOnes >= 2 && leadingOnes <= 4;
    if (!acceptable)
        throw new UTFException("Invalid UTF-8 sequence", index);

    return leadingOnes;
}
341
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;

    // Checks that every UTF-8 stride overload reports the code length of
    // `c` for the sequence starting at code unit `i` of `s`.
    static void test(string s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        immutable expected = codeLength!char(c);

        // Throws an AssertError attributed to the caller's line when `ok`
        // is false; the message is only formatted on failure.
        void require(bool ok, string fmt)
        {
            if (!ok)
                throw new AssertError(format(fmt, s), __FILE__, line);
        }

        require(stride(s, i) == expected, "Unit test failure string: %s");
        require(stride(RandomCU!char(s), i) == expected, "Unit test failure range: %s");

        // Reference ranges must not be consumed by stride.
        auto refRandom = new RefRandomCU!char(s);
        immutable randLen = refRandom.length;
        require(stride(refRandom, i) == expected, "Unit test failure rand ref range: %s");
        require(refRandom.length == randLen, "Unit test failure rand ref range length: %s");

        // The index-free overloads only apply to the first sequence.
        if (i != 0)
            return;

        require(stride(s) == expected, "Unit test failure string 0: %s");
        require(stride(InputCU!char(s)) == expected, "Unit test failure range 0: %s");

        auto refBidir = new RefBidirCU!char(s);
        immutable bidirLen = refBidir.length;
        require(stride(refBidir) == expected, "Unit test failure bidir ref range code length: %s");
        require(refBidir.length == bidirLen, "Unit test failure bidir ref range length: %s");
    }

    assertCTFEable!(
    {
        test("a", 'a');
        test(" ", ' ');
        test("\u2029", '\u2029'); //paraSep
        test("\u0100", '\u0100');
        test("\u0430", '\u0430');
        test("\U00010143", '\U00010143');
        test("abcdefcdef", 'a');

        // Mixed 1-, 2- and 4-unit sequences, probed at each start index.
        enum string mixed = "hello\U00010143\u0100\U00010143";
        test(mixed, 'h', 0);
        test(mixed, 'e', 1);
        test(mixed, 'l', 2);
        test(mixed, 'l', 3);
        test(mixed, 'o', 4);
        test(mixed, '\U00010143', 5);
        test(mixed, '\u0100', 9);
        test(mixed, '\U00010143', 11);

        foreach (S; AliasSeq!(char[], const char[], string))
        {
            enum str = to!S("hello world");
            static assert(isSafe!({ stride(str, 0); }));
            static assert(isSafe!({ stride(str);    }));
            static assert((functionAttributes!({ stride(str, 0); }) & FunctionAttribute.pure_) != 0);
            static assert((functionAttributes!({ stride(str);    }) & FunctionAttribute.pure_) != 0);
        }
    });
}
408
@safe unittest // invalid start bytes
{
    import std.exception : assertThrown;

    // None of these bytes may begin a UTF-8 sequence.
    immutable char[] rejected = [
        0b1111_1000, // indicating a sequence length of 5
        0b1111_1100, // 6
        0b1111_1110, // 7
        0b1111_1111, // 8
        0b1000_0000, // continuation byte
    ];

    foreach (idx; 0 .. rejected.length)
        assertThrown!UTFException(stride([rejected[idx]]));
}
422
423 /// Ditto
uint stride(S)(auto ref S str, size_t index)
if (is(S : const wchar[]) ||
    (isRandomAccessRange!S && is(Unqual!(ElementType!S) == wchar)))
{
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-16 sequence");

    immutable uint unit = str[index];

    // A unit in 0xD800 .. 0xDBFF announces a two-unit sequence.
    if (0xD800 <= unit && unit <= 0xDBFF)
        return 2;
    return 1;
}
433
434 /// Ditto
uint stride(S)(auto ref S str) @safe pure
if (is(S : const wchar[]))
{
    // Arrays are handled by the indexed overload, starting at the first unit.
    enum size_t firstUnit = 0;
    return stride(str, firstUnit);
}
440
441 /// Ditto
uint stride(S)(auto ref S str)
if (isInputRange!S && is(Unqual!(ElementType!S) == wchar))
{
    assert(!str.empty, "UTF-16 sequence is empty");

    immutable uint unit = str.front;

    // A unit in 0xD800 .. 0xDBFF announces a two-unit sequence.
    if (0xD800 <= unit && unit <= 0xDBFF)
        return 2;
    return 1;
}
449
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;

    // Checks that every UTF-16 stride overload reports the code length of
    // `c` for the sequence starting at code unit `i` of `s`.
    static void test(wstring s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        immutable expected = codeLength!wchar(c);

        // Throws an AssertError attributed to the caller's line when `ok`
        // is false; the message is only formatted on failure.
        void require(bool ok, string fmt)
        {
            if (!ok)
                throw new AssertError(format(fmt, s), __FILE__, line);
        }

        require(stride(s, i) == expected, "Unit test failure string: %s");
        require(stride(RandomCU!wchar(s), i) == expected, "Unit test failure range: %s");

        // Reference ranges must not be consumed by stride.
        auto refRandom = new RefRandomCU!wchar(s);
        immutable randLen = refRandom.length;
        require(stride(refRandom, i) == expected, "Unit test failure rand ref range: %s");
        require(refRandom.length == randLen, "Unit test failure rand ref range length: %s");

        // The index-free overloads only apply to the first sequence.
        if (i != 0)
            return;

        require(stride(s) == expected, "Unit test failure string 0: %s");
        require(stride(InputCU!wchar(s)) == expected, "Unit test failure range 0: %s");

        auto refBidir = new RefBidirCU!wchar(s);
        immutable bidirLen = refBidir.length;
        require(stride(refBidir) == expected, "Unit test failure bidir ref range code length: %s");
        require(refBidir.length == bidirLen, "Unit test failure bidir ref range length: %s");
    }

    assertCTFEable!(
    {
        test("a", 'a');
        test(" ", ' ');
        test("\u2029", '\u2029'); //paraSep
        test("\u0100", '\u0100');
        test("\u0430", '\u0430');
        test("\U00010143", '\U00010143');
        test("abcdefcdef", 'a');

        // Mixed single units and surrogate pairs, probed at each start index.
        enum wstring mixed = "hello\U00010143\u0100\U00010143";
        test(mixed, 'h', 0);
        test(mixed, 'e', 1);
        test(mixed, 'l', 2);
        test(mixed, 'l', 3);
        test(mixed, 'o', 4);
        test(mixed, '\U00010143', 5);
        test(mixed, '\u0100', 7);
        test(mixed, '\U00010143', 8);

        foreach (S; AliasSeq!(wchar[], const wchar[], wstring))
        {
            enum str = to!S("hello world");
            static assert(isSafe!(() => stride(str, 0)));
            static assert(isSafe!(() => stride(str)   ));
            static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
            static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
        }
    });
}
516
517 /// Ditto
uint stride(S)(auto ref S str, size_t index = 0)
if (is(S : const dchar[]) ||
    (isInputRange!S && is(Unqual!(ElementEncodingType!S) == dchar)))
{
    // Every UTF-32 code point is exactly one code unit.
    enum uint unitsPerCodePoint = 1;

    // With a length the index is bounds-checked; otherwise only emptiness
    // can be checked.
    enum bool knowsLength = is(typeof(str.length) : ulong);
    static if (knowsLength)
        assert(index < str.length, "Past the end of the UTF-32 sequence");
    else
        assert(!str.empty, "UTF-32 sequence is empty.");

    return unitsPerCodePoint;
}
528
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;

    // Checks that every UTF-32 stride overload reports the code length of
    // `c` for the sequence starting at code unit `i` of `s`.
    static void test(dstring s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        immutable expected = codeLength!dchar(c);

        // Throws an AssertError attributed to the caller's line when `ok`
        // is false; the message is only formatted on failure.
        void require(bool ok, string fmt)
        {
            if (!ok)
                throw new AssertError(format(fmt, s), __FILE__, line);
        }

        require(stride(s, i) == expected, "Unit test failure string: %s");
        require(stride(RandomCU!dchar(s), i) == expected, "Unit test failure range: %s");

        // Reference ranges must not be consumed by stride.
        auto refRandom = new RefRandomCU!dchar(s);
        immutable randLen = refRandom.length;
        require(stride(refRandom, i) == expected, "Unit test failure rand ref range: %s");
        require(refRandom.length == randLen, "Unit test failure rand ref range length: %s");

        // The index-free overloads only apply to the first sequence.
        if (i != 0)
            return;

        require(stride(s) == expected, "Unit test failure string 0: %s");
        require(stride(InputCU!dchar(s)) == expected, "Unit test failure range 0: %s");

        auto refBidir = new RefBidirCU!dchar(s);
        immutable bidirLen = refBidir.length;
        require(stride(refBidir) == expected, "Unit test failure bidir ref range code length: %s");
        require(refBidir.length == bidirLen, "Unit test failure bidir ref range length: %s");
    }

    assertCTFEable!(
    {
        test("a", 'a');
        test(" ", ' ');
        test("\u2029", '\u2029'); //paraSep
        test("\u0100", '\u0100');
        test("\u0430", '\u0430');
        test("\U00010143", '\U00010143');
        test("abcdefcdef", 'a');

        // One code unit per code point, probed at each index.
        enum dstring mixed = "hello\U00010143\u0100\U00010143";
        test(mixed, 'h', 0);
        test(mixed, 'e', 1);
        test(mixed, 'l', 2);
        test(mixed, 'l', 3);
        test(mixed, 'o', 4);
        test(mixed, '\U00010143', 5);
        test(mixed, '\u0100', 6);
        test(mixed, '\U00010143', 7);

        foreach (S; AliasSeq!(dchar[], const dchar[], dstring))
        {
            enum str = to!S("hello world");
            static assert(isSafe!(() => stride(str, 0)));
            static assert(isSafe!(() => stride(str)   ));
            static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
            static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
        }
    });
}
595
596 /++
597 Calculate the length of the UTF sequence ending one code unit before
598 $(D index) in $(D str).
599
600 Params:
601 str = bidirectional range of UTF code units. Must be random access if
602 $(D index) is passed
603 index = index one past end of UTF sequence (default: $(D str.length))
604
605 Returns:
606 The number of code units in the UTF sequence. For UTF-8, this is a
607 value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
608 For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.
609
610 Throws:
611 May throw a $(D UTFException) if $(D str[index]) is not one past the
612 end of a valid UTF sequence.
613
614 Note:
    $(D strideBack) will only analyze the element at $(D str[index - 1]).
    It will not fully verify the validity of the UTF sequence, nor
617 even verify the presence of the sequence: it will not actually
618 guarantee that $(D strideBack(str, index) <= index).
619 +/
uint strideBack(S)(auto ref S str, size_t index)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && is(Unqual!(ElementType!S) == char)))
{
    // A continuation byte has the bit pattern 10xx_xxxx; the first byte
    // found walking backwards that does not match starts the sequence.
    enum topTwoBits = 0b1100_0000;
    enum continuationTag = 0b1000_0000;

    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-8 sequence");
    assert(index > 0, "Not the end of the UTF-8 sequence");

    if ((str[index-1] & topTwoBits) != continuationTag)
        return 1;

    if (index < 4)
    {
        // Near the start of the string each step back is bounds-checked.
        foreach (i; AliasSeq!(2, 3))
        {
            if (index >= i && (str[index-i] & topTwoBits) != continuationTag)
                return i;
        }
    }
    else //single verification for most common case
    {
        foreach (i; AliasSeq!(2, 3, 4))
        {
            if ((str[index-i] & topTwoBits) != continuationTag)
                return i;
        }
    }

    throw new UTFException("Not the end of the UTF sequence", index);
}
649
650 /// Ditto
uint strideBack(S)(auto ref S str)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && hasLength!S && is(Unqual!(ElementType!S) == char)))
{
    // The last sequence ends one past the final code unit.
    immutable pastEnd = str.length;
    return strideBack(str, pastEnd);
}
657
658 /// Ditto
uint strideBack(S)(auto ref S str)
if (isBidirectionalRange!S && is(Unqual!(ElementType!S) == char) && !isRandomAccessRange!S)
{
    assert(!str.empty, "Past the end of the UTF-8 sequence");

    // Walk a saved copy backwards so the caller's range is left untouched.
    auto cursor = str.save;
    for (uint units = 1; units <= 4; ++units)
    {
        // The first byte that is not a continuation byte (10xx_xxxx)
        // is the start of the sequence.
        immutable isContinuation = (cursor.back & 0b1100_0000) == 0b1000_0000;
        if (!isContinuation)
            return units;

        cursor.popBack();
        if (cursor.empty)
            break;
    }

    throw new UTFException("The last code unit is not the end of the UTF-8 sequence");
}
674
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;

    // Checks that every UTF-8 strideBack overload reports the code length
    // of `c` for the sequence ending just before code unit `i` of `s`
    // (size_t.max means "at the end of `s`").
    static void test(string s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        immutable expected = codeLength!char(c);
        immutable atEnd = i == size_t.max;
        immutable pos = atEnd ? s.length : i;

        // Throws an AssertError attributed to the caller's line when `ok`
        // is false; the message is only formatted on failure.
        void require(bool ok, string fmt)
        {
            if (!ok)
                throw new AssertError(format(fmt, s), __FILE__, line);
        }

        require(strideBack(s, pos) == expected, "Unit test failure string: %s");
        require(strideBack(RandomCU!char(s), pos) == expected, "Unit test failure range: %s");

        // Reference ranges must not be consumed by strideBack.
        auto refRandom = new RefRandomCU!char(s);
        immutable randLen = refRandom.length;
        require(strideBack(refRandom, pos) == expected, "Unit test failure rand ref range: %s");
        require(refRandom.length == randLen, "Unit test failure rand ref range length: %s");

        // The index-free overloads only apply to the last sequence.
        if (!atEnd)
            return;

        require(strideBack(s) == expected, "Unit test failure string code length: %s");
        require(strideBack(BidirCU!char(s)) == expected, "Unit test failure range code length: %s");

        auto refBidir = new RefBidirCU!char(s);
        immutable bidirLen = refBidir.length;
        require(strideBack(refBidir) == expected, "Unit test failure bidir ref range code length: %s");
        require(refBidir.length == bidirLen, "Unit test failure bidir ref range length: %s");
    }

    assertCTFEable!(
    {
        test("a", 'a');
        test(" ", ' ');
        test("\u2029", '\u2029'); //paraSep
        test("\u0100", '\u0100');
        test("\u0430", '\u0430');
        test("\U00010143", '\U00010143');
        test("abcdefcdef", 'f');

        // Mixed 4-, 2- and 1-unit sequences, probed at each end index.
        enum string mixed = "\U00010143\u0100\U00010143hello";
        test(mixed, 'o', 15);
        test(mixed, 'l', 14);
        test(mixed, 'l', 13);
        test(mixed, 'e', 12);
        test(mixed, 'h', 11);
        test(mixed, '\U00010143', 10);
        test(mixed, '\u0100', 6);
        test(mixed, '\U00010143', 4);

        foreach (S; AliasSeq!(char[], const char[], string))
        {
            enum str = to!S("hello world");
            static assert(isSafe!({ strideBack(str, 0); }));
            static assert(isSafe!({ strideBack(str);    }));
            static assert((functionAttributes!({ strideBack(str, 0); }) & FunctionAttribute.pure_) != 0);
            static assert((functionAttributes!({ strideBack(str);    }) & FunctionAttribute.pure_) != 0);
        }
    });
}
741
742 //UTF-16 is self synchronizing: The length of strideBack can be found from
743 //the value of a single wchar
744 /// Ditto
uint strideBack(S)(auto ref S str, size_t index)
if (is(S : const wchar[]) ||
    (isRandomAccessRange!S && is(Unqual!(ElementType!S) == wchar)))
{
    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-16 sequence");
    assert(index > 0, "Not the end of a UTF-16 sequence");

    immutable last = str[index-1];

    // A unit in 0xDC00 .. 0xDFFF is the second half of a two-unit sequence.
    if (0xDC00 <= last && last < 0xE000)
        return 2;
    return 1;
}
756
757 /// Ditto
uint strideBack(S)(auto ref S str)
if (is(S : const wchar[]) ||
    (isBidirectionalRange!S && is(Unqual!(ElementType!S) == wchar)))
{
    assert(!str.empty, "UTF-16 sequence is empty");

    // Arrays are indexed directly; other ranges are read through back.
    static if (is(S : const(wchar)[]))
        immutable c2 = str[$ - 1];
    else
        immutable c2 = str.back;

    // Only a unit in 0xDC00 .. 0xDFFF is the second half of a two-unit
    // sequence. The upper bound is exclusive: 0xE000 is an ordinary
    // single-unit code point, matching the indexed overload.
    return 1 + (0xDC00 <= c2 && c2 < 0xE000);
}
771
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;

    // Checks that every UTF-16 strideBack overload reports the code length
    // of `c` for the sequence ending just before code unit `i` of `s`
    // (size_t.max means "at the end of `s`").
    static void test(wstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        immutable expected = codeLength!wchar(c);
        immutable atEnd = i == size_t.max;
        immutable pos = atEnd ? s.length : i;

        // Throws an AssertError attributed to the caller's line when `ok`
        // is false; the message is only formatted on failure.
        void require(bool ok, string fmt)
        {
            if (!ok)
                throw new AssertError(format(fmt, s), __FILE__, line);
        }

        require(strideBack(s, pos) == expected, "Unit test failure string: %s");
        require(strideBack(RandomCU!wchar(s), pos) == expected, "Unit test failure range: %s");

        // Reference ranges must not be consumed by strideBack.
        auto refRandom = new RefRandomCU!wchar(s);
        immutable randLen = refRandom.length;
        require(strideBack(refRandom, pos) == expected, "Unit test failure rand ref range: %s");
        require(refRandom.length == randLen, "Unit test failure rand ref range length: %s");

        // The index-free overloads only apply to the last sequence.
        if (!atEnd)
            return;

        require(strideBack(s) == expected, "Unit test failure string code length: %s");
        require(strideBack(BidirCU!wchar(s)) == expected, "Unit test failure range code length: %s");

        auto refBidir = new RefBidirCU!wchar(s);
        immutable bidirLen = refBidir.length;
        require(strideBack(refBidir) == expected, "Unit test failure bidir ref range code length: %s");
        require(refBidir.length == bidirLen, "Unit test failure bidir ref range length: %s");
    }

    assertCTFEable!(
    {
        test("a", 'a');
        test(" ", ' ');
        test("\u2029", '\u2029'); //paraSep
        test("\u0100", '\u0100');
        test("\u0430", '\u0430');
        test("\U00010143", '\U00010143');
        test("abcdefcdef", 'f');

        // Mixed surrogate pairs and single units, probed at each end index.
        enum wstring mixed = "\U00010143\u0100\U00010143hello";
        test(mixed, 'o', 10);
        test(mixed, 'l', 9);
        test(mixed, 'l', 8);
        test(mixed, 'e', 7);
        test(mixed, 'h', 6);
        test(mixed, '\U00010143', 5);
        test(mixed, '\u0100', 3);
        test(mixed, '\U00010143', 2);

        foreach (S; AliasSeq!(wchar[], const wchar[], wstring))
        {
            enum str = to!S("hello world");
            static assert(isSafe!(() => strideBack(str, 0)));
            static assert(isSafe!(() => strideBack(str)   ));
            static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0);
            static assert((functionAttributes!(() => strideBack(str)   ) & FunctionAttribute.pure_) != 0);
        }
    });
}
838
839 /// Ditto
uint strideBack(S)(auto ref S str, size_t index)
if (isRandomAccessRange!S && is(Unqual!(ElementEncodingType!S) == dchar))
{
    // Every UTF-32 code point is exactly one code unit.
    enum uint unitsPerCodePoint = 1;

    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-32 sequence");
    assert(index > 0, "Not the end of the UTF-32 sequence");

    return unitsPerCodePoint;
}
848
849 /// Ditto
uint strideBack(S)(auto ref S str)
if (isBidirectionalRange!S && is(Unqual!(ElementEncodingType!S) == dchar))
{
    // Every UTF-32 code point is exactly one code unit.
    enum uint unitsPerCodePoint = 1;

    assert(!str.empty, "Empty UTF-32 sequence");
    return unitsPerCodePoint;
}
856
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;

    // Checks that every UTF-32 strideBack overload reports the code length
    // of `c` for the sequence ending just before code unit `i` of `s`
    // (size_t.max means "at the end of `s`").
    static void test(dstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        immutable expected = codeLength!dchar(c);
        immutable atEnd = i == size_t.max;
        immutable pos = atEnd ? s.length : i;

        // Throws an AssertError attributed to the caller's line when `ok`
        // is false; the message is only formatted on failure.
        void require(bool ok, string fmt)
        {
            if (!ok)
                throw new AssertError(format(fmt, s), __FILE__, line);
        }

        require(strideBack(s, pos) == expected, "Unit test failure string: %s");
        require(strideBack(RandomCU!dchar(s), pos) == expected, "Unit test failure range: %s");

        // Reference ranges must not be consumed by strideBack.
        auto refRandom = new RefRandomCU!dchar(s);
        immutable randLen = refRandom.length;
        require(strideBack(refRandom, pos) == expected, "Unit test failure rand ref range: %s");
        require(refRandom.length == randLen, "Unit test failure rand ref range length: %s");

        // The index-free overloads only apply to the last sequence.
        if (!atEnd)
            return;

        require(strideBack(s) == expected, "Unit test failure string code length: %s");
        require(strideBack(BidirCU!dchar(s)) == expected, "Unit test failure range code length: %s");

        auto refBidir = new RefBidirCU!dchar(s);
        immutable bidirLen = refBidir.length;
        require(strideBack(refBidir) == expected, "Unit test failure bidir ref range code length: %s");
        require(refBidir.length == bidirLen, "Unit test failure bidir ref range length: %s");
    }

    assertCTFEable!(
    {
        test("a", 'a');
        test(" ", ' ');
        test("\u2029", '\u2029'); //paraSep
        test("\u0100", '\u0100');
        test("\u0430", '\u0430');
        test("\U00010143", '\U00010143');
        test("abcdefcdef", 'f');

        // One code unit per code point, probed at each end index.
        enum dstring mixed = "\U00010143\u0100\U00010143hello";
        test(mixed, 'o', 8);
        test(mixed, 'l', 7);
        test(mixed, 'l', 6);
        test(mixed, 'e', 5);
        test(mixed, 'h', 4);
        test(mixed, '\U00010143', 3);
        test(mixed, '\u0100', 2);
        test(mixed, '\U00010143', 1);

        foreach (S; AliasSeq!(dchar[], const dchar[], dstring))
        {
            enum str = to!S("hello world");
            static assert(isSafe!(() => strideBack(str, 0)));
            static assert(isSafe!(() => strideBack(str)   ));
            static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0);
            static assert((functionAttributes!(() => strideBack(str)   ) & FunctionAttribute.pure_) != 0);
        }
    });
}
923
924
925 /++
926 Given $(D index) into $(D str) and assuming that $(D index) is at the start
927 of a UTF sequence, $(D toUCSindex) determines the number of UCS characters
928 up to $(D index). So, $(D index) is the index of a code unit at the
929 beginning of a code point, and the return value is how many code points into
930 the string that that code point is.
931 +/
size_t toUCSindex(C)(const(C)[] str, size_t index) @safe pure
if (isSomeChar!C)
{
    static if (is(Unqual!C == dchar))
    {
        // UTF-32: code unit index and code point index coincide.
        return index;
    }
    else
    {
        size_t codePoints = 0;
        size_t unit = 0;

        // Hop from sequence start to sequence start, counting each hop.
        while (unit < index)
        {
            unit += stride(str, unit);
            ++codePoints;
        }

        // Overshooting means `index` pointed into the middle of a sequence.
        if (unit > index)
        {
            static if (is(Unqual!C == char))
                enum reason = "Invalid UTF-8 sequence";
            else
                enum reason = "Invalid UTF-16 sequence";

            throw new UTFException(reason, index);
        }

        return codePoints;
    }
}
956
///
@safe unittest
{
    // ASCII only: one code unit per code point in every encoding.
    assert(toUCSindex(`hello world`, 7) == 7);
    assert(toUCSindex(`hello world`w, 7) == 7);
    assert(toUCSindex(`hello world`d, 7) == 7);

    // In UTF-8 code unit 7 is only the 6th code point; UTF-16 and UTF-32
    // indices are unchanged.
    assert(toUCSindex(`Ma Chérie`, 7) == 6);
    assert(toUCSindex(`Ma Chérie`w, 7) == 7);
    assert(toUCSindex(`Ma Chérie`d, 7) == 7);

    // In UTF-8 code unit 9 is only the 3rd code point here.
    assert(toUCSindex(`さいごの果実 / ミツバチと科学者`, 9) == 3);
    assert(toUCSindex(`さいごの果実 / ミツバチと科学者`w, 9) == 9);
    assert(toUCSindex(`さいごの果実 / ミツバチと科学者`d, 9) == 9);
}
972
973
/++
    Converts a code point index into a code unit index.

    $(D n) says how many code points into $(D str) the wanted code point is;
    the returned value is the array index of that code point's first code
    unit.
  +/
size_t toUTFindex(C)(const(C)[] str, size_t n) @safe pure
if (isSomeChar!C)
{
    static if (is(Unqual!C == dchar))
        return n; // UTF-32: the two kinds of index coincide.
    else
    {
        size_t offset = 0;

        // Skip over exactly n code points, one stride at a time.
        for (size_t remaining = n; remaining != 0; --remaining)
            offset += stride(str, offset);

        return offset;
    }
}
996
///
@safe unittest
{
    // ASCII only: code point and code unit indices are the same everywhere.
    assert(`hello world`.toUTFindex(7) == 7);
    assert(`hello world`w.toUTFindex(7) == 7);
    assert(`hello world`d.toUTFindex(7) == 7);

    // 'é' takes two code units in UTF-8 but one in UTF-16 and UTF-32.
    assert(`Ma Chérie`.toUTFindex(6) == 7);
    assert(`Ma Chérie`w.toUTFindex(7) == 7);
    assert(`Ma Chérie`d.toUTFindex(7) == 7);

    // Each of these characters takes three code units in UTF-8.
    assert(`さいごの果実 / ミツバチと科学者`.toUTFindex(3) == 9);
    assert(`さいごの果実 / ミツバチと科学者`w.toUTFindex(9) == 9);
    assert(`さいごの果実 / ミツバチと科学者`d.toUTFindex(9) == 9);
}
1012
1013
1014 /* =================== Decode ======================= */
1015
/++
    Whether or not to replace invalid UTF with $(LREF replacementDchar).

    Passed as a template argument to the decoding functions, either as
    $(D Yes.useReplacementDchar) or as $(D No.useReplacementDchar).
  +/
alias UseReplacementDchar = Flag!"useReplacementDchar";
1018
/++
    Decodes the code point that begins at $(D str[index]) and returns it.
    On success $(D index) is moved to one past the decoded code point. If the
    code point is not well-formed, a $(D UTFException) is thrown and
    $(D index) remains unchanged.

    decode only accepts strings and random access ranges of code units that
    have length and slicing, whereas $(LREF decodeFront) accepts any input
    range of code units.

    Params:
        useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
        str = input string or indexable Range
        index = starting index into $(D str); incremented by number of code units processed

    Returns:
        decoded character

    Throws:
        $(LREF UTFException) if $(D str[index]) is not the start of a valid UTF
        sequence and useReplacementDchar is $(D No.useReplacementDchar)
  +/
dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(auto ref S str, ref size_t index)
if (!isSomeString!S &&
    isRandomAccessRange!S && hasSlicing!S && hasLength!S && isSomeChar!(ElementType!S))
in
{
    assert(index < str.length, "Attempted to decode past the end of a string");
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    // Code units at or above the limit need the full decoder.
    if (str[index] >= codeUnitLimit!S)
        return decodeImpl!(true, useReplacementDchar)(str, index);

    // Fast path: the code unit is a complete code point by itself.
    return str[index++];
}
1059
// String overload of decode; same contract as the range overload.
dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    auto ref S str, ref size_t index) @trusted pure
if (isSomeString!S)
in
{
    assert(index < str.length, "Attempted to decode past the end of a string");
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    // Code units at or above the limit need the full decoder.
    if (str[index] >= codeUnitLimit!S)
        return decodeImpl!(true, useReplacementDchar)(str, index);

    // Fast path: the code unit is a complete code point by itself.
    return str[index++];
}
1078
/++
    $(D decodeFront) is the variant of $(LREF decode) that decodes the first
    code point of a range. Unlike $(LREF decode), it accepts any input range
    of code units (not only strings and random access ranges). The range is
    taken by $(D ref) and the decoded code units are popped off it. When
    $(D numCodeUnits) is passed in, it is set to the number of code units
    that made up the decoded code point.

    Params:
        useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
        str = input string or input range of code units
        numCodeUnits = set to number of code units processed

    Returns:
        decoded character

    Throws:
        $(LREF UTFException) if $(D str.front) is not the start of a valid UTF
        sequence. If an exception is thrown, then there is no guarantee as to
        the number of code units which were popped off, as it depends on the
        type of range being used and how many code units had to be popped off
        before the code point was determined to be invalid.
  +/
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    ref S str, out size_t numCodeUnits)
if (!isSomeString!S && isInputRange!S && isSomeChar!(ElementType!S))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    immutable first = str.front;

    // Fast path: the first code unit is a complete code point by itself.
    if (first < codeUnitLimit!S)
    {
        str.popFront();
        numCodeUnits = 1;
        return first;
    }

    //@@@BUG@@@ 14447 forces canIndex to be done outside of decodeImpl, which
    //is undesirable, since not all overloads of decodeImpl need it. So, it
    //should be moved back into decodeImpl once bug# 8521 has been fixed.
    enum canIndex = isRandomAccessRange!S && hasSlicing!S && hasLength!S;
    immutable decoded = decodeImpl!(canIndex, useReplacementDchar)(str, numCodeUnits);

    // Indexable ranges are only read by decodeImpl, so they are advanced
    // here; the other range types were already popped by decodeImpl.
    static if (canIndex)
        str = str[numCodeUnits .. str.length];

    return decoded;
}
1138
// String overload of decodeFront; same contract as the range overload.
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    ref S str, out size_t numCodeUnits) @trusted pure
if (isSomeString!S)
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    // Code units at or above the limit need the full decoder, which
    // reports the number of code units it read through numCodeUnits.
    if (str[0] >= codeUnitLimit!S)
    {
        immutable decoded = decodeImpl!(true, useReplacementDchar)(str, numCodeUnits);
        str = str[numCodeUnits .. $];
        return decoded;
    }

    // Fast path: the first code unit is a complete code point by itself.
    numCodeUnits = 1;
    immutable first = str[0];
    str = str[1 .. $];
    return first;
}
1166
/++ Ditto +/
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str)
if (isInputRange!S && isSomeChar!(ElementType!S))
{
    // The caller is not interested in the count, so it goes to a throwaway.
    size_t discardedCount;
    return decodeFront!useReplacementDchar(str, discardedCount);
}
1174
/++
    $(D decodeBack) is the variant of $(LREF decode) that decodes the last
    code point of a range. Unlike $(LREF decode), it accepts any
    bidirectional range of code units (not only strings and random access
    ranges). The range is taken by $(D ref) and the decoded code units are
    popped off its back. When $(D numCodeUnits) is passed in, it is set to
    the number of code units that made up the decoded code point.

    Params:
        useReplacementDchar = if invalid UTF, return `replacementDchar` rather than throwing
        str = input string or bidirectional Range
        numCodeUnits = gives the number of code units processed

    Returns:
        A decoded UTF character.

    Throws:
        $(LREF UTFException) if $(D str.back) is not the end of a valid UTF
        sequence. If an exception is thrown, the $(D str) itself remains unchanged,
        but there is no guarantee as to the value of $(D numCodeUnits) (when passed).
  +/
dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    ref S str, out size_t numCodeUnits)
if (isSomeString!S)
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    // Fast path: the last code unit is a complete code point by itself.
    if (str[$ - 1] < codeUnitLimit!S)
    {
        numCodeUnits = 1;
        immutable last = str[$ - 1];
        str = str[0 .. $ - 1];
        return last;
    }

    // Find where the last code point starts, then decode forwards from there.
    numCodeUnits = strideBack(str);
    immutable keptLength = str.length - numCodeUnits;
    size_t start = keptLength;
    immutable decoded = decodeImpl!(true, useReplacementDchar)(str, start);

    // str is only shortened once decoding has succeeded.
    str = str[0 .. keptLength];
    return decoded;
}
1226
/++ Ditto +/
dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    ref S str, out size_t numCodeUnits)
if (!isSomeString!S && isSomeChar!(ElementType!S) && isBidirectionalRange!S
    && ((isRandomAccessRange!S && hasLength!S) || !isRandomAccessRange!S))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    // Fast path: the last code unit is a complete code point by itself.
    if (str.back < codeUnitLimit!S)
    {
        numCodeUnits = 1;
        immutable last = str.back;
        str.popBack();
        return last;
    }

    numCodeUnits = strideBack(str);

    static if (isRandomAccessRange!S)
    {
        // Decode in place, starting at the first code unit of the last
        // code point, and pop only after decoding has succeeded.
        size_t start = str.length - numCodeUnits;
        immutable decoded = decodeImpl!(true, useReplacementDchar)(str, start);
        str.popBackExactly(numCodeUnits);
        return decoded;
    }
    else
    {
        // No indexing available: copy the trailing code units, in their
        // original order, into a small buffer and decode that instead.
        alias Char = Unqual!(ElementType!S);
        Char[4] buffer;
        S rest = str.save;

        foreach_reverse (slot; 0 .. numCodeUnits)
        {
            buffer[slot] = rest.back;
            rest.popBack();
        }

        const Char[] codePoint = buffer[0 .. numCodeUnits];
        size_t start = 0;
        immutable decoded = decodeImpl!(true, useReplacementDchar)(codePoint, start);

        // Commit the shortened range only after decoding has succeeded.
        str = rest;
        return decoded;
    }
}
1277
/++ Ditto +/
dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str)
if (isSomeString!S
    || (isRandomAccessRange!S && hasLength!S && isSomeChar!(ElementType!S))
    || (!isRandomAccessRange!S && isBidirectionalRange!S && isSomeChar!(ElementType!S)))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    // The caller is not interested in the count, so it goes to a throwaway.
    size_t discardedCount;
    return decodeBack!useReplacementDchar(str, discardedCount);
}
1296
// Gives, for the code unit type of the given range type, the exclusive
// limit that the decoding functions compare a code unit against: a code
// unit below it is returned directly as a code point, anything else is
// handed to decodeImpl. The constant has the code unit's own type for
// char and wchar, and dchar otherwise.
package template codeUnitLimit(S)
if (isSomeChar!(ElementEncodingType!S))
{
    alias CodeUnit = Unqual!(ElementEncodingType!S);

    static if (is(CodeUnit == char))
        enum char codeUnitLimit = 0x80;
    else static if (is(CodeUnit == wchar))
        enum wchar codeUnitLimit = 0xD800;
    else
        enum dchar codeUnitLimit = 0xD800;
}
1308
/*
 * UTF-8 overload of decodeImpl: decodes one multi-byte code point.
 *
 * For strings, this function does its own bounds checking to give a
 * more useful error message when attempting to decode past the end of a string.
 * Subsequently it uses a pointer instead of an array to avoid
 * redundant bounds checking.
 *
 * The three overloads of this operate on chars, wchars, and dchars.
 *
 * A first code unit without its two top bits set is treated as invalid
 * here, so single-byte code points have to be handled by the caller
 * (the callers visible in this module check codeUnitLimit first).
 *
 * Params:
 *      canIndex = if S is indexable; when false, str is consumed with
 *                 front/popFront as it is read
 *      useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
 *      str = input string or Range
 *      index = starting index into str; incremented by number of code units processed
 *
 * Returns:
 *      decoded character
 *
 * Throws:
 *      UTFException on invalid or truncated input when useReplacementDchar
 *      is No.useReplacementDchar; index is not modified on that path.
 */
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    auto ref S str, ref size_t index)
if (
    is(S : const char[]) || (isInputRange!S && is(Unqual!(ElementEncodingType!S) == char)))
{
    /* The following encodings are valid, except for the 5 and 6 byte
     * combinations:
     *  0xxxxxxx
     *  110xxxxx 10xxxxxx
     *  1110xxxx 10xxxxxx 10xxxxxx
     *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     */

    /* Dchar bitmask for different numbers of UTF-8 code units:
     * bitMask[k] keeps the payload bits of a sequence of k + 1 code units
     * (7, 11, 16 and 21 bits).
     */
    alias bitMask = AliasSeq!((1 << 7) - 1, (1 << 11) - 1, (1 << 16) - 1, (1 << 21) - 1);

    // pstr is a view of the input that starts at the code point to decode.
    static if (is(S : const char[]))
        auto pstr = str.ptr + index;    // this is what makes decodeImpl() @system code
    else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
        auto pstr = str[index .. str.length];
    else
        alias pstr = str;

    //@@@BUG@@@ 14447 forces this to be done outside of decodeImpl
    //enum canIndex = is(S : const char[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);

    static if (canIndex)
    {
        // Number of code units available from index to the end of str.
        immutable length = str.length - index;
        ubyte fst = pstr[0];
    }
    else
    {
        // Non-indexable input is consumed as it is read.
        ubyte fst = pstr.front;
        pstr.popFront();
    }

    // The exception helpers are only needed (and only compiled) when
    // errors are reported by throwing.
    static if (!useReplacementDchar)
    {
        static if (canIndex)
        {
            // Builds a UTFException carrying the offending code units: the
            // first code unit plus up to three following continuation bytes.
            static UTFException exception(S)(S str, string msg)
            {
                uint[4] sequence = void;
                size_t i;

                do
                {
                    sequence[i] = str[i];
                } while (++i < str.length && i < 4 && (str[i] & 0xC0) == 0x80);

                return new UTFException(msg, i).setSequence(sequence[0 .. i]);
            }
        }

        UTFException invalidUTF()
        {
            static if (canIndex)
               return exception(pstr[0 .. length], "Invalid UTF-8 sequence");
            else
            {
                //We can't include the invalid sequence with input strings without
                //saving each of the code units along the way, and we can't do it with
                //forward ranges without saving the entire range. Both would incur a
                //cost for the decoding of every character just to provide a better
                //error message for the (hopefully) rare case when an invalid UTF-8
                //sequence is encountered, so we don't bother trying to include the
                //invalid sequence here, unlike with strings and sliceable ranges.
                return new UTFException("Invalid UTF-8 sequence");
            }
        }

        UTFException outOfBounds()
        {
            static if (canIndex)
               return exception(pstr[0 .. length], "Attempted to decode past the end of a string");
            else
               return new UTFException("Attempted to decode past the end of a string");
        }
    }

    if ((fst & 0b1100_0000) != 0b1100_0000)
    {
        static if (useReplacementDchar)
        {
            ++index;            // always consume bad input to avoid infinite loops
            return replacementDchar;
        }
        else
            throw invalidUTF(); // starter must have at least 2 first bits set
    }
    ubyte tmp = void;
    dchar d = fst; // upper control bits are masked out later
    // fst is shifted left once per code unit read; its top bit then tells
    // whether the first code unit announced yet another continuation byte.
    fst <<= 1;

    // Unrolled at compile time: i is the position of the continuation byte
    // being read (1, 2 or 3).
    foreach (i; AliasSeq!(1, 2, 3))
    {

        // The input ended before the announced continuation byte.
        static if (canIndex)
        {
            if (i == length)
            {
                static if (useReplacementDchar)
                {
                    index += i;
                    return replacementDchar;
                }
                else
                    throw outOfBounds();
            }
        }
        else
        {
            if (pstr.empty)
            {
                static if (useReplacementDchar)
                {
                    index += i;
                    return replacementDchar;
                }
                else
                    throw outOfBounds();
            }
        }

        static if (canIndex)
            tmp = pstr[i];
        else
        {
            tmp = pstr.front;
            pstr.popFront();
        }

        // Continuation bytes must have the form 10xxxxxx.
        if ((tmp & 0xC0) != 0x80)
        {
            static if (useReplacementDchar)
            {
                index += i + 1;
                return replacementDchar;
            }
            else
                throw invalidUTF();
        }

        // Append the six payload bits of the continuation byte.
        d = (d << 6) | (tmp & 0x3F);
        fst <<= 1;

        if (!(fst & 0x80)) // no more bytes
        {
            d &= bitMask[i]; // mask out control bits

            // overlong, could have been encoded with i bytes
            if ((d & ~bitMask[i - 1]) == 0)
            {
                static if (useReplacementDchar)
                {
                    index += i + 1;
                    return replacementDchar;
                }
                else
                    throw invalidUTF();
            }

            // check for surrogates only needed for 3 bytes
            static if (i == 2)
            {
                if (!isValidDchar(d))
                {
                    static if (useReplacementDchar)
                    {
                        index += i + 1;
                        return replacementDchar;
                    }
                    else
                        throw invalidUTF();
                }
            }

            index += i + 1;
            // Only a 4-byte sequence can encode a value above dchar.max.
            static if (i == 3)
            {
                if (d > dchar.max)
                {
                    static if (useReplacementDchar)
                        d = replacementDchar;
                    else
                        throw invalidUTF();
                }
            }
            return d;
        }
    }

    // The first code unit announced more than three continuation bytes,
    // i.e. one of the 5 and 6 byte forms listed as invalid above.
    static if (useReplacementDchar)
    {
        index += 4;             // read 4 chars by now
        return replacementDchar;
    }
    else
        throw invalidUTF();
}
1530
@safe pure @nogc nothrow
unittest
{
    // Exercises the Yes.useReplacementDchar path of the UTF-8 decodeImpl
    // through a minimal, non-indexable input range.

    static struct CodeUnitSource
    {
        string data;
        size_t pos;

    @safe pure @nogc nothrow:
        this(string data) { this.data = data; }
        @property bool empty() { return pos == data.length; }
        @property char front() { return data[pos]; }
        void popFront() { ++pos; }
    }

    foreach (invalid; invalidUTFstrings!char())
    {
        auto source = CodeUnitSource(invalid);
        size_t consumed;
        immutable dchar decoded = decodeImpl!(false, Yes.useReplacementDchar)(source, consumed);

        // Invalid input yields the replacement character and consumes at
        // least one code unit, but never more than the input holds.
        assert(decoded == replacementDchar);
        assert(consumed >= 1 && consumed <= invalid.length);
    }
}
1556
// UTF-16 overload of decodeImpl. Decodes the code point that starts at
// str[index] (or at str.front for non-indexable ranges, which are consumed
// as they are read) and advances index by the number of code units used.
// Code units below 0xD800 must be handled by the caller.
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)
(auto ref S str, ref size_t index)
if (is(S : const wchar[]) || (isInputRange!S && is(Unqual!(ElementEncodingType!S) == wchar)))
{
    // pstr is a view of the input that starts at the code point to decode.
    static if (is(S : const wchar[]))
        auto pstr = str.ptr + index;
    else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
        auto pstr = str[index .. str.length];
    else
        alias pstr = str;

    //@@@BUG@@@ 14447 forces this to be done outside of decodeImpl
    //enum canIndex = is(S : const wchar[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);

    static if (canIndex)
    {
        immutable remaining = str.length - index;
        uint lead = pstr[0];
    }
    else
    {
        uint lead = pstr.front;
        pstr.popFront();
    }

    static if (!useReplacementDchar)
    {
        // Indexable input lets the exception carry the offending code unit.
        UTFException makeException(string msg)
        {
            static if (canIndex)
                return new UTFException(msg).setSequence(pstr[0]);
            else
                return new UTFException(msg);
        }
    }

    // The < case must be taken care of before decodeImpl is called.
    assert(lead >= 0xD800);

    if (lead <= 0xDBFF)
    {
        // High surrogate: a low surrogate has to follow.
        static if (canIndex)
            immutable bool missingTrail = remaining == 1;
        else
            immutable bool missingTrail = pstr.empty;

        if (missingTrail)
        {
            static if (useReplacementDchar)
            {
                ++index;
                return replacementDchar;
            }
            else
                throw makeException("surrogate UTF-16 high value past end of string");
        }

        static if (canIndex)
            immutable uint trail = pstr[1];
        else
        {
            immutable uint trail = pstr.front;
            pstr.popFront();
        }

        if (trail >= 0xDC00 && trail <= 0xDFFF)
            lead = ((lead - 0xD7C0) << 10) + (trail - 0xDC00);
        else
        {
            static if (useReplacementDchar)
                lead = replacementDchar;
            else
                throw makeException("surrogate UTF-16 low value out of range");
        }

        // Both code units of the pair have been used.
        index += 2;
        return cast(dchar) lead;
    }

    if (lead >= 0xDC00 && lead <= 0xDFFF)
    {
        // A low surrogate that is not preceded by a high surrogate.
        static if (useReplacementDchar)
            lead = replacementDchar;
        else
            throw makeException("unpaired surrogate UTF-16 value");
    }

    ++index;

    // Note: u+FFFE and u+FFFF are specifically permitted by the
    // Unicode standard for application internal use (see isValidDchar)

    return cast(dchar) lead;
}
1647
@safe pure @nogc nothrow
unittest
{
    // Exercises the Yes.useReplacementDchar path of the UTF-16 decodeImpl
    // through a minimal, non-indexable input range.

    static struct CodeUnitSource
    {
        wstring data;
        size_t pos;

    @safe pure @nogc nothrow:
        this(wstring data) { this.data = data; }
        @property bool empty() { return pos == data.length; }
        @property wchar front() { return data[pos]; }
        void popFront() { ++pos; }
    }

    foreach (invalid; invalidUTFstrings!wchar())
    {
        auto source = CodeUnitSource(invalid);
        size_t consumed;
        immutable dchar decoded = decodeImpl!(false, Yes.useReplacementDchar)(source, consumed);

        // Invalid input yields the replacement character and consumes at
        // least one code unit, but never more than the input holds.
        assert(decoded == replacementDchar);
        assert(consumed >= 1 && consumed <= invalid.length);
    }
}
1673
// UTF-32 overload of decodeImpl. Reads the code unit at str[index] (or at
// str.front for ranges without random access, which are then popped),
// validates it and advances index by one.
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    auto ref S str, ref size_t index)
if (is(S : const dchar[]) || (isInputRange!S && is(Unqual!(ElementEncodingType!S) == dchar)))
{
    enum isArray = is(S : const dchar[]);
    enum indexable = isArray || isRandomAccessRange!S;

    static if (isArray)
        auto pstr = str.ptr;
    else
        alias pstr = str;

    static if (indexable)
        dchar codePoint = pstr[index];
    else
        dchar codePoint = pstr.front;

    if (!isValidDchar(codePoint))
    {
        static if (useReplacementDchar)
            codePoint = replacementDchar;
        else
            throw new UTFException("Invalid UTF-32 value").setSequence(codePoint);
    }

    ++index;

    // Ranges that were read through front are consumed only after
    // validation has passed.
    static if (!indexable)
        pstr.popFront();

    return codePoint;
}
1711
@safe pure @nogc nothrow
unittest
{
    // Exercises the Yes.useReplacementDchar path of the UTF-32 decodeImpl
    // through a minimal, non-indexable input range.

    static struct CodeUnitSource
    {
        dstring data;
        size_t pos;

    @safe pure @nogc nothrow:
        this(dstring data) { this.data = data; }
        @property bool empty() { return pos == data.length; }
        @property dchar front() { return data[pos]; }
        void popFront() { ++pos; }
    }

    foreach (invalid; invalidUTFstrings!dchar())
    {
        auto source = CodeUnitSource(invalid);
        size_t consumed;
        immutable dchar decoded = decodeImpl!(false, Yes.useReplacementDchar)(source, consumed);

        // Invalid input yields the replacement character and consumes at
        // least one code unit, but never more than the input holds.
        assert(decoded == replacementDchar);
        assert(consumed >= 1 && consumed <= invalid.length);
    }
}
1737
1738
// Test helper: for random access ranges, decodes at index and checks the
// decoded character, the resulting index and (when the range has a length)
// that decode left the length alone. Failures are raised as AssertError
// attributed to the caller's line. Does nothing for other range types.
version (unittest) private void testDecode(R)(R range,
                                              size_t index,
                                              dchar expectedChar,
                                              size_t expectedIndex,
                                              size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.string : format;

    static if (hasLength!R)
        immutable lenBefore = range.length;

    static if (isRandomAccessRange!R)
    {
        immutable decoded = decode(range, index);

        if (decoded != expectedChar)
            throw new AssertError(format("decode: Wrong character: %s", decoded), __FILE__, line);

        if (index != expectedIndex)
            throw new AssertError(format("decode: Wrong index: %s", index), __FILE__, line);

        static if (hasLength!R)
        {
            if (range.length != lenBefore)
                throw new AssertError(format("decode: length changed: %s", range.length), __FILE__, line);
        }
    }
}
1767
// Test helper: decodes the front of range with decodeFront and checks the
// decoded character, the reported number of code units and (when the range
// has a length) that the range shrank by exactly that many code units.
// Failures are raised as AssertError attributed to the caller's line.
version (unittest) private void testDecodeFront(R)(ref R range,
                                                   dchar expectedChar,
                                                   size_t expectedNumCodeUnits,
                                                   size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.string : format;

    static if (hasLength!R)
        immutable lenBefore = range.length;

    size_t consumed;
    immutable decoded = decodeFront(range, consumed);

    if (decoded != expectedChar)
        throw new AssertError(format("decodeFront: Wrong character: %s", decoded), __FILE__, line);

    if (consumed != expectedNumCodeUnits)
        throw new AssertError(format("decodeFront: Wrong numCodeUnits: %s", consumed), __FILE__, line);

    static if (hasLength!R)
    {
        if (range.length != lenBefore - consumed)
            throw new AssertError(format("decodeFront: wrong length: %s", range.length), __FILE__, line);
    }
}
1792
// Test helper: decodes the back of range with decodeBack and checks the
// decoded character, the reported number of code units and (when the range
// has a length) that the range shrank by exactly that many code units.
// Failures are raised as AssertError attributed to the caller's line.
version (unittest) private void testDecodeBack(R)(ref R range,
                                                  dchar expectedChar,
                                                  size_t expectedNumCodeUnits,
                                                  size_t line = __LINE__)
{
    // Ranges that are not bidirectional are silently accepted so that all
    // `decode` functions can be unit tested together.
    static if (isBidirectionalRange!R)
    {
        import core.exception : AssertError;
        import std.string : format;

        static if (hasLength!R)
            immutable lenBefore = range.length;

        size_t consumed;
        immutable decoded = decodeBack(range, consumed);

        if (decoded != expectedChar)
            throw new AssertError(format("decodeBack: Wrong character: %s", decoded), __FILE__, line);

        if (consumed != expectedNumCodeUnits)
            throw new AssertError(format("decodeBack: Wrong numCodeUnits: %s", consumed), __FILE__, line);

        static if (hasLength!R)
        {
            if (range.length != lenBefore - consumed)
                throw new AssertError(format("decodeBack: wrong length: %s", range.length), __FILE__, line);
        }
    }
}
1823
// Test helper: runs the decode, decodeBack and decodeFront checks on the
// same input, expecting it to start (and, for decodeBack, end) with
// expectedChar encoded in expectedIndex code units.
version (unittest) private void testAllDecode(R)(R range,
                                                 dchar expectedChar,
                                                 size_t expectedIndex,
                                                 size_t line = __LINE__)
{
    // testDecode takes the range by value; the other two helpers take it
    // by ref and consume from it.
    testDecode(range, 0, expectedChar, expectedIndex, line);
    static if (isBidirectionalRange!R)
    {
        // decodeBack is run on a saved copy, taken before decodeFront
        // consumes from range below.
        auto rangeCopy = range.save;
        testDecodeBack(rangeCopy, expectedChar, expectedIndex, line);
    }
    testDecodeFront(range, expectedChar, expectedIndex, line);
}
1837
// Test helper: checks that decoding invalid input fails cleanly.
//
// For random access ranges, decode(range, index) must throw a UTFException
// while leaving both index and (when the range has a length) the length of
// the range unchanged. When index is 0, decodeFront must throw as well.
// Failures are raised as AssertError attributed to the caller's line.
version (unittest) private void testBadDecode(R)(R range, size_t index, size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.string : format;

    immutable initialIndex = index;

    static if (hasLength!R)
        immutable lenBefore = range.length;

    static if (isRandomAccessRange!R)
    {
        assertThrown!UTFException(decode(range, index), null, __FILE__, line);
        enforce(index == initialIndex,
                new AssertError(format("decode: Wrong index: %s", index), __FILE__, line));
        static if (hasLength!R)
        {
            // The format string needs a %s for range.length: format throws
            // a FormatException on an argument that has no specifier, which
            // would hide the intended AssertError.
            enforce(range.length == lenBefore,
                    new AssertError(format("decode: length changed: %s", range.length), __FILE__, line));
        }
    }

    // decodeFront always starts at the front, so it is only comparable
    // when the test was asked to start at index 0. index merely serves as
    // the out parameter for the code unit count here.
    if (initialIndex == 0)
        assertThrown!UTFException(decodeFront(range, index), null, __FILE__, line);
}
1863
// Test helper: for random access ranges, checks that decodeBack throws a
// UTFException on invalid input and (when the range has a length) leaves
// the length of the range unchanged. Failures are raised as AssertError
// attributed to the caller's line.
version (unittest) private void testBadDecodeBack(R)(R range, size_t line = __LINE__)
{
    // This condition is to allow unit testing all `decode` functions together
    static if (!isBidirectionalRange!R)
        return;
    else
    {
        import core.exception : AssertError;
        import std.string : format;

        static if (hasLength!R)
            immutable lenBefore = range.length;

        static if (isRandomAccessRange!R)
        {
            assertThrown!UTFException(decodeBack(range), null, __FILE__, line);
            static if (hasLength!R)
            {
                // The format string needs a %s for range.length: format
                // throws a FormatException on an argument that has no
                // specifier, which would hide the intended AssertError.
                enforce(range.length == lenBefore,
                        new AssertError(format("decodeBack: length changed: %s", range.length), __FILE__, line));
            }
        }
    }
}
1888
// UTF-8: runs decode, decodeFront and decodeBack over strings and the
// code unit test ranges, both at run time and in CTFE.
@system unittest
{
    import std.conv : to;
    import std.exception;

    assertCTFEable!(
    {
    // S turns a string literal into the kind of input under test.
    foreach (S; AliasSeq!(to!string, InputCU!char, RandomCU!char,
                          (string s) => new RefBidirCU!char(s),
                          (string s) => new RefRandomCU!char(s)))
    {
        enum sHasLength = hasLength!(typeof(S("abcd")));

        // Single-code-unit code points, read from the front.
        {
            auto range = S("abcd");
            testDecode(range, 0, 'a', 1);
            testDecode(range, 1, 'b', 2);
            testDecodeFront(range, 'a', 1);
            testDecodeFront(range, 'b', 1);
            assert(decodeFront(range) == 'c');
            assert(decodeFront(range) == 'd');
        }

        // Three-code-unit code points, read from the front.
        {
            auto range = S("ウェブサイト");
            testDecode(range, 0, 'ウ', 3);
            testDecode(range, 3, 'ェ', 6);
            testDecodeFront(range, 'ウ', 3);
            testDecodeFront(range, 'ェ', 3);
            assert(decodeFront(range) == 'ブ');
            assert(decodeFront(range) == 'サ');
        }

        // Single-code-unit code points, read from the back.
        {
            auto range = S("abcd");
            testDecodeBack(range, 'd', 1);
            testDecodeBack(range, 'c', 1);
            testDecodeBack(range, 'b', 1);
            testDecodeBack(range, 'a', 1);
        }

        // Three-code-unit code points, read from the back.
        {
            auto range = S("ウェブサイト");
            testDecodeBack(range, 'ト', 3);
            testDecodeBack(range, 'イ', 3);
            testDecodeBack(range, 'サ', 3);
            testDecodeBack(range, 'ブ', 3);
        }

        testAllDecode(S("\xC2\xA9"), '\u00A9', 2);
        testAllDecode(S("\xE2\x89\xA0"), '\u2260', 3);

        // Malformed input: one truncated sequence, then sequences whose
        // continuation bytes are all \x80 except for a final \x8A.
        // Starting at index 1 lands on a continuation byte.
        foreach (str; ["\xE2\x89", // too short
                       "\xC0\x8A",
                       "\xE0\x80\x8A",
                       "\xF0\x80\x80\x8A",
                       "\xF8\x80\x80\x80\x8A",
                       "\xFC\x80\x80\x80\x80\x8A"])
        {
            testBadDecode(S(str), 0);
            testBadDecode(S(str), 1);
            testBadDecodeBack(S(str));
        }

        // The encodings of U+FFFE and U+FFFF are expected to decode
        // successfully.
        testAllDecode(S("\xEF\xBF\xBE"), cast(dchar) 0xFFFE, 3);
        testAllDecode(S("\xEF\xBF\xBF"), cast(dchar) 0xFFFF, 3);

        // Three-byte sequences whose payload lies in the surrogate range
        // U+D800 .. U+DFFF; decoding them must throw.
        foreach (str; ["\xED\xA0\x80",
                       "\xED\xAD\xBF",
                       "\xED\xAE\x80",
                       "\xED\xAF\xBF",
                       "\xED\xB0\x80",
                       "\xED\xBE\x80",
                       "\xED\xBF\xBF"])
        {
            testBadDecode(S(str), 0);
            testBadDecodeBack(S(str));
        }
    }
    });
}
1972
// UTF-16: runs decode, decodeFront and decodeBack over wstrings and the
// code unit test ranges, both at run time and in CTFE.
@system unittest
{
    import std.conv : to;
    import std.exception;
    assertCTFEable!(
    {
    // S turns a wstring into the kind of input under test.
    foreach (S; AliasSeq!(to!wstring, InputCU!wchar, RandomCU!wchar,
                          (wstring s) => new RefBidirCU!wchar(s),
                          (wstring s) => new RefRandomCU!wchar(s)))
    {
        // A single code unit, the first and last surrogate pairs, and
        // 0xFFFE / 0xFFFE's neighbour 0xFFFF, which must decode as they are.
        testAllDecode(S([cast(wchar) 0x1111]), cast(dchar) 0x1111, 1);
        testAllDecode(S([cast(wchar) 0xD800, cast(wchar) 0xDC00]), cast(dchar) 0x10000, 2);
        testAllDecode(S([cast(wchar) 0xDBFF, cast(wchar) 0xDFFF]), cast(dchar) 0x10FFFF, 2);
        testAllDecode(S([cast(wchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
        testAllDecode(S([cast(wchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);

        // A high surrogate at the end of the input, and a high surrogate
        // followed by something that is not a low surrogate.
        testBadDecode(S([ cast(wchar) 0xD801 ]), 0);
        testBadDecode(S([ cast(wchar) 0xD800, cast(wchar) 0x1200 ]), 0);

        // Input that ends with a high surrogate cannot be decoded from
        // the back either.
        testBadDecodeBack(S([ cast(wchar) 0xD801 ]));
        testBadDecodeBack(S([ cast(wchar) 0x0010, cast(wchar) 0xD800 ]));

        // One code unit per code point, read from the front.
        {
            auto range = S("ウェブサイト");
            testDecode(range, 0, 'ウ', 1);
            testDecode(range, 1, 'ェ', 2);
            testDecodeFront(range, 'ウ', 1);
            testDecodeFront(range, 'ェ', 1);
            assert(decodeFront(range) == 'ブ');
            assert(decodeFront(range) == 'サ');
        }

        // One code unit per code point, read from the back.
        {
            auto range = S("ウェブサイト");
            testDecodeBack(range, 'ト', 1);
            testDecodeBack(range, 'イ', 1);
            testDecodeBack(range, 'サ', 1);
            testDecodeBack(range, 'ブ', 1);
        }
    }

    // Random access inputs only: surrogate pairs mixed with a single code
    // unit, decoded at explicit indices and then from the back.
    foreach (S; AliasSeq!(to!wstring, RandomCU!wchar, (wstring s) => new RefRandomCU!wchar(s)))
    {
        auto str = S([cast(wchar) 0xD800, cast(wchar) 0xDC00,
                      cast(wchar) 0x1400,
                      cast(wchar) 0xDAA7, cast(wchar) 0xDDDE]);
        testDecode(str, 0, cast(dchar) 0x10000, 2);
        testDecode(str, 2, cast(dchar) 0x1400, 3);
        testDecode(str, 3, cast(dchar) 0xB9DDE, 5);
        testDecodeBack(str, cast(dchar) 0xB9DDE, 2);
        testDecodeBack(str, cast(dchar) 0x1400, 1);
        testDecodeBack(str, cast(dchar) 0x10000, 2);
    }
    });
}
2028
// UTF-32: runs decode, decodeFront and decodeBack over dstrings and the
// code unit test ranges, both at run time and in CTFE.
@system unittest
{
    import std.conv : to;
    import std.exception;
    assertCTFEable!(
    {
    // S turns a dstring into the kind of input under test.
    foreach (S; AliasSeq!(to!dstring, RandomCU!dchar, InputCU!dchar,
                          (dstring s) => new RefBidirCU!dchar(s),
                          (dstring s) => new RefRandomCU!dchar(s)))
    {
        // Values that must decode as they are, including 0xFFFE and 0xFFFF.
        testAllDecode(S([cast(dchar) 0x1111]), cast(dchar) 0x1111, 1);
        testAllDecode(S([cast(dchar) 0x10000]), cast(dchar) 0x10000, 1);
        testAllDecode(S([cast(dchar) 0x10FFFF]), cast(dchar) 0x10FFFF, 1);
        testAllDecode(S([cast(dchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
        testAllDecode(S([cast(dchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);

        // Surrogate values and a value above 0x10FFFF must be rejected.
        testBadDecode(S([cast(dchar) 0xD800]), 0);
        testBadDecode(S([cast(dchar) 0xDFFE]), 0);
        testBadDecode(S([cast(dchar) 0x110000]), 0);

        testBadDecodeBack(S([cast(dchar) 0xD800]));
        testBadDecodeBack(S([cast(dchar) 0xDFFE]));
        testBadDecodeBack(S([cast(dchar) 0x110000]));

        // One code unit per code point, read from the front.
        {
            auto range = S("ウェブサイト");
            testDecode(range, 0, 'ウ', 1);
            testDecode(range, 1, 'ェ', 2);
            testDecodeFront(range, 'ウ', 1);
            testDecodeFront(range, 'ェ', 1);
            assert(decodeFront(range) == 'ブ');
            assert(decodeFront(range) == 'サ');
        }

        // One code unit per code point, read from the back.
        {
            auto range = S("ウェブサイト");
            testDecodeBack(range, 'ト', 1);
            testDecodeBack(range, 'イ', 1);
            testDecodeBack(range, 'サ', 1);
            testDecodeBack(range, 'ブ', 1);
        }
    }

    // Random access inputs only: decoding at explicit indices and then
    // from the back.
    foreach (S; AliasSeq!(to!dstring, RandomCU!dchar, (dstring s) => new RefRandomCU!dchar(s)))
    {
        auto str = S([cast(dchar) 0x10000, cast(dchar) 0x1400, cast(dchar) 0xB9DDE]);
        testDecode(str, 0, 0x10000, 1);
        testDecode(str, 1, 0x1400, 2);
        testDecode(str, 2, 0xB9DDE, 3);
        testDecodeBack(str, cast(dchar) 0xB9DDE, 1);
        testDecodeBack(str, cast(dchar) 0x1400, 1);
        testDecodeBack(str, cast(dchar) 0x10000, 1);
    }
    });
}
2084
// Compile-time checks that the decode family keeps its @safe and pure
// attributes for every mutable/const/immutable string type.
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    foreach (S; AliasSeq!( char[], const( char)[],  string,
                          wchar[], const(wchar)[], wstring,
                          dchar[], const(dchar)[], dstring))
    {
        // @safe-ness of decode and of both decodeFront forms.
        static assert(isSafe!({ S str; size_t i = 0; decode(str, i);      }));
        static assert(isSafe!({ S str; size_t i = 0; decodeFront(str, i); }));
        static assert(isSafe!({ S str; decodeFront(str); }));
        // Purity of decode, decodeFront and decodeBack.
        static assert((functionAttributes!({ S str; size_t i = 0; decode(str, i); }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({
            S str; size_t i = 0; decodeFront(str, i);
        }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({ S str; decodeFront(str); }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({
            S str; size_t i = 0; decodeBack(str, i);
        }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({ S str; decodeBack(str); }) & FunctionAttribute.pure_) != 0);
    }
    });
}
2109
// A four-byte sequence whose payload bits are all set (F7 BF BF BF) would
// decode to 0x1FFFFF, which is above U+10FFFF; decode must reject it.
@safe unittest
{
    import std.exception;
    char[4] val;
    val[0] = 0b1111_0111;
    val[1] = 0b1011_1111;
    val[2] = 0b1011_1111;
    val[3] = 0b1011_1111;
    size_t i = 0;
    assertThrown!UTFException((){ dchar ch = decode(val[], i); }());
}
2121 /* =================== Encode ======================= */
2122
// Shared error path for the encode overloads: depending on the compile-time
// flag, either substitutes replacementDchar for the offending code point or
// throws a UTFException that records that code point.
private dchar _utfException(UseReplacementDchar useReplacementDchar)(string msg, dchar c)
{
    static if (!useReplacementDchar)
    {
        auto error = new UTFException(msg);
        throw error.setSequence(c);
    }
    else
    {
        return replacementDchar;
    }
}
2130
/++
    Encodes the code point $(D c) into the fixed-size buffer $(D buf) and
    returns the number of code units written: $(D 1) to $(D 4) for a
    $(D char[4]) buffer, $(D 1) or $(D 2) for a $(D wchar[2]) buffer.

    When instantiated with $(D Yes.useReplacementDchar), an invalid $(D c) is
    encoded as $(D replacementDchar) instead of raising an exception.

    Throws:
        $(D UTFException) if $(D c) is not a valid UTF code point.
  +/
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out char[4] buf, dchar c) @safe pure
{
    // Step 1: classify c and decide how many code units will be written.
    size_t units;

    if (c <= 0x7F)
    {
        assert(isValidDchar(c));
        units = 1;
    }
    else if (c <= 0x7FF)
    {
        assert(isValidDchar(c));
        units = 2;
    }
    else if (c <= 0xFFFF)
    {
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);

        assert(isValidDchar(c));
        units = 3;
    }
    else if (c <= 0x10FFFF)
    {
        assert(isValidDchar(c));
        units = 4;
    }
    else
    {
        assert(!isValidDchar(c));
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
        // The substituted code point is written with the three-unit form.
        units = 3;
    }

    // Step 2: emit the lead byte and the continuation bytes.
    switch (units)
    {
        case 1:
            buf[0] = cast(char) c;
            break;

        case 2:
            buf[0] = cast(char)(0xC0 | (c >> 6));
            buf[1] = cast(char)(0x80 | (c & 0x3F));
            break;

        case 3:
            buf[0] = cast(char)(0xE0 | (c >> 12));
            buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            buf[2] = cast(char)(0x80 | (c & 0x3F));
            break;

        default:
            buf[0] = cast(char)(0xF0 | (c >> 18));
            buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            buf[3] = cast(char)(0x80 | (c & 0x3F));
            break;
    }

    return units;
}
2182
// Boundary values for the char[4] overload of encode: the first and last
// code point of each encoded length, the values around the surrogate gap,
// and the error / replacement behaviour for invalid input.
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    char[4] buf;

    assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000");
    assert(encode(buf, '\u007F') == 1 && buf[0 .. 1] == "\u007F");
    assert(encode(buf, '\u0080') == 2 && buf[0 .. 2] == "\u0080");
    assert(encode(buf, '\u07FF') == 2 && buf[0 .. 2] == "\u07FF");
    assert(encode(buf, '\u0800') == 3 && buf[0 .. 3] == "\u0800");
    assert(encode(buf, '\uD7FF') == 3 && buf[0 .. 3] == "\uD7FF");
    assert(encode(buf, '\uE000') == 3 && buf[0 .. 3] == "\uE000");
    assert(encode(buf, 0xFFFE) == 3 && buf[0 .. 3] == "\xEF\xBF\xBE");
    assert(encode(buf, 0xFFFF) == 3 && buf[0 .. 3] == "\xEF\xBF\xBF");
    assert(encode(buf, '\U00010000') == 4 && buf[0 .. 4] == "\U00010000");
    assert(encode(buf, '\U0010FFFF') == 4 && buf[0 .. 4] == "\U0010FFFF");

    // Surrogates and values above U+10FFFF throw by default.
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    // With Yes.useReplacementDchar the replacement character is written.
    assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
    assert(buf.front == replacementDchar);
    });
}
2212
2213
/// Ditto
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out wchar[2] buf, dchar c) @safe pure
{
    // Supplementary planes: split the 20-bit offset into a surrogate pair.
    if (0xFFFF < c && c <= 0x10FFFF)
    {
        assert(isValidDchar(c));
        immutable offset = c - 0x10000;
        buf[0] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
        buf[1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
        return 2;
    }

    if (0x10FFFF < c)
    {
        // Out of range: throws, or continues with the replacement character.
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
    }
    else
    {
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);

        assert(isValidDchar(c));
    }

    // Everything that reaches this point is written as one code unit.
    buf[0] = cast(wchar) c;
    return 1;
}
2239
// Boundary values for the wchar[2] overload of encode: single-unit values on
// both sides of the surrogate gap, the first and last surrogate-pair values,
// and the error / replacement behaviour for invalid input.
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    wchar[2] buf;

    assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000");
    assert(encode(buf, '\uD7FF') == 1 && buf[0 .. 1] == "\uD7FF");
    assert(encode(buf, '\uE000') == 1 && buf[0 .. 1] == "\uE000");
    assert(encode(buf, 0xFFFE) == 1 && buf[0] == 0xFFFE);
    assert(encode(buf, 0xFFFF) == 1 && buf[0] == 0xFFFF);
    assert(encode(buf, '\U00010000') == 2 && buf[0 .. 2] == "\U00010000");
    assert(encode(buf, '\U0010FFFF') == 2 && buf[0 .. 2] == "\U0010FFFF");

    // Surrogates and values above U+10FFFF throw by default.
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    // With Yes.useReplacementDchar the replacement character is written.
    assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
    assert(buf.front == replacementDchar);
    });
}
2265
2266
/// Ditto
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out dchar[1] buf, dchar c) @safe pure
{
    immutable bool isSurrogate = 0xD800 <= c && c <= 0xDFFF;
    immutable bool aboveMax = 0x10FFFF < c;

    if (!isSurrogate && !aboveMax)
    {
        assert(isValidDchar(c));
    }
    else
    {
        // Throws, or yields the replacement character to store instead.
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);
    }

    // UTF-32 always uses exactly one code unit.
    buf[0] = c;
    return 1;
}
2278
// Boundary values for the dchar[1] overload of encode, plus the error and
// replacement behaviour for surrogates and out-of-range values.
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    dchar[1] buf;

    encode(buf, '\u0000'); assert(buf[0] == '\u0000');
    encode(buf, '\uD7FF'); assert(buf[0] == '\uD7FF');
    encode(buf, '\uE000'); assert(buf[0] == '\uE000');
    encode(buf, 0xFFFE ); assert(buf[0] == 0xFFFE);
    encode(buf, 0xFFFF ); assert(buf[0] == 0xFFFF);
    encode(buf, '\U0010FFFF'); assert(buf[0] == '\U0010FFFF');

    // Surrogates and values above U+10FFFF throw by default.
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    // With Yes.useReplacementDchar the replacement character is written.
    assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
    assert(buf.front == replacementDchar);
    });
}
2303
2304
/++
    Appends the code point $(D c) to $(D str), encoded in the encoding of
    $(D str)'s element type.

    When instantiated with $(D Yes.useReplacementDchar), an invalid $(D c) is
    appended as $(D replacementDchar) instead of raising an exception.

    Throws:
        $(D UTFException) if $(D c) is not a valid UTF code point.
  +/
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref char[] str, dchar c) @safe pure
{
    // ASCII needs no scratch buffer: append the single code unit directly.
    if (c <= 0x7F)
    {
        assert(isValidDchar(c));
        str ~= cast(char) c;
        return;
    }

    char[4] scratch;
    size_t used;

    // Classify c and decide how many code units it occupies.
    if (c <= 0x7FF)
    {
        assert(isValidDchar(c));
        used = 2;
    }
    else if (c <= 0xFFFF)
    {
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);

        assert(isValidDchar(c));
        used = 3;
    }
    else if (c <= 0x10FFFF)
    {
        assert(isValidDchar(c));
        used = 4;
    }
    else
    {
        assert(!isValidDchar(c));
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
        // The substituted code point is written with the three-unit form.
        used = 3;
    }

    // Fill the scratch buffer with the lead byte and continuation bytes.
    if (used == 2)
    {
        scratch[0] = cast(char)(0xC0 | (c >> 6));
        scratch[1] = cast(char)(0x80 | (c & 0x3F));
    }
    else if (used == 3)
    {
        scratch[0] = cast(char)(0xE0 | (c >> 12));
        scratch[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        scratch[2] = cast(char)(0x80 | (c & 0x3F));
    }
    else
    {
        scratch[0] = cast(char)(0xF0 | (c >> 18));
        scratch[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        scratch[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        scratch[3] = cast(char)(0x80 | (c & 0x3F));
    }

    str ~= scratch[0 .. used];
}
2364
// Appends one-, two- and three-unit code points to a char[] and checks both
// the resulting length and the exact bytes.
@safe unittest
{
    import std.exception;

    assertCTFEable!(
    {
    char[] s = "abcd".dup;
    encode(s, cast(dchar)'a');
    assert(s.length == 5);
    assert(s == "abcda");

    // U+00A9 occupies two UTF-8 code units.
    encode(s, cast(dchar)'\u00A9');
    assert(s.length == 7);
    assert(s == "abcda\xC2\xA9");
    //assert(s == "abcda\u00A9");   // BUG: fix compiler

    // U+2260 occupies three UTF-8 code units.
    encode(s, cast(dchar)'\u2260');
    assert(s.length == 10);
    assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
    });
}
2386
// Boundary values for the appending char[] overload of encode. Each slice
// start is the running total of code units appended by the earlier calls.
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    char[] buf;

    encode(buf, '\u0000'); assert(buf[0 .. $] == "\u0000");
    encode(buf, '\u007F'); assert(buf[1 .. $] == "\u007F");
    encode(buf, '\u0080'); assert(buf[2 .. $] == "\u0080");
    encode(buf, '\u07FF'); assert(buf[4 .. $] == "\u07FF");
    encode(buf, '\u0800'); assert(buf[6 .. $] == "\u0800");
    encode(buf, '\uD7FF'); assert(buf[9 .. $] == "\uD7FF");
    encode(buf, '\uE000'); assert(buf[12 .. $] == "\uE000");
    encode(buf, 0xFFFE); assert(buf[15 .. $] == "\xEF\xBF\xBE");
    encode(buf, 0xFFFF); assert(buf[18 .. $] == "\xEF\xBF\xBF");
    encode(buf, '\U00010000'); assert(buf[21 .. $] == "\U00010000");
    encode(buf, '\U0010FFFF'); assert(buf[25 .. $] == "\U0010FFFF");

    // Surrogates and values above U+10FFFF throw by default.
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    // With Yes.useReplacementDchar the replacement character is appended.
    assert(buf.back != replacementDchar);
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(buf.back == replacementDchar);
    });
}
2417
/// ditto
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref wchar[] str, dchar c) @safe pure
{
    // Supplementary planes: append a surrogate pair and finish.
    if (0xFFFF < c && c <= 0x10FFFF)
    {
        assert(isValidDchar(c));

        immutable offset = c - 0x10000;
        wchar[2] pair;
        pair[0] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
        pair[1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
        str ~= pair[];
        return;
    }

    if (0x10FFFF < c)
    {
        // Out of range: throws, or continues with the replacement character.
        assert(!isValidDchar(c));
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
    }
    else
    {
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);

        assert(isValidDchar(c));
    }

    // Everything that reaches this point is appended as one code unit.
    str ~= cast(wchar) c;
}
2451
// Boundary values for the appending wchar[] overload of encode. The indices
// are the running total of code units appended by the earlier calls.
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    wchar[] buf;

    encode(buf, '\u0000'); assert(buf[0] == '\u0000');
    encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF');
    encode(buf, '\uE000'); assert(buf[2] == '\uE000');
    encode(buf, 0xFFFE); assert(buf[3] == 0xFFFE);
    encode(buf, 0xFFFF); assert(buf[4] == 0xFFFF);
    encode(buf, '\U00010000'); assert(buf[5 .. $] == "\U00010000");
    encode(buf, '\U0010FFFF'); assert(buf[7 .. $] == "\U0010FFFF");

    // Surrogates and values above U+10FFFF throw by default.
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    // With Yes.useReplacementDchar the replacement character is appended.
    assert(buf.back != replacementDchar);
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(buf.back == replacementDchar);
    });
}
2478
/// ditto
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref dchar[] str, dchar c) @safe pure
{
    immutable bool isSurrogate = 0xD800 <= c && c <= 0xDFFF;
    immutable bool aboveMax = 0x10FFFF < c;

    if (!isSurrogate && !aboveMax)
    {
        assert(isValidDchar(c));
    }
    else
    {
        // Throws, or yields the replacement character to append instead.
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);
    }

    str ~= c;
}
2489
// Boundary values for the appending dchar[] overload of encode, plus the
// error and replacement behaviour for invalid input.
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    dchar[] buf;

    encode(buf, '\u0000'); assert(buf[0] == '\u0000');
    encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF');
    encode(buf, '\uE000'); assert(buf[2] == '\uE000');
    encode(buf, 0xFFFE ); assert(buf[3] == 0xFFFE);
    encode(buf, 0xFFFF ); assert(buf[4] == 0xFFFF);
    encode(buf, '\U0010FFFF'); assert(buf[5] == '\U0010FFFF');

    // Surrogates and values above U+10FFFF throw by default.
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    // With Yes.useReplacementDchar the replacement character is appended.
    assert(buf.back != replacementDchar);
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(buf.back == replacementDchar);
    });
}
2515
2516
/++
    Computes how many code units of character type $(D C) are needed to
    encode the single code point $(D c).

    For $(D char), a $(D c) above $(D 0x10FFFF) trips $(D assert(false)).
  +/
ubyte codeLength(C)(dchar c) @safe pure nothrow @nogc
if (isSomeChar!C)
{
    static if (C.sizeof == 4)
    {
        // UTF-32: one code unit per code point.
        return 1;
    }
    else static if (C.sizeof == 2)
    {
        // UTF-16: anything above the BMP takes a surrogate pair.
        if (c > 0xFFFF)
            return 2;
        return 1;
    }
    else
    {
        static assert(C.sizeof == 1);

        // UTF-8: length grows with the magnitude of the code point.
        if (c > 0x10FFFF)
            assert(false);
        if (c > 0xFFFF)
            return 4;
        if (c > 0x7FF)
            return 3;
        if (c > 0x7F)
            return 2;
        return 1;
    }
}
2542
///
@safe pure nothrow @nogc unittest
{
    // An ASCII character is one code unit in every encoding.
    assert(codeLength!char('a') == 1);
    assert(codeLength!wchar('a') == 1);
    assert(codeLength!dchar('a') == 1);

    // The highest code point needs the maximum number of units per encoding.
    assert(codeLength!char('\U0010FFFF') == 4);
    assert(codeLength!wchar('\U0010FFFF') == 2);
    assert(codeLength!dchar('\U0010FFFF') == 1);
}
2554
2555
/++
    Computes how many code units are required to encode $(D input) in a
    string whose character type is $(D C). This is particularly useful when
    slicing one string with the length of another and the two strings use
    different character types.

    Params:
        C = the character type to get the encoding length for
        input = the input range to calculate the encoding length from
    Returns:
        The number of code units in `input` when encoded to `C`
  +/
size_t codeLength(C, InputRange)(InputRange input)
if (isInputRange!InputRange && !isInfinite!InputRange && is(ElementType!InputRange : dchar))
{
    alias SourceChar = Unqual!(ElementEncodingType!InputRange);

    // A string that already uses C as its code unit type needs no decoding:
    // its length is the answer.
    enum alreadyEncoded = isSomeString!InputRange && is(SourceChar == C) && is(typeof(input.length));

    static if (!alreadyEncoded)
    {
        size_t units = 0;

        // Iterate by code point and sum the per-code-point lengths.
        foreach (dchar codePoint; input)
        {
            units += codeLength!C(codePoint);
        }

        return units;
    }
    else
    {
        return input.length;
    }
}
2584
///
@safe unittest
{
    import std.conv : to;
    // ASCII-only text
    assert(codeLength!char("hello world") ==
           to!string("hello world").length);
    assert(codeLength!wchar("hello world") ==
           to!wstring("hello world").length);
    assert(codeLength!dchar("hello world") ==
           to!dstring("hello world").length);

    // non-ASCII text
    assert(codeLength!char(`プログラミング`) ==
           to!string(`プログラミング`).length);
    assert(codeLength!wchar(`プログラミング`) ==
           to!wstring(`プログラミング`).length);
    assert(codeLength!dchar(`プログラミング`) ==
           to!dstring(`プログラミング`).length);

    // slicing a string by the encoded length of a wstring
    string haystack = `Être sans la verité, ça, ce ne serait pas bien.`;
    wstring needle = `Être sans la verité`;
    assert(haystack[codeLength!char(needle) .. $] ==
           `, ça, ce ne serait pas bien.`);
}
2608
// Cross-checks codeLength for every source string type S against every target
// character type C, comparing with the length of the std.conv.to conversion.
@safe unittest
{
    import std.algorithm.iteration : filter;
    import std.conv : to;
    import std.exception;

    assertCTFEable!(
    {
    foreach (S; AliasSeq!( char[], const  char[],  string,
                          wchar[], const wchar[], wstring,
                          dchar[], const dchar[], dstring))
    {
        foreach (C; AliasSeq!(char, wchar, dchar))
        {
            assert(codeLength!C(to!S("Walter Bright")) == to!(C[])("Walter Bright").length);
            assert(codeLength!C(to!S(`言語`)) == to!(C[])(`言語`).length);
            assert(codeLength!C(to!S(`ウェブサイト@La_Verité.com`)) ==
                   to!(C[])(`ウェブサイト@La_Verité.com`).length);
            // The always-true filter turns the string into a non-string range.
            assert(codeLength!C(to!S(`ウェブサイト@La_Verité.com`).filter!(x => true)()) ==
                   to!(C[])(`ウェブサイト@La_Verité.com`).length);
        }
    }
    });
}
2633
/+
Internal helper function:

Tells whether the code point $(D c) can be searched for directly among code
units of type $(D C), without decoding.

This is a runtime check that is used as an optimization in various functions,
particularly in $(D std.string).
 +/
package bool canSearchInCodeUnits(C)(dchar c)
if (isSomeChar!C)
{
    static if (C.sizeof == 4)
    {
        // A dchar code unit is compared as-is.
        return true;
    }
    else static if (C.sizeof == 2)
    {
        // Accept BMP values outside the surrogate range.
        immutable bool inSurrogateRange = 0xD800 <= c && c <= 0xDFFF;
        return c <= 0xFFFF && !inSurrogateRange;
    }
    else static if (C.sizeof == 1)
    {
        // Only values up to 0x7F are accepted for char.
        return c < 0x80;
    }
    else
        static assert(0);
}
// Checks canSearchInCodeUnits for ASCII, a Latin-1 letter, a BMP ideograph,
// a surrogate value and a supplementary-plane code point.
@safe unittest
{
    assert( canSearchInCodeUnits! char('a'));
    assert( canSearchInCodeUnits!wchar('a'));
    assert( canSearchInCodeUnits!dchar('a'));
    assert(!canSearchInCodeUnits! char('ö')); //Important test: ö <= 0xFF
    assert(!canSearchInCodeUnits! char(cast(char)'ö')); //Important test: ö <= 0xFF
    assert( canSearchInCodeUnits!wchar('ö'));
    assert( canSearchInCodeUnits!dchar('ö'));
    assert(!canSearchInCodeUnits! char('日'));
    assert( canSearchInCodeUnits!wchar('日'));
    assert( canSearchInCodeUnits!dchar('日'));
    // 0xDA00 lies in the surrogate range.
    assert(!canSearchInCodeUnits!wchar(cast(wchar) 0xDA00));
    assert( canSearchInCodeUnits!dchar(cast(dchar) 0xDA00));
    assert(!canSearchInCodeUnits! char('\U00010001'));
    assert(!canSearchInCodeUnits!wchar('\U00010001'));
    assert( canSearchInCodeUnits!dchar('\U00010001'));
}
2673
2674 /* =================== Validation ======================= */
2675
/++
    Verifies that $(D str) is well-formed Unicode by decoding it from start
    to end and discarding the decoded code points.

    Throws:
        $(D UTFException) if $(D str) is not well-formed.
  +/
void validate(S)(in S str) @safe pure
if (isSomeString!S)
{
    size_t pos = 0;
    immutable end = str.length;

    // decode advances pos past each sequence it reads and throws on a
    // malformed one, so reaching the end means the whole string is valid.
    while (pos < end)
        decode(str, pos);
}
2691
2692
// Regression test: a char sequence made only of continuation bytes
// (0xA7 0x85 0xAF) must make validate throw.
@safe unittest // bugzilla 12923
{
    import std.exception;
    assertThrown((){
        char[3]a=[167, 133, 175];
        validate(a[]);
    }());
}
2701
/**
 * Encodes the elements of `s` to UTF-8 and returns a newly GC allocated
 * `string` of the elements.
 *
 * Params:
 *     s = the range to encode
 * Returns:
 *     A UTF-8 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
string toUTF8(S)(S s)
if (isInputRange!S && !isInfinite!S && isSomeChar!(ElementEncodingType!S))
{
    // All of the conversion work is done by the private helper toUTFImpl.
    return toUTFImpl!string(s);
}
2718
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // The ø (U+00F8) is represented by two UTF-8 code units
    assert("Hellø"w.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));

    // U+10437 (DESERET SMALL LETTER YEE) is four code units in UTF-8.
    // Written as an escape so the literal survives lossy re-encoding.
    assert("\U00010437"d.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7]));
}
2730
// Same conversions as the documented example, but fed through a
// reference-type input range instead of a string.
@system pure unittest
{
    import std.algorithm.comparison : equal;
    import std.internal.test.dummyrange : ReferenceInputRange;

    auto r1 = new ReferenceInputRange!dchar("Hellø");
    // U+10437, written as an escape so the literal survives lossy re-encoding.
    auto r2 = new ReferenceInputRange!dchar("\U00010437");

    assert(r1.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));
    assert(r2.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7]));
}
2742
/**
 * Encodes the elements of `s` to UTF-16 and returns a newly GC allocated
 * `wstring` of the elements.
 *
 * Params:
 *     s = the range to encode
 * Returns:
 *     A UTF-16 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
wstring toUTF16(S)(S s)
if (isInputRange!S && !isInfinite!S && isSomeChar!(ElementEncodingType!S))
{
    // All of the conversion work is done by the private helper toUTFImpl.
    return toUTFImpl!wstring(s);
}
2759
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // these code points (U+24B62 and U+10437) are two code units in UTF-16
    // and one in UTF-32; they are written as escapes so the literals survive
    // lossy re-encoding
    assert("\U00024B62"d.length == 1);
    assert("\U00010437"d.length == 1);

    assert("\U00024B62"d.toUTF16.equal([0xD852, 0xDF62]));
    assert("\U00010437"d.toUTF16.equal([0xD801, 0xDC37]));
}
2772
// Same conversions as the documented example, but fed through a
// reference-type input range instead of a string.
@system pure unittest
{
    import std.algorithm.comparison : equal;
    import std.internal.test.dummyrange : ReferenceInputRange;

    // U+24B62 and U+10437, written as escapes so the literals survive lossy
    // re-encoding.
    auto r1 = new ReferenceInputRange!dchar("\U00024B62");
    auto r2 = new ReferenceInputRange!dchar("\U00010437");

    assert(r1.toUTF16.equal([0xD852, 0xDF62]));
    assert(r2.toUTF16.equal([0xD801, 0xDC37]));
}
2784
2785
/**
 * Encodes the elements of `s` to UTF-32 and returns a newly GC allocated
 * `dstring` of the elements.
 *
 * Params:
 *     s = the range to encode
 * Returns:
 *     A UTF-32 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
dstring toUTF32(S)(S s)
if (isInputRange!S && !isInfinite!S && isSomeChar!(ElementEncodingType!S))
{
    // All of the conversion work is done by the private helper toUTFImpl.
    return toUTFImpl!dstring(s);
}
2802
// Shared implementation behind toUTF8, toUTF16 and toUTF32: produces a value
// of the immutable string type T from the character range s.
private T toUTFImpl(T, S)(S s)
{
    static if (!is(S : T))
    {
        import std.array : appender;

        // Code unit type of the requested result (char, wchar or dchar).
        alias TargetChar = Unqual!(ElementEncodingType!T);
        auto sink = appender!T();

        // Reserve the source length up front when it is known.
        static if (hasLength!S || isSomeString!S)
        {
            sink.reserve(s.length);
        }

        // byUTF transcodes lazily; collect its code units into the appender.
        foreach (unit; s.byUTF!TargetChar)
        {
            sink.put(unit);
        }

        return sink.data;
    }
    else
    {
        // The input already converts to T, so a plain copy is enough.
        return s.idup;
    }
}
2823
2824 /* =================== toUTFz ======================= */
2825
2826 /++
2827 Returns a C-style zero-terminated string equivalent to $(D str). $(D str)
2828 must not contain embedded $(D '\0')'s as any C function will treat the first
2829 $(D '\0') that it sees as the end of the string. If $(D str.empty) is
2830 $(D true), then a string containing only $(D '\0') is returned.
2831
2832 $(D toUTFz) accepts any type of string and is templated on the type of
2833 character pointer that you wish to convert to. It will avoid allocating a
2834 new string if it can, but there's a decent chance that it will end up having
2835 to allocate a new string - particularly when dealing with character types
2836 other than $(D char).
2837
2838 $(RED Warning 1:) If the result of $(D toUTFz) equals $(D str.ptr), then if
2839 anything alters the character one past the end of $(D str) (which is the
2840 $(D '\0') character terminating the string), then the string won't be
2841 zero-terminated anymore. The most likely scenarios for that are if you
2842 append to $(D str) and no reallocation takes place or when $(D str) is a
2843 slice of a larger array, and you alter the character in the larger array
2844 which is one character past the end of $(D str). Another case where it could
2845 occur would be if you had a mutable character array immediately after
2846 $(D str) in memory (for example, if they're member variables in a
2847 user-defined type with one declared right after the other) and that
2848 character array happened to start with $(D '\0'). Such scenarios will never
2849 occur if you immediately use the zero-terminated string after calling
2850 $(D toUTFz) and the C function using it doesn't keep a reference to it.
2851 Also, they are unlikely to occur even if you save the zero-terminated string
2852 (the cases above would be among the few examples of where it could happen).
However, if you save the zero-terminated string and want to be absolutely
2854 certain that the string stays zero-terminated, then simply append a
2855 $(D '\0') to the string and use its $(D ptr) property rather than calling
2856 $(D toUTFz).
2857
2858 $(RED Warning 2:) When passing a character pointer to a C function, and the
2859 C function keeps it around for any reason, make sure that you keep a
2860 reference to it in your D code. Otherwise, it may go away during a garbage
2861 collection cycle and cause a nasty bug when the C code tries to use it.
2862 +/
// Eponymous template: the pointer type P is given explicitly, while the
// string type S is deduced from the argument of the inner function.
template toUTFz(P)
{
    P toUTFz(S)(S str) @safe pure
    {
        // Forwards to the toUTFzImpl overload whose constraint matches P and S.
        return toUTFzImpl!(P, S)(str);
    }
}
2870
///
@safe pure unittest
{
    // same character type, different pointer qualifiers
    auto p1 = toUTFz!(char*)("hello world");
    auto p2 = toUTFz!(const(char)*)("hello world");
    auto p3 = toUTFz!(immutable(char)*)("hello world");
    // source and target character types differ
    auto p4 = toUTFz!(char*)("hello world"d);
    auto p5 = toUTFz!(const(wchar)*)("hello world");
    auto p6 = toUTFz!(immutable(dchar)*)("hello world"w);
}
2881
// toUTFz implementation for immutable input whose character type matches the
// pointee type of P.
private P toUTFzImpl(P, S)(S str) @safe pure
if (isSomeString!S && isPointer!P && isSomeChar!(typeof(*P.init)) &&
    is(Unqual!(typeof(*P.init)) == Unqual!(ElementEncodingType!S)) &&
    is(immutable(Unqual!(ElementEncodingType!S)) == ElementEncodingType!S))
//immutable(C)[] -> C*, const(C)*, or immutable(C)*
{
    alias OutChar = typeof(*P.init);
    alias C = Unqual!(ElementEncodingType!S);

    // An empty string becomes a fresh one-element array holding only '\0'.
    if (str.empty)
    {
        OutChar[] terminator = ['\0'];

        auto firstElement() @trusted { return terminator.ptr; }
        return firstElement();
    }

    // A mutable P always requires a copy, so the shortcut below is only
    // compiled for const and immutable pointer types.
    static if (!is(Unqual!OutChar == OutChar))
    {
        if (!__ctfe)
        {
            auto onePastEnd(S s) @trusted { return s.ptr + s.length; }
            immutable pastEnd = onePastEnd(str);

            // Peek past end of str, if it's 0, no conversion necessary.
            // Note that the compiler will put a 0 past the end of static
            // strings, and the storage allocator will put a 0 past the end
            // of newly allocated char[]'s.
            // Is pastEnd dereferenceable? A simple test: if it points to an
            // address multiple of 4, then conservatively assume the pointer
            // might be pointing to a new block of memory, which might be
            // unreadable. Otherwise, it's definitely pointing to valid
            // memory.
            if ((cast(size_t) pastEnd & 3) && *pastEnd == '\0')
                return &str[0];
        }
    }

    // Otherwise defer to the const(C)[] overload.
    return toUTFzImpl!(P, const(C)[])(cast(const(C)[])str);
}
2926
// toUTFz implementation for mutable or const input whose character type
// matches the pointee type of P.
//
// Depending on the qualifiers, the result is str itself when a '\0' already
// follows it in memory, str with a '\0' appended, or a freshly allocated
// zero-terminated copy.
private P toUTFzImpl(P, S)(S str) @safe pure
if (isSomeString!S && isPointer!P && isSomeChar!(typeof(*P.init)) &&
    is(Unqual!(typeof(*P.init)) == Unqual!(ElementEncodingType!S)) &&
    !is(immutable(Unqual!(ElementEncodingType!S)) == ElementEncodingType!S))
//C[] or const(C)[] -> C*, const(C)*, or immutable(C)*
{
    alias InChar  = ElementEncodingType!S;
    alias OutChar = typeof(*P.init);

    //const(C)[] -> const(C)* or
    //C[] -> C* or const(C)*
    static if (( is(const(Unqual!InChar) == InChar) &&  is(const(Unqual!OutChar) == OutChar)) ||
               (!is(const(Unqual!InChar) == InChar) && !is(immutable(Unqual!OutChar) == OutChar)))
    {
        // The shortcut returns &str[0], which is only valid for a non-empty
        // str: indexing an empty slice fails the bounds check even when the
        // byte at str.ptr happens to be '\0'. Empty input therefore always
        // takes the append path below.
        if (!__ctfe && str.length != 0)
        {
            auto trustedPtrAdd(S s) @trusted { return s.ptr + s.length; }
            auto p = trustedPtrAdd(str);

            // Peek one element past the end; the alignment test is a
            // conservative guard against reading into a new memory block.
            if ((cast(size_t) p & 3) && *p == '\0')
                return &str[0];
        }

        str ~= '\0';
        return &str[0];
    }
    //const(C)[] -> C* or immutable(C)* or
    //C[] -> immutable(C)*
    else
    {
        import std.array : uninitializedArray;
        // The qualifiers do not allow handing out str's own memory, so build
        // a zero-terminated copy.
        auto copy = uninitializedArray!(Unqual!OutChar[])(str.length + 1);
        copy[0 .. $ - 1] = str[];
        copy[$ - 1] = '\0';

        auto trustedCast(typeof(copy) c) @trusted { return cast(P) c.ptr; }
        return trustedCast(copy);
    }
}
2966
// toUTFz implementation for the case where the source and target character
// types differ: the string is re-encoded code point by code point into a
// new array that ends in '\0'.
private P toUTFzImpl(P, S)(S str) @safe pure
if (isSomeString!S && isPointer!P && isSomeChar!(typeof(*P.init)) &&
    !is(Unqual!(typeof(*P.init)) == Unqual!(ElementEncodingType!S)))
//C1[], const(C1)[], or immutable(C1)[] -> C2*, const(C2)*, or immutable(C2)*
{
    import std.array : appender;

    alias OutChar = typeof(*P.init);
    auto sink = appender!(OutChar[])();

    // Iterating with an explicit dchar loop variable decodes str.
    foreach (dchar codePoint; str)
    {
        sink.put(codePoint);
    }
    sink.put('\0');

    // Taking the raw pointer (and casting its qualifier) is the only
    // operation that needs @trusted.
    P terminatedPtr() @trusted { return cast(P) sink.data.ptr; }
    return terminatedPtr();
}
2981
// Tests toUTFz for every combination of source string type and target pointer
// type, both with and without a '\0' directly after the source data.
@safe pure unittest
{
    import core.exception : AssertError;
    import std.algorithm;
    import std.conv : to;
    import std.exception;
    import std.string : format;

    // Part 1: source and target share the same character type.
    assertCTFEable!(
    {
    foreach (S; AliasSeq!(string, wstring, dstring))
    {
        alias C = Unqual!(ElementEncodingType!S);

        auto s1 = to!S("hello\U00010143\u0100\U00010143");
        // Build s2 with the same contents as s1, but with a '\n' stored in
        // the element right after its end (written before the length is
        // shrunk by one).
        auto temp = new C[](s1.length + 1);
        temp[0 .. $ - 1] = s1[0 .. $];
        temp[$ - 1] = '\n';
        --temp.length;
        auto trustedAssumeUnique(T)(T t) @trusted { return assumeUnique(t); }
        auto s2 = trustedAssumeUnique(temp);
        assert(s1 == s2);

        // The result must reproduce s and be followed by a terminator.
        void trustedCStringAssert(P, S)(S s) @trusted
        {
            auto p = toUTFz!P(s);
            assert(p[0 .. s.length] == s);
            assert(p[s.length] == '\0');
        }

        foreach (P; AliasSeq!(C*, const(C)*, immutable(C)*))
        {
            trustedCStringAssert!P(s1);
            trustedCStringAssert!P(s2);
        }
    }
    });

    // Helper for part 2: converts s, measures the result up to its
    // terminator, and compares it with s; a failure is reported as an
    // AssertError carrying the caller's line number.
    static void test(P, S)(S s, size_t line = __LINE__) @trusted
    {
        static size_t zeroLen(C)(const(C)* ptr) @trusted
        {
            size_t len = 0;
            while (*ptr != '\0') { ++ptr; ++len; }
            return len;
        }

        auto p = toUTFz!P(s);
        immutable len = zeroLen(p);
        enforce(cmp(s, p[0 .. len]) == 0,
                new AssertError(format("Unit test failed: %s %s", P.stringof, S.stringof),
                                __FILE__, line));
    }

    // Part 2: conversions between character types, then every mutable and
    // const array type against every pointer type.
    assertCTFEable!(
    {
    foreach (P; AliasSeq!(wchar*, const(wchar)*, immutable(wchar)*,
                          dchar*, const(dchar)*, immutable(dchar)*))
    {
        test!P("hello\U00010143\u0100\U00010143");
    }
    foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
                          dchar*, const(dchar)*, immutable(dchar)*))
    {
        test!P("hello\U00010143\u0100\U00010143"w);
    }
    foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
                          wchar*, const(wchar)*, immutable(wchar)*))
    {
        test!P("hello\U00010143\u0100\U00010143"d);
    }
    foreach (S; AliasSeq!( char[], const( char)[],
                          wchar[], const(wchar)[],
                          dchar[], const(dchar)[]))
    {
        auto s = to!S("hello\U00010143\u0100\U00010143");

        foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
                              wchar*, const(wchar)*, immutable(wchar)*,
                              dchar*, const(dchar)*, immutable(dchar)*))
        {
            test!P(s);
        }
    }
    });
}
3068
3069
/++
    Convenience wrapper around $(D toUTFz!(const(wchar)*)).

    Converts $(D str) to UTF-16 and hands back a pointer to the resulting
    zero-terminated sequence of $(D wchar)s. The result is suitable for the
    'W' functions of the Win32 API wherever an $(D LPWSTR) or $(D LPCWSTR)
    argument is expected.

    Params:
        str = the string to convert; any character width is accepted

    Returns:
        The result of $(D toUTFz!(const(wchar)*)(str)).
  +/
const(wchar)* toUTF16z(C)(const(C)[] str) @safe pure
if (isSomeChar!C)
{
    alias WideZ = const(wchar)*;
    return str.toUTFz!WideZ();
}
3082
@safe pure unittest
{
    import std.conv : to;

    // toUTFz carries the real test coverage; this merely confirms that
    // toUTF16z instantiates for each string type and yields a non-null
    // pointer.
    foreach (S; AliasSeq!(string, wstring, dstring))
    {
        auto input = "hello world".to!S;
        const(wchar)* converted = input.toUTF16z();
        assert(converted !is null);
    }
}
3091
3092
3093 /* ================================ tests ================================== */
3094
@safe pure unittest
{
    import std.exception;

    // Every sample is spelled out in all three encodings; converting from
    // any one of them has to reproduce the other two. The checks must also
    // succeed when evaluated at compile time.
    assertCTFEable!(
    {
        static void crossCheck(string u8, wstring u16, dstring u32) @safe pure
        {
            assert(toUTF16(u8) == u16);
            assert(toUTF32(u8) == u32);
            assert(toUTF8 (u16) == u8);
            assert(toUTF32(u16) == u32);
            assert(toUTF8 (u32) == u8);
            assert(toUTF16(u32) == u16);
        }

        // ASCII only
        crossCheck("hello", "hello"w, "hello"d);

        // a BMP code point beyond ASCII
        crossCheck("hel\u1234o", "hel\u1234o"w, "hel\u1234o"d);

        // a supplementary-plane code point
        crossCheck("he\U0010AAAAllo", "he\U0010AAAAllo"w, "he\U0010AAAAllo"d);
    });
}
3123
3124
/++
    Returns the total number of code points encoded in $(D str).

    The result is obtained by walking $(D str) with $(D walkLength), i.e.
    one code point per step rather than one code unit per step.

    This function is $(D nothrow): it never throws a $(D UTFException), and
    it does not validate $(D str). Call $(LREF validate) first if the input
    may be malformed; the value returned for malformed input should not be
    relied upon.

    Supersedes: This function supersedes $(LREF toUCSindex).

    Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

    Params:
        str = the UTF-8, UTF-16 or UTF-32 encoded string to measure

    Returns:
        The number of code points in $(D str); $(D 0) for an empty string.
  +/
size_t count(C)(const(C)[] str) @safe pure nothrow @nogc
if (isSomeChar!C)
{
    // No unsafe operation happens here, so the body is compiler-checked
    // (@safe) instead of merely being asserted safe (@trusted).
    return walkLength(str);
}
3140
@safe pure nothrow @nogc unittest
{
    import std.exception;

    // Code point counts must also be computable during CTFE.
    assertCTFEable!(
    {
        assert("".count == 0);          // nothing to count
        assert("a".count == 1);
        assert("abc".count == 3);
        assert("\u20AC100".count == 4); // U+20AC is a single code point
    });
}
3152
3153
// Minimal ranges of code units, one per range category, used by the unit
// tests. Each one stores a mutable copy/conversion of the string it is
// constructed from and exposes only the primitives named in its comment.
version (unittest)
{
    // Input range primitives only: empty, front, popFront.
    struct InputCU(C)
    {
        import std.conv : to;

        C[] _str;

        this(inout(C)[] str)
        {
            _str = str.to!(C[]);
        }

        @property bool empty() { return _str.length == 0; }
        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }
    }

    // Adds back, popBack, save and length to the input range primitives.
    struct BidirCU(C)
    {
        import std.conv : to;

        C[] _str;

        this(inout(C)[] str)
        {
            _str = str.to!(C[]);
        }

        @property bool empty() { return _str.length == 0; }
        @property size_t length() { return _str.length; }

        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }

        @property C back() { return _str[_str.length - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }

        @property auto save() { return typeof(this)(_str); }
    }

    // Adds indexing and slicing on top of what BidirCU offers.
    struct RandomCU(C)
    {
        import std.conv : to;

        C[] _str;

        this(inout(C)[] str)
        {
            _str = str.to!(C[]);
        }

        @property bool empty() { return _str.length == 0; }
        @property size_t length() { return _str.length; }

        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }

        @property C back() { return _str[_str.length - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }

        @property auto save() { return typeof(this)(_str); }

        C opIndex(size_t i) { return _str[i]; }
        auto opSlice(size_t i, size_t j) { return typeof(this)(_str[i .. j]); }
    }

    // Same primitives as BidirCU, but implemented as a class; save
    // allocates a new object.
    class RefBidirCU(C)
    {
        import std.conv : to;

        C[] _str;

        this(inout(C)[] str)
        {
            _str = str.to!(C[]);
        }

        @property bool empty() { return _str.length == 0; }
        @property size_t length() { return _str.length; }

        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }

        @property C back() { return _str[_str.length - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }

        @property auto save() { return new typeof(this)(_str); }
    }

    // Same primitives as RandomCU, but implemented as a class; save and
    // opSlice allocate new objects.
    class RefRandomCU(C)
    {
        import std.conv : to;

        C[] _str;

        this(inout(C)[] str)
        {
            _str = str.to!(C[]);
        }

        @property bool empty() { return _str.length == 0; }
        @property size_t length() { return _str.length; }

        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }

        @property C back() { return _str[_str.length - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }

        @property auto save() { return new typeof(this)(_str); }

        C opIndex(size_t i) { return _str[i]; }
        auto opSlice(size_t i, size_t j) { return new typeof(this)(_str[i .. j]); }
    }
}
3252
3253
/**
 * The Unicode replacement character, U+FFFD. It is substituted for UTF
 * sequences that are not valid.
 *
 * References:
 *      $(LINK http://en.wikipedia.org/wiki/Replacement_character#Replacement_character)
 */
enum dchar replacementDchar = cast(dchar) 0xFFFD;
3261
/********************************************
 * Iterate a range of char, wchar, or dchars by code unit.
 *
 * The purpose is to bypass the special case decoding that
 * $(REF front, std,range,primitives) does to character arrays. As a result,
 * using ranges with `byCodeUnit` can be `nothrow` while
 * $(REF front, std,range,primitives) throws when it encounters invalid Unicode
 * sequences.
 *
 * A code unit is a building block of the UTF encodings. Generally, an
 * individual code unit does not represent what's perceived as a full
 * character (a.k.a. a grapheme cluster in Unicode terminology). Many characters
 * are encoded with multiple code units. For example, the UTF-8 code units for
 * `ø` are `0xC3 0xB8`. That means, an individual element of `byCodeUnit`
 * often does not form a character on its own. Attempting to treat it as
 * one while iterating over the resulting range will give nonsensical results.
 *
 * Params:
 *      r = an input range of characters (including strings) or a type that
 *          implicitly converts to a string type.
 * Returns:
 *      If `r` is not an auto-decodable string (i.e. a narrow string or a
 *      user-defined type that implicitly converts to a string type), then `r`
 *      is returned.
 *
 *      Otherwise, `r` is converted to its corresponding string type (if it's
 *      not already a string) and wrapped in a random-access range where the
 *      element encoding type of the string (its code unit) is the element type
 *      of the range, and that range returned. The range has slicing.
 *
 *      If `r` is quirky enough to be a struct or class which is an input range
 *      of characters on its own (i.e. it has the input range API as member
 *      functions), $(I and) it's implicitly convertible to a string type, then
 *      `r` is returned, and no implicit conversion takes place.
 * See_Also:
 *      Refer to the $(MREF std, uni) docs for a reference on Unicode
 *      terminology.
 *
 *      For a range that iterates by grapheme cluster (written character) see
 *      $(REF byGrapheme, std,uni).
 */
auto byCodeUnit(R)(R r)
if (isAutodecodableString!R ||
    isInputRange!R && isSomeChar!(ElementEncodingType!R) ||
    (is(R : const dchar[]) && !isStaticArray!R))
{
    // Case 1: a narrow string, or an auto-decodable type that does not bring
    // its own range primitives as members. Wrap the underlying string.
    static if (isNarrowString!R ||
               // This would be cleaner if we had a way to check whether a type
               // was a range without any implicit conversions.
               (isAutodecodableString!R && !__traits(hasMember, R, "empty") &&
                !__traits(hasMember, R, "front") && !__traits(hasMember, R, "popFront")))
    {
        // Random-access view over the code units of `str`; every primitive
        // works directly on the array, so no decoding takes place.
        static struct ByCodeUnitImpl
        {
        @safe pure nothrow @nogc:

            @property bool empty() const     { return str.length == 0; }
            @property auto ref front() inout { return str[0]; }
            void popFront()                  { str = str[1 .. $]; }

            @property auto save() { return ByCodeUnitImpl(str.save); }

            @property auto ref back() inout { return str[$ - 1]; }
            void popBack()                  { str = str[0 .. $-1]; }

            auto ref opIndex(size_t index) inout     { return str[index]; }
            auto opSlice(size_t lower, size_t upper) { return ByCodeUnitImpl(str[lower .. upper]); }

            @property size_t length() const { return str.length; }
            alias opDollar = length;

        private:
            // The string type R is or converts to.
            StringTypeOf!R str;
        }

        static assert(isRandomAccessRange!ByCodeUnitImpl);

        return ByCodeUnitImpl(r);
    }
    // Case 2: something that converts to a dchar array without being a range
    // through its own members. Return it as its string type.
    else static if (is(R : const dchar[]) && !__traits(hasMember, R, "empty") &&
                    !__traits(hasMember, R, "front") && !__traits(hasMember, R, "popFront"))
    {
        return cast(StringTypeOf!R) r;
    }
    else
    {
        // byCodeUnit for ranges and dchar[] is a no-op
        return r;
    }
}
3352
///
@safe unittest
{
    import std.range.primitives;

    // a string wrapped by byCodeUnit is a random-access range of code units
    auto r = "Hello, World!".byCodeUnit();
    static assert(hasLength!(typeof(r)));
    static assert(hasSlicing!(typeof(r)));
    static assert(isRandomAccessRange!(typeof(r)));
    static assert(is(ElementType!(typeof(r)) == immutable char));

    // contrast with the range capabilities of standard strings: they are
    // only bidirectional ranges of decoded `dchar`s
    auto s = "Hello, World!";
    static assert(isBidirectionalRange!(typeof(s)));
    static assert(is(ElementType!(typeof(s)) == dchar));

    static assert(!isRandomAccessRange!(typeof(s)));
    static assert(!hasSlicing!(typeof(s)));
    static assert(!hasLength!(typeof(s)));
}
3373
/// `byCodeUnit` does no Unicode decoding
@safe unittest
{
    // "noël" written as a plain 'e' followed by a combining diaeresis:
    // the code unit at index 2 is the unadorned 'e'
    string decomposed = "noe\u0308l";
    auto decomposedUnits = decomposed.byCodeUnit;
    assert(decomposedUnits[2] != 'ë');
    assert(decomposedUnits[2] == 'e');

    // "noël" written with the precomposed ë character: string is UTF-8, so
    // the code unit at index 2 is merely the first unit of the sequence
    // that encodes 'ë'
    string precomposed = "no\u00EBl";
    auto precomposedUnits = precomposed.byCodeUnit;
    assert(precomposedUnits[2] != 'ë');
}
3386
// Checks byCodeUnit on strings of every width, on mutable arrays, on
// user-defined types that convert to strings, on user-defined ranges, on
// types that are both, and on enums; also pins down which inputs are
// rejected at compile time.
@safe pure nothrow @nogc unittest
{
    import std.range;
    // Applying byCodeUnit twice yields every code unit unchanged (char).
    {
        enum testStr = " hello ディラン";
        char[testStr.length] s;
        int i;
        foreach (c; testStr.byCodeUnit().byCodeUnit())
        {
            s[i++] = c;
        }
        assert(s == testStr);
    }
    // Same for wchar.
    {
        enum testStr = " hello ディラン"w;
        wchar[testStr.length] s;
        int i;
        foreach (c; testStr.byCodeUnit().byCodeUnit())
        {
            s[i++] = c;
        }
        assert(s == testStr);
    }
    // Same for dchar.
    {
        enum testStr = " hello ディラン"d;
        dchar[testStr.length] s;
        int i;
        foreach (c; testStr.byCodeUnit().byCodeUnit())
        {
            s[i++] = c;
        }
        assert(s == testStr);
    }
    // length, indexing and slicing
    {
        auto bcu = "hello".byCodeUnit();
        assert(bcu.length == 5);
        assert(bcu[3] == 'l');
        assert(bcu[2 .. 4][1] == 'l');
    }
    // front and opIndex can be assigned through when the source is mutable
    {
        char[5] orig = "hello";
        auto bcu = orig[].byCodeUnit();
        bcu.front = 'H';
        assert(bcu.front == 'H');
        bcu[1] = 'E';
        assert(bcu[1] == 'E');
    }
    // save produces an independent position
    {
        auto bcu = "hello".byCodeUnit().byCodeUnit();
        static assert(isForwardRange!(typeof(bcu)));
        static assert(is(typeof(bcu) == struct));
        auto s = bcu.save;
        bcu.popFront();
        assert(s.front == 'h');
    }
    // bidirectional iteration (char); byCodeUnit on its own result keeps
    // the type
    {
        auto bcu = "hello".byCodeUnit();
        static assert(hasSlicing!(typeof(bcu)));
        static assert(isBidirectionalRange!(typeof(bcu)));
        static assert(is(typeof(bcu) == struct));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        auto ret = bcu.retro;
        assert(ret.front == 'o');
        ret.popFront();
        assert(ret.front == 'l');
    }
    // bidirectional iteration (wchar)
    {
        auto bcu = "κόσμε"w.byCodeUnit();
        static assert(hasSlicing!(typeof(bcu)));
        static assert(isBidirectionalRange!(typeof(bcu)));
        static assert(is(typeof(bcu) == struct));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        auto ret = bcu.retro;
        assert(ret.front == 'ε');
        ret.popFront();
        assert(ret.front == 'μ');
    }
    // struct with `alias this` to string: wrapped, not returned as-is
    {
        static struct Stringish
        {
            string s;
            alias s this;
        }

        auto orig = Stringish("\U0010fff8 foo ");
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == struct));
        static assert(!is(typeof(bcu) == Stringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == immutable char));
        // 244 == 0xF4, the first UTF-8 code unit of U+10FFF8
        assert(bcu.front == cast(char) 244);
    }
    // struct with `alias this` to wstring
    {
        static struct WStringish
        {
            wstring s;
            alias s this;
        }

        auto orig = WStringish("\U0010fff8 foo "w);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == struct));
        static assert(!is(typeof(bcu) == WStringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == immutable wchar));
        // 56319 == 0xDBFF, the high surrogate of U+10FFF8
        assert(bcu.front == cast(wchar) 56319);
    }
    // struct with `alias this` to dstring: converted to plain dstring
    {
        static struct DStringish
        {
            dstring s;
            alias s this;
        }

        auto orig = DStringish("\U0010fff8 foo "d);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == dstring));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == immutable dchar));
        // 1114104 == 0x10FFF8
        assert(bcu.front == cast(dchar) 1114104);
    }
    // `alias this` to a member function returning string
    {
        static struct FuncStringish
        {
            string str;
            string s() pure nothrow @nogc { return str; }
            alias s this;
        }

        auto orig = FuncStringish("\U0010fff8 foo ");
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == struct));
        static assert(!is(typeof(bcu) == FuncStringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == immutable char));
        assert(bcu.front == cast(char) 244);
    }
    // user-defined input range of char: returned unchanged
    {
        static struct Range
        {
            string data;
            bool empty() pure nothrow @nogc { return data.empty; }
            char front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
        }

        auto orig = Range("\U0010fff8 foo ");
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == Range));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == char));
        assert(bcu.front == cast(char) 244);
    }
    // user-defined input range of wchar: returned unchanged
    {
        static struct WRange
        {
            wstring data;
            bool empty() pure nothrow @nogc { return data.empty; }
            wchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
        }

        auto orig = WRange("\U0010fff8 foo "w);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == WRange));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == wchar));
        assert(bcu.front == 56319);
    }
    // user-defined input range of dchar: returned unchanged
    {
        static struct DRange
        {
            dstring data;
            bool empty() pure nothrow @nogc { return data.empty; }
            dchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
        }

        auto orig = DRange("\U0010fff8 foo "d);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == DRange));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == dchar));
        assert(bcu.front == 1114104);
    }
    // A type that is a range through its own members AND converts to
    // string: the range API wins (front comes from `data`, not from `s`).
    {
        static struct RangeAndStringish
        {
            bool empty() pure nothrow @nogc { return data.empty; }
            char front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }

            string data;
            string s;
            alias s this;
        }

        auto orig = RangeAndStringish("test.d", "other");
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == RangeAndStringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == char));
        assert(bcu.front == 't');
    }
    // same, wchar
    {
        static struct WRangeAndStringish
        {
            bool empty() pure nothrow @nogc { return data.empty; }
            wchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }

            wstring data;
            wstring s;
            alias s this;
        }

        auto orig = WRangeAndStringish("test.d"w, "other"w);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == WRangeAndStringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == wchar));
        assert(bcu.front == 't');
    }
    // same, dchar
    {
        static struct DRangeAndStringish
        {
            bool empty() pure nothrow @nogc { return data.empty; }
            dchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }

            dstring data;
            dstring s;
            alias s this;
        }

        auto orig = DRangeAndStringish("test.d"d, "other"d);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == DRangeAndStringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == dchar));
        assert(bcu.front == 't');
    }
    // enum with string base type: wrapped like a string
    {
        enum Enum : string { a = "test.d" }

        auto orig = Enum.a;
        auto bcu = orig.byCodeUnit();
        static assert(!is(typeof(bcu) == Enum));
        static assert(is(typeof(bcu) == struct));
        static assert(is(ElementType!(typeof(bcu)) == immutable char));
        assert(bcu.front == 't');
    }
    // enum with wstring base type
    {
        enum WEnum : wstring { a = "test.d"w }

        auto orig = WEnum.a;
        auto bcu = orig.byCodeUnit();
        static assert(!is(typeof(bcu) == WEnum));
        static assert(is(typeof(bcu) == struct));
        static assert(is(ElementType!(typeof(bcu)) == immutable wchar));
        assert(bcu.front == 't');
    }
    // enum with dstring base type: converted to plain dstring
    {
        enum DEnum : dstring { a = "test.d"d }

        auto orig = DEnum.a;
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == dstring));
        static assert(is(ElementType!(typeof(bcu)) == immutable dchar));
        assert(bcu.front == 't');
    }

    // Narrow strings get wrapped; dstrings pass through unchanged.
    static assert(!is(typeof(byCodeUnit("hello")) == string));
    static assert(!is(typeof(byCodeUnit("hello"w)) == wstring));
    static assert(is(typeof(byCodeUnit("hello"d)) == dstring));

    // Static arrays are rejected ...
    static assert(!__traits(compiles, byCodeUnit((char[5]).init)));
    static assert(!__traits(compiles, byCodeUnit((wchar[5]).init)));
    static assert(!__traits(compiles, byCodeUnit((dchar[5]).init)));

    enum SEnum : char[5] { a = "hello" }
    enum WSEnum : wchar[5] { a = "hello"w }
    enum DSEnum : dchar[5] { a = "hello"d }

    // ... and so are enums whose base type is a static array.
    static assert(!__traits(compiles, byCodeUnit(SEnum.a)));
    static assert(!__traits(compiles, byCodeUnit(WSEnum.a)));
    static assert(!__traits(compiles, byCodeUnit(DSEnum.a)));
}
3675
/****************************
 * Iterate an input range of characters by char, wchar, or dchar.
 * These aliases simply forward to $(LREF byUTF) with the
 * corresponding C argument: `byChar` is `byUTF!char`, `byWchar` is
 * `byUTF!wchar` and `byDchar` is `byUTF!dchar`.
 *
 * Params:
 *      r = input range of characters, or array of characters
 *
 * Returns:
 *      Whatever $(LREF byUTF) returns for the corresponding character type.
 */
alias byChar = byUTF!char;

/// Ditto
alias byWchar = byUTF!wchar;

/// Ditto
alias byDchar = byUTF!dchar;
3691
// byChar: char, wchar and dchar sources, replacement of invalid input, and
// forward-range behaviour.
@safe pure nothrow @nogc unittest
{
    // byChar applied to its own result yields the same code units
    {
        char[5] s;
        int i;
        foreach (c; "hello".byChar.byChar())
        {
            s[i++] = c;
        }
        assert(s == "hello");
    }
    // dchar -> UTF-8. The buffer size spells out the encoded lengths:
    // "hello" (5) + U+07FF (2) + U+D7FF (3) + U+10FFFF (4) and one
    // U+FFFD (3) for each of the two invalid values.
    {
        char[5+2+3+4+3+3] s;
        int i;
        dchar[10] a;
        a[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d;
        a[8] = 0xD800; // invalid
        a[9] = cast(dchar) 0x110000; // invalid
        foreach (c; a[].byChar())
        {
            s[i++] = c;
        }
        assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD");
    }
    // wchar source
    {
        auto r = "hello"w.byChar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    // dchar source
    {
        auto r = "hello"d.byChar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    // save gives an independent copy
    {
        auto r = "hello"d.byChar();
        assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
}
3738
// byWchar: dchar and char sources, replacement of invalid input, and
// forward-range behaviour.
@safe pure nothrow @nogc unittest
{
    {
        // dchar -> UTF-16: "hello" and the two BMP code points take one code
        // unit each, U+10FFFF takes a surrogate pair, and each of the two
        // invalid values becomes U+FFFD: 5 + 1 + 1 + 2 + 1 + 1 = 11.
        wchar[11] s;
        int i;
        dchar[10] a;
        a[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d;
        a[8] = 0xD800; // invalid
        a[9] = cast(dchar) 0x110000; // invalid
        foreach (c; a[].byWchar())
        {
            s[i++] = c;
        }
        assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"w);
    }

    // char source
    {
        auto r = "hello".byWchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    // dchar source
    {
        auto r = "hello"d.byWchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    // save gives an independent copy
    {
        auto r = "hello"d.byWchar();
        assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
}
3780
// byDchar: decoding from UTF-8 and UTF-16, replacement of invalid input,
// pass-through of dstrings, and forward-range behaviour.
@safe pure nothrow @nogc unittest
{
    // UTF-8 -> dchar: 5 ASCII characters plus four code points
    {
        dchar[9] s;
        int i;
        string a = "hello\u07FF\uD7FF\U00010000\U0010FFFF"; // 1,2,3,4 byte sequences
        foreach (c; a.byDchar())
        {
            s[i++] = c;
        }
        assert(s == "hello\u07FF\uD7FF\U00010000\U0010FFFF"d);
    }
    // invalid UTF-8 decodes to the replacement character; reading front
    // twice yields the same value
    {
        foreach (s; invalidUTFstrings!char())
        {
            auto r = s.byDchar();
            assert(!r.empty);
            assert(r.front == r.front);
            dchar c = r.front;
            assert(c == replacementDchar);
        }
    }
    {
        auto r = "hello".byDchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }

    // UTF-16 -> dchar
    {
        dchar[8] s;
        int i;
        wstring a = "hello\u07FF\uD7FF\U0010FFFF"w;
        foreach (c; a.byDchar())
        {
            s[i++] = c;
        }
        assert(s == "hello\u07FF\uD7FF\U0010FFFF"d);
    }
    // invalid UTF-16 decodes to the replacement character
    {
        foreach (s; invalidUTFstrings!wchar())
        {
            auto r = s.byDchar();
            assert(!r.empty);
            assert(r.front == r.front);
            dchar c = r.front;
            assert(c == replacementDchar);
        }
    }
    // a well-formed surrogate pair decodes to a single code point
    {
        wchar[2] ws;
        ws[0] = 0xD800;
        ws[1] = 0xDD00; // correct surrogate pair
        auto r = ws[].byDchar();
        assert(!r.empty);
        assert(r.front == r.front);
        dchar c = r.front;
        assert(c == '\U00010100');
    }
    {
        auto r = "hello"w.byDchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }

    // byDchar applied to a dstring, twice
    {
        dchar[5] s;
        int i;
        dstring a = "hello"d;
        foreach (c; a.byDchar.byDchar())
        {
            s[i++] = c;
        }
        assert(s == "hello"d);
    }
    // save gives an independent copy (char source)
    {
        auto r = "hello".byDchar();
        assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
    // save gives an independent copy (wchar source)
    {
        auto r = "hello"w.byDchar();
        assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
}
3874
// test pure, @safe, nothrow, @nogc correctness of byChar/byWchar/byDchar,
// which needs to support ranges with and without those attributes

// Positive case: with a plain dchar slice as the source, all three
// adaptors have to be usable from a pure @safe nothrow @nogc context.
// The loop bodies are intentionally empty; only compilation and iteration
// matter.
pure @safe nothrow @nogc unittest
{
    dchar[5] s = "hello"d;
    foreach (c; s[].byChar()) { }
    foreach (c; s[].byWchar()) { }
    foreach (c; s[].byDchar()) { }
}
3885
// Mutable module-level variable, present in unittest builds only. It is
// incremented by ImpureThrowingSystemRange.popFront in the unit test that
// follows, which makes that function access mutable global state.
version (unittest)
    int impureVariable;
3888
// Negative-attribute case: byChar/byWchar/byDchar must still compile when
// the source range's primitives carry no attributes at all.
@system unittest
{
    // A range whose popFront touches a module-level variable and throws,
    // and whose members carry no @safe/pure/nothrow/@nogc annotations.
    // `empty` always returns true, so the loops below never execute their
    // bodies.
    static struct ImpureThrowingSystemRange(Char)
    {
        @property bool empty() const { return true; }
        @property Char front() const { return Char.init; }
        void popFront()
        {
            impureVariable++;
            throw new Exception("only for testing nothrow");
        }
    }

    foreach (Char; AliasSeq!(char, wchar, dchar))
    {
        ImpureThrowingSystemRange!Char range;
        foreach (c; range.byChar()) { }
        foreach (c; range.byWchar()) { }
        foreach (c; range.byDchar()) { }
    }
}
3910
/****************************
 * Iterate an input range of characters by char type `C` by
 * encoding the elements of the range.
 *
 * UTF sequences that cannot be converted to the specified encoding are
 * replaced by U+FFFD per "5.22 Best Practice for U+FFFD Substitution"
 * of the Unicode Standard 6.2. Hence byUTF is not symmetric.
 * This algorithm is lazy, and does not allocate memory.
 * `@nogc`, `pure`-ity, `nothrow`, and `@safe`-ty are inferred from the
 * `r` parameter.
 *
 * Params:
 *      C = `char`, `wchar`, or `dchar`
 *      r = input range of characters, or array of characters
 *
 * Returns:
 *      A forward range if `R` is a range and not auto-decodable, as defined by
 *      $(REF isAutodecodableString, std, traits), and if the base range is
 *      also a forward range.
 *
 *      Or, if `R` is a range and it is auto-decodable and
 *      `is(ElementEncodingType!typeof(r) == C)`, then the range is passed
 *      to $(LREF byCodeUnit).
 *
 *      Otherwise, an input range of characters.
 */
template byUTF(C)
if (isSomeChar!C)
{
    // Qualified character types are forwarded to the unqualified
    // instantiation.
    static if (!is(Unqual!C == C))
        alias byUTF = byUTF!(Unqual!C);
    else:

    // Auto-decodable strings are first turned into a range of code units and
    // then handled by the overload below.
    auto ref byUTF(R)(R r)
        if (isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R))
    {
        return byUTF(r.byCodeUnit());
    }

    auto ref byUTF(R)(R r)
        if (!isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R))
    {
        alias RC = Unqual!(ElementEncodingType!R);

        static if (is(RC == C))
        {
            // Source and target code unit types match: nothing to transcode.
            return r.byCodeUnit();
        }
        else
        {
            // Transcoding range. One code point at a time is read from `r`
            // and encoded into `buf`; `buf[pos .. fill]` holds the code
            // units that have not been consumed yet.
            static struct Result
            {
                // Empty once the buffer is drained and the source is empty.
                @property bool empty()
                {
                    return pos == fill && r.empty;
                }

                @property auto front() scope // 'scope' required by call to decodeFront() below
                {
                    // Buffer drained: fetch and encode the next code point.
                    if (pos == fill)
                    {
                        pos = 0;
                        auto c = r.front;

                        if (c <= 0x7F)
                        {
                            // Values up to 0x7F are stored as a single code
                            // unit without going through decode/encode.
                            fill = 1;
                            r.popFront;
                            buf[pos] = cast(C) c;
                        }
                        else
                        {
                            static if (is(RC == dchar))
                            {
                                // A dchar source needs no decoding.
                                r.popFront;
                                dchar dc = c;
                            }
                            else
                                // decodeFront advances `r` itself.
                                dchar dc = () @trusted { return decodeFront!(Yes.useReplacementDchar)(r); }();
                            // encode's return value is stored as the number
                            // of valid code units in buf.
                            fill = cast(ushort) encode!(Yes.useReplacementDchar)(buf, dc);
                        }
                    }
                    return buf[pos];
                }

                void popFront()
                {
                    // Make sure the buffer is filled before stepping over a
                    // code unit.
                    if (pos == fill)
                        front;
                    ++pos;
                }

                static if (isForwardRange!R)
                {
                    @property auto save() return scope
                    /* `return scope` cannot be inferred because compiler does not
                     * track it backwards from assignment to local `ret`
                     */
                    {
                        auto ret = this;
                        ret.r = r.save;
                        return ret;
                    }
                }

            private:

                R r;
                // 4 chars, 2 wchars or 1 dchar; left uninitialized because
                // only buf[0 .. fill] is ever read.
                C[4 / C.sizeof] buf = void;
                ushort pos, fill;
            }

            return Result(r);
        }
    }
}
4026
///
@safe pure nothrow unittest
{
    import std.algorithm.comparison : equal;

    // hellö as a range of `char`s, which are UTF-8
    assert("hell\u00F6".byUTF!char().equal(['h', 'e', 'l', 'l', 0xC3, 0xB6]));

    // `wchar`s are able to hold the ö in a single element (UTF-16 code unit)
    assert("hell\u00F6".byUTF!wchar().equal(['h', 'e', 'l', 'l', 'ö']));

    // U+10437 is four code units in UTF-8, two in UTF-16, and one in UTF-32
    assert("\U00010437".byUTF!char().equal([0xF0, 0x90, 0x90, 0xB7]));
    assert("\U00010437".byUTF!wchar().equal([0xD801, 0xDC37]));
    assert("\U00010437".byUTF!dchar().equal([0x00010437]));
}
4043