1 // Written in the D programming language.
2 
3 /**
4 Classes and functions for handling and transcoding between various encodings.
5 
6 For cases where the _encoding is known at compile-time, functions are provided
7 for arbitrary _encoding and decoding of characters, arbitrary transcoding
8 between strings of different type, as well as validation and sanitization.
9 
10 Encodings currently supported are UTF-8, UTF-16, UTF-32, ASCII, ISO-8859-1
11 (also known as LATIN-1), ISO-8859-2 (LATIN-2), WINDOWS-1250 and WINDOWS-1252.
12 
13 $(SCRIPT inhibitQuickIndex = 1;)
14 $(BOOKTABLE,
15 $(TR $(TH Category) $(TH Functions))
16 $(TR $(TD Decode) $(TD
17     $(LREF codePoints)
18     $(LREF decode)
19     $(LREF decodeReverse)
20     $(LREF safeDecode)
21 ))
22 $(TR $(TD Conversion) $(TD
23     $(LREF codeUnits)
24     $(LREF sanitize)
25     $(LREF transcode)
26 ))
27 $(TR $(TD Classification) $(TD
28     $(LREF canEncode)
29     $(LREF isValid)
30     $(LREF isValidCodePoint)
31     $(LREF isValidCodeUnit)
32 ))
33 $(TR $(TD BOM) $(TD
34     $(LREF BOM)
35     $(LREF BOMSeq)
36     $(LREF getBOM)
37     $(LREF utfBOM)
38 ))
39 $(TR $(TD Length & Index) $(TD
40     $(LREF firstSequence)
41     $(LREF encodedLength)
42     $(LREF index)
43     $(LREF lastSequence)
44     $(LREF validLength)
45 ))
46 $(TR $(TD Encoding schemes) $(TD
47     $(LREF encodingName)
48     $(LREF EncodingScheme)
49     $(LREF EncodingSchemeASCII)
50     $(LREF EncodingSchemeLatin1)
51     $(LREF EncodingSchemeLatin2)
52     $(LREF EncodingSchemeUtf16Native)
53     $(LREF EncodingSchemeUtf32Native)
54     $(LREF EncodingSchemeUtf8)
55     $(LREF EncodingSchemeWindows1250)
56     $(LREF EncodingSchemeWindows1252)
57 ))
58 $(TR $(TD Representation) $(TD
59     $(LREF AsciiChar)
60     $(LREF AsciiString)
61     $(LREF Latin1Char)
62     $(LREF Latin1String)
63     $(LREF Latin2Char)
64     $(LREF Latin2String)
65     $(LREF Windows1250Char)
66     $(LREF Windows1250String)
67     $(LREF Windows1252Char)
68     $(LREF Windows1252String)
69 ))
70 $(TR $(TD Exceptions) $(TD
71     $(LREF INVALID_SEQUENCE)
72     $(LREF EncodingException)
73 ))
74 )
75 
76 For cases where the _encoding is not known at compile-time, but is
known at run-time, the abstract class $(LREF EncodingScheme)
and its subclasses are provided.  To construct a run-time encoder/decoder,
79 one does e.g.
80 
81 ----------------------------------------------------
82 auto e = EncodingScheme.create("utf-8");
83 ----------------------------------------------------
84 
85 This library supplies $(LREF EncodingScheme) subclasses for ASCII,
86 ISO-8859-1 (also known as LATIN-1), ISO-8859-2 (LATIN-2), WINDOWS-1250,
87 WINDOWS-1252, UTF-8, and (on little-endian architectures) UTF-16LE and
88 UTF-32LE; or (on big-endian architectures) UTF-16BE and UTF-32BE.
89 
90 This library provides a mechanism whereby other modules may add $(LREF
91 EncodingScheme) subclasses for any other _encoding.
92 
93 Copyright: Copyright Janice Caron 2008 - 2009.
94 License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
95 Authors:   Janice Caron
96 Source:    $(PHOBOSSRC std/_encoding.d)
97 */
98 /*
99          Copyright Janice Caron 2008 - 2009.
100 Distributed under the Boost Software License, Version 1.0.
101    (See accompanying file LICENSE_1_0.txt or copy at
102          http://www.boost.org/LICENSE_1_0.txt)
103 */
104 module std.encoding;
105 
106 import std.range.primitives;
107 import std.traits;
108 import std.typecons;
109 
// Module-level sanity test: checks UTF-8 validation and sanitization against
// tables of well-formed and malformed byte sequences, then round-trips every
// valid string through all UTF transcodings (forward and reverse), and finally
// spot-checks the single-byte encodings, encodedLength and encode-into-buffer.
@system unittest
{
    static ubyte[][] validStrings =
    [
        // Plain ASCII
        cast(ubyte[])"hello",

        // First possible sequence of a certain length
        [ 0x00 ],                       // U+00000000   one byte
        [ 0xC2, 0x80 ],                 // U+00000080   two bytes
        [ 0xE0, 0xA0, 0x80 ],           // U+00000800   three bytes
        [ 0xF0, 0x90, 0x80, 0x80 ],     // U+00010000   four bytes

        // Last possible sequence of a certain length
        [ 0x7F ],                       // U+0000007F   one byte
        [ 0xDF, 0xBF ],                 // U+000007FF   two bytes
        [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF   three bytes

        // Other boundary conditions
        [ 0xED, 0x9F, 0xBF ],
        // U+0000D7FF   Last character before surrogates
        [ 0xEE, 0x80, 0x80 ],
        // U+0000E000   First character after surrogates
        [ 0xEF, 0xBF, 0xBD ],
        // U+0000FFFD   Unicode replacement character
        [ 0xF4, 0x8F, 0xBF, 0xBF ],
        // U+0010FFFF   Very last character

        // Non-character code points
        /*  NOTE: These are legal in UTF, and may be converted from
            one UTF to another, however they do not represent Unicode
            characters. These code points have been reserved by
            Unicode as non-character code points. They are permissible
            for data exchange within an application, but they are
            not permitted to be used as characters. Since this module
            deals with UTF, and not with Unicode per se, we choose to
            accept them here. */
        // The byte sequences below are the three-byte encodings of the
        // labelled code points (0xDF 0xBE / 0xDF 0xBF would be U+07BE / U+07FF).
        [ 0xEF, 0xBF, 0xBE ],           // U+0000FFFE
        [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF
    ];

    static ubyte[][] invalidStrings =
    [
        // First possible sequence of a certain length, but greater
        // than U+10FFFF
        [ 0xF8, 0x88, 0x80, 0x80, 0x80 ],           // U+00200000   five bytes
        [ 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80 ],     // U+04000000   six bytes

        // Last possible sequence of a certain length, but greater than U+10FFFF
        [ 0xF7, 0xBF, 0xBF, 0xBF ],                 // U+001FFFFF   four bytes
        [ 0xFB, 0xBF, 0xBF, 0xBF, 0xBF ],           // U+03FFFFFF   five bytes
        [ 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+7FFFFFFF   six bytes

        // Other boundary conditions
        [ 0xF4, 0x90, 0x80, 0x80 ],                 // U+00110000
                                                    // First code
                                                    // point after
                                                    // last character

        // Unexpected continuation bytes
        [ 0x80 ],
        [ 0xBF ],
        [ 0x20, 0x80, 0x20 ],
        [ 0x20, 0xBF, 0x20 ],
        [ 0x80, 0x9F, 0xA0 ],

        // Lonely start bytes
        [ 0xC0 ],
        [ 0xCF ],
        [ 0x20, 0xC0, 0x20 ],
        [ 0x20, 0xCF, 0x20 ],
        [ 0xD0 ],
        [ 0xDF ],
        [ 0x20, 0xD0, 0x20 ],
        [ 0x20, 0xDF, 0x20 ],
        [ 0xE0 ],
        [ 0xEF ],
        [ 0x20, 0xE0, 0x20 ],
        [ 0x20, 0xEF, 0x20 ],
        [ 0xF0 ],
        [ 0xF1 ],
        [ 0xF2 ],
        [ 0xF3 ],
        [ 0xF4 ],
        [ 0xF5 ],   // If this were legal it would start a character > U+10FFFF
        [ 0xF6 ],   // If this were legal it would start a character > U+10FFFF
        [ 0xF7 ],   // If this were legal it would start a character > U+10FFFF

        [ 0xEF, 0xBF ],             // Three byte sequence with third byte missing
        [ 0xF7, 0xBF, 0xBF ],       // Four byte sequence with fourth byte missing
        [ 0xEF, 0xBF, 0xF7, 0xBF, 0xBF ],   // Concatenation of the above

        // Impossible bytes
        [ 0xF8 ],
        [ 0xF9 ],
        [ 0xFA ],
        [ 0xFB ],
        [ 0xFC ],
        [ 0xFD ],
        [ 0xFE ],
        [ 0xFF ],
        [ 0x20, 0xF8, 0x20 ],
        [ 0x20, 0xF9, 0x20 ],
        [ 0x20, 0xFA, 0x20 ],
        [ 0x20, 0xFB, 0x20 ],
        [ 0x20, 0xFC, 0x20 ],
        [ 0x20, 0xFD, 0x20 ],
        [ 0x20, 0xFE, 0x20 ],
        [ 0x20, 0xFF, 0x20 ],

        // Overlong sequences, all representing U+002F
        /*  With a safe UTF-8 decoder, all of the following five overlong
            representations of the ASCII character slash ("/") should be
            rejected like a malformed UTF-8 sequence */
        [ 0xC0, 0xAF ],
        [ 0xE0, 0x80, 0xAF ],
        [ 0xF0, 0x80, 0x80, 0xAF ],
        [ 0xF8, 0x80, 0x80, 0x80, 0xAF ],
        [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0xAF ],

        // Maximum overlong sequences
        /*  Below you see the highest Unicode value that is still resulting in
            an overlong sequence if represented with the given number of bytes.
            This is a boundary test for safe UTF-8 decoders. All five
            characters should be rejected like malformed UTF-8 sequences. */
        [ 0xC1, 0xBF ],                             // U+0000007F
        [ 0xE0, 0x9F, 0xBF ],                       // U+000007FF
        [ 0xF0, 0x8F, 0xBF, 0xBF ],                 // U+0000FFFF
        [ 0xF8, 0x87, 0xBF, 0xBF, 0xBF ],           // U+001FFFFF
        [ 0xFC, 0x83, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+03FFFFFF

        // Overlong representation of the NUL character
        /*  The following five sequences should also be rejected like malformed
            UTF-8 sequences and should not be treated like the ASCII NUL
            character. */
        [ 0xC0, 0x80 ],
        [ 0xE0, 0x80, 0x80 ],
        [ 0xF0, 0x80, 0x80, 0x80 ],
        [ 0xF8, 0x80, 0x80, 0x80, 0x80 ],
        [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0x80 ],

        // Illegal code positions
        /*  The following UTF-8 sequences should be rejected like malformed
            sequences, because they never represent valid ISO 10646 characters
            and a UTF-8 decoder that accepts them might introduce security
            problems comparable to overlong UTF-8 sequences. */
        [ 0xED, 0xA0, 0x80 ],       // U+D800
        [ 0xED, 0xAD, 0xBF ],       // U+DB7F
        [ 0xED, 0xAE, 0x80 ],       // U+DB80
        [ 0xED, 0xAF, 0xBF ],       // U+DBFF
        [ 0xED, 0xB0, 0x80 ],       // U+DC00
        [ 0xED, 0xBE, 0x80 ],       // U+DF80
        [ 0xED, 0xBF, 0xBF ],       // U+DFFF
    ];

    // Expected result of sanitize() for each entry of invalidStrings, in the
    // same order; the two tables must stay the same length.
    static string[] sanitizedStrings =
    [
        "\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
        " \uFFFD ","\uFFFD\uFFFD\uFFFD","\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ",
        "\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ","\uFFFD","\uFFFD"," \uFFFD ",
        " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
        " \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD ",
        " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
    ];

    // Make sure everything that should be valid, is
    foreach (a; validStrings)
    {
        string s = cast(string) a;
        assert(isValid(s),"Failed to validate: "~makeReadable(s));
    }

    // Make sure everything that shouldn't be valid, isn't
    foreach (a; invalidStrings)
    {
        string s = cast(string) a;
        assert(!isValid(s),"Incorrectly validated: "~makeReadable(s));
    }

    // Make sure we can sanitize everything bad
    assert(invalidStrings.length == sanitizedStrings.length);
    foreach (i, invalid; invalidStrings)
    {
        string s = cast(string) invalid;
        string t = sanitize(s);
        assert(isValid(t));
        assert(t == sanitizedStrings[i]);
        // Sanitized output is valid, so feed it to the transcoding checks below
        ubyte[] u = cast(ubyte[]) t;
        validStrings ~= u;
    }

    // Make sure all transcodings work in both directions, using both forward
    // and reverse iteration
    foreach (a; validStrings)
    {
        string s = cast(string) a;
        string s2;
        wstring ws, ws2;
        dstring ds, ds2;

        transcode(s,ws);
        assert(isValid(ws));
        transcode(ws,s2);
        assert(s == s2);

        transcode(s,ds);
        assert(isValid(ds));
        transcode(ds,s2);
        assert(s == s2);

        transcode(ws,s);
        assert(isValid(s));
        transcode(s,ws2);
        assert(ws == ws2);

        transcode(ws,ds);
        assert(isValid(ds));
        transcode(ds,ws2);
        assert(ws == ws2);

        transcode(ds,s);
        assert(isValid(s));
        transcode(s,ds2);
        assert(ds == ds2);

        transcode(ds,ws);
        assert(isValid(ws));
        transcode(ws,ds2);
        assert(ds == ds2);

        transcodeReverse(s,ws);
        assert(isValid(ws));
        transcodeReverse(ws,s2);
        assert(s == s2);

        transcodeReverse(s,ds);
        assert(isValid(ds));
        transcodeReverse(ds,s2);
        assert(s == s2);

        transcodeReverse(ws,s);
        assert(isValid(s));
        transcodeReverse(s,ws2);
        assert(ws == ws2);

        transcodeReverse(ws,ds);
        assert(isValid(ds));
        transcodeReverse(ds,ws2);
        assert(ws == ws2);

        transcodeReverse(ds,s);
        assert(isValid(s));
        transcodeReverse(s,ds2);
        assert(ds == ds2);

        transcodeReverse(ds,ws);
        assert(isValid(ws));
        transcodeReverse(ws,ds2);
        assert(ds == ds2);
    }

    // Make sure the non-UTF encodings work too
    {
        auto s = "\u20AC100";
        Windows1252String t;
        transcode(s,t);
        assert(t == cast(Windows1252Char[])[0x80, '1', '0', '0']);
        string u;
        transcode(s,u);
        assert(s == u);
        Latin1String v;
        transcode(s,v);
        assert(cast(string) v == "?100");
        AsciiString w;
        transcode(v,w);
        assert(cast(string) w == "?100");
        s = "\u017Dlu\u0165ou\u010Dk\u00FD k\u016F\u0148";
        Latin2String x;
        transcode(s,x);
        assert(x == cast(Latin2Char[])[0xae, 'l', 'u', 0xbb, 'o', 'u', 0xe8, 'k', 0xfd, ' ', 'k', 0xf9, 0xf2]);
        Windows1250String y;
        transcode(s,y);
        assert(y == cast(Windows1250Char[])[0x8e, 'l', 'u', 0x9d, 'o', 'u', 0xe8, 'k', 0xfd, ' ', 'k', 0xf9, 0xf2]);
    }

    // Make sure we can count properly
    {
        assert(encodedLength!(char)('A') == 1);
        assert(encodedLength!(char)('\u00E3') == 2);
        assert(encodedLength!(char)('\u2028') == 3);
        assert(encodedLength!(char)('\U0010FFF0') == 4);
        assert(encodedLength!(wchar)('A') == 1);
        assert(encodedLength!(wchar)('\U0010FFF0') == 2);
    }

    // Make sure we can write into mutable arrays
    {
        char[4] buffer;
        auto n = encode(cast(dchar)'\u00E3',buffer);
        assert(n == 2);
        assert(buffer[0] == 0xC3);
        assert(buffer[1] == 0xA3);
    }
}
419 
420 //=============================================================================
421 
422 /** Special value returned by $(D safeDecode) */
423 enum dchar INVALID_SEQUENCE = cast(dchar) 0xFFFFFFFF;
424 
// Mixin template shared by every encoder instance. The scope that mixes it in
// must supply the code unit type E plus the primitive templates
// encodeViaWrite, skipViaRead, decodeViaRead, safeDecodeViaRead and
// decodeReverseViaRead; this template wires those primitives to concrete
// sources and sinks and produces the encode/skip/decode entry points.
template EncoderFunctions()
{
    // ---- Readers -----------------------------------------------------------
    // Both readers operate on a slice named `s` found in the scope they are
    // mixed into.

    // Consumes code units from the front of `s`.
    template ReadFromString()
    {
        @property bool canRead()
        {
            return s.length != 0;
        }

        E peek() @safe pure @nogc nothrow
        {
            return s[0];
        }

        E read() @safe pure @nogc nothrow
        {
            E unit = s[0];
            s = s[1 .. $];
            return unit;
        }
    }

    // Consumes code units from the back of `s`.
    template ReverseReadFromString()
    {
        @property bool canRead()
        {
            return s.length != 0;
        }

        E peek() @safe pure @nogc nothrow
        {
            return s[$ - 1];
        }

        E read() @safe pure @nogc nothrow
        {
            E unit = s[$ - 1];
            s = s[0 .. $ - 1];
            return unit;
        }
    }

    // ---- Writers -----------------------------------------------------------

    // Appends each written code unit to an owned, growing array `s`.
    template WriteToString()
    {
        E[] s;

        void write(E c) @safe pure nothrow
        {
            s ~= c;
        }
    }

    // Stores into the first slot of `array` (from the enclosing scope) and
    // then advances the slice past it.
    template WriteToArray()
    {
        void write(E c) @safe pure @nogc nothrow
        {
            array[0] = c;
            array = array[1 .. $];
        }
    }

    // Hands each written code unit to the delegate `dg` from the enclosing
    // scope.
    template WriteToDelegate()
    {
        void write(E c)
        {
            dg(c);
        }
    }

    // ---- Adapters from the per-encoding primitives to uniform names ---------

    template EncodeViaWrite()
    {
        mixin encodeViaWrite;

        void encode(dchar c)
        {
            encodeViaWrite(c);
        }
    }

    template SkipViaRead()
    {
        mixin skipViaRead;

        void skip() @safe pure @nogc nothrow
        {
            skipViaRead();
        }
    }

    template DecodeViaRead()
    {
        mixin decodeViaRead;

        dchar decode() @safe pure @nogc nothrow
        {
            return decodeViaRead();
        }
    }

    template SafeDecodeViaRead()
    {
        mixin safeDecodeViaRead;

        dchar safeDecode() @safe pure @nogc nothrow
        {
            return safeDecodeViaRead();
        }
    }

    template DecodeReverseViaRead()
    {
        mixin decodeReverseViaRead;

        dchar decodeReverse() @safe pure @nogc nothrow
        {
            return decodeReverseViaRead();
        }
    }

    // ---- Encoders: one writer paired with the encode adapter ----------------

    template EncodeToString()
    {
        mixin WriteToString;
        mixin EncodeViaWrite;
    }

    template EncodeToArray()
    {
        mixin WriteToArray;
        mixin EncodeViaWrite;
    }

    template EncodeToDelegate()
    {
        mixin WriteToDelegate;
        mixin EncodeViaWrite;
    }

    // ---- Decoders: one reader paired with a skip/decode adapter -------------

    template SkipFromString()
    {
        mixin ReadFromString;
        mixin SkipViaRead;
    }

    template DecodeFromString()
    {
        mixin ReadFromString;
        mixin DecodeViaRead;
    }

    template SafeDecodeFromString()
    {
        mixin ReadFromString;
        mixin SafeDecodeViaRead;
    }

    template DecodeReverseFromString()
    {
        mixin ReverseReadFromString;
        mixin DecodeReverseViaRead;
    }

    //=========================================================================

    // Entry points ultimately exposed to the user

    // Encodes c and returns the code units as a freshly built array.
    E[] encode(dchar c) @safe pure nothrow
    {
        mixin EncodeToString sink;
        sink.encode(c);
        return sink.s;
    }

    // Encodes c into the front of array; array is advanced past what was
    // written.
    void encode(dchar c, ref E[] array) @safe pure nothrow
    {
        mixin EncodeToArray sink;
        sink.encode(c);
    }

    // Encodes c, passing every resulting code unit to dg.
    void encode(dchar c, void delegate(E) dg)
    {
        mixin EncodeToDelegate sink;
        sink.encode(c);
    }

    // Advances s using the encoding's skipViaRead primitive.
    void skip(ref const(E)[] s) @safe pure nothrow
    {
        mixin SkipFromString source;
        source.skip();
    }

    // Decodes from the front of s, consuming the code units that were read.
    dchar decode(S)(ref S s)
    {
        mixin DecodeFromString source;
        return source.decode();
    }

    // Like decode, but routed through the encoding's safeDecodeViaRead
    // primitive.
    dchar safeDecode(S)(ref S s)
    {
        mixin SafeDecodeFromString source;
        return source.safeDecode();
    }

    // Decodes from the back of s, consuming the code units that were read.
    dchar decodeReverse(ref const(E)[] s) @safe pure nothrow
    {
        mixin DecodeReverseFromString source;
        return source.decodeReverse();
    }
}
586 
587 //=========================================================================
588 
// foreach / foreach_reverse adapter that walks an encoded string one code
// point at a time. Iterating consumes the stored slice `s`: each step hands it
// to decode / decodeReverse by name and relies on that call shortening it.
struct CodePoints(E)
{
    const(E)[] s;

    // The string must already be valid in its encoding (checked by contract).
    this(const(E)[] s)
    in
    {
        assert(isValid(s));
    }
    body
    {
        this.s = s;
    }

    // foreach (dchar c; ...)
    int opApply(scope int delegate(ref dchar) dg)
    {
        while (s.length != 0)
        {
            dchar codePoint = decode(s);
            immutable status = dg(codePoint);
            if (status != 0)
                return status;
        }
        return 0;
    }

    // foreach (size_t i, dchar c; ...) -- i is the code unit offset at which
    // the current code point starts.
    int opApply(scope int delegate(ref size_t, ref dchar) dg)
    {
        size_t offset = 0;
        while (s.length != 0)
        {
            immutable remainingBefore = s.length;
            dchar codePoint = decode(s);
            // Hand out a copy so the delegate cannot corrupt our running offset
            size_t reported = offset;
            immutable status = dg(reported, codePoint);
            if (status != 0)
                return status;
            offset += remainingBefore - s.length;
        }
        return 0;
    }

    // foreach_reverse (dchar c; ...)
    int opApplyReverse(scope int delegate(ref dchar) dg)
    {
        while (s.length != 0)
        {
            dchar codePoint = decodeReverse(s);
            immutable status = dg(codePoint);
            if (status != 0)
                return status;
        }
        return 0;
    }

    // foreach_reverse (size_t i, dchar c; ...) -- i is the length of what
    // remains in front of the code point just decoded, i.e. its start offset.
    int opApplyReverse(scope int delegate(ref size_t, ref dchar) dg)
    {
        while (s.length != 0)
        {
            dchar codePoint = decodeReverse(s);
            size_t start = s.length;
            immutable status = dg(start, codePoint);
            if (status != 0)
                return status;
        }
        return 0;
    }
}
656 
// foreach / foreach_reverse adapter over the code units that encode a single
// code point in the encoding whose code unit type is E.
struct CodeUnits(E)
{
    E[] s;

    // Encodes d up front; d must be a valid code point (checked by contract).
    this(dchar d)
    in
    {
        assert(isValidCodePoint(d));
    }
    body
    {
        s = encode!(E)(d);
    }

    // foreach (E c; ...) -- the delegate receives a copy of each code unit.
    int opApply(scope int delegate(ref E) dg)
    {
        foreach (E unit; s)
        {
            immutable status = dg(unit);
            if (status != 0)
                return status;
        }
        return 0;
    }

    // foreach_reverse (E c; ...) -- same as above, last code unit first.
    int opApplyReverse(scope int delegate(ref E) dg)
    {
        foreach_reverse (E unit; s)
        {
            immutable status = dg(unit);
            if (status != 0)
                return status;
        }
        return 0;
    }
}
693 
694 //=============================================================================
695 
// Catch-all: any code unit type without a dedicated specialization ends up
// here and is rejected at compile time.
template EncoderInstance(E)
{
    static assert(false, "Cannot instantiate EncoderInstance for type " ~ E.stringof);
}
701 
// Shared implementation for table-driven single-byte encodings. The scope
// that mixes this in must provide:
//   E, EString      - code unit type and string type
//   m_charMapStart,
//   m_charMapEnd    - inclusive window of code units translated via charMap
//   charMap         - code point for each code unit in that window
//                     (0xFFFD marks an unassigned code unit)
//   bstMap          - (code point, code unit) pairs laid out as an implicit
//                     binary search tree: children of node i are 2i+1 and 2i+2
private template GenericEncoder()
{
    bool canEncode(dchar c) @safe pure @nogc nothrow
    {
        // Below 0x100 and outside the mapped window: encodes as itself
        immutable bool identity = c < m_charMapStart || (c > m_charMapEnd && c < 0x100);
        if (identity) return true;
        if (c >= 0xFFFD) return false;

        for (auto node = 0; node < bstMap.length; )
        {
            if (bstMap[node][0] == c) return true;
            // descend left when the stored key is larger, right otherwise
            node = bstMap[node][0] > c ? 2 * node + 1 : 2 * node + 2;
        }

        return false;
    }

    bool isValidCodeUnit(E c) @safe pure @nogc nothrow
    {
        // Units outside the window are always valid; inside it, only those
        // not marked 0xFFFD in charMap.
        return c < m_charMapStart
            || c > m_charMapEnd
            || charMap[c - m_charMapStart] != 0xFFFD;
    }

    // Every encodable code point occupies exactly one code unit.
    size_t encodedLength(dchar c) @safe pure @nogc nothrow
    in
    {
        assert(canEncode(c));
    }
    body
    {
        return 1;
    }

    // Writes the single code unit for c, or '?' when c has no representation.
    void encodeViaWrite()(dchar c)
    {
        if (c < m_charMapStart || (c > m_charMapEnd && c < 0x100))
        {
            // identity-mapped: emit unchanged
            write(cast(E) c);
            return;
        }

        if (c < 0xFFFD)
        {
            for (auto node = 0; node < bstMap.length; )
            {
                if (bstMap[node][0] == c)
                {
                    write(cast(E) bstMap[node][1]);
                    return;
                }
                node = bstMap[node][0] > c ? 2 * node + 1 : 2 * node + 2;
            }
        }

        // Not representable in this encoding
        c = '?';
        write(cast(E) c);
    }

    void skipViaRead()()
    {
        read();
    }

    // Translates one code unit: through charMap inside the window, unchanged
    // outside it.
    dchar decodeViaRead()()
    {
        E unit = read();
        immutable bool mapped = unit >= m_charMapStart && unit <= m_charMapEnd;
        return mapped ? charMap[unit - m_charMapStart] : unit;
    }

    // As decodeViaRead, but an unassigned code unit (charMap entry 0xFFFD)
    // yields INVALID_SEQUENCE.
    dchar safeDecodeViaRead()()
    {
        immutable E unit = read();
        immutable decoded = (unit >= m_charMapStart && unit <= m_charMapEnd)
            ? charMap[unit - m_charMapStart]
            : unit;
        return decoded == 0xFFFD ? INVALID_SEQUENCE : decoded;
    }

    // Same translation as decodeViaRead; read() is whatever reader has been
    // mixed in alongside this function.
    dchar decodeReverseViaRead()()
    {
        E unit = read();
        immutable bool mapped = unit >= m_charMapStart && unit <= m_charMapEnd;
        return mapped ? charMap[unit - m_charMapStart] : unit;
    }

    @property EString replacementSequence() @safe pure @nogc nothrow
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}
787 
788 //=============================================================================
789 //          ASCII
790 //=============================================================================
791 
792 /** Defines various character sets. */
793 enum AsciiChar : ubyte { init }
794 /// Ditto
795 alias AsciiString = immutable(AsciiChar)[];
796 
// Encoder specialization for ASCII: code points 0x00-0x7F map one-to-one onto
// code units; anything else is written as '?'.
template EncoderInstance(CharType : AsciiChar)
{
    alias E = AsciiChar;
    alias EString = AsciiString;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "ASCII";
    }

    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return c <= 0x7F;
    }

    bool isValidCodeUnit(AsciiChar c) @safe pure nothrow @nogc
    {
        return c <= 0x7F;
    }

    // Always one code unit per encodable code point.
    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    body
    {
        return 1;
    }

    // Writes c (or '?' when c is not encodable) to the sink r.
    void encodeX(Range)(dchar c, Range r)
    {
        immutable dchar unit = canEncode(c) ? c : '?';
        r.write(cast(AsciiChar) unit);
    }

    // Writes c (or '?' when c is not encodable) through the mixed-in write().
    void encodeViaWrite()(dchar c)
    {
        immutable dchar unit = canEncode(c) ? c : '?';
        write(cast(AsciiChar) unit);
    }

    void skipViaRead()()
    {
        read();
    }

    // No validation: the code unit is returned as the code point.
    dchar decodeViaRead()()
    {
        return read();
    }

    // Code units that are not encodable ASCII yield INVALID_SEQUENCE.
    dchar safeDecodeViaRead()()
    {
        immutable unit = read();
        if (!canEncode(unit))
            return INVALID_SEQUENCE;
        return unit;
    }

    dchar decodeReverseViaRead()()
    {
        return read();
    }

    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}
867 
868 //=============================================================================
869 //          ISO-8859-1
870 //=============================================================================
871 
/** Defines a Latin1-encoded character. */
873 enum Latin1Char : ubyte { init }
/**
Defines a Latin1-encoded string (as an array of $(D
immutable(Latin1Char))).
 */
878 alias Latin1String = immutable(Latin1Char)[];
879 
// Encoder specialization for ISO-8859-1: code points 0x00-0xFF map one-to-one
// onto code units; anything else is written as '?'.
template EncoderInstance(CharType : Latin1Char)
{
    alias E = Latin1Char;
    alias EString = Latin1String;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "ISO-8859-1";
    }

    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return c <= 0xFF;
    }

    // Every code unit value is accepted.
    bool isValidCodeUnit(Latin1Char c) @safe pure nothrow @nogc
    {
        return true;
    }

    // Always one code unit per encodable code point.
    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    body
    {
        return 1;
    }

    // Writes c (or '?' when c is not encodable) through the mixed-in write().
    void encodeViaWrite()(dchar c)
    {
        immutable dchar unit = canEncode(c) ? c : '?';
        write(cast(Latin1Char) unit);
    }

    void skipViaRead()()
    {
        read();
    }

    // The code unit is returned as the code point.
    dchar decodeViaRead()()
    {
        return read();
    }

    // Identical to decodeViaRead: no code unit is rejected here.
    dchar safeDecodeViaRead()()
    {
        return read();
    }

    dchar decodeReverseViaRead()()
    {
        return read();
    }

    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}
943 
944 //=============================================================================
945 //          ISO-8859-2
946 //=============================================================================
947 
948 /// Defines a Latin2-encoded character.
949 enum Latin2Char : ubyte { init }
950 
/**
 * Defines a Latin2-encoded string (as an array of $(D
 * immutable(Latin2Char))).
 */
955 alias Latin2String = immutable(Latin2Char)[];
956 
// Encoder for ISO-8859-2 (Latin-2). This template only supplies the type
// aliases, the encoding name and the two lookup tables; all encode/decode
// logic comes from the GenericEncoder mixin at the bottom (defined elsewhere).
private template EncoderInstance(CharType : Latin2Char)
{
    import std.typecons : Tuple, tuple;

    alias E = Latin2Char;
    alias EString = Latin2String;

    // Name of this encoding.
    @property string encodingName() @safe pure nothrow @nogc
    {
        return "ISO-8859-2";
    }

    // First and last code unit covered by charMap (0xA1 .. 0xFF, 95 entries).
    private static immutable dchar m_charMapStart = 0xa1;
    private static immutable dchar m_charMapEnd = 0xff;

    // Code unit -> code point table: entry i is the code point for code unit
    // m_charMapStart + i (e.g. entry 0 is U+0104, which bstMap pairs with 0xA1).
    private immutable wstring charMap =
        "\u0104\u02D8\u0141\u00A4\u013D\u015A\u00A7\u00A8"~
        "\u0160\u015E\u0164\u0179\u00AD\u017D\u017B\u00B0"~
        "\u0105\u02DB\u0142\u00B4\u013E\u015B\u02C7\u00B8"~
        "\u0161\u015F\u0165\u017A\u02DD\u017E\u017C\u0154"~
        "\u00C1\u00C2\u0102\u00C4\u0139\u0106\u00C7\u010C"~
        "\u00C9\u0118\u00CB\u011A\u00CD\u00CE\u010E\u0110"~
        "\u0143\u0147\u00D3\u00D4\u0150\u00D6\u00D7\u0158"~
        "\u016E\u00DA\u0170\u00DC\u00DD\u0162\u00DF\u0155"~
        "\u00E1\u00E2\u0103\u00E4\u013A\u0107\u00E7\u010D"~
        "\u00E9\u0119\u00EB\u011B\u00ED\u00EE\u010F\u0111"~
        "\u0144\u0148\u00F3\u00F4\u0151\u00F6\u00F7\u0159"~
        "\u016F\u00FA\u0171\u00FC\u00FD\u0163\u02D9";

    // Reverse table of (code point, code unit) pairs.
    // NOTE(review): the name and the element order (children of index i at
    // 2i+1 / 2i+2) suggest an implicit binary search tree searched by
    // GenericEncoder - confirm there before reordering any entry.
    private immutable Tuple!(wchar, char)[] bstMap = [
        tuple('\u0148','\xF2'), tuple('\u00F3','\xF3'), tuple('\u0165','\xBB'),
        tuple('\u00D3','\xD3'), tuple('\u010F','\xEF'), tuple('\u015B','\xB6'),
        tuple('\u017C','\xBF'), tuple('\u00C1','\xC1'), tuple('\u00E1','\xE1'),
        tuple('\u0103','\xE3'), tuple('\u013A','\xE5'), tuple('\u0155','\xE0'),
        tuple('\u0161','\xB9'), tuple('\u0171','\xFB'), tuple('\u02D8','\xA2'),
        tuple('\u00AD','\xAD'), tuple('\u00C9','\xC9'), tuple('\u00DA','\xDA'),
        tuple('\u00E9','\xE9'), tuple('\u00FA','\xFA'), tuple('\u0107','\xE6'),
        tuple('\u0119','\xEA'), tuple('\u0142','\xB3'), tuple('\u0151','\xF5'),
        tuple('\u0159','\xF8'), tuple('\u015F','\xBA'), tuple('\u0163','\xFE'),
        tuple('\u016F','\xF9'), tuple('\u017A','\xBC'), tuple('\u017E','\xBE'),
        tuple('\u02DB','\xB2'), tuple('\u00A7','\xA7'), tuple('\u00B4','\xB4'),
        tuple('\u00C4','\xC4'), tuple('\u00CD','\xCD'), tuple('\u00D6','\xD6'),
        tuple('\u00DD','\xDD'), tuple('\u00E4','\xE4'), tuple('\u00ED','\xED'),
        tuple('\u00F6','\xF6'), tuple('\u00FD','\xFD'), tuple('\u0105','\xB1'),
        tuple('\u010D','\xE8'), tuple('\u0111','\xF0'), tuple('\u011B','\xEC'),
        tuple('\u013E','\xB5'), tuple('\u0144','\xF1'), tuple('\u0150','\xD5'),
        tuple('\u0154','\xC0'), tuple('\u0158','\xD8'), tuple('\u015A','\xA6'),
        tuple('\u015E','\xAA'), tuple('\u0160','\xA9'), tuple('\u0162','\xDE'),
        tuple('\u0164','\xAB'), tuple('\u016E','\xD9'), tuple('\u0170','\xDB'),
        tuple('\u0179','\xAC'), tuple('\u017B','\xAF'), tuple('\u017D','\xAE'),
        tuple('\u02C7','\xB7'), tuple('\u02D9','\xFF'), tuple('\u02DD','\xBD'),
        tuple('\u00A4','\xA4'), tuple('\u00A8','\xA8'), tuple('\u00B0','\xB0'),
        tuple('\u00B8','\xB8'), tuple('\u00C2','\xC2'), tuple('\u00C7','\xC7'),
        tuple('\u00CB','\xCB'), tuple('\u00CE','\xCE'), tuple('\u00D4','\xD4'),
        tuple('\u00D7','\xD7'), tuple('\u00DC','\xDC'), tuple('\u00DF','\xDF'),
        tuple('\u00E2','\xE2'), tuple('\u00E7','\xE7'), tuple('\u00EB','\xEB'),
        tuple('\u00EE','\xEE'), tuple('\u00F4','\xF4'), tuple('\u00F7','\xF7'),
        tuple('\u00FC','\xFC'), tuple('\u0102','\xC3'), tuple('\u0104','\xA1'),
        tuple('\u0106','\xC6'), tuple('\u010C','\xC8'), tuple('\u010E','\xCF'),
        tuple('\u0110','\xD0'), tuple('\u0118','\xCA'), tuple('\u011A','\xCC'),
        tuple('\u0139','\xC5'), tuple('\u013D','\xA5'), tuple('\u0141','\xA3'),
        tuple('\u0143','\xD1'), tuple('\u0147','\xD2')
    ];

    // Supplies the actual encoder functions on top of the tables above.
    mixin GenericEncoder!();
}
1023 
1024 //=============================================================================
1025 //          WINDOWS-1250
1026 //=============================================================================
1027 
1028 /// Defines a Windows1250-encoded character.
1029 enum Windows1250Char : ubyte { init }
1030 
1031 /**
 * Defines a Windows1250-encoded string (as an array of $(D
 * immutable(Windows1250Char))).
1034  */
1035 alias Windows1250String = immutable(Windows1250Char)[];
1036 
// Encoder for WINDOWS-1250. This template only supplies the type aliases,
// the encoding name and the two lookup tables; all encode/decode logic comes
// from the GenericEncoder mixin at the bottom (defined elsewhere).
private template EncoderInstance(CharType : Windows1250Char)
{
    import std.typecons : Tuple, tuple;

    alias E = Windows1250Char;
    alias EString = Windows1250String;

    // Name of this encoding.
    @property string encodingName() @safe pure nothrow @nogc
    {
        return "windows-1250";
    }

    // First and last code unit covered by charMap (0x80 .. 0xFF, 128 entries).
    private static immutable dchar m_charMapStart = 0x80;
    private static immutable dchar m_charMapEnd = 0xff;

    // Code unit -> code point table: entry i is the code point for code unit
    // m_charMapStart + i. \uFFFD fills the slots for code units that have no
    // assignment (0x81, 0x83, 0x88, 0x90, 0x98); it has no entry in bstMap.
    private immutable wstring charMap =
        "\u20AC\uFFFD\u201A\uFFFD\u201E\u2026\u2020\u2021"~
        "\uFFFD\u2030\u0160\u2039\u015A\u0164\u017D\u0179"~
        "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2013\u2014"~
        "\uFFFD\u2122\u0161\u203A\u015B\u0165\u017E\u017A"~
        "\u00A0\u02C7\u02D8\u0141\u00A4\u0104\u00A6\u00A7"~
        "\u00A8\u00A9\u015E\u00AB\u00AC\u00AD\u00AE\u017B"~
        "\u00B0\u00B1\u02DB\u0142\u00B4\u00B5\u00B6\u00B7"~
        "\u00B8\u0105\u015F\u00BB\u013D\u02DD\u013E\u017C"~
        "\u0154\u00C1\u00C2\u0102\u00C4\u0139\u0106\u00C7"~
        "\u010C\u00C9\u0118\u00CB\u011A\u00CD\u00CE\u010E"~
        "\u0110\u0143\u0147\u00D3\u00D4\u0150\u00D6\u00D7"~
        "\u0158\u016E\u00DA\u0170\u00DC\u00DD\u0162\u00DF"~
        "\u0155\u00E1\u00E2\u0103\u00E4\u013A\u0107\u00E7"~
        "\u010D\u00E9\u0119\u00EB\u011B\u00ED\u00EE\u010F"~
        "\u0111\u0144\u0148\u00F3\u00F4\u0151\u00F6\u00F7"~
        "\u0159\u016F\u00FA\u0171\u00FC\u00FD\u0163\u02D9";

    // Reverse table of (code point, code unit) pairs.
    // NOTE(review): the name and the element order suggest an implicit binary
    // search tree searched by GenericEncoder - confirm there before
    // reordering any entry.
    private immutable Tuple!(wchar, char)[] bstMap = [
        tuple('\u011A','\xCC'), tuple('\u00DC','\xDC'), tuple('\u0179','\x8F'),
        tuple('\u00B7','\xB7'), tuple('\u00FC','\xFC'), tuple('\u0158','\xD8'),
        tuple('\u201C','\x93'), tuple('\u00AC','\xAC'), tuple('\u00CB','\xCB'),
        tuple('\u00EB','\xEB'), tuple('\u010C','\xC8'), tuple('\u0143','\xD1'),
        tuple('\u0162','\xDE'), tuple('\u02D9','\xFF'), tuple('\u2039','\x8B'),
        tuple('\u00A7','\xA7'), tuple('\u00B1','\xB1'), tuple('\u00C2','\xC2'),
        tuple('\u00D4','\xD4'), tuple('\u00E2','\xE2'), tuple('\u00F4','\xF4'),
        tuple('\u0104','\xA5'), tuple('\u0110','\xD0'), tuple('\u013D','\xBC'),
        tuple('\u0150','\xD5'), tuple('\u015E','\xAA'), tuple('\u016E','\xD9'),
        tuple('\u017D','\x8E'), tuple('\u2014','\x97'), tuple('\u2021','\x87'),
        tuple('\u20AC','\x80'), tuple('\u00A4','\xA4'), tuple('\u00A9','\xA9'),
        tuple('\u00AE','\xAE'), tuple('\u00B5','\xB5'), tuple('\u00BB','\xBB'),
        tuple('\u00C7','\xC7'), tuple('\u00CE','\xCE'), tuple('\u00D7','\xD7'),
        tuple('\u00DF','\xDF'), tuple('\u00E7','\xE7'), tuple('\u00EE','\xEE'),
        tuple('\u00F7','\xF7'), tuple('\u0102','\xC3'), tuple('\u0106','\xC6'),
        tuple('\u010E','\xCF'), tuple('\u0118','\xCA'), tuple('\u0139','\xC5'),
        tuple('\u0141','\xA3'), tuple('\u0147','\xD2'), tuple('\u0154','\xC0'),
        tuple('\u015A','\x8C'), tuple('\u0160','\x8A'), tuple('\u0164','\x8D'),
        tuple('\u0170','\xDB'), tuple('\u017B','\xAF'), tuple('\u02C7','\xA1'),
        tuple('\u02DD','\xBD'), tuple('\u2019','\x92'), tuple('\u201E','\x84'),
        tuple('\u2026','\x85'), tuple('\u203A','\x9B'), tuple('\u2122','\x99'),
        tuple('\u00A0','\xA0'), tuple('\u00A6','\xA6'), tuple('\u00A8','\xA8'),
        tuple('\u00AB','\xAB'), tuple('\u00AD','\xAD'), tuple('\u00B0','\xB0'),
        tuple('\u00B4','\xB4'), tuple('\u00B6','\xB6'), tuple('\u00B8','\xB8'),
        tuple('\u00C1','\xC1'), tuple('\u00C4','\xC4'), tuple('\u00C9','\xC9'),
        tuple('\u00CD','\xCD'), tuple('\u00D3','\xD3'), tuple('\u00D6','\xD6'),
        tuple('\u00DA','\xDA'), tuple('\u00DD','\xDD'), tuple('\u00E1','\xE1'),
        tuple('\u00E4','\xE4'), tuple('\u00E9','\xE9'), tuple('\u00ED','\xED'),
        tuple('\u00F3','\xF3'), tuple('\u00F6','\xF6'), tuple('\u00FA','\xFA'),
        tuple('\u00FD','\xFD'), tuple('\u0103','\xE3'), tuple('\u0105','\xB9'),
        tuple('\u0107','\xE6'), tuple('\u010D','\xE8'), tuple('\u010F','\xEF'),
        tuple('\u0111','\xF0'), tuple('\u0119','\xEA'), tuple('\u011B','\xEC'),
        tuple('\u013A','\xE5'), tuple('\u013E','\xBE'), tuple('\u0142','\xB3'),
        tuple('\u0144','\xF1'), tuple('\u0148','\xF2'), tuple('\u0151','\xF5'),
        tuple('\u0155','\xE0'), tuple('\u0159','\xF8'), tuple('\u015B','\x9C'),
        tuple('\u015F','\xBA'), tuple('\u0161','\x9A'), tuple('\u0163','\xFE'),
        tuple('\u0165','\x9D'), tuple('\u016F','\xF9'), tuple('\u0171','\xFB'),
        tuple('\u017A','\x9F'), tuple('\u017C','\xBF'), tuple('\u017E','\x9E'),
        tuple('\u02D8','\xA2'), tuple('\u02DB','\xB2'), tuple('\u2013','\x96'),
        tuple('\u2018','\x91'), tuple('\u201A','\x82'), tuple('\u201D','\x94'),
        tuple('\u2020','\x86'), tuple('\u2022','\x95'), tuple('\u2030','\x89')
    ];

    // Supplies the actual encoder functions on top of the tables above.
    mixin GenericEncoder!();
}
1116 
1117 //=============================================================================
1118 //          WINDOWS-1252
1119 //=============================================================================
1120 
1121 /// Defines a Windows1252-encoded character.
1122 enum Windows1252Char : ubyte { init }
1123 
1124 /**
 * Defines a Windows1252-encoded string (as an array of $(D
 * immutable(Windows1252Char))).
1127  */
1128 alias Windows1252String = immutable(Windows1252Char)[];
1129 
// Encoder for WINDOWS-1252. This template only supplies the type aliases,
// the encoding name and the two lookup tables; all encode/decode logic comes
// from the GenericEncoder mixin at the bottom (defined elsewhere).
template EncoderInstance(CharType : Windows1252Char)
{
    import std.typecons : Tuple, tuple;

    alias E = Windows1252Char;
    alias EString = Windows1252String;

    // Name of this encoding.
    @property string encodingName() @safe pure nothrow @nogc
    {
        return "windows-1252";
    }

    // First and last code unit covered by charMap (0x80 .. 0x9F, 32 entries).
    // NOTE(review): code units outside this window are presumably mapped
    // one-to-one by GenericEncoder - confirm there.
    private static immutable dchar m_charMapStart = 0x80;
    private static immutable dchar m_charMapEnd = 0x9f;

    // Code unit -> code point table: entry i is the code point for code unit
    // m_charMapStart + i. \uFFFD fills the slots for code units that have no
    // assignment (0x81, 0x8D, 0x8F, 0x90, 0x9D); it has no entry in bstMap.
    private immutable wstring charMap =
        "\u20AC\uFFFD\u201A\u0192\u201E\u2026\u2020\u2021"~
        "\u02C6\u2030\u0160\u2039\u0152\uFFFD\u017D\uFFFD"~
        "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2013\u2014"~
        "\u02DC\u2122\u0161\u203A\u0153\uFFFD\u017E\u0178";

    // Reverse table of (code point, code unit) pairs.
    // NOTE(review): the name and the element order suggest an implicit binary
    // search tree searched by GenericEncoder - confirm there before
    // reordering any entry.
    private immutable Tuple!(wchar, char)[] bstMap = [
        tuple('\u201C','\x93'), tuple('\u0192','\x83'), tuple('\u2039','\x8B'),
        tuple('\u0161','\x9A'), tuple('\u2014','\x97'), tuple('\u2021','\x87'),
        tuple('\u20AC','\x80'), tuple('\u0153','\x9C'), tuple('\u017D','\x8E'),
        tuple('\u02DC','\x98'), tuple('\u2019','\x92'), tuple('\u201E','\x84'),
        tuple('\u2026','\x85'), tuple('\u203A','\x9B'), tuple('\u2122','\x99'),
        tuple('\u0152','\x8C'), tuple('\u0160','\x8A'), tuple('\u0178','\x9F'),
        tuple('\u017E','\x9E'), tuple('\u02C6','\x88'), tuple('\u2013','\x96'),
        tuple('\u2018','\x91'), tuple('\u201A','\x82'), tuple('\u201D','\x94'),
        tuple('\u2020','\x86'), tuple('\u2022','\x95'), tuple('\u2030','\x89')
    ];

    // Supplies the actual encoder functions on top of the tables above.
    mixin GenericEncoder!();
}
1165 
1166 //=============================================================================
1167 //          UTF-8
1168 //=============================================================================
1169 
// Encoder for UTF-8 (code unit type char). Defines the encoding-specific
// primitives. read(), peek(), canRead and write() are not defined in this
// template; presumably they are provided by the context that the
// EncoderFunctions mixin (defined elsewhere) sets up around these primitives.
template EncoderInstance(CharType : char)
{
    alias E = char;
    alias EString = immutable(char)[];

    // Name of this encoding.
    @property string encodingName() @safe pure nothrow @nogc
    {
        return "UTF-8";
    }

    // Every valid code point is representable in UTF-8.
    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return isValidCodePoint(c);
    }

    // Rejects 0xC0, 0xC1 and 0xF5 .. 0xFF; every other byte value is accepted.
    bool isValidCodeUnit(char c) @safe pure nothrow @nogc
    {
        return (c < 0xC0 || (c >= 0xC2 && c < 0xF5));
    }

    // Number of trailing code units announced by a code unit >= 0x80,
    // indexed by (code unit - 0x80), see tails():
    //   0x80 .. 0xBF -> 0 (continuation bytes)   0xC0 .. 0xDF -> 1
    //   0xE0 .. 0xEF -> 2                        0xF0 .. 0xF7 -> 3
    //   0xF8 .. 0xFB -> 4    0xFC, 0xFD -> 5     0xFE -> 6    0xFF -> 0
    immutable ubyte[128] tailTable =
    [
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
        3,3,3,3,3,3,3,3,4,4,4,4,5,5,6,0,
    ];

    // Looks up tailTable for a non-ASCII code unit (c must be >= 0x80).
    private int tails(char c) @safe pure nothrow @nogc
    in
    {
        assert(c >= 0x80);
    }
    body
    {
        return tailTable[c-0x80];
    }

    // Number of UTF-8 code units (1 .. 4) needed for an encodable code point.
    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    body
    {
        if (c < 0x80) return 1;
        if (c < 0x800) return 2;
        if (c < 0x10000) return 3;
        return 4;
    }

    // Emits the 1- to 4-byte encoding of c through write(); c is not
    // validated here.
    void encodeViaWrite()(dchar c)
    {
        if (c < 0x80)
        {
            write(cast(char) c);
        }
        else if (c < 0x800)
        {
            write(cast(char)((c >> 6) + 0xC0));
            write(cast(char)((c & 0x3F) + 0x80));
        }
        else if (c < 0x10000)
        {
            write(cast(char)((c >> 12) + 0xE0));
            write(cast(char)(((c >> 6) & 0x3F) + 0x80));
            write(cast(char)((c & 0x3F) + 0x80));
        }
        else
        {
            write(cast(char)((c >> 18) + 0xF0));
            write(cast(char)(((c >> 12) & 0x3F) + 0x80));
            write(cast(char)(((c >> 6) & 0x3F) + 0x80));
            write(cast(char)((c & 0x3F) + 0x80));
        }
    }

    // Consumes one encoded sequence without decoding it: the first unit plus
    // as many trailing units as tailTable announces. No validation.
    void skipViaRead()()
    {
        auto c = read();
        if (c < 0xC0) return;
        int n = tails(cast(char) c);
        for (size_t i=0; i<n; ++i)
        {
            read();
        }
    }

    // Decodes one sequence without validation.
    dchar decodeViaRead()()
    {
        dchar c = read();
        if (c < 0xC0) return c;
        int n = tails(cast(char) c);
        // keep only the low (6 - n) payload bits of the lead byte
        c &= (1 << (6 - n)) - 1;
        for (size_t i=0; i<n; ++i)
        {
            // each trailing unit contributes its low 6 bits
            c = (c << 6) + (read() & 0x3F);
        }
        return c;
    }

    // Decodes one sequence with validation; yields INVALID_SEQUENCE for
    // malformed, overlong, surrogate, out-of-range or truncated input.
    dchar safeDecodeViaRead()()
    {
        dchar c = read();
        if (c < 0x80) return c;
        int n = tails(cast(char) c);
        // stray continuation byte (0x80 .. 0xBF) or 0xFF
        if (n == 0) return INVALID_SEQUENCE;

        if (!canRead) return INVALID_SEQUENCE;
        // the second byte is only peeked here; the range checks below need it
        size_t d = peek();
        immutable err =
        (
            (c < 0xC2)                              // fail overlong 2-byte sequences
        ||  (c > 0xF4)                              // fail overlong 4-6-byte sequences
        ||  (c == 0xE0 && ((d & 0xE0) == 0x80))     // fail overlong 3-byte sequences
        ||  (c == 0xED && ((d & 0xE0) == 0xA0))     // fail surrogates
        ||  (c == 0xF0 && ((d & 0xF0) == 0x80))     // fail overlong 4-byte sequences
        ||  (c == 0xF4 && ((d & 0xF0) >= 0x90))     // fail code points > 0x10FFFF
        );

        c &= (1 << (6 - n)) - 1;
        for (size_t i=0; i<n; ++i)
        {
            if (!canRead) return INVALID_SEQUENCE;
            d = peek();
            // a non-continuation byte is left unconsumed
            if ((d & 0xC0) != 0x80) return INVALID_SEQUENCE;
            c = (c << 6) + (read() & 0x3F);
        }

        // err is reported only after the whole sequence has been consumed
        return err ? INVALID_SEQUENCE : c;
    }

    // Decodes one sequence whose code units arrive in reverse order (last
    // unit first) from read(). No validation.
    dchar decodeReverseViaRead()()
    {
        dchar c = read();
        if (c < 0x80) return c;
        size_t shift = 0;
        c &= 0x3F;
        // at most four more units are read
        for (size_t i=0; i<4; ++i)
        {
            shift += 6;
            auto d = read();
            size_t n = tails(cast(char) d);
            // continuation byte: 6 payload bits; lead byte: (6 - n) bits
            immutable mask = n == 0 ? 0x3F : (1 << (6 - n)) - 1;
            c += ((d & mask) << shift);
            // a lead byte (n != 0) ends the sequence
            if (n != 0) break;
        }
        return c;
    }

    // U+FFFD encoded in UTF-8.
    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return "\uFFFD";
    }

    mixin EncoderFunctions;
}
1331 
1332 //=============================================================================
1333 //          UTF-16
1334 //=============================================================================
1335 
// Encoder for UTF-16 (code unit type wchar). Defines the encoding-specific
// primitives. read(), peek(), canRead and write() are not defined in this
// template; presumably they are provided by the context that the
// EncoderFunctions mixin (defined elsewhere) sets up around these primitives.
template EncoderInstance(CharType : wchar)
{
    alias E = wchar;
    alias EString = immutable(wchar)[];

    // Name of this encoding.
    @property string encodingName() @safe pure nothrow @nogc
    {
        return "UTF-16";
    }

    // Every valid code point is representable in UTF-16.
    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return isValidCodePoint(c);
    }

    // No single 16-bit value is rejected on its own.
    bool isValidCodeUnit(wchar c) @safe pure nothrow @nogc
    {
        return true;
    }

    // One code unit below U+10000, a surrogate pair (two units) otherwise.
    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    body
    {
        if (c >= 0x10000) return 2;
        return 1;
    }

    // Emits c through write(), as a single unit or as a surrogate pair.
    void encodeViaWrite()(dchar c)
    {
        if (c >= 0x10000)
        {
            size_t offset = c - 0x10000;
            write(cast(wchar)(0xD800 + (offset >> 10)));
            write(cast(wchar)(0xDC00 + (offset & 0x3FF)));
            return;
        }
        write(cast(wchar) c);
    }

    // Consumes one encoded sequence; a unit in the surrogate range
    // 0xD800 .. 0xDFFF makes it consume one further unit. No validation.
    void skipViaRead()()
    {
        immutable unit = read();
        if (unit >= 0xD800 && unit < 0xE000)
            read();
    }

    // Decodes one sequence without validation.
    dchar decodeViaRead()()
    {
        immutable wchar lead = read();
        if (lead < 0xD800 || lead >= 0xE000) return cast(dchar) lead;
        immutable wchar trail = read();
        // ten payload bits from each half of the pair
        return 0x10000 + ((lead & 0x3FF) << 10) + (trail & 0x3FF);
    }

    // Decodes one sequence with validation; yields INVALID_SEQUENCE for a
    // lone low surrogate, or for a high surrogate that is not followed by a
    // low surrogate (which is then left unconsumed).
    dchar safeDecodeViaRead()()
    {
        immutable wchar lead = read();
        if (lead < 0xD800 || lead >= 0xE000) return cast(dchar) lead;
        // canRead is only consulted once lead is known to be a high surrogate
        if (lead >= 0xDC00 || !canRead) return INVALID_SEQUENCE;
        immutable wchar next = peek();
        if (next < 0xDC00 || next >= 0xE000) return INVALID_SEQUENCE;
        immutable wchar trail = read();
        return 0x10000 + ((lead & 0x3FF) << 10) + (trail & 0x3FF);
    }

    // Decodes one sequence whose code units arrive in reverse order: the
    // first unit read supplies the low ten bits, the second the high ten.
    dchar decodeReverseViaRead()()
    {
        immutable wchar last = read();
        if (last < 0xD800 || last >= 0xE000) return cast(dchar) last;
        immutable wchar previous = read();
        return 0x10000 + ((previous & 0x3FF) << 10) + (last & 0x3FF);
    }

    // U+FFFD encoded in UTF-16.
    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return "\uFFFD"w;
    }

    mixin EncoderFunctions;
}
1428 
1429 //=============================================================================
1430 //          UTF-32
1431 //=============================================================================
1432 
// Encoder for UTF-32 (code unit type dchar): one code unit per code point.
// read() and write() are not defined in this template; presumably they are
// provided by the context that the EncoderFunctions mixin (defined
// elsewhere) sets up around these primitives.
template EncoderInstance(CharType : dchar)
{
    alias E = dchar;
    alias EString = immutable(dchar)[];

    // Name of this encoding.
    @property string encodingName() @safe pure nothrow @nogc
    {
        return "UTF-32";
    }

    // Every valid code point is representable in UTF-32.
    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return isValidCodePoint(c);
    }

    // A code unit is valid exactly when it is a valid code point.
    bool isValidCodeUnit(dchar c) @safe pure nothrow @nogc
    {
        return isValidCodePoint(c);
    }

    // Always a single code unit.
    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    body
    {
        return 1;
    }

    // Emits c unchanged through write().
    void encodeViaWrite()(dchar c)
    {
        write(c);
    }

    // Consumes exactly one code unit.
    void skipViaRead()()
    {
        read();
    }

    // Returns the next code unit as-is, without validation.
    dchar decodeViaRead()()
    {
        return cast(dchar) read();
    }

    // Returns the next code unit, or INVALID_SEQUENCE when it is not a
    // valid code point.
    dchar safeDecodeViaRead()()
    {
        immutable unit = read();
        if (!isValidCodePoint(unit)) return INVALID_SEQUENCE;
        return unit;
    }

    // Reading direction makes no difference for single-unit sequences.
    dchar decodeReverseViaRead()()
    {
        return cast(dchar) read();
    }

    // U+FFFD encoded in UTF-32.
    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return "\uFFFD"d;
    }

    mixin EncoderFunctions;
}
1496 
1497 //=============================================================================
1498 // Below are forwarding functions which expose the function to the user
1499 
/**
 Tests whether a value is a valid code point.

 A value is valid when it lies below 0x110000 and outside the surrogate
 range 0xD800 .. 0xDFFF. The non-character code points U+FFFE and U+FFFF
 are therefore accepted: they are valid code points even though they are
 not valid characters.

 Supersedes:
 This function supersedes $(D std.utf.startsValidDchar()).

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    c = the code point to be tested

 Returns: true if c is a valid code point, false otherwise
 */
bool isValidCodePoint(dchar c) @safe pure nothrow @nogc
{
    immutable withinUnicode = c < 0x110000;
    immutable isSurrogate = c >= 0xD800 && c < 0xE000;
    return withinUnicode && !isSurrogate;
}
1520 
/**
 Returns the name of an encoding.

 There is no argument from which the encoding could be deduced, so the
 code unit type has to be given explicitly as the template argument.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252
 */
@property string encodingName(T)()
{
    alias Encoder = EncoderInstance!(T);
    return Encoder.encodingName;
}

///
@safe unittest
{
    assert(encodingName!(char) == "UTF-8");
    assert(encodingName!(wchar) == "UTF-16");
    assert(encodingName!(dchar) == "UTF-32");
    assert(encodingName!(AsciiChar) == "ASCII");
    assert(encodingName!(Latin1Char) == "ISO-8859-1");
    assert(encodingName!(Latin2Char) == "ISO-8859-2");
    assert(encodingName!(Windows1250Char) == "windows-1250");
    assert(encodingName!(Windows1252Char) == "windows-1252");
}
1547 
/**
 Returns true if and only if the specified code point can be represented
 in the encoding.

 The encoding cannot be deduced from the argument, so the code unit type
 has to be given explicitly as the template argument.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    c = the code point to be tested
 */
bool canEncode(E)(dchar c)
{
    alias Encoder = EncoderInstance!(E);
    return Encoder.canEncode(c);
}

///
@safe pure unittest
{
    assert( canEncode!(Latin1Char)('A'));
    assert( canEncode!(Latin2Char)('A'));
    assert(!canEncode!(AsciiChar)('\u00A0'));
    assert( canEncode!(Latin1Char)('\u00A0'));
    assert( canEncode!(Latin2Char)('\u00A0'));
    assert( canEncode!(Windows1250Char)('\u20AC'));
    assert(!canEncode!(Windows1250Char)('\u20AD'));
    assert(!canEncode!(Windows1250Char)('\uFFFD'));
    assert( canEncode!(Windows1252Char)('\u20AC'));
    assert(!canEncode!(Windows1252Char)('\u20AD'));
    assert(!canEncode!(Windows1252Char)('\uFFFD'));
    assert(!canEncode!(char)(cast(dchar) 0x110000));
}

/// How to check an entire string
@safe pure unittest
{
    import std.algorithm.searching : find;
    import std.utf : byDchar;

    assert("The quick brown fox"
        .byDchar
        .find!(x => !canEncode!AsciiChar(x))
        .empty);
}
1591 
/**
 Returns true if the code unit is legal in its encoding. The byte 0x80,
 for instance, is not legal in ASCII, whose code units always lie in the
 range 0x00 to 0x7F.

 The encoding is deduced from the type of the argument.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    c = the code unit to be tested
 */
bool isValidCodeUnit(E)(E c)
{
    alias Encoder = EncoderInstance!(E);
    return Encoder.isValidCodeUnit(c);
}

///
@system pure unittest
{
    assert(!isValidCodeUnit(cast(char) 0xC0));
    assert(!isValidCodeUnit(cast(char) 0xFF));
    assert( isValidCodeUnit(cast(wchar) 0xD800));
    assert(!isValidCodeUnit(cast(dchar) 0xD800));
    assert(!isValidCodeUnit(cast(AsciiChar) 0xA0));
    assert( isValidCodeUnit(cast(Windows1250Char) 0x80));
    assert(!isValidCodeUnit(cast(Windows1250Char) 0x81));
    assert( isValidCodeUnit(cast(Windows1252Char) 0x80));
    assert(!isValidCodeUnit(cast(Windows1252Char) 0x81));
}
1621 
/**
 Returns true if the whole string is encoded correctly, i.e. if its
 longest validly encoded prefix spans the entire string.

 Supersedes:
 This function supersedes std.utf.validate(), however note that this
 function returns a bool indicating whether the input was valid or not,
 whereas the older function would throw an exception.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    s = the string to be tested
 */
bool isValid(E)(const(E)[] s)
{
    immutable validPrefix = validLength(s);
    return validPrefix == s.length;
}

///
@system pure unittest
{
    assert( isValid("\u20AC100"));
    assert(!isValid(cast(char[3])[167, 133, 175]));
}
1647 
/**
 Returns the length, in code units, of the longest validly encoded
 substring that starts at the first code unit.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    s = the string to be tested
 */
size_t validLength(E)(const(E)[] s)
{
    immutable total = s.length;
    // number of code units still unread after the last good sequence
    size_t unread = total;
    while (s.length > 0)
    {
        if (EncoderInstance!(E).safeDecode(s) == INVALID_SEQUENCE)
            return total - unread;
        unread = s.length;
    }
    return total - unread;
}
1669 
/**
 Sanitizes a string by replacing malformed code unit sequences with valid
 code unit sequences. The result is guaranteed to be valid for this encoding.

 If the input string is already valid, this function returns the original.
 Otherwise it constructs a new string in which every illegal code unit
 sequence is replaced with the encoding's replacement sequence: the Unicode
 replacement character (U+FFFD) if the character repertoire contains it,
 and '?' otherwise.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    s = the string to be sanitized
 */
immutable(E)[] sanitize(E)(immutable(E)[] s)
{
    alias Encoder = EncoderInstance!(E);

    size_t goodLen = validLength(s);
    if (goodLen == s.length) return s;

    auto replacement = Encoder.replacementSequence;

    // First pass: work out an upper bound for the size of the result
    // (input length plus one replacement sequence per invalid sequence).
    size_t capacity = s.length;
    const(E)[] rest = s[goodLen..$];
    while (rest.length != 0)
    {
        // consumes the invalid sequence at the front of rest
        immutable c = Encoder.safeDecode(rest);
        assert(c == INVALID_SEQUENCE);
        capacity += replacement.length;
        rest = rest[validLength(rest)..$];
    }

    // Second pass: copy the valid runs, writing one replacement sequence
    // in place of each invalid sequence.
    E[] buffer = new E[capacity];
    buffer[0 .. goodLen] = s[0 .. goodLen];
    size_t pos = goodLen;

    rest = s[goodLen..$];
    while (rest.length != 0)
    {
        immutable c = Encoder.safeDecode(rest);
        assert(c == INVALID_SEQUENCE);
        buffer[pos .. pos+replacement.length] = replacement[];
        pos += replacement.length;
        goodLen = validLength(rest);
        buffer[pos .. pos+goodLen] = rest[0 .. goodLen];
        pos += goodLen;
        rest = rest[goodLen..$];
    }
    // trim the overestimate
    return cast(immutable(E)[])buffer[0 .. pos];
}

///
@system pure unittest
{
    assert(sanitize("hello \xF0\x80world") == "hello \xEF\xBF\xBDworld");
}
1731 
/**
 Returns the length, in code units, of the first encoded sequence.

 The input to this function MUST be non-empty and MUST start with a
 validly encoded sequence. This is enforced by the function's in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
 s = the string to be sliced
 */
size_t firstSequence(E)(const(E)[] s)
in
{
    assert(s.length != 0);
    const(E)[] u = s;
    assert(safeDecode(u) != INVALID_SEQUENCE);
}
body
{
    immutable lengthBefore = s.length;
    // skip() advances the local slice past one sequence
    EncoderInstance!(E).skip(s);
    immutable lengthAfter = s.length;
    return lengthBefore - lengthAfter;
}

///
@system pure unittest
{
    assert(firstSequence("\u20AC1000") == "\u20AC".length);
    assert(firstSequence("hel") == "h".length);
}
1764 
/**
 Returns the length, in code units, of the last encoded sequence.

 The input to this function MUST be non-empty and validly encoded.
 This is enforced by the function's in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    s = the string to be sliced
 */
size_t lastSequence(E)(const(E)[] s)
in
{
    assert(s.length != 0);
    assert(isValid(s));
}
body
{
    immutable lengthBefore = s.length;
    // decodeReverse() shortens the local slice by one sequence
    EncoderInstance!(E).decodeReverse(s);
    immutable lengthAfter = s.length;
    return lengthBefore - lengthAfter;
}

///
@system pure unittest
{
    assert(lastSequence("1000\u20AC") == "\u20AC".length);
    assert(lastSequence("hellö") == "ö".length);
}
1796 
/**
 Returns the array index at which the (n+1)th code point begins, i.e. the
 number of code units occupied by the first n code points.

 The input to this function MUST be validly encoded and n MUST NOT be
 negative. This is enforced by the function's in-contract.

 Supersedes:
 This function supersedes std.utf.toUTFindex().

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    s = the string to be counted
    n = the current code point index
 */
ptrdiff_t index(E)(const(E)[] s,int n)
in
{
    assert(isValid(s));
    assert(n >= 0);
}
body
{
    immutable lengthBefore = s.length;
    // skip n sequences; what has been consumed is the wanted index
    for (size_t left = n; left != 0; --left)
        EncoderInstance!(E).skip(s);
    return lengthBefore - s.length;
}

///
@system pure unittest
{
    assert(index("\u20AC100",1) == 3);
    assert(index("hällo",2) == 3);
}
1832 
/**
 Decodes a single code point.

 This function removes one or more code units from the start of a string,
 and returns the decoded code point which those code units represent.

 The input to this function MUST be non-empty and MUST start with a
 validly encoded sequence. This is enforced by the function's in-contract.

 Supersedes:
 This function supersedes std.utf.decode(), however, note that the
 function codePoints() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    s = the string whose first code point is to be decoded
 */
dchar decode(S)(ref S s)
in
{
    assert(s.length != 0);
    auto u = s;
    assert(safeDecode(u) != INVALID_SEQUENCE);
}
body
{
    // the encoder is selected by the element type of s
    alias Encoder = EncoderInstance!(typeof(s[0]));
    return Encoder.decode(s);
}
1863 
/**
 Decodes a single code point from the end of a string.

 This function removes one or more code units from the end of a string,
 and returns the decoded code point which those code units represent.

 The input to this function MUST be non-empty and validly encoded.
 This is enforced by the function's in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    s = the string whose last code point is to be decoded
 */
dchar decodeReverse(E)(ref const(E)[] s)
in
{
    assert(s.length != 0);
    assert(isValid(s));
}
body
{
    alias Encoder = EncoderInstance!(E);
    return Encoder.decodeReverse(s);
}
1889 
/**
 Decodes a single code point. The input does not have to be valid.

 This function removes one or more code units from the start of a string,
 and returns the decoded code point which those code units represent.

 An invalidly encoded string is accepted as input. If an invalid sequence
 is found at the start of the string, this function removes it and returns
 the value INVALID_SEQUENCE.

 The input MUST be non-empty; this is enforced by the function's
 in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    s = the string whose first code point is to be decoded
 */
dchar safeDecode(S)(ref S s)
in
{
    assert(s.length != 0);
}
body
{
    // the encoder is selected by the element type of s
    alias Encoder = EncoderInstance!(typeof(s[0]));
    return Encoder.safeDecode(s);
}
1915 
/**
 Returns the number of code units required to encode a single code point.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding as a template parameter.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    c = the code point to be encoded

 Returns:
    the number of code units of type E needed to represent c
 */
size_t encodedLength(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
body
{
    alias Encoder = EncoderInstance!E;
    return Encoder.encodedLength(c);
}
1940 
/**
 Encodes a single code point.

 The code point is encoded into one or more code units, which are returned
 as a string.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding as a template parameter.

 Supersedes:
 This function supersedes std.utf.encode(), however, note that the
 function codeUnits() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    c = the code point to be encoded

 Returns:
    the code units which encode c
 */
E[] encode(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
body
{
    alias Encoder = EncoderInstance!E;
    return Encoder.encode(c);
}
1972 
/**
 Encodes a single code point into an array.

 The code point is encoded into one or more code units, which are stored
 at the start of a user-supplied array.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding as a template parameter.

 Supersedes:
 This function supersedes std.utf.encode(), however, note that the
 function codeUnits() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    c     = the code point to be encoded
    array = the destination array

 Returns:
          the number of code units written to the array
 */
size_t encode(E)(dchar c, E[] array)
in
{
    assert(isValidCodePoint(c));
}
body
{
    // The encoder is handed its own slice of the destination; whatever it
    // trims off that slice is the amount that was written.
    E[] remaining = array;
    EncoderInstance!E.encode(c, remaining);
    return array.length - remaining.length;
}
2011 
/*
Encodes $(D c) in units of type $(D E) and writes the result to the
output range $(D R). Returns the number of $(D E)s written.

Only the UTF code unit types (char, wchar and dchar, ignoring qualifiers)
are supported; any other $(D E) is rejected at compile time.
 */
size_t encode(E, R)(dchar c, auto ref R range)
if (isNativeOutputRange!(R, E))
{
    alias Unit = Unqual!E;

    static if (is(Unit == char))
    {
        // UTF-8: the number of bytes grows with the magnitude of c.
        if (c <= 0x7F)
        {
            put(range, cast(char) c);
            return 1;
        }
        else if (c <= 0x7FF)
        {
            char lead = cast(char)(0xC0 | (c >> 6));
            char tail = cast(char)(0x80 | (c & 0x3F));
            put(range, lead);
            put(range, tail);
            return 2;
        }
        else if (c <= 0xFFFF)
        {
            char lead = cast(char)(0xE0 | (c >> 12));
            char mid = cast(char)(0x80 | ((c >> 6) & 0x3F));
            char tail = cast(char)(0x80 | (c & 0x3F));
            put(range, lead);
            put(range, mid);
            put(range, tail);
            return 3;
        }
        else if (c <= 0x10FFFF)
        {
            char lead = cast(char)(0xF0 | (c >> 18));
            char high = cast(char)(0x80 | ((c >> 12) & 0x3F));
            char mid = cast(char)(0x80 | ((c >> 6) & 0x3F));
            char tail = cast(char)(0x80 | (c & 0x3F));
            put(range, lead);
            put(range, high);
            put(range, mid);
            put(range, tail);
            return 4;
        }
        else
        {
            // Values above 0x10FFFF cannot be encoded.
            assert(0);
        }
    }
    else static if (is(Unit == wchar))
    {
        // UTF-16: values up to 0xFFFF take one unit, everything else is
        // split into a high/low surrogate pair.
        if (c <= 0xFFFF)
        {
            range.put(cast(wchar) c);
            return 1;
        }
        const offset = c - 0x10000;
        range.put(cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800));
        range.put(cast(wchar) ((offset & 0x3FF) + 0xDC00));
        return 2;
    }
    else static if (is(Unit == dchar))
    {
        // UTF-32: the code point is its own single code unit.
        range.put(c);
        return 1;
    }
    else
    {
        static assert(0);
    }
}
2073 
@safe pure unittest
{
    import std.array : Appender;

    // A plain ASCII character takes exactly one code unit in every UTF
    // encoding supported by the output-range overload.
    Appender!(char[]) sink;
    immutable dchar letter = 'T';
    assert(encode!(char)(letter, sink) == 1);
    assert(encode!(wchar)(letter, sink) == 1);
    assert(encode!(dchar)(letter, sink) == 1);
}
2082 
/**
 Encodes a single code point to a delegate.

 The code point is encoded into one or more code units, which are passed
 one at a time to the supplied delegate.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding as a template parameter.

 Supersedes:
 This function supersedes std.utf.encode(), however, note that the
 function codeUnits() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    c  = the code point to be encoded
    dg = the delegate to invoke for each code unit
 */
void encode(E)(dchar c, void delegate(E) dg)
in
{
    assert(isValidCodePoint(c));
}
body
{
    alias Encoder = EncoderInstance!E;
    Encoder.encode(c, dg);
}
2115 
/**
Encodes the contents of $(D s) in units of type $(D Tgt), writing the result to an
output range.

The input is decoded one code point at a time according to the encoding
implied by $(D Src), and each code point is then re-encoded as $(D Tgt).
The input must therefore be validly encoded; this is enforced by the
in-contract of $(D decode).

Returns: The number of $(D Tgt) elements written.
Params:
Tgt = Element type of $(D range).
s = Input array.
range = Output range.
 */
size_t encode(Tgt, Src, R)(in Src[] s, R range)
{
    size_t result;
    // Iterating the array directly would hand individual code units to the
    // encoder, which is wrong for any code point that spans several units
    // (or whose code unit value differs from its code point). Decode first.
    const(Src)[] rest = s;
    while (rest.length != 0)
    {
        result += encode!(Tgt)(decode(rest), range);
    }
    return result;
}
2135 
/**
 Returns a foreachable struct which can bidirectionally iterate over all
 code points in a string.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 You can foreach either
 with or without an index. If an index is specified, it will be initialized
 at each iteration with the offset into the string at which the code point
 begins.

 Supersedes:
 This function supersedes std.utf.decode().

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    s = the string to be decoded

 Example:
 --------------------------------------------------------
 string s = "hello world";
 foreach (c;codePoints(s))
 {
     // do something with c (which will always be a dchar)
 }
 --------------------------------------------------------

 Note that, currently, foreach (c;codePoints(s)) is superior to foreach (c;s)
 in that the latter will fall over on encountering U+FFFF.
 */
CodePoints!(E) codePoints(E)(immutable(E)[] s)
in
{
    assert(isValid(s));
}
body
{
    auto iterable = CodePoints!E(s);
    return iterable;
}
2178 
///
@system unittest
{
    string original = "hello";
    string rebuilt;
    // Every code point of an ASCII string fits in a single char, so
    // re-assembling the decoded code points reproduces the input.
    foreach (codePoint; codePoints(original))
    {
        rebuilt ~= cast(char) codePoint;
    }
    assert(original == rebuilt);
}
2190 
/**
 Returns a foreachable struct which can bidirectionally iterate over all
 code units in a code point.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding type in the template parameter.

 Supersedes:
 This function supersedes std.utf.encode().

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    c = the code point to be encoded
 */
CodeUnits!(E) codeUnits(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
body
{
    auto iterable = CodeUnits!E(c);
    return iterable;
}
2219 
///
@system unittest
{
    // The euro sign U+20AC is encoded in UTF-8 as the bytes E2 82 AC.
    immutable ubyte[3] expected = [0xE2, 0x82, 0xAC];

    char[] units;
    foreach (unit; codeUnits!(char)(cast(dchar)'\u20AC'))
    {
        units ~= unit;
    }

    assert(units.length == 3);
    foreach (i, b; expected)
    {
        assert(units[i] == b);
    }
}
2233 
/**
 Convert a string from one encoding to another.

 Supersedes:
 This function supersedes std.utf.toUTF8(), std.utf.toUTF16() and
 std.utf.toUTF32()
 (but note that to!() supersedes it more conveniently).

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    s = Source string. $(B Must) be validly encoded.
        This is enforced by the function's in-contract.
    r = Destination string

 See_Also:
    $(REF to, std,conv)
 */
void transcode(Src, Dst)(Src[] s, out Dst[] r)
in
{
    assert(isValid(s));
}
body
{
    static if (is(Src == Dst) && is(Src == immutable))
    {
        // Same immutable element type: the source can be shared as-is.
        r = s;
    }
    else static if (is(Unqual!Src == AsciiChar))
    {
        // Handle ASCII input by reinterpreting it as char data.
        transcode(cast(const(char)[])s, r);
    }
    else
    {
        alias Unit = Unqual!Dst;

        // Free space that must be available in the output before a code
        // point is encoded into it.
        static if (is(Unit == wchar))
            enum size_t minUnits = 2;
        else static if (is(Unit == dchar))
            enum size_t minUnits = 1;
        else
            enum size_t minUnits = 6;

        auto storage = new Unit[s.length];
        // The not-yet-written tail of storage; the encoder shrinks it from
        // the front as it writes.
        auto unused = storage;

        while (s.length != 0)
        {
            if (unused.length < minUnits)
            {
                // Grow the storage and re-derive the unused tail so that it
                // starts right after what has been written so far.
                immutable written = storage.length - unused.length;
                storage.length += s.length + minUnits;
                unused = storage[written .. $];
            }
            EncoderInstance!Unit.encode(decode(s), unused);
        }

        r = cast(Dst[]) storage[0 .. storage.length - unused.length];
    }
}
2300 
///
@system pure unittest
{
    wstring ws;
    // transcode from UTF-8 to UTF-16
    transcode("hello world",ws);
    assert(ws == "hello world"w);

    Latin1String ls;
    // transcode from UTF-16 to ISO-8859-1
    transcode(ws, ls);
    // Check the Latin-1 result itself, not the UTF-16 input again.
    assert(ls == "hello world");
}
2314 
@system pure unittest
{
    import std.meta;
    import std.range;

    // Round-trip an ASCII-only text through every ordered pair of the
    // supported string types and check that it comes back unchanged.
    {
        import std.conv : to;

        string asciiText = to!string(iota(0, 128, 1));

        alias Types = AliasSeq!(string, Latin1String, Latin2String, AsciiString,
            Windows1250String, Windows1252String, dstring, wstring);
        foreach (S; Types)
        {
            foreach (D; Types)
            {
                string roundTripped;
                S viaFirst;
                D viaSecond;
                transcode(asciiText, viaFirst);
                transcode(viaFirst, viaSecond);
                transcode(viaSecond, roundTripped);
                assert(asciiText == roundTripped);
            }
        }
    }

    // The same round trip with non-ASCII text, restricted to the UTF types.
    {
        string czechText = "Příliš žluťoučký kůň úpěl ďábelské ódy.";
        alias Types = AliasSeq!(string, dstring, wstring);
        foreach (S; Types)
        {
            foreach (D; Types)
            {
                string roundTripped;
                S viaFirst;
                D viaSecond;
                transcode(czechText, viaFirst);
                transcode(viaFirst, viaSecond);
                transcode(viaSecond, roundTripped);
                assert(czechText == roundTripped);
            }
        }
    }
}
2354 
@system unittest // mutable/const input/output
{
    import std.meta : AliasSeq;

    // Every combination of input and output qualifier must be accepted.
    foreach (O; AliasSeq!(Latin1Char, const Latin1Char, immutable Latin1Char))
    {
        O[] latin1;

        char[] fromMutable = "äbc".dup;
        transcode(fromMutable, latin1);
        assert(latin1 == [0xE4, 'b', 'c']);

        const char[] fromConst = "öbc";
        transcode(fromConst, latin1);
        assert(latin1 == [0xF6, 'b', 'c']);

        immutable char[] fromImmutable = "übc";
        transcode(fromImmutable, latin1);
        assert(latin1 == [0xFC, 'b', 'c']);
    }

    // Make sure that const/mutable input is copied.
    foreach (C; AliasSeq!(char, const char))
    {
        C[] source = "foo".dup;
        C[] target;
        transcode(source, target);
        assert(source == target);
        assert(source !is target);
    }

    // But immutable input should not be copied.
    string sharedSource = "foo";
    string sharedTarget;
    transcode(sharedSource, sharedTarget);
    assert(sharedSource is sharedTarget);
}
2392 
2393 //=============================================================================
2394 
/** The base class for exceptions thrown by this module */
class EncodingException : Exception
{
    /// Constructs the exception with the given message.
    this(string msg) @safe pure
    {
        super(msg);
    }
}
2397 
// An EncodingException subclass whose constructor is private, so it can only
// be instantiated from within this module.
class UnrecognizedEncodingException : EncodingException
{
    private this(string msg) @safe pure
    {
        super(msg);
    }
}
2402 
/** Abstract base class of all encoding schemes */
abstract class EncodingScheme
{
    import std.uni : toLower;

    /**
     * Registers a subclass of EncodingScheme.
     *
     * This function allows user-defined subclasses of EncodingScheme to
     * be declared in other modules.
     *
     * The scheme is registered under the lower-cased form of every name
     * returned by its names() method.
     *
     * Params:
     *     Klass = The subclass of EncodingScheme to register.
     *
     * Example:
     * ----------------------------------------------
     * class Amiga1251 : EncodingScheme
     * {
     *     shared static this()
     *     {
     *         EncodingScheme.register!Amiga1251;
     *     }
     * }
     * ----------------------------------------------
     */
    static void register(Klass:EncodingScheme)()
    {
        // A throw-away instance is needed only to ask the class for its names.
        scope scheme = new Klass();
        foreach (encodingName;scheme.names())
        {
            supported[toLower(encodingName)] = () => new Klass();
        }
    }

    deprecated("Please pass the EncodingScheme subclass as template argument instead.")
    static void register(string className)
    {
        auto scheme = createByClassName(className);
        foreach (encodingName;scheme.names())
        {
            supportedFactories[toLower(encodingName)] = className;
        }
    }

    // Instantiates the EncodingScheme subclass with the given class name.
    // Throws EncodingException if the class cannot be found, cannot be
    // created, or is not an EncodingScheme. The lookup result is checked
    // before use because ClassInfo.find yields null for an unknown name.
    private static EncodingScheme createByClassName(string className)
    {
        EncodingScheme scheme;
        if (auto info = ClassInfo.find(className))
            scheme = cast(EncodingScheme) info.create();
        if (scheme is null)
            throw new EncodingException("Unable to create class "~className);
        return scheme;
    }

    /**
     * Obtains a subclass of EncodingScheme which is capable of encoding
     * and decoding the named encoding scheme.
     *
     * This function is only aware of EncodingSchemes which have been
     * registered with the register() function.
     *
     * The name is matched case-insensitively.
     *
     * Throws:
     *     EncodingException if the name is not recognized or the scheme
     *     cannot be instantiated.
     *
     * Example:
     * ---------------------------------------------------
     * auto scheme = EncodingScheme.create("Amiga-1251");
     * ---------------------------------------------------
     */
    static EncodingScheme create(string encodingName)
    {
        static bool registerDefaultEncodings()
        {
            EncodingScheme.register!EncodingSchemeASCII;
            EncodingScheme.register!EncodingSchemeLatin1;
            EncodingScheme.register!EncodingSchemeLatin2;
            EncodingScheme.register!EncodingSchemeWindows1250;
            EncodingScheme.register!EncodingSchemeWindows1252;
            EncodingScheme.register!EncodingSchemeUtf8;
            EncodingScheme.register!EncodingSchemeUtf16Native;
            EncodingScheme.register!EncodingSchemeUtf32Native;
            return true;
        }

        // The built-in schemes are registered on the first call only.
        static shared bool initialized;
        import std.concurrency : initOnce;
        initOnce!initialized(registerDefaultEncodings());
        encodingName = toLower(encodingName);

        // Schemes registered through the template overload come first.
        if (auto p = encodingName in supported)
            return (*p)();

        // Fall back to schemes registered by class name.
        auto p = encodingName in supportedFactories;
        if (p is null)
            throw new EncodingException("Unrecognized Encoding: "~encodingName);
        return createByClassName(*p);
    }

    const
    {
        /**
         * Returns the standard name of the encoding scheme
         */
        abstract override string toString();

        /**
         * Returns an array of all known names for this encoding scheme
         */
        abstract string[] names();

        /**
         * Returns true if the character c can be represented
         * in this encoding scheme.
         */
        abstract bool canEncode(dchar c);

        /**
         * Returns the number of ubytes required to encode this code point.
         *
         * The input to this function MUST be a valid code point.
         *
         * Params:
         *    c = the code point to be encoded
         *
         * Returns:
         *    the number of ubytes required.
         */
        abstract size_t encodedLength(dchar c);

        /**
         * Encodes a single code point into a user-supplied, fixed-size buffer.
         *
         * This function encodes a single code point into one or more ubytes.
         * The supplied buffer must be code unit aligned.
         * (For example, UTF-16LE or UTF-16BE must be wchar-aligned,
         * UTF-32LE or UTF-32BE must be dchar-aligned, etc.)
         *
         * The input to this function MUST be a valid code point.
         *
         * Params:
         *    c      = the code point to be encoded
         *    buffer = the destination array
         *
         * Returns:
         *    the number of ubytes written.
         */
        abstract size_t encode(dchar c, ubyte[] buffer);

        /**
         * Decodes a single code point.
         *
         * This function removes one or more ubytes from the start of an array,
         * and returns the decoded code point which those ubytes represent.
         *
         * The input to this function MUST be validly encoded.
         *
         * Params:
         *    s = the array whose first code point is to be decoded
         */
        abstract dchar decode(ref const(ubyte)[] s);

        /**
         * Decodes a single code point. The input does not have to be valid.
         *
         * This function removes one or more ubytes from the start of an array,
         * and returns the decoded code point which those ubytes represent.
         *
         * This function will accept an invalidly encoded array as input.
         * If an invalid sequence is found at the start of the string, this
         * function will remove it, and return the value INVALID_SEQUENCE.
         *
         * Params:
         *    s = the array whose first code point is to be decoded
         */
        abstract dchar safeDecode(ref const(ubyte)[] s);

        /**
         * Returns the sequence of ubytes to be used to represent
         * any character which cannot be represented in the encoding scheme.
         *
         * Normally this will be a representation of some substitution
         * character, such as U+FFFD or '?'.
         */
        abstract @property immutable(ubyte)[] replacementSequence();
    }

    /**
     * Returns true if the array is encoded correctly
     *
     * Params:
     *    s = the array to be tested
     */
    bool isValid(const(ubyte)[] s)
    {
        while (s.length != 0)
        {
            if (safeDecode(s) == INVALID_SEQUENCE)
                return false;
        }
        return true;
    }

    /**
     * Returns the length of the longest possible substring, starting from
     * the first element, which is validly encoded.
     *
     * Params:
     *    s = the array to be tested
     */
    size_t validLength()(const(ubyte)[] s)
    {
        const(ubyte)[] r = s;
        // t trails s: it marks the end of the last successfully decoded
        // sequence, so an invalid sequence is not counted.
        const(ubyte)[] t = s;
        while (s.length != 0)
        {
            if (safeDecode(s) == INVALID_SEQUENCE) break;
            t = s;
        }
        return r.length - t.length;
    }

    /**
     * Sanitizes an array by replacing malformed ubyte sequences with valid
     * ubyte sequences. The result is guaranteed to be valid for this
     * encoding scheme.
     *
     * If the input array is already valid, this function returns the
     * original, otherwise it constructs a new array by replacing all illegal
     * sequences with the encoding scheme's replacement sequence.
     *
     * Params:
     *    s = the string to be sanitized
     */
    immutable(ubyte)[] sanitize()(immutable(ubyte)[] s)
    {
        auto n = validLength(s);
        if (n == s.length) return s;

        auto repSeq = replacementSequence;

        // Count how long the string needs to be.
        // Overestimating is not a problem
        auto len = s.length;
        const(ubyte)[] t = s[n..$];
        while (t.length != 0)
        {
            // t always starts at an invalid sequence here: skip it, then
            // skip the valid run which follows it.
            immutable c = safeDecode(t);
            assert(c == INVALID_SEQUENCE);
            len += repSeq.length;
            t = t[validLength(t)..$];
        }

        // Now do the write
        ubyte[] array = new ubyte[len];
        array[0 .. n] = s[0 .. n];
        auto offset = n;

        t = s[n..$];
        while (t.length != 0)
        {
            // Replace the invalid sequence...
            immutable c = safeDecode(t);
            assert(c == INVALID_SEQUENCE);
            array[offset .. offset+repSeq.length] = repSeq[];
            offset += repSeq.length;
            // ...and copy the valid run which follows it unchanged.
            n = validLength(t);
            array[offset .. offset+n] = t[0 .. n];
            offset += n;
            t = t[n..$];
        }
        return cast(immutable(ubyte)[])array[0 .. offset];
    }

    /**
     * Returns the length of the first encoded sequence.
     *
     * The input to this function MUST be validly encoded.
     * This is enforced by the function's in-contract.
     *
     * Params:
     *    s = the array to be sliced
     */
    size_t firstSequence()(const(ubyte)[] s)
    in
    {
        assert(s.length != 0);
        const(ubyte)[] u = s;
        assert(safeDecode(u) != INVALID_SEQUENCE);
    }
    body
    {
        const(ubyte)[] t = s;
        decode(s);
        return t.length - s.length;
    }

    /**
     * Returns the total number of code points encoded in a ubyte array.
     *
     * The input to this function MUST be validly encoded.
     * This is enforced by the function's in-contract.
     *
     * Params:
     *    s = the string to be counted
     */
    size_t count()(const(ubyte)[] s)
    in
    {
        assert(isValid(s));
    }
    body
    {
        size_t n = 0;
        while (s.length != 0)
        {
            decode(s);
            ++n;
        }
        return n;
    }

    /**
     * Returns the array index at which the (n+1)th code point begins.
     *
     * The input to this function MUST be validly encoded.
     * This is enforced by the function's in-contract.
     *
     * Params:
     *    s = the string to be counted
     *    n = the current code point index
     */
    ptrdiff_t index()(const(ubyte)[] s, size_t n)
    in
    {
        assert(isValid(s));
    }
    body
    {
        const(ubyte)[] t = s;
        for (size_t i=0; i<n; ++i) decode(s);
        return t.length - s.length;
    }

    // Factories for schemes registered through register!Klass, keyed by
    // lower-cased encoding name.
    __gshared EncodingScheme function()[string] supported;
    // Class names of schemes registered through the deprecated
    // register(string), keyed by lower-cased encoding name.
    __gshared string[string] supportedFactories;
}
2741 
/**
 EncodingScheme to handle ASCII

 This scheme recognises the following names:
                 "ANSI_X3.4-1968",
                 "ANSI_X3.4-1986",
                 "ASCII",
                 "IBM367",
                 "ISO646-US",
                 "ISO_646.irv:1991",
                 "US-ASCII",
                 "cp367",
                 "csASCII",
                 "iso-ir-6",
                 "us"
 */
class EncodingSchemeASCII : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeASCII");
    }*/

    override string[] names() const @safe pure nothrow
    {
        return
        [
            "ANSI_X3.4-1968",
            "ANSI_X3.4-1986",
            "ASCII",
            "IBM367",
            "ISO646-US",
            "ISO_646.irv:1991",
            "US-ASCII",
            "cp367",
            "csASCII",
            "iso-ir-6",
            "us"
        ];
    }

    override string toString() const @safe pure nothrow @nogc
    {
        return "ASCII";
    }

    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!(AsciiChar)(c);
    }

    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!(AsciiChar)(c);
    }

    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        // View the raw bytes as ASCII code units for the module-level encoder.
        auto units = cast(AsciiChar[]) buffer;
        return std.encoding.encode(c, units);
    }

    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto units = cast(const(AsciiChar)[]) s;
        immutable dchar decoded = std.encoding.decode(units);
        // Drop from s exactly as many bytes as the decoder consumed.
        s = s[$ - units.length .. $];
        return decoded;
    }

    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto units = cast(const(AsciiChar)[]) s;
        immutable dchar decoded = std.encoding.safeDecode(units);
        // Drop from s exactly as many bytes as the decoder consumed.
        s = s[$ - units.length .. $];
        return decoded;
    }

    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"?";
    }
}
2829 
/**
 EncodingScheme to handle Latin-1

 This scheme recognises the following names:
                 "CP819",
                 "IBM819",
                 "ISO-8859-1",
                 "ISO_8859-1",
                 "ISO_8859-1:1987",
                 "csISOLatin1",
                 "iso-ir-100",
                 "l1",
                 "latin1"
 */
class EncodingSchemeLatin1 : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeLatin1");
    }*/

    override string[] names() const @safe pure nothrow
    {
        return
        [
            "CP819",
            "IBM819",
            "ISO-8859-1",
            "ISO_8859-1",
            "ISO_8859-1:1987",
            "csISOLatin1",
            "iso-ir-100",
            "l1",
            "latin1"
        ];
    }

    override string toString() const @safe pure nothrow @nogc
    {
        return "ISO-8859-1";
    }

    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!(Latin1Char)(c);
    }

    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!(Latin1Char)(c);
    }

    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        // View the raw bytes as Latin-1 code units for the module-level encoder.
        auto units = cast(Latin1Char[]) buffer;
        return std.encoding.encode(c, units);
    }

    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto units = cast(const(Latin1Char)[]) s;
        immutable dchar decoded = std.encoding.decode(units);
        // Drop from s exactly as many bytes as the decoder consumed.
        s = s[$ - units.length .. $];
        return decoded;
    }

    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto units = cast(const(Latin1Char)[]) s;
        immutable dchar decoded = std.encoding.safeDecode(units);
        // Drop from s exactly as many bytes as the decoder consumed.
        s = s[$ - units.length .. $];
        return decoded;
    }

    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"?";
    }
}
2913 
/**
 EncodingScheme implementation for Latin-2 (ISO-8859-2)

 The names recognised by this scheme are:
                 "Latin 2",
                 "ISO-8859-2",
                 "ISO_8859-2",
                 "ISO_8859-2:1999",
                 "windows-28592"
 */
class EncodingSchemeLatin2 : EncodingScheme
{
    // The module constructor that used to register this class was moved to
    // std.internal.phobosinit.

    const
    {
        // Every name by which this scheme can be referred to.
        override string[] names() @safe pure nothrow
        {
            return [
                "Latin 2",
                "ISO-8859-2", "ISO_8859-2", "ISO_8859-2:1999",
                "windows-28592"
            ];
        }

        // The name used when the scheme is converted to a string.
        override string toString() @safe pure nothrow @nogc
        {
            return "ISO-8859-2";
        }

        // Forwards to the compile-time-typed canEncode for Latin2Char.
        override bool canEncode(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.canEncode!Latin2Char(c);
        }

        // Forwards to the compile-time-typed encodedLength for Latin2Char.
        override size_t encodedLength(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.encodedLength!Latin2Char(c);
        }

        // Encodes c into buffer, viewed as Latin2Char code units, and
        // returns whatever std.encoding.encode reports.
        override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
        {
            auto dst = cast(Latin2Char[]) buffer;
            return std.encoding.encode(c, dst);
        }

        // Runs std.encoding.decode on a Latin2Char view of s, then shrinks s
        // to as many trailing bytes as the view still holds.
        override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        {
            auto units = cast(const(Latin2Char)[]) s;
            dchar decoded = std.encoding.decode(units);
            s = s[s.length - units.length .. $];
            return decoded;
        }

        // Same as decode, but goes through std.encoding.safeDecode.
        override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        {
            auto units = cast(const(Latin2Char)[]) s;
            dchar decoded = std.encoding.safeDecode(units);
            s = s[s.length - units.length .. $];
            return decoded;
        }

        // The bytes of "?" serve as this scheme's replacement sequence.
        override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
        {
            return cast(immutable(ubyte)[]) "?";
        }
    }
}
2989 
/**
 EncodingScheme implementation for Windows-1250

 The names recognised by this scheme are:
                 "windows-1250"
 */
class EncodingSchemeWindows1250 : EncodingScheme
{
    // The module constructor that used to register this class was moved to
    // std.internal.phobosinit.

    const
    {
        // Every name by which this scheme can be referred to.
        override string[] names() @safe pure nothrow
        {
            return ["windows-1250"];
        }

        // The name used when the scheme is converted to a string.
        override string toString() @safe pure nothrow @nogc
        {
            return "windows-1250";
        }

        // Forwards to the compile-time-typed canEncode for Windows1250Char.
        override bool canEncode(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.canEncode!Windows1250Char(c);
        }

        // Forwards to the compile-time-typed encodedLength for Windows1250Char.
        override size_t encodedLength(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.encodedLength!Windows1250Char(c);
        }

        // Encodes c into buffer, viewed as Windows1250Char code units, and
        // returns whatever std.encoding.encode reports.
        override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
        {
            auto dst = cast(Windows1250Char[]) buffer;
            return std.encoding.encode(c, dst);
        }

        // Runs std.encoding.decode on a Windows1250Char view of s, then
        // shrinks s to as many trailing bytes as the view still holds.
        override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        {
            auto units = cast(const(Windows1250Char)[]) s;
            dchar decoded = std.encoding.decode(units);
            s = s[s.length - units.length .. $];
            return decoded;
        }

        // Same as decode, but goes through std.encoding.safeDecode.
        override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        {
            auto units = cast(const(Windows1250Char)[]) s;
            dchar decoded = std.encoding.safeDecode(units);
            s = s[s.length - units.length .. $];
            return decoded;
        }

        // The bytes of "?" serve as this scheme's replacement sequence.
        override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
        {
            return cast(immutable(ubyte)[]) "?";
        }
    }
}
3057 
/**
 EncodingScheme implementation for Windows-1252

 The names recognised by this scheme are:
                 "windows-1252"
 */
class EncodingSchemeWindows1252 : EncodingScheme
{
    // The module constructor that used to register this class was moved to
    // std.internal.phobosinit.

    const
    {
        // Every name by which this scheme can be referred to.
        override string[] names() @safe pure nothrow
        {
            return ["windows-1252"];
        }

        // The name used when the scheme is converted to a string.
        override string toString() @safe pure nothrow @nogc
        {
            return "windows-1252";
        }

        // Forwards to the compile-time-typed canEncode for Windows1252Char.
        override bool canEncode(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.canEncode!Windows1252Char(c);
        }

        // Forwards to the compile-time-typed encodedLength for Windows1252Char.
        override size_t encodedLength(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.encodedLength!Windows1252Char(c);
        }

        // Encodes c into buffer, viewed as Windows1252Char code units, and
        // returns whatever std.encoding.encode reports.
        override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
        {
            auto dst = cast(Windows1252Char[]) buffer;
            return std.encoding.encode(c, dst);
        }

        // Runs std.encoding.decode on a Windows1252Char view of s, then
        // shrinks s to as many trailing bytes as the view still holds.
        override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        {
            auto units = cast(const(Windows1252Char)[]) s;
            dchar decoded = std.encoding.decode(units);
            s = s[s.length - units.length .. $];
            return decoded;
        }

        // Same as decode, but goes through std.encoding.safeDecode.
        override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        {
            auto units = cast(const(Windows1252Char)[]) s;
            dchar decoded = std.encoding.safeDecode(units);
            s = s[s.length - units.length .. $];
            return decoded;
        }

        // The bytes of "?" serve as this scheme's replacement sequence.
        override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
        {
            return cast(immutable(ubyte)[]) "?";
        }
    }
}
3125 
/**
 EncodingScheme implementation for UTF-8

 The names recognised by this scheme are:
                 "UTF-8"
 */
class EncodingSchemeUtf8 : EncodingScheme
{
    // The module constructor that used to register this class was moved to
    // std.internal.phobosinit.

    const
    {
        // Every name by which this scheme can be referred to.
        override string[] names() @safe pure nothrow
        {
            return ["UTF-8"];
        }

        // The name used when the scheme is converted to a string.
        override string toString() @safe pure nothrow @nogc
        {
            return "UTF-8";
        }

        // Forwards to the compile-time-typed canEncode for char.
        override bool canEncode(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.canEncode!char(c);
        }

        // Forwards to the compile-time-typed encodedLength for char.
        override size_t encodedLength(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.encodedLength!char(c);
        }

        // Encodes c into buffer, viewed as char code units, and returns
        // whatever std.encoding.encode reports.
        override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
        {
            auto dst = cast(char[]) buffer;
            return std.encoding.encode(c, dst);
        }

        // Runs std.encoding.decode on a char view of s, then shrinks s to as
        // many trailing bytes as the view still holds.
        override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        {
            auto units = cast(const(char)[]) s;
            dchar decoded = std.encoding.decode(units);
            s = s[s.length - units.length .. $];
            return decoded;
        }

        // Same as decode, but goes through std.encoding.safeDecode.
        override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        {
            auto units = cast(const(char)[]) s;
            dchar decoded = std.encoding.safeDecode(units);
            s = s[s.length - units.length .. $];
            return decoded;
        }

        // The UTF-8 bytes of the literal "\uFFFD" serve as this scheme's
        // replacement sequence.
        override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
        {
            return cast(immutable(ubyte)[]) "\uFFFD";
        }
    }
}
3193 
/**
 EncodingScheme implementation for UTF-16 in native byte order

 The names recognised by this scheme are:
                 "UTF-16LE" (little-endian architecture only)
                 "UTF-16BE" (big-endian architecture only)
 */
class EncodingSchemeUtf16Native : EncodingScheme
{
    // The module constructor that used to register this class was moved to
    // std.internal.phobosinit.

    const
    {
        // The one name of this scheme, selected by the target's byte order.
        version (LittleEndian) { enum string NAME = "UTF-16LE"; }
        version (BigEndian)    { enum string NAME = "UTF-16BE"; }

        // Every name by which this scheme can be referred to.
        override string[] names() @safe pure nothrow
        {
            return [NAME];
        }

        // The name used when the scheme is converted to a string.
        override string toString() @safe pure nothrow @nogc
        {
            return NAME;
        }

        // Forwards to the compile-time-typed canEncode for wchar.
        override bool canEncode(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.canEncode!wchar(c);
        }

        // Forwards to the compile-time-typed encodedLength for wchar.
        override size_t encodedLength(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.encodedLength!wchar(c);
        }

        // Encodes c into buffer, viewed as wchar code units. The count from
        // std.encoding.encode is scaled by wchar.sizeof to yield bytes.
        override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
        {
            auto dst = cast(wchar[]) buffer;
            return wchar.sizeof * std.encoding.encode(c, dst);
        }

        // Runs std.encoding.decode on a wchar view of s, then shrinks s to
        // the bytes of the code units the view still holds.
        override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        in
        {
            // The byte count must be even so it maps onto whole wchars.
            assert((s.length & 1) == 0);
        }
        body
        {
            auto units = cast(const(wchar)[]) s;
            dchar decoded = std.encoding.decode(units);
            s = s[s.length - units.length * wchar.sizeof .. $];
            return decoded;
        }

        // Same as decode, but goes through std.encoding.safeDecode.
        override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        in
        {
            // The byte count must be even so it maps onto whole wchars.
            assert((s.length & 1) == 0);
        }
        body
        {
            auto units = cast(const(wchar)[]) s;
            dchar decoded = std.encoding.safeDecode(units);
            s = s[s.length - units.length * wchar.sizeof .. $];
            return decoded;
        }

        // The bytes of the UTF-16 literal "\uFFFD"w serve as this scheme's
        // replacement sequence.
        override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
        {
            return cast(immutable(ubyte)[]) "\uFFFD"w;
        }
    }
}
@system unittest
{
    // Three 16-bit values (410, 411, 412) laid out in the byte order of the
    // host, together with the scheme looked up by its lower-case name.
    version (LittleEndian)
    {
        ubyte[6] sample = [154,1, 155,1, 156,1];
        auto scheme = EncodingScheme.create("utf-16le");
    }
    version (BigEndian)
    {
        ubyte[6] sample = [1,154, 1,155, 1,156];
        auto scheme = EncodingScheme.create("utf-16be");
    }

    const(ubyte)[] remaining = cast(const(ubyte)[]) sample;
    dchar first = scheme.safeDecode(remaining);

    // The first value is returned and two of the six bytes are gone.
    assert(first == 410);
    assert(remaining.length == 4);
}
3289 
/**
 EncodingScheme implementation for UTF-32 in native byte order

 The names recognised by this scheme are:
                 "UTF-32LE" (little-endian architecture only)
                 "UTF-32BE" (big-endian architecture only)
 */
class EncodingSchemeUtf32Native : EncodingScheme
{
    // The module constructor that used to register this class was moved to
    // std.internal.phobosinit.

    const
    {
        // The one name of this scheme, selected by the target's byte order.
        version (LittleEndian) { enum string NAME = "UTF-32LE"; }
        version (BigEndian)    { enum string NAME = "UTF-32BE"; }

        // Every name by which this scheme can be referred to.
        override string[] names() @safe pure nothrow
        {
            return [NAME];
        }

        // The name used when the scheme is converted to a string.
        override string toString() @safe pure nothrow @nogc
        {
            return NAME;
        }

        // Forwards to the compile-time-typed canEncode for dchar.
        override bool canEncode(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.canEncode!dchar(c);
        }

        // Forwards to the compile-time-typed encodedLength for dchar.
        override size_t encodedLength(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.encodedLength!dchar(c);
        }

        // Encodes c into buffer, viewed as dchar code units. The count from
        // std.encoding.encode is scaled by dchar.sizeof to yield bytes.
        override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
        {
            auto dst = cast(dchar[]) buffer;
            return dchar.sizeof * std.encoding.encode(c, dst);
        }

        // Runs std.encoding.decode on a dchar view of s, then shrinks s to
        // the bytes of the code units the view still holds.
        override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        in
        {
            // The byte count must be a multiple of four (whole dchars).
            assert((s.length & 3) == 0);
        }
        body
        {
            auto units = cast(const(dchar)[]) s;
            dchar decoded = std.encoding.decode(units);
            s = s[s.length - units.length * dchar.sizeof .. $];
            return decoded;
        }

        // Same as decode, but goes through std.encoding.safeDecode.
        override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        in
        {
            // The byte count must be a multiple of four (whole dchars).
            assert((s.length & 3) == 0);
        }
        body
        {
            auto units = cast(const(dchar)[]) s;
            dchar decoded = std.encoding.safeDecode(units);
            s = s[s.length - units.length * dchar.sizeof .. $];
            return decoded;
        }

        // The bytes of the UTF-32 literal "\uFFFD"d serve as this scheme's
        // replacement sequence.
        override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
        {
            return cast(immutable(ubyte)[]) "\uFFFD"d;
        }
    }
}
@system unittest
{
    // Three 32-bit values (410, 411, 412) laid out in the byte order of the
    // host, together with the scheme looked up by its lower-case name.
    version (LittleEndian)
    {
        ubyte[12] sample = [154,1,0,0, 155,1,0,0, 156,1,0,0];
        auto scheme = EncodingScheme.create("utf-32le");
    }
    version (BigEndian)
    {
        ubyte[12] sample = [0,0,1,154, 0,0,1,155, 0,0,1,156];
        auto scheme = EncodingScheme.create("utf-32be");
    }

    const(ubyte)[] remaining = cast(const(ubyte)[]) sample;
    dchar first = scheme.safeDecode(remaining);

    // The first value is returned and four of the twelve bytes are gone.
    assert(first == 410);
    assert(remaining.length == 8);
}
3385 
3386 //=============================================================================
3387 
3388 
3389 // Helper functions
version(unittest)3390 version (unittest)
3391 {
3392     void transcodeReverse(Src,Dst)(immutable(Src)[] s, out immutable(Dst)[] r)
3393     {
3394         static if (is(Src == Dst))
3395         {
3396             return s;
3397         }
3398         else static if (is(Src == AsciiChar))
3399         {
3400             transcodeReverse!(char,Dst)(cast(string) s,r);
3401         }
3402         else
3403         {
3404             foreach_reverse (d;codePoints(s))
3405             {
3406                 foreach_reverse (c;codeUnits!(Dst)(d))
3407                 {
3408                     r = c ~ r;
3409                 }
3410             }
3411         }
3412     }
3413 
3414     string makeReadable(string s)
3415     {
3416         string r = "\"";
3417         foreach (char c;s)
3418         {
3419             if (c >= 0x20 && c < 0x80)
3420             {
3421                 r ~= c;
3422             }
3423             else
3424             {
3425                 r ~= "\\x";
3426                 r ~= toHexDigit(c >> 4);
3427                 r ~= toHexDigit(c);
3428             }
3429         }
3430         r ~= "\"";
3431         return r;
3432     }
3433 
3434     string makeReadable(wstring s)
3435     {
3436         string r = "\"";
3437         foreach (wchar c;s)
3438         {
3439             if (c >= 0x20 && c < 0x80)
3440             {
3441                 r ~= cast(char) c;
3442             }
3443             else
3444             {
3445                 r ~= "\\u";
3446                 r ~= toHexDigit(c >> 12);
3447                 r ~= toHexDigit(c >> 8);
3448                 r ~= toHexDigit(c >> 4);
3449                 r ~= toHexDigit(c);
3450             }
3451         }
3452         r ~= "\"w";
3453         return r;
3454     }
3455 
3456     string makeReadable(dstring s)
3457     {
3458         string r = "\"";
3459         foreach (dchar c; s)
3460         {
3461             if (c >= 0x20 && c < 0x80)
3462             {
3463                 r ~= cast(char) c;
3464             }
3465             else if (c < 0x10000)
3466             {
3467                 r ~= "\\u";
3468                 r ~= toHexDigit(c >> 12);
3469                 r ~= toHexDigit(c >> 8);
3470                 r ~= toHexDigit(c >> 4);
3471                 r ~= toHexDigit(c);
3472             }
3473             else
3474             {
3475                 r ~= "\\U00";
3476                 r ~= toHexDigit(c >> 20);
3477                 r ~= toHexDigit(c >> 16);
3478                 r ~= toHexDigit(c >> 12);
3479                 r ~= toHexDigit(c >> 8);
3480                 r ~= toHexDigit(c >> 4);
3481                 r ~= toHexDigit(c);
3482             }
3483         }
3484         r ~= "\"d";
3485         return r;
3486     }
3487 
3488     char toHexDigit(int n)
3489     {
3490         return "0123456789ABCDEF"[n & 0xF];
3491     }
3492 }
3493 
/** Definitions of common Byte Order Marks.
The elements of the $(D enum) can be used as indices into $(D bomTable) to get
the matching $(D BOMSeq).
*/
enum BOM
{
    none      = 0,  /// no BOM was found
    utf32be   = 1,  /// [0x00, 0x00, 0xFE, 0xFF]
    utf32le   = 2,  /// [0xFF, 0xFE, 0x00, 0x00]
    utf7      = 3,  /*  [0x2B, 0x2F, 0x76, 0x38]
                        [0x2B, 0x2F, 0x76, 0x39],
                        [0x2B, 0x2F, 0x76, 0x2B],
                        [0x2B, 0x2F, 0x76, 0x2F],
                        [0x2B, 0x2F, 0x76, 0x38, 0x2D]
                    */
    // bomTable holds five utf7 entries (indices 3 through 7), so the next
    // member continues at 8 to stay aligned with the table.
    utf1      = 8,  /// [0xF7, 0x64, 0x4C]
    utfebcdic = 9,  /// [0xDD, 0x73, 0x66, 0x73]
    scsu      = 10, /// [0x0E, 0xFE, 0xFF]
    bocu1     = 11, /// [0xFB, 0xEE, 0x28]
    gb18030   = 12, /// [0x84, 0x31, 0x95, 0x33]
    utf8      = 13, /// [0xEF, 0xBB, 0xBF]
    utf16be   = 14, /// [0xFE, 0xFF]
    utf16le   = 15  /// [0xFF, 0xFE]
}
3518 
/// The type stored inside $(D bomTable): a tuple pairing a $(D BOM) value,
/// named $(D schema), with its byte sequence, named $(D sequence).
alias BOMSeq = Tuple!(BOM, "schema", ubyte[], "sequence");
3521 
/** Mapping of a byte sequence to $(B Byte Order Mark (BOM))
*/
// Entry order matters: getBOM returns the first entry whose sequence is a
// prefix of the input, so a sequence that extends another one has to come
// first (the utf32le entry precedes utf16le, and the five-byte utf7 entry
// precedes the four-byte one it starts with).
immutable bomTable = [
    BOMSeq(BOM.none, null),
    BOMSeq(BOM.utf32be, cast(ubyte[])([0x00, 0x00, 0xFE, 0xFF])),
    BOMSeq(BOM.utf32le, cast(ubyte[])([0xFF, 0xFE, 0x00, 0x00])),
    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x39])),
    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x2B])),
    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x2F])),
    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x38, 0x2D])),
    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x38])),
    BOMSeq(BOM.utf1, cast(ubyte[])([0xF7, 0x64, 0x4C])),
    BOMSeq(BOM.utfebcdic, cast(ubyte[])([0xDD, 0x73, 0x66, 0x73])),
    BOMSeq(BOM.scsu, cast(ubyte[])([0x0E, 0xFE, 0xFF])),
    BOMSeq(BOM.bocu1, cast(ubyte[])([0xFB, 0xEE, 0x28])),
    BOMSeq(BOM.gb18030, cast(ubyte[])([0x84, 0x31, 0x95, 0x33])),
    BOMSeq(BOM.utf8, cast(ubyte[])([0xEF, 0xBB, 0xBF])),
    BOMSeq(BOM.utf16be, cast(ubyte[])([0xFE, 0xFF])),
    BOMSeq(BOM.utf16le, cast(ubyte[])([0xFF, 0xFE]))
];
3542 
/** Looks up the $(D BOMSeq) that the given $(D input) starts with.
When $(D input) starts with none of the known $(D BOM) sequences, the
$(D BOMSeq) for $(D BOM.none) is returned. The $(D BOM) sequence at the
beginning of the range is not consumed from the passed range. If you pass a
reference type range, make sure that $(D save) creates a deep copy.

Params:
    input = The sequence to check for the $(D BOM)

Returns:
    the found $(D BOMSeq) corresponding to the passed $(D input).
*/
immutable(BOMSeq) getBOM(Range)(Range input)
if (isForwardRange!Range && is(Unqual!(ElementType!Range) == ubyte))
{
    import std.algorithm.searching : startsWith;

    // Entry 0 is the "no BOM" entry; it is the fallback, not a candidate.
    for (size_t pos = 1; pos < bomTable.length; ++pos)
    {
        // Each candidate is tested against a fresh copy of the input.
        if (!startsWith(input.save, bomTable[pos].sequence))
        {
            continue;
        }
        return bomTable[pos];
    }

    return bomTable[0];
}
3569 
///
@system unittest
{
    import std.format : format;

    // A UTF-32 string whose first code point is U+FEFF.
    auto text = dchar(0x0000FEFF) ~ "Hello World"d;

    // Viewed as raw bytes, which UTF-32 BOM is found depends on the byte
    // order of the host.
    auto found = getBOM(cast(ubyte[]) text);

    version (BigEndian)
        enum expected = BOM.utf32be;
    else
        enum expected = BOM.utf32le;

    assert(found.schema == expected, format("%s", found.schema));
}
3587 
@system unittest
{
    import std.format : format;

    // Every table entry, followed by some payload, must be detected as the
    // schema of that entry.
    foreach (pos, entry; bomTable)
    {
        auto data = entry.sequence ~ cast(ubyte[])"hello world";
        auto found = getBOM(data);
        assert(found.schema == bomTable[pos].schema);

        // Positions 4 through 7 hold the additional utf7 sequences; the
        // stricter checks below are skipped for them.
        const bool extraUtf7 = pos >= 4 && pos <= 7;
        if (!extraUtf7)
        {
            assert(found.schema == BOM.init + pos);
            assert(found.sequence == entry.sequence);
        }
    }
}
3605 
@safe pure unittest
{
    // A range over bytes that is not an array, used to drive getBOM through
    // the range primitives.
    struct BOMInputRange
    {
        ubyte[] arr;

        @property bool empty()
        {
            return arr.empty;
        }

        @property ubyte front()
        {
            return arr.front;
        }

        void popFront()
        {
            arr = arr[1 .. $];
        }

        @property typeof(this) save()
        {
            return this;
        }
    }

    static assert( isInputRange!BOMInputRange);
    static assert(!isArray!BOMInputRange);

    ubyte[] padding = [0,0,0,0];

    foreach (entry; bomTable[1 .. $])
    {
        // A range holding exactly the BOM: the schema is recognised and the
        // range passed in still holds the full sequence afterwards.
        {
            auto source = BOMInputRange(entry.sequence.dup);
            auto found = getBOM(source);

            assert(found.schema == entry.schema);
            assert(source.arr == entry.sequence);
        }

        // Only the first byte of the BOM followed by zero padding: nothing
        // may be recognised.
        {
            auto truncated = entry.sequence[0 .. 1].dup ~ padding;
            size_t lengthBefore = truncated.length;
            assert(lengthBefore - 4 < entry.sequence.length);

            auto source = BOMInputRange(truncated.dup);
            auto found = getBOM(source);

            assert(found.schema == BOM.none);
            assert(truncated.length == lengthBefore);
        }
    }
}
3660 
/** Constant defining a fully decoded BOM: the code point U+FEFF */
enum dchar utfBOM = 0xfeff;
3663