1 // Locale support (codecvt) -*- C++ -*-
2 
3 // Copyright (C) 2015-2018 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library.  This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 #include <codecvt>
26 #include <cstring>		// std::memcpy, std::memcmp
27 #include <bits/stl_algobase.h>	// std::min
28 
29 #ifdef _GLIBCXX_USE_C99_STDINT_TR1
30 namespace std _GLIBCXX_VISIBILITY(default)
31 {
32 _GLIBCXX_BEGIN_NAMESPACE_VERSION
33 
34   // The standard doesn't define these operators, which is annoying.
35   static underlying_type<codecvt_mode>::type
36   to_integer(codecvt_mode m)
37   { return static_cast<underlying_type<codecvt_mode>::type>(m); }
38 
39   static codecvt_mode& operator&=(codecvt_mode& m, codecvt_mode n)
40   { return m = codecvt_mode(to_integer(m) & to_integer(n)); }
41 
42   static codecvt_mode& operator|=(codecvt_mode& m, codecvt_mode n)
43   { return m = codecvt_mode(to_integer(m) | to_integer(n)); }
44 
45   static codecvt_mode operator~(codecvt_mode m)
46   { return codecvt_mode(~to_integer(m)); }
47 
48 namespace
49 {
50   // Largest code point that fits in a single UTF-16 code unit.
51   const char32_t max_single_utf16_unit = 0xFFFF;
52 
53   const char32_t max_code_point = 0x10FFFF;
54 
55   // The functions below rely on maxcode < incomplete_mb_character
56   // (which is enforced by the codecvt_utf* classes on construction).
57   const char32_t incomplete_mb_character = char32_t(-2);
58   const char32_t invalid_mb_sequence = char32_t(-1);
59 
60   // Utility type for reading and writing code units of type Elem from
61   // a range defined by a pair of pointers.
62   template<typename Elem, bool Aligned = true>
63     struct range
64     {
65       Elem* next;
66       Elem* end;
67 
68       // Write a code unit.
69       range& operator=(Elem e)
70       {
71 	*next++ = e;
72 	return *this;
73       }
74 
75       // Read the next code unit.
76       Elem operator*() const { return *next; }
77 
78       // Read the Nth code unit.
79       Elem operator[](size_t n) const { return next[n]; }
80 
81       // Move to the next code unit.
82       range& operator++()
83       {
84 	++next;
85 	return *this;
86       }
87 
88       // Move to the Nth code unit.
89       range& operator+=(size_t n)
90       {
91 	next += n;
92 	return *this;
93       }
94 
95       // The number of code units remaining.
96       size_t size() const { return end - next; }
97 
98       // The number of bytes remaining.
99       size_t nbytes() const { return (const char*)end - (const char*)next; }
100     };
101 
102   // This specialization is used when accessing char16_t values through
103   // pointers to char, which might not be correctly aligned for char16_t.
104   template<typename Elem>
105     struct range<Elem, false>
106     {
107       using value_type = typename remove_const<Elem>::type;
108 
109       using char_pointer = typename
110 	conditional<is_const<Elem>::value, const char*, char*>::type;
111 
112       char_pointer next;
113       char_pointer end;
114 
115       // Write a code unit.
116       range& operator=(Elem e)
117       {
118 	memcpy(next, &e, sizeof(Elem));
119 	++*this;
120 	return *this;
121       }
122 
123       // Read the next code unit.
124       Elem operator*() const
125       {
126 	value_type e;
127 	memcpy(&e, next, sizeof(Elem));
128 	return e;
129       }
130 
131       // Read the Nth code unit.
132       Elem operator[](size_t n) const
133       {
134 	value_type e;
135 	memcpy(&e, next + n * sizeof(Elem), sizeof(Elem));
136 	return e;
137       }
138 
139       // Move to the next code unit.
140       range& operator++()
141       {
142 	next += sizeof(Elem);
143 	return *this;
144       }
145 
146       // Move to the Nth code unit.
147       range& operator+=(size_t n)
148       {
149 	next += n * sizeof(Elem);
150 	return *this;
151       }
152 
153       // The number of code units remaining.
154       size_t size() const { return nbytes() / sizeof(Elem); }
155 
156       // The number of bytes remaining.
157       size_t nbytes() const { return end - next; }
158     };
159 
160   // Multibyte sequences can have "header" consisting of Byte Order Mark
161   const unsigned char utf8_bom[3] = { 0xEF, 0xBB, 0xBF };
162   const unsigned char utf16_bom[2] = { 0xFE, 0xFF };
163   const unsigned char utf16le_bom[2] = { 0xFF, 0xFE };
164 
165   // Write a BOM (space permitting).
166   template<typename C, bool A, size_t N>
167     bool
168     write_bom(range<C, A>& to, const unsigned char (&bom)[N])
169     {
170       static_assert( (N / sizeof(C)) != 0, "" );
171       static_assert( (N % sizeof(C)) == 0, "" );
172 
173       if (to.nbytes() < N)
174 	return false;
175       memcpy(to.next, bom, N);
176       to += (N / sizeof(C));
177       return true;
178     }
179 
180   // Try to read a BOM.
181   template<typename C, bool A, size_t N>
182     bool
183     read_bom(range<C, A>& from, const unsigned char (&bom)[N])
184     {
185       static_assert( (N / sizeof(C)) != 0, "" );
186       static_assert( (N % sizeof(C)) == 0, "" );
187 
188       if (from.nbytes() >= N && !memcmp(from.next, bom, N))
189 	{
190 	  from += (N / sizeof(C));
191 	  return true;
192 	}
193       return false;
194     }
195 
196   // If generate_header is set in mode write out UTF-8 BOM.
197   bool
198   write_utf8_bom(range<char>& to, codecvt_mode mode)
199   {
200     if (mode & generate_header)
201       return write_bom(to, utf8_bom);
202     return true;
203   }
204 
205   // If generate_header is set in mode write out the UTF-16 BOM indicated
206   // by whether little_endian is set in mode.
207   template<bool Aligned>
208   bool
209   write_utf16_bom(range<char16_t, Aligned>& to, codecvt_mode mode)
210   {
211     if (mode & generate_header)
212     {
213       if (mode & little_endian)
214 	return write_bom(to, utf16le_bom);
215       else
216 	return write_bom(to, utf16_bom);
217     }
218     return true;
219   }
220 
221   // If consume_header is set in mode update from.next to after any BOM.
222   void
223   read_utf8_bom(range<const char>& from, codecvt_mode mode)
224   {
225     if (mode & consume_header)
226       read_bom(from, utf8_bom);
227   }
228 
229   // If consume_header is not set in mode, no effects.
230   // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then:
231   // - if the UTF-16BE BOM was found unset little_endian in mode, or
232   // - if the UTF-16LE BOM was found set little_endian in mode.
233   template<bool Aligned>
234   void
235   read_utf16_bom(range<const char16_t, Aligned>& from, codecvt_mode& mode)
236   {
237     if (mode & consume_header)
238       {
239 	if (read_bom(from, utf16_bom))
240 	  mode &= ~little_endian;
241 	else if (read_bom(from, utf16le_bom))
242 	  mode |= little_endian;
243       }
244   }
245 
246   // Read a codepoint from a UTF-8 multibyte sequence.
247   // Updates from.next if the codepoint is not greater than maxcode.
248   // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
249   char32_t
250   read_utf8_code_point(range<const char>& from, unsigned long maxcode)
251   {
252     const size_t avail = from.size();
253     if (avail == 0)
254       return incomplete_mb_character;
255     unsigned char c1 = from[0];
256     // https://en.wikipedia.org/wiki/UTF-8#Sample_code
257     if (c1 < 0x80)
258     {
259       ++from;
260       return c1;
261     }
262     else if (c1 < 0xC2) // continuation or overlong 2-byte sequence
263       return invalid_mb_sequence;
264     else if (c1 < 0xE0) // 2-byte sequence
265     {
266       if (avail < 2)
267 	return incomplete_mb_character;
268       unsigned char c2 = from[1];
269       if ((c2 & 0xC0) != 0x80)
270 	return invalid_mb_sequence;
271       char32_t c = (c1 << 6) + c2 - 0x3080;
272       if (c <= maxcode)
273 	from += 2;
274       return c;
275     }
276     else if (c1 < 0xF0) // 3-byte sequence
277     {
278       if (avail < 3)
279 	return incomplete_mb_character;
280       unsigned char c2 = from[1];
281       if ((c2 & 0xC0) != 0x80)
282 	return invalid_mb_sequence;
283       if (c1 == 0xE0 && c2 < 0xA0) // overlong
284 	return invalid_mb_sequence;
285       unsigned char c3 = from[2];
286       if ((c3 & 0xC0) != 0x80)
287 	return invalid_mb_sequence;
288       char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080;
289       if (c <= maxcode)
290 	from += 3;
291       return c;
292     }
293     else if (c1 < 0xF5) // 4-byte sequence
294     {
295       if (avail < 4)
296 	return incomplete_mb_character;
297       unsigned char c2 = from[1];
298       if ((c2 & 0xC0) != 0x80)
299 	return invalid_mb_sequence;
300       if (c1 == 0xF0 && c2 < 0x90) // overlong
301 	return invalid_mb_sequence;
302       if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF
303       return invalid_mb_sequence;
304       unsigned char c3 = from[2];
305       if ((c3 & 0xC0) != 0x80)
306 	return invalid_mb_sequence;
307       unsigned char c4 = from[3];
308       if ((c4 & 0xC0) != 0x80)
309 	return invalid_mb_sequence;
310       char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080;
311       if (c <= maxcode)
312 	from += 4;
313       return c;
314     }
315     else // > U+10FFFF
316       return invalid_mb_sequence;
317   }
318 
319   bool
320   write_utf8_code_point(range<char>& to, char32_t code_point)
321   {
322     if (code_point < 0x80)
323       {
324 	if (to.size() < 1)
325 	  return false;
326 	to = code_point;
327       }
328     else if (code_point <= 0x7FF)
329       {
330 	if (to.size() < 2)
331 	  return false;
332 	to = (code_point >> 6) + 0xC0;
333 	to = (code_point & 0x3F) + 0x80;
334       }
335     else if (code_point <= 0xFFFF)
336       {
337 	if (to.size() < 3)
338 	  return false;
339 	to = (code_point >> 12) + 0xE0;
340 	to = ((code_point >> 6) & 0x3F) + 0x80;
341 	to = (code_point & 0x3F) + 0x80;
342       }
343     else if (code_point <= 0x10FFFF)
344       {
345 	if (to.size() < 4)
346 	  return false;
347 	to = (code_point >> 18) + 0xF0;
348 	to = ((code_point >> 12) & 0x3F) + 0x80;
349 	to = ((code_point >> 6) & 0x3F) + 0x80;
350 	to = (code_point & 0x3F) + 0x80;
351       }
352     else
353       return false;
354     return true;
355   }
356 
357   inline char16_t
358   adjust_byte_order(char16_t c, codecvt_mode mode)
359   {
360 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
361     return (mode & little_endian) ? __builtin_bswap16(c) : c;
362 #else
363     return (mode & little_endian) ? c : __builtin_bswap16(c);
364 #endif
365   }
366 
367   // Return true if c is a high-surrogate (aka leading) code point.
368   inline bool
369   is_high_surrogate(char32_t c)
370   {
371     return c >= 0xD800 && c <= 0xDBFF;
372   }
373 
374   // Return true if c is a low-surrogate (aka trailing) code point.
375   inline bool
376   is_low_surrogate(char32_t c)
377   {
378     return c >= 0xDC00 && c <= 0xDFFF;
379   }
380 
381   inline char32_t
382   surrogate_pair_to_code_point(char32_t high, char32_t low)
383   {
384     return (high << 10) + low - 0x35FDC00;
385   }
386 
387   // Read a codepoint from a UTF-16 multibyte sequence.
388   // The sequence's endianness is indicated by (mode & little_endian).
389   // Updates from.next if the codepoint is not greater than maxcode.
390   // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
391   template<bool Aligned>
392     char32_t
393     read_utf16_code_point(range<const char16_t, Aligned>& from,
394 			  unsigned long maxcode, codecvt_mode mode)
395     {
396       const size_t avail = from.size();
397       if (avail == 0)
398 	return incomplete_mb_character;
399       int inc = 1;
400       char32_t c = adjust_byte_order(from[0], mode);
401       if (is_high_surrogate(c))
402 	{
403 	  if (avail < 2)
404 	    return incomplete_mb_character;
405 	  const char16_t c2 = adjust_byte_order(from[1], mode);
406 	  if (is_low_surrogate(c2))
407 	    {
408 	      c = surrogate_pair_to_code_point(c, c2);
409 	      inc = 2;
410 	    }
411 	  else
412 	    return invalid_mb_sequence;
413 	}
414       else if (is_low_surrogate(c))
415 	return invalid_mb_sequence;
416       if (c <= maxcode)
417 	from += inc;
418       return c;
419     }
420 
421   template<typename C, bool A>
422   bool
423   write_utf16_code_point(range<C, A>& to, char32_t codepoint, codecvt_mode mode)
424   {
425     static_assert(sizeof(C) >= 2, "a code unit must be at least 16-bit");
426 
427     if (codepoint <= max_single_utf16_unit)
428       {
429 	if (to.size() > 0)
430 	  {
431 	    to = adjust_byte_order(codepoint, mode);
432 	    return true;
433 	  }
434       }
435     else if (to.size() > 1)
436       {
437 	// Algorithm from http://www.unicode.org/faq/utf_bom.html#utf16-4
438 	const char32_t LEAD_OFFSET = 0xD800 - (0x10000 >> 10);
439 	char16_t lead = LEAD_OFFSET + (codepoint >> 10);
440 	char16_t trail = 0xDC00 + (codepoint & 0x3FF);
441 	to = adjust_byte_order(lead, mode);
442 	to = adjust_byte_order(trail, mode);
443 	return true;
444       }
445     return false;
446   }
447 
448   // utf8 -> ucs4
449   codecvt_base::result
450   ucs4_in(range<const char>& from, range<char32_t>& to,
451           unsigned long maxcode = max_code_point, codecvt_mode mode = {})
452   {
453     read_utf8_bom(from, mode);
454     while (from.size() && to.size())
455       {
456 	const char32_t codepoint = read_utf8_code_point(from, maxcode);
457 	if (codepoint == incomplete_mb_character)
458 	  return codecvt_base::partial;
459 	if (codepoint > maxcode)
460 	  return codecvt_base::error;
461 	to = codepoint;
462       }
463     return from.size() ? codecvt_base::partial : codecvt_base::ok;
464   }
465 
466   // ucs4 -> utf8
467   codecvt_base::result
468   ucs4_out(range<const char32_t>& from, range<char>& to,
469            unsigned long maxcode = max_code_point, codecvt_mode mode = {})
470   {
471     if (!write_utf8_bom(to, mode))
472       return codecvt_base::partial;
473     while (from.size())
474       {
475 	const char32_t c = from[0];
476 	if (c > maxcode)
477 	  return codecvt_base::error;
478 	if (!write_utf8_code_point(to, c))
479 	  return codecvt_base::partial;
480 	++from;
481       }
482     return codecvt_base::ok;
483   }
484 
485   // utf16 -> ucs4
486   codecvt_base::result
487   ucs4_in(range<const char16_t, false>& from, range<char32_t>& to,
488           unsigned long maxcode = max_code_point, codecvt_mode mode = {})
489   {
490     read_utf16_bom(from, mode);
491     while (from.size() && to.size())
492       {
493 	const char32_t codepoint = read_utf16_code_point(from, maxcode, mode);
494 	if (codepoint == incomplete_mb_character)
495 	  return codecvt_base::partial;
496 	if (codepoint > maxcode)
497 	  return codecvt_base::error;
498 	to = codepoint;
499       }
500     return from.size() ? codecvt_base::partial : codecvt_base::ok;
501   }
502 
503   // ucs4 -> utf16
504   codecvt_base::result
505   ucs4_out(range<const char32_t>& from, range<char16_t, false>& to,
506            unsigned long maxcode = max_code_point, codecvt_mode mode = {})
507   {
508     if (!write_utf16_bom(to, mode))
509       return codecvt_base::partial;
510     while (from.size())
511       {
512 	const char32_t c = from[0];
513 	if (c > maxcode)
514 	  return codecvt_base::error;
515 	if (!write_utf16_code_point(to, c, mode))
516 	  return codecvt_base::partial;
517 	++from;
518       }
519     return codecvt_base::ok;
520   }
521 
522   // Flag indicating whether to process UTF-16 or UCS2
523   enum class surrogates { allowed, disallowed };
524 
525   // utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed)
526   template<typename C>
527   codecvt_base::result
528   utf16_in(range<const char>& from, range<C>& to,
529 	   unsigned long maxcode = max_code_point, codecvt_mode mode = {},
530 	   surrogates s = surrogates::allowed)
531   {
532     read_utf8_bom(from, mode);
533     while (from.size() && to.size())
534       {
535 	auto orig = from;
536 	const char32_t codepoint = read_utf8_code_point(from, maxcode);
537 	if (codepoint == incomplete_mb_character)
538 	  {
539 	    if (s == surrogates::allowed)
540 	      return codecvt_base::partial;
541 	    else
542 	      return codecvt_base::error; // No surrogates in UCS2
543 	  }
544 	if (codepoint > maxcode)
545 	  return codecvt_base::error;
546 	if (!write_utf16_code_point(to, codepoint, mode))
547 	  {
548 	    from = orig; // rewind to previous position
549 	    return codecvt_base::partial;
550 	  }
551       }
552     return codecvt_base::ok;
553   }
554 
555   // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed)
556   template<typename C>
557   codecvt_base::result
558   utf16_out(range<const C>& from, range<char>& to,
559 	    unsigned long maxcode = max_code_point, codecvt_mode mode = {},
560 	    surrogates s = surrogates::allowed)
561   {
562     if (!write_utf8_bom(to, mode))
563       return codecvt_base::partial;
564     while (from.size())
565       {
566 	char32_t c = from[0];
567 	int inc = 1;
568 	if (is_high_surrogate(c))
569 	  {
570 	    if (s == surrogates::disallowed)
571 	      return codecvt_base::error; // No surrogates in UCS-2
572 
573 	    if (from.size() < 2)
574 	      return codecvt_base::ok; // stop converting at this point
575 
576 	    const char32_t c2 = from[1];
577 	    if (is_low_surrogate(c2))
578 	      {
579 		c = surrogate_pair_to_code_point(c, c2);
580 		inc = 2;
581 	      }
582 	    else
583 	      return codecvt_base::error;
584 	  }
585 	else if (is_low_surrogate(c))
586 	  return codecvt_base::error;
587 	if (c > maxcode)
588 	  return codecvt_base::error;
589 	if (!write_utf8_code_point(to, c))
590 	  return codecvt_base::partial;
591 	from += inc;
592       }
593     return codecvt_base::ok;
594   }
595 
596   // return pos such that [begin,pos) is valid UTF-16 string no longer than max
597   const char*
598   utf16_span(const char* begin, const char* end, size_t max,
599 	     char32_t maxcode = max_code_point, codecvt_mode mode = {})
600   {
601     range<const char> from{ begin, end };
602     read_utf8_bom(from, mode);
603     size_t count = 0;
604     while (count+1 < max)
605       {
606 	char32_t c = read_utf8_code_point(from, maxcode);
607 	if (c > maxcode)
608 	  return from.next;
609 	else if (c > max_single_utf16_unit)
610 	  ++count;
611 	++count;
612       }
613     if (count+1 == max) // take one more character if it fits in a single unit
614       read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode));
615     return from.next;
616   }
617 
618   // utf8 -> ucs2
619   codecvt_base::result
620   ucs2_in(range<const char>& from, range<char16_t>& to,
621 	  char32_t maxcode = max_code_point, codecvt_mode mode = {})
622   {
623     // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
624     maxcode = std::min(max_single_utf16_unit, maxcode);
625     return utf16_in(from, to, maxcode, mode, surrogates::disallowed);
626   }
627 
628   // ucs2 -> utf8
629   codecvt_base::result
630   ucs2_out(range<const char16_t>& from, range<char>& to,
631 	   char32_t maxcode = max_code_point, codecvt_mode mode = {})
632   {
633     // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
634     maxcode = std::min(max_single_utf16_unit, maxcode);
635     return utf16_out(from, to, maxcode, mode, surrogates::disallowed);
636   }
637 
638   // ucs2 -> utf16
639   codecvt_base::result
640   ucs2_out(range<const char16_t>& from, range<char16_t, false>& to,
641 	   char32_t maxcode = max_code_point, codecvt_mode mode = {})
642   {
643     if (!write_utf16_bom(to, mode))
644       return codecvt_base::partial;
645     while (from.size() && to.size())
646       {
647 	char16_t c = from[0];
648 	if (is_high_surrogate(c))
649 	  return codecvt_base::error;
650 	if (c > maxcode)
651 	  return codecvt_base::error;
652 	to = adjust_byte_order(c, mode);
653 	++from;
654       }
655     return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
656   }
657 
658   // utf16 -> ucs2
659   codecvt_base::result
660   ucs2_in(range<const char16_t, false>& from, range<char16_t>& to,
661 	  char32_t maxcode = max_code_point, codecvt_mode mode = {})
662   {
663     read_utf16_bom(from, mode);
664     // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
665     maxcode = std::min(max_single_utf16_unit, maxcode);
666     while (from.size() && to.size())
667       {
668 	const char32_t c = read_utf16_code_point(from, maxcode, mode);
669 	if (c == incomplete_mb_character)
670 	  return codecvt_base::error; // UCS-2 only supports single units.
671 	if (c > maxcode)
672 	  return codecvt_base::error;
673 	to = c;
674       }
675     return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
676   }
677 
678   const char16_t*
679   ucs2_span(range<const char16_t, false>& from, size_t max,
680             char32_t maxcode, codecvt_mode mode)
681   {
682     read_utf16_bom(from, mode);
683     // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
684     maxcode = std::min(max_single_utf16_unit, maxcode);
685     char32_t c = 0;
686     while (max-- && c <= maxcode)
687       c = read_utf16_code_point(from, maxcode, mode);
688     return reinterpret_cast<const char16_t*>(from.next);
689   }
690 
691   const char*
692   ucs2_span(const char* begin, const char* end, size_t max,
693             char32_t maxcode, codecvt_mode mode)
694   {
695     range<const char> from{ begin, end };
696     read_utf8_bom(from, mode);
697     // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
698     maxcode = std::min(max_single_utf16_unit, maxcode);
699     char32_t c = 0;
700     while (max-- && c <= maxcode)
701       c = read_utf8_code_point(from, maxcode);
702     return from.next;
703   }
704 
705   // return pos such that [begin,pos) is valid UCS-4 string no longer than max
706   const char*
707   ucs4_span(const char* begin, const char* end, size_t max,
708             char32_t maxcode = max_code_point, codecvt_mode mode = {})
709   {
710     range<const char> from{ begin, end };
711     read_utf8_bom(from, mode);
712     char32_t c = 0;
713     while (max-- && c <= maxcode)
714       c = read_utf8_code_point(from, maxcode);
715     return from.next;
716   }
717 
718   // return pos such that [begin,pos) is valid UCS-4 string no longer than max
719   const char16_t*
720   ucs4_span(range<const char16_t, false>& from, size_t max,
721             char32_t maxcode = max_code_point, codecvt_mode mode = {})
722   {
723     read_utf16_bom(from, mode);
724     char32_t c = 0;
725     while (max-- && c <= maxcode)
726       c = read_utf16_code_point(from, maxcode, mode);
727     return reinterpret_cast<const char16_t*>(from.next);
728   }
729 }
730 
731 // Define members of codecvt<char16_t, char, mbstate_t> specialization.
732 // Converts from UTF-8 to UTF-16.
733 
734 locale::id codecvt<char16_t, char, mbstate_t>::id;
735 
736 codecvt<char16_t, char, mbstate_t>::~codecvt() { }
737 
738 codecvt_base::result
739 codecvt<char16_t, char, mbstate_t>::
740 do_out(state_type&,
741        const intern_type* __from,
742        const intern_type* __from_end, const intern_type*& __from_next,
743        extern_type* __to, extern_type* __to_end,
744        extern_type*& __to_next) const
745 {
746   range<const char16_t> from{ __from, __from_end };
747   range<char> to{ __to, __to_end };
748   auto res = utf16_out(from, to);
749   __from_next = from.next;
750   __to_next = to.next;
751   return res;
752 }
753 
754 codecvt_base::result
755 codecvt<char16_t, char, mbstate_t>::
756 do_unshift(state_type&, extern_type* __to, extern_type*,
757 	   extern_type*& __to_next) const
758 {
759   __to_next = __to;
760   return noconv; // we don't use mbstate_t for the unicode facets
761 }
762 
763 codecvt_base::result
764 codecvt<char16_t, char, mbstate_t>::
765 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
766       const extern_type*& __from_next,
767       intern_type* __to, intern_type* __to_end,
768       intern_type*& __to_next) const
769 {
770   range<const char> from{ __from, __from_end };
771   range<char16_t> to{ __to, __to_end };
772 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
773   codecvt_mode mode = {};
774 #else
775   codecvt_mode mode = little_endian;
776 #endif
777   auto res = utf16_in(from, to, max_code_point, mode);
778   __from_next = from.next;
779   __to_next = to.next;
780   return res;
781 }
782 
783 int
784 codecvt<char16_t, char, mbstate_t>::do_encoding() const throw()
785 { return 0; } // UTF-8 is not a fixed-width encoding
786 
787 bool
788 codecvt<char16_t, char, mbstate_t>::do_always_noconv() const throw()
789 { return false; }
790 
791 int
792 codecvt<char16_t, char, mbstate_t>::
793 do_length(state_type&, const extern_type* __from,
794 	  const extern_type* __end, size_t __max) const
795 {
796   __end = utf16_span(__from, __end, __max);
797   return __end - __from;
798 }
799 
800 int
801 codecvt<char16_t, char, mbstate_t>::do_max_length() const throw()
802 {
803   // A single character (one or two UTF-16 code units) requires
804   // up to four UTF-8 code units.
805   return 4;
806 }
807 
808 // Define members of codecvt<char32_t, char, mbstate_t> specialization.
809 // Converts from UTF-8 to UTF-32 (aka UCS-4).
810 
811 locale::id codecvt<char32_t, char, mbstate_t>::id;
812 
813 codecvt<char32_t, char, mbstate_t>::~codecvt() { }
814 
815 codecvt_base::result
816 codecvt<char32_t, char, mbstate_t>::
817 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
818        const intern_type*& __from_next,
819        extern_type* __to, extern_type* __to_end,
820        extern_type*& __to_next) const
821 {
822   range<const char32_t> from{ __from, __from_end };
823   range<char> to{ __to, __to_end };
824   auto res = ucs4_out(from, to);
825   __from_next = from.next;
826   __to_next = to.next;
827   return res;
828 }
829 
830 codecvt_base::result
831 codecvt<char32_t, char, mbstate_t>::
832 do_unshift(state_type&, extern_type* __to, extern_type*,
833 	   extern_type*& __to_next) const
834 {
835   __to_next = __to;
836   return noconv;
837 }
838 
839 codecvt_base::result
840 codecvt<char32_t, char, mbstate_t>::
841 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
842       const extern_type*& __from_next,
843       intern_type* __to, intern_type* __to_end,
844       intern_type*& __to_next) const
845 {
846   range<const char> from{ __from, __from_end };
847   range<char32_t> to{ __to, __to_end };
848   auto res = ucs4_in(from, to);
849   __from_next = from.next;
850   __to_next = to.next;
851   return res;
852 }
853 
854 int
855 codecvt<char32_t, char, mbstate_t>::do_encoding() const throw()
856 { return 0; } // UTF-8 is not a fixed-width encoding
857 
858 bool
859 codecvt<char32_t, char, mbstate_t>::do_always_noconv() const throw()
860 { return false; }
861 
862 int
863 codecvt<char32_t, char, mbstate_t>::
864 do_length(state_type&, const extern_type* __from,
865 	  const extern_type* __end, size_t __max) const
866 {
867   __end = ucs4_span(__from, __end, __max);
868   return __end - __from;
869 }
870 
871 int
872 codecvt<char32_t, char, mbstate_t>::do_max_length() const throw()
873 {
874   // A single character (one UTF-32 code unit) requires
875   // up to 4 UTF-8 code units.
876   return 4;
877 }
878 
879 // Define members of codecvt_utf8<char16_t> base class implementation.
880 // Converts from UTF-8 to UCS-2.
881 
882 __codecvt_utf8_base<char16_t>::~__codecvt_utf8_base() { }
883 
884 codecvt_base::result
885 __codecvt_utf8_base<char16_t>::
886 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
887        const intern_type*& __from_next,
888        extern_type* __to, extern_type* __to_end,
889        extern_type*& __to_next) const
890 {
891   range<const char16_t> from{ __from, __from_end };
892   range<char> to{ __to, __to_end };
893   auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
894   __from_next = from.next;
895   __to_next = to.next;
896   return res;
897 }
898 
899 codecvt_base::result
900 __codecvt_utf8_base<char16_t>::
901 do_unshift(state_type&, extern_type* __to, extern_type*,
902 	   extern_type*& __to_next) const
903 {
904   __to_next = __to;
905   return noconv;
906 }
907 
908 codecvt_base::result
909 __codecvt_utf8_base<char16_t>::
910 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
911       const extern_type*& __from_next,
912       intern_type* __to, intern_type* __to_end,
913       intern_type*& __to_next) const
914 {
915   range<const char> from{ __from, __from_end };
916   range<char16_t> to{ __to, __to_end };
917   codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
918 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
919   mode = codecvt_mode(mode | little_endian);
920 #endif
921   auto res = ucs2_in(from, to, _M_maxcode, mode);
922   __from_next = from.next;
923   __to_next = to.next;
924   return res;
925 }
926 
927 int
928 __codecvt_utf8_base<char16_t>::do_encoding() const throw()
929 { return 0; } // UTF-8 is not a fixed-width encoding
930 
931 bool
932 __codecvt_utf8_base<char16_t>::do_always_noconv() const throw()
933 { return false; }
934 
935 int
936 __codecvt_utf8_base<char16_t>::
937 do_length(state_type&, const extern_type* __from,
938 	  const extern_type* __end, size_t __max) const
939 {
940   __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
941   return __end - __from;
942 }
943 
944 int
945 __codecvt_utf8_base<char16_t>::do_max_length() const throw()
946 {
947   // A single UCS-2 character requires up to three UTF-8 code units.
948   // (UCS-2 cannot represent characters that use four UTF-8 code units).
949   int max = 3;
950   if (_M_mode & consume_header)
951     max += sizeof(utf8_bom);
952   return max;
953 }
954 
955 // Define members of codecvt_utf8<char32_t> base class implementation.
956 // Converts from UTF-8 to UTF-32 (aka UCS-4).
957 
958 __codecvt_utf8_base<char32_t>::~__codecvt_utf8_base() { }
959 
960 codecvt_base::result
961 __codecvt_utf8_base<char32_t>::
962 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
963        const intern_type*& __from_next,
964        extern_type* __to, extern_type* __to_end,
965        extern_type*& __to_next) const
966 {
967   range<const char32_t> from{ __from, __from_end };
968   range<char> to{ __to, __to_end };
969   auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
970   __from_next = from.next;
971   __to_next = to.next;
972   return res;
973 }
974 
975 codecvt_base::result
976 __codecvt_utf8_base<char32_t>::
977 do_unshift(state_type&, extern_type* __to, extern_type*,
978 	   extern_type*& __to_next) const
979 {
980   __to_next = __to;
981   return noconv;
982 }
983 
984 codecvt_base::result
985 __codecvt_utf8_base<char32_t>::
986 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
987       const extern_type*& __from_next,
988       intern_type* __to, intern_type* __to_end,
989       intern_type*& __to_next) const
990 {
991   range<const char> from{ __from, __from_end };
992   range<char32_t> to{ __to, __to_end };
993   auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
994   __from_next = from.next;
995   __to_next = to.next;
996   return res;
997 }
998 
999 int
1000 __codecvt_utf8_base<char32_t>::do_encoding() const throw()
1001 { return 0; } // UTF-8 is not a fixed-width encoding
1002 
1003 bool
1004 __codecvt_utf8_base<char32_t>::do_always_noconv() const throw()
1005 { return false; }
1006 
1007 int
1008 __codecvt_utf8_base<char32_t>::
1009 do_length(state_type&, const extern_type* __from,
1010 	  const extern_type* __end, size_t __max) const
1011 {
1012   __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
1013   return __end - __from;
1014 }
1015 
1016 int
1017 __codecvt_utf8_base<char32_t>::do_max_length() const throw()
1018 {
1019   // A single UCS-4 character requires up to four UTF-8 code units.
1020   int max = 4;
1021   if (_M_mode & consume_header)
1022     max += sizeof(utf8_bom);
1023   return max;
1024 }
1025 
1026 #ifdef _GLIBCXX_USE_WCHAR_T
1027 
// The wchar_t specializations below reinterpret wchar_t buffers as
// char16_t or char32_t buffers, selected by __SIZEOF_WCHAR_T__.  Verify
// that the macro agrees with the real object sizes.
#if __SIZEOF_WCHAR_T__ == 2
static_assert(sizeof(wchar_t) == sizeof(char16_t),
	      "2-byte wchar_t must have the same size as char16_t");
#elif __SIZEOF_WCHAR_T__ == 4
static_assert(sizeof(wchar_t) == sizeof(char32_t),
	      "4-byte wchar_t must have the same size as char32_t");
#endif
1033 
1034 // Define members of codecvt_utf8<wchar_t> base class implementation.
1035 // Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
1036 
1037 __codecvt_utf8_base<wchar_t>::~__codecvt_utf8_base() { }
1038 
1039 codecvt_base::result
1040 __codecvt_utf8_base<wchar_t>::
1041 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1042        const intern_type*& __from_next,
1043        extern_type* __to, extern_type* __to_end,
1044        extern_type*& __to_next) const
1045 {
1046   range<char> to{ __to, __to_end };
1047 #if __SIZEOF_WCHAR_T__ == 2
1048   range<const char16_t> from{
1049     reinterpret_cast<const char16_t*>(__from),
1050     reinterpret_cast<const char16_t*>(__from_end)
1051   };
1052   auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1053 #elif __SIZEOF_WCHAR_T__ == 4
1054   range<const char32_t> from{
1055     reinterpret_cast<const char32_t*>(__from),
1056     reinterpret_cast<const char32_t*>(__from_end)
1057   };
1058   auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1059 #else
1060   return codecvt_base::error;
1061 #endif
1062   __from_next = reinterpret_cast<const wchar_t*>(from.next);
1063   __to_next = to.next;
1064   return res;
1065 }
1066 
1067 codecvt_base::result
1068 __codecvt_utf8_base<wchar_t>::
1069 do_unshift(state_type&, extern_type* __to, extern_type*,
1070 	   extern_type*& __to_next) const
1071 {
1072   __to_next = __to;
1073   return noconv;
1074 }
1075 
1076 codecvt_base::result
1077 __codecvt_utf8_base<wchar_t>::
1078 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1079       const extern_type*& __from_next,
1080       intern_type* __to, intern_type* __to_end,
1081       intern_type*& __to_next) const
1082 {
1083   range<const char> from{ __from, __from_end };
1084 #if __SIZEOF_WCHAR_T__ == 2
1085   range<char16_t> to{
1086     reinterpret_cast<char16_t*>(__to),
1087     reinterpret_cast<char16_t*>(__to_end)
1088   };
1089 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
1090   codecvt_mode mode = {};
1091 #else
1092   codecvt_mode mode = little_endian;
1093 #endif
1094   auto res = ucs2_in(from, to, _M_maxcode, mode);
1095 #elif __SIZEOF_WCHAR_T__ == 4
1096   range<char32_t> to{
1097     reinterpret_cast<char32_t*>(__to),
1098     reinterpret_cast<char32_t*>(__to_end)
1099   };
1100   auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1101 #else
1102   return codecvt_base::error;
1103 #endif
1104   __from_next = from.next;
1105   __to_next = reinterpret_cast<wchar_t*>(to.next);
1106   return res;
1107 }
1108 
1109 int
1110 __codecvt_utf8_base<wchar_t>::do_encoding() const throw()
1111 { return 0; } // UTF-8 is not a fixed-width encoding
1112 
1113 bool
1114 __codecvt_utf8_base<wchar_t>::do_always_noconv() const throw()
1115 { return false; }
1116 
1117 int
1118 __codecvt_utf8_base<wchar_t>::
1119 do_length(state_type&, const extern_type* __from,
1120 	  const extern_type* __end, size_t __max) const
1121 {
1122 #if __SIZEOF_WCHAR_T__ == 2
1123   __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
1124 #elif __SIZEOF_WCHAR_T__ == 4
1125   __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
1126 #else
1127   __end = __from;
1128 #endif
1129   return __end - __from;
1130 }
1131 
1132 int
1133 __codecvt_utf8_base<wchar_t>::do_max_length() const throw()
1134 {
1135 #if __SIZEOF_WCHAR_T__ == 2
1136   int max = 3; // See __codecvt_utf8_base<char16_t>::do_max_length()
1137 #else
1138   int max = 4; // See __codecvt_utf8_base<char32_t>::do_max_length()
1139 #endif
1140   if (_M_mode & consume_header)
1141     max += sizeof(utf8_bom);
1142   return max;
1143 }
1144 #endif
1145 
1146 // Define members of codecvt_utf16<char16_t> base class implementation.
1147 // Converts from UTF-16 to UCS-2.
1148 
1149 __codecvt_utf16_base<char16_t>::~__codecvt_utf16_base() { }
1150 
1151 codecvt_base::result
1152 __codecvt_utf16_base<char16_t>::
1153 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1154        const intern_type*& __from_next,
1155        extern_type* __to, extern_type* __to_end,
1156        extern_type*& __to_next) const
1157 {
1158   range<const char16_t> from{ __from, __from_end };
1159   range<char16_t, false> to{ __to, __to_end };
1160   auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1161   __from_next = from.next;
1162   __to_next = reinterpret_cast<char*>(to.next);
1163   return res;
1164 }
1165 
1166 codecvt_base::result
1167 __codecvt_utf16_base<char16_t>::
1168 do_unshift(state_type&, extern_type* __to, extern_type*,
1169 	   extern_type*& __to_next) const
1170 {
1171   __to_next = __to;
1172   return noconv;
1173 }
1174 
1175 codecvt_base::result
1176 __codecvt_utf16_base<char16_t>::
1177 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1178       const extern_type*& __from_next,
1179       intern_type* __to, intern_type* __to_end,
1180       intern_type*& __to_next) const
1181 {
1182   range<const char16_t, false> from{ __from, __from_end };
1183   range<char16_t> to{ __to, __to_end };
1184   auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1185   __from_next = reinterpret_cast<const char*>(from.next);
1186   __to_next = to.next;
1187   if (res == codecvt_base::ok && __from_next != __from_end)
1188     res = codecvt_base::error;
1189   return res;
1190 }
1191 
1192 int
1193 __codecvt_utf16_base<char16_t>::do_encoding() const throw()
1194 { return 0; } // UTF-16 is not a fixed-width encoding
1195 
1196 bool
1197 __codecvt_utf16_base<char16_t>::do_always_noconv() const throw()
1198 { return false; }
1199 
1200 int
1201 __codecvt_utf16_base<char16_t>::
1202 do_length(state_type&, const extern_type* __from,
1203 	  const extern_type* __end, size_t __max) const
1204 {
1205   range<const char16_t, false> from{ __from, __end };
1206   const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
1207   return reinterpret_cast<const char*>(next) - __from;
1208 }
1209 
1210 int
1211 __codecvt_utf16_base<char16_t>::do_max_length() const throw()
1212 {
1213   // A single UCS-2 character requires one UTF-16 code unit (so two chars).
1214   // (UCS-2 cannot represent characters that use multiple UTF-16 code units).
1215   int max = 2;
1216   if (_M_mode & consume_header)
1217     max += sizeof(utf16_bom);
1218   return max;
1219 }
1220 
1221 // Define members of codecvt_utf16<char32_t> base class implementation.
1222 // Converts from UTF-16 to UTF-32 (aka UCS-4).
1223 
1224 __codecvt_utf16_base<char32_t>::~__codecvt_utf16_base() { }
1225 
1226 codecvt_base::result
1227 __codecvt_utf16_base<char32_t>::
1228 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1229        const intern_type*& __from_next,
1230        extern_type* __to, extern_type* __to_end,
1231        extern_type*& __to_next) const
1232 {
1233   range<const char32_t> from{ __from, __from_end };
1234   range<char16_t, false> to{ __to, __to_end };
1235   auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1236   __from_next = from.next;
1237   __to_next = reinterpret_cast<char*>(to.next);
1238   return res;
1239 }
1240 
1241 codecvt_base::result
1242 __codecvt_utf16_base<char32_t>::
1243 do_unshift(state_type&, extern_type* __to, extern_type*,
1244 	   extern_type*& __to_next) const
1245 {
1246   __to_next = __to;
1247   return noconv;
1248 }
1249 
1250 codecvt_base::result
1251 __codecvt_utf16_base<char32_t>::
1252 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1253       const extern_type*& __from_next,
1254       intern_type* __to, intern_type* __to_end,
1255       intern_type*& __to_next) const
1256 {
1257   range<const char16_t, false> from{ __from, __from_end };
1258   range<char32_t> to{ __to, __to_end };
1259   auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1260   __from_next = reinterpret_cast<const char*>(from.next);
1261   __to_next = to.next;
1262   if (res == codecvt_base::ok && __from_next != __from_end)
1263     res = codecvt_base::error;
1264   return res;
1265 }
1266 
1267 int
1268 __codecvt_utf16_base<char32_t>::do_encoding() const throw()
1269 { return 0; } // UTF-16 is not a fixed-width encoding
1270 
1271 bool
1272 __codecvt_utf16_base<char32_t>::do_always_noconv() const throw()
1273 { return false; }
1274 
1275 int
1276 __codecvt_utf16_base<char32_t>::
1277 do_length(state_type&, const extern_type* __from,
1278 	  const extern_type* __end, size_t __max) const
1279 {
1280   range<const char16_t, false> from{ __from, __end };
1281   const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
1282   return reinterpret_cast<const char*>(next) - __from;
1283 }
1284 
1285 int
1286 __codecvt_utf16_base<char32_t>::do_max_length() const throw()
1287 {
1288   // A single UCS-4 character requires one or two UTF-16 code units
1289   // (so up to four chars).
1290   int max = 4;
1291   if (_M_mode & consume_header)
1292     max += sizeof(utf16_bom);
1293   return max;
1294 }
1295 
1296 #ifdef _GLIBCXX_USE_WCHAR_T
1297 // Define members of codecvt_utf16<wchar_t> base class implementation.
// Converts from UTF-16 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
1299 
1300 __codecvt_utf16_base<wchar_t>::~__codecvt_utf16_base() { }
1301 
1302 codecvt_base::result
1303 __codecvt_utf16_base<wchar_t>::
1304 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1305        const intern_type*& __from_next,
1306        extern_type* __to, extern_type* __to_end,
1307        extern_type*& __to_next) const
1308 {
1309   range<char16_t, false> to{ __to, __to_end };
1310 #if __SIZEOF_WCHAR_T__ == 2
1311   range<const char16_t> from{
1312     reinterpret_cast<const char16_t*>(__from),
1313     reinterpret_cast<const char16_t*>(__from_end),
1314   };
1315   auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1316 #elif __SIZEOF_WCHAR_T__ == 4
1317   range<const char32_t> from{
1318     reinterpret_cast<const char32_t*>(__from),
1319     reinterpret_cast<const char32_t*>(__from_end),
1320   };
1321   auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1322 #else
1323   return codecvt_base::error;
1324 #endif
1325   __from_next = reinterpret_cast<const wchar_t*>(from.next);
1326   __to_next = reinterpret_cast<char*>(to.next);
1327   return res;
1328 }
1329 
1330 codecvt_base::result
1331 __codecvt_utf16_base<wchar_t>::
1332 do_unshift(state_type&, extern_type* __to, extern_type*,
1333 	   extern_type*& __to_next) const
1334 {
1335   __to_next = __to;
1336   return noconv;
1337 }
1338 
1339 codecvt_base::result
1340 __codecvt_utf16_base<wchar_t>::
1341 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1342       const extern_type*& __from_next,
1343       intern_type* __to, intern_type* __to_end,
1344       intern_type*& __to_next) const
1345 {
1346   range<const char16_t, false> from{ __from, __from_end };
1347 #if __SIZEOF_WCHAR_T__ == 2
1348   range<char16_t> to{
1349     reinterpret_cast<char16_t*>(__to),
1350     reinterpret_cast<char16_t*>(__to_end),
1351   };
1352   auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1353 #elif __SIZEOF_WCHAR_T__ == 4
1354   range<char32_t> to{
1355     reinterpret_cast<char32_t*>(__to),
1356     reinterpret_cast<char32_t*>(__to_end),
1357   };
1358   auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1359 #else
1360   return codecvt_base::error;
1361 #endif
1362   __from_next = reinterpret_cast<const char*>(from.next);
1363   __to_next = reinterpret_cast<wchar_t*>(to.next);
1364   if (res == codecvt_base::ok && __from_next != __from_end)
1365     res = codecvt_base::error;
1366   return res;
1367 }
1368 
1369 int
1370 __codecvt_utf16_base<wchar_t>::do_encoding() const throw()
1371 { return 0; } // UTF-16 is not a fixed-width encoding
1372 
1373 bool
1374 __codecvt_utf16_base<wchar_t>::do_always_noconv() const throw()
1375 { return false; }
1376 
1377 int
1378 __codecvt_utf16_base<wchar_t>::
1379 do_length(state_type&, const extern_type* __from,
1380 	  const extern_type* __end, size_t __max) const
1381 {
1382   range<const char16_t, false> from{ __from, __end };
1383 #if __SIZEOF_WCHAR_T__ == 2
1384   const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
1385 #elif __SIZEOF_WCHAR_T__ == 4
1386   const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
1387 #endif
1388   return reinterpret_cast<const char*>(next) - __from;
1389 }
1390 
1391 int
1392 __codecvt_utf16_base<wchar_t>::do_max_length() const throw()
1393 {
1394 #if __SIZEOF_WCHAR_T__ == 2
1395   int max = 2; // See __codecvt_utf16_base<char16_t>::do_max_length()
1396 #else
1397   int max = 4; // See __codecvt_utf16_base<char32_t>::do_max_length()
1398 #endif
1399   if (_M_mode & consume_header)
1400     max += sizeof(utf16_bom);
1401   return max;
1402 }
1403 #endif
1404 
1405 // Define members of codecvt_utf8_utf16<char16_t> base class implementation.
1406 // Converts from UTF-8 to UTF-16.
1407 
1408 __codecvt_utf8_utf16_base<char16_t>::~__codecvt_utf8_utf16_base() { }
1409 
1410 codecvt_base::result
1411 __codecvt_utf8_utf16_base<char16_t>::
1412 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1413        const intern_type*& __from_next,
1414        extern_type* __to, extern_type* __to_end,
1415        extern_type*& __to_next) const
1416 {
1417   range<const char16_t> from{ __from, __from_end };
1418   range<char> to{ __to, __to_end };
1419   auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1420   __from_next = from.next;
1421   __to_next = to.next;
1422   return res;
1423 }
1424 
1425 codecvt_base::result
1426 __codecvt_utf8_utf16_base<char16_t>::
1427 do_unshift(state_type&, extern_type* __to, extern_type*,
1428 	   extern_type*& __to_next) const
1429 {
1430   __to_next = __to;
1431   return noconv;
1432 }
1433 
1434 codecvt_base::result
1435 __codecvt_utf8_utf16_base<char16_t>::
1436 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1437       const extern_type*& __from_next,
1438       intern_type* __to, intern_type* __to_end,
1439       intern_type*& __to_next) const
1440 {
1441   range<const char> from{ __from, __from_end };
1442   range<char16_t> to{ __to, __to_end };
1443   codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1444 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1445   mode = codecvt_mode(mode | little_endian);
1446 #endif
1447   auto res = utf16_in(from, to, _M_maxcode, mode);
1448   __from_next = from.next;
1449   __to_next = to.next;
1450   return res;
1451 }
1452 
1453 int
1454 __codecvt_utf8_utf16_base<char16_t>::do_encoding() const throw()
1455 { return 0; } // UTF-8 is not a fixed-width encoding
1456 
1457 bool
1458 __codecvt_utf8_utf16_base<char16_t>::do_always_noconv() const throw()
1459 { return false; }
1460 
1461 int
1462 __codecvt_utf8_utf16_base<char16_t>::
1463 do_length(state_type&, const extern_type* __from,
1464 	  const extern_type* __end, size_t __max) const
1465 {
1466   __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1467   return __end - __from;
1468 }
1469 
1470 int
1471 __codecvt_utf8_utf16_base<char16_t>::do_max_length() const throw()
1472 {
1473   // A single character can be 1 or 2 UTF-16 code units,
1474   // requiring up to 4 UTF-8 code units.
1475   int max = 4;
1476   if (_M_mode & consume_header)
1477     max += sizeof(utf8_bom);
1478   return max;
1479 }
1480 
1481 // Define members of codecvt_utf8_utf16<char32_t> base class implementation.
1482 // Converts from UTF-8 to UTF-16.
1483 
1484 __codecvt_utf8_utf16_base<char32_t>::~__codecvt_utf8_utf16_base() { }
1485 
1486 codecvt_base::result
1487 __codecvt_utf8_utf16_base<char32_t>::
1488 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1489        const intern_type*& __from_next,
1490        extern_type* __to, extern_type* __to_end,
1491        extern_type*& __to_next) const
1492 {
1493   range<const char32_t> from{ __from, __from_end };
1494   range<char> to{ __to, __to_end };
1495   auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1496   __from_next = from.next;
1497   __to_next = to.next;
1498   return res;
1499 }
1500 
1501 codecvt_base::result
1502 __codecvt_utf8_utf16_base<char32_t>::
1503 do_unshift(state_type&, extern_type* __to, extern_type*,
1504 	   extern_type*& __to_next) const
1505 {
1506   __to_next = __to;
1507   return noconv;
1508 }
1509 
1510 codecvt_base::result
1511 __codecvt_utf8_utf16_base<char32_t>::
1512 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1513       const extern_type*& __from_next,
1514       intern_type* __to, intern_type* __to_end,
1515       intern_type*& __to_next) const
1516 {
1517   range<const char> from{ __from, __from_end };
1518   range<char32_t> to{ __to, __to_end };
1519   codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1520 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1521   mode = codecvt_mode(mode | little_endian);
1522 #endif
1523   auto res = utf16_in(from, to, _M_maxcode, mode);
1524   __from_next = from.next;
1525   __to_next = to.next;
1526   return res;
1527 }
1528 
1529 int
1530 __codecvt_utf8_utf16_base<char32_t>::do_encoding() const throw()
1531 { return 0; } // UTF-8 is not a fixed-width encoding
1532 
1533 bool
1534 __codecvt_utf8_utf16_base<char32_t>::do_always_noconv() const throw()
1535 { return false; }
1536 
1537 int
1538 __codecvt_utf8_utf16_base<char32_t>::
1539 do_length(state_type&, const extern_type* __from,
1540 	  const extern_type* __end, size_t __max) const
1541 {
1542   __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1543   return __end - __from;
1544 }
1545 
1546 int
1547 __codecvt_utf8_utf16_base<char32_t>::do_max_length() const throw()
1548 {
1549   // A single character can be 1 or 2 UTF-16 code units,
1550   // requiring up to 4 UTF-8 code units.
1551   int max = 4;
1552   if (_M_mode & consume_header)
1553     max += sizeof(utf8_bom);
1554   return max;
1555 }
1556 
1557 #ifdef _GLIBCXX_USE_WCHAR_T
1558 // Define members of codecvt_utf8_utf16<wchar_t> base class implementation.
1559 // Converts from UTF-8 to UTF-16.
1560 
1561 __codecvt_utf8_utf16_base<wchar_t>::~__codecvt_utf8_utf16_base() { }
1562 
1563 codecvt_base::result
1564 __codecvt_utf8_utf16_base<wchar_t>::
1565 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1566        const intern_type*& __from_next,
1567        extern_type* __to, extern_type* __to_end,
1568        extern_type*& __to_next) const
1569 {
1570   range<const wchar_t> from{ __from, __from_end };
1571   range<char> to{ __to, __to_end };
1572   auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1573   __from_next = from.next;
1574   __to_next = to.next;
1575   return res;
1576 }
1577 
1578 codecvt_base::result
1579 __codecvt_utf8_utf16_base<wchar_t>::
1580 do_unshift(state_type&, extern_type* __to, extern_type*,
1581 	   extern_type*& __to_next) const
1582 {
1583   __to_next = __to;
1584   return noconv;
1585 }
1586 
1587 codecvt_base::result
1588 __codecvt_utf8_utf16_base<wchar_t>::
1589 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1590       const extern_type*& __from_next,
1591       intern_type* __to, intern_type* __to_end,
1592       intern_type*& __to_next) const
1593 {
1594   range<const char> from{ __from, __from_end };
1595   range<wchar_t> to{ __to, __to_end };
1596   codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1597 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1598   mode = codecvt_mode(mode | little_endian);
1599 #endif
1600   auto res = utf16_in(from, to, _M_maxcode, mode);
1601   __from_next = from.next;
1602   __to_next = to.next;
1603   return res;
1604 }
1605 
1606 int
1607 __codecvt_utf8_utf16_base<wchar_t>::do_encoding() const throw()
1608 { return 0; } // UTF-8 is not a fixed-width encoding
1609 
1610 bool
1611 __codecvt_utf8_utf16_base<wchar_t>::do_always_noconv() const throw()
1612 { return false; }
1613 
1614 int
1615 __codecvt_utf8_utf16_base<wchar_t>::
1616 do_length(state_type&, const extern_type* __from,
1617 	  const extern_type* __end, size_t __max) const
1618 {
1619   __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1620   return __end - __from;
1621 }
1622 
1623 int
1624 __codecvt_utf8_utf16_base<wchar_t>::do_max_length() const throw()
1625 {
1626   // A single character can be 1 or 2 UTF-16 code units,
1627   // requiring up to 4 UTF-8 code units.
1628   int max = 4;
1629   if (_M_mode & consume_header)
1630     max += sizeof(utf8_bom);
1631   return max;
1632 }
1633 #endif
1634 
1635 inline template class __codecvt_abstract_base<char16_t, char, mbstate_t>;
1636 inline template class __codecvt_abstract_base<char32_t, char, mbstate_t>;
1637 template class codecvt_byname<char16_t, char, mbstate_t>;
1638 template class codecvt_byname<char32_t, char, mbstate_t>;
1639 
1640 _GLIBCXX_END_NAMESPACE_VERSION
1641 }
1642 #endif // _GLIBCXX_USE_C99_STDINT_TR1
1643