1 /*
2  *
3  * Copyright (c) 2004
4  * John Maddock
5  *
6  * Use, modification and distribution are subject to the
7  * Boost Software License, Version 1.0. (See accompanying file
8  * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9  *
10  */
11 
12  /*
13   *   LOCATION:    see http://www.boost.org for most recent version.
14   *   FILE         unicode_iterator.hpp
15   *   VERSION      see <boost/version.hpp>
16   *   DESCRIPTION: Iterator adapters for converting between different Unicode encodings.
17   */
18 
19 /****************************************************************************
20 
21 Contents:
22 ~~~~~~~~~
23 
24 1) Read Only, Input Adapters:
25 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
26 
27 template <class BaseIterator, class U8Type = std::uint8_t>
28 class u32_to_u8_iterator;
29 
30 Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-8.
31 
32 template <class BaseIterator, class U32Type = std::uint32_t>
33 class u8_to_u32_iterator;
34 
Adapts a sequence of UTF-8 code units to "look like" a sequence of UTF-32 code points.
36 
37 template <class BaseIterator, class U16Type = std::uint16_t>
38 class u32_to_u16_iterator;
39 
40 Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-16.
41 
42 template <class BaseIterator, class U32Type = std::uint32_t>
43 class u16_to_u32_iterator;
44 
Adapts a sequence of UTF-16 code units to "look like" a sequence of UTF-32 code points.
46 
47 2) Single pass output iterator adapters:
48 
49 template <class BaseIterator>
50 class utf8_output_iterator;
51 
Accepts UTF-32 code points and forwards them on as UTF-8 code units.
53 
54 template <class BaseIterator>
55 class utf16_output_iterator;
56 
Accepts UTF-32 code points and forwards them on as UTF-16 code units.
58 
59 ****************************************************************************/
60 
61 #ifndef BOOST_REGEX_UNICODE_ITERATOR_HPP
62 #define BOOST_REGEX_UNICODE_ITERATOR_HPP
#include <cstddef>  // std::ptrdiff_t
#include <cstdint>
#include <boost/regex/config.hpp>
#include <iterator> // std::iterator_traits, iterator tags, std::advance, std::distance
#include <stdexcept>
#include <sstream>
#include <ios>
#include <limits.h> // CHAR_BIT
69 
70 #ifndef BOOST_REGEX_STANDALONE
71 #include <boost/throw_exception.hpp>
72 #endif
73 
74 namespace boost{
75 
76 namespace detail{
77 
// Constants shared by the UTF-16 encoders and decoders in this file.
//
// Base added to (code_point >> 10) to form a high surrogate.  It is the first
// high surrogate (0xD800) with the 0x10000 supplementary-plane offset already
// folded in, which is why it is 0xD7C0 rather than 0xD800.
static const std::uint16_t high_surrogate_base = 0xD800u - (0x10000u >> 10);
// Base added to the low ten bits of a code point to form a low surrogate.
static const std::uint16_t low_surrogate_base = 0xDC00u;
// Selects the ten payload bits carried by a low surrogate.
static const std::uint32_t ten_bit_mask = (1u << 10) - 1u;
81 
// Returns true when v is a high (leading) surrogate, i.e. lies in the
// range 0xD800 to 0xDBFF inclusive.
inline bool is_high_surrogate(std::uint16_t v)
{
   return (v >= 0xD800u) && (v <= 0xDBFFu);
}
// Returns true when v is a low (trailing) surrogate, i.e. lies in the
// range 0xDC00 to 0xDFFF inclusive.
inline bool is_low_surrogate(std::uint16_t v)
{
   return (v >= 0xDC00u) && (v <= 0xDFFFu);
}
// Returns true when v is any surrogate (high or low).
// Clearing the low eleven bits leaves exactly 0xD800 for every value in the
// range 0xD800 to 0xDFFF and for no other value that fits in 32 bits.
template <class T>
inline bool is_surrogate(T v)
{
   const auto upper_bits = v & 0xFFFFF800u;
   return upper_bits == 0xd800;
}
95 
// Returns the length, in bytes, of the UTF-8 sequence introduced by lead
// byte c.  The length is the number of consecutive set bits at the top of c:
// none means a single-byte (ASCII) sequence, and anything above four is
// clamped to four.  A continuation byte (exactly one leading set bit)
// therefore reports a length of 1.
inline unsigned utf8_byte_count(std::uint8_t c)
{
   unsigned leading_ones = 0;
   for(unsigned bit = 0x80u; (bit != 0u) && ((c & bit) != 0u); bit >>= 1)
      ++leading_ones;

   if(leading_ones == 0)
      return 1;
   if(leading_ones > 4)
      return 4;
   return leading_ones;
}
109 
utf8_trailing_byte_count(std::uint8_t c)110 inline unsigned utf8_trailing_byte_count(std::uint8_t c)
111 {
112    return utf8_byte_count(c) - 1;
113 }
114 
115 #ifdef BOOST_REGEX_MSVC
116 #pragma warning(push)
117 #pragma warning(disable:4100)
118 #endif
119 #ifndef BOOST_NO_EXCEPTIONS
120 BOOST_REGEX_NORETURN
121 #endif
invalid_utf32_code_point(std::uint32_t val)122 inline void invalid_utf32_code_point(std::uint32_t val)
123 {
124    std::stringstream ss;
125    ss << "Invalid UTF-32 code point U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-16 sequence";
126    std::out_of_range e(ss.str());
127 #ifndef BOOST_REGEX_STANDALONE
128    boost::throw_exception(e);
129 #else
130    throw e;
131 #endif
132 }
133 #ifdef BOOST_REGEX_MSVC
134 #pragma warning(pop)
135 #endif
136 
137 
138 } // namespace detail
139 
140 template <class BaseIterator, class U16Type = std::uint16_t>
141 class u32_to_u16_iterator
142 {
143    typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
144 
145    static_assert(sizeof(base_value_type)*CHAR_BIT == 32, "Incorrectly sized template argument");
146    static_assert(sizeof(U16Type)*CHAR_BIT == 16, "Incorrectly sized template argument");
147 
148 public:
149    typedef std::ptrdiff_t     difference_type;
150    typedef U16Type            value_type;
151    typedef value_type const*  pointer;
152    typedef value_type const   reference;
153    typedef std::bidirectional_iterator_tag iterator_category;
154 
operator *() const155    reference operator*()const
156    {
157       if(m_current == 2)
158          extract_current();
159       return m_values[m_current];
160    }
operator ==(const u32_to_u16_iterator & that) const161    bool operator==(const u32_to_u16_iterator& that)const
162    {
163       if(m_position == that.m_position)
164       {
165          // Both m_currents must be equal, or both even
166          // this is the same as saying their sum must be even:
167          return (m_current + that.m_current) & 1u ? false : true;
168       }
169       return false;
170    }
operator !=(const u32_to_u16_iterator & that) const171    bool operator!=(const u32_to_u16_iterator& that)const
172    {
173       return !(*this == that);
174    }
operator ++()175    u32_to_u16_iterator& operator++()
176    {
177       // if we have a pending read then read now, so that we know whether
178       // to skip a position, or move to a low-surrogate:
179       if(m_current == 2)
180       {
181          // pending read:
182          extract_current();
183       }
184       // move to the next surrogate position:
185       ++m_current;
186       // if we've reached the end skip a position:
187       if(m_values[m_current] == 0)
188       {
189          m_current = 2;
190          ++m_position;
191       }
192       return *this;
193    }
operator ++(int)194    u32_to_u16_iterator operator++(int)
195    {
196       u32_to_u16_iterator r(*this);
197       ++(*this);
198       return r;
199    }
operator --()200    u32_to_u16_iterator& operator--()
201    {
202       if(m_current != 1)
203       {
204          // decrementing an iterator always leads to a valid position:
205          --m_position;
206          extract_current();
207          m_current = m_values[1] ? 1 : 0;
208       }
209       else
210       {
211          m_current = 0;
212       }
213       return *this;
214    }
operator --(int)215    u32_to_u16_iterator operator--(int)
216    {
217       u32_to_u16_iterator r(*this);
218       --(*this);
219       return r;
220    }
base() const221    BaseIterator base()const
222    {
223       return m_position;
224    }
225    // construct:
u32_to_u16_iterator()226    u32_to_u16_iterator() : m_position(), m_current(0)
227    {
228       m_values[0] = 0;
229       m_values[1] = 0;
230       m_values[2] = 0;
231    }
u32_to_u16_iterator(BaseIterator b)232    u32_to_u16_iterator(BaseIterator b) : m_position(b), m_current(2)
233    {
234       m_values[0] = 0;
235       m_values[1] = 0;
236       m_values[2] = 0;
237    }
238 private:
239 
extract_current() const240    void extract_current()const
241    {
242       // begin by checking for a code point out of range:
243       std::uint32_t v = *m_position;
244       if(v >= 0x10000u)
245       {
246          if(v > 0x10FFFFu)
247             detail::invalid_utf32_code_point(*m_position);
248          // split into two surrogates:
249          m_values[0] = static_cast<U16Type>(v >> 10) + detail::high_surrogate_base;
250          m_values[1] = static_cast<U16Type>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
251          m_current = 0;
252          BOOST_REGEX_ASSERT(detail::is_high_surrogate(m_values[0]));
253          BOOST_REGEX_ASSERT(detail::is_low_surrogate(m_values[1]));
254       }
255       else
256       {
257          // 16-bit code point:
258          m_values[0] = static_cast<U16Type>(*m_position);
259          m_values[1] = 0;
260          m_current = 0;
261          // value must not be a surrogate:
262          if(detail::is_surrogate(m_values[0]))
263             detail::invalid_utf32_code_point(*m_position);
264       }
265    }
266    BaseIterator m_position;
267    mutable U16Type m_values[3];
268    mutable unsigned m_current;
269 };
270 
271 template <class BaseIterator, class U32Type = std::uint32_t>
272 class u16_to_u32_iterator
273 {
274    // special values for pending iterator reads:
275    static const U32Type pending_read = 0xffffffffu;
276 
277    typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
278 
279    static_assert(sizeof(base_value_type)*CHAR_BIT == 16, "Incorrectly sized template argument");
280    static_assert(sizeof(U32Type)*CHAR_BIT == 32, "Incorrectly sized template argument");
281 
282 public:
283    typedef std::ptrdiff_t     difference_type;
284    typedef U32Type            value_type;
285    typedef value_type const*  pointer;
286    typedef value_type const   reference;
287    typedef std::bidirectional_iterator_tag iterator_category;
288 
operator *() const289    reference operator*()const
290    {
291       if(m_value == pending_read)
292          extract_current();
293       return m_value;
294    }
operator ==(const u16_to_u32_iterator & that) const295    bool operator==(const u16_to_u32_iterator& that)const
296    {
297       return m_position == that.m_position;
298    }
operator !=(const u16_to_u32_iterator & that) const299    bool operator!=(const u16_to_u32_iterator& that)const
300    {
301       return !(*this == that);
302    }
operator ++()303    u16_to_u32_iterator& operator++()
304    {
305       // skip high surrogate first if there is one:
306       if(detail::is_high_surrogate(*m_position)) ++m_position;
307       ++m_position;
308       m_value = pending_read;
309       return *this;
310    }
operator ++(int)311    u16_to_u32_iterator operator++(int)
312    {
313       u16_to_u32_iterator r(*this);
314       ++(*this);
315       return r;
316    }
operator --()317    u16_to_u32_iterator& operator--()
318    {
319       --m_position;
320       // if we have a low surrogate then go back one more:
321       if(detail::is_low_surrogate(*m_position))
322          --m_position;
323       m_value = pending_read;
324       return *this;
325    }
operator --(int)326    u16_to_u32_iterator operator--(int)
327    {
328       u16_to_u32_iterator r(*this);
329       --(*this);
330       return r;
331    }
base() const332    BaseIterator base()const
333    {
334       return m_position;
335    }
336    // construct:
u16_to_u32_iterator()337    u16_to_u32_iterator() : m_position()
338    {
339       m_value = pending_read;
340    }
u16_to_u32_iterator(BaseIterator b)341    u16_to_u32_iterator(BaseIterator b) : m_position(b)
342    {
343       m_value = pending_read;
344    }
345    //
346    // Range checked version:
347    //
u16_to_u32_iterator(BaseIterator b,BaseIterator start,BaseIterator end)348    u16_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
349    {
350       m_value = pending_read;
351       //
352       // The range must not start with a low surrogate, or end in a high surrogate,
353       // otherwise we run the risk of running outside the underlying input range.
354       // Likewise b must not be located at a low surrogate.
355       //
356       std::uint16_t val;
357       if(start != end)
358       {
359          if((b != start) && (b != end))
360          {
361             val = *b;
362             if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
363                invalid_code_point(val);
364          }
365          val = *start;
366          if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
367             invalid_code_point(val);
368          val = *--end;
369          if(detail::is_high_surrogate(val))
370             invalid_code_point(val);
371       }
372    }
373 private:
invalid_code_point(std::uint16_t val)374    static void invalid_code_point(std::uint16_t val)
375    {
376       std::stringstream ss;
377       ss << "Misplaced UTF-16 surrogate U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-32 sequence";
378       std::out_of_range e(ss.str());
379 #ifndef BOOST_REGEX_STANDALONE
380       boost::throw_exception(e);
381 #else
382       throw e;
383 #endif
384    }
extract_current() const385    void extract_current()const
386    {
387       m_value = static_cast<U32Type>(static_cast< std::uint16_t>(*m_position));
388       // if the last value is a high surrogate then adjust m_position and m_value as needed:
389       if(detail::is_high_surrogate(*m_position))
390       {
391          // precondition; next value must have be a low-surrogate:
392          BaseIterator next(m_position);
393          std::uint16_t t = *++next;
394          if((t & 0xFC00u) != 0xDC00u)
395             invalid_code_point(t);
396          m_value = (m_value - detail::high_surrogate_base) << 10;
397          m_value |= (static_cast<U32Type>(static_cast< std::uint16_t>(t)) & detail::ten_bit_mask);
398       }
399       // postcondition; result must not be a surrogate:
400       if(detail::is_surrogate(m_value))
401          invalid_code_point(static_cast< std::uint16_t>(m_value));
402    }
403    BaseIterator m_position;
404    mutable U32Type m_value;
405 };
406 
407 template <class BaseIterator, class U8Type = std::uint8_t>
408 class u32_to_u8_iterator
409 {
410    typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
411 
412    static_assert(sizeof(base_value_type)*CHAR_BIT == 32, "Incorrectly sized template argument");
413    static_assert(sizeof(U8Type)*CHAR_BIT == 8, "Incorrectly sized template argument");
414 
415 public:
416    typedef std::ptrdiff_t     difference_type;
417    typedef U8Type             value_type;
418    typedef value_type const*  pointer;
419    typedef value_type const   reference;
420    typedef std::bidirectional_iterator_tag iterator_category;
421 
operator *() const422    reference operator*()const
423    {
424       if(m_current == 4)
425          extract_current();
426       return m_values[m_current];
427    }
operator ==(const u32_to_u8_iterator & that) const428    bool operator==(const u32_to_u8_iterator& that)const
429    {
430       if(m_position == that.m_position)
431       {
432          // either the m_current's must be equal, or one must be 0 and
433          // the other 4: which means neither must have bits 1 or 2 set:
434          return (m_current == that.m_current)
435             || (((m_current | that.m_current) & 3) == 0);
436       }
437       return false;
438    }
operator !=(const u32_to_u8_iterator & that) const439    bool operator!=(const u32_to_u8_iterator& that)const
440    {
441       return !(*this == that);
442    }
operator ++()443    u32_to_u8_iterator& operator++()
444    {
445       // if we have a pending read then read now, so that we know whether
446       // to skip a position, or move to a low-surrogate:
447       if(m_current == 4)
448       {
449          // pending read:
450          extract_current();
451       }
452       // move to the next surrogate position:
453       ++m_current;
454       // if we've reached the end skip a position:
455       if(m_values[m_current] == 0)
456       {
457          m_current = 4;
458          ++m_position;
459       }
460       return *this;
461    }
operator ++(int)462    u32_to_u8_iterator operator++(int)
463    {
464       u32_to_u8_iterator r(*this);
465       ++(*this);
466       return r;
467    }
operator --()468    u32_to_u8_iterator& operator--()
469    {
470       if((m_current & 3) == 0)
471       {
472          --m_position;
473          extract_current();
474          m_current = 3;
475          while(m_current && (m_values[m_current] == 0))
476             --m_current;
477       }
478       else
479          --m_current;
480       return *this;
481    }
operator --(int)482    u32_to_u8_iterator operator--(int)
483    {
484       u32_to_u8_iterator r(*this);
485       --(*this);
486       return r;
487    }
base() const488    BaseIterator base()const
489    {
490       return m_position;
491    }
492    // construct:
u32_to_u8_iterator()493    u32_to_u8_iterator() : m_position(), m_current(0)
494    {
495       m_values[0] = 0;
496       m_values[1] = 0;
497       m_values[2] = 0;
498       m_values[3] = 0;
499       m_values[4] = 0;
500    }
u32_to_u8_iterator(BaseIterator b)501    u32_to_u8_iterator(BaseIterator b) : m_position(b), m_current(4)
502    {
503       m_values[0] = 0;
504       m_values[1] = 0;
505       m_values[2] = 0;
506       m_values[3] = 0;
507       m_values[4] = 0;
508    }
509 private:
510 
extract_current() const511    void extract_current()const
512    {
513       std::uint32_t c = *m_position;
514       if(c > 0x10FFFFu)
515          detail::invalid_utf32_code_point(c);
516       if(c < 0x80u)
517       {
518          m_values[0] = static_cast<unsigned char>(c);
519          m_values[1] = static_cast<unsigned char>(0u);
520          m_values[2] = static_cast<unsigned char>(0u);
521          m_values[3] = static_cast<unsigned char>(0u);
522       }
523       else if(c < 0x800u)
524       {
525          m_values[0] = static_cast<unsigned char>(0xC0u + (c >> 6));
526          m_values[1] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
527          m_values[2] = static_cast<unsigned char>(0u);
528          m_values[3] = static_cast<unsigned char>(0u);
529       }
530       else if(c < 0x10000u)
531       {
532          m_values[0] = static_cast<unsigned char>(0xE0u + (c >> 12));
533          m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
534          m_values[2] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
535          m_values[3] = static_cast<unsigned char>(0u);
536       }
537       else
538       {
539          m_values[0] = static_cast<unsigned char>(0xF0u + (c >> 18));
540          m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
541          m_values[2] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
542          m_values[3] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
543       }
544       m_current= 0;
545    }
546    BaseIterator m_position;
547    mutable U8Type m_values[5];
548    mutable unsigned m_current;
549 };
550 
551 template <class BaseIterator, class U32Type = std::uint32_t>
552 class u8_to_u32_iterator
553 {
554    // special values for pending iterator reads:
555    static const U32Type pending_read = 0xffffffffu;
556 
557    typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
558 
559    static_assert(sizeof(base_value_type)*CHAR_BIT == 8, "Incorrectly sized template argument");
560    static_assert(sizeof(U32Type)*CHAR_BIT == 32, "Incorrectly sized template argument");
561 
562 public:
563    typedef std::ptrdiff_t     difference_type;
564    typedef U32Type            value_type;
565    typedef value_type const*  pointer;
566    typedef value_type const   reference;
567    typedef std::bidirectional_iterator_tag iterator_category;
568 
operator *() const569    reference operator*()const
570    {
571       if(m_value == pending_read)
572          extract_current();
573       return m_value;
574    }
operator ==(const u8_to_u32_iterator & that) const575    bool operator==(const u8_to_u32_iterator& that)const
576    {
577       return m_position == that.m_position;
578    }
operator !=(const u8_to_u32_iterator & that) const579    bool operator!=(const u8_to_u32_iterator& that)const
580    {
581       return !(*this == that);
582    }
operator ++()583    u8_to_u32_iterator& operator++()
584    {
585       // We must not start with a continuation character:
586       if((static_cast<std::uint8_t>(*m_position) & 0xC0) == 0x80)
587          invalid_sequence();
588       // skip high surrogate first if there is one:
589       unsigned c = detail::utf8_byte_count(*m_position);
590       if(m_value == pending_read)
591       {
592          // Since we haven't read in a value, we need to validate the code points:
593          for(unsigned i = 0; i < c; ++i)
594          {
595             ++m_position;
596             // We must have a continuation byte:
597             if((i != c - 1) && ((static_cast<std::uint8_t>(*m_position) & 0xC0) != 0x80))
598                invalid_sequence();
599          }
600       }
601       else
602       {
603          std::advance(m_position, c);
604       }
605       m_value = pending_read;
606       return *this;
607    }
operator ++(int)608    u8_to_u32_iterator operator++(int)
609    {
610       u8_to_u32_iterator r(*this);
611       ++(*this);
612       return r;
613    }
operator --()614    u8_to_u32_iterator& operator--()
615    {
616       // Keep backtracking until we don't have a trailing character:
617       unsigned count = 0;
618       while((*--m_position & 0xC0u) == 0x80u) ++count;
619       // now check that the sequence was valid:
620       if(count != detail::utf8_trailing_byte_count(*m_position))
621          invalid_sequence();
622       m_value = pending_read;
623       return *this;
624    }
operator --(int)625    u8_to_u32_iterator operator--(int)
626    {
627       u8_to_u32_iterator r(*this);
628       --(*this);
629       return r;
630    }
base() const631    BaseIterator base()const
632    {
633       return m_position;
634    }
635    // construct:
u8_to_u32_iterator()636    u8_to_u32_iterator() : m_position()
637    {
638       m_value = pending_read;
639    }
u8_to_u32_iterator(BaseIterator b)640    u8_to_u32_iterator(BaseIterator b) : m_position(b)
641    {
642       m_value = pending_read;
643    }
644    //
645    // Checked constructor:
646    //
u8_to_u32_iterator(BaseIterator b,BaseIterator start,BaseIterator end)647    u8_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
648    {
649       m_value = pending_read;
650       //
651       // We must not start with a continuation character, or end with a
652       // truncated UTF-8 sequence otherwise we run the risk of going past
653       // the start/end of the underlying sequence:
654       //
655       if(start != end)
656       {
657          unsigned char v = *start;
658          if((v & 0xC0u) == 0x80u)
659             invalid_sequence();
660          if((b != start) && (b != end) && ((*b & 0xC0u) == 0x80u))
661             invalid_sequence();
662          BaseIterator pos = end;
663          do
664          {
665             v = *--pos;
666          }
667          while((start != pos) && ((v & 0xC0u) == 0x80u));
668          std::ptrdiff_t extra = detail::utf8_byte_count(v);
669          if(std::distance(pos, end) < extra)
670             invalid_sequence();
671       }
672    }
673 private:
invalid_sequence()674    static void invalid_sequence()
675    {
676       std::out_of_range e("Invalid UTF-8 sequence encountered while trying to encode UTF-32 character");
677 #ifndef BOOST_REGEX_STANDALONE
678       boost::throw_exception(e);
679 #else
680       throw e;
681 #endif
682    }
extract_current() const683    void extract_current()const
684    {
685       m_value = static_cast<U32Type>(static_cast< std::uint8_t>(*m_position));
686       // we must not have a continuation character:
687       if((m_value & 0xC0u) == 0x80u)
688          invalid_sequence();
689       // see how many extra bytes we have:
690       unsigned extra = detail::utf8_trailing_byte_count(*m_position);
691       // extract the extra bits, 6 from each extra byte:
692       BaseIterator next(m_position);
693       for(unsigned c = 0; c < extra; ++c)
694       {
695          ++next;
696          m_value <<= 6;
697          // We must have a continuation byte:
698          if((static_cast<std::uint8_t>(*next) & 0xC0) != 0x80)
699             invalid_sequence();
700          m_value += static_cast<std::uint8_t>(*next) & 0x3Fu;
701       }
702       // we now need to remove a few of the leftmost bits, but how many depends
703       // upon how many extra bytes we've extracted:
704       static const std::uint32_t masks[4] =
705       {
706          0x7Fu,
707          0x7FFu,
708          0xFFFFu,
709          0x1FFFFFu,
710       };
711       m_value &= masks[extra];
712       // check the result is in range:
713       if(m_value > static_cast<U32Type>(0x10FFFFu))
714          invalid_sequence();
715       // The result must not be a surrogate:
716       if((m_value >= static_cast<U32Type>(0xD800)) && (m_value <= static_cast<U32Type>(0xDFFF)))
717          invalid_sequence();
718       // We should not have had an invalidly encoded UTF8 sequence:
719       if((extra > 0) && (m_value <= static_cast<U32Type>(masks[extra - 1])))
720          invalid_sequence();
721    }
722    BaseIterator m_position;
723    mutable U32Type m_value;
724 };
725 
726 template <class BaseIterator>
727 class utf16_output_iterator
728 {
729 public:
730    typedef void                                   difference_type;
731    typedef void                                   value_type;
732    typedef std::uint32_t*                         pointer;
733    typedef std::uint32_t&                         reference;
734    typedef std::output_iterator_tag               iterator_category;
735 
utf16_output_iterator(const BaseIterator & b)736    utf16_output_iterator(const BaseIterator& b)
737       : m_position(b){}
utf16_output_iterator(const utf16_output_iterator & that)738    utf16_output_iterator(const utf16_output_iterator& that)
739       : m_position(that.m_position){}
operator =(const utf16_output_iterator & that)740    utf16_output_iterator& operator=(const utf16_output_iterator& that)
741    {
742       m_position = that.m_position;
743       return *this;
744    }
operator *() const745    const utf16_output_iterator& operator*()const
746    {
747       return *this;
748    }
operator =(std::uint32_t val) const749    void operator=(std::uint32_t val)const
750    {
751       push(val);
752    }
operator ++()753    utf16_output_iterator& operator++()
754    {
755       return *this;
756    }
operator ++(int)757    utf16_output_iterator& operator++(int)
758    {
759       return *this;
760    }
base() const761    BaseIterator base()const
762    {
763       return m_position;
764    }
765 private:
push(std::uint32_t v) const766    void push(std::uint32_t v)const
767    {
768       if(v >= 0x10000u)
769       {
770          // begin by checking for a code point out of range:
771          if(v > 0x10FFFFu)
772             detail::invalid_utf32_code_point(v);
773          // split into two surrogates:
774          *m_position++ = static_cast<std::uint16_t>(v >> 10) + detail::high_surrogate_base;
775          *m_position++ = static_cast<std::uint16_t>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
776       }
777       else
778       {
779          // 16-bit code point:
780          // value must not be a surrogate:
781          if(detail::is_surrogate(v))
782             detail::invalid_utf32_code_point(v);
783          *m_position++ = static_cast<std::uint16_t>(v);
784       }
785    }
786    mutable BaseIterator m_position;
787 };
788 
789 template <class BaseIterator>
790 class utf8_output_iterator
791 {
792 public:
793    typedef void                                   difference_type;
794    typedef void                                   value_type;
795    typedef std::uint32_t*                       pointer;
796    typedef std::uint32_t&                       reference;
797    typedef std::output_iterator_tag               iterator_category;
798 
utf8_output_iterator(const BaseIterator & b)799    utf8_output_iterator(const BaseIterator& b)
800       : m_position(b){}
utf8_output_iterator(const utf8_output_iterator & that)801    utf8_output_iterator(const utf8_output_iterator& that)
802       : m_position(that.m_position){}
operator =(const utf8_output_iterator & that)803    utf8_output_iterator& operator=(const utf8_output_iterator& that)
804    {
805       m_position = that.m_position;
806       return *this;
807    }
operator *() const808    const utf8_output_iterator& operator*()const
809    {
810       return *this;
811    }
operator =(std::uint32_t val) const812    void operator=(std::uint32_t val)const
813    {
814       push(val);
815    }
operator ++()816    utf8_output_iterator& operator++()
817    {
818       return *this;
819    }
operator ++(int)820    utf8_output_iterator& operator++(int)
821    {
822       return *this;
823    }
base() const824    BaseIterator base()const
825    {
826       return m_position;
827    }
828 private:
push(std::uint32_t c) const829    void push(std::uint32_t c)const
830    {
831       if(c > 0x10FFFFu)
832          detail::invalid_utf32_code_point(c);
833       if(c < 0x80u)
834       {
835          *m_position++ = static_cast<unsigned char>(c);
836       }
837       else if(c < 0x800u)
838       {
839          *m_position++ = static_cast<unsigned char>(0xC0u + (c >> 6));
840          *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
841       }
842       else if(c < 0x10000u)
843       {
844          *m_position++ = static_cast<unsigned char>(0xE0u + (c >> 12));
845          *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
846          *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
847       }
848       else
849       {
850          *m_position++ = static_cast<unsigned char>(0xF0u + (c >> 18));
851          *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
852          *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
853          *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
854       }
855    }
856    mutable BaseIterator m_position;
857 };
858 
859 } // namespace boost
860 
861 #endif // BOOST_REGEX_UNICODE_ITERATOR_HPP
862 
863