1 /*
2  *
3  * Copyright (c) 2004
4  * John Maddock
5  *
6  * Use, modification and distribution are subject to the
7  * Boost Software License, Version 1.0. (See accompanying file
8  * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9  *
10  */
11 
12  /*
13   *   LOCATION:    see http://www.boost.org for most recent version.
14   *   FILE         unicode_iterator.hpp
15   *   VERSION      see <boost/version.hpp>
16   *   DESCRIPTION: Iterator adapters for converting between different Unicode encodings.
17   */
18 
19 /****************************************************************************
20 
21 Contents:
22 ~~~~~~~~~
23 
24 1) Read Only, Input Adapters:
25 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
26 
27 template <class BaseIterator, class U8Type = ::boost::uint8_t>
28 class u32_to_u8_iterator;
29 
30 Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-8.
31 
32 template <class BaseIterator, class U32Type = ::boost::uint32_t>
33 class u8_to_u32_iterator;
34 
35 Adapts sequence of UTF-8 code points to "look like" a sequence of UTF-32.
36 
37 template <class BaseIterator, class U16Type = ::boost::uint16_t>
38 class u32_to_u16_iterator;
39 
40 Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-16.
41 
42 template <class BaseIterator, class U32Type = ::boost::uint32_t>
43 class u16_to_u32_iterator;
44 
45 Adapts sequence of UTF-16 code points to "look like" a sequence of UTF-32.
46 
47 2) Single pass output iterator adapters:
48 
49 template <class BaseIterator>
50 class utf8_output_iterator;
51 
52 Accepts UTF-32 code points and forwards them on as UTF-8 code points.
53 
54 template <class BaseIterator>
55 class utf16_output_iterator;
56 
57 Accepts UTF-32 code points and forwards them on as UTF-16 code points.
58 
59 ****************************************************************************/
60 
61 #ifndef BOOST_REGEX_UNICODE_ITERATOR_HPP
62 #define BOOST_REGEX_UNICODE_ITERATOR_HPP
63 #include <boost/cstdint.hpp>
64 #include <boost/assert.hpp>
65 #include <boost/iterator/iterator_facade.hpp>
66 #include <boost/static_assert.hpp>
67 #include <boost/throw_exception.hpp>
68 #include <stdexcept>
69 #ifndef BOOST_NO_STD_LOCALE
70 #include <sstream>
71 #include <ios>
72 #endif
73 #include <limits.h> // CHAR_BIT
74 
75 namespace boost{
76 
77 namespace detail{
78 
79 static const ::boost::uint16_t high_surrogate_base = 0xD7C0u;
80 static const ::boost::uint16_t low_surrogate_base = 0xDC00u;
81 static const ::boost::uint32_t ten_bit_mask = 0x3FFu;
82 
is_high_surrogate(::boost::uint16_t v)83 inline bool is_high_surrogate(::boost::uint16_t v)
84 {
85    return (v & 0xFFFFFC00u) == 0xd800u;
86 }
is_low_surrogate(::boost::uint16_t v)87 inline bool is_low_surrogate(::boost::uint16_t v)
88 {
89    return (v & 0xFFFFFC00u) == 0xdc00u;
90 }
// Returns true when v is any surrogate value (0xD800..0xDFFF), high or low.
// The eleven low bits are masked off and what remains must be exactly
// 0xD800, so wider values such as 0x1D800 are not reported as surrogates.
template <class T>
inline bool is_surrogate(T v)
{
   if((v & 0xFFFFF800u) != 0xd800)
      return false;
   return true;
}
96 
utf8_byte_count(boost::uint8_t c)97 inline unsigned utf8_byte_count(boost::uint8_t c)
98 {
99    // if the most significant bit with a zero in it is in position
100    // 8-N then there are N bytes in this UTF-8 sequence:
101    boost::uint8_t mask = 0x80u;
102    unsigned result = 0;
103    while(c & mask)
104    {
105       ++result;
106       mask >>= 1;
107    }
108    return (result == 0) ? 1 : ((result > 4) ? 4 : result);
109 }
110 
utf8_trailing_byte_count(boost::uint8_t c)111 inline unsigned utf8_trailing_byte_count(boost::uint8_t c)
112 {
113    return utf8_byte_count(c) - 1;
114 }
115 
116 #ifdef BOOST_MSVC
117 #pragma warning(push)
118 #pragma warning(disable:4100)
119 #endif
120 #ifndef BOOST_NO_EXCEPTIONS
121 BOOST_NORETURN
122 #endif
invalid_utf32_code_point(::boost::uint32_t val)123 inline void invalid_utf32_code_point(::boost::uint32_t val)
124 {
125 #ifndef BOOST_NO_STD_LOCALE
126    std::stringstream ss;
127    ss << "Invalid UTF-32 code point U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-16 sequence";
128    std::out_of_range e(ss.str());
129 #else
130    std::out_of_range e("Invalid UTF-32 code point encountered while trying to encode UTF-16 sequence");
131 #endif
132    boost::throw_exception(e);
133 }
134 #ifdef BOOST_MSVC
135 #pragma warning(pop)
136 #endif
137 
138 
139 } // namespace detail
140 
141 template <class BaseIterator, class U16Type = ::boost::uint16_t>
142 class u32_to_u16_iterator
143    : public boost::iterator_facade<u32_to_u16_iterator<BaseIterator, U16Type>, U16Type, std::bidirectional_iterator_tag, const U16Type>
144 {
145    typedef boost::iterator_facade<u32_to_u16_iterator<BaseIterator, U16Type>, U16Type, std::bidirectional_iterator_tag, const U16Type> base_type;
146 
147 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
148    typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
149 
150    BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32);
151    BOOST_STATIC_ASSERT(sizeof(U16Type)*CHAR_BIT == 16);
152 #endif
153 
154 public:
155    typename base_type::reference
dereference() const156       dereference()const
157    {
158       if(m_current == 2)
159          extract_current();
160       return m_values[m_current];
161    }
equal(const u32_to_u16_iterator & that) const162    bool equal(const u32_to_u16_iterator& that)const
163    {
164       if(m_position == that.m_position)
165       {
166          // Both m_currents must be equal, or both even
167          // this is the same as saying their sum must be even:
168          return (m_current + that.m_current) & 1u ? false : true;
169       }
170       return false;
171    }
increment()172    void increment()
173    {
174       // if we have a pending read then read now, so that we know whether
175       // to skip a position, or move to a low-surrogate:
176       if(m_current == 2)
177       {
178          // pending read:
179          extract_current();
180       }
181       // move to the next surrogate position:
182       ++m_current;
183       // if we've reached the end skip a position:
184       if(m_values[m_current] == 0)
185       {
186          m_current = 2;
187          ++m_position;
188       }
189    }
decrement()190    void decrement()
191    {
192       if(m_current != 1)
193       {
194          // decrementing an iterator always leads to a valid position:
195          --m_position;
196          extract_current();
197          m_current = m_values[1] ? 1 : 0;
198       }
199       else
200       {
201          m_current = 0;
202       }
203    }
base() const204    BaseIterator base()const
205    {
206       return m_position;
207    }
208    // construct:
u32_to_u16_iterator()209    u32_to_u16_iterator() : m_position(), m_current(0)
210    {
211       m_values[0] = 0;
212       m_values[1] = 0;
213       m_values[2] = 0;
214    }
u32_to_u16_iterator(BaseIterator b)215    u32_to_u16_iterator(BaseIterator b) : m_position(b), m_current(2)
216    {
217       m_values[0] = 0;
218       m_values[1] = 0;
219       m_values[2] = 0;
220    }
221 private:
222 
extract_current() const223    void extract_current()const
224    {
225       // begin by checking for a code point out of range:
226       ::boost::uint32_t v = *m_position;
227       if(v >= 0x10000u)
228       {
229          if(v > 0x10FFFFu)
230             detail::invalid_utf32_code_point(*m_position);
231          // split into two surrogates:
232          m_values[0] = static_cast<U16Type>(v >> 10) + detail::high_surrogate_base;
233          m_values[1] = static_cast<U16Type>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
234          m_current = 0;
235          BOOST_ASSERT(detail::is_high_surrogate(m_values[0]));
236          BOOST_ASSERT(detail::is_low_surrogate(m_values[1]));
237       }
238       else
239       {
240          // 16-bit code point:
241          m_values[0] = static_cast<U16Type>(*m_position);
242          m_values[1] = 0;
243          m_current = 0;
244          // value must not be a surrogate:
245          if(detail::is_surrogate(m_values[0]))
246             detail::invalid_utf32_code_point(*m_position);
247       }
248    }
249    BaseIterator m_position;
250    mutable U16Type m_values[3];
251    mutable unsigned m_current;
252 };
253 
254 template <class BaseIterator, class U32Type = ::boost::uint32_t>
255 class u16_to_u32_iterator
256    : public boost::iterator_facade<u16_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type>
257 {
258    typedef boost::iterator_facade<u16_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type;
259    // special values for pending iterator reads:
260    BOOST_STATIC_CONSTANT(U32Type, pending_read = 0xffffffffu);
261 
262 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
263    typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
264 
265    BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 16);
266    BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32);
267 #endif
268 
269 public:
270    typename base_type::reference
dereference() const271       dereference()const
272    {
273       if(m_value == pending_read)
274          extract_current();
275       return m_value;
276    }
equal(const u16_to_u32_iterator & that) const277    bool equal(const u16_to_u32_iterator& that)const
278    {
279       return m_position == that.m_position;
280    }
increment()281    void increment()
282    {
283       // skip high surrogate first if there is one:
284       if(detail::is_high_surrogate(*m_position)) ++m_position;
285       ++m_position;
286       m_value = pending_read;
287    }
decrement()288    void decrement()
289    {
290       --m_position;
291       // if we have a low surrogate then go back one more:
292       if(detail::is_low_surrogate(*m_position))
293          --m_position;
294       m_value = pending_read;
295    }
base() const296    BaseIterator base()const
297    {
298       return m_position;
299    }
300    // construct:
u16_to_u32_iterator()301    u16_to_u32_iterator() : m_position()
302    {
303       m_value = pending_read;
304    }
u16_to_u32_iterator(BaseIterator b)305    u16_to_u32_iterator(BaseIterator b) : m_position(b)
306    {
307       m_value = pending_read;
308    }
309    //
310    // Range checked version:
311    //
u16_to_u32_iterator(BaseIterator b,BaseIterator start,BaseIterator end)312    u16_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
313    {
314       m_value = pending_read;
315       //
316       // The range must not start with a low surrogate, or end in a high surrogate,
317       // otherwise we run the risk of running outside the underlying input range.
318       // Likewise b must not be located at a low surrogate.
319       //
320       boost::uint16_t val;
321       if(start != end)
322       {
323          if((b != start) && (b != end))
324          {
325             val = *b;
326             if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
327                invalid_code_point(val);
328          }
329          val = *start;
330          if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
331             invalid_code_point(val);
332          val = *--end;
333          if(detail::is_high_surrogate(val))
334             invalid_code_point(val);
335       }
336    }
337 private:
invalid_code_point(::boost::uint16_t val)338    static void invalid_code_point(::boost::uint16_t val)
339    {
340 #ifndef BOOST_NO_STD_LOCALE
341       std::stringstream ss;
342       ss << "Misplaced UTF-16 surrogate U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-32 sequence";
343       std::out_of_range e(ss.str());
344 #else
345       std::out_of_range e("Misplaced UTF-16 surrogate encountered while trying to encode UTF-32 sequence");
346 #endif
347       boost::throw_exception(e);
348    }
extract_current() const349    void extract_current()const
350    {
351       m_value = static_cast<U32Type>(static_cast< ::boost::uint16_t>(*m_position));
352       // if the last value is a high surrogate then adjust m_position and m_value as needed:
353       if(detail::is_high_surrogate(*m_position))
354       {
355          // precondition; next value must have be a low-surrogate:
356          BaseIterator next(m_position);
357          ::boost::uint16_t t = *++next;
358          if((t & 0xFC00u) != 0xDC00u)
359             invalid_code_point(t);
360          m_value = (m_value - detail::high_surrogate_base) << 10;
361          m_value |= (static_cast<U32Type>(static_cast< ::boost::uint16_t>(t)) & detail::ten_bit_mask);
362       }
363       // postcondition; result must not be a surrogate:
364       if(detail::is_surrogate(m_value))
365          invalid_code_point(static_cast< ::boost::uint16_t>(m_value));
366    }
367    BaseIterator m_position;
368    mutable U32Type m_value;
369 };
370 
371 template <class BaseIterator, class U8Type = ::boost::uint8_t>
372 class u32_to_u8_iterator
373    : public boost::iterator_facade<u32_to_u8_iterator<BaseIterator, U8Type>, U8Type, std::bidirectional_iterator_tag, const U8Type>
374 {
375    typedef boost::iterator_facade<u32_to_u8_iterator<BaseIterator, U8Type>, U8Type, std::bidirectional_iterator_tag, const U8Type> base_type;
376 
377 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
378    typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
379 
380    BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32);
381    BOOST_STATIC_ASSERT(sizeof(U8Type)*CHAR_BIT == 8);
382 #endif
383 
384 public:
385    typename base_type::reference
dereference() const386       dereference()const
387    {
388       if(m_current == 4)
389          extract_current();
390       return m_values[m_current];
391    }
equal(const u32_to_u8_iterator & that) const392    bool equal(const u32_to_u8_iterator& that)const
393    {
394       if(m_position == that.m_position)
395       {
396          // either the m_current's must be equal, or one must be 0 and
397          // the other 4: which means neither must have bits 1 or 2 set:
398          return (m_current == that.m_current)
399             || (((m_current | that.m_current) & 3) == 0);
400       }
401       return false;
402    }
increment()403    void increment()
404    {
405       // if we have a pending read then read now, so that we know whether
406       // to skip a position, or move to a low-surrogate:
407       if(m_current == 4)
408       {
409          // pending read:
410          extract_current();
411       }
412       // move to the next surrogate position:
413       ++m_current;
414       // if we've reached the end skip a position:
415       if(m_values[m_current] == 0)
416       {
417          m_current = 4;
418          ++m_position;
419       }
420    }
decrement()421    void decrement()
422    {
423       if((m_current & 3) == 0)
424       {
425          --m_position;
426          extract_current();
427          m_current = 3;
428          while(m_current && (m_values[m_current] == 0))
429             --m_current;
430       }
431       else
432          --m_current;
433    }
base() const434    BaseIterator base()const
435    {
436       return m_position;
437    }
438    // construct:
u32_to_u8_iterator()439    u32_to_u8_iterator() : m_position(), m_current(0)
440    {
441       m_values[0] = 0;
442       m_values[1] = 0;
443       m_values[2] = 0;
444       m_values[3] = 0;
445       m_values[4] = 0;
446    }
u32_to_u8_iterator(BaseIterator b)447    u32_to_u8_iterator(BaseIterator b) : m_position(b), m_current(4)
448    {
449       m_values[0] = 0;
450       m_values[1] = 0;
451       m_values[2] = 0;
452       m_values[3] = 0;
453       m_values[4] = 0;
454    }
455 private:
456 
extract_current() const457    void extract_current()const
458    {
459       boost::uint32_t c = *m_position;
460       if(c > 0x10FFFFu)
461          detail::invalid_utf32_code_point(c);
462       if(c < 0x80u)
463       {
464          m_values[0] = static_cast<unsigned char>(c);
465          m_values[1] = static_cast<unsigned char>(0u);
466          m_values[2] = static_cast<unsigned char>(0u);
467          m_values[3] = static_cast<unsigned char>(0u);
468       }
469       else if(c < 0x800u)
470       {
471          m_values[0] = static_cast<unsigned char>(0xC0u + (c >> 6));
472          m_values[1] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
473          m_values[2] = static_cast<unsigned char>(0u);
474          m_values[3] = static_cast<unsigned char>(0u);
475       }
476       else if(c < 0x10000u)
477       {
478          m_values[0] = static_cast<unsigned char>(0xE0u + (c >> 12));
479          m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
480          m_values[2] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
481          m_values[3] = static_cast<unsigned char>(0u);
482       }
483       else
484       {
485          m_values[0] = static_cast<unsigned char>(0xF0u + (c >> 18));
486          m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
487          m_values[2] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
488          m_values[3] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
489       }
490       m_current= 0;
491    }
492    BaseIterator m_position;
493    mutable U8Type m_values[5];
494    mutable unsigned m_current;
495 };
496 
497 template <class BaseIterator, class U32Type = ::boost::uint32_t>
498 class u8_to_u32_iterator
499    : public boost::iterator_facade<u8_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type>
500 {
501    typedef boost::iterator_facade<u8_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type;
502    // special values for pending iterator reads:
503    BOOST_STATIC_CONSTANT(U32Type, pending_read = 0xffffffffu);
504 
505 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
506    typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
507 
508    BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 8);
509    BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32);
510 #endif
511 
512 public:
513    typename base_type::reference
dereference() const514       dereference()const
515    {
516       if(m_value == pending_read)
517          extract_current();
518       return m_value;
519    }
equal(const u8_to_u32_iterator & that) const520    bool equal(const u8_to_u32_iterator& that)const
521    {
522       return m_position == that.m_position;
523    }
increment()524    void increment()
525    {
526       // We must not start with a continuation character:
527       if((static_cast<boost::uint8_t>(*m_position) & 0xC0) == 0x80)
528          invalid_sequence();
529       // skip high surrogate first if there is one:
530       unsigned c = detail::utf8_byte_count(*m_position);
531       if(m_value == pending_read)
532       {
533          // Since we haven't read in a value, we need to validate the code points:
534          for(unsigned i = 0; i < c; ++i)
535          {
536             ++m_position;
537             // We must have a continuation byte:
538             if((i != c - 1) && ((static_cast<boost::uint8_t>(*m_position) & 0xC0) != 0x80))
539                invalid_sequence();
540          }
541       }
542       else
543       {
544          std::advance(m_position, c);
545       }
546       m_value = pending_read;
547    }
decrement()548    void decrement()
549    {
550       // Keep backtracking until we don't have a trailing character:
551       unsigned count = 0;
552       while((*--m_position & 0xC0u) == 0x80u) ++count;
553       // now check that the sequence was valid:
554       if(count != detail::utf8_trailing_byte_count(*m_position))
555          invalid_sequence();
556       m_value = pending_read;
557    }
base() const558    BaseIterator base()const
559    {
560       return m_position;
561    }
562    // construct:
u8_to_u32_iterator()563    u8_to_u32_iterator() : m_position()
564    {
565       m_value = pending_read;
566    }
u8_to_u32_iterator(BaseIterator b)567    u8_to_u32_iterator(BaseIterator b) : m_position(b)
568    {
569       m_value = pending_read;
570    }
571    //
572    // Checked constructor:
573    //
u8_to_u32_iterator(BaseIterator b,BaseIterator start,BaseIterator end)574    u8_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
575    {
576       m_value = pending_read;
577       //
578       // We must not start with a continuation character, or end with a
579       // truncated UTF-8 sequence otherwise we run the risk of going past
580       // the start/end of the underlying sequence:
581       //
582       if(start != end)
583       {
584          unsigned char v = *start;
585          if((v & 0xC0u) == 0x80u)
586             invalid_sequence();
587          if((b != start) && (b != end) && ((*b & 0xC0u) == 0x80u))
588             invalid_sequence();
589          BaseIterator pos = end;
590          do
591          {
592             v = *--pos;
593          }
594          while((start != pos) && ((v & 0xC0u) == 0x80u));
595          std::ptrdiff_t extra = detail::utf8_byte_count(v);
596          if(std::distance(pos, end) < extra)
597             invalid_sequence();
598       }
599    }
600 private:
invalid_sequence()601    static void invalid_sequence()
602    {
603       std::out_of_range e("Invalid UTF-8 sequence encountered while trying to encode UTF-32 character");
604       boost::throw_exception(e);
605    }
extract_current() const606    void extract_current()const
607    {
608       m_value = static_cast<U32Type>(static_cast< ::boost::uint8_t>(*m_position));
609       // we must not have a continuation character:
610       if((m_value & 0xC0u) == 0x80u)
611          invalid_sequence();
612       // see how many extra bytes we have:
613       unsigned extra = detail::utf8_trailing_byte_count(*m_position);
614       // extract the extra bits, 6 from each extra byte:
615       BaseIterator next(m_position);
616       for(unsigned c = 0; c < extra; ++c)
617       {
618          ++next;
619          m_value <<= 6;
620          // We must have a continuation byte:
621          if((static_cast<boost::uint8_t>(*next) & 0xC0) != 0x80)
622             invalid_sequence();
623          m_value += static_cast<boost::uint8_t>(*next) & 0x3Fu;
624       }
625       // we now need to remove a few of the leftmost bits, but how many depends
626       // upon how many extra bytes we've extracted:
627       static const boost::uint32_t masks[4] =
628       {
629          0x7Fu,
630          0x7FFu,
631          0xFFFFu,
632          0x1FFFFFu,
633       };
634       m_value &= masks[extra];
635       // check the result is in range:
636       if(m_value > static_cast<U32Type>(0x10FFFFu))
637          invalid_sequence();
638       // The result must not be a surrogate:
639       if((m_value >= static_cast<U32Type>(0xD800)) && (m_value <= static_cast<U32Type>(0xDFFF)))
640          invalid_sequence();
641       // We should not have had an invalidly encoded UTF8 sequence:
642       if((extra > 0) && (m_value <= static_cast<U32Type>(masks[extra - 1])))
643          invalid_sequence();
644    }
645    BaseIterator m_position;
646    mutable U32Type m_value;
647 };
648 
649 template <class BaseIterator>
650 class utf16_output_iterator
651 {
652 public:
653    typedef void                                   difference_type;
654    typedef void                                   value_type;
655    typedef boost::uint32_t*                       pointer;
656    typedef boost::uint32_t&                       reference;
657    typedef std::output_iterator_tag               iterator_category;
658 
utf16_output_iterator(const BaseIterator & b)659    utf16_output_iterator(const BaseIterator& b)
660       : m_position(b){}
utf16_output_iterator(const utf16_output_iterator & that)661    utf16_output_iterator(const utf16_output_iterator& that)
662       : m_position(that.m_position){}
operator =(const utf16_output_iterator & that)663    utf16_output_iterator& operator=(const utf16_output_iterator& that)
664    {
665       m_position = that.m_position;
666       return *this;
667    }
operator *() const668    const utf16_output_iterator& operator*()const
669    {
670       return *this;
671    }
operator =(boost::uint32_t val) const672    void operator=(boost::uint32_t val)const
673    {
674       push(val);
675    }
operator ++()676    utf16_output_iterator& operator++()
677    {
678       return *this;
679    }
operator ++(int)680    utf16_output_iterator& operator++(int)
681    {
682       return *this;
683    }
base() const684    BaseIterator base()const
685    {
686       return m_position;
687    }
688 private:
push(boost::uint32_t v) const689    void push(boost::uint32_t v)const
690    {
691       if(v >= 0x10000u)
692       {
693          // begin by checking for a code point out of range:
694          if(v > 0x10FFFFu)
695             detail::invalid_utf32_code_point(v);
696          // split into two surrogates:
697          *m_position++ = static_cast<boost::uint16_t>(v >> 10) + detail::high_surrogate_base;
698          *m_position++ = static_cast<boost::uint16_t>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
699       }
700       else
701       {
702          // 16-bit code point:
703          // value must not be a surrogate:
704          if(detail::is_surrogate(v))
705             detail::invalid_utf32_code_point(v);
706          *m_position++ = static_cast<boost::uint16_t>(v);
707       }
708    }
709    mutable BaseIterator m_position;
710 };
711 
712 template <class BaseIterator>
713 class utf8_output_iterator
714 {
715 public:
716    typedef void                                   difference_type;
717    typedef void                                   value_type;
718    typedef boost::uint32_t*                       pointer;
719    typedef boost::uint32_t&                       reference;
720    typedef std::output_iterator_tag               iterator_category;
721 
utf8_output_iterator(const BaseIterator & b)722    utf8_output_iterator(const BaseIterator& b)
723       : m_position(b){}
utf8_output_iterator(const utf8_output_iterator & that)724    utf8_output_iterator(const utf8_output_iterator& that)
725       : m_position(that.m_position){}
operator =(const utf8_output_iterator & that)726    utf8_output_iterator& operator=(const utf8_output_iterator& that)
727    {
728       m_position = that.m_position;
729       return *this;
730    }
operator *() const731    const utf8_output_iterator& operator*()const
732    {
733       return *this;
734    }
operator =(boost::uint32_t val) const735    void operator=(boost::uint32_t val)const
736    {
737       push(val);
738    }
operator ++()739    utf8_output_iterator& operator++()
740    {
741       return *this;
742    }
operator ++(int)743    utf8_output_iterator& operator++(int)
744    {
745       return *this;
746    }
base() const747    BaseIterator base()const
748    {
749       return m_position;
750    }
751 private:
push(boost::uint32_t c) const752    void push(boost::uint32_t c)const
753    {
754       if(c > 0x10FFFFu)
755          detail::invalid_utf32_code_point(c);
756       if(c < 0x80u)
757       {
758          *m_position++ = static_cast<unsigned char>(c);
759       }
760       else if(c < 0x800u)
761       {
762          *m_position++ = static_cast<unsigned char>(0xC0u + (c >> 6));
763          *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
764       }
765       else if(c < 0x10000u)
766       {
767          *m_position++ = static_cast<unsigned char>(0xE0u + (c >> 12));
768          *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
769          *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
770       }
771       else
772       {
773          *m_position++ = static_cast<unsigned char>(0xF0u + (c >> 18));
774          *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
775          *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
776          *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
777       }
778    }
779    mutable BaseIterator m_position;
780 };
781 
782 } // namespace boost
783 
784 #endif // BOOST_REGEX_UNICODE_ITERATOR_HPP
785 
786