1 /*
2 *
3 * Copyright (c) 2004
4 * John Maddock
5 *
6 * Use, modification and distribution are subject to the
7 * Boost Software License, Version 1.0. (See accompanying file
8 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9 *
10 */
11
12 /*
13 * LOCATION: see http://www.boost.org for most recent version.
14 * FILE unicode_iterator.hpp
15 * VERSION see <boost/version.hpp>
16 * DESCRIPTION: Iterator adapters for converting between different Unicode encodings.
17 */
18
19 /****************************************************************************
20
21 Contents:
22 ~~~~~~~~~
23
24 1) Read Only, Input Adapters:
25 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
26
27 template <class BaseIterator, class U8Type = std::uint8_t>
28 class u32_to_u8_iterator;
29
30 Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-8.
31
32 template <class BaseIterator, class U32Type = std::uint32_t>
33 class u8_to_u32_iterator;
34
35 Adapts sequence of UTF-8 code points to "look like" a sequence of UTF-32.
36
37 template <class BaseIterator, class U16Type = std::uint16_t>
38 class u32_to_u16_iterator;
39
40 Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-16.
41
42 template <class BaseIterator, class U32Type = std::uint32_t>
43 class u16_to_u32_iterator;
44
45 Adapts sequence of UTF-16 code points to "look like" a sequence of UTF-32.
46
47 2) Single pass output iterator adapters:
48
49 template <class BaseIterator>
50 class utf8_output_iterator;
51
52 Accepts UTF-32 code points and forwards them on as UTF-8 code points.
53
54 template <class BaseIterator>
55 class utf16_output_iterator;
56
57 Accepts UTF-32 code points and forwards them on as UTF-16 code points.
58
59 ****************************************************************************/
60
61 #ifndef BOOST_REGEX_UNICODE_ITERATOR_HPP
62 #define BOOST_REGEX_UNICODE_ITERATOR_HPP
63 #include <cstdint>
64 #include <boost/regex/config.hpp>
65 #include <stdexcept>
66 #include <sstream>
67 #include <ios>
68 #include <limits.h> // CHAR_BIT
69
70 #ifndef BOOST_REGEX_STANDALONE
71 #include <boost/throw_exception.hpp>
72 #endif
73
74 namespace boost{
75
76 namespace detail{
77
// Constants used when converting between UTF-32 and UTF-16 surrogate pairs.
//
// Value added to (code_point >> 10) to obtain the leading (high) surrogate:
// 0xD800 less the 0x10000 offset of the supplementary planes shifted right
// by ten bits, i.e. 0xD7C0.
static const std::uint16_t high_surrogate_base = 0xD800u - (0x10000u >> 10);
// Value added to the low ten bits of a code point to obtain the trailing
// (low) surrogate:
static const std::uint16_t low_surrogate_base = 0xDC00u;
// Selects the ten payload bits carried by a single surrogate:
static const std::uint32_t ten_bit_mask = (1u << 10) - 1u;
81
// Returns true when v is a UTF-16 leading (high) surrogate, that is when
// it lies in the range 0xD800..0xDBFF inclusive.
inline bool is_high_surrogate(std::uint16_t v)
{
   return (v >= 0xD800u) && (v <= 0xDBFFu);
}
// Returns true when v is a UTF-16 trailing (low) surrogate, that is when
// it lies in the range 0xDC00..0xDFFF inclusive.
inline bool is_low_surrogate(std::uint16_t v)
{
   return (v >= 0xDC00u) && (v <= 0xDFFFu);
}
// Returns true when v is a surrogate of either kind (0xD800..0xDFFF).
// All surrogates share the same bits above the low eleven, so the test
// discards those eleven bits and compares what is left with 0xD800.
template <class T>
inline bool is_surrogate(T v)
{
   const auto upper_bits = v & 0xFFFFF800u;
   return upper_bits == 0xd800;
}
95
// Returns the length in bytes (1 to 4) of the UTF-8 sequence that the
// lead byte c announces.
//
// The length is the number of consecutive 1 bits at the top of c.  A byte
// with no leading 1 bits is a single byte sequence, and counts above four
// are clamped to four.  Note that a byte with exactly one leading 1 bit
// (a continuation byte) also yields 1.
inline unsigned utf8_byte_count(std::uint8_t c)
{
   unsigned leading_ones = 0;
   // walk from the most significant bit downwards until a zero bit is met:
   for(unsigned bit = 0x80u; (bit != 0u) && ((c & bit) != 0u); bit >>= 1)
      ++leading_ones;
   if(leading_ones == 0)
      return 1;
   if(leading_ones > 4)
      return 4;
   return leading_ones;
}
109
utf8_trailing_byte_count(std::uint8_t c)110 inline unsigned utf8_trailing_byte_count(std::uint8_t c)
111 {
112 return utf8_byte_count(c) - 1;
113 }
114
115 #ifdef BOOST_REGEX_MSVC
116 #pragma warning(push)
117 #pragma warning(disable:4100)
118 #endif
119 #ifndef BOOST_NO_EXCEPTIONS
120 BOOST_REGEX_NORETURN
121 #endif
invalid_utf32_code_point(std::uint32_t val)122 inline void invalid_utf32_code_point(std::uint32_t val)
123 {
124 std::stringstream ss;
125 ss << "Invalid UTF-32 code point U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-16 sequence";
126 std::out_of_range e(ss.str());
127 #ifndef BOOST_REGEX_STANDALONE
128 boost::throw_exception(e);
129 #else
130 throw e;
131 #endif
132 }
133 #ifdef BOOST_REGEX_MSVC
134 #pragma warning(pop)
135 #endif
136
137
138 } // namespace detail
139
140 template <class BaseIterator, class U16Type = std::uint16_t>
141 class u32_to_u16_iterator
142 {
143 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
144
145 static_assert(sizeof(base_value_type)*CHAR_BIT == 32, "Incorrectly sized template argument");
146 static_assert(sizeof(U16Type)*CHAR_BIT == 16, "Incorrectly sized template argument");
147
148 public:
149 typedef std::ptrdiff_t difference_type;
150 typedef U16Type value_type;
151 typedef value_type const* pointer;
152 typedef value_type const reference;
153 typedef std::bidirectional_iterator_tag iterator_category;
154
operator *() const155 reference operator*()const
156 {
157 if(m_current == 2)
158 extract_current();
159 return m_values[m_current];
160 }
operator ==(const u32_to_u16_iterator & that) const161 bool operator==(const u32_to_u16_iterator& that)const
162 {
163 if(m_position == that.m_position)
164 {
165 // Both m_currents must be equal, or both even
166 // this is the same as saying their sum must be even:
167 return (m_current + that.m_current) & 1u ? false : true;
168 }
169 return false;
170 }
operator !=(const u32_to_u16_iterator & that) const171 bool operator!=(const u32_to_u16_iterator& that)const
172 {
173 return !(*this == that);
174 }
operator ++()175 u32_to_u16_iterator& operator++()
176 {
177 // if we have a pending read then read now, so that we know whether
178 // to skip a position, or move to a low-surrogate:
179 if(m_current == 2)
180 {
181 // pending read:
182 extract_current();
183 }
184 // move to the next surrogate position:
185 ++m_current;
186 // if we've reached the end skip a position:
187 if(m_values[m_current] == 0)
188 {
189 m_current = 2;
190 ++m_position;
191 }
192 return *this;
193 }
operator ++(int)194 u32_to_u16_iterator operator++(int)
195 {
196 u32_to_u16_iterator r(*this);
197 ++(*this);
198 return r;
199 }
operator --()200 u32_to_u16_iterator& operator--()
201 {
202 if(m_current != 1)
203 {
204 // decrementing an iterator always leads to a valid position:
205 --m_position;
206 extract_current();
207 m_current = m_values[1] ? 1 : 0;
208 }
209 else
210 {
211 m_current = 0;
212 }
213 return *this;
214 }
operator --(int)215 u32_to_u16_iterator operator--(int)
216 {
217 u32_to_u16_iterator r(*this);
218 --(*this);
219 return r;
220 }
base() const221 BaseIterator base()const
222 {
223 return m_position;
224 }
225 // construct:
u32_to_u16_iterator()226 u32_to_u16_iterator() : m_position(), m_current(0)
227 {
228 m_values[0] = 0;
229 m_values[1] = 0;
230 m_values[2] = 0;
231 }
u32_to_u16_iterator(BaseIterator b)232 u32_to_u16_iterator(BaseIterator b) : m_position(b), m_current(2)
233 {
234 m_values[0] = 0;
235 m_values[1] = 0;
236 m_values[2] = 0;
237 }
238 private:
239
extract_current() const240 void extract_current()const
241 {
242 // begin by checking for a code point out of range:
243 std::uint32_t v = *m_position;
244 if(v >= 0x10000u)
245 {
246 if(v > 0x10FFFFu)
247 detail::invalid_utf32_code_point(*m_position);
248 // split into two surrogates:
249 m_values[0] = static_cast<U16Type>(v >> 10) + detail::high_surrogate_base;
250 m_values[1] = static_cast<U16Type>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
251 m_current = 0;
252 BOOST_REGEX_ASSERT(detail::is_high_surrogate(m_values[0]));
253 BOOST_REGEX_ASSERT(detail::is_low_surrogate(m_values[1]));
254 }
255 else
256 {
257 // 16-bit code point:
258 m_values[0] = static_cast<U16Type>(*m_position);
259 m_values[1] = 0;
260 m_current = 0;
261 // value must not be a surrogate:
262 if(detail::is_surrogate(m_values[0]))
263 detail::invalid_utf32_code_point(*m_position);
264 }
265 }
266 BaseIterator m_position;
267 mutable U16Type m_values[3];
268 mutable unsigned m_current;
269 };
270
271 template <class BaseIterator, class U32Type = std::uint32_t>
272 class u16_to_u32_iterator
273 {
274 // special values for pending iterator reads:
275 static const U32Type pending_read = 0xffffffffu;
276
277 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
278
279 static_assert(sizeof(base_value_type)*CHAR_BIT == 16, "Incorrectly sized template argument");
280 static_assert(sizeof(U32Type)*CHAR_BIT == 32, "Incorrectly sized template argument");
281
282 public:
283 typedef std::ptrdiff_t difference_type;
284 typedef U32Type value_type;
285 typedef value_type const* pointer;
286 typedef value_type const reference;
287 typedef std::bidirectional_iterator_tag iterator_category;
288
operator *() const289 reference operator*()const
290 {
291 if(m_value == pending_read)
292 extract_current();
293 return m_value;
294 }
operator ==(const u16_to_u32_iterator & that) const295 bool operator==(const u16_to_u32_iterator& that)const
296 {
297 return m_position == that.m_position;
298 }
operator !=(const u16_to_u32_iterator & that) const299 bool operator!=(const u16_to_u32_iterator& that)const
300 {
301 return !(*this == that);
302 }
operator ++()303 u16_to_u32_iterator& operator++()
304 {
305 // skip high surrogate first if there is one:
306 if(detail::is_high_surrogate(*m_position)) ++m_position;
307 ++m_position;
308 m_value = pending_read;
309 return *this;
310 }
operator ++(int)311 u16_to_u32_iterator operator++(int)
312 {
313 u16_to_u32_iterator r(*this);
314 ++(*this);
315 return r;
316 }
operator --()317 u16_to_u32_iterator& operator--()
318 {
319 --m_position;
320 // if we have a low surrogate then go back one more:
321 if(detail::is_low_surrogate(*m_position))
322 --m_position;
323 m_value = pending_read;
324 return *this;
325 }
operator --(int)326 u16_to_u32_iterator operator--(int)
327 {
328 u16_to_u32_iterator r(*this);
329 --(*this);
330 return r;
331 }
base() const332 BaseIterator base()const
333 {
334 return m_position;
335 }
336 // construct:
u16_to_u32_iterator()337 u16_to_u32_iterator() : m_position()
338 {
339 m_value = pending_read;
340 }
u16_to_u32_iterator(BaseIterator b)341 u16_to_u32_iterator(BaseIterator b) : m_position(b)
342 {
343 m_value = pending_read;
344 }
345 //
346 // Range checked version:
347 //
u16_to_u32_iterator(BaseIterator b,BaseIterator start,BaseIterator end)348 u16_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
349 {
350 m_value = pending_read;
351 //
352 // The range must not start with a low surrogate, or end in a high surrogate,
353 // otherwise we run the risk of running outside the underlying input range.
354 // Likewise b must not be located at a low surrogate.
355 //
356 std::uint16_t val;
357 if(start != end)
358 {
359 if((b != start) && (b != end))
360 {
361 val = *b;
362 if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
363 invalid_code_point(val);
364 }
365 val = *start;
366 if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
367 invalid_code_point(val);
368 val = *--end;
369 if(detail::is_high_surrogate(val))
370 invalid_code_point(val);
371 }
372 }
373 private:
invalid_code_point(std::uint16_t val)374 static void invalid_code_point(std::uint16_t val)
375 {
376 std::stringstream ss;
377 ss << "Misplaced UTF-16 surrogate U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-32 sequence";
378 std::out_of_range e(ss.str());
379 #ifndef BOOST_REGEX_STANDALONE
380 boost::throw_exception(e);
381 #else
382 throw e;
383 #endif
384 }
extract_current() const385 void extract_current()const
386 {
387 m_value = static_cast<U32Type>(static_cast< std::uint16_t>(*m_position));
388 // if the last value is a high surrogate then adjust m_position and m_value as needed:
389 if(detail::is_high_surrogate(*m_position))
390 {
391 // precondition; next value must have be a low-surrogate:
392 BaseIterator next(m_position);
393 std::uint16_t t = *++next;
394 if((t & 0xFC00u) != 0xDC00u)
395 invalid_code_point(t);
396 m_value = (m_value - detail::high_surrogate_base) << 10;
397 m_value |= (static_cast<U32Type>(static_cast< std::uint16_t>(t)) & detail::ten_bit_mask);
398 }
399 // postcondition; result must not be a surrogate:
400 if(detail::is_surrogate(m_value))
401 invalid_code_point(static_cast< std::uint16_t>(m_value));
402 }
403 BaseIterator m_position;
404 mutable U32Type m_value;
405 };
406
407 template <class BaseIterator, class U8Type = std::uint8_t>
408 class u32_to_u8_iterator
409 {
410 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
411
412 static_assert(sizeof(base_value_type)*CHAR_BIT == 32, "Incorrectly sized template argument");
413 static_assert(sizeof(U8Type)*CHAR_BIT == 8, "Incorrectly sized template argument");
414
415 public:
416 typedef std::ptrdiff_t difference_type;
417 typedef U8Type value_type;
418 typedef value_type const* pointer;
419 typedef value_type const reference;
420 typedef std::bidirectional_iterator_tag iterator_category;
421
operator *() const422 reference operator*()const
423 {
424 if(m_current == 4)
425 extract_current();
426 return m_values[m_current];
427 }
operator ==(const u32_to_u8_iterator & that) const428 bool operator==(const u32_to_u8_iterator& that)const
429 {
430 if(m_position == that.m_position)
431 {
432 // either the m_current's must be equal, or one must be 0 and
433 // the other 4: which means neither must have bits 1 or 2 set:
434 return (m_current == that.m_current)
435 || (((m_current | that.m_current) & 3) == 0);
436 }
437 return false;
438 }
operator !=(const u32_to_u8_iterator & that) const439 bool operator!=(const u32_to_u8_iterator& that)const
440 {
441 return !(*this == that);
442 }
operator ++()443 u32_to_u8_iterator& operator++()
444 {
445 // if we have a pending read then read now, so that we know whether
446 // to skip a position, or move to a low-surrogate:
447 if(m_current == 4)
448 {
449 // pending read:
450 extract_current();
451 }
452 // move to the next surrogate position:
453 ++m_current;
454 // if we've reached the end skip a position:
455 if(m_values[m_current] == 0)
456 {
457 m_current = 4;
458 ++m_position;
459 }
460 return *this;
461 }
operator ++(int)462 u32_to_u8_iterator operator++(int)
463 {
464 u32_to_u8_iterator r(*this);
465 ++(*this);
466 return r;
467 }
operator --()468 u32_to_u8_iterator& operator--()
469 {
470 if((m_current & 3) == 0)
471 {
472 --m_position;
473 extract_current();
474 m_current = 3;
475 while(m_current && (m_values[m_current] == 0))
476 --m_current;
477 }
478 else
479 --m_current;
480 return *this;
481 }
operator --(int)482 u32_to_u8_iterator operator--(int)
483 {
484 u32_to_u8_iterator r(*this);
485 --(*this);
486 return r;
487 }
base() const488 BaseIterator base()const
489 {
490 return m_position;
491 }
492 // construct:
u32_to_u8_iterator()493 u32_to_u8_iterator() : m_position(), m_current(0)
494 {
495 m_values[0] = 0;
496 m_values[1] = 0;
497 m_values[2] = 0;
498 m_values[3] = 0;
499 m_values[4] = 0;
500 }
u32_to_u8_iterator(BaseIterator b)501 u32_to_u8_iterator(BaseIterator b) : m_position(b), m_current(4)
502 {
503 m_values[0] = 0;
504 m_values[1] = 0;
505 m_values[2] = 0;
506 m_values[3] = 0;
507 m_values[4] = 0;
508 }
509 private:
510
extract_current() const511 void extract_current()const
512 {
513 std::uint32_t c = *m_position;
514 if(c > 0x10FFFFu)
515 detail::invalid_utf32_code_point(c);
516 if(c < 0x80u)
517 {
518 m_values[0] = static_cast<unsigned char>(c);
519 m_values[1] = static_cast<unsigned char>(0u);
520 m_values[2] = static_cast<unsigned char>(0u);
521 m_values[3] = static_cast<unsigned char>(0u);
522 }
523 else if(c < 0x800u)
524 {
525 m_values[0] = static_cast<unsigned char>(0xC0u + (c >> 6));
526 m_values[1] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
527 m_values[2] = static_cast<unsigned char>(0u);
528 m_values[3] = static_cast<unsigned char>(0u);
529 }
530 else if(c < 0x10000u)
531 {
532 m_values[0] = static_cast<unsigned char>(0xE0u + (c >> 12));
533 m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
534 m_values[2] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
535 m_values[3] = static_cast<unsigned char>(0u);
536 }
537 else
538 {
539 m_values[0] = static_cast<unsigned char>(0xF0u + (c >> 18));
540 m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
541 m_values[2] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
542 m_values[3] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
543 }
544 m_current= 0;
545 }
546 BaseIterator m_position;
547 mutable U8Type m_values[5];
548 mutable unsigned m_current;
549 };
550
551 template <class BaseIterator, class U32Type = std::uint32_t>
552 class u8_to_u32_iterator
553 {
554 // special values for pending iterator reads:
555 static const U32Type pending_read = 0xffffffffu;
556
557 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
558
559 static_assert(sizeof(base_value_type)*CHAR_BIT == 8, "Incorrectly sized template argument");
560 static_assert(sizeof(U32Type)*CHAR_BIT == 32, "Incorrectly sized template argument");
561
562 public:
563 typedef std::ptrdiff_t difference_type;
564 typedef U32Type value_type;
565 typedef value_type const* pointer;
566 typedef value_type const reference;
567 typedef std::bidirectional_iterator_tag iterator_category;
568
operator *() const569 reference operator*()const
570 {
571 if(m_value == pending_read)
572 extract_current();
573 return m_value;
574 }
operator ==(const u8_to_u32_iterator & that) const575 bool operator==(const u8_to_u32_iterator& that)const
576 {
577 return m_position == that.m_position;
578 }
operator !=(const u8_to_u32_iterator & that) const579 bool operator!=(const u8_to_u32_iterator& that)const
580 {
581 return !(*this == that);
582 }
operator ++()583 u8_to_u32_iterator& operator++()
584 {
585 // We must not start with a continuation character:
586 if((static_cast<std::uint8_t>(*m_position) & 0xC0) == 0x80)
587 invalid_sequence();
588 // skip high surrogate first if there is one:
589 unsigned c = detail::utf8_byte_count(*m_position);
590 if(m_value == pending_read)
591 {
592 // Since we haven't read in a value, we need to validate the code points:
593 for(unsigned i = 0; i < c; ++i)
594 {
595 ++m_position;
596 // We must have a continuation byte:
597 if((i != c - 1) && ((static_cast<std::uint8_t>(*m_position) & 0xC0) != 0x80))
598 invalid_sequence();
599 }
600 }
601 else
602 {
603 std::advance(m_position, c);
604 }
605 m_value = pending_read;
606 return *this;
607 }
operator ++(int)608 u8_to_u32_iterator operator++(int)
609 {
610 u8_to_u32_iterator r(*this);
611 ++(*this);
612 return r;
613 }
operator --()614 u8_to_u32_iterator& operator--()
615 {
616 // Keep backtracking until we don't have a trailing character:
617 unsigned count = 0;
618 while((*--m_position & 0xC0u) == 0x80u) ++count;
619 // now check that the sequence was valid:
620 if(count != detail::utf8_trailing_byte_count(*m_position))
621 invalid_sequence();
622 m_value = pending_read;
623 return *this;
624 }
operator --(int)625 u8_to_u32_iterator operator--(int)
626 {
627 u8_to_u32_iterator r(*this);
628 --(*this);
629 return r;
630 }
base() const631 BaseIterator base()const
632 {
633 return m_position;
634 }
635 // construct:
u8_to_u32_iterator()636 u8_to_u32_iterator() : m_position()
637 {
638 m_value = pending_read;
639 }
u8_to_u32_iterator(BaseIterator b)640 u8_to_u32_iterator(BaseIterator b) : m_position(b)
641 {
642 m_value = pending_read;
643 }
644 //
645 // Checked constructor:
646 //
u8_to_u32_iterator(BaseIterator b,BaseIterator start,BaseIterator end)647 u8_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
648 {
649 m_value = pending_read;
650 //
651 // We must not start with a continuation character, or end with a
652 // truncated UTF-8 sequence otherwise we run the risk of going past
653 // the start/end of the underlying sequence:
654 //
655 if(start != end)
656 {
657 unsigned char v = *start;
658 if((v & 0xC0u) == 0x80u)
659 invalid_sequence();
660 if((b != start) && (b != end) && ((*b & 0xC0u) == 0x80u))
661 invalid_sequence();
662 BaseIterator pos = end;
663 do
664 {
665 v = *--pos;
666 }
667 while((start != pos) && ((v & 0xC0u) == 0x80u));
668 std::ptrdiff_t extra = detail::utf8_byte_count(v);
669 if(std::distance(pos, end) < extra)
670 invalid_sequence();
671 }
672 }
673 private:
invalid_sequence()674 static void invalid_sequence()
675 {
676 std::out_of_range e("Invalid UTF-8 sequence encountered while trying to encode UTF-32 character");
677 #ifndef BOOST_REGEX_STANDALONE
678 boost::throw_exception(e);
679 #else
680 throw e;
681 #endif
682 }
extract_current() const683 void extract_current()const
684 {
685 m_value = static_cast<U32Type>(static_cast< std::uint8_t>(*m_position));
686 // we must not have a continuation character:
687 if((m_value & 0xC0u) == 0x80u)
688 invalid_sequence();
689 // see how many extra bytes we have:
690 unsigned extra = detail::utf8_trailing_byte_count(*m_position);
691 // extract the extra bits, 6 from each extra byte:
692 BaseIterator next(m_position);
693 for(unsigned c = 0; c < extra; ++c)
694 {
695 ++next;
696 m_value <<= 6;
697 // We must have a continuation byte:
698 if((static_cast<std::uint8_t>(*next) & 0xC0) != 0x80)
699 invalid_sequence();
700 m_value += static_cast<std::uint8_t>(*next) & 0x3Fu;
701 }
702 // we now need to remove a few of the leftmost bits, but how many depends
703 // upon how many extra bytes we've extracted:
704 static const std::uint32_t masks[4] =
705 {
706 0x7Fu,
707 0x7FFu,
708 0xFFFFu,
709 0x1FFFFFu,
710 };
711 m_value &= masks[extra];
712 // check the result is in range:
713 if(m_value > static_cast<U32Type>(0x10FFFFu))
714 invalid_sequence();
715 // The result must not be a surrogate:
716 if((m_value >= static_cast<U32Type>(0xD800)) && (m_value <= static_cast<U32Type>(0xDFFF)))
717 invalid_sequence();
718 // We should not have had an invalidly encoded UTF8 sequence:
719 if((extra > 0) && (m_value <= static_cast<U32Type>(masks[extra - 1])))
720 invalid_sequence();
721 }
722 BaseIterator m_position;
723 mutable U32Type m_value;
724 };
725
726 template <class BaseIterator>
727 class utf16_output_iterator
728 {
729 public:
730 typedef void difference_type;
731 typedef void value_type;
732 typedef std::uint32_t* pointer;
733 typedef std::uint32_t& reference;
734 typedef std::output_iterator_tag iterator_category;
735
utf16_output_iterator(const BaseIterator & b)736 utf16_output_iterator(const BaseIterator& b)
737 : m_position(b){}
utf16_output_iterator(const utf16_output_iterator & that)738 utf16_output_iterator(const utf16_output_iterator& that)
739 : m_position(that.m_position){}
operator =(const utf16_output_iterator & that)740 utf16_output_iterator& operator=(const utf16_output_iterator& that)
741 {
742 m_position = that.m_position;
743 return *this;
744 }
operator *() const745 const utf16_output_iterator& operator*()const
746 {
747 return *this;
748 }
operator =(std::uint32_t val) const749 void operator=(std::uint32_t val)const
750 {
751 push(val);
752 }
operator ++()753 utf16_output_iterator& operator++()
754 {
755 return *this;
756 }
operator ++(int)757 utf16_output_iterator& operator++(int)
758 {
759 return *this;
760 }
base() const761 BaseIterator base()const
762 {
763 return m_position;
764 }
765 private:
push(std::uint32_t v) const766 void push(std::uint32_t v)const
767 {
768 if(v >= 0x10000u)
769 {
770 // begin by checking for a code point out of range:
771 if(v > 0x10FFFFu)
772 detail::invalid_utf32_code_point(v);
773 // split into two surrogates:
774 *m_position++ = static_cast<std::uint16_t>(v >> 10) + detail::high_surrogate_base;
775 *m_position++ = static_cast<std::uint16_t>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
776 }
777 else
778 {
779 // 16-bit code point:
780 // value must not be a surrogate:
781 if(detail::is_surrogate(v))
782 detail::invalid_utf32_code_point(v);
783 *m_position++ = static_cast<std::uint16_t>(v);
784 }
785 }
786 mutable BaseIterator m_position;
787 };
788
789 template <class BaseIterator>
790 class utf8_output_iterator
791 {
792 public:
793 typedef void difference_type;
794 typedef void value_type;
795 typedef std::uint32_t* pointer;
796 typedef std::uint32_t& reference;
797 typedef std::output_iterator_tag iterator_category;
798
utf8_output_iterator(const BaseIterator & b)799 utf8_output_iterator(const BaseIterator& b)
800 : m_position(b){}
utf8_output_iterator(const utf8_output_iterator & that)801 utf8_output_iterator(const utf8_output_iterator& that)
802 : m_position(that.m_position){}
operator =(const utf8_output_iterator & that)803 utf8_output_iterator& operator=(const utf8_output_iterator& that)
804 {
805 m_position = that.m_position;
806 return *this;
807 }
operator *() const808 const utf8_output_iterator& operator*()const
809 {
810 return *this;
811 }
operator =(std::uint32_t val) const812 void operator=(std::uint32_t val)const
813 {
814 push(val);
815 }
operator ++()816 utf8_output_iterator& operator++()
817 {
818 return *this;
819 }
operator ++(int)820 utf8_output_iterator& operator++(int)
821 {
822 return *this;
823 }
base() const824 BaseIterator base()const
825 {
826 return m_position;
827 }
828 private:
push(std::uint32_t c) const829 void push(std::uint32_t c)const
830 {
831 if(c > 0x10FFFFu)
832 detail::invalid_utf32_code_point(c);
833 if(c < 0x80u)
834 {
835 *m_position++ = static_cast<unsigned char>(c);
836 }
837 else if(c < 0x800u)
838 {
839 *m_position++ = static_cast<unsigned char>(0xC0u + (c >> 6));
840 *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
841 }
842 else if(c < 0x10000u)
843 {
844 *m_position++ = static_cast<unsigned char>(0xE0u + (c >> 12));
845 *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
846 *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
847 }
848 else
849 {
850 *m_position++ = static_cast<unsigned char>(0xF0u + (c >> 18));
851 *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
852 *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
853 *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
854 }
855 }
856 mutable BaseIterator m_position;
857 };
858
859 } // namespace boost
860
861 #endif // BOOST_REGEX_UNICODE_ITERATOR_HPP
862
863