1 // Copyright 2006 Nemanja Trifunovic
2 
3 /*
4 Permission is hereby granted, free of charge, to any person or organization
5 obtaining a copy of the software and accompanying documentation covered by
6 this license (the "Software") to use, reproduce, display, distribute,
7 execute, and transmit the Software, and to prepare derivative works of the
8 Software, and to permit third-parties to whom the Software is furnished to
9 do so, all subject to the following:
10 
11 The copyright notices in the Software and this entire statement, including
12 the above license grant, this restriction and the following disclaimer,
13 must be included in all copies of the Software, in whole or in part, and
14 all derivative works of the Software, unless such copies or derivative
15 works are solely in the form of machine-executable object code generated by
16 a source language processor.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
21 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
22 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
23 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.
25 */
26 
27 /** \file checked.h \brief Contains part of the utfcpp library.  See
28     http://utfcpp.sourceforge.net for documentation. */
29 
30 
31 #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
32 #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
33 
34 #include "core.h"
35 #include <stdexcept>
36 
37 namespace utf8
38 {
39     // Exceptions that may be thrown from the library functions.
40     class invalid_code_point : public std::exception {
41         uint32_t cp;
42     public:
invalid_code_point(uint32_t cp)43         invalid_code_point(uint32_t cp) : cp(cp) {}
what()44         const char* what() const noexcept override { return "Invalid code point"; }
code_point()45         uint32_t code_point() const {return cp;}
46     };
47 
48     class invalid_utf8 : public std::exception {
49         uint8_t u8;
50     public:
invalid_utf8(uint8_t u)51         invalid_utf8 (uint8_t u) : u8(u) {}
what()52         const char* what() const noexcept override { return "Invalid UTF-8"; }
utf8_octet()53         uint8_t utf8_octet() const {return u8;}
54     };
55 
56     class invalid_utf16 : public std::exception {
57         uint16_t u16;
58     public:
invalid_utf16(uint16_t u)59         invalid_utf16 (uint16_t u) : u16(u) {}
what()60         const char* what() const noexcept override { return "Invalid UTF-16"; }
utf16_word()61         uint16_t utf16_word() const {return u16;}
62     };
63 
64     class not_enough_room : public std::exception {
65     public:
what()66         const char* what() const noexcept override { return "Not enough space"; }
67     };
68 
69     /// The library API - functions intended to be called by the users
70 
71     template <typename octet_iterator, typename output_iterator>
replace_invalid(octet_iterator start,octet_iterator end,output_iterator out,uint32_t replacement)72     output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
73     {
74         while (start != end) {
75             octet_iterator sequence_start = start;
76             internal::utf_error err_code = internal::validate_next(start, end);
77             switch (err_code) {
78                 case internal::OK :
79                     for (octet_iterator it = sequence_start; it != start; ++it)
80                         *out++ = *it;
81                     break;
82                 case internal::NOT_ENOUGH_ROOM:
83                     throw not_enough_room();
84                 case internal::INVALID_LEAD:
85                     append (replacement, out);
86                     ++start;
87                     break;
88                 case internal::INCOMPLETE_SEQUENCE:
89                 case internal::OVERLONG_SEQUENCE:
90                 case internal::INVALID_CODE_POINT:
91                     append (replacement, out);
92                     ++start;
93                     // just one replacement mark for the sequence
94                     while (internal::is_trail(*start) && start != end)
95                         ++start;
96                     break;
97             }
98         }
99         return out;
100     }
101 
102     template <typename octet_iterator, typename output_iterator>
replace_invalid(octet_iterator start,octet_iterator end,output_iterator out)103     inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
104     {
105         static const uint32_t replacement_marker = internal::mask16(0xfffd);
106         return replace_invalid(start, end, out, replacement_marker);
107     }
108 
109     template <typename octet_iterator>
append(uint32_t cp,octet_iterator result)110     octet_iterator append(uint32_t cp, octet_iterator result)
111     {
112         if (!internal::is_code_point_valid(cp))
113             throw invalid_code_point(cp);
114 
115         if (cp < 0x80)                        // one octet
116             *(result++) = static_cast<uint8_t>(cp);
117         else if (cp < 0x800) {                // two octets
118             *(result++) = static_cast<uint8_t>((cp >> 6)            | 0xc0);
119             *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
120         }
121         else if (cp < 0x10000) {              // three octets
122             *(result++) = static_cast<uint8_t>((cp >> 12)           | 0xe0);
123             *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
124             *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
125         }
126         else if (cp <= internal::CODE_POINT_MAX) {      // four octets
127             *(result++) = static_cast<uint8_t>((cp >> 18)           | 0xf0);
128             *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)  | 0x80);
129             *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
130             *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
131         }
132         else
133             throw invalid_code_point(cp);
134 
135         return result;
136     }
137 
138     template <typename octet_iterator>
next(octet_iterator & it,octet_iterator end)139     uint32_t next(octet_iterator& it, octet_iterator end)
140     {
141         uint32_t cp = 0;
142         internal::utf_error err_code = internal::validate_next(it, end, &cp);
143         switch (err_code) {
144             case internal::OK :
145                 break;
146             case internal::NOT_ENOUGH_ROOM :
147                 throw not_enough_room();
148             case internal::INVALID_LEAD :
149             case internal::INCOMPLETE_SEQUENCE :
150             case internal::OVERLONG_SEQUENCE :
151                 throw invalid_utf8(*it);
152             case internal::INVALID_CODE_POINT :
153                 throw invalid_code_point(cp);
154         }
155         return cp;
156     }
157 
158     template <typename octet_iterator>
peek_next(octet_iterator it,octet_iterator end)159     uint32_t peek_next(octet_iterator it, octet_iterator end)
160     {
161         return next(it, end);
162     }
163 
164     template <typename octet_iterator>
prior(octet_iterator & it,octet_iterator start)165     uint32_t prior(octet_iterator& it, octet_iterator start)
166     {
167         octet_iterator end = it;
168         while (internal::is_trail(*(--it)))
169             if (it < start)
170                 throw invalid_utf8(*it); // error - no lead byte in the sequence
171         octet_iterator temp = it;
172         return next(temp, end);
173     }
174 
175     /// Deprecated in versions that include "prior"
176     template <typename octet_iterator>
previous(octet_iterator & it,octet_iterator pass_start)177     uint32_t previous(octet_iterator& it, octet_iterator pass_start)
178     {
179         octet_iterator end = it;
180         while (internal::is_trail(*(--it)))
181             if (it == pass_start)
182                 throw invalid_utf8(*it); // error - no lead byte in the sequence
183         octet_iterator temp = it;
184         return next(temp, end);
185     }
186 
187     template <typename octet_iterator, typename distance_type>
advance(octet_iterator & it,distance_type n,octet_iterator end)188     void advance (octet_iterator& it, distance_type n, octet_iterator end)
189     {
190         for (distance_type i = 0; i < n; ++i)
191             next(it, end);
192     }
193 
194     template <typename octet_iterator>
195     typename std::iterator_traits<octet_iterator>::difference_type
distance(octet_iterator first,octet_iterator last)196     distance (octet_iterator first, octet_iterator last)
197     {
198         typename std::iterator_traits<octet_iterator>::difference_type dist;
199         for (dist = 0; first < last; ++dist)
200             next(first, last);
201         return dist;
202     }
203 
204     template <typename u16bit_iterator, typename octet_iterator>
utf16to8(u16bit_iterator start,u16bit_iterator end,octet_iterator result)205     octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
206     {
207         while (start != end) {
208             uint32_t cp = internal::mask16(*start++);
209             // Take care of surrogate pairs first
210             if (internal::is_surrogate(cp)) {
211                 if (start != end) {
212                     uint32_t trail_surrogate = internal::mask16(*start++);
213                     if (trail_surrogate >= internal::TRAIL_SURROGATE_MIN && trail_surrogate <= internal::TRAIL_SURROGATE_MAX)
214                         cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
215                     else
216                         throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
217                 }
218                 else
219                     throw invalid_utf16(static_cast<uint16_t>(*start));
220 
221             }
222             result = append(cp, result);
223         }
224         return result;
225     }
226 
227     template <typename u16bit_iterator, typename octet_iterator>
utf8to16(octet_iterator start,octet_iterator end,u16bit_iterator result)228     u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
229     {
230         while (start != end) {
231             uint32_t cp = next(start, end);
232             if (cp > 0xffff) { //make a surrogate pair
233                 *result++ = static_cast<uint16_t>((cp >> 10)   + internal::LEAD_OFFSET);
234                 *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
235             }
236             else
237                 *result++ = static_cast<uint16_t>(cp);
238         }
239         return result;
240     }
241 
242     template <typename octet_iterator, typename u32bit_iterator>
utf32to8(u32bit_iterator start,u32bit_iterator end,octet_iterator result)243     octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
244     {
245         while (start != end)
246             result = append(*(start++), result);
247 
248         return result;
249     }
250 
251     template <typename octet_iterator, typename u32bit_iterator>
utf8to32(octet_iterator start,octet_iterator end,u32bit_iterator result)252     u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
253     {
254         while (start < end)
255             (*result++) = next(start, end);
256 
257         return result;
258     }
259 
260     // The iterator class
261     template <typename octet_iterator>
262     class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
263       octet_iterator it;
264       octet_iterator range_start;
265       octet_iterator range_end;
266       public:
iterator()267       iterator () {};
iterator(const octet_iterator & octet_it,const octet_iterator & range_start,const octet_iterator & range_end)268       explicit iterator (const octet_iterator& octet_it,
269                          const octet_iterator& range_start,
270                          const octet_iterator& range_end) :
271                it(octet_it), range_start(range_start), range_end(range_end)
272       {
273           if (it < range_start || it > range_end)
274               throw std::out_of_range("Invalid utf-8 iterator position");
275       }
276       // the default "big three" are OK
base()277       octet_iterator base () const { return it; }
278       uint32_t operator * () const
279       {
280           octet_iterator temp = it;
281           return next(temp, range_end);
282       }
283       bool operator == (const iterator& rhs) const
284       {
285           if (range_start != rhs.range_start || range_end != rhs.range_end)
286               throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
287           return (it == rhs.it);
288       }
289       bool operator != (const iterator& rhs) const
290       {
291           return !(operator == (rhs));
292       }
293       iterator& operator ++ ()
294       {
295           next(it, range_end);
296           return *this;
297       }
298       iterator operator ++ (int)
299       {
300           iterator temp = *this;
301           next(it, range_end);
302           return temp;
303       }
304       iterator& operator -- ()
305       {
306           prior(it, range_start);
307           return *this;
308       }
309       iterator operator -- (int)
310       {
311           iterator temp = *this;
312           prior(it, range_start);
313           return temp;
314       }
315     }; // class iterator
316 
317     // The wchar_t iterator class
318     template <typename octet_iterator>
319     class wchar_iterator :
320         public std::iterator<std::bidirectional_iterator_tag, wchar_t>
321     {
322         octet_iterator it;
323         octet_iterator range_start;
324         octet_iterator range_end;
325     public:
wchar_iterator()326         wchar_iterator () {};
wchar_iterator(const octet_iterator & octet_it,const octet_iterator & range_start,const octet_iterator & range_end)327         wchar_iterator (const octet_iterator& octet_it,
328                         const octet_iterator& range_start,
329                         const octet_iterator& range_end) :
330             it(octet_it), range_start(range_start), range_end(range_end)
331         {
332             if (it < range_start || it > range_end)
333                 throw std::out_of_range("Invalid utf-8 iterator position");
334         }
335         // the default "big three" are OK
base()336         octet_iterator base () const { return it; }
337         wchar_t operator * () const
338         {
339             octet_iterator temp = it;
340             uint32_t retval = next(temp, range_end);
341             assert(retval <= WCHAR_MAX);
342             return retval;
343         }
344         bool operator == (const wchar_iterator& rhs) const
345         {
346             if (range_start != rhs.range_start || range_end != rhs.range_end)
347                 throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
348             return (it == rhs.it);
349         }
350         bool operator != (const wchar_iterator& rhs) const
351         {
352             return !(operator == (rhs));
353         }
354         wchar_iterator& operator ++ ()
355         {
356             next(it, range_end);
357             return *this;
358         }
359         wchar_iterator operator ++ (int)
360         {
361             wchar_iterator temp = *this;
362             next(it, range_end);
363             return temp;
364         }
365         wchar_iterator& operator -- ()
366         {
367             prior(it, range_start);
368             return *this;
369         }
370         wchar_iterator operator -- (int)
371         {
372             wchar_iterator temp = *this;
373             prior(it, range_start);
374             return temp;
375         }
376     };
377 
378 } // namespace utf8
379 
380 #endif //header guard
381 
382 
383