1 /*
2  * MIT License
3  *
4  * Copyright (c) 2017-2019 Mikhail Pilin
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #pragma once
26 
27 #include <cstdint>
28 #include <utility>
29 #include <stdexcept>
30 
31 #include "base/result.h"
32 
33 namespace ww898 {
34 namespace utf {
35 
36 // Supported combinations:
37 //   0xxx_xxxx
38 //   110x_xxxx 10xx_xxxx
39 //   1110_xxxx 10xx_xxxx 10xx_xxxx
40 //   1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx
41 //   1111_10xx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx
42 //   1111_110x 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx
43 struct utf8 final
44 {
45     static size_t const max_unicode_symbol_size = 4;
46     static size_t const max_supported_symbol_size = 6;
47 
48     static uint32_t const max_supported_code_point = 0x7FFFFFFF;
49 
50     using char_type = uint8_t;
51 
52     template<typename PeekFn>
char_sizeww898::utf::utf853     static Result<size_t, const char *> char_size(PeekFn && peek_fn)
54     {
55         const std::pair<char_type, size_t> peek_res = std::forward<PeekFn>(peek_fn)();
56         const auto ch0 = peek_res.first;
57         const auto remaining = peek_res.second;
58         size_t retval = 0;
59 
60         if (ch0 < 0x80) { // 0xxx_xxxx
61             retval = 1;
62         } else if (ch0 < 0xC0) {
63             return Err("The utf8 first char in sequence is incorrect");
64         } else if (ch0 < 0xE0) { // 110x_xxxx 10xx_xxxx
65             retval = 2;
66         } else if (ch0 < 0xF0) { // 1110_xxxx 10xx_xxxx 10xx_xxxx
67             retval = 3;
68         } else if (ch0 < 0xF8) { // 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx
69             retval = 4;
70         } else if (ch0 < 0xFC) { // 1111_10xx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx
71             retval = 5;
72         } else if (ch0 < 0xFE) { // 1111_110x 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx
73             retval = 6;
74         } else {
75             return Err("The utf8 first char in sequence is incorrect");
76         }
77         if (retval - 1 > remaining) {
78             return Err("Truncated utf8 sequence");
79         }
80         return Ok(retval);
81     }
82 
83     template<typename ReadFn>
readww898::utf::utf884     static uint32_t read(ReadFn && read_fn)
85     {
86         char_type const ch0 = read_fn();
87         if (ch0 < 0x80) // 0xxx_xxxx
88             return ch0;
89         if (ch0 < 0xC0)
90             throw std::runtime_error("The utf8 first char in sequence is incorrect");
91         if (ch0 < 0xE0) // 110x_xxxx 10xx_xxxx
92         {
93             char_type const ch1 = read_fn(); if (ch1 >> 6 != 2) goto _err;
94             return (ch0 << 6) + ch1 - 0x3080;
95         }
96         if (ch0 < 0xF0) // 1110_xxxx 10xx_xxxx 10xx_xxxx
97         {
98             char_type const ch1 = read_fn(); if (ch1 >> 6 != 2) goto _err;
99             char_type const ch2 = read_fn(); if (ch2 >> 6 != 2) goto _err;
100             return (ch0 << 12) + (ch1 << 6) + ch2 - 0xE2080;
101         }
102         if (ch0 < 0xF8) // 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx
103         {
104             char_type const ch1 = read_fn(); if (ch1 >> 6 != 2) goto _err;
105             char_type const ch2 = read_fn(); if (ch2 >> 6 != 2) goto _err;
106             char_type const ch3 = read_fn(); if (ch3 >> 6 != 2) goto _err;
107             return (ch0 << 18) + (ch1 << 12) + (ch2 << 6) + ch3 - 0x3C82080;
108         }
109         if (ch0 < 0xFC) // 1111_10xx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx
110         {
111             char_type const ch1 = read_fn(); if (ch1 >> 6 != 2) goto _err;
112             char_type const ch2 = read_fn(); if (ch2 >> 6 != 2) goto _err;
113             char_type const ch3 = read_fn(); if (ch3 >> 6 != 2) goto _err;
114             char_type const ch4 = read_fn(); if (ch4 >> 6 != 2) goto _err;
115             return (ch0 << 24) + (ch1 << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - 0xFA082080;
116         }
117         if (ch0 < 0xFE) // 1111_110x 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx
118         {
119             char_type const ch1 = read_fn(); if (ch1 >> 6 != 2) goto _err;
120             char_type const ch2 = read_fn(); if (ch2 >> 6 != 2) goto _err;
121             char_type const ch3 = read_fn(); if (ch3 >> 6 != 2) goto _err;
122             char_type const ch4 = read_fn(); if (ch4 >> 6 != 2) goto _err;
123             char_type const ch5 = read_fn(); if (ch5 >> 6 != 2) goto _err;
124             return (ch0 << 30) + (ch1 << 24) + (ch2 << 18) + (ch3 << 12) + (ch4 << 6) + ch5 - 0x82082080;
125         }
126         throw std::runtime_error("The utf8 first char in sequence is incorrect");
127         _err: throw std::runtime_error("The utf8 slave char in sequence is incorrect");
128     }
129 
130     template<typename WriteFn>
writeww898::utf::utf8131     static void write(uint32_t const cp, WriteFn && write_fn)
132     {
133         if (cp < 0x80)          // 0xxx_xxxx
134             write_fn(static_cast<char_type>(cp));
135         else if (cp < 0x800)    // 110x_xxxx 10xx_xxxx
136         {
137             write_fn(static_cast<char_type>(0xC0 | cp >>  6));
138             goto _1;
139         }
140         else if (cp < 0x10000)  // 1110_xxxx 10xx_xxxx 10xx_xxxx
141         {
142             write_fn(static_cast<char_type>(0xE0 | cp >> 12));
143             goto _2;
144         }
145         else if (cp < 0x200000) // 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx
146         {
147             write_fn(static_cast<char_type>(0xF0 | cp >> 18));
148             goto _3;
149         }
150         else if (cp < 0x4000000) // 1111_10xx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx
151         {
152             write_fn(static_cast<char_type>(0xF8 | cp >> 24));
153             goto _4;
154         }
155         else if (cp < 0x80000000) // 1111_110x 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx
156         {
157             write_fn(static_cast<char_type>(0xFC | cp >> 30));
158             goto _5;
159         }
160         else
161             throw std::runtime_error("Tool large UTF8 code point");
162         return;
163         _5: write_fn(static_cast<char_type>(0x80 | (cp >> 24 & 0x3F)));
164         _4: write_fn(static_cast<char_type>(0x80 | (cp >> 18 & 0x3F)));
165         _3: write_fn(static_cast<char_type>(0x80 | (cp >> 12 & 0x3F)));
166         _2: write_fn(static_cast<char_type>(0x80 | (cp >>  6 & 0x3F)));
167         _1: write_fn(static_cast<char_type>(0x80 | (cp       & 0x3F)));
168     }
169 };
170 
171 }}
172