1 /* 2 * MIT License 3 * 4 * Copyright (c) 2017-2019 Mikhail Pilin 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in all 14 * copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24 25 #pragma once 26 27 #include <cstdint> 28 #include <utility> 29 #include <stdexcept> 30 31 #include "base/result.h" 32 33 namespace ww898 { 34 namespace utf { 35 36 // Supported combinations: 37 // 0xxx_xxxx 38 // 110x_xxxx 10xx_xxxx 39 // 1110_xxxx 10xx_xxxx 10xx_xxxx 40 // 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 41 // 1111_10xx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 42 // 1111_110x 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 43 struct utf8 final 44 { 45 static size_t const max_unicode_symbol_size = 4; 46 static size_t const max_supported_symbol_size = 6; 47 48 static uint32_t const max_supported_code_point = 0x7FFFFFFF; 49 50 using char_type = uint8_t; 51 52 template<typename PeekFn> char_sizeww898::utf::utf853 static Result<size_t, const char *> char_size(PeekFn && peek_fn) 54 { 55 const std::pair<char_type, size_t> peek_res = std::forward<PeekFn>(peek_fn)(); 56 const auto ch0 = peek_res.first; 57 const auto remaining = peek_res.second; 58 size_t retval = 0; 59 60 if (ch0 < 0x80) { // 0xxx_xxxx 61 retval = 1; 62 } else if (ch0 < 0xC0) { 63 return Err("The utf8 first char in sequence is incorrect"); 64 } else if (ch0 < 0xE0) { // 110x_xxxx 10xx_xxxx 65 retval = 2; 66 } else if (ch0 < 0xF0) { // 1110_xxxx 10xx_xxxx 10xx_xxxx 67 retval = 3; 68 } else if (ch0 < 0xF8) { // 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 69 retval = 4; 70 } else if (ch0 < 0xFC) { // 1111_10xx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 71 retval = 5; 72 } else if (ch0 < 0xFE) { // 1111_110x 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 73 retval = 6; 74 } else { 75 return Err("The utf8 first char in sequence is incorrect"); 76 } 77 if (retval - 1 > remaining) { 78 return Err("Truncated utf8 sequence"); 79 } 80 return Ok(retval); 81 } 82 83 template<typename ReadFn> readww898::utf::utf884 static uint32_t read(ReadFn && read_fn) 85 { 86 char_type const ch0 = read_fn(); 87 if (ch0 < 0x80) // 0xxx_xxxx 88 return ch0; 89 if (ch0 < 0xC0) 90 throw std::runtime_error("The utf8 first char in sequence is incorrect"); 91 if (ch0 < 0xE0) // 110x_xxxx 10xx_xxxx 92 { 93 char_type const ch1 = read_fn(); if (ch1 >> 6 != 2) goto _err; 94 return (ch0 << 6) + ch1 - 0x3080; 95 } 96 if (ch0 < 0xF0) // 1110_xxxx 10xx_xxxx 10xx_xxxx 97 { 98 char_type const ch1 = read_fn(); if (ch1 >> 6 != 2) goto _err; 99 char_type const ch2 = read_fn(); if (ch2 >> 6 != 2) goto _err; 100 return (ch0 << 12) + (ch1 << 6) + ch2 - 0xE2080; 101 } 102 if (ch0 < 0xF8) // 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 103 { 104 char_type const ch1 = read_fn(); if (ch1 >> 6 != 2) goto _err; 105 char_type const ch2 = read_fn(); if (ch2 >> 6 != 2) goto _err; 106 char_type const ch3 = read_fn(); if (ch3 >> 6 != 2) goto _err; 107 return (ch0 << 18) + (ch1 << 12) + (ch2 << 6) + ch3 - 0x3C82080; 108 } 109 if (ch0 < 0xFC) // 1111_10xx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 110 { 111 char_type const ch1 = read_fn(); if (ch1 >> 6 != 2) goto _err; 112 char_type const ch2 = read_fn(); if (ch2 >> 6 != 2) goto _err; 113 char_type const ch3 = read_fn(); if (ch3 >> 6 != 2) goto _err; 114 char_type const ch4 = read_fn(); if (ch4 >> 6 != 2) goto _err; 115 return (ch0 << 24) + (ch1 << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - 0xFA082080; 116 } 117 if (ch0 < 0xFE) // 1111_110x 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 118 { 119 char_type const ch1 = read_fn(); if (ch1 >> 6 != 2) goto _err; 120 char_type const ch2 = read_fn(); if (ch2 >> 6 != 2) goto _err; 121 char_type const ch3 = read_fn(); if (ch3 >> 6 != 2) goto _err; 122 char_type const ch4 = read_fn(); if (ch4 >> 6 != 2) goto _err; 123 char_type const ch5 = read_fn(); if (ch5 >> 6 != 2) goto _err; 124 return (ch0 << 30) + (ch1 << 24) + (ch2 << 18) + (ch3 << 12) + (ch4 << 6) + ch5 - 0x82082080; 125 } 126 throw std::runtime_error("The utf8 first char in sequence is incorrect"); 127 _err: throw std::runtime_error("The utf8 slave char in sequence is incorrect"); 128 } 129 130 template<typename WriteFn> writeww898::utf::utf8131 static void write(uint32_t const cp, WriteFn && write_fn) 132 { 133 if (cp < 0x80) // 0xxx_xxxx 134 write_fn(static_cast<char_type>(cp)); 135 else if (cp < 0x800) // 110x_xxxx 10xx_xxxx 136 { 137 write_fn(static_cast<char_type>(0xC0 | cp >> 6)); 138 goto _1; 139 } 140 else if (cp < 0x10000) // 1110_xxxx 10xx_xxxx 10xx_xxxx 141 { 142 write_fn(static_cast<char_type>(0xE0 | cp >> 12)); 143 goto _2; 144 } 145 else if (cp < 0x200000) // 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 146 { 147 write_fn(static_cast<char_type>(0xF0 | cp >> 18)); 148 goto _3; 149 } 150 else if (cp < 0x4000000) // 1111_10xx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 151 { 152 write_fn(static_cast<char_type>(0xF8 | cp >> 24)); 153 goto _4; 154 } 155 else if (cp < 0x80000000) // 1111_110x 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 156 { 157 write_fn(static_cast<char_type>(0xFC | cp >> 30)); 158 goto _5; 159 } 160 else 161 throw std::runtime_error("Tool large UTF8 code point"); 162 return; 163 _5: write_fn(static_cast<char_type>(0x80 | (cp >> 24 & 0x3F))); 164 _4: write_fn(static_cast<char_type>(0x80 | (cp >> 18 & 0x3F))); 165 _3: write_fn(static_cast<char_type>(0x80 | (cp >> 12 & 0x3F))); 166 _2: write_fn(static_cast<char_type>(0x80 | (cp >> 6 & 0x3F))); 167 _1: write_fn(static_cast<char_type>(0x80 | (cp & 0x3F))); 168 } 169 }; 170 171 }} 172