1 /* 2 * RESTinio 3 */ 4 5 /*! 6 * @file 7 * @brief An implementation of checker for UTF-8 sequences. 8 * 9 * @since v.0.6.5 10 */ 11 12 #pragma once 13 14 #include <restinio/compiler_features.hpp> 15 16 #include <cstdint> 17 18 namespace restinio 19 { 20 21 namespace utils 22 { 23 24 // 25 // utf8_checker_t 26 // 27 28 /*! 29 * @brief Helper class for checking UTF-8 byte sequence during parsing 30 * URI or incoming byte stream. 31 * 32 * Note: this class is moved to restinio::utils namespace in v.0.6.5. 33 */ 34 class utf8_checker_t 35 { 36 public: 37 utf8_checker_t() = default; 38 39 RESTINIO_NODISCARD 40 bool process_byte(std::uint8_t byte)41 process_byte( std::uint8_t byte ) noexcept 42 { 43 check_overlong( byte ); 44 45 if( m_current_symbol_rest_bytes > 0 ) 46 { 47 // check byte is 10xxxxxx. 48 if( (byte & 0xC0) == 0x80 ) 49 { 50 m_current_symbol <<= 6; 51 byte &= 0x3F; 52 53 m_current_symbol |= byte; 54 55 if( --m_current_symbol_rest_bytes == 0 ) 56 { 57 validate_current_symbol(); 58 } 59 } 60 else 61 { 62 m_state = state_t::invalid; 63 } 64 } 65 else 66 { 67 m_current_symbol = 0; 68 69 if( (byte & 0x80) == 0x00) 70 { 71 // mask 0xxxxxxx 72 m_current_symbol_rest_bytes = 0; 73 } 74 else if( (byte & 0xE0) == 0xC0) 75 { 76 // mask 110xxxxx 77 m_current_symbol_rest_bytes = 1; 78 byte &= 0x1F; 79 } 80 else if( (byte & 0xF0) == 0xE0) 81 { 82 // mask 1110xxxx 83 m_current_symbol_rest_bytes = 2; 84 byte &= 0xF; 85 } 86 else if( (byte & 0xF8) == 0xF0) 87 { 88 // mask 11110xxx 89 m_current_symbol_rest_bytes = 3; 90 byte &= 0x7; 91 } 92 else if( (byte & 0xFC) == 0xF8) 93 { 94 // mask 111110xx 95 m_current_symbol_rest_bytes = 4; 96 byte &= 0x3; 97 } 98 else if( (byte & 0xFE) == 0xFC) 99 { 100 // mask 1111110x 101 m_current_symbol_rest_bytes = 5; 102 byte &= 0x1; 103 } 104 else 105 { 106 m_state = state_t::invalid; 107 } 108 109 m_current_symbol = byte; 110 } 111 112 return m_state == state_t::valid || m_state == state_t::may_be_overlong; 113 } 114 115 /*! 116 * @return true if the current sequence finalized. 117 */ 118 RESTINIO_NODISCARD 119 bool finalized() const120 finalized() const noexcept 121 { 122 return m_current_symbol_rest_bytes == 0; 123 } 124 125 void reset()126 reset() noexcept 127 { 128 m_current_symbol = 0; 129 m_current_symbol_rest_bytes = 0; 130 } 131 132 RESTINIO_NODISCARD 133 std::uint32_t current_symbol() const134 current_symbol() const noexcept { return m_current_symbol; } 135 136 private: 137 138 void validate_current_symbol()139 validate_current_symbol() noexcept 140 { 141 if( (m_current_symbol >= 0xD800 && m_current_symbol <= 0xDFFF) || 142 (m_current_symbol >= 0x110000) ) 143 { 144 m_state = state_t::invalid; 145 } 146 } 147 148 void check_overlong(std::uint8_t byte)149 check_overlong( std::uint8_t byte ) noexcept 150 { 151 if( m_current_symbol_rest_bytes > 0 && 152 m_state == state_t::may_be_overlong ) 153 { 154 if( m_current_symbol_rest_bytes == 2 && 155 (byte & 0xE0) == 0x80 ) 156 m_state = state_t::overlong; 157 else if( m_current_symbol_rest_bytes == 3 && 158 (byte & 0xF0) == 0x80 ) 159 m_state = state_t::overlong; 160 else if( m_current_symbol_rest_bytes == 4 && 161 (byte & 0xF8) == 0x80 ) 162 m_state = state_t::overlong; 163 else if( m_current_symbol_rest_bytes == 5 && 164 (byte & 0xFC) == 0x80 ) 165 m_state = state_t::overlong; 166 else 167 m_state = state_t::valid; 168 } 169 else 170 { 171 if( byte == 0xC0 || byte == 0xC1 ) 172 { 173 m_state = state_t::overlong; 174 } 175 else if( byte == 0xE0 ) 176 { 177 m_state = state_t::may_be_overlong; 178 } 179 else if( byte == 0xF0 ) 180 { 181 m_state = state_t::may_be_overlong; 182 } 183 if( byte == 0xF8 ) 184 { 185 m_state = state_t::may_be_overlong; 186 } 187 if( byte == 0xFC ) 188 { 189 m_state = state_t::may_be_overlong; 190 } 191 } 192 } 193 194 std::uint32_t m_current_symbol = 0u; 195 196 std::size_t m_current_symbol_rest_bytes = 0u; 197 198 enum class state_t 199 { 200 valid, 201 invalid, 202 may_be_overlong, 203 overlong 204 }; 205 206 state_t m_state = state_t::valid; 207 }; 208 209 } /* namespace utils */ 210 211 } /* namespace restinio */ 212 213