1 namespace simdjson { 2 namespace SIMDJSON_IMPLEMENTATION { 3 namespace { 4 namespace utf8_validation { 5 6 using namespace simd; 7 check_special_cases(const simd8<uint8_t> input,const simd8<uint8_t> prev1)8 simdjson_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) { 9 // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) 10 // Bit 1 = Too Long (ASCII followed by continuation) 11 // Bit 2 = Overlong 3-byte 12 // Bit 4 = Surrogate 13 // Bit 5 = Overlong 2-byte 14 // Bit 7 = Two Continuations 15 constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ 16 // 11______ 11______ 17 constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ 18 constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ 19 constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ 20 constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ 21 constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ 22 constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ 23 // 11110100 101_____ 24 // 11110101 1001____ 25 // 11110101 101_____ 26 // 1111011_ 1001____ 27 // 1111011_ 101_____ 28 // 11111___ 1001____ 29 // 11111___ 101_____ 30 constexpr const uint8_t TOO_LARGE_1000 = 1<<6; 31 // 11110101 1000____ 32 // 1111011_ 1000____ 33 // 11111___ 1000____ 34 constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ 35 36 const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>( 37 // 0_______ ________ <ASCII in byte 1> 38 TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, 39 TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, 40 // 10______ ________ <continuation in byte 1> 41 TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, 42 // 1100____ ________ <two byte lead in byte 1> 43 TOO_SHORT | OVERLONG_2, 44 // 1101____ ________ <two byte lead in byte 1> 45 TOO_SHORT, 46 // 1110____ ________ <three byte lead in byte 1> 47 TOO_SHORT | OVERLONG_3 | SURROGATE, 48 // 1111____ ________ <four+ byte lead in byte 1> 49 TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 50 ); 51 constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 52 const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>( 53 // ____0000 ________ 54 CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, 55 // ____0001 ________ 56 CARRY | OVERLONG_2, 57 // ____001_ ________ 58 CARRY, 59 CARRY, 60 61 // ____0100 ________ 62 CARRY | TOO_LARGE, 63 // ____0101 ________ 64 CARRY | TOO_LARGE | TOO_LARGE_1000, 65 // ____011_ ________ 66 CARRY | TOO_LARGE | TOO_LARGE_1000, 67 CARRY | TOO_LARGE | TOO_LARGE_1000, 68 69 // ____1___ ________ 70 CARRY | TOO_LARGE | TOO_LARGE_1000, 71 CARRY | TOO_LARGE | TOO_LARGE_1000, 72 CARRY | TOO_LARGE | TOO_LARGE_1000, 73 CARRY | TOO_LARGE | TOO_LARGE_1000, 74 CARRY | TOO_LARGE | TOO_LARGE_1000, 75 // ____1101 ________ 76 CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, 77 CARRY | TOO_LARGE | TOO_LARGE_1000, 78 CARRY | TOO_LARGE | TOO_LARGE_1000 79 ); 80 const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>( 81 // ________ 0_______ <ASCII in byte 2> 82 TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, 83 TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, 84 85 // ________ 1000____ 86 TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, 87 // ________ 1001____ 88 TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, 89 // ________ 101_____ 90 TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, 91 TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, 92 93 // ________ 11______ 94 TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT 95 ); 96 return (byte_1_high & byte_1_low & byte_2_high); 97 } check_multibyte_lengths(const simd8<uint8_t> input,const simd8<uint8_t> prev_input,const simd8<uint8_t> sc)98 simdjson_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input, 99 const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) { 100 simd8<uint8_t> prev2 = input.prev<2>(prev_input); 101 simd8<uint8_t> prev3 = input.prev<3>(prev_input); 102 simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3)); 103 simd8<uint8_t> must23_80 = must23 & uint8_t(0x80); 104 return must23_80 ^ sc; 105 } 106 107 // 108 // Return nonzero if there are incomplete multibyte characters at the end of the block: 109 // e.g. if there is a 4-byte character, but it's 3 bytes from the end. 110 // is_incomplete(const simd8<uint8_t> input)111 simdjson_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) { 112 // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): 113 // ... 1111____ 111_____ 11______ 114 static const uint8_t max_array[32] = { 115 255, 255, 255, 255, 255, 255, 255, 255, 116 255, 255, 255, 255, 255, 255, 255, 255, 117 255, 255, 255, 255, 255, 255, 255, 255, 118 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 119 }; 120 const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]); 121 return input.gt_bits(max_value); 122 } 123 124 struct utf8_checker { 125 // If this is nonzero, there has been a UTF-8 error. 126 simd8<uint8_t> error; 127 // The last input we received 128 simd8<uint8_t> prev_input_block; 129 // Whether the last input we received was incomplete (used for ASCII fast path) 130 simd8<uint8_t> prev_incomplete; 131 132 // 133 // Check whether the current bytes are valid UTF-8. 134 // check_utf8_bytesutf8_checker135 simdjson_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) { 136 // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes 137 // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) 138 simd8<uint8_t> prev1 = input.prev<1>(prev_input); 139 simd8<uint8_t> sc = check_special_cases(input, prev1); 140 this->error |= check_multibyte_lengths(input, prev_input, sc); 141 } 142 143 // The only problem that can happen at EOF is that a multibyte character is too short 144 // or a byte value too large in the last bytes: check_special_cases only checks for bytes 145 // too large in the first of two bytes. check_eofutf8_checker146 simdjson_really_inline void check_eof() { 147 // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't 148 // possibly finish them. 149 this->error |= this->prev_incomplete; 150 } 151 check_next_inpututf8_checker152 simdjson_really_inline void check_next_input(const simd8x64<uint8_t>& input) { 153 if(simdjson_likely(is_ascii(input))) { 154 this->error |= this->prev_incomplete; 155 } else { 156 // you might think that a for-loop would work, but under Visual Studio, it is not good enough. 157 static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4), 158 "We support either two or four chunks per 64-byte block."); 159 if(simd8x64<uint8_t>::NUM_CHUNKS == 2) { 160 this->check_utf8_bytes(input.chunks[0], this->prev_input_block); 161 this->check_utf8_bytes(input.chunks[1], input.chunks[0]); 162 } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) { 163 this->check_utf8_bytes(input.chunks[0], this->prev_input_block); 164 this->check_utf8_bytes(input.chunks[1], input.chunks[0]); 165 this->check_utf8_bytes(input.chunks[2], input.chunks[1]); 166 this->check_utf8_bytes(input.chunks[3], input.chunks[2]); 167 } 168 this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]); 169 this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]; 170 171 } 172 } 173 // do not forget to call check_eof! errorsutf8_checker174 simdjson_really_inline error_code errors() { 175 return this->error.any_bits_set_anywhere() ? error_code::UTF8_ERROR : error_code::SUCCESS; 176 } 177 178 }; // struct utf8_checker 179 } // namespace utf8_validation 180 181 using utf8_validation::utf8_checker; 182 183 } // unnamed namespace 184 } // namespace SIMDJSON_IMPLEMENTATION 185 } // namespace simdjson 186