1 namespace simdjson {
2 namespace SIMDJSON_IMPLEMENTATION {
3 namespace {
4 namespace utf8_validation {
5 
6 using namespace simd;
7 
check_special_cases(const simd8<uint8_t> input,const simd8<uint8_t> prev1)8   simdjson_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
9 // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
10 // Bit 1 = Too Long (ASCII followed by continuation)
11 // Bit 2 = Overlong 3-byte
12 // Bit 4 = Surrogate
13 // Bit 5 = Overlong 2-byte
14 // Bit 7 = Two Continuations
15     constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
16                                                 // 11______ 11______
17     constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
18     constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
19     constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
20     constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
21     constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
22     constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
23                                                 // 11110100 101_____
24                                                 // 11110101 1001____
25                                                 // 11110101 101_____
26                                                 // 1111011_ 1001____
27                                                 // 1111011_ 101_____
28                                                 // 11111___ 1001____
29                                                 // 11111___ 101_____
30     constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
31                                                 // 11110101 1000____
32                                                 // 1111011_ 1000____
33                                                 // 11111___ 1000____
34     constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
35 
36     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
37       // 0_______ ________ <ASCII in byte 1>
38       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
39       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
40       // 10______ ________ <continuation in byte 1>
41       TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
42       // 1100____ ________ <two byte lead in byte 1>
43       TOO_SHORT | OVERLONG_2,
44       // 1101____ ________ <two byte lead in byte 1>
45       TOO_SHORT,
46       // 1110____ ________ <three byte lead in byte 1>
47       TOO_SHORT | OVERLONG_3 | SURROGATE,
48       // 1111____ ________ <four+ byte lead in byte 1>
49       TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
50     );
51     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
52     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
53       // ____0000 ________
54       CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
55       // ____0001 ________
56       CARRY | OVERLONG_2,
57       // ____001_ ________
58       CARRY,
59       CARRY,
60 
61       // ____0100 ________
62       CARRY | TOO_LARGE,
63       // ____0101 ________
64       CARRY | TOO_LARGE | TOO_LARGE_1000,
65       // ____011_ ________
66       CARRY | TOO_LARGE | TOO_LARGE_1000,
67       CARRY | TOO_LARGE | TOO_LARGE_1000,
68 
69       // ____1___ ________
70       CARRY | TOO_LARGE | TOO_LARGE_1000,
71       CARRY | TOO_LARGE | TOO_LARGE_1000,
72       CARRY | TOO_LARGE | TOO_LARGE_1000,
73       CARRY | TOO_LARGE | TOO_LARGE_1000,
74       CARRY | TOO_LARGE | TOO_LARGE_1000,
75       // ____1101 ________
76       CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
77       CARRY | TOO_LARGE | TOO_LARGE_1000,
78       CARRY | TOO_LARGE | TOO_LARGE_1000
79     );
80     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
81       // ________ 0_______ <ASCII in byte 2>
82       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
83       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
84 
85       // ________ 1000____
86       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
87       // ________ 1001____
88       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
89       // ________ 101_____
90       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
91       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
92 
93       // ________ 11______
94       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
95     );
96     return (byte_1_high & byte_1_low & byte_2_high);
97   }
check_multibyte_lengths(const simd8<uint8_t> input,const simd8<uint8_t> prev_input,const simd8<uint8_t> sc)98   simdjson_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
99       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
100     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
101     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
102     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
103     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
104     return must23_80 ^ sc;
105   }
106 
107   //
108   // Return nonzero if there are incomplete multibyte characters at the end of the block:
109   // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
110   //
is_incomplete(const simd8<uint8_t> input)111   simdjson_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
112     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
113     // ... 1111____ 111_____ 11______
114     static const uint8_t max_array[32] = {
115       255, 255, 255, 255, 255, 255, 255, 255,
116       255, 255, 255, 255, 255, 255, 255, 255,
117       255, 255, 255, 255, 255, 255, 255, 255,
118       255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
119     };
120     const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
121     return input.gt_bits(max_value);
122   }
123 
124   struct utf8_checker {
125     // If this is nonzero, there has been a UTF-8 error.
126     simd8<uint8_t> error;
127     // The last input we received
128     simd8<uint8_t> prev_input_block;
129     // Whether the last input we received was incomplete (used for ASCII fast path)
130     simd8<uint8_t> prev_incomplete;
131 
132     //
133     // Check whether the current bytes are valid UTF-8.
134     //
check_utf8_bytesutf8_checker135     simdjson_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
136       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
137       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
138       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
139       simd8<uint8_t> sc = check_special_cases(input, prev1);
140       this->error |= check_multibyte_lengths(input, prev_input, sc);
141     }
142 
143     // The only problem that can happen at EOF is that a multibyte character is too short
144     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
145     // too large in the first of two bytes.
check_eofutf8_checker146     simdjson_really_inline void check_eof() {
147       // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
148       // possibly finish them.
149       this->error |= this->prev_incomplete;
150     }
151 
check_next_inpututf8_checker152     simdjson_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
153       if(simdjson_likely(is_ascii(input))) {
154         this->error |= this->prev_incomplete;
155       } else {
156         // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
157         static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
158             "We support either two or four chunks per 64-byte block.");
159         if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
160           this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
161           this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
162         } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
163           this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
164           this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
165           this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
166           this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
167         }
168         this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
169         this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
170 
171       }
172     }
173     // do not forget to call check_eof!
errorsutf8_checker174     simdjson_really_inline error_code errors() {
175       return this->error.any_bits_set_anywhere() ? error_code::UTF8_ERROR : error_code::SUCCESS;
176     }
177 
178   }; // struct utf8_checker
179 } // namespace utf8_validation
180 
181 using utf8_validation::utf8_checker;
182 
183 } // unnamed namespace
184 } // namespace SIMDJSON_IMPLEMENTATION
185 } // namespace simdjson
186