1 #include "simdjson/fallback/begin.h"
2
3 //
4 // Stage 1
5 //
6 #include "generic/stage1/find_next_document_index.h"
7
8 namespace simdjson {
9 namespace SIMDJSON_IMPLEMENTATION {
10 namespace {
11 namespace stage1 {
12
13 class structural_scanner {
14 public:
15
structural_scanner(dom_parser_implementation & _parser,bool _partial)16 simdjson_really_inline structural_scanner(dom_parser_implementation &_parser, bool _partial)
17 : buf{_parser.buf},
18 next_structural_index{_parser.structural_indexes.get()},
19 parser{_parser},
20 len{static_cast<uint32_t>(_parser.len)},
21 partial{_partial} {
22 }
23
add_structural()24 simdjson_really_inline void add_structural() {
25 *next_structural_index = idx;
26 next_structural_index++;
27 }
28
is_continuation(uint8_t c)29 simdjson_really_inline bool is_continuation(uint8_t c) {
30 return (c & 0b11000000) == 0b10000000;
31 }
32
validate_utf8_character()33 simdjson_really_inline void validate_utf8_character() {
34 // Continuation
35 if (simdjson_unlikely((buf[idx] & 0b01000000) == 0)) {
36 // extra continuation
37 error = UTF8_ERROR;
38 idx++;
39 return;
40 }
41
42 // 2-byte
43 if ((buf[idx] & 0b00100000) == 0) {
44 // missing continuation
45 if (simdjson_unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) {
46 if (idx+1 > len && partial) { idx = len; return; }
47 error = UTF8_ERROR;
48 idx++;
49 return;
50 }
51 // overlong: 1100000_ 10______
52 if (buf[idx] <= 0b11000001) { error = UTF8_ERROR; }
53 idx += 2;
54 return;
55 }
56
57 // 3-byte
58 if ((buf[idx] & 0b00010000) == 0) {
59 // missing continuation
60 if (simdjson_unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) {
61 if (idx+2 > len && partial) { idx = len; return; }
62 error = UTF8_ERROR;
63 idx++;
64 return;
65 }
66 // overlong: 11100000 100_____ ________
67 if (buf[idx] == 0b11100000 && buf[idx+1] <= 0b10011111) { error = UTF8_ERROR; }
68 // surrogates: U+D800-U+DFFF 11101101 101_____
69 if (buf[idx] == 0b11101101 && buf[idx+1] >= 0b10100000) { error = UTF8_ERROR; }
70 idx += 3;
71 return;
72 }
73
74 // 4-byte
75 // missing continuation
76 if (simdjson_unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) {
77 if (idx+2 > len && partial) { idx = len; return; }
78 error = UTF8_ERROR;
79 idx++;
80 return;
81 }
82 // overlong: 11110000 1000____ ________ ________
83 if (buf[idx] == 0b11110000 && buf[idx+1] <= 0b10001111) { error = UTF8_ERROR; }
84 // too large: > U+10FFFF:
85 // 11110100 (1001|101_)____
86 // 1111(1___|011_|0101) 10______
87 // also includes 5, 6, 7 and 8 byte characters:
88 // 11111___
89 if (buf[idx] == 0b11110100 && buf[idx+1] >= 0b10010000) { error = UTF8_ERROR; }
90 if (buf[idx] >= 0b11110101) { error = UTF8_ERROR; }
91 idx += 4;
92 }
93
94 // Returns true if the string is unclosed.
validate_string()95 simdjson_really_inline bool validate_string() {
96 idx++; // skip first quote
97 while (idx < len && buf[idx] != '"') {
98 if (buf[idx] == '\\') {
99 idx += 2;
100 } else if (simdjson_unlikely(buf[idx] & 0b10000000)) {
101 validate_utf8_character();
102 } else {
103 if (buf[idx] < 0x20) { error = UNESCAPED_CHARS; }
104 idx++;
105 }
106 }
107 if (idx >= len) { return true; }
108 return false;
109 }
110
is_whitespace_or_operator(uint8_t c)111 simdjson_really_inline bool is_whitespace_or_operator(uint8_t c) {
112 switch (c) {
113 case '{': case '}': case '[': case ']': case ',': case ':':
114 case ' ': case '\r': case '\n': case '\t':
115 return true;
116 default:
117 return false;
118 }
119 }
120
121 //
122 // Parse the entire input in STEP_SIZE-byte chunks.
123 //
scan()124 simdjson_really_inline error_code scan() {
125 bool unclosed_string = false;
126 for (;idx<len;idx++) {
127 switch (buf[idx]) {
128 // String
129 case '"':
130 add_structural();
131 unclosed_string |= validate_string();
132 break;
133 // Operator
134 case '{': case '}': case '[': case ']': case ',': case ':':
135 add_structural();
136 break;
137 // Whitespace
138 case ' ': case '\r': case '\n': case '\t':
139 break;
140 // Primitive or invalid character (invalid characters will be checked in stage 2)
141 default:
142 // Anything else, add the structural and go until we find the next one
143 add_structural();
144 while (idx+1<len && !is_whitespace_or_operator(buf[idx+1])) {
145 idx++;
146 };
147 break;
148 }
149 }
150 *next_structural_index = len;
151 // We pad beyond.
152 // https://github.com/simdjson/simdjson/issues/906
153 next_structural_index[1] = len;
154 next_structural_index[2] = 0;
155 parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get());
156 if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return EMPTY; }
157 parser.next_structural_index = 0;
158 if (partial) {
159 if(unclosed_string) {
160 parser.n_structural_indexes--;
161 if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return CAPACITY; }
162 }
163 auto new_structural_indexes = find_next_document_index(parser);
164 if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
165 return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
166 }
167 parser.n_structural_indexes = new_structural_indexes;
168 } else if(unclosed_string) { error = UNCLOSED_STRING; }
169 return error;
170 }
171
172 private:
173 const uint8_t *buf;
174 uint32_t *next_structural_index;
175 dom_parser_implementation &parser;
176 uint32_t len;
177 uint32_t idx{0};
178 error_code error{SUCCESS};
179 bool partial;
180 }; // structural_scanner
181
182 } // namespace stage1
183 } // unnamed namespace
184
stage1(const uint8_t * _buf,size_t _len,bool partial)185 simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool partial) noexcept {
186 this->buf = _buf;
187 this->len = _len;
188 stage1::structural_scanner scanner(*this, partial);
189 return scanner.scan();
190 }
191
192 // big table for the minifier
193 static uint8_t jump_table[256 * 3] = {
194 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
195 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
196 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
197 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
198 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
199 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
200 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
201 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
202 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
203 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
204 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
205 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
206 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
207 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
208 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
209 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
210 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
211 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
212 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
213 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
214 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
215 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
216 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
217 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
218 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
219 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
220 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
221 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
222 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
223 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
224 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
225 };
226
minify(const uint8_t * buf,size_t len,uint8_t * dst,size_t & dst_len) const227 simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
228 size_t i = 0, pos = 0;
229 uint8_t quote = 0;
230 uint8_t nonescape = 1;
231
232 while (i < len) {
233 unsigned char c = buf[i];
234 uint8_t *meta = jump_table + 3 * c;
235
236 quote = quote ^ (meta[0] & nonescape);
237 dst[pos] = c;
238 pos += meta[2] | quote;
239
240 i += 1;
241 nonescape = uint8_t(~nonescape) | (meta[1]);
242 }
243 dst_len = pos; // we intentionally do not work with a reference
244 // for fear of aliasing
245 return quote ? UNCLOSED_STRING : SUCCESS;
246 }
247
248 // credit: based on code from Google Fuchsia (Apache Licensed)
validate_utf8(const char * buf,size_t len) const249 simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
250 const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
251 uint64_t pos = 0;
252 uint32_t code_point = 0;
253 while (pos < len) {
254 // check of the next 8 bytes are ascii.
255 uint64_t next_pos = pos + 16;
256 if (next_pos <= len) { // if it is safe to read 8 more bytes, check that they are ascii
257 uint64_t v1;
258 memcpy(&v1, data + pos, sizeof(uint64_t));
259 uint64_t v2;
260 memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
261 uint64_t v{v1 | v2};
262 if ((v & 0x8080808080808080) == 0) {
263 pos = next_pos;
264 continue;
265 }
266 }
267 unsigned char byte = data[pos];
268 if (byte < 0b10000000) {
269 pos++;
270 continue;
271 } else if ((byte & 0b11100000) == 0b11000000) {
272 next_pos = pos + 2;
273 if (next_pos > len) { return false; }
274 if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
275 // range check
276 code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
277 if (code_point < 0x80 || 0x7ff < code_point) { return false; }
278 } else if ((byte & 0b11110000) == 0b11100000) {
279 next_pos = pos + 3;
280 if (next_pos > len) { return false; }
281 if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
282 if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
283 // range check
284 code_point = (byte & 0b00001111) << 12 |
285 (data[pos + 1] & 0b00111111) << 6 |
286 (data[pos + 2] & 0b00111111);
287 if (code_point < 0x800 || 0xffff < code_point ||
288 (0xd7ff < code_point && code_point < 0xe000)) {
289 return false;
290 }
291 } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
292 next_pos = pos + 4;
293 if (next_pos > len) { return false; }
294 if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
295 if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
296 if ((data[pos + 3] & 0b11000000) != 0b10000000) { return false; }
297 // range check
298 code_point =
299 (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
300 (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
301 if (code_point <= 0xffff || 0x10ffff < code_point) { return false; }
302 } else {
303 // we may have a continuation
304 return false;
305 }
306 pos = next_pos;
307 }
308 return true;
309 }
310
311 } // namespace SIMDJSON_IMPLEMENTATION
312 } // namespace simdjson
313
314 //
315 // Stage 2
316 //
317 #include "generic/stage2/tape_builder.h"
318
319 namespace simdjson {
320 namespace SIMDJSON_IMPLEMENTATION {
321
stage2(dom::document & _doc)322 simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
323 return stage2::tape_builder::parse_document<false>(*this, _doc);
324 }
325
stage2_next(dom::document & _doc)326 simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
327 return stage2::tape_builder::parse_document<true>(*this, _doc);
328 }
329
parse(const uint8_t * _buf,size_t _len,dom::document & _doc)330 simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
331 auto error = stage1(_buf, _len, false);
332 if (error) { return error; }
333 return stage2(_doc);
334 }
335
336 } // namespace SIMDJSON_IMPLEMENTATION
337 } // namespace simdjson
338
339 #include "simdjson/fallback/end.h"
340