1 #include "simdjson/fallback/begin.h"
2 
3 //
4 // Stage 1
5 //
6 #include "generic/stage1/find_next_document_index.h"
7 
8 namespace simdjson {
9 namespace SIMDJSON_IMPLEMENTATION {
10 namespace {
11 namespace stage1 {
12 
13 class structural_scanner {
14 public:
15 
structural_scanner(dom_parser_implementation & _parser,bool _partial)16 simdjson_really_inline structural_scanner(dom_parser_implementation &_parser, bool _partial)
17   : buf{_parser.buf},
18     next_structural_index{_parser.structural_indexes.get()},
19     parser{_parser},
20     len{static_cast<uint32_t>(_parser.len)},
21     partial{_partial} {
22 }
23 
add_structural()24 simdjson_really_inline void add_structural() {
25   *next_structural_index = idx;
26   next_structural_index++;
27 }
28 
is_continuation(uint8_t c)29 simdjson_really_inline bool is_continuation(uint8_t c) {
30   return (c & 0b11000000) == 0b10000000;
31 }
32 
validate_utf8_character()33 simdjson_really_inline void validate_utf8_character() {
34   // Continuation
35   if (simdjson_unlikely((buf[idx] & 0b01000000) == 0)) {
36     // extra continuation
37     error = UTF8_ERROR;
38     idx++;
39     return;
40   }
41 
42   // 2-byte
43   if ((buf[idx] & 0b00100000) == 0) {
44     // missing continuation
45     if (simdjson_unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) {
46       if (idx+1 > len && partial) { idx = len; return; }
47       error = UTF8_ERROR;
48       idx++;
49       return;
50     }
51     // overlong: 1100000_ 10______
52     if (buf[idx] <= 0b11000001) { error = UTF8_ERROR; }
53     idx += 2;
54     return;
55   }
56 
57   // 3-byte
58   if ((buf[idx] & 0b00010000) == 0) {
59     // missing continuation
60     if (simdjson_unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) {
61       if (idx+2 > len && partial) { idx = len; return; }
62       error = UTF8_ERROR;
63       idx++;
64       return;
65     }
66     // overlong: 11100000 100_____ ________
67     if (buf[idx] == 0b11100000 && buf[idx+1] <= 0b10011111) { error = UTF8_ERROR; }
68     // surrogates: U+D800-U+DFFF 11101101 101_____
69     if (buf[idx] == 0b11101101 && buf[idx+1] >= 0b10100000) { error = UTF8_ERROR; }
70     idx += 3;
71     return;
72   }
73 
74   // 4-byte
75   // missing continuation
76   if (simdjson_unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) {
77     if (idx+2 > len && partial) { idx = len; return; }
78     error = UTF8_ERROR;
79     idx++;
80     return;
81   }
82   // overlong: 11110000 1000____ ________ ________
83   if (buf[idx] == 0b11110000 && buf[idx+1] <= 0b10001111) { error = UTF8_ERROR; }
84   // too large: > U+10FFFF:
85   // 11110100 (1001|101_)____
86   // 1111(1___|011_|0101) 10______
87   // also includes 5, 6, 7 and 8 byte characters:
88   // 11111___
89   if (buf[idx] == 0b11110100 && buf[idx+1] >= 0b10010000) { error = UTF8_ERROR; }
90   if (buf[idx] >= 0b11110101) { error = UTF8_ERROR; }
91   idx += 4;
92 }
93 
94 // Returns true if the string is unclosed.
validate_string()95 simdjson_really_inline bool validate_string() {
96   idx++; // skip first quote
97   while (idx < len && buf[idx] != '"') {
98     if (buf[idx] == '\\') {
99       idx += 2;
100     } else if (simdjson_unlikely(buf[idx] & 0b10000000)) {
101       validate_utf8_character();
102     } else {
103       if (buf[idx] < 0x20) { error = UNESCAPED_CHARS; }
104       idx++;
105     }
106   }
107   if (idx >= len) { return true; }
108   return false;
109 }
110 
is_whitespace_or_operator(uint8_t c)111 simdjson_really_inline bool is_whitespace_or_operator(uint8_t c) {
112   switch (c) {
113     case '{': case '}': case '[': case ']': case ',': case ':':
114     case ' ': case '\r': case '\n': case '\t':
115       return true;
116     default:
117       return false;
118   }
119 }
120 
121 //
122 // Parse the entire input in STEP_SIZE-byte chunks.
123 //
scan()124 simdjson_really_inline error_code scan() {
125   bool unclosed_string = false;
126   for (;idx<len;idx++) {
127     switch (buf[idx]) {
128       // String
129       case '"':
130         add_structural();
131         unclosed_string |= validate_string();
132         break;
133       // Operator
134       case '{': case '}': case '[': case ']': case ',': case ':':
135         add_structural();
136         break;
137       // Whitespace
138       case ' ': case '\r': case '\n': case '\t':
139         break;
140       // Primitive or invalid character (invalid characters will be checked in stage 2)
141       default:
142         // Anything else, add the structural and go until we find the next one
143         add_structural();
144         while (idx+1<len && !is_whitespace_or_operator(buf[idx+1])) {
145           idx++;
146         };
147         break;
148     }
149   }
150   *next_structural_index = len;
151   // We pad beyond.
152   // https://github.com/simdjson/simdjson/issues/906
153   next_structural_index[1] = len;
154   next_structural_index[2] = 0;
155   parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get());
156   if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return EMPTY; }
157   parser.next_structural_index = 0;
158   if (partial) {
159     if(unclosed_string) {
160       parser.n_structural_indexes--;
161       if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return CAPACITY; }
162     }
163     auto new_structural_indexes = find_next_document_index(parser);
164     if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
165       return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
166     }
167     parser.n_structural_indexes = new_structural_indexes;
168   } else if(unclosed_string) { error = UNCLOSED_STRING; }
169   return error;
170 }
171 
172 private:
173   const uint8_t *buf;
174   uint32_t *next_structural_index;
175   dom_parser_implementation &parser;
176   uint32_t len;
177   uint32_t idx{0};
178   error_code error{SUCCESS};
179   bool partial;
180 }; // structural_scanner
181 
182 } // namespace stage1
183 } // unnamed namespace
184 
stage1(const uint8_t * _buf,size_t _len,bool partial)185 simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool partial) noexcept {
186   this->buf = _buf;
187   this->len = _len;
188   stage1::structural_scanner scanner(*this, partial);
189   return scanner.scan();
190 }
191 
192 // big table for the minifier
193 static uint8_t jump_table[256 * 3] = {
194     0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
195     1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
196     1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
197     0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
198     1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
199     1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
200     0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
201     1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
202     1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
203     0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
204     1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
205     1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
206     0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
207     1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
208     1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
209     0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
210     1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
211     1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
212     0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
213     1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
214     1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
215     0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
216     1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
217     1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
218     0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
219     1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
220     1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
221     0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
222     1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
223     1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
224     0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
225 };
226 
minify(const uint8_t * buf,size_t len,uint8_t * dst,size_t & dst_len) const227 simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
228   size_t i = 0, pos = 0;
229   uint8_t quote = 0;
230   uint8_t nonescape = 1;
231 
232   while (i < len) {
233     unsigned char c = buf[i];
234     uint8_t *meta = jump_table + 3 * c;
235 
236     quote = quote ^ (meta[0] & nonescape);
237     dst[pos] = c;
238     pos += meta[2] | quote;
239 
240     i += 1;
241     nonescape = uint8_t(~nonescape) | (meta[1]);
242   }
243   dst_len = pos; // we intentionally do not work with a reference
244   // for fear of aliasing
245   return quote ? UNCLOSED_STRING : SUCCESS;
246 }
247 
248 // credit: based on code from Google Fuchsia (Apache Licensed)
validate_utf8(const char * buf,size_t len) const249 simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
250   const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
251   uint64_t pos = 0;
252   uint32_t code_point = 0;
253   while (pos < len) {
254     // check of the next 8 bytes are ascii.
255     uint64_t next_pos = pos + 16;
256     if (next_pos <= len) { // if it is safe to read 8 more bytes, check that they are ascii
257       uint64_t v1;
258       memcpy(&v1, data + pos, sizeof(uint64_t));
259       uint64_t v2;
260       memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
261       uint64_t v{v1 | v2};
262       if ((v & 0x8080808080808080) == 0) {
263         pos = next_pos;
264         continue;
265       }
266     }
267     unsigned char byte = data[pos];
268     if (byte < 0b10000000) {
269       pos++;
270       continue;
271     } else if ((byte & 0b11100000) == 0b11000000) {
272       next_pos = pos + 2;
273       if (next_pos > len) { return false; }
274       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
275       // range check
276       code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
277       if (code_point < 0x80 || 0x7ff < code_point) { return false; }
278     } else if ((byte & 0b11110000) == 0b11100000) {
279       next_pos = pos + 3;
280       if (next_pos > len) { return false; }
281       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
282       if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
283       // range check
284       code_point = (byte & 0b00001111) << 12 |
285                    (data[pos + 1] & 0b00111111) << 6 |
286                    (data[pos + 2] & 0b00111111);
287       if (code_point < 0x800 || 0xffff < code_point ||
288           (0xd7ff < code_point && code_point < 0xe000)) {
289         return false;
290       }
291     } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
292       next_pos = pos + 4;
293       if (next_pos > len) { return false; }
294       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
295       if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
296       if ((data[pos + 3] & 0b11000000) != 0b10000000) { return false; }
297       // range check
298       code_point =
299           (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
300           (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
301       if (code_point <= 0xffff || 0x10ffff < code_point) { return false; }
302     } else {
303       // we may have a continuation
304       return false;
305     }
306     pos = next_pos;
307   }
308   return true;
309 }
310 
311 } // namespace SIMDJSON_IMPLEMENTATION
312 } // namespace simdjson
313 
314 //
315 // Stage 2
316 //
317 #include "generic/stage2/tape_builder.h"
318 
319 namespace simdjson {
320 namespace SIMDJSON_IMPLEMENTATION {
321 
stage2(dom::document & _doc)322 simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
323   return stage2::tape_builder::parse_document<false>(*this, _doc);
324 }
325 
stage2_next(dom::document & _doc)326 simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
327   return stage2::tape_builder::parse_document<true>(*this, _doc);
328 }
329 
parse(const uint8_t * _buf,size_t _len,dom::document & _doc)330 simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
331   auto error = stage1(_buf, _len, false);
332   if (error) { return error; }
333   return stage2(_doc);
334 }
335 
336 } // namespace SIMDJSON_IMPLEMENTATION
337 } // namespace simdjson
338 
339 #include "simdjson/fallback/end.h"
340