1 #include "generic/stage2/logger.h"
2
3 namespace simdjson {
4 namespace SIMDJSON_IMPLEMENTATION {
5 namespace {
6 namespace stage2 {
7
8 class json_iterator {
9 public:
10 const uint8_t* const buf;
11 uint32_t *next_structural;
12 dom_parser_implementation &dom_parser;
13 uint32_t depth{0};
14
15 /**
16 * Walk the JSON document.
17 *
18 * The visitor receives callbacks when values are encountered. All callbacks pass the iterator as
19 * the first parameter; some callbacks have other parameters as well:
20 *
21 * - visit_document_start() - at the beginning.
22 * - visit_document_end() - at the end (if things were successful).
23 *
24 * - visit_array_start() - at the start `[` of a non-empty array.
25 * - visit_array_end() - at the end `]` of a non-empty array.
26 * - visit_empty_array() - when an empty array is encountered.
27 *
28 * - visit_object_end() - at the start `]` of a non-empty object.
29 * - visit_object_start() - at the end `]` of a non-empty object.
30 * - visit_empty_object() - when an empty object is encountered.
31 * - visit_key(const uint8_t *key) - when a key in an object field is encountered. key is
32 * guaranteed to point at the first quote of the string (`"key"`).
33 * - visit_primitive(const uint8_t *value) - when a value is a string, number, boolean or null.
34 * - visit_root_primitive(iter, uint8_t *value) - when the top-level value is a string, number, boolean or null.
35 *
36 * - increment_count(iter) - each time a value is found in an array or object.
37 */
38 template<bool STREAMING, typename V>
39 simdjson_warn_unused simdjson_really_inline error_code walk_document(V &visitor) noexcept;
40
41 /**
42 * Create an iterator capable of walking a JSON document.
43 *
44 * The document must have already passed through stage 1.
45 */
46 simdjson_really_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index);
47
48 /**
49 * Look at the next token.
50 *
51 * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
52 *
53 * They may include invalid JSON as well (such as `1.2.3` or `ture`).
54 */
55 simdjson_really_inline const uint8_t *peek() const noexcept;
56 /**
57 * Advance to the next token.
58 *
59 * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
60 *
61 * They may include invalid JSON as well (such as `1.2.3` or `ture`).
62 */
63 simdjson_really_inline const uint8_t *advance() noexcept;
64 /**
65 * Get the remaining length of the document, from the start of the current token.
66 */
67 simdjson_really_inline size_t remaining_len() const noexcept;
68 /**
69 * Check if we are at the end of the document.
70 *
71 * If this is true, there are no more tokens.
72 */
73 simdjson_really_inline bool at_eof() const noexcept;
74 /**
75 * Check if we are at the beginning of the document.
76 */
77 simdjson_really_inline bool at_beginning() const noexcept;
78 simdjson_really_inline uint8_t last_structural() const noexcept;
79
80 /**
81 * Log that a value has been found.
82 *
83 * Set ENABLE_LOGGING=true in logger.h to see logging.
84 */
85 simdjson_really_inline void log_value(const char *type) const noexcept;
86 /**
87 * Log the start of a multipart value.
88 *
89 * Set ENABLE_LOGGING=true in logger.h to see logging.
90 */
91 simdjson_really_inline void log_start_value(const char *type) const noexcept;
92 /**
93 * Log the end of a multipart value.
94 *
95 * Set ENABLE_LOGGING=true in logger.h to see logging.
96 */
97 simdjson_really_inline void log_end_value(const char *type) const noexcept;
98 /**
99 * Log an error.
100 *
101 * Set ENABLE_LOGGING=true in logger.h to see logging.
102 */
103 simdjson_really_inline void log_error(const char *error) const noexcept;
104
105 template<typename V>
106 simdjson_warn_unused simdjson_really_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept;
107 template<typename V>
108 simdjson_warn_unused simdjson_really_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept;
109 };
110
111 template<bool STREAMING, typename V>
walk_document(V & visitor)112 simdjson_warn_unused simdjson_really_inline error_code json_iterator::walk_document(V &visitor) noexcept {
113 logger::log_start();
114
115 //
116 // Start the document
117 //
118 if (at_eof()) { return EMPTY; }
119 log_start_value("document");
120 SIMDJSON_TRY( visitor.visit_document_start(*this) );
121
122 //
123 // Read first value
124 //
125 {
126 auto value = advance();
127
128 // Make sure the outer hash or array is closed before continuing; otherwise, there are ways we
129 // could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906
130 if (!STREAMING) {
131 switch (*value) {
132 case '{': if (last_structural() != '}') { return TAPE_ERROR; }; break;
133 case '[': if (last_structural() != ']') { return TAPE_ERROR; }; break;
134 }
135 }
136
137 switch (*value) {
138 case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
139 case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
140 default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break;
141 }
142 }
143 goto document_end;
144
145 //
146 // Object parser states
147 //
148 object_begin:
149 log_start_value("object");
150 depth++;
151 if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
152 dom_parser.is_array[depth] = false;
153 SIMDJSON_TRY( visitor.visit_object_start(*this) );
154
155 {
156 auto key = advance();
157 if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; }
158 SIMDJSON_TRY( visitor.increment_count(*this) );
159 SIMDJSON_TRY( visitor.visit_key(*this, key) );
160 }
161
162 object_field:
163 if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; }
164 {
165 auto value = advance();
166 switch (*value) {
167 case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
168 case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
169 default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
170 }
171 }
172
173 object_continue:
174 switch (*advance()) {
175 case ',':
176 SIMDJSON_TRY( visitor.increment_count(*this) );
177 {
178 auto key = advance();
179 if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; }
180 SIMDJSON_TRY( visitor.visit_key(*this, key) );
181 }
182 goto object_field;
183 case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end;
184 default: log_error("No comma between object fields"); return TAPE_ERROR;
185 }
186
187 scope_end:
188 depth--;
189 if (depth == 0) { goto document_end; }
190 if (dom_parser.is_array[depth]) { goto array_continue; }
191 goto object_continue;
192
193 //
194 // Array parser states
195 //
196 array_begin:
197 log_start_value("array");
198 depth++;
199 if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
200 dom_parser.is_array[depth] = true;
201 SIMDJSON_TRY( visitor.visit_array_start(*this) );
202 SIMDJSON_TRY( visitor.increment_count(*this) );
203
204 array_value:
205 {
206 auto value = advance();
207 switch (*value) {
208 case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
209 case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
210 default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
211 }
212 }
213
214 array_continue:
215 switch (*advance()) {
216 case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value;
217 case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end;
218 default: log_error("Missing comma between array values"); return TAPE_ERROR;
219 }
220
221 document_end:
222 log_end_value("document");
223 SIMDJSON_TRY( visitor.visit_document_end(*this) );
224
225 dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]);
226
227 // If we didn't make it to the end, it's an error
228 if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) {
229 log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
230 return TAPE_ERROR;
231 }
232
233 return SUCCESS;
234
235 } // walk_document()
236
json_iterator(dom_parser_implementation & _dom_parser,size_t start_structural_index)237 simdjson_really_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
238 : buf{_dom_parser.buf},
239 next_structural{&_dom_parser.structural_indexes[start_structural_index]},
240 dom_parser{_dom_parser} {
241 }
242
peek()243 simdjson_really_inline const uint8_t *json_iterator::peek() const noexcept {
244 return &buf[*(next_structural)];
245 }
advance()246 simdjson_really_inline const uint8_t *json_iterator::advance() noexcept {
247 return &buf[*(next_structural++)];
248 }
remaining_len()249 simdjson_really_inline size_t json_iterator::remaining_len() const noexcept {
250 return dom_parser.len - *(next_structural-1);
251 }
252
at_eof()253 simdjson_really_inline bool json_iterator::at_eof() const noexcept {
254 return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
255 }
at_beginning()256 simdjson_really_inline bool json_iterator::at_beginning() const noexcept {
257 return next_structural == dom_parser.structural_indexes.get();
258 }
last_structural()259 simdjson_really_inline uint8_t json_iterator::last_structural() const noexcept {
260 return buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]];
261 }
262
log_value(const char * type)263 simdjson_really_inline void json_iterator::log_value(const char *type) const noexcept {
264 logger::log_line(*this, "", type, "");
265 }
266
log_start_value(const char * type)267 simdjson_really_inline void json_iterator::log_start_value(const char *type) const noexcept {
268 logger::log_line(*this, "+", type, "");
269 if (logger::LOG_ENABLED) { logger::log_depth++; }
270 }
271
log_end_value(const char * type)272 simdjson_really_inline void json_iterator::log_end_value(const char *type) const noexcept {
273 if (logger::LOG_ENABLED) { logger::log_depth--; }
274 logger::log_line(*this, "-", type, "");
275 }
276
log_error(const char * error)277 simdjson_really_inline void json_iterator::log_error(const char *error) const noexcept {
278 logger::log_line(*this, "", "ERROR", error);
279 }
280
281 template<typename V>
visit_root_primitive(V & visitor,const uint8_t * value)282 simdjson_warn_unused simdjson_really_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept {
283 switch (*value) {
284 case '"': return visitor.visit_root_string(*this, value);
285 case 't': return visitor.visit_root_true_atom(*this, value);
286 case 'f': return visitor.visit_root_false_atom(*this, value);
287 case 'n': return visitor.visit_root_null_atom(*this, value);
288 case '-':
289 case '0': case '1': case '2': case '3': case '4':
290 case '5': case '6': case '7': case '8': case '9':
291 return visitor.visit_root_number(*this, value);
292 default:
293 log_error("Document starts with a non-value character");
294 return TAPE_ERROR;
295 }
296 }
297 template<typename V>
visit_primitive(V & visitor,const uint8_t * value)298 simdjson_warn_unused simdjson_really_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
299 switch (*value) {
300 case '"': return visitor.visit_string(*this, value);
301 case 't': return visitor.visit_true_atom(*this, value);
302 case 'f': return visitor.visit_false_atom(*this, value);
303 case 'n': return visitor.visit_null_atom(*this, value);
304 case '-':
305 case '0': case '1': case '2': case '3': case '4':
306 case '5': case '6': case '7': case '8': case '9':
307 return visitor.visit_number(*this, value);
308 default:
309 log_error("Non-value found when value was expected!");
310 return TAPE_ERROR;
311 }
312 }
313
314 } // namespace stage2
315 } // unnamed namespace
316 } // namespace SIMDJSON_IMPLEMENTATION
317 } // namespace simdjson
318