1 #include "generic/stage2/logger.h"
2 
3 namespace simdjson {
4 namespace SIMDJSON_IMPLEMENTATION {
5 namespace {
6 namespace stage2 {
7 
8 class json_iterator {
9 public:
10   const uint8_t* const buf;
11   uint32_t *next_structural;
12   dom_parser_implementation &dom_parser;
13   uint32_t depth{0};
14 
15   /**
16    * Walk the JSON document.
17    *
18    * The visitor receives callbacks when values are encountered. All callbacks pass the iterator as
19    * the first parameter; some callbacks have other parameters as well:
20    *
21    * - visit_document_start() - at the beginning.
22    * - visit_document_end() - at the end (if things were successful).
23    *
24    * - visit_array_start() - at the start `[` of a non-empty array.
25    * - visit_array_end() - at the end `]` of a non-empty array.
26    * - visit_empty_array() - when an empty array is encountered.
27    *
28    * - visit_object_end() - at the start `]` of a non-empty object.
29    * - visit_object_start() - at the end `]` of a non-empty object.
30    * - visit_empty_object() - when an empty object is encountered.
31    * - visit_key(const uint8_t *key) - when a key in an object field is encountered. key is
32    *                                   guaranteed to point at the first quote of the string (`"key"`).
33    * - visit_primitive(const uint8_t *value) - when a value is a string, number, boolean or null.
34    * - visit_root_primitive(iter, uint8_t *value) - when the top-level value is a string, number, boolean or null.
35    *
36    * - increment_count(iter) - each time a value is found in an array or object.
37    */
38   template<bool STREAMING, typename V>
39   simdjson_warn_unused simdjson_really_inline error_code walk_document(V &visitor) noexcept;
40 
41   /**
42    * Create an iterator capable of walking a JSON document.
43    *
44    * The document must have already passed through stage 1.
45    */
46   simdjson_really_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index);
47 
48   /**
49    * Look at the next token.
50    *
51    * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
52    *
53    * They may include invalid JSON as well (such as `1.2.3` or `ture`).
54    */
55   simdjson_really_inline const uint8_t *peek() const noexcept;
56   /**
57    * Advance to the next token.
58    *
59    * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
60    *
61    * They may include invalid JSON as well (such as `1.2.3` or `ture`).
62    */
63   simdjson_really_inline const uint8_t *advance() noexcept;
64   /**
65    * Get the remaining length of the document, from the start of the current token.
66    */
67   simdjson_really_inline size_t remaining_len() const noexcept;
68   /**
69    * Check if we are at the end of the document.
70    *
71    * If this is true, there are no more tokens.
72    */
73   simdjson_really_inline bool at_eof() const noexcept;
74   /**
75    * Check if we are at the beginning of the document.
76    */
77   simdjson_really_inline bool at_beginning() const noexcept;
78   simdjson_really_inline uint8_t last_structural() const noexcept;
79 
80   /**
81    * Log that a value has been found.
82    *
83    * Set ENABLE_LOGGING=true in logger.h to see logging.
84    */
85   simdjson_really_inline void log_value(const char *type) const noexcept;
86   /**
87    * Log the start of a multipart value.
88    *
89    * Set ENABLE_LOGGING=true in logger.h to see logging.
90    */
91   simdjson_really_inline void log_start_value(const char *type) const noexcept;
92   /**
93    * Log the end of a multipart value.
94    *
95    * Set ENABLE_LOGGING=true in logger.h to see logging.
96    */
97   simdjson_really_inline void log_end_value(const char *type) const noexcept;
98   /**
99    * Log an error.
100    *
101    * Set ENABLE_LOGGING=true in logger.h to see logging.
102    */
103   simdjson_really_inline void log_error(const char *error) const noexcept;
104 
105   template<typename V>
106   simdjson_warn_unused simdjson_really_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept;
107   template<typename V>
108   simdjson_warn_unused simdjson_really_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept;
109 };
110 
111 template<bool STREAMING, typename V>
walk_document(V & visitor)112 simdjson_warn_unused simdjson_really_inline error_code json_iterator::walk_document(V &visitor) noexcept {
113   logger::log_start();
114 
115   //
116   // Start the document
117   //
118   if (at_eof()) { return EMPTY; }
119   log_start_value("document");
120   SIMDJSON_TRY( visitor.visit_document_start(*this) );
121 
122   //
123   // Read first value
124   //
125   {
126     auto value = advance();
127 
128     // Make sure the outer hash or array is closed before continuing; otherwise, there are ways we
129     // could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906
130     if (!STREAMING) {
131       switch (*value) {
132         case '{': if (last_structural() != '}') { return TAPE_ERROR; }; break;
133         case '[': if (last_structural() != ']') { return TAPE_ERROR; }; break;
134       }
135     }
136 
137     switch (*value) {
138       case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
139       case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
140       default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break;
141     }
142   }
143   goto document_end;
144 
145 //
146 // Object parser states
147 //
148 object_begin:
149   log_start_value("object");
150   depth++;
151   if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
152   dom_parser.is_array[depth] = false;
153   SIMDJSON_TRY( visitor.visit_object_start(*this) );
154 
155   {
156     auto key = advance();
157     if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; }
158     SIMDJSON_TRY( visitor.increment_count(*this) );
159     SIMDJSON_TRY( visitor.visit_key(*this, key) );
160   }
161 
162 object_field:
163   if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; }
164   {
165     auto value = advance();
166     switch (*value) {
167       case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
168       case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
169       default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
170     }
171   }
172 
173 object_continue:
174   switch (*advance()) {
175     case ',':
176       SIMDJSON_TRY( visitor.increment_count(*this) );
177       {
178         auto key = advance();
179         if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; }
180         SIMDJSON_TRY( visitor.visit_key(*this, key) );
181       }
182       goto object_field;
183     case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end;
184     default: log_error("No comma between object fields"); return TAPE_ERROR;
185   }
186 
187 scope_end:
188   depth--;
189   if (depth == 0) { goto document_end; }
190   if (dom_parser.is_array[depth]) { goto array_continue; }
191   goto object_continue;
192 
193 //
194 // Array parser states
195 //
196 array_begin:
197   log_start_value("array");
198   depth++;
199   if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
200   dom_parser.is_array[depth] = true;
201   SIMDJSON_TRY( visitor.visit_array_start(*this) );
202   SIMDJSON_TRY( visitor.increment_count(*this) );
203 
204 array_value:
205   {
206     auto value = advance();
207     switch (*value) {
208       case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
209       case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
210       default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
211     }
212   }
213 
214 array_continue:
215   switch (*advance()) {
216     case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value;
217     case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end;
218     default: log_error("Missing comma between array values"); return TAPE_ERROR;
219   }
220 
221 document_end:
222   log_end_value("document");
223   SIMDJSON_TRY( visitor.visit_document_end(*this) );
224 
225   dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]);
226 
227   // If we didn't make it to the end, it's an error
228   if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) {
229     log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
230     return TAPE_ERROR;
231   }
232 
233   return SUCCESS;
234 
235 } // walk_document()
236 
json_iterator(dom_parser_implementation & _dom_parser,size_t start_structural_index)237 simdjson_really_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
238   : buf{_dom_parser.buf},
239     next_structural{&_dom_parser.structural_indexes[start_structural_index]},
240     dom_parser{_dom_parser} {
241 }
242 
peek()243 simdjson_really_inline const uint8_t *json_iterator::peek() const noexcept {
244   return &buf[*(next_structural)];
245 }
advance()246 simdjson_really_inline const uint8_t *json_iterator::advance() noexcept {
247   return &buf[*(next_structural++)];
248 }
remaining_len()249 simdjson_really_inline size_t json_iterator::remaining_len() const noexcept {
250   return dom_parser.len - *(next_structural-1);
251 }
252 
at_eof()253 simdjson_really_inline bool json_iterator::at_eof() const noexcept {
254   return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
255 }
at_beginning()256 simdjson_really_inline bool json_iterator::at_beginning() const noexcept {
257   return next_structural == dom_parser.structural_indexes.get();
258 }
last_structural()259 simdjson_really_inline uint8_t json_iterator::last_structural() const noexcept {
260   return buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]];
261 }
262 
log_value(const char * type)263 simdjson_really_inline void json_iterator::log_value(const char *type) const noexcept {
264   logger::log_line(*this, "", type, "");
265 }
266 
log_start_value(const char * type)267 simdjson_really_inline void json_iterator::log_start_value(const char *type) const noexcept {
268   logger::log_line(*this, "+", type, "");
269   if (logger::LOG_ENABLED) { logger::log_depth++; }
270 }
271 
log_end_value(const char * type)272 simdjson_really_inline void json_iterator::log_end_value(const char *type) const noexcept {
273   if (logger::LOG_ENABLED) { logger::log_depth--; }
274   logger::log_line(*this, "-", type, "");
275 }
276 
log_error(const char * error)277 simdjson_really_inline void json_iterator::log_error(const char *error) const noexcept {
278   logger::log_line(*this, "", "ERROR", error);
279 }
280 
281 template<typename V>
visit_root_primitive(V & visitor,const uint8_t * value)282 simdjson_warn_unused simdjson_really_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept {
283   switch (*value) {
284     case '"': return visitor.visit_root_string(*this, value);
285     case 't': return visitor.visit_root_true_atom(*this, value);
286     case 'f': return visitor.visit_root_false_atom(*this, value);
287     case 'n': return visitor.visit_root_null_atom(*this, value);
288     case '-':
289     case '0': case '1': case '2': case '3': case '4':
290     case '5': case '6': case '7': case '8': case '9':
291       return visitor.visit_root_number(*this, value);
292     default:
293       log_error("Document starts with a non-value character");
294       return TAPE_ERROR;
295   }
296 }
297 template<typename V>
visit_primitive(V & visitor,const uint8_t * value)298 simdjson_warn_unused simdjson_really_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
299   switch (*value) {
300     case '"': return visitor.visit_string(*this, value);
301     case 't': return visitor.visit_true_atom(*this, value);
302     case 'f': return visitor.visit_false_atom(*this, value);
303     case 'n': return visitor.visit_null_atom(*this, value);
304     case '-':
305     case '0': case '1': case '2': case '3': case '4':
306     case '5': case '6': case '7': case '8': case '9':
307       return visitor.visit_number(*this, value);
308     default:
309       log_error("Non-value found when value was expected!");
310       return TAPE_ERROR;
311   }
312 }
313 
314 } // namespace stage2
315 } // unnamed namespace
316 } // namespace SIMDJSON_IMPLEMENTATION
317 } // namespace simdjson
318