1 // Copyright 2019 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "json.h"
6
7 #include <algorithm>
8 #include <cassert>
9 #include <cmath>
10 #include <cstring>
11 #include <limits>
12 #include <stack>
13
14 #include "cbor.h"
15 #include "json_platform.h"
16
17 namespace crdtp {
18 namespace json {
19 // =============================================================================
20 // json::NewJSONEncoder - for encoding streaming parser events as JSON
21 // =============================================================================
22
23 namespace {
// Appends |value| to |out| as exactly four lowercase hexadecimal digits,
// beginning with the most significant nibble.
template <typename C>
void PrintHex(uint16_t value, C* out) {
  static const char kHexDigits[] = "0123456789abcdef";
  for (int shift = 12; shift >= 0; shift -= 4)
    out->push_back(kHexDigits[(value >> shift) & 0xf]);
}
32
// The encoder below keeps a stack of State objects, one per container that
// is currently open. A State records only the container kind and how many
// elements were started in it, which is all that is needed to pick the
// delimiter that has to precede the next element.
enum class Container {
  // The initial, top-level state (not inside any map or array).
  NONE,
  // Within a JSON object.
  MAP,
  // Within a JSON array.
  ARRAY
};

class State {
 public:
  explicit State(Container container) : container_(container) {}

  // Registers the start of another element in this container. Every element
  // but the first is preceded by a delimiter, which is appended to |out|.
  void StartElement(std::vector<uint8_t>* out) { StartElementTmpl(out); }
  void StartElement(std::string* out) { StartElementTmpl(out); }

  Container container() const { return container_; }

 private:
  template <typename C>
  void StartElementTmpl(C* out) {
    // The top-level state may hold a single element only.
    assert(container_ != Container::NONE || size_ == 0);
    const int index = size_++;
    if (index == 0)
      return;
    // Arrays separate all elements with ','. Otherwise elements at odd
    // indices (the values of key/value pairs) follow a ':', and elements at
    // even indices (the keys) follow a ','.
    const bool follows_key =
        container_ != Container::ARRAY && (index & 1) != 0;
    out->push_back(follows_key ? ':' : ',');
  }

  Container container_ = Container::NONE;
  int size_ = 0;
};
66
67 constexpr char kBase64Table[] =
68 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
69 "abcdefghijklmnopqrstuvwxyz0123456789+/";
70
71 template <typename C>
Base64Encode(const span<uint8_t> & in,C * out)72 void Base64Encode(const span<uint8_t>& in, C* out) {
73 // The following three cases are based on the tables in the example
74 // section in https://en.wikipedia.org/wiki/Base64. We process three
75 // input bytes at a time, emitting 4 output bytes at a time.
76 size_t ii = 0;
77
78 // While possible, process three input bytes.
79 for (; ii + 3 <= in.size(); ii += 3) {
80 uint32_t twentyfour_bits = (in[ii] << 16) | (in[ii + 1] << 8) | in[ii + 2];
81 out->push_back(kBase64Table[(twentyfour_bits >> 18)]);
82 out->push_back(kBase64Table[(twentyfour_bits >> 12) & 0x3f]);
83 out->push_back(kBase64Table[(twentyfour_bits >> 6) & 0x3f]);
84 out->push_back(kBase64Table[twentyfour_bits & 0x3f]);
85 }
86 if (ii + 2 <= in.size()) { // Process two input bytes.
87 uint32_t twentyfour_bits = (in[ii] << 16) | (in[ii + 1] << 8);
88 out->push_back(kBase64Table[(twentyfour_bits >> 18)]);
89 out->push_back(kBase64Table[(twentyfour_bits >> 12) & 0x3f]);
90 out->push_back(kBase64Table[(twentyfour_bits >> 6) & 0x3f]);
91 out->push_back('='); // Emit padding.
92 return;
93 }
94 if (ii + 1 <= in.size()) { // Process a single input byte.
95 uint32_t twentyfour_bits = (in[ii] << 16);
96 out->push_back(kBase64Table[(twentyfour_bits >> 18)]);
97 out->push_back(kBase64Table[(twentyfour_bits >> 12) & 0x3f]);
98 out->push_back('='); // Emit padding.
99 out->push_back('='); // Emit padding.
100 }
101 }
102
103 // Implements a handler for JSON parser events to emit a JSON string.
104 template <typename C>
105 class JSONEncoder : public ParserHandler {
106 public:
JSONEncoder(C * out,Status * status)107 JSONEncoder(C* out, Status* status) : out_(out), status_(status) {
108 *status_ = Status();
109 state_.emplace(Container::NONE);
110 }
111
HandleMapBegin()112 void HandleMapBegin() override {
113 if (!status_->ok())
114 return;
115 assert(!state_.empty());
116 state_.top().StartElement(out_);
117 state_.emplace(Container::MAP);
118 Emit('{');
119 }
120
HandleMapEnd()121 void HandleMapEnd() override {
122 if (!status_->ok())
123 return;
124 assert(state_.size() >= 2 && state_.top().container() == Container::MAP);
125 state_.pop();
126 Emit('}');
127 }
128
HandleArrayBegin()129 void HandleArrayBegin() override {
130 if (!status_->ok())
131 return;
132 state_.top().StartElement(out_);
133 state_.emplace(Container::ARRAY);
134 Emit('[');
135 }
136
HandleArrayEnd()137 void HandleArrayEnd() override {
138 if (!status_->ok())
139 return;
140 assert(state_.size() >= 2 && state_.top().container() == Container::ARRAY);
141 state_.pop();
142 Emit(']');
143 }
144
HandleString16(span<uint16_t> chars)145 void HandleString16(span<uint16_t> chars) override {
146 if (!status_->ok())
147 return;
148 state_.top().StartElement(out_);
149 Emit('"');
150 for (const uint16_t ch : chars) {
151 if (ch == '"') {
152 Emit("\\\"");
153 } else if (ch == '\\') {
154 Emit("\\\\");
155 } else if (ch == '\b') {
156 Emit("\\b");
157 } else if (ch == '\f') {
158 Emit("\\f");
159 } else if (ch == '\n') {
160 Emit("\\n");
161 } else if (ch == '\r') {
162 Emit("\\r");
163 } else if (ch == '\t') {
164 Emit("\\t");
165 } else if (ch >= 32 && ch <= 126) {
166 Emit(ch);
167 } else {
168 Emit("\\u");
169 PrintHex(ch, out_);
170 }
171 }
172 Emit('"');
173 }
174
HandleString8(span<uint8_t> chars)175 void HandleString8(span<uint8_t> chars) override {
176 if (!status_->ok())
177 return;
178 state_.top().StartElement(out_);
179 Emit('"');
180 for (size_t ii = 0; ii < chars.size(); ++ii) {
181 uint8_t c = chars[ii];
182 if (c == '"') {
183 Emit("\\\"");
184 } else if (c == '\\') {
185 Emit("\\\\");
186 } else if (c == '\b') {
187 Emit("\\b");
188 } else if (c == '\f') {
189 Emit("\\f");
190 } else if (c == '\n') {
191 Emit("\\n");
192 } else if (c == '\r') {
193 Emit("\\r");
194 } else if (c == '\t') {
195 Emit("\\t");
196 } else if (c >= 32 && c <= 126) {
197 Emit(c);
198 } else if (c < 32) {
199 Emit("\\u");
200 PrintHex(static_cast<uint16_t>(c), out_);
201 } else {
202 // Inspect the leading byte to figure out how long the utf8
203 // byte sequence is; while doing this initialize |codepoint|
204 // with the first few bits.
205 // See table in: https://en.wikipedia.org/wiki/UTF-8
206 // byte one is 110x xxxx -> 2 byte utf8 sequence
207 // byte one is 1110 xxxx -> 3 byte utf8 sequence
208 // byte one is 1111 0xxx -> 4 byte utf8 sequence
209 uint32_t codepoint;
210 int num_bytes_left;
211 if ((c & 0xe0) == 0xc0) { // 2 byte utf8 sequence
212 num_bytes_left = 1;
213 codepoint = c & 0x1f;
214 } else if ((c & 0xf0) == 0xe0) { // 3 byte utf8 sequence
215 num_bytes_left = 2;
216 codepoint = c & 0x0f;
217 } else if ((c & 0xf8) == 0xf0) { // 4 byte utf8 sequence
218 codepoint = c & 0x07;
219 num_bytes_left = 3;
220 } else {
221 continue; // invalid leading byte
222 }
223
224 // If we have enough bytes in our input, decode the remaining ones
225 // belonging to this Unicode character into |codepoint|.
226 if (ii + num_bytes_left >= chars.size())
227 continue;
228 bool invalid_byte_seen = false;
229 while (num_bytes_left > 0) {
230 c = chars[++ii];
231 --num_bytes_left;
232 // Check the next byte is a continuation byte, that is 10xx xxxx.
233 if ((c & 0xc0) != 0x80)
234 invalid_byte_seen = true;
235 codepoint = (codepoint << 6) | (c & 0x3f);
236 }
237 if (invalid_byte_seen)
238 continue;
239
240 // Disallow overlong encodings for ascii characters, as these
241 // would include " and other characters significant to JSON
242 // string termination / control.
243 if (codepoint <= 0x7f)
244 continue;
245 // Invalid in UTF8, and can't be represented in UTF16 anyway.
246 if (codepoint > 0x10ffff)
247 continue;
248
249 // So, now we transcode to UTF16,
250 // using the math described at https://en.wikipedia.org/wiki/UTF-16,
251 // for either one or two 16 bit characters.
252 if (codepoint <= 0xffff) {
253 Emit("\\u");
254 PrintHex(static_cast<uint16_t>(codepoint), out_);
255 continue;
256 }
257 codepoint -= 0x10000;
258 // high surrogate
259 Emit("\\u");
260 PrintHex(static_cast<uint16_t>((codepoint >> 10) + 0xd800), out_);
261 // low surrogate
262 Emit("\\u");
263 PrintHex(static_cast<uint16_t>((codepoint & 0x3ff) + 0xdc00), out_);
264 }
265 }
266 Emit('"');
267 }
268
HandleBinary(span<uint8_t> bytes)269 void HandleBinary(span<uint8_t> bytes) override {
270 if (!status_->ok())
271 return;
272 state_.top().StartElement(out_);
273 Emit('"');
274 Base64Encode(bytes, out_);
275 Emit('"');
276 }
277
HandleDouble(double value)278 void HandleDouble(double value) override {
279 if (!status_->ok())
280 return;
281 state_.top().StartElement(out_);
282 // JSON cannot represent NaN or Infinity. So, for compatibility,
283 // we behave like the JSON object in web browsers: emit 'null'.
284 if (!std::isfinite(value)) {
285 Emit("null");
286 return;
287 }
288 // If |value| is a scalar, emit it as an int. Taken from json_writer.cc in
289 // Chromium.
290 if (value <= std::numeric_limits<int64_t>::max() &&
291 value >= std::numeric_limits<int64_t>::min() &&
292 std::floor(value) == value) {
293 Emit(std::to_string(static_cast<int64_t>(value)));
294 return;
295 }
296 std::string str_value = json::platform::DToStr(value);
297 // The following is somewhat paranoid, but also taken from json_writer.cc
298 // in Chromium:
299 // Ensure that the number has a .0 if there's no decimal or 'e'. This
300 // makes sure that when we read the JSON back, it's interpreted as a
301 // real rather than an int.
302 if (str_value.find_first_of(".eE") == std::string::npos)
303 str_value.append(".0");
304
305 // DToStr may fail to emit a 0 before the decimal dot. E.g. this is
306 // the case in base::NumberToString in Chromium (which is based on
307 // dmg_fp). So, much like
308 // https://cs.chromium.org/chromium/src/base/json/json_writer.cc
309 // we probe for this and emit the leading 0 anyway if necessary.
310 const char* chars = str_value.c_str();
311 if (chars[0] == '.') {
312 Emit('0');
313 } else if (chars[0] == '-' && chars[1] == '.') {
314 Emit("-0");
315 ++chars;
316 }
317 Emit(chars);
318 }
319
HandleInt32(int32_t value)320 void HandleInt32(int32_t value) override {
321 if (!status_->ok())
322 return;
323 state_.top().StartElement(out_);
324 Emit(std::to_string(value));
325 }
326
HandleBool(bool value)327 void HandleBool(bool value) override {
328 if (!status_->ok())
329 return;
330 state_.top().StartElement(out_);
331 Emit(value ? "true" : "false");
332 }
333
HandleNull()334 void HandleNull() override {
335 if (!status_->ok())
336 return;
337 state_.top().StartElement(out_);
338 Emit("null");
339 }
340
HandleError(Status error)341 void HandleError(Status error) override {
342 assert(!error.ok());
343 *status_ = error;
344 out_->clear();
345 }
346
347 private:
Emit(char c)348 void Emit(char c) { out_->push_back(c); }
Emit(const char * str)349 void Emit(const char* str) {
350 out_->insert(out_->end(), str, str + strlen(str));
351 }
Emit(const std::string & str)352 void Emit(const std::string& str) {
353 out_->insert(out_->end(), str.begin(), str.end());
354 }
355
356 C* out_;
357 Status* status_;
358 std::stack<State> state_;
359 };
360 } // namespace
361
NewJSONEncoder(std::vector<uint8_t> * out,Status * status)362 std::unique_ptr<ParserHandler> NewJSONEncoder(std::vector<uint8_t>* out,
363 Status* status) {
364 return std::unique_ptr<ParserHandler>(
365 new JSONEncoder<std::vector<uint8_t>>(out, status));
366 }
367
NewJSONEncoder(std::string * out,Status * status)368 std::unique_ptr<ParserHandler> NewJSONEncoder(std::string* out,
369 Status* status) {
370 return std::unique_ptr<ParserHandler>(
371 new JSONEncoder<std::string>(out, status));
372 }
373
374 // =============================================================================
375 // json::ParseJSON - for receiving streaming parser events for JSON.
376 // =============================================================================
377
378 namespace {
// Nesting depth beyond which the parser reports a stack limit error.
constexpr int kStackLimit = 300;

// The kinds of tokens produced by the tokenizer.
enum Token {
  ObjectBegin,          // {
  ObjectEnd,            // }
  ArrayBegin,           // [
  ArrayEnd,             // ]
  StringLiteral,        // "..."
  Number,
  BoolTrue,             // true
  BoolFalse,            // false
  NullToken,            // null
  ListSeparator,        // ,
  ObjectPairSeparator,  // :
  InvalidToken,         // Input that does not form any of the above.
  NoInput               // Nothing left but whitespace and comments.
};

// Spellings of the JSON literals.
constexpr const char* kNullString = "null";
constexpr const char* kTrueString = "true";
constexpr const char* kFalseString = "false";
400
401 template <typename Char>
402 class JsonParser {
403 public:
JsonParser(ParserHandler * handler)404 explicit JsonParser(ParserHandler* handler) : handler_(handler) {}
405
Parse(const Char * start,size_t length)406 void Parse(const Char* start, size_t length) {
407 start_pos_ = start;
408 const Char* end = start + length;
409 const Char* tokenEnd = nullptr;
410 ParseValue(start, end, &tokenEnd, 0);
411 if (error_)
412 return;
413 if (tokenEnd != end) {
414 HandleError(Error::JSON_PARSER_UNPROCESSED_INPUT_REMAINS, tokenEnd);
415 }
416 }
417
418 private:
CharsToDouble(const uint16_t * chars,size_t length,double * result)419 bool CharsToDouble(const uint16_t* chars, size_t length, double* result) {
420 std::string buffer;
421 buffer.reserve(length + 1);
422 for (size_t ii = 0; ii < length; ++ii) {
423 bool is_ascii = !(chars[ii] & ~0x7F);
424 if (!is_ascii)
425 return false;
426 buffer.push_back(static_cast<char>(chars[ii]));
427 }
428 return platform::StrToD(buffer.c_str(), result);
429 }
430
CharsToDouble(const uint8_t * chars,size_t length,double * result)431 bool CharsToDouble(const uint8_t* chars, size_t length, double* result) {
432 std::string buffer(reinterpret_cast<const char*>(chars), length);
433 return platform::StrToD(buffer.c_str(), result);
434 }
435
ParseConstToken(const Char * start,const Char * end,const Char ** token_end,const char * token)436 static bool ParseConstToken(const Char* start,
437 const Char* end,
438 const Char** token_end,
439 const char* token) {
440 // |token| is \0 terminated, it's one of the constants at top of the file.
441 while (start < end && *token != '\0' && *start++ == *token++) {
442 }
443 if (*token != '\0')
444 return false;
445 *token_end = start;
446 return true;
447 }
448
ReadInt(const Char * start,const Char * end,const Char ** token_end,bool allow_leading_zeros)449 static bool ReadInt(const Char* start,
450 const Char* end,
451 const Char** token_end,
452 bool allow_leading_zeros) {
453 if (start == end)
454 return false;
455 bool has_leading_zero = '0' == *start;
456 int length = 0;
457 while (start < end && '0' <= *start && *start <= '9') {
458 ++start;
459 ++length;
460 }
461 if (!length)
462 return false;
463 if (!allow_leading_zeros && length > 1 && has_leading_zero)
464 return false;
465 *token_end = start;
466 return true;
467 }
468
ParseNumberToken(const Char * start,const Char * end,const Char ** token_end)469 static bool ParseNumberToken(const Char* start,
470 const Char* end,
471 const Char** token_end) {
472 // We just grab the number here. We validate the size in DecodeNumber.
473 // According to RFC4627, a valid number is: [minus] int [frac] [exp]
474 if (start == end)
475 return false;
476 Char c = *start;
477 if ('-' == c)
478 ++start;
479
480 if (!ReadInt(start, end, &start, /*allow_leading_zeros=*/false))
481 return false;
482 if (start == end) {
483 *token_end = start;
484 return true;
485 }
486
487 // Optional fraction part
488 c = *start;
489 if ('.' == c) {
490 ++start;
491 if (!ReadInt(start, end, &start, /*allow_leading_zeros=*/true))
492 return false;
493 if (start == end) {
494 *token_end = start;
495 return true;
496 }
497 c = *start;
498 }
499
500 // Optional exponent part
501 if ('e' == c || 'E' == c) {
502 ++start;
503 if (start == end)
504 return false;
505 c = *start;
506 if ('-' == c || '+' == c) {
507 ++start;
508 if (start == end)
509 return false;
510 }
511 if (!ReadInt(start, end, &start, /*allow_leading_zeros=*/true))
512 return false;
513 }
514
515 *token_end = start;
516 return true;
517 }
518
ReadHexDigits(const Char * start,const Char * end,const Char ** token_end,int digits)519 static bool ReadHexDigits(const Char* start,
520 const Char* end,
521 const Char** token_end,
522 int digits) {
523 if (end - start < digits)
524 return false;
525 for (int i = 0; i < digits; ++i) {
526 Char c = *start++;
527 if (!(('0' <= c && c <= '9') || ('a' <= c && c <= 'f') ||
528 ('A' <= c && c <= 'F')))
529 return false;
530 }
531 *token_end = start;
532 return true;
533 }
534
ParseStringToken(const Char * start,const Char * end,const Char ** token_end)535 static bool ParseStringToken(const Char* start,
536 const Char* end,
537 const Char** token_end) {
538 while (start < end) {
539 Char c = *start++;
540 if ('\\' == c) {
541 if (start == end)
542 return false;
543 c = *start++;
544 // Make sure the escaped char is valid.
545 switch (c) {
546 case 'x':
547 if (!ReadHexDigits(start, end, &start, 2))
548 return false;
549 break;
550 case 'u':
551 if (!ReadHexDigits(start, end, &start, 4))
552 return false;
553 break;
554 case '\\':
555 case '/':
556 case 'b':
557 case 'f':
558 case 'n':
559 case 'r':
560 case 't':
561 case 'v':
562 case '"':
563 break;
564 default:
565 return false;
566 }
567 } else if ('"' == c) {
568 *token_end = start;
569 return true;
570 }
571 }
572 return false;
573 }
574
SkipComment(const Char * start,const Char * end,const Char ** comment_end)575 static bool SkipComment(const Char* start,
576 const Char* end,
577 const Char** comment_end) {
578 if (start == end)
579 return false;
580
581 if (*start != '/' || start + 1 >= end)
582 return false;
583 ++start;
584
585 if (*start == '/') {
586 // Single line comment, read to newline.
587 for (++start; start < end; ++start) {
588 if (*start == '\n' || *start == '\r') {
589 *comment_end = start + 1;
590 return true;
591 }
592 }
593 *comment_end = end;
594 // Comment reaches end-of-input, which is fine.
595 return true;
596 }
597
598 if (*start == '*') {
599 Char previous = '\0';
600 // Block comment, read until end marker.
601 for (++start; start < end; previous = *start++) {
602 if (previous == '*' && *start == '/') {
603 *comment_end = start + 1;
604 return true;
605 }
606 }
607 // Block comment must close before end-of-input.
608 return false;
609 }
610
611 return false;
612 }
613
IsSpaceOrNewLine(Char c)614 static bool IsSpaceOrNewLine(Char c) {
615 // \v = vertial tab; \f = form feed page break.
616 return c == ' ' || c == '\n' || c == '\v' || c == '\f' || c == '\r' ||
617 c == '\t';
618 }
619
SkipWhitespaceAndComments(const Char * start,const Char * end,const Char ** whitespace_end)620 static void SkipWhitespaceAndComments(const Char* start,
621 const Char* end,
622 const Char** whitespace_end) {
623 while (start < end) {
624 if (IsSpaceOrNewLine(*start)) {
625 ++start;
626 } else if (*start == '/') {
627 const Char* comment_end = nullptr;
628 if (!SkipComment(start, end, &comment_end))
629 break;
630 start = comment_end;
631 } else {
632 break;
633 }
634 }
635 *whitespace_end = start;
636 }
637
ParseToken(const Char * start,const Char * end,const Char ** tokenStart,const Char ** token_end)638 static Token ParseToken(const Char* start,
639 const Char* end,
640 const Char** tokenStart,
641 const Char** token_end) {
642 SkipWhitespaceAndComments(start, end, tokenStart);
643 start = *tokenStart;
644
645 if (start == end)
646 return NoInput;
647
648 switch (*start) {
649 case 'n':
650 if (ParseConstToken(start, end, token_end, kNullString))
651 return NullToken;
652 break;
653 case 't':
654 if (ParseConstToken(start, end, token_end, kTrueString))
655 return BoolTrue;
656 break;
657 case 'f':
658 if (ParseConstToken(start, end, token_end, kFalseString))
659 return BoolFalse;
660 break;
661 case '[':
662 *token_end = start + 1;
663 return ArrayBegin;
664 case ']':
665 *token_end = start + 1;
666 return ArrayEnd;
667 case ',':
668 *token_end = start + 1;
669 return ListSeparator;
670 case '{':
671 *token_end = start + 1;
672 return ObjectBegin;
673 case '}':
674 *token_end = start + 1;
675 return ObjectEnd;
676 case ':':
677 *token_end = start + 1;
678 return ObjectPairSeparator;
679 case '0':
680 case '1':
681 case '2':
682 case '3':
683 case '4':
684 case '5':
685 case '6':
686 case '7':
687 case '8':
688 case '9':
689 case '-':
690 if (ParseNumberToken(start, end, token_end))
691 return Number;
692 break;
693 case '"':
694 if (ParseStringToken(start + 1, end, token_end))
695 return StringLiteral;
696 break;
697 }
698 return InvalidToken;
699 }
700
HexToInt(Char c)701 static int HexToInt(Char c) {
702 if ('0' <= c && c <= '9')
703 return c - '0';
704 if ('A' <= c && c <= 'F')
705 return c - 'A' + 10;
706 if ('a' <= c && c <= 'f')
707 return c - 'a' + 10;
708 assert(false); // Unreachable.
709 return 0;
710 }
711
DecodeString(const Char * start,const Char * end,std::vector<uint16_t> * output)712 static bool DecodeString(const Char* start,
713 const Char* end,
714 std::vector<uint16_t>* output) {
715 if (start == end)
716 return true;
717 if (start > end)
718 return false;
719 output->reserve(end - start);
720 while (start < end) {
721 uint16_t c = *start++;
722 // If the |Char| we're dealing with is really a byte, then
723 // we have utf8 here, and we need to check for multibyte characters
724 // and transcode them to utf16 (either one or two utf16 chars).
725 if (sizeof(Char) == sizeof(uint8_t) && c > 0x7f) {
726 // Inspect the leading byte to figure out how long the utf8
727 // byte sequence is; while doing this initialize |codepoint|
728 // with the first few bits.
729 // See table in: https://en.wikipedia.org/wiki/UTF-8
730 // byte one is 110x xxxx -> 2 byte utf8 sequence
731 // byte one is 1110 xxxx -> 3 byte utf8 sequence
732 // byte one is 1111 0xxx -> 4 byte utf8 sequence
733 uint32_t codepoint;
734 int num_bytes_left;
735 if ((c & 0xe0) == 0xc0) { // 2 byte utf8 sequence
736 num_bytes_left = 1;
737 codepoint = c & 0x1f;
738 } else if ((c & 0xf0) == 0xe0) { // 3 byte utf8 sequence
739 num_bytes_left = 2;
740 codepoint = c & 0x0f;
741 } else if ((c & 0xf8) == 0xf0) { // 4 byte utf8 sequence
742 codepoint = c & 0x07;
743 num_bytes_left = 3;
744 } else {
745 return false; // invalid leading byte
746 }
747
748 // If we have enough bytes in our inpput, decode the remaining ones
749 // belonging to this Unicode character into |codepoint|.
750 if (start + num_bytes_left > end)
751 return false;
752 while (num_bytes_left > 0) {
753 c = *start++;
754 --num_bytes_left;
755 // Check the next byte is a continuation byte, that is 10xx xxxx.
756 if ((c & 0xc0) != 0x80)
757 return false;
758 codepoint = (codepoint << 6) | (c & 0x3f);
759 }
760
761 // Disallow overlong encodings for ascii characters, as these
762 // would include " and other characters significant to JSON
763 // string termination / control.
764 if (codepoint <= 0x7f)
765 return false;
766 // Invalid in UTF8, and can't be represented in UTF16 anyway.
767 if (codepoint > 0x10ffff)
768 return false;
769
770 // So, now we transcode to UTF16,
771 // using the math described at https://en.wikipedia.org/wiki/UTF-16,
772 // for either one or two 16 bit characters.
773 if (codepoint <= 0xffff) {
774 output->push_back(codepoint);
775 continue;
776 }
777 codepoint -= 0x10000;
778 output->push_back((codepoint >> 10) + 0xd800); // high surrogate
779 output->push_back((codepoint & 0x3ff) + 0xdc00); // low surrogate
780 continue;
781 }
782 if ('\\' != c) {
783 output->push_back(c);
784 continue;
785 }
786 if (start == end)
787 return false;
788 c = *start++;
789
790 if (c == 'x') {
791 // \x is not supported.
792 return false;
793 }
794
795 switch (c) {
796 case '"':
797 case '/':
798 case '\\':
799 break;
800 case 'b':
801 c = '\b';
802 break;
803 case 'f':
804 c = '\f';
805 break;
806 case 'n':
807 c = '\n';
808 break;
809 case 'r':
810 c = '\r';
811 break;
812 case 't':
813 c = '\t';
814 break;
815 case 'v':
816 c = '\v';
817 break;
818 case 'u':
819 c = (HexToInt(*start) << 12) + (HexToInt(*(start + 1)) << 8) +
820 (HexToInt(*(start + 2)) << 4) + HexToInt(*(start + 3));
821 start += 4;
822 break;
823 default:
824 return false;
825 }
826 output->push_back(c);
827 }
828 return true;
829 }
830
ParseValue(const Char * start,const Char * end,const Char ** value_token_end,int depth)831 void ParseValue(const Char* start,
832 const Char* end,
833 const Char** value_token_end,
834 int depth) {
835 if (depth > kStackLimit) {
836 HandleError(Error::JSON_PARSER_STACK_LIMIT_EXCEEDED, start);
837 return;
838 }
839 const Char* token_start = nullptr;
840 const Char* token_end = nullptr;
841 Token token = ParseToken(start, end, &token_start, &token_end);
842 switch (token) {
843 case NoInput:
844 HandleError(Error::JSON_PARSER_NO_INPUT, token_start);
845 return;
846 case InvalidToken:
847 HandleError(Error::JSON_PARSER_INVALID_TOKEN, token_start);
848 return;
849 case NullToken:
850 handler_->HandleNull();
851 break;
852 case BoolTrue:
853 handler_->HandleBool(true);
854 break;
855 case BoolFalse:
856 handler_->HandleBool(false);
857 break;
858 case Number: {
859 double value;
860 if (!CharsToDouble(token_start, token_end - token_start, &value)) {
861 HandleError(Error::JSON_PARSER_INVALID_NUMBER, token_start);
862 return;
863 }
864 if (value >= std::numeric_limits<int32_t>::min() &&
865 value <= std::numeric_limits<int32_t>::max() &&
866 static_cast<int32_t>(value) == value)
867 handler_->HandleInt32(static_cast<int32_t>(value));
868 else
869 handler_->HandleDouble(value);
870 break;
871 }
872 case StringLiteral: {
873 std::vector<uint16_t> value;
874 bool ok = DecodeString(token_start + 1, token_end - 1, &value);
875 if (!ok) {
876 HandleError(Error::JSON_PARSER_INVALID_STRING, token_start);
877 return;
878 }
879 handler_->HandleString16(span<uint16_t>(value.data(), value.size()));
880 break;
881 }
882 case ArrayBegin: {
883 handler_->HandleArrayBegin();
884 start = token_end;
885 token = ParseToken(start, end, &token_start, &token_end);
886 while (token != ArrayEnd) {
887 ParseValue(start, end, &token_end, depth + 1);
888 if (error_)
889 return;
890
891 // After a list value, we expect a comma or the end of the list.
892 start = token_end;
893 token = ParseToken(start, end, &token_start, &token_end);
894 if (token == ListSeparator) {
895 start = token_end;
896 token = ParseToken(start, end, &token_start, &token_end);
897 if (token == ArrayEnd) {
898 HandleError(Error::JSON_PARSER_UNEXPECTED_ARRAY_END, token_start);
899 return;
900 }
901 } else if (token != ArrayEnd) {
902 // Unexpected value after list value. Bail out.
903 HandleError(Error::JSON_PARSER_COMMA_OR_ARRAY_END_EXPECTED,
904 token_start);
905 return;
906 }
907 }
908 handler_->HandleArrayEnd();
909 break;
910 }
911 case ObjectBegin: {
912 handler_->HandleMapBegin();
913 start = token_end;
914 token = ParseToken(start, end, &token_start, &token_end);
915 while (token != ObjectEnd) {
916 if (token != StringLiteral) {
917 HandleError(Error::JSON_PARSER_STRING_LITERAL_EXPECTED,
918 token_start);
919 return;
920 }
921 std::vector<uint16_t> key;
922 if (!DecodeString(token_start + 1, token_end - 1, &key)) {
923 HandleError(Error::JSON_PARSER_INVALID_STRING, token_start);
924 return;
925 }
926 handler_->HandleString16(span<uint16_t>(key.data(), key.size()));
927 start = token_end;
928
929 token = ParseToken(start, end, &token_start, &token_end);
930 if (token != ObjectPairSeparator) {
931 HandleError(Error::JSON_PARSER_COLON_EXPECTED, token_start);
932 return;
933 }
934 start = token_end;
935
936 ParseValue(start, end, &token_end, depth + 1);
937 if (error_)
938 return;
939 start = token_end;
940
941 // After a key/value pair, we expect a comma or the end of the
942 // object.
943 token = ParseToken(start, end, &token_start, &token_end);
944 if (token == ListSeparator) {
945 start = token_end;
946 token = ParseToken(start, end, &token_start, &token_end);
947 if (token == ObjectEnd) {
948 HandleError(Error::JSON_PARSER_UNEXPECTED_MAP_END, token_start);
949 return;
950 }
951 } else if (token != ObjectEnd) {
952 // Unexpected value after last object value. Bail out.
953 HandleError(Error::JSON_PARSER_COMMA_OR_MAP_END_EXPECTED,
954 token_start);
955 return;
956 }
957 }
958 handler_->HandleMapEnd();
959 break;
960 }
961
962 default:
963 // We got a token that's not a value.
964 HandleError(Error::JSON_PARSER_VALUE_EXPECTED, token_start);
965 return;
966 }
967
968 SkipWhitespaceAndComments(token_end, end, value_token_end);
969 }
970
HandleError(Error error,const Char * pos)971 void HandleError(Error error, const Char* pos) {
972 assert(error != Error::OK);
973 if (!error_) {
974 handler_->HandleError(
975 Status{error, static_cast<size_t>(pos - start_pos_)});
976 error_ = true;
977 }
978 }
979
980 const Char* start_pos_ = nullptr;
981 bool error_ = false;
982 ParserHandler* handler_;
983 };
984 } // namespace
985
ParseJSON(span<uint8_t> chars,ParserHandler * handler)986 void ParseJSON(span<uint8_t> chars, ParserHandler* handler) {
987 JsonParser<uint8_t> parser(handler);
988 parser.Parse(chars.data(), chars.size());
989 }
990
ParseJSON(span<uint16_t> chars,ParserHandler * handler)991 void ParseJSON(span<uint16_t> chars, ParserHandler* handler) {
992 JsonParser<uint16_t> parser(handler);
993 parser.Parse(chars.data(), chars.size());
994 }
995
996 // =============================================================================
997 // json::ConvertCBORToJSON, json::ConvertJSONToCBOR - for transcoding
998 // =============================================================================
999 template <typename C>
ConvertCBORToJSONTmpl(span<uint8_t> cbor,C * json)1000 Status ConvertCBORToJSONTmpl(span<uint8_t> cbor, C* json) {
1001 Status status;
1002 std::unique_ptr<ParserHandler> json_writer = NewJSONEncoder(json, &status);
1003 cbor::ParseCBOR(cbor, json_writer.get());
1004 return status;
1005 }
1006
ConvertCBORToJSON(span<uint8_t> cbor,std::vector<uint8_t> * json)1007 Status ConvertCBORToJSON(span<uint8_t> cbor, std::vector<uint8_t>* json) {
1008 return ConvertCBORToJSONTmpl(cbor, json);
1009 }
1010
ConvertCBORToJSON(span<uint8_t> cbor,std::string * json)1011 Status ConvertCBORToJSON(span<uint8_t> cbor, std::string* json) {
1012 return ConvertCBORToJSONTmpl(cbor, json);
1013 }
1014
1015 template <typename T>
ConvertJSONToCBORTmpl(span<T> json,std::vector<uint8_t> * cbor)1016 Status ConvertJSONToCBORTmpl(span<T> json, std::vector<uint8_t>* cbor) {
1017 Status status;
1018 std::unique_ptr<ParserHandler> encoder = cbor::NewCBOREncoder(cbor, &status);
1019 ParseJSON(json, encoder.get());
1020 return status;
1021 }
1022
ConvertJSONToCBOR(span<uint8_t> json,std::vector<uint8_t> * cbor)1023 Status ConvertJSONToCBOR(span<uint8_t> json, std::vector<uint8_t>* cbor) {
1024 return ConvertJSONToCBORTmpl(json, cbor);
1025 }
1026
ConvertJSONToCBOR(span<uint16_t> json,std::vector<uint8_t> * cbor)1027 Status ConvertJSONToCBOR(span<uint16_t> json, std::vector<uint8_t>* cbor) {
1028 return ConvertJSONToCBORTmpl(json, cbor);
1029 }
1030 } // namespace json
1031 } // namespace crdtp
1032