1 // Copyright 2019 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "json.h"
6 
7 #include <algorithm>
8 #include <cassert>
9 #include <cmath>
10 #include <cstring>
11 #include <limits>
12 #include <stack>
13 
14 #include "cbor.h"
15 #include "json_platform.h"
16 
17 namespace v8_crdtp {
18 namespace json {
19 // =============================================================================
20 // json::NewJSONEncoder - for encoding streaming parser events as JSON
21 // =============================================================================
22 
23 namespace {
// Appends |value| to |out| as exactly four lowercase hexadecimal digits,
// starting with the most significant nibble.
template <typename C>
void PrintHex(uint16_t value, C* out) {
  static constexpr char kHexDigits[] = "0123456789abcdef";
  for (int shift = 12; shift >= 0; shift -= 4)
    out->push_back(kHexDigits[(value >> shift) & 0xf]);
}
32 
33 // In the writer below, we maintain a stack of State instances.
34 // It is just enough to emit the appropriate delimiters and brackets
35 // in JSON.
enum class Container {
  NONE = 0,  // Top-level, initial state (not inside any container).
  MAP = 1,   // Currently inside a JSON object.
  ARRAY = 2  // Currently inside a JSON array.
};
44 
45 class State {
46  public:
State(Container container)47   explicit State(Container container) : container_(container) {}
StartElement(std::vector<uint8_t> * out)48   void StartElement(std::vector<uint8_t>* out) { StartElementTmpl(out); }
StartElement(std::string * out)49   void StartElement(std::string* out) { StartElementTmpl(out); }
container() const50   Container container() const { return container_; }
51 
52  private:
53   template <typename C>
StartElementTmpl(C * out)54   void StartElementTmpl(C* out) {
55     assert(container_ != Container::NONE || size_ == 0);
56     if (size_ != 0) {
57       char delim = (!(size_ & 1) || container_ == Container::ARRAY) ? ',' : ':';
58       out->push_back(delim);
59     }
60     ++size_;
61   }
62 
63   Container container_ = Container::NONE;
64   int size_ = 0;
65 };
66 
// The 64 characters of the base64 alphabet, indexed by 6-bit value.
constexpr char kBase64Table[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "+/";
70 
71 template <typename C>
Base64Encode(const span<uint8_t> & in,C * out)72 void Base64Encode(const span<uint8_t>& in, C* out) {
73   // The following three cases are based on the tables in the example
74   // section in https://en.wikipedia.org/wiki/Base64. We process three
75   // input bytes at a time, emitting 4 output bytes at a time.
76   size_t ii = 0;
77 
78   // While possible, process three input bytes.
79   for (; ii + 3 <= in.size(); ii += 3) {
80     uint32_t twentyfour_bits = (in[ii] << 16) | (in[ii + 1] << 8) | in[ii + 2];
81     out->push_back(kBase64Table[(twentyfour_bits >> 18)]);
82     out->push_back(kBase64Table[(twentyfour_bits >> 12) & 0x3f]);
83     out->push_back(kBase64Table[(twentyfour_bits >> 6) & 0x3f]);
84     out->push_back(kBase64Table[twentyfour_bits & 0x3f]);
85   }
86   if (ii + 2 <= in.size()) {  // Process two input bytes.
87     uint32_t twentyfour_bits = (in[ii] << 16) | (in[ii + 1] << 8);
88     out->push_back(kBase64Table[(twentyfour_bits >> 18)]);
89     out->push_back(kBase64Table[(twentyfour_bits >> 12) & 0x3f]);
90     out->push_back(kBase64Table[(twentyfour_bits >> 6) & 0x3f]);
91     out->push_back('=');  // Emit padding.
92     return;
93   }
94   if (ii + 1 <= in.size()) {  // Process a single input byte.
95     uint32_t twentyfour_bits = (in[ii] << 16);
96     out->push_back(kBase64Table[(twentyfour_bits >> 18)]);
97     out->push_back(kBase64Table[(twentyfour_bits >> 12) & 0x3f]);
98     out->push_back('=');  // Emit padding.
99     out->push_back('=');  // Emit padding.
100   }
101 }
102 
103 // Implements a handler for JSON parser events to emit a JSON string.
104 template <typename C>
105 class JSONEncoder : public ParserHandler {
106  public:
JSONEncoder(C * out,Status * status)107   JSONEncoder(C* out, Status* status) : out_(out), status_(status) {
108     *status_ = Status();
109     state_.emplace(Container::NONE);
110   }
111 
HandleMapBegin()112   void HandleMapBegin() override {
113     if (!status_->ok())
114       return;
115     assert(!state_.empty());
116     state_.top().StartElement(out_);
117     state_.emplace(Container::MAP);
118     Emit('{');
119   }
120 
HandleMapEnd()121   void HandleMapEnd() override {
122     if (!status_->ok())
123       return;
124     assert(state_.size() >= 2 && state_.top().container() == Container::MAP);
125     state_.pop();
126     Emit('}');
127   }
128 
HandleArrayBegin()129   void HandleArrayBegin() override {
130     if (!status_->ok())
131       return;
132     state_.top().StartElement(out_);
133     state_.emplace(Container::ARRAY);
134     Emit('[');
135   }
136 
HandleArrayEnd()137   void HandleArrayEnd() override {
138     if (!status_->ok())
139       return;
140     assert(state_.size() >= 2 && state_.top().container() == Container::ARRAY);
141     state_.pop();
142     Emit(']');
143   }
144 
HandleString16(span<uint16_t> chars)145   void HandleString16(span<uint16_t> chars) override {
146     if (!status_->ok())
147       return;
148     state_.top().StartElement(out_);
149     Emit('"');
150     for (const uint16_t ch : chars) {
151       if (ch == '"') {
152         Emit("\\\"");
153       } else if (ch == '\\') {
154         Emit("\\\\");
155       } else if (ch == '\b') {
156         Emit("\\b");
157       } else if (ch == '\f') {
158         Emit("\\f");
159       } else if (ch == '\n') {
160         Emit("\\n");
161       } else if (ch == '\r') {
162         Emit("\\r");
163       } else if (ch == '\t') {
164         Emit("\\t");
165       } else if (ch >= 32 && ch <= 127) {
166         Emit(ch);
167       } else {
168         Emit("\\u");
169         PrintHex(ch, out_);
170       }
171     }
172     Emit('"');
173   }
174 
HandleString8(span<uint8_t> chars)175   void HandleString8(span<uint8_t> chars) override {
176     if (!status_->ok())
177       return;
178     state_.top().StartElement(out_);
179     Emit('"');
180     for (size_t ii = 0; ii < chars.size(); ++ii) {
181       uint8_t c = chars[ii];
182       if (c == '"') {
183         Emit("\\\"");
184       } else if (c == '\\') {
185         Emit("\\\\");
186       } else if (c == '\b') {
187         Emit("\\b");
188       } else if (c == '\f') {
189         Emit("\\f");
190       } else if (c == '\n') {
191         Emit("\\n");
192       } else if (c == '\r') {
193         Emit("\\r");
194       } else if (c == '\t') {
195         Emit("\\t");
196       } else if (c >= 32 && c <= 127) {
197         Emit(c);
198       } else if (c < 32) {
199         Emit("\\u");
200         PrintHex(static_cast<uint16_t>(c), out_);
201       } else {
202         // Inspect the leading byte to figure out how long the utf8
203         // byte sequence is; while doing this initialize |codepoint|
204         // with the first few bits.
205         // See table in: https://en.wikipedia.org/wiki/UTF-8
206         // byte one is 110x xxxx -> 2 byte utf8 sequence
207         // byte one is 1110 xxxx -> 3 byte utf8 sequence
208         // byte one is 1111 0xxx -> 4 byte utf8 sequence
209         uint32_t codepoint;
210         int num_bytes_left;
211         if ((c & 0xe0) == 0xc0) {  // 2 byte utf8 sequence
212           num_bytes_left = 1;
213           codepoint = c & 0x1f;
214         } else if ((c & 0xf0) == 0xe0) {  // 3 byte utf8 sequence
215           num_bytes_left = 2;
216           codepoint = c & 0x0f;
217         } else if ((c & 0xf8) == 0xf0) {  // 4 byte utf8 sequence
218           codepoint = c & 0x07;
219           num_bytes_left = 3;
220         } else {
221           continue;  // invalid leading byte
222         }
223 
224         // If we have enough bytes in our input, decode the remaining ones
225         // belonging to this Unicode character into |codepoint|.
226         if (ii + num_bytes_left >= chars.size())
227           continue;
228         bool invalid_byte_seen = false;
229         while (num_bytes_left > 0) {
230           c = chars[++ii];
231           --num_bytes_left;
232           // Check the next byte is a continuation byte, that is 10xx xxxx.
233           if ((c & 0xc0) != 0x80)
234             invalid_byte_seen = true;
235           codepoint = (codepoint << 6) | (c & 0x3f);
236         }
237         if (invalid_byte_seen)
238           continue;
239 
240         // Disallow overlong encodings for ascii characters, as these
241         // would include " and other characters significant to JSON
242         // string termination / control.
243         if (codepoint <= 0x7f)
244           continue;
245         // Invalid in UTF8, and can't be represented in UTF16 anyway.
246         if (codepoint > 0x10ffff)
247           continue;
248 
249         // So, now we transcode to UTF16,
250         // using the math described at https://en.wikipedia.org/wiki/UTF-16,
251         // for either one or two 16 bit characters.
252         if (codepoint <= 0xffff) {
253           Emit("\\u");
254           PrintHex(static_cast<uint16_t>(codepoint), out_);
255           continue;
256         }
257         codepoint -= 0x10000;
258         // high surrogate
259         Emit("\\u");
260         PrintHex(static_cast<uint16_t>((codepoint >> 10) + 0xd800), out_);
261         // low surrogate
262         Emit("\\u");
263         PrintHex(static_cast<uint16_t>((codepoint & 0x3ff) + 0xdc00), out_);
264       }
265     }
266     Emit('"');
267   }
268 
HandleBinary(span<uint8_t> bytes)269   void HandleBinary(span<uint8_t> bytes) override {
270     if (!status_->ok())
271       return;
272     state_.top().StartElement(out_);
273     Emit('"');
274     Base64Encode(bytes, out_);
275     Emit('"');
276   }
277 
HandleDouble(double value)278   void HandleDouble(double value) override {
279     if (!status_->ok())
280       return;
281     state_.top().StartElement(out_);
282     // JSON cannot represent NaN or Infinity. So, for compatibility,
283     // we behave like the JSON object in web browsers: emit 'null'.
284     if (!std::isfinite(value)) {
285       Emit("null");
286       return;
287     }
288     // If |value| is a scalar, emit it as an int. Taken from json_writer.cc in
289     // Chromium.
290     if (value < static_cast<double>(std::numeric_limits<int64_t>::max()) &&
291         value >= std::numeric_limits<int64_t>::min() &&
292         std::floor(value) == value) {
293       Emit(std::to_string(static_cast<int64_t>(value)));
294       return;
295     }
296     std::string str_value = json::platform::DToStr(value);
297     // The following is somewhat paranoid, but also taken from json_writer.cc
298     // in Chromium:
299     // Ensure that the number has a .0 if there's no decimal or 'e'.  This
300     // makes sure that when we read the JSON back, it's interpreted as a
301     // real rather than an int.
302     if (str_value.find_first_of(".eE") == std::string::npos)
303       str_value.append(".0");
304 
305     // DToStr may fail to emit a 0 before the decimal dot. E.g. this is
306     // the case in base::NumberToString in Chromium (which is based on
307     // dmg_fp). So, much like
308     // https://cs.chromium.org/chromium/src/base/json/json_writer.cc
309     // we probe for this and emit the leading 0 anyway if necessary.
310     const char* chars = str_value.c_str();
311     if (chars[0] == '.') {
312       Emit('0');
313     } else if (chars[0] == '-' && chars[1] == '.') {
314       Emit("-0");
315       ++chars;
316     }
317     Emit(chars);
318   }
319 
HandleInt32(int32_t value)320   void HandleInt32(int32_t value) override {
321     if (!status_->ok())
322       return;
323     state_.top().StartElement(out_);
324     Emit(std::to_string(value));
325   }
326 
HandleBool(bool value)327   void HandleBool(bool value) override {
328     if (!status_->ok())
329       return;
330     state_.top().StartElement(out_);
331     Emit(value ? "true" : "false");
332   }
333 
HandleNull()334   void HandleNull() override {
335     if (!status_->ok())
336       return;
337     state_.top().StartElement(out_);
338     Emit("null");
339   }
340 
HandleError(Status error)341   void HandleError(Status error) override {
342     assert(!error.ok());
343     *status_ = error;
344     out_->clear();
345   }
346 
347  private:
Emit(char c)348   void Emit(char c) { out_->push_back(c); }
Emit(const char * str)349   void Emit(const char* str) {
350     out_->insert(out_->end(), str, str + strlen(str));
351   }
Emit(const std::string & str)352   void Emit(const std::string& str) {
353     out_->insert(out_->end(), str.begin(), str.end());
354   }
355 
356   C* out_;
357   Status* status_;
358   std::stack<State> state_;
359 };
360 }  // namespace
361 
NewJSONEncoder(std::vector<uint8_t> * out,Status * status)362 std::unique_ptr<ParserHandler> NewJSONEncoder(std::vector<uint8_t>* out,
363                                               Status* status) {
364   return std::unique_ptr<ParserHandler>(
365       new JSONEncoder<std::vector<uint8_t>>(out, status));
366 }
367 
NewJSONEncoder(std::string * out,Status * status)368 std::unique_ptr<ParserHandler> NewJSONEncoder(std::string* out,
369                                               Status* status) {
370   return std::unique_ptr<ParserHandler>(
371       new JSONEncoder<std::string>(out, status));
372 }
373 
374 // =============================================================================
375 // json::ParseJSON - for receiving streaming parser events for JSON.
376 // =============================================================================
377 
378 namespace {
// Deepest nesting level ParseValue accepts; beyond it the parser reports
// JSON_PARSER_STACK_LIMIT_EXCEEDED.
constexpr int kStackLimit = 300;
380 
// The kinds of token recognized by JsonParser::ParseToken.
enum Token {
  ObjectBegin = 0,      // '{'
  ObjectEnd,            // '}'
  ArrayBegin,           // '['
  ArrayEnd,             // ']'
  StringLiteral,        // A double-quoted string.
  Number,               // [minus] int [frac] [exp]
  BoolTrue,             // The literal true.
  BoolFalse,            // The literal false.
  NullToken,            // The literal null.
  ListSeparator,        // ','
  ObjectPairSeparator,  // ':'
  InvalidToken,         // Anything that is none of the above.
  NoInput               // Only whitespace / comments were left.
};
396 
// Spellings of the JSON literal names, as matched by ParseConstToken.
constexpr const char* kNullString = "null";
constexpr const char* kTrueString = "true";
constexpr const char* kFalseString = "false";
400 
401 template <typename Char>
402 class JsonParser {
403  public:
JsonParser(ParserHandler * handler)404   explicit JsonParser(ParserHandler* handler) : handler_(handler) {}
405 
Parse(const Char * start,size_t length)406   void Parse(const Char* start, size_t length) {
407     start_pos_ = start;
408     const Char* end = start + length;
409     const Char* tokenEnd = nullptr;
410     ParseValue(start, end, &tokenEnd, 0);
411     if (error_)
412       return;
413     if (tokenEnd != end) {
414       HandleError(Error::JSON_PARSER_UNPROCESSED_INPUT_REMAINS, tokenEnd);
415     }
416   }
417 
418  private:
CharsToDouble(const uint16_t * chars,size_t length,double * result)419   bool CharsToDouble(const uint16_t* chars, size_t length, double* result) {
420     std::string buffer;
421     buffer.reserve(length + 1);
422     for (size_t ii = 0; ii < length; ++ii) {
423       bool is_ascii = !(chars[ii] & ~0x7F);
424       if (!is_ascii)
425         return false;
426       buffer.push_back(static_cast<char>(chars[ii]));
427     }
428     return platform::StrToD(buffer.c_str(), result);
429   }
430 
CharsToDouble(const uint8_t * chars,size_t length,double * result)431   bool CharsToDouble(const uint8_t* chars, size_t length, double* result) {
432     std::string buffer(reinterpret_cast<const char*>(chars), length);
433     return platform::StrToD(buffer.c_str(), result);
434   }
435 
ParseConstToken(const Char * start,const Char * end,const Char ** token_end,const char * token)436   static bool ParseConstToken(const Char* start,
437                               const Char* end,
438                               const Char** token_end,
439                               const char* token) {
440     // |token| is \0 terminated, it's one of the constants at top of the file.
441     while (start < end && *token != '\0' && *start++ == *token++) {
442     }
443     if (*token != '\0')
444       return false;
445     *token_end = start;
446     return true;
447   }
448 
ReadInt(const Char * start,const Char * end,const Char ** token_end,bool allow_leading_zeros)449   static bool ReadInt(const Char* start,
450                       const Char* end,
451                       const Char** token_end,
452                       bool allow_leading_zeros) {
453     if (start == end)
454       return false;
455     bool has_leading_zero = '0' == *start;
456     int length = 0;
457     while (start < end && '0' <= *start && *start <= '9') {
458       ++start;
459       ++length;
460     }
461     if (!length)
462       return false;
463     if (!allow_leading_zeros && length > 1 && has_leading_zero)
464       return false;
465     *token_end = start;
466     return true;
467   }
468 
ParseNumberToken(const Char * start,const Char * end,const Char ** token_end)469   static bool ParseNumberToken(const Char* start,
470                                const Char* end,
471                                const Char** token_end) {
472     // We just grab the number here. We validate the size in DecodeNumber.
473     // According to RFC4627, a valid number is: [minus] int [frac] [exp]
474     if (start == end)
475       return false;
476     Char c = *start;
477     if ('-' == c)
478       ++start;
479 
480     if (!ReadInt(start, end, &start, /*allow_leading_zeros=*/false))
481       return false;
482     if (start == end) {
483       *token_end = start;
484       return true;
485     }
486 
487     // Optional fraction part
488     c = *start;
489     if ('.' == c) {
490       ++start;
491       if (!ReadInt(start, end, &start, /*allow_leading_zeros=*/true))
492         return false;
493       if (start == end) {
494         *token_end = start;
495         return true;
496       }
497       c = *start;
498     }
499 
500     // Optional exponent part
501     if ('e' == c || 'E' == c) {
502       ++start;
503       if (start == end)
504         return false;
505       c = *start;
506       if ('-' == c || '+' == c) {
507         ++start;
508         if (start == end)
509           return false;
510       }
511       if (!ReadInt(start, end, &start, /*allow_leading_zeros=*/true))
512         return false;
513     }
514 
515     *token_end = start;
516     return true;
517   }
518 
ReadHexDigits(const Char * start,const Char * end,const Char ** token_end,int digits)519   static bool ReadHexDigits(const Char* start,
520                             const Char* end,
521                             const Char** token_end,
522                             int digits) {
523     if (end - start < digits)
524       return false;
525     for (int i = 0; i < digits; ++i) {
526       Char c = *start++;
527       if (!(('0' <= c && c <= '9') || ('a' <= c && c <= 'f') ||
528             ('A' <= c && c <= 'F')))
529         return false;
530     }
531     *token_end = start;
532     return true;
533   }
534 
ParseStringToken(const Char * start,const Char * end,const Char ** token_end)535   static bool ParseStringToken(const Char* start,
536                                const Char* end,
537                                const Char** token_end) {
538     while (start < end) {
539       Char c = *start++;
540       if ('\\' == c) {
541         if (start == end)
542           return false;
543         c = *start++;
544         // Make sure the escaped char is valid.
545         switch (c) {
546           case 'x':
547             if (!ReadHexDigits(start, end, &start, 2))
548               return false;
549             break;
550           case 'u':
551             if (!ReadHexDigits(start, end, &start, 4))
552               return false;
553             break;
554           case '\\':
555           case '/':
556           case 'b':
557           case 'f':
558           case 'n':
559           case 'r':
560           case 't':
561           case 'v':
562           case '"':
563             break;
564           default:
565             return false;
566         }
567       } else if ('"' == c) {
568         *token_end = start;
569         return true;
570       }
571     }
572     return false;
573   }
574 
SkipComment(const Char * start,const Char * end,const Char ** comment_end)575   static bool SkipComment(const Char* start,
576                           const Char* end,
577                           const Char** comment_end) {
578     if (start == end)
579       return false;
580 
581     if (*start != '/' || start + 1 >= end)
582       return false;
583     ++start;
584 
585     if (*start == '/') {
586       // Single line comment, read to newline.
587       for (++start; start < end; ++start) {
588         if (*start == '\n' || *start == '\r') {
589           *comment_end = start + 1;
590           return true;
591         }
592       }
593       *comment_end = end;
594       // Comment reaches end-of-input, which is fine.
595       return true;
596     }
597 
598     if (*start == '*') {
599       Char previous = '\0';
600       // Block comment, read until end marker.
601       for (++start; start < end; previous = *start++) {
602         if (previous == '*' && *start == '/') {
603           *comment_end = start + 1;
604           return true;
605         }
606       }
607       // Block comment must close before end-of-input.
608       return false;
609     }
610 
611     return false;
612   }
613 
IsSpaceOrNewLine(Char c)614   static bool IsSpaceOrNewLine(Char c) {
615     // \v = vertial tab; \f = form feed page break.
616     return c == ' ' || c == '\n' || c == '\v' || c == '\f' || c == '\r' ||
617            c == '\t';
618   }
619 
SkipWhitespaceAndComments(const Char * start,const Char * end,const Char ** whitespace_end)620   static void SkipWhitespaceAndComments(const Char* start,
621                                         const Char* end,
622                                         const Char** whitespace_end) {
623     while (start < end) {
624       if (IsSpaceOrNewLine(*start)) {
625         ++start;
626       } else if (*start == '/') {
627         const Char* comment_end = nullptr;
628         if (!SkipComment(start, end, &comment_end))
629           break;
630         start = comment_end;
631       } else {
632         break;
633       }
634     }
635     *whitespace_end = start;
636   }
637 
ParseToken(const Char * start,const Char * end,const Char ** tokenStart,const Char ** token_end)638   static Token ParseToken(const Char* start,
639                           const Char* end,
640                           const Char** tokenStart,
641                           const Char** token_end) {
642     SkipWhitespaceAndComments(start, end, tokenStart);
643     start = *tokenStart;
644 
645     if (start == end)
646       return NoInput;
647 
648     switch (*start) {
649       case 'n':
650         if (ParseConstToken(start, end, token_end, kNullString))
651           return NullToken;
652         break;
653       case 't':
654         if (ParseConstToken(start, end, token_end, kTrueString))
655           return BoolTrue;
656         break;
657       case 'f':
658         if (ParseConstToken(start, end, token_end, kFalseString))
659           return BoolFalse;
660         break;
661       case '[':
662         *token_end = start + 1;
663         return ArrayBegin;
664       case ']':
665         *token_end = start + 1;
666         return ArrayEnd;
667       case ',':
668         *token_end = start + 1;
669         return ListSeparator;
670       case '{':
671         *token_end = start + 1;
672         return ObjectBegin;
673       case '}':
674         *token_end = start + 1;
675         return ObjectEnd;
676       case ':':
677         *token_end = start + 1;
678         return ObjectPairSeparator;
679       case '0':
680       case '1':
681       case '2':
682       case '3':
683       case '4':
684       case '5':
685       case '6':
686       case '7':
687       case '8':
688       case '9':
689       case '-':
690         if (ParseNumberToken(start, end, token_end))
691           return Number;
692         break;
693       case '"':
694         if (ParseStringToken(start + 1, end, token_end))
695           return StringLiteral;
696         break;
697     }
698     return InvalidToken;
699   }
700 
HexToInt(Char c)701   static int HexToInt(Char c) {
702     if ('0' <= c && c <= '9')
703       return c - '0';
704     if ('A' <= c && c <= 'F')
705       return c - 'A' + 10;
706     if ('a' <= c && c <= 'f')
707       return c - 'a' + 10;
708     assert(false);  // Unreachable.
709     return 0;
710   }
711 
DecodeString(const Char * start,const Char * end,std::vector<uint16_t> * output)712   static bool DecodeString(const Char* start,
713                            const Char* end,
714                            std::vector<uint16_t>* output) {
715     if (start == end)
716       return true;
717     if (start > end)
718       return false;
719     output->reserve(end - start);
720     while (start < end) {
721       uint16_t c = *start++;
722       // If the |Char| we're dealing with is really a byte, then
723       // we have utf8 here, and we need to check for multibyte characters
724       // and transcode them to utf16 (either one or two utf16 chars).
725       if (sizeof(Char) == sizeof(uint8_t) && c > 0x7f) {
726         // Inspect the leading byte to figure out how long the utf8
727         // byte sequence is; while doing this initialize |codepoint|
728         // with the first few bits.
729         // See table in: https://en.wikipedia.org/wiki/UTF-8
730         // byte one is 110x xxxx -> 2 byte utf8 sequence
731         // byte one is 1110 xxxx -> 3 byte utf8 sequence
732         // byte one is 1111 0xxx -> 4 byte utf8 sequence
733         uint32_t codepoint;
734         int num_bytes_left;
735         if ((c & 0xe0) == 0xc0) {  // 2 byte utf8 sequence
736           num_bytes_left = 1;
737           codepoint = c & 0x1f;
738         } else if ((c & 0xf0) == 0xe0) {  // 3 byte utf8 sequence
739           num_bytes_left = 2;
740           codepoint = c & 0x0f;
741         } else if ((c & 0xf8) == 0xf0) {  // 4 byte utf8 sequence
742           codepoint = c & 0x07;
743           num_bytes_left = 3;
744         } else {
745           return false;  // invalid leading byte
746         }
747 
748         // If we have enough bytes in our inpput, decode the remaining ones
749         // belonging to this Unicode character into |codepoint|.
750         if (start + num_bytes_left > end)
751           return false;
752         while (num_bytes_left > 0) {
753           c = *start++;
754           --num_bytes_left;
755           // Check the next byte is a continuation byte, that is 10xx xxxx.
756           if ((c & 0xc0) != 0x80)
757             return false;
758           codepoint = (codepoint << 6) | (c & 0x3f);
759         }
760 
761         // Disallow overlong encodings for ascii characters, as these
762         // would include " and other characters significant to JSON
763         // string termination / control.
764         if (codepoint <= 0x7f)
765           return false;
766         // Invalid in UTF8, and can't be represented in UTF16 anyway.
767         if (codepoint > 0x10ffff)
768           return false;
769 
770         // So, now we transcode to UTF16,
771         // using the math described at https://en.wikipedia.org/wiki/UTF-16,
772         // for either one or two 16 bit characters.
773         if (codepoint <= 0xffff) {
774           output->push_back(codepoint);
775           continue;
776         }
777         codepoint -= 0x10000;
778         output->push_back((codepoint >> 10) + 0xd800);    // high surrogate
779         output->push_back((codepoint & 0x3ff) + 0xdc00);  // low surrogate
780         continue;
781       }
782       if ('\\' != c) {
783         output->push_back(c);
784         continue;
785       }
786       if (start == end)
787         return false;
788       c = *start++;
789 
790       if (c == 'x') {
791         // \x is not supported.
792         return false;
793       }
794 
795       switch (c) {
796         case '"':
797         case '/':
798         case '\\':
799           break;
800         case 'b':
801           c = '\b';
802           break;
803         case 'f':
804           c = '\f';
805           break;
806         case 'n':
807           c = '\n';
808           break;
809         case 'r':
810           c = '\r';
811           break;
812         case 't':
813           c = '\t';
814           break;
815         case 'v':
816           c = '\v';
817           break;
818         case 'u':
819           c = (HexToInt(*start) << 12) + (HexToInt(*(start + 1)) << 8) +
820               (HexToInt(*(start + 2)) << 4) + HexToInt(*(start + 3));
821           start += 4;
822           break;
823         default:
824           return false;
825       }
826       output->push_back(c);
827     }
828     return true;
829   }
830 
ParseValue(const Char * start,const Char * end,const Char ** value_token_end,int depth)831   void ParseValue(const Char* start,
832                   const Char* end,
833                   const Char** value_token_end,
834                   int depth) {
835     if (depth > kStackLimit) {
836       HandleError(Error::JSON_PARSER_STACK_LIMIT_EXCEEDED, start);
837       return;
838     }
839     const Char* token_start = nullptr;
840     const Char* token_end = nullptr;
841     Token token = ParseToken(start, end, &token_start, &token_end);
842     switch (token) {
843       case NoInput:
844         HandleError(Error::JSON_PARSER_NO_INPUT, token_start);
845         return;
846       case InvalidToken:
847         HandleError(Error::JSON_PARSER_INVALID_TOKEN, token_start);
848         return;
849       case NullToken:
850         handler_->HandleNull();
851         break;
852       case BoolTrue:
853         handler_->HandleBool(true);
854         break;
855       case BoolFalse:
856         handler_->HandleBool(false);
857         break;
858       case Number: {
859         double value;
860         if (!CharsToDouble(token_start, token_end - token_start, &value)) {
861           HandleError(Error::JSON_PARSER_INVALID_NUMBER, token_start);
862           return;
863         }
864         if (value >= std::numeric_limits<int32_t>::min() &&
865             value <= std::numeric_limits<int32_t>::max() &&
866             static_cast<int32_t>(value) == value)
867           handler_->HandleInt32(static_cast<int32_t>(value));
868         else
869           handler_->HandleDouble(value);
870         break;
871       }
872       case StringLiteral: {
873         std::vector<uint16_t> value;
874         bool ok = DecodeString(token_start + 1, token_end - 1, &value);
875         if (!ok) {
876           HandleError(Error::JSON_PARSER_INVALID_STRING, token_start);
877           return;
878         }
879         handler_->HandleString16(span<uint16_t>(value.data(), value.size()));
880         break;
881       }
882       case ArrayBegin: {
883         handler_->HandleArrayBegin();
884         start = token_end;
885         token = ParseToken(start, end, &token_start, &token_end);
886         while (token != ArrayEnd) {
887           ParseValue(start, end, &token_end, depth + 1);
888           if (error_)
889             return;
890 
891           // After a list value, we expect a comma or the end of the list.
892           start = token_end;
893           token = ParseToken(start, end, &token_start, &token_end);
894           if (token == ListSeparator) {
895             start = token_end;
896             token = ParseToken(start, end, &token_start, &token_end);
897             if (token == ArrayEnd) {
898               HandleError(Error::JSON_PARSER_UNEXPECTED_ARRAY_END, token_start);
899               return;
900             }
901           } else if (token != ArrayEnd) {
902             // Unexpected value after list value. Bail out.
903             HandleError(Error::JSON_PARSER_COMMA_OR_ARRAY_END_EXPECTED,
904                         token_start);
905             return;
906           }
907         }
908         handler_->HandleArrayEnd();
909         break;
910       }
911       case ObjectBegin: {
912         handler_->HandleMapBegin();
913         start = token_end;
914         token = ParseToken(start, end, &token_start, &token_end);
915         while (token != ObjectEnd) {
916           if (token != StringLiteral) {
917             HandleError(Error::JSON_PARSER_STRING_LITERAL_EXPECTED,
918                         token_start);
919             return;
920           }
921           std::vector<uint16_t> key;
922           if (!DecodeString(token_start + 1, token_end - 1, &key)) {
923             HandleError(Error::JSON_PARSER_INVALID_STRING, token_start);
924             return;
925           }
926           handler_->HandleString16(span<uint16_t>(key.data(), key.size()));
927           start = token_end;
928 
929           token = ParseToken(start, end, &token_start, &token_end);
930           if (token != ObjectPairSeparator) {
931             HandleError(Error::JSON_PARSER_COLON_EXPECTED, token_start);
932             return;
933           }
934           start = token_end;
935 
936           ParseValue(start, end, &token_end, depth + 1);
937           if (error_)
938             return;
939           start = token_end;
940 
941           // After a key/value pair, we expect a comma or the end of the
942           // object.
943           token = ParseToken(start, end, &token_start, &token_end);
944           if (token == ListSeparator) {
945             start = token_end;
946             token = ParseToken(start, end, &token_start, &token_end);
947             if (token == ObjectEnd) {
948               HandleError(Error::JSON_PARSER_UNEXPECTED_MAP_END, token_start);
949               return;
950             }
951           } else if (token != ObjectEnd) {
952             // Unexpected value after last object value. Bail out.
953             HandleError(Error::JSON_PARSER_COMMA_OR_MAP_END_EXPECTED,
954                         token_start);
955             return;
956           }
957         }
958         handler_->HandleMapEnd();
959         break;
960       }
961 
962       default:
963         // We got a token that's not a value.
964         HandleError(Error::JSON_PARSER_VALUE_EXPECTED, token_start);
965         return;
966     }
967 
968     SkipWhitespaceAndComments(token_end, end, value_token_end);
969   }
970 
HandleError(Error error,const Char * pos)971   void HandleError(Error error, const Char* pos) {
972     assert(error != Error::OK);
973     if (!error_) {
974       handler_->HandleError(
975           Status{error, static_cast<size_t>(pos - start_pos_)});
976       error_ = true;
977     }
978   }
979 
980   const Char* start_pos_ = nullptr;
981   bool error_ = false;
982   ParserHandler* handler_;
983 };
984 }  // namespace
985 
ParseJSON(span<uint8_t> chars,ParserHandler * handler)986 void ParseJSON(span<uint8_t> chars, ParserHandler* handler) {
987   JsonParser<uint8_t> parser(handler);
988   parser.Parse(chars.data(), chars.size());
989 }
990 
ParseJSON(span<uint16_t> chars,ParserHandler * handler)991 void ParseJSON(span<uint16_t> chars, ParserHandler* handler) {
992   JsonParser<uint16_t> parser(handler);
993   parser.Parse(chars.data(), chars.size());
994 }
995 
996 // =============================================================================
997 // json::ConvertCBORToJSON, json::ConvertJSONToCBOR - for transcoding
998 // =============================================================================
999 template <typename C>
ConvertCBORToJSONTmpl(span<uint8_t> cbor,C * json)1000 Status ConvertCBORToJSONTmpl(span<uint8_t> cbor, C* json) {
1001   Status status;
1002   std::unique_ptr<ParserHandler> json_writer = NewJSONEncoder(json, &status);
1003   cbor::ParseCBOR(cbor, json_writer.get());
1004   return status;
1005 }
1006 
ConvertCBORToJSON(span<uint8_t> cbor,std::vector<uint8_t> * json)1007 Status ConvertCBORToJSON(span<uint8_t> cbor, std::vector<uint8_t>* json) {
1008   return ConvertCBORToJSONTmpl(cbor, json);
1009 }
1010 
ConvertCBORToJSON(span<uint8_t> cbor,std::string * json)1011 Status ConvertCBORToJSON(span<uint8_t> cbor, std::string* json) {
1012   return ConvertCBORToJSONTmpl(cbor, json);
1013 }
1014 
1015 template <typename T>
ConvertJSONToCBORTmpl(span<T> json,std::vector<uint8_t> * cbor)1016 Status ConvertJSONToCBORTmpl(span<T> json, std::vector<uint8_t>* cbor) {
1017   Status status;
1018   std::unique_ptr<ParserHandler> encoder = cbor::NewCBOREncoder(cbor, &status);
1019   ParseJSON(json, encoder.get());
1020   return status;
1021 }
1022 
ConvertJSONToCBOR(span<uint8_t> json,std::vector<uint8_t> * cbor)1023 Status ConvertJSONToCBOR(span<uint8_t> json, std::vector<uint8_t>* cbor) {
1024   return ConvertJSONToCBORTmpl(json, cbor);
1025 }
1026 
ConvertJSONToCBOR(span<uint16_t> json,std::vector<uint8_t> * cbor)1027 Status ConvertJSONToCBOR(span<uint16_t> json, std::vector<uint8_t>* cbor) {
1028   return ConvertJSONToCBORTmpl(json, cbor);
1029 }
1030 }  // namespace json
1031 }  // namespace v8_crdtp
1032