1 /*
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <folly/json.h>
18 
19 #include <algorithm>
20 #include <functional>
21 #include <iterator>
22 #include <sstream>
23 #include <type_traits>
24 
25 #include <boost/algorithm/string.hpp>
26 #include <glog/logging.h>
27 
28 #include <folly/Conv.h>
29 #include <folly/Portability.h>
30 #include <folly/Range.h>
31 #include <folly/String.h>
32 #include <folly/Unicode.h>
33 #include <folly/Utility.h>
34 #include <folly/lang/Bits.h>
35 #include <folly/portability/Constexpr.h>
36 
37 namespace folly {
38 
39 //////////////////////////////////////////////////////////////////////
40 
41 namespace json {
42 
43 namespace {
44 
make_parse_error(unsigned int line,std::string const & context,std::string const & expected)45 parse_error make_parse_error(
46     unsigned int line,
47     std::string const& context,
48     std::string const& expected) {
49   return parse_error(to<std::string>(
50       "json parse error on line ",
51       line,
52       !context.empty() ? to<std::string>(" near `", context, '\'') : "",
53       ": ",
54       expected));
55 }
56 
57 struct Printer {
58   // Context class is allows to restore the path to element that we are about to
59   // print so that if error happens we can throw meaningful exception.
60   class Context {
61    public:
Context(const Context * parent_context,const dynamic & key)62     Context(const Context* parent_context, const dynamic& key)
63         : parent_context_(parent_context), key_(key), is_key_(false) {}
Context(const Context * parent_context,const dynamic & key,bool is_key)64     Context(const Context* parent_context, const dynamic& key, bool is_key)
65         : parent_context_(parent_context), key_(key), is_key_(is_key) {}
66 
67     // Return location description of a context as a chain of keys
68     // ex., '"outherKey"->"innerKey"'.
locationDescription() const69     std::string locationDescription() const {
70       std::vector<std::string> keys;
71       const Context* ptr = parent_context_;
72       while (ptr) {
73         keys.push_back(ptr->getName());
74         ptr = ptr->parent_context_;
75       }
76       keys.push_back(getName());
77       std::ostringstream stream;
78       std::reverse_copy(
79           keys.begin(),
80           keys.end() - 1,
81           std::ostream_iterator<std::string>(stream, "->"));
82 
83       // Add current key.
84       stream << keys.back();
85       return stream.str();
86     }
getName() const87     std::string getName() const {
88       return Printer::toStringOr(key_, "<unprintable>");
89     }
typeDescription() const90     std::string typeDescription() const { return is_key_ ? "key" : "value"; }
91 
92    private:
93     const Context* const parent_context_;
94     const dynamic& key_;
95     bool is_key_;
96   };
97 
Printerfolly::json::__anon3d94b9db0111::Printer98   explicit Printer(
99       std::string& out, unsigned* indentLevel, serialization_opts const* opts)
100       : out_(out), indentLevel_(indentLevel), opts_(*opts) {}
101 
operator ()folly::json::__anon3d94b9db0111::Printer102   void operator()(dynamic const& v, const Context& context) const {
103     (*this)(v, &context);
104   }
operator ()folly::json::__anon3d94b9db0111::Printer105   void operator()(dynamic const& v, const Context* context) const {
106     switch (v.type()) {
107       case dynamic::DOUBLE:
108         if (!opts_.allow_nan_inf) {
109           if (std::isnan(v.asDouble())) {
110             throw json::print_error(
111                 "folly::toJson: JSON object value was a NaN when serializing " +
112                 contextDescription(context));
113           }
114           if (std::isinf(v.asDouble())) {
115             throw json::print_error(
116                 "folly::toJson: JSON object value was an INF when serializing " +
117                 contextDescription(context));
118           }
119         }
120         toAppend(
121             v.asDouble(), &out_, opts_.double_mode, opts_.double_num_digits);
122         break;
123       case dynamic::INT64: {
124         auto intval = v.asInt();
125         if (opts_.javascript_safe) {
126           // Use folly::to to check that this integer can be represented
127           // as a double without loss of precision.
128           intval = int64_t(to<double>(intval));
129         }
130         toAppend(intval, &out_);
131         break;
132       }
133       case dynamic::BOOL:
134         out_ += v.asBool() ? "true" : "false";
135         break;
136       case dynamic::NULLT:
137         out_ += "null";
138         break;
139       case dynamic::STRING:
140         escapeString(v.stringPiece(), out_, opts_);
141         break;
142       case dynamic::OBJECT:
143         printObject(v, context);
144         break;
145       case dynamic::ARRAY:
146         printArray(v, context);
147         break;
148       default:
149         CHECK(0) << "Bad type " << v.type();
150     }
151   }
152 
153  private:
printKVfolly::json::__anon3d94b9db0111::Printer154   void printKV(
155       const std::pair<const dynamic, dynamic>& p,
156       const Context* context) const {
157     if (!opts_.allow_non_string_keys && !p.first.isString()) {
158       throw json::print_error(
159           "folly::toJson: JSON object key " +
160           toStringOr(p.first, "<unprintable key>") +
161           " was not a string when serializing key at " +
162           Context(context, p.first, true).locationDescription());
163     }
164     (*this)(p.first, Context(context, p.first, true)); // Key
165     mapColon();
166     (*this)(p.second, Context(context, p.first, false)); // Value
167   }
168 
169   template <typename Iterator>
printKVPairsfolly::json::__anon3d94b9db0111::Printer170   void printKVPairs(
171       Iterator begin, Iterator end, const Context* context) const {
172     printKV(*begin, context);
173     for (++begin; begin != end; ++begin) {
174       out_ += ',';
175       newline();
176       printKV(*begin, context);
177     }
178   }
179 
printObjectfolly::json::__anon3d94b9db0111::Printer180   void printObject(dynamic const& o, const Context* context) const {
181     if (o.empty()) {
182       out_ += "{}";
183       return;
184     }
185 
186     out_ += '{';
187     indent();
188     newline();
189     if (opts_.sort_keys || opts_.sort_keys_by) {
190       using ref = std::reference_wrapper<decltype(o.items())::value_type const>;
191       auto sort_keys_by = [&](auto begin, auto end, const auto& comp) {
192         std::sort(begin, end, [&](ref a, ref b) {
193           // Only compare keys.  No ordering among identical keys.
194           return comp(a.get().first, b.get().first);
195         });
196       };
197       std::vector<ref> refs(o.items().begin(), o.items().end());
198       if (opts_.sort_keys_by) {
199         sort_keys_by(refs.begin(), refs.end(), opts_.sort_keys_by);
200       } else {
201         sort_keys_by(refs.begin(), refs.end(), std::less<>());
202       }
203       printKVPairs(refs.cbegin(), refs.cend(), context);
204     } else {
205       printKVPairs(o.items().begin(), o.items().end(), context);
206     }
207     outdent();
208     newline();
209     out_ += '}';
210   }
211 
toStringOrfolly::json::__anon3d94b9db0111::Printer212   static std::string toStringOr(dynamic const& v, const char* placeholder) {
213     try {
214       std::string result;
215       unsigned indentLevel = 0;
216       serialization_opts opts;
217       opts.allow_nan_inf = true;
218       opts.allow_non_string_keys = true;
219       Printer printer(result, &indentLevel, &opts);
220       printer(v, nullptr);
221       return result;
222     } catch (...) {
223       return placeholder;
224     }
225   }
226 
contextDescriptionfolly::json::__anon3d94b9db0111::Printer227   static std::string contextDescription(const Context* context) {
228     if (!context) {
229       return "<undefined location>";
230     }
231     return context->typeDescription() + " at " + context->locationDescription();
232   }
233 
printArrayfolly::json::__anon3d94b9db0111::Printer234   void printArray(dynamic const& a, const Context* context) const {
235     if (a.empty()) {
236       out_ += "[]";
237       return;
238     }
239 
240     out_ += '[';
241     indent();
242     newline();
243     (*this)(a[0], Context(context, dynamic(0)));
244     for (auto it = std::next(a.begin()); it != a.end(); ++it) {
245       out_ += ',';
246       newline();
247       (*this)(*it, Context(context, dynamic(std::distance(a.begin(), it))));
248     }
249     outdent();
250     newline();
251     out_ += ']';
252   }
253 
254  private:
outdentfolly::json::__anon3d94b9db0111::Printer255   void outdent() const {
256     if (indentLevel_) {
257       --*indentLevel_;
258     }
259   }
260 
indentfolly::json::__anon3d94b9db0111::Printer261   void indent() const {
262     if (indentLevel_) {
263       ++*indentLevel_;
264     }
265   }
266 
newlinefolly::json::__anon3d94b9db0111::Printer267   void newline() const {
268     if (indentLevel_) {
269       auto indent = *indentLevel_ * opts_.pretty_formatting_indent_width;
270       out_ += to<std::string>('\n', std::string(indent, ' '));
271     }
272   }
273 
mapColonfolly::json::__anon3d94b9db0111::Printer274   void mapColon() const { out_ += indentLevel_ ? ": " : ":"; }
275 
276  private:
277   std::string& out_;
278   unsigned* const indentLevel_;
279   serialization_opts const& opts_;
280 };
281 
282 //////////////////////////////////////////////////////////////////////
283 
284 // Wraps our input buffer with some helper functions.
285 struct Input {
Inputfolly::json::__anon3d94b9db0111::Input286   explicit Input(StringPiece range, json::serialization_opts const* opts)
287       : range_(range), opts_(*opts), lineNum_(0) {
288     storeCurrent();
289   }
290 
291   Input(Input const&) = delete;
292   Input& operator=(Input const&) = delete;
293 
beginfolly::json::__anon3d94b9db0111::Input294   char const* begin() const { return range_.begin(); }
295 
getLineNumfolly::json::__anon3d94b9db0111::Input296   unsigned getLineNum() const { return lineNum_; }
297 
298   // Parse ahead for as long as the supplied predicate is satisfied,
299   // returning a range of what was skipped.
300   template <class Predicate>
skipWhilefolly::json::__anon3d94b9db0111::Input301   StringPiece skipWhile(const Predicate& p) {
302     std::size_t skipped = 0;
303     for (; skipped < range_.size(); ++skipped) {
304       if (!p(range_[skipped])) {
305         break;
306       }
307       if (range_[skipped] == '\n') {
308         ++lineNum_;
309       }
310     }
311     auto ret = range_.subpiece(0, skipped);
312     range_.advance(skipped);
313     storeCurrent();
314     return ret;
315   }
316 
skipDigitsfolly::json::__anon3d94b9db0111::Input317   StringPiece skipDigits() {
318     return skipWhile([](char c) { return c >= '0' && c <= '9'; });
319   }
320 
skipMinusAndDigitsfolly::json::__anon3d94b9db0111::Input321   StringPiece skipMinusAndDigits() {
322     bool firstChar = true;
323     return skipWhile([&firstChar](char c) {
324       bool result = (c >= '0' && c <= '9') || (firstChar && c == '-');
325       firstChar = false;
326       return result;
327     });
328   }
329 
skipWhitespacefolly::json::__anon3d94b9db0111::Input330   void skipWhitespace() {
331     unsigned index = 0;
332     while (true) {
333       while (index < range_.size() && range_[index] == ' ') {
334         index++;
335       }
336       if (index < range_.size()) {
337         if (range_[index] == '\n') {
338           index++;
339           ++lineNum_;
340           continue;
341         }
342         if (range_[index] == '\t' || range_[index] == '\r') {
343           index++;
344           continue;
345         }
346       }
347       break;
348     }
349     range_.advance(index);
350     storeCurrent();
351   }
352 
expectfolly::json::__anon3d94b9db0111::Input353   void expect(char c) {
354     if (**this != c) {
355       throw json::make_parse_error(
356           lineNum_, context(), to<std::string>("expected '", c, '\''));
357     }
358     ++*this;
359   }
360 
sizefolly::json::__anon3d94b9db0111::Input361   std::size_t size() const { return range_.size(); }
362 
operator *folly::json::__anon3d94b9db0111::Input363   int operator*() const { return current_; }
364 
operator ++folly::json::__anon3d94b9db0111::Input365   void operator++() {
366     range_.pop_front();
367     storeCurrent();
368   }
369 
370   template <class T>
extractfolly::json::__anon3d94b9db0111::Input371   T extract() {
372     try {
373       return to<T>(&range_);
374     } catch (std::exception const& e) {
375       error(e.what());
376     }
377   }
378 
consumefolly::json::__anon3d94b9db0111::Input379   bool consume(StringPiece str) {
380     if (boost::starts_with(range_, str)) {
381       range_.advance(str.size());
382       storeCurrent();
383       return true;
384     }
385     return false;
386   }
387 
contextfolly::json::__anon3d94b9db0111::Input388   std::string context() const {
389     return range_.subpiece(0, 16 /* arbitrary */).toString();
390   }
391 
errorfolly::json::__anon3d94b9db0111::Input392   dynamic error(char const* what) const {
393     throw json::make_parse_error(lineNum_, context(), what);
394   }
395 
getOptsfolly::json::__anon3d94b9db0111::Input396   json::serialization_opts const& getOpts() { return opts_; }
397 
incrementRecursionLevelfolly::json::__anon3d94b9db0111::Input398   void incrementRecursionLevel() {
399     if (currentRecursionLevel_ > opts_.recursion_limit) {
400       error("recursion limit exceeded");
401     }
402     currentRecursionLevel_++;
403   }
404 
decrementRecursionLevelfolly::json::__anon3d94b9db0111::Input405   void decrementRecursionLevel() { currentRecursionLevel_--; }
406 
407  private:
storeCurrentfolly::json::__anon3d94b9db0111::Input408   void storeCurrent() { current_ = range_.empty() ? EOF : range_.front(); }
409 
410  private:
411   StringPiece range_;
412   json::serialization_opts const& opts_;
413   unsigned lineNum_;
414   int current_;
415   unsigned int currentRecursionLevel_{0};
416 };
417 
418 class RecursionGuard {
419  public:
RecursionGuard(Input & in)420   explicit RecursionGuard(Input& in) : in_(in) {
421     in_.incrementRecursionLevel();
422   }
423 
~RecursionGuard()424   ~RecursionGuard() { in_.decrementRecursionLevel(); }
425 
426  private:
427   Input& in_;
428 };
429 
430 dynamic parseValue(Input& in, json::metadata_map* map);
431 std::string parseString(Input& in);
432 dynamic parseNumber(Input& in);
433 
434 template <class K>
parseObjectKeyValue(Input & in,dynamic & ret,K && key,json::metadata_map * map)435 void parseObjectKeyValue(
436     Input& in, dynamic& ret, K&& key, json::metadata_map* map) {
437   auto keyLineNumber = in.getLineNum();
438   in.skipWhitespace();
439   in.expect(':');
440   in.skipWhitespace();
441   K tmp;
442   if (map) {
443     tmp = K(key);
444   }
445   auto valueLineNumber = in.getLineNum();
446   ret.insert(std::forward<K>(key), parseValue(in, map));
447   if (map) {
448     auto val = ret.get_ptr(tmp);
449     // We just inserted it, so it should be there!
450     DCHECK(val != nullptr);
451     map->emplace(
452         val, json::parse_metadata{{{keyLineNumber}}, {{valueLineNumber}}});
453   }
454 }
455 
parseObject(Input & in,json::metadata_map * map)456 dynamic parseObject(Input& in, json::metadata_map* map) {
457   DCHECK_EQ(*in, '{');
458   ++in;
459 
460   dynamic ret = dynamic::object;
461 
462   in.skipWhitespace();
463   if (*in == '}') {
464     ++in;
465     return ret;
466   }
467 
468   for (;;) {
469     if (in.getOpts().allow_trailing_comma && *in == '}') {
470       break;
471     }
472     if (*in == '\"') { // string
473       auto key = parseString(in);
474       parseObjectKeyValue(in, ret, std::move(key), map);
475     } else if (!in.getOpts().allow_non_string_keys) {
476       in.error("expected string for object key name");
477     } else {
478       auto key = parseValue(in, map);
479       parseObjectKeyValue(in, ret, std::move(key), map);
480     }
481 
482     in.skipWhitespace();
483     if (*in != ',') {
484       break;
485     }
486     ++in;
487     in.skipWhitespace();
488   }
489   in.expect('}');
490 
491   return ret;
492 }
493 
parseArray(Input & in,json::metadata_map * map)494 dynamic parseArray(Input& in, json::metadata_map* map) {
495   DCHECK_EQ(*in, '[');
496   ++in;
497 
498   dynamic ret = dynamic::array;
499 
500   in.skipWhitespace();
501   if (*in == ']') {
502     ++in;
503     return ret;
504   }
505 
506   std::vector<uint32_t> lineNumbers;
507   for (;;) {
508     if (in.getOpts().allow_trailing_comma && *in == ']') {
509       break;
510     }
511     ret.push_back(parseValue(in, map));
512     if (map) {
513       lineNumbers.push_back(in.getLineNum());
514     }
515     in.skipWhitespace();
516     if (*in != ',') {
517       break;
518     }
519     ++in;
520     in.skipWhitespace();
521   }
522   if (map) {
523     for (size_t i = 0; i < ret.size(); i++) {
524       map->emplace(&ret[i], json::parse_metadata{{{0}}, {{lineNumbers[i]}}});
525     }
526   }
527   in.expect(']');
528 
529   return ret;
530 }
531 
parseNumber(Input & in)532 dynamic parseNumber(Input& in) {
533   bool const negative = (*in == '-');
534   if (negative && in.consume("-Infinity")) {
535     if (in.getOpts().parse_numbers_as_strings) {
536       return "-Infinity";
537     } else {
538       return -std::numeric_limits<double>::infinity();
539     }
540   }
541 
542   auto integral = in.skipMinusAndDigits();
543   if (negative && integral.size() < 2) {
544     in.error("expected digits after `-'");
545   }
546 
547   auto const wasE = *in == 'e' || *in == 'E';
548 
549   constexpr const char* maxInt = "9223372036854775807";
550   constexpr const char* minInt = "-9223372036854775808";
551   constexpr auto maxIntLen = constexpr_strlen(maxInt);
552   constexpr auto minIntLen = constexpr_strlen(minInt);
553 
554   if (*in != '.' && !wasE && in.getOpts().parse_numbers_as_strings) {
555     return integral;
556   }
557 
558   if (*in != '.' && !wasE) {
559     if (LIKELY(!in.getOpts().double_fallback || integral.size() < maxIntLen) ||
560         (!negative && integral.size() == maxIntLen && integral <= maxInt) ||
561         (negative && integral.size() == minIntLen && integral <= minInt)) {
562       auto val = to<int64_t>(integral);
563       in.skipWhitespace();
564       return val;
565     } else {
566       auto val = to<double>(integral);
567       in.skipWhitespace();
568       return val;
569     }
570   }
571 
572   auto end = !wasE ? (++in, in.skipDigits().end()) : in.begin();
573   if (*in == 'e' || *in == 'E') {
574     ++in;
575     if (*in == '+' || *in == '-') {
576       ++in;
577     }
578     auto expPart = in.skipDigits();
579     end = expPart.end();
580   }
581   auto fullNum = range(integral.begin(), end);
582   if (in.getOpts().parse_numbers_as_strings) {
583     return fullNum;
584   }
585   auto val = to<double>(fullNum);
586   return val;
587 }
588 
decodeUnicodeEscape(Input & in)589 std::string decodeUnicodeEscape(Input& in) {
590   auto hexVal = [&](int c) -> uint16_t {
591     // clang-format off
592     return uint16_t(
593         c >= '0' && c <= '9' ? c - '0' :
594         c >= 'a' && c <= 'f' ? c - 'a' + 10 :
595         c >= 'A' && c <= 'F' ? c - 'A' + 10 :
596         (in.error("invalid hex digit"), 0));
597     // clang-format on
598   };
599 
600   auto readHex = [&]() -> uint16_t {
601     if (in.size() < 4) {
602       in.error("expected 4 hex digits");
603     }
604 
605     auto ret = uint16_t(hexVal(*in) * 4096);
606     ++in;
607     ret += hexVal(*in) * 256;
608     ++in;
609     ret += hexVal(*in) * 16;
610     ++in;
611     ret += hexVal(*in);
612     ++in;
613     return ret;
614   };
615 
616   //  If the value encoded is in the surrogate pair range, we need to make
617   //  sure there is another escape that we can use also.
618   //
619   //  See the explanation in folly/Unicode.h.
620   uint16_t prefix = readHex();
621   char32_t codePoint = prefix;
622   if (utf16_code_unit_is_high_surrogate(prefix)) {
623     if (!in.consume("\\u")) {
624       in.error(
625           "expected another unicode escape for second half of "
626           "surrogate pair");
627     }
628     uint16_t suffix = readHex();
629     if (!utf16_code_unit_is_low_surrogate(suffix)) {
630       in.error("second character in surrogate pair is invalid");
631     }
632     codePoint = unicode_code_point_from_utf16_surrogate_pair(prefix, suffix);
633   } else if (!utf16_code_unit_is_bmp(prefix)) {
634     in.error("invalid unicode code point (in range [0xdc00,0xdfff])");
635   }
636 
637   return codePointToUtf8(codePoint);
638 }
639 
parseString(Input & in)640 std::string parseString(Input& in) {
641   DCHECK_EQ(*in, '\"');
642   ++in;
643 
644   std::string ret;
645   for (;;) {
646     auto range = in.skipWhile([](char c) { return c != '\"' && c != '\\'; });
647     ret.append(range.begin(), range.end());
648 
649     if (*in == '\"') {
650       ++in;
651       break;
652     }
653     if (*in == '\\') {
654       ++in;
655       switch (*in) {
656         // clang-format off
657         case '\"':    ret.push_back('\"'); ++in; break;
658         case '\\':    ret.push_back('\\'); ++in; break;
659         case '/':     ret.push_back('/');  ++in; break;
660         case 'b':     ret.push_back('\b'); ++in; break;
661         case 'f':     ret.push_back('\f'); ++in; break;
662         case 'n':     ret.push_back('\n'); ++in; break;
663         case 'r':     ret.push_back('\r'); ++in; break;
664         case 't':     ret.push_back('\t'); ++in; break;
665         case 'u':     ++in; ret += decodeUnicodeEscape(in); break;
666         // clang-format on
667         default:
668           in.error(
669               to<std::string>("unknown escape ", *in, " in string").c_str());
670       }
671       continue;
672     }
673     if (*in == EOF) {
674       in.error("unterminated string");
675     }
676     if (!*in) {
677       /*
678        * Apparently we're actually supposed to ban all control
679        * characters from strings.  This seems unnecessarily
680        * restrictive, so we're only banning zero bytes.  (Since the
681        * string is presumed to be UTF-8 encoded it's fine to just
682        * check this way.)
683        */
684       in.error("null byte in string");
685     }
686 
687     ret.push_back(char(*in));
688     ++in;
689   }
690 
691   return ret;
692 }
693 
parseValue(Input & in,json::metadata_map * map)694 dynamic parseValue(Input& in, json::metadata_map* map) {
695   RecursionGuard guard(in);
696 
697   in.skipWhitespace();
698   // clang-format off
699   return
700       *in == '[' ? parseArray(in, map) :
701       *in == '{' ? parseObject(in, map) :
702       *in == '\"' ? parseString(in) :
703       (*in == '-' || (*in >= '0' && *in <= '9')) ? parseNumber(in) :
704       in.consume("true") ? true :
705       in.consume("false") ? false :
706       in.consume("null") ? nullptr :
707       in.consume("Infinity") ?
708       (in.getOpts().parse_numbers_as_strings ? (dynamic)"Infinity" :
709         (dynamic)std::numeric_limits<double>::infinity()) :
710       in.consume("NaN") ?
711         (in.getOpts().parse_numbers_as_strings ? (dynamic)"NaN" :
712           (dynamic)std::numeric_limits<double>::quiet_NaN()) :
713       in.error("expected json value");
714   // clang-format on
715 }
716 
717 } // namespace
718 
719 //////////////////////////////////////////////////////////////////////
720 
buildExtraAsciiToEscapeBitmap(StringPiece chars)721 std::array<uint64_t, 2> buildExtraAsciiToEscapeBitmap(StringPiece chars) {
722   std::array<uint64_t, 2> escapes{{0, 0}};
723   for (auto b : ByteRange(chars)) {
724     if (b >= 0x20 && b < 0x80) {
725       escapes[b / 64] |= uint64_t(1) << (b % 64);
726     }
727   }
728   return escapes;
729 }
730 
serialize(dynamic const & dyn,serialization_opts const & opts)731 std::string serialize(dynamic const& dyn, serialization_opts const& opts) {
732   std::string ret;
733   unsigned indentLevel = 0;
734   Printer p(ret, opts.pretty_formatting ? &indentLevel : nullptr, &opts);
735   p(dyn, nullptr);
736   return ret;
737 }
738 
739 // Fast path to determine the longest prefix that can be left
740 // unescaped in a string of sizeof(T) bytes packed in an integer of
741 // type T.
742 template <bool EnableExtraAsciiEscapes, class T>
firstEscapableInWord(T s,const serialization_opts & opts)743 size_t firstEscapableInWord(T s, const serialization_opts& opts) {
744   static_assert(std::is_unsigned<T>::value, "Unsigned integer required");
745   static constexpr T kOnes = ~T() / 255; // 0x...0101
746   static constexpr T kMsbs = kOnes * 0x80; // 0x...8080
747 
748   // Sets the MSB of bytes < b. Precondition: b < 128.
749   auto isLess = [](T w, uint8_t b) {
750     // A byte is < b iff subtracting b underflows, so we check that
751     // the MSB wasn't set before and it's set after the subtraction.
752     return (w - kOnes * b) & ~w & kMsbs;
753   };
754 
755   auto isChar = [&](uint8_t c) {
756     // A byte is == c iff it is 0 if xor'd with c.
757     return isLess(s ^ (kOnes * c), 1);
758   };
759 
760   // The following masks have the MSB set for each byte of the word
761   // that satisfies the corresponding condition.
762   auto isHigh = s & kMsbs; // >= 128
763   auto isLow = isLess(s, 0x20); // <= 0x1f
764   auto needsEscape = isHigh | isLow | isChar('\\') | isChar('"');
765 
766   if /* constexpr */ (EnableExtraAsciiEscapes) {
767     // Deal with optional bitmap for unicode escapes. Escapes can optionally be
768     // set for ascii characters 32 - 127, so the inner loop may run up to 96
769     // times. However, for the case where 0 or a handful of bits are set,
770     // looping will be minimal through use of findFirstSet.
771     for (size_t i = 0; i < opts.extra_ascii_to_escape_bitmap.size(); ++i) {
772       const auto offset = i * 64;
773       // Clear first 32 characters if this is the first index, since those are
774       // always escaped.
775       auto bitmap = opts.extra_ascii_to_escape_bitmap[i] &
776           (i == 0 ? uint64_t(-1) << 32 : ~0UL);
777       while (bitmap) {
778         auto bit = folly::findFirstSet(bitmap);
779         needsEscape |= isChar(static_cast<uint8_t>(offset + bit - 1));
780         bitmap &= bitmap - 1;
781       }
782     }
783   }
784 
785   if (!needsEscape) {
786     return sizeof(T);
787   }
788 
789   if (folly::kIsLittleEndian) {
790     return folly::findFirstSet(needsEscape) / 8 - 1;
791   } else {
792     return sizeof(T) - folly::findLastSet(needsEscape) / 8;
793   }
794 }
795 
// Escape a string so that it is legal to print it in JSON text.
//
// Appends `input` to `out` as a double-quoted JSON string literal. Runs of
// bytes that need no escaping are copied verbatim using a word-at-a-time
// scan; every other byte is handled individually below.
//
// EnableExtraAsciiEscapes selects whether opts.extra_ascii_to_escape_bitmap
// is consulted; escapeString() picks the instantiation.
template <bool EnableExtraAsciiEscapes>
void escapeStringImpl(
    StringPiece input, std::string& out, const serialization_opts& opts) {
  // Lowercase hex digit for a value in [0, 15].
  auto hexDigit = [](uint8_t c) -> char {
    return c < 10 ? c + '0' : c - 10 + 'a';
  };

  out.push_back('\"');

  // p: next byte to emit. q: how far utf8 validation has progressed (only
  // used when validating without encode_non_ascii). e: end of input.
  auto* p = reinterpret_cast<const unsigned char*>(input.begin());
  auto* q = reinterpret_cast<const unsigned char*>(input.begin());
  auto* e = reinterpret_cast<const unsigned char*>(input.end());

  while (p < e) {
    // Find the longest prefix that does not need escaping, and copy
    // it literally into the output string.
    auto firstEsc = p;
    while (firstEsc < e) {
      auto avail = to_unsigned(e - firstEsc);
      uint64_t word = 0;
      if (avail >= 8) {
        word = folly::loadUnaligned<uint64_t>(firstEsc);
      } else {
        // Fewer than 8 bytes left: load only what is available.
        word = folly::partialLoadUnaligned<uint64_t>(firstEsc, avail);
      }
      auto prefix = firstEscapableInWord<EnableExtraAsciiEscapes>(word, opts);
      DCHECK_LE(prefix, avail);
      firstEsc += prefix;
      // A short prefix means a byte inside this word needs attention.
      if (prefix < 8) {
        break;
      }
    }
    if (firstEsc > p) {
      out.append(reinterpret_cast<const char*>(p), firstEsc - p);
      p = firstEsc;
      // We can't be in the middle of a multibyte sequence, so we can reset q.
      q = p;
      if (p == e) {
        break;
      }
    }

    // Handle the next byte that may need escaping.

    // Since non-ascii encoding inherently does utf8 validation
    // we explicitly validate utf8 only if non-ascii encoding is disabled.
    if ((opts.validate_utf8 || opts.skip_invalid_utf8) &&
        !opts.encode_non_ascii) {
      // To achieve better spatial and temporal coherence
      // we do utf8 validation progressively along with the
      // string-escaping instead of two separate passes.

      // As the encoding progresses, q will stay at or ahead of p.
      CHECK_GE(q, p);

      // As p catches up with q, move q forward.
      if (q == p) {
        // calling utf8ToCodePoint has the side effect of
        // checking that utf8 encodings are valid
        char32_t v = utf8ToCodePoint(q, e, opts.skip_invalid_utf8);
        if (opts.skip_invalid_utf8 && v == U'\ufffd') {
          // Emit U+FFFD and resume right after the bytes q has consumed.
          out.append(reinterpret_cast<const char*>(u8"\ufffd"));
          p = q;
          continue;
        }
      }
    }

    // Emit a \uXXXX escape for non-ascii bytes when encode_non_ascii is set,
    // and for ascii characters selected by the extra-escape bitmap.
    auto encodeUnicode = opts.encode_non_ascii && (*p & 0x80);
    if /* constexpr */ (EnableExtraAsciiEscapes) {
      encodeUnicode = encodeUnicode ||
          (*p >= 0x20 && *p < 0x80 &&
           (opts.extra_ascii_to_escape_bitmap[*p / 64] &
            (uint64_t(1) << (*p % 64))));
    }

    if (encodeUnicode) {
      // note that this if condition captures utf8 chars
      // with value > 127, so size > 1 byte (or they are whitelisted for
      // Unicode encoding).
      // NOTE: char32_t / char16_t are both unsigned.
      // utf8ToCodePoint advances p past the decoded sequence.
      char32_t cp = utf8ToCodePoint(p, e, opts.skip_invalid_utf8);
      // Appends one six-character \uXXXX escape for the UTF-16 code unit v.
      auto writeHex = [&](char16_t v) {
        char buf[] = "\\u\0\0\0\0";
        buf[2] = hexDigit((v >> 12) & 0x0f);
        buf[3] = hexDigit((v >> 8) & 0x0f);
        buf[4] = hexDigit((v >> 4) & 0x0f);
        buf[5] = hexDigit(v & 0x0f);
        out.append(buf, 6);
      };
      // From the ECMA-404 The JSON Data Interchange Syntax 2nd Edition Dec 2017
      if (cp < 0x10000u) {
        // If the code point is in the Basic Multilingual Plane (U+0000 through
        // U+FFFF), then it may be represented as a six-character sequence:
        // a reverse solidus, followed by the lowercase letter u, followed by
        // four hexadecimal digits that encode the code point.
        writeHex(static_cast<char16_t>(cp));
      } else {
        // To escape a code point that is not in the Basic Multilingual Plane,
        // the character may be represented as a twelve-character sequence,
        // encoding the UTF-16 surrogate pair corresponding to the code point.
        writeHex(static_cast<char16_t>(
            0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu)));
        writeHex(static_cast<char16_t>(0xdc00u + ((cp - 0x10000u) & 0x3ffu)));
      }
    } else if (*p == '\\' || *p == '\"') {
      // Backslash and double quote are escaped with a leading backslash.
      char buf[] = "\\\0";
      buf[1] = char(*p++);
      out.append(buf, 2);
    } else if (*p <= 0x1f) {
      // Control characters: use the short escape where JSON defines one,
      // otherwise \u00XX.
      switch (*p) {
        // clang-format off
        case '\b': out.append("\\b"); p++; break;
        case '\f': out.append("\\f"); p++; break;
        case '\n': out.append("\\n"); p++; break;
        case '\r': out.append("\\r"); p++; break;
        case '\t': out.append("\\t"); p++; break;
        // clang-format on
        default:
          // Note that this if condition captures non readable chars
          // with value < 32, so size = 1 byte (e.g control chars).
          char buf[] = "\\u00\0\0";
          buf[4] = hexDigit(uint8_t((*p & 0xf0) >> 4));
          buf[5] = hexDigit(uint8_t(*p & 0xf));
          out.append(buf, 6);
          p++;
      }
    } else {
      // Anything else is copied through unchanged, one byte at a time.
      out.push_back(char(*p++));
    }
  }

  out.push_back('\"');
}
931 
escapeString(StringPiece input,std::string & out,const serialization_opts & opts)932 void escapeString(
933     StringPiece input, std::string& out, const serialization_opts& opts) {
934   if (FOLLY_UNLIKELY(
935           opts.extra_ascii_to_escape_bitmap[0] ||
936           opts.extra_ascii_to_escape_bitmap[1])) {
937     escapeStringImpl<true>(input, out, opts);
938   } else {
939     escapeStringImpl<false>(input, out, opts);
940   }
941 }
942 
stripComments(StringPiece jsonC)943 std::string stripComments(StringPiece jsonC) {
944   std::string result;
945   enum class State {
946     None,
947     InString,
948     InlineComment,
949     LineComment
950   } state = State::None;
951 
952   for (size_t i = 0; i < jsonC.size(); ++i) {
953     auto s = jsonC.subpiece(i);
954     switch (state) {
955       case State::None:
956         if (s.startsWith("/*")) {
957           state = State::InlineComment;
958           ++i;
959           continue;
960         } else if (s.startsWith("//")) {
961           state = State::LineComment;
962           ++i;
963           continue;
964         } else if (s[0] == '\"') {
965           state = State::InString;
966         }
967         result.push_back(s[0]);
968         break;
969       case State::InString:
970         if (s[0] == '\\') {
971           if (UNLIKELY(s.size() == 1)) {
972             throw std::logic_error("Invalid JSONC: string is not terminated");
973           }
974           result.push_back(s[0]);
975           result.push_back(s[1]);
976           ++i;
977           continue;
978         } else if (s[0] == '\"') {
979           state = State::None;
980         }
981         result.push_back(s[0]);
982         break;
983       case State::InlineComment:
984         if (s.startsWith("*/")) {
985           state = State::None;
986           ++i;
987         }
988         break;
989       case State::LineComment:
990         if (s[0] == '\n') {
991           // skip the line break. It doesn't matter.
992           state = State::None;
993         }
994         break;
995       default:
996         throw std::logic_error("Unknown comment state");
997     }
998   }
999   return result;
1000 }
1001 
1002 } // namespace json
1003 
1004 //////////////////////////////////////////////////////////////////////
1005 
parseJsonWithMetadata(StringPiece range,json::metadata_map * map)1006 dynamic parseJsonWithMetadata(StringPiece range, json::metadata_map* map) {
1007   return parseJsonWithMetadata(range, json::serialization_opts(), map);
1008 }
1009 
parseJsonWithMetadata(StringPiece range,json::serialization_opts const & opts,json::metadata_map * map)1010 dynamic parseJsonWithMetadata(
1011     StringPiece range,
1012     json::serialization_opts const& opts,
1013     json::metadata_map* map) {
1014   json::Input in(range, &opts);
1015 
1016   uint32_t n = in.getLineNum();
1017   auto ret = parseValue(in, map);
1018   if (map) {
1019     map->emplace(&ret, json::parse_metadata{{{0}}, {{n}}});
1020   }
1021 
1022   in.skipWhitespace();
1023   if (in.size() && *in != '\0') {
1024     in.error("parsing didn't consume all input");
1025   }
1026   return ret;
1027 }
1028 
parseJson(StringPiece range)1029 dynamic parseJson(StringPiece range) {
1030   return parseJson(range, json::serialization_opts());
1031 }
1032 
parseJson(StringPiece range,json::serialization_opts const & opts)1033 dynamic parseJson(StringPiece range, json::serialization_opts const& opts) {
1034   json::Input in(range, &opts);
1035 
1036   auto ret = parseValue(in, nullptr);
1037   in.skipWhitespace();
1038   if (in.size() && *in != '\0') {
1039     in.error("parsing didn't consume all input");
1040   }
1041   return ret;
1042 }
1043 
toJson(dynamic const & dyn)1044 std::string toJson(dynamic const& dyn) {
1045   return json::serialize(dyn, json::serialization_opts());
1046 }
1047 
toPrettyJson(dynamic const & dyn)1048 std::string toPrettyJson(dynamic const& dyn) {
1049   json::serialization_opts opts;
1050   opts.pretty_formatting = true;
1051   opts.sort_keys = true;
1052   return json::serialize(dyn, opts);
1053 }
1054 
1055 //////////////////////////////////////////////////////////////////////
// dynamic::print_as_pseudo_json() is implemented here for header
// ordering reasons: most of the dynamic implementation lives in
// dynamic-inl.h, and we don't want that header to include json.h.
1059 
print_as_pseudo_json(std::ostream & out) const1060 void dynamic::print_as_pseudo_json(std::ostream& out) const {
1061   json::serialization_opts opts;
1062   opts.allow_non_string_keys = true;
1063   opts.allow_nan_inf = true;
1064   out << json::serialize(*this, opts);
1065 }
1066 
PrintTo(const dynamic & dyn,std::ostream * os)1067 void PrintTo(const dynamic& dyn, std::ostream* os) {
1068   json::serialization_opts opts;
1069   opts.allow_nan_inf = true;
1070   opts.allow_non_string_keys = true;
1071   opts.pretty_formatting = true;
1072   opts.sort_keys = true;
1073   *os << json::serialize(dyn, opts);
1074 }
1075 
1076 //////////////////////////////////////////////////////////////////////
1077 
1078 } // namespace folly
1079