1 #include "simdjson/westmere/begin.h"
2 
3 //
4 // Stage 1
5 //
6 
7 namespace simdjson {
8 namespace SIMDJSON_IMPLEMENTATION {
9 namespace {
10 
11 using namespace simd;
12 
13 struct json_character_block {
14   static simdjson_really_inline json_character_block classify(const simd::simd8x64<uint8_t>& in);
15 
whitespacesimdjson::SIMDJSON_IMPLEMENTATION::__anon554fe1710111::json_character_block16   simdjson_really_inline uint64_t whitespace() const noexcept { return _whitespace; }
opsimdjson::SIMDJSON_IMPLEMENTATION::__anon554fe1710111::json_character_block17   simdjson_really_inline uint64_t op() const noexcept { return _op; }
scalarsimdjson::SIMDJSON_IMPLEMENTATION::__anon554fe1710111::json_character_block18   simdjson_really_inline uint64_t scalar() const noexcept { return ~(op() | whitespace()); }
19 
20   uint64_t _whitespace;
21   uint64_t _op;
22 };
23 
classify(const simd::simd8x64<uint8_t> & in)24 simdjson_really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
25   // These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why
26   // we can't use the generic lookup_16.
27   auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
28 
29   // The 6 operators (:,[]{}) have these values:
30   //
31   // , 2C
32   // : 3A
33   // [ 5B
34   // { 7B
35   // ] 5D
36   // } 7D
37   //
38   // If you use | 0x20 to turn [ and ] into { and }, the lower 4 bits of each character is unique.
39   // We exploit this, using a simd 4-bit lookup to tell us which character match against, and then
40   // match it (against | 0x20).
41   //
42   // To prevent recognizing other characters, everything else gets compared with 0, which cannot
43   // match due to the | 0x20.
44   //
45   // NOTE: Due to the | 0x20, this ALSO treats <FF> and <SUB> (control characters 0C and 1A) like ,
46   // and :. This gets caught in stage 2, which checks the actual character to ensure the right
47   // operators are in the right places.
48   const auto op_table = simd8<uint8_t>::repeat_16(
49     0, 0, 0, 0,
50     0, 0, 0, 0,
51     0, 0, ':', '{', // : = 3A, [ = 5B, { = 7B
52     ',', '}', 0, 0  // , = 2C, ] = 5D, } = 7D
53   );
54 
55   // We compute whitespace and op separately. If the code later only use one or the
56   // other, given the fact that all functions are aggressively inlined, we can
57   // hope that useless computations will be omitted. This is namely case when
58   // minifying (we only need whitespace).
59 
60 
61   const uint64_t whitespace = in.eq({
62     _mm_shuffle_epi8(whitespace_table, in.chunks[0]),
63     _mm_shuffle_epi8(whitespace_table, in.chunks[1]),
64     _mm_shuffle_epi8(whitespace_table, in.chunks[2]),
65     _mm_shuffle_epi8(whitespace_table, in.chunks[3])
66   });
67   // Turn [ and ] into { and }
68   const simd8x64<uint8_t> curlified{
69     in.chunks[0] | 0x20,
70     in.chunks[1] | 0x20,
71     in.chunks[2] | 0x20,
72     in.chunks[3] | 0x20
73   };
74   const uint64_t op = curlified.eq({
75     _mm_shuffle_epi8(op_table, in.chunks[0]),
76     _mm_shuffle_epi8(op_table, in.chunks[1]),
77     _mm_shuffle_epi8(op_table, in.chunks[2]),
78     _mm_shuffle_epi8(op_table, in.chunks[3])
79   });
80     return { whitespace, op };
81 }
82 
is_ascii(const simd8x64<uint8_t> & input)83 simdjson_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
84   return input.reduce_or().is_ascii();
85 }
86 
must_be_continuation(const simd8<uint8_t> prev1,const simd8<uint8_t> prev2,const simd8<uint8_t> prev3)87 simdjson_unused simdjson_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
88   simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0
89   simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
90   simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
91   // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
92   return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
93 }
94 
must_be_2_3_continuation(const simd8<uint8_t> prev2,const simd8<uint8_t> prev3)95 simdjson_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
96   simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
97   simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
98   // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
99   return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
100 }
101 
102 } // unnamed namespace
103 } // namespace SIMDJSON_IMPLEMENTATION
104 } // namespace simdjson
105 
106 #include "generic/stage1/utf8_lookup4_algorithm.h"
107 #include "generic/stage1/json_structural_indexer.h"
108 #include "generic/stage1/utf8_validator.h"
109 
110 //
111 // Stage 2
112 //
113 #include "generic/stage2/tape_builder.h"
114 
115 //
116 // Implementation-specific overrides
117 //
118 
119 namespace simdjson {
120 namespace SIMDJSON_IMPLEMENTATION {
121 namespace {
122 namespace stage1 {
123 
find_escaped(uint64_t backslash)124 simdjson_really_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) {
125   if (!backslash) { uint64_t escaped = prev_escaped; prev_escaped = 0; return escaped; }
126   return find_escaped_branchless(backslash);
127 }
128 
129 } // namespace stage1
130 } // unnamed namespace
131 
minify(const uint8_t * buf,size_t len,uint8_t * dst,size_t & dst_len) const132 simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
133   return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
134 }
135 
stage1(const uint8_t * _buf,size_t _len,bool streaming)136 simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
137   this->buf = _buf;
138   this->len = _len;
139   return westmere::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming);
140 }
141 
validate_utf8(const char * buf,size_t len) const142 simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
143   return westmere::stage1::generic_validate_utf8(buf,len);
144 }
145 
stage2(dom::document & _doc)146 simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
147   return stage2::tape_builder::parse_document<false>(*this, _doc);
148 }
149 
stage2_next(dom::document & _doc)150 simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
151   return stage2::tape_builder::parse_document<true>(*this, _doc);
152 }
153 
parse(const uint8_t * _buf,size_t _len,dom::document & _doc)154 simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
155   auto error = stage1(_buf, _len, false);
156   if (error) { return error; }
157   return stage2(_doc);
158 }
159 
160 } // namespace SIMDJSON_IMPLEMENTATION
161 } // namespace simdjson
162 
163 #include "simdjson/westmere/end.h"
164