1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #pragma once
19 
20 #include <memory>
21 #include <random>
22 #include <string>
23 #include <vector>
24 
25 #include "arrow/json/rapidjson_defs.h"
26 #include "rapidjson/document.h"
27 #include "rapidjson/prettywriter.h"
28 #include "rapidjson/reader.h"
29 #include "rapidjson/writer.h"
30 
31 #include "arrow/io/memory.h"
32 #include "arrow/json/converter.h"
33 #include "arrow/json/options.h"
34 #include "arrow/json/parser.h"
35 #include "arrow/testing/gtest_util.h"
36 #include "arrow/type.h"
37 #include "arrow/util/string_view.h"
38 #include "arrow/visitor_inline.h"
39 
40 namespace arrow {
41 namespace json {
42 
43 namespace rj = arrow::rapidjson;
44 
45 using rj::StringBuffer;
46 using util::string_view;
47 using Writer = rj::Writer<StringBuffer>;
48 
OK(bool ok)49 inline static Status OK(bool ok) { return ok ? Status::OK() : Status::Invalid(""); }
50 
// Forward declaration: writes one random JSON value for `type` to `writer`, drawing
// randomness from the engine `e`. The definition appears further down in this file.
template <typename Engine>
inline static Status Generate(const std::shared_ptr<DataType>& type, Engine& e,
                              Writer* writer);

// Forward declaration: writes one JSON object with a member per entry of `fields`.
// The definition appears further down in this file.
template <typename Engine>
inline static Status Generate(const std::vector<std::shared_ptr<Field>>& fields,
                              Engine& e, Writer* writer);
58 
59 template <typename Engine>
Generate(const std::shared_ptr<Schema> & schm,Engine & e,Writer * writer)60 inline static Status Generate(const std::shared_ptr<Schema>& schm, Engine& e,
61                               Writer* writer) {
62   return Generate(schm->fields(), e, writer);
63 }
64 
65 template <typename Engine>
66 struct GenerateImpl {
VisitGenerateImpl67   Status Visit(const NullType&) { return OK(writer.Null()); }
68 
VisitGenerateImpl69   Status Visit(const BooleanType&) {
70     return OK(writer.Bool(std::uniform_int_distribution<uint16_t>{}(e)&1));
71   }
72 
73   template <typename T>
VisitGenerateImpl74   enable_if_physical_unsigned_integer<T, Status> Visit(const T&) {
75     auto val = std::uniform_int_distribution<>{}(e);
76     return OK(writer.Uint64(static_cast<typename T::c_type>(val)));
77   }
78 
79   template <typename T>
VisitGenerateImpl80   enable_if_physical_signed_integer<T, Status> Visit(const T&) {
81     auto val = std::uniform_int_distribution<>{}(e);
82     return OK(writer.Int64(static_cast<typename T::c_type>(val)));
83   }
84 
85   template <typename T>
VisitGenerateImpl86   enable_if_physical_floating_point<T, Status> Visit(const T&) {
87     auto val = std::normal_distribution<typename T::c_type>{0, 1 << 10}(e);
88     return OK(writer.Double(val));
89   }
90 
91   template <typename T>
VisitGenerateImpl92   enable_if_base_binary<T, Status> Visit(const T&) {
93     auto size = std::poisson_distribution<>{4}(e);
94     std::uniform_int_distribution<uint16_t> gen_char(32, 127);  // FIXME generate UTF8
95     std::string s(size, '\0');
96     for (char& ch : s) ch = static_cast<char>(gen_char(e));
97     return OK(writer.String(s.c_str()));
98   }
99 
100   template <typename T>
VisitGenerateImpl101   enable_if_list_like<T, Status> Visit(const T& t) {
102     auto size = std::poisson_distribution<>{4}(e);
103     writer.StartArray();
104     for (int i = 0; i < size; ++i) RETURN_NOT_OK(Generate(t.value_type(), e, &writer));
105     return OK(writer.EndArray(size));
106   }
107 
VisitGenerateImpl108   Status Visit(const StructType& t) { return Generate(t.fields(), e, &writer); }
109 
VisitGenerateImpl110   Status Visit(const DayTimeIntervalType& t) { return NotImplemented(t); }
111 
VisitGenerateImpl112   Status Visit(const DictionaryType& t) { return NotImplemented(t); }
113 
VisitGenerateImpl114   Status Visit(const ExtensionType& t) { return NotImplemented(t); }
115 
VisitGenerateImpl116   Status Visit(const Decimal128Type& t) { return NotImplemented(t); }
117 
VisitGenerateImpl118   Status Visit(const FixedSizeBinaryType& t) { return NotImplemented(t); }
119 
VisitGenerateImpl120   Status Visit(const UnionType& t) { return NotImplemented(t); }
121 
NotImplementedGenerateImpl122   Status NotImplemented(const DataType& t) {
123     return Status::NotImplemented("random generation of arrays of type ", t);
124   }
125 
126   Engine& e;
127   rj::Writer<rj::StringBuffer>& writer;
128 };
129 
130 template <typename Engine>
Generate(const std::shared_ptr<DataType> & type,Engine & e,Writer * writer)131 inline static Status Generate(const std::shared_ptr<DataType>& type, Engine& e,
132                               Writer* writer) {
133   if (std::uniform_real_distribution<>{0, 1}(e) < .2) {
134     // one out of 5 chance of null, anywhere
135     writer->Null();
136     return Status::OK();
137   }
138   GenerateImpl<Engine> visitor = {e, *writer};
139   return VisitTypeInline(*type, &visitor);
140 }
141 
142 template <typename Engine>
Generate(const std::vector<std::shared_ptr<Field>> & fields,Engine & e,Writer * writer)143 inline static Status Generate(const std::vector<std::shared_ptr<Field>>& fields,
144                               Engine& e, Writer* writer) {
145   RETURN_NOT_OK(OK(writer->StartObject()));
146   for (const auto& f : fields) {
147     writer->Key(f->name().c_str());
148     RETURN_NOT_OK(Generate(f->type(), e, writer));
149   }
150   return OK(writer->EndObject(static_cast<int>(fields.size())));
151 }
152 
MakeStream(string_view src_str,std::shared_ptr<io::InputStream> * out)153 inline static Status MakeStream(string_view src_str,
154                                 std::shared_ptr<io::InputStream>* out) {
155   auto src = std::make_shared<Buffer>(src_str);
156   *out = std::make_shared<io::BufferReader>(src);
157   return Status::OK();
158 }
159 
160 // scalar values (numbers and strings) are parsed into a
161 // dictionary<index:int32, value:string>. This can be decoded for ease of comparison
DecodeStringDictionary(const DictionaryArray & dict_array,std::shared_ptr<Array> * decoded)162 inline static Status DecodeStringDictionary(const DictionaryArray& dict_array,
163                                             std::shared_ptr<Array>* decoded) {
164   const StringArray& dict = static_cast<const StringArray&>(*dict_array.dictionary());
165   const Int32Array& indices = static_cast<const Int32Array&>(*dict_array.indices());
166   StringBuilder builder;
167   RETURN_NOT_OK(builder.Resize(indices.length()));
168   for (int64_t i = 0; i < indices.length(); ++i) {
169     if (indices.IsNull(i)) {
170       builder.UnsafeAppendNull();
171       continue;
172     }
173     auto value = dict.GetView(indices.GetView(i));
174     RETURN_NOT_OK(builder.ReserveData(value.size()));
175     builder.UnsafeAppend(value);
176   }
177   return builder.Finish(decoded);
178 }
179 
ParseFromString(ParseOptions options,string_view src_str,std::shared_ptr<Array> * parsed)180 inline static Status ParseFromString(ParseOptions options, string_view src_str,
181                                      std::shared_ptr<Array>* parsed) {
182   auto src = std::make_shared<Buffer>(src_str);
183   std::unique_ptr<BlockParser> parser;
184   RETURN_NOT_OK(BlockParser::Make(options, &parser));
185   RETURN_NOT_OK(parser->Parse(src));
186   return parser->Finish(parsed);
187 }
188 
PrettyPrint(string_view one_line)189 static inline std::string PrettyPrint(string_view one_line) {
190   rj::Document document;
191 
192   // Must pass size to avoid ASAN issues.
193   document.Parse(one_line.data(), one_line.size());
194   rj::StringBuffer sb;
195   rj::PrettyWriter<rj::StringBuffer> writer(sb);
196   document.Accept(writer);
197   return sb.GetString();
198 }
199 
// Sample input: four JSON objects, one per line, holding only scalar members
// ("hello": number, "world": boolean or null, "yo": string or null; "yo" is absent
// from the second object). The surrounding whitespace is part of the sample.
inline static std::string scalars_only_src() {
  static const char kScalarsOnlyJson[] = R"json(
    { "hello": 3.5, "world": false, "yo": "thing" }
    { "hello": 3.25, "world": null }
    { "hello": 3.125, "world": null, "yo": "\u5fcd" }
    { "hello": 0.0, "world": true, "yo": null }
  )json";
  return kScalarsOnlyJson;
}
208 
// Sample input: four JSON objects, one per line, combining scalar members with
// nested ones ("arr": array of numbers or null, "nuf": object or null). The
// surrounding whitespace is part of the sample.
inline static std::string nested_src() {
  static const char kNestedJson[] = R"json(
    { "hello": 3.5, "world": false, "yo": "thing", "arr": [1, 2, 3], "nuf": {} }
    { "hello": 3.25, "world": null, "arr": [2], "nuf": null }
    { "hello": 3.125, "world": null, "yo": "\u5fcd", "arr": [], "nuf": { "ps": 78 } }
    { "hello": 0.0, "world": true, "yo": null, "arr": null, "nuf": { "ps": 90 } }
  )json";
  return kNestedJson;
}
217 
// Sample input: two JSON objects, one per line, whose members are all null, empty
// containers, or containers holding only nulls. The surrounding whitespace is part
// of the sample.
inline static std::string null_src() {
  static const char kNullJson[] = R"json(
    { "plain": null, "list1": [], "list2": [], "struct": { "plain": null } }
    { "plain": null, "list1": [], "list2": [null], "struct": {} }
  )json";
  return kNullJson;
}
224 
225 }  // namespace json
226 }  // namespace arrow
227