1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * https://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
#include <boost/algorithm/string/replace.hpp>
#include <cstring>
#include <limits>
#include <sstream>
#include <utility>
21
22 #include "Compiler.hh"
23 #include "Schema.hh"
24 #include "Stream.hh"
25 #include "Types.hh"
26 #include "ValidSchema.hh"
27
28 #include "json/JsonDom.hh"
29
30 using std::make_pair;
31 using std::map;
32 using std::pair;
33 using std::string;
34 using std::vector;
35
36 namespace avro {
37 using json::Array;
38 using json::Entity;
39 using json::EntityType;
40 using json::Object;
41
42 using SymbolTable = map<Name, NodePtr>;
43
44 // #define DEBUG_VERBOSE
45
makePrimitive(const string & t)46 static NodePtr makePrimitive(const string &t) {
47 if (t == "null") {
48 return NodePtr(new NodePrimitive(AVRO_NULL));
49 } else if (t == "boolean") {
50 return NodePtr(new NodePrimitive(AVRO_BOOL));
51 } else if (t == "int") {
52 return NodePtr(new NodePrimitive(AVRO_INT));
53 } else if (t == "long") {
54 return NodePtr(new NodePrimitive(AVRO_LONG));
55 } else if (t == "float") {
56 return NodePtr(new NodePrimitive(AVRO_FLOAT));
57 } else if (t == "double") {
58 return NodePtr(new NodePrimitive(AVRO_DOUBLE));
59 } else if (t == "string") {
60 return NodePtr(new NodePrimitive(AVRO_STRING));
61 } else if (t == "bytes") {
62 return NodePtr(new NodePrimitive(AVRO_BYTES));
63 } else {
64 return NodePtr();
65 }
66 }
67
// Forward declaration: this overload is defined further down in this file,
// but the field/array/map helpers that precede its definition call it.
static NodePtr makeNode(const json::Entity &e, SymbolTable &st, const string &ns);
69
70 template<typename T>
asSingleAttribute(const T & t)71 concepts::SingleAttribute<T> asSingleAttribute(const T &t) {
72 concepts::SingleAttribute<T> n;
73 n.add(t);
74 return n;
75 }
76
/**
 * Tells whether a name contains a '.' character, which this file treats as
 * the marker of an already namespace-qualified name.
 *
 * @param s the name to inspect.
 * @return true if s contains at least one '.', false otherwise.
 */
static bool isFullName(const string &s) {
    const string::size_type firstDot = s.find('.');
    return firstDot != string::npos;
}
80
getName(const string & name,const string & ns)81 static Name getName(const string &name, const string &ns) {
82 return (isFullName(name)) ? Name(name) : Name(name, ns);
83 }
84
makeNode(const string & t,SymbolTable & st,const string & ns)85 static NodePtr makeNode(const string &t, SymbolTable &st, const string &ns) {
86 NodePtr result = makePrimitive(t);
87 if (result) {
88 return result;
89 }
90 Name n = getName(t, ns);
91
92 auto it = st.find(n);
93 if (it != st.end()) {
94 return NodePtr(new NodeSymbolic(asSingleAttribute(n), it->second));
95 }
96 throw Exception(boost::format("Unknown type: %1%") % n.fullname());
97 }
98
99 /** Returns "true" if the field is in the container */
100 // e.g.: can be false for non-mandatory fields
containsField(const Object & m,const string & fieldName)101 bool containsField(const Object &m, const string &fieldName) {
102 auto it = m.find(fieldName);
103 return (it != m.end());
104 }
105
// Forward declaration: defined near the end of this file, but used by the
// typed field accessors that follow.
json::Object::const_iterator findField(const Entity &e,
                                       const Object &m, const string &fieldName);
108
109 template<typename T>
ensureType(const Entity & e,const string & name)110 void ensureType(const Entity &e, const string &name) {
111 if (e.type() != json::type_traits<T>::type()) {
112 throw Exception(boost::format("Json field \"%1%\" is not a %2%: %3%") % name % json::type_traits<T>::name() % e.toString());
113 }
114 }
115
getStringField(const Entity & e,const Object & m,const string & fieldName)116 string getStringField(const Entity &e, const Object &m,
117 const string &fieldName) {
118 auto it = findField(e, m, fieldName);
119 ensureType<string>(it->second, fieldName);
120 return it->second.stringValue();
121 }
122
// Forward declaration: defined near the end of this file, but used by the
// record and enum builders that come before the definition.
const Array &getArrayField(const Entity &e, const Object &m,
                           const string &fieldName);
125
getLongField(const Entity & e,const Object & m,const string & fieldName)126 int64_t getLongField(const Entity &e, const Object &m,
127 const string &fieldName) {
128 auto it = findField(e, m, fieldName);
129 ensureType<int64_t>(it->second, fieldName);
130 return it->second.longValue();
131 }
132
// Replaces every backslash-quote pair (\") in s with a bare double quote ("),
// in place, for de-serialization. This complements NodeImpl::escape(), which
// is used for serialization.
// The scan runs left to right and resumes after each replacement, so text
// that has already been replaced is never examined again.
static void unescape(string &s) {
    static const char kEscapedQuote[] = "\\\"";
    string::size_type pos = s.find(kEscapedQuote);
    while (pos != string::npos) {
        // Collapse the two-character escape into the single quote character.
        s.replace(pos, 2, 1, '"');
        pos = s.find(kEscapedQuote, pos + 1);
    }
}
138
getDocField(const Entity & e,const Object & m)139 string getDocField(const Entity &e, const Object &m) {
140 string doc = getStringField(e, m, "doc");
141 unescape(doc);
142 return doc;
143 }
144
145 struct Field {
146 const string name;
147 const NodePtr schema;
148 const GenericDatum defaultValue;
Fieldavro::Field149 Field(string n, NodePtr v, GenericDatum dv) : name(std::move(n)), schema(std::move(v)), defaultValue(std::move(dv)) {}
150 };
151
assertType(const Entity & e,EntityType et)152 static void assertType(const Entity &e, EntityType et) {
153 if (e.type() != et) {
154 throw Exception(boost::format("Unexpected type for default value: "
155 "Expected %1%, but found %2% in line %3%")
156 % json::typeToString(et) % json::typeToString(e.type()) % e.line());
157 }
158 }
159
/**
 * Copies the characters of a string into a byte vector, one byte per
 * character.
 *
 * @param s the source string; may be empty.
 * @return a vector of s.size() bytes holding the converted characters.
 */
static vector<uint8_t> toBin(const string &s) {
    return vector<uint8_t>(s.begin(), s.end());
}
167
/**
 * Converts the JSON representation of a default value into a GenericDatum
 * that conforms to the schema node n.
 *
 * Composite schemas (record, array, map, union) are handled by recursing on
 * the child nodes of n.
 *
 * @param n  schema node describing the expected value. A symbolic node is
 *           first replaced by the node registered under its name in st.
 * @param e  the JSON entity holding the value.
 * @param st symbol table used to resolve symbolic nodes.
 * @return the datum built from e.
 * @throws Exception if e does not have the JSON type required by the
 *         schema, if a record default lacks a value for one of the record's
 *         fields, or if the schema type is not handled by the switch.
 */
static GenericDatum makeGenericDatum(NodePtr n,
                                     const Entity &e, const SymbolTable &st) {
    Type t = n->type();
    EntityType dt = e.type();

    if (t == AVRO_SYMBOLIC) {
        // Replace the symbolic reference by the node it is registered under.
        // NOTE(review): the result of find() is not compared against
        // st.end(); this presumably relies on symbolic nodes only being
        // created for names already in the table - confirm.
        n = st.find(n->name())->second;
        t = n->type();
    }
    switch (t) {
        case AVRO_STRING:
            assertType(e, json::EntityType::String);
            return GenericDatum(e.stringValue());
        case AVRO_BYTES:
            // Bytes defaults are written as JSON strings.
            assertType(e, json::EntityType::String);
            return GenericDatum(toBin(e.bytesValue()));
        case AVRO_INT:
            assertType(e, json::EntityType::Long);
            // NOTE(review): the 64-bit JSON value is narrowed to int32_t
            // without a range check in this function - confirm that
            // out-of-range defaults are rejected elsewhere or are acceptable.
            return GenericDatum(static_cast<int32_t>(e.longValue()));
        case AVRO_LONG:
            assertType(e, json::EntityType::Long);
            return GenericDatum(e.longValue());
        case AVRO_FLOAT:
            // An integral JSON number is accepted and converted to float.
            if (dt == json::EntityType::Long) {
                return GenericDatum(static_cast<float>(e.longValue()));
            }
            assertType(e, json::EntityType::Double);
            return GenericDatum(static_cast<float>(e.doubleValue()));
        case AVRO_DOUBLE:
            // An integral JSON number is accepted and converted to double.
            if (dt == json::EntityType::Long) {
                return GenericDatum(static_cast<double>(e.longValue()));
            }
            assertType(e, json::EntityType::Double);
            return GenericDatum(e.doubleValue());
        case AVRO_BOOL:
            assertType(e, json::EntityType::Bool);
            return GenericDatum(e.boolValue());
        case AVRO_NULL:
            assertType(e, json::EntityType::Null);
            return GenericDatum();
        case AVRO_RECORD: {
            assertType(e, json::EntityType::Obj);
            GenericRecord result(n);
            const map<string, Entity> &v = e.objectValue();
            // Every field of the record must have a value in the JSON object.
            for (size_t i = 0; i < n->leaves(); ++i) {
                auto it = v.find(n->nameAt(i));
                if (it == v.end()) {
                    throw Exception(boost::format(
                                        "No value found in default for %1%")
                                    % n->nameAt(i));
                }
                result.setFieldAt(i,
                                  makeGenericDatum(n->leafAt(i), it->second, st));
            }
            return GenericDatum(n, result);
        }
        case AVRO_ENUM:
            assertType(e, json::EntityType::String);
            return GenericDatum(n, GenericEnum(n, e.stringValue()));
        case AVRO_ARRAY: {
            assertType(e, json::EntityType::Arr);
            GenericArray result(n);
            const vector<Entity> &elements = e.arrayValue();
            // Each element is converted against the item schema, leaf 0.
            for (const auto &element : elements) {
                result.value().push_back(makeGenericDatum(n->leafAt(0), element, st));
            }
            return GenericDatum(n, result);
        }
        case AVRO_MAP: {
            assertType(e, json::EntityType::Obj);
            GenericMap result(n);
            const map<string, Entity> &v = e.objectValue();
            // Each member value is converted against leaf 1 of the map node.
            for (const auto &it : v) {
                result.value().push_back(make_pair(it.first,
                                                   makeGenericDatum(n->leafAt(1), it.second, st)));
            }
            return GenericDatum(n, result);
        }
        case AVRO_UNION: {
            // The value is always interpreted against the first branch of
            // the union; other branches are never tried.
            GenericUnion result(n);
            result.selectBranch(0);
            result.datum() = makeGenericDatum(n->leafAt(0), e, st);
            return GenericDatum(n, result);
        }
        case AVRO_FIXED:
            // Fixed defaults are written as JSON strings, like bytes.
            assertType(e, json::EntityType::String);
            return GenericDatum(n, GenericFixed(n, toBin(e.bytesValue())));
        default: throw Exception(boost::format("Unknown type: %1%") % t);
    }
}
258
makeField(const Entity & e,SymbolTable & st,const string & ns)259 static Field makeField(const Entity &e, SymbolTable &st, const string &ns) {
260 const Object &m = e.objectValue();
261 const string &n = getStringField(e, m, "name");
262 auto it = findField(e, m, "type");
263 auto it2 = m.find("default");
264 NodePtr node = makeNode(it->second, st, ns);
265 if (containsField(m, "doc")) {
266 node->setDoc(getDocField(e, m));
267 }
268 GenericDatum d = (it2 == m.end()) ? GenericDatum() : makeGenericDatum(node, it2->second, st);
269 return Field(n, node, d);
270 }
271
272 // Extended makeRecordNode (with doc).
makeRecordNode(const Entity & e,const Name & name,const string * doc,const Object & m,SymbolTable & st,const string & ns)273 static NodePtr makeRecordNode(const Entity &e, const Name &name,
274 const string *doc, const Object &m,
275 SymbolTable &st, const string &ns) {
276 const Array &v = getArrayField(e, m, "fields");
277 concepts::MultiAttribute<string> fieldNames;
278 concepts::MultiAttribute<NodePtr> fieldValues;
279 vector<GenericDatum> defaultValues;
280
281 for (const auto &it : v) {
282 Field f = makeField(it, st, ns);
283 fieldNames.add(f.name);
284 fieldValues.add(f.schema);
285 defaultValues.push_back(f.defaultValue);
286 }
287 NodeRecord *node;
288 if (doc == nullptr) {
289 node = new NodeRecord(asSingleAttribute(name), fieldValues, fieldNames,
290 defaultValues);
291 } else {
292 node = new NodeRecord(asSingleAttribute(name), asSingleAttribute(*doc),
293 fieldValues, fieldNames, defaultValues);
294 }
295 return NodePtr(node);
296 }
297
makeLogicalType(const Entity & e,const Object & m)298 static LogicalType makeLogicalType(const Entity &e, const Object &m) {
299 if (!containsField(m, "logicalType")) {
300 return LogicalType(LogicalType::NONE);
301 }
302
303 const std::string &typeField = getStringField(e, m, "logicalType");
304
305 if (typeField == "decimal") {
306 LogicalType decimalType(LogicalType::DECIMAL);
307 try {
308 decimalType.setPrecision(getLongField(e, m, "precision"));
309 if (containsField(m, "scale")) {
310 decimalType.setScale(getLongField(e, m, "scale"));
311 }
312 } catch (Exception &ex) {
313 // If any part of the logical type is malformed, per the standard we
314 // must ignore the whole attribute.
315 return LogicalType(LogicalType::NONE);
316 }
317 return decimalType;
318 }
319
320 LogicalType::Type t = LogicalType::NONE;
321 if (typeField == "date")
322 t = LogicalType::DATE;
323 else if (typeField == "time-millis")
324 t = LogicalType::TIME_MILLIS;
325 else if (typeField == "time-micros")
326 t = LogicalType::TIME_MICROS;
327 else if (typeField == "timestamp-millis")
328 t = LogicalType::TIMESTAMP_MILLIS;
329 else if (typeField == "timestamp-micros")
330 t = LogicalType::TIMESTAMP_MICROS;
331 else if (typeField == "duration")
332 t = LogicalType::DURATION;
333 else if (typeField == "uuid")
334 t = LogicalType::UUID;
335 return LogicalType(t);
336 }
337
makeEnumNode(const Entity & e,const Name & name,const Object & m)338 static NodePtr makeEnumNode(const Entity &e,
339 const Name &name, const Object &m) {
340 const Array &v = getArrayField(e, m, "symbols");
341 concepts::MultiAttribute<string> symbols;
342 for (const auto &it : v) {
343 if (it.type() != json::EntityType::String) {
344 throw Exception(boost::format("Enum symbol not a string: %1%") % it.toString());
345 }
346 symbols.add(it.stringValue());
347 }
348 NodePtr node = NodePtr(new NodeEnum(asSingleAttribute(name), symbols));
349 if (containsField(m, "doc")) {
350 node->setDoc(getDocField(e, m));
351 }
352 return node;
353 }
354
makeFixedNode(const Entity & e,const Name & name,const Object & m)355 static NodePtr makeFixedNode(const Entity &e,
356 const Name &name, const Object &m) {
357 int v = static_cast<int>(getLongField(e, m, "size"));
358 if (v <= 0) {
359 throw Exception(boost::format("Size for fixed is not positive: %1%") % e.toString());
360 }
361 NodePtr node =
362 NodePtr(new NodeFixed(asSingleAttribute(name), asSingleAttribute(v)));
363 if (containsField(m, "doc")) {
364 node->setDoc(getDocField(e, m));
365 }
366 return node;
367 }
368
makeArrayNode(const Entity & e,const Object & m,SymbolTable & st,const string & ns)369 static NodePtr makeArrayNode(const Entity &e, const Object &m,
370 SymbolTable &st, const string &ns) {
371 auto it = findField(e, m, "items");
372 NodePtr node = NodePtr(new NodeArray(
373 asSingleAttribute(makeNode(it->second, st, ns))));
374 if (containsField(m, "doc")) {
375 node->setDoc(getDocField(e, m));
376 }
377 return node;
378 }
379
makeMapNode(const Entity & e,const Object & m,SymbolTable & st,const string & ns)380 static NodePtr makeMapNode(const Entity &e, const Object &m,
381 SymbolTable &st, const string &ns) {
382 auto it = findField(e, m, "values");
383
384 NodePtr node = NodePtr(new NodeMap(
385 asSingleAttribute(makeNode(it->second, st, ns))));
386 if (containsField(m, "doc")) {
387 node->setDoc(getDocField(e, m));
388 }
389 return node;
390 }
391
getName(const Entity & e,const Object & m,const string & ns)392 static Name getName(const Entity &e, const Object &m, const string &ns) {
393 const string &name = getStringField(e, m, "name");
394
395 if (isFullName(name)) {
396 return Name(name);
397 } else {
398 auto it = m.find("namespace");
399 if (it != m.end()) {
400 if (it->second.type() != json::type_traits<string>::type()) {
401 throw Exception(boost::format(
402 "Json field \"%1%\" is not a %2%: %3%")
403 % "namespace" % json::type_traits<string>::name() % it->second.toString());
404 }
405 Name result = Name(name, it->second.stringValue());
406 return result;
407 }
408 return Name(name, ns);
409 }
410 }
411
makeNode(const Entity & e,const Object & m,SymbolTable & st,const string & ns)412 static NodePtr makeNode(const Entity &e, const Object &m,
413 SymbolTable &st, const string &ns) {
414 const string &type = getStringField(e, m, "type");
415 NodePtr result;
416 if (type == "record" || type == "error" || type == "enum" || type == "fixed") {
417 Name nm = getName(e, m, ns);
418 if (type == "record" || type == "error") {
419 result = NodePtr(new NodeRecord());
420 st[nm] = result;
421 // Get field doc
422 if (containsField(m, "doc")) {
423 string doc = getDocField(e, m);
424
425 NodePtr r = makeRecordNode(e, nm, &doc, m, st, nm.ns());
426 (std::dynamic_pointer_cast<NodeRecord>(r))->swap(*std::dynamic_pointer_cast<NodeRecord>(result));
427 } else { // No doc
428 NodePtr r =
429 makeRecordNode(e, nm, nullptr, m, st, nm.ns());
430 (std::dynamic_pointer_cast<NodeRecord>(r))
431 ->swap(*std::dynamic_pointer_cast<NodeRecord>(result));
432 }
433 } else {
434 result = (type == "enum") ? makeEnumNode(e, nm, m) : makeFixedNode(e, nm, m);
435 st[nm] = result;
436 }
437 } else if (type == "array") {
438 result = makeArrayNode(e, m, st, ns);
439 } else if (type == "map") {
440 result = makeMapNode(e, m, st, ns);
441 } else {
442 result = makePrimitive(type);
443 }
444
445 if (result) {
446 try {
447 result->setLogicalType(makeLogicalType(e, m));
448 } catch (Exception &ex) {
449 // Per the standard we must ignore the logical type attribute if it
450 // is malformed.
451 }
452 return result;
453 }
454
455 throw Exception(boost::format("Unknown type definition: %1%")
456 % e.toString());
457 }
458
makeNode(const Entity & e,const Array & m,SymbolTable & st,const string & ns)459 static NodePtr makeNode(const Entity &e, const Array &m,
460 SymbolTable &st, const string &ns) {
461 concepts::MultiAttribute<NodePtr> mm;
462 for (const auto &it : m) {
463 mm.add(makeNode(it, st, ns));
464 }
465 return NodePtr(new NodeUnion(mm));
466 }
467
makeNode(const json::Entity & e,SymbolTable & st,const string & ns)468 static NodePtr makeNode(const json::Entity &e, SymbolTable &st, const string &ns) {
469 switch (e.type()) {
470 case json::EntityType::String: return makeNode(e.stringValue(), st, ns);
471 case json::EntityType::Obj: return makeNode(e, e.objectValue(), st, ns);
472 case json::EntityType::Arr: return makeNode(e, e.arrayValue(), st, ns);
473 default: throw Exception(boost::format("Invalid Avro type: %1%") % e.toString());
474 }
475 }
findField(const Entity & e,const Object & m,const string & fieldName)476 json::Object::const_iterator findField(const Entity &e, const Object &m, const string &fieldName) {
477 auto it = m.find(fieldName);
478 if (it == m.end()) {
479 throw Exception(boost::format("Missing Json field \"%1%\": %2%") % fieldName % e.toString());
480 } else {
481 return it;
482 }
483 }
getArrayField(const Entity & e,const Object & m,const string & fieldName)484 const Array &getArrayField(const Entity &e, const Object &m, const string &fieldName) {
485 auto it = findField(e, m, fieldName);
486 ensureType<Array>(it->second, fieldName);
487 return it->second.arrayValue();
488 }
489
compileJsonSchemaFromStream(InputStream & is)490 ValidSchema compileJsonSchemaFromStream(InputStream &is) {
491 json::Entity e = json::loadEntity(is);
492 SymbolTable st;
493 NodePtr n = makeNode(e, st, "");
494 return ValidSchema(n);
495 }
496
compileJsonSchemaFromFile(const char * filename)497 AVRO_DECL ValidSchema compileJsonSchemaFromFile(const char *filename) {
498 std::unique_ptr<InputStream> s = fileInputStream(filename);
499 return compileJsonSchemaFromStream(*s);
500 }
501
compileJsonSchemaFromMemory(const uint8_t * input,size_t len)502 AVRO_DECL ValidSchema compileJsonSchemaFromMemory(const uint8_t *input, size_t len) {
503 return compileJsonSchemaFromStream(*memoryInputStream(input, len));
504 }
505
compileJsonSchemaFromString(const char * input)506 AVRO_DECL ValidSchema compileJsonSchemaFromString(const char *input) {
507 return compileJsonSchemaFromMemory(reinterpret_cast<const uint8_t *>(input),
508 ::strlen(input));
509 }
510
compileJsonSchemaFromString(const string & input)511 AVRO_DECL ValidSchema compileJsonSchemaFromString(const string &input) {
512 return compileJsonSchemaFromMemory(
513 reinterpret_cast<const uint8_t *>(input.data()), input.size());
514 }
515
compile(std::istream & is)516 static ValidSchema compile(std::istream &is) {
517 std::unique_ptr<InputStream> in = istreamInputStream(is);
518 return compileJsonSchemaFromStream(*in);
519 }
520
compileJsonSchema(std::istream & is,ValidSchema & schema)521 void compileJsonSchema(std::istream &is, ValidSchema &schema) {
522 if (!is.good()) {
523 throw Exception("Input stream is not good");
524 }
525
526 schema = compile(is);
527 }
528
compileJsonSchema(std::istream & is,ValidSchema & schema,string & error)529 AVRO_DECL bool compileJsonSchema(std::istream &is, ValidSchema &schema, string &error) {
530 try {
531 compileJsonSchema(is, schema);
532 return true;
533 } catch (const Exception &e) {
534 error = e.what();
535 return false;
536 }
537 }
538
539 } // namespace avro
540