1 /**
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements.  See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership.  The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License.  You may obtain a copy of the License at
9  *
10  *     https://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 #include <boost/algorithm/string/replace.hpp>
19 #include <sstream>
20 #include <utility>
21 
22 #include "Compiler.hh"
23 #include "Schema.hh"
24 #include "Stream.hh"
25 #include "Types.hh"
26 #include "ValidSchema.hh"
27 
28 #include "json/JsonDom.hh"
29 
30 using std::make_pair;
31 using std::map;
32 using std::pair;
33 using std::string;
34 using std::vector;
35 
36 namespace avro {
37 using json::Array;
38 using json::Entity;
39 using json::EntityType;
40 using json::Object;
41 
42 using SymbolTable = map<Name, NodePtr>;
43 
44 // #define DEBUG_VERBOSE
45 
makePrimitive(const string & t)46 static NodePtr makePrimitive(const string &t) {
47     if (t == "null") {
48         return NodePtr(new NodePrimitive(AVRO_NULL));
49     } else if (t == "boolean") {
50         return NodePtr(new NodePrimitive(AVRO_BOOL));
51     } else if (t == "int") {
52         return NodePtr(new NodePrimitive(AVRO_INT));
53     } else if (t == "long") {
54         return NodePtr(new NodePrimitive(AVRO_LONG));
55     } else if (t == "float") {
56         return NodePtr(new NodePrimitive(AVRO_FLOAT));
57     } else if (t == "double") {
58         return NodePtr(new NodePrimitive(AVRO_DOUBLE));
59     } else if (t == "string") {
60         return NodePtr(new NodePrimitive(AVRO_STRING));
61     } else if (t == "bytes") {
62         return NodePtr(new NodePrimitive(AVRO_BYTES));
63     } else {
64         return NodePtr();
65     }
66 }
67 
68 static NodePtr makeNode(const json::Entity &e, SymbolTable &st, const string &ns);
69 
70 template<typename T>
asSingleAttribute(const T & t)71 concepts::SingleAttribute<T> asSingleAttribute(const T &t) {
72     concepts::SingleAttribute<T> n;
73     n.add(t);
74     return n;
75 }
76 
/** Returns true when the name contains a '.', i.e. it carries a namespace. */
static bool isFullName(const string &s) {
    const string::size_type dotPosition = s.find('.');
    return dotPosition != string::npos;
}
80 
getName(const string & name,const string & ns)81 static Name getName(const string &name, const string &ns) {
82     return (isFullName(name)) ? Name(name) : Name(name, ns);
83 }
84 
makeNode(const string & t,SymbolTable & st,const string & ns)85 static NodePtr makeNode(const string &t, SymbolTable &st, const string &ns) {
86     NodePtr result = makePrimitive(t);
87     if (result) {
88         return result;
89     }
90     Name n = getName(t, ns);
91 
92     auto it = st.find(n);
93     if (it != st.end()) {
94         return NodePtr(new NodeSymbolic(asSingleAttribute(n), it->second));
95     }
96     throw Exception(boost::format("Unknown type: %1%") % n.fullname());
97 }
98 
99 /** Returns "true" if the field is in the container */
100 // e.g.: can be false for non-mandatory fields
containsField(const Object & m,const string & fieldName)101 bool containsField(const Object &m, const string &fieldName) {
102     auto it = m.find(fieldName);
103     return (it != m.end());
104 }
105 
106 json::Object::const_iterator findField(const Entity &e,
107                                        const Object &m, const string &fieldName);
108 
109 template<typename T>
ensureType(const Entity & e,const string & name)110 void ensureType(const Entity &e, const string &name) {
111     if (e.type() != json::type_traits<T>::type()) {
112         throw Exception(boost::format("Json field \"%1%\" is not a %2%: %3%") % name % json::type_traits<T>::name() % e.toString());
113     }
114 }
115 
/**
 * Returns the string value of the mandatory member fieldName of m.
 * Throws Exception when the member is missing or is not a string; e is the
 * enclosing entity, used for error reporting.
 */
string getStringField(const Entity &e, const Object &m,
                      const string &fieldName) {
    const Entity &value = findField(e, m, fieldName)->second;
    ensureType<string>(value, fieldName);
    return value.stringValue();
}
122 
123 const Array &getArrayField(const Entity &e, const Object &m,
124                            const string &fieldName);
125 
getLongField(const Entity & e,const Object & m,const string & fieldName)126 int64_t getLongField(const Entity &e, const Object &m,
127                      const string &fieldName) {
128     auto it = findField(e, m, fieldName);
129     ensureType<int64_t>(it->second, fieldName);
130     return it->second.longValue();
131 }
132 
133 // Unescape double quotes (") for de-serialization.  This method complements the
134 // method NodeImpl::escape() which is used for serialization.
unescape(string & s)135 static void unescape(string &s) {
136     boost::replace_all(s, "\\\"", "\"");
137 }
138 
getDocField(const Entity & e,const Object & m)139 string getDocField(const Entity &e, const Object &m) {
140     string doc = getStringField(e, m, "doc");
141     unescape(doc);
142     return doc;
143 }
144 
145 struct Field {
146     const string name;
147     const NodePtr schema;
148     const GenericDatum defaultValue;
Fieldavro::Field149     Field(string n, NodePtr v, GenericDatum dv) : name(std::move(n)), schema(std::move(v)), defaultValue(std::move(dv)) {}
150 };
151 
assertType(const Entity & e,EntityType et)152 static void assertType(const Entity &e, EntityType et) {
153     if (e.type() != et) {
154         throw Exception(boost::format("Unexpected type for default value: "
155                                       "Expected %1%, but found %2% in line %3%")
156                         % json::typeToString(et) % json::typeToString(e.type()) % e.line());
157     }
158 }
159 
/** Copies the characters of s, one byte each, into a new byte vector. */
static vector<uint8_t> toBin(const string &s) {
    return vector<uint8_t>(s.begin(), s.end());
}
167 
makeGenericDatum(NodePtr n,const Entity & e,const SymbolTable & st)168 static GenericDatum makeGenericDatum(NodePtr n,
169                                      const Entity &e, const SymbolTable &st) {
170     Type t = n->type();
171     EntityType dt = e.type();
172 
173     if (t == AVRO_SYMBOLIC) {
174         n = st.find(n->name())->second;
175         t = n->type();
176     }
177     switch (t) {
178         case AVRO_STRING:
179             assertType(e, json::EntityType::String);
180             return GenericDatum(e.stringValue());
181         case AVRO_BYTES:
182             assertType(e, json::EntityType::String);
183             return GenericDatum(toBin(e.bytesValue()));
184         case AVRO_INT:
185             assertType(e, json::EntityType::Long);
186             return GenericDatum(static_cast<int32_t>(e.longValue()));
187         case AVRO_LONG:
188             assertType(e, json::EntityType::Long);
189             return GenericDatum(e.longValue());
190         case AVRO_FLOAT:
191             if (dt == json::EntityType::Long) {
192                 return GenericDatum(static_cast<float>(e.longValue()));
193             }
194             assertType(e, json::EntityType::Double);
195             return GenericDatum(static_cast<float>(e.doubleValue()));
196         case AVRO_DOUBLE:
197             if (dt == json::EntityType::Long) {
198                 return GenericDatum(static_cast<double>(e.longValue()));
199             }
200             assertType(e, json::EntityType::Double);
201             return GenericDatum(e.doubleValue());
202         case AVRO_BOOL:
203             assertType(e, json::EntityType::Bool);
204             return GenericDatum(e.boolValue());
205         case AVRO_NULL:
206             assertType(e, json::EntityType::Null);
207             return GenericDatum();
208         case AVRO_RECORD: {
209             assertType(e, json::EntityType::Obj);
210             GenericRecord result(n);
211             const map<string, Entity> &v = e.objectValue();
212             for (size_t i = 0; i < n->leaves(); ++i) {
213                 auto it = v.find(n->nameAt(i));
214                 if (it == v.end()) {
215                     throw Exception(boost::format(
216                                         "No value found in default for %1%")
217                                     % n->nameAt(i));
218                 }
219                 result.setFieldAt(i,
220                                   makeGenericDatum(n->leafAt(i), it->second, st));
221             }
222             return GenericDatum(n, result);
223         }
224         case AVRO_ENUM:
225             assertType(e, json::EntityType::String);
226             return GenericDatum(n, GenericEnum(n, e.stringValue()));
227         case AVRO_ARRAY: {
228             assertType(e, json::EntityType::Arr);
229             GenericArray result(n);
230             const vector<Entity> &elements = e.arrayValue();
231             for (const auto &element : elements) {
232                 result.value().push_back(makeGenericDatum(n->leafAt(0), element, st));
233             }
234             return GenericDatum(n, result);
235         }
236         case AVRO_MAP: {
237             assertType(e, json::EntityType::Obj);
238             GenericMap result(n);
239             const map<string, Entity> &v = e.objectValue();
240             for (const auto &it : v) {
241                 result.value().push_back(make_pair(it.first,
242                                                    makeGenericDatum(n->leafAt(1), it.second, st)));
243             }
244             return GenericDatum(n, result);
245         }
246         case AVRO_UNION: {
247             GenericUnion result(n);
248             result.selectBranch(0);
249             result.datum() = makeGenericDatum(n->leafAt(0), e, st);
250             return GenericDatum(n, result);
251         }
252         case AVRO_FIXED:
253             assertType(e, json::EntityType::String);
254             return GenericDatum(n, GenericFixed(n, toBin(e.bytesValue())));
255         default: throw Exception(boost::format("Unknown type: %1%") % t);
256     }
257 }
258 
makeField(const Entity & e,SymbolTable & st,const string & ns)259 static Field makeField(const Entity &e, SymbolTable &st, const string &ns) {
260     const Object &m = e.objectValue();
261     const string &n = getStringField(e, m, "name");
262     auto it = findField(e, m, "type");
263     auto it2 = m.find("default");
264     NodePtr node = makeNode(it->second, st, ns);
265     if (containsField(m, "doc")) {
266         node->setDoc(getDocField(e, m));
267     }
268     GenericDatum d = (it2 == m.end()) ? GenericDatum() : makeGenericDatum(node, it2->second, st);
269     return Field(n, node, d);
270 }
271 
272 // Extended makeRecordNode (with doc).
makeRecordNode(const Entity & e,const Name & name,const string * doc,const Object & m,SymbolTable & st,const string & ns)273 static NodePtr makeRecordNode(const Entity &e, const Name &name,
274                               const string *doc, const Object &m,
275                               SymbolTable &st, const string &ns) {
276     const Array &v = getArrayField(e, m, "fields");
277     concepts::MultiAttribute<string> fieldNames;
278     concepts::MultiAttribute<NodePtr> fieldValues;
279     vector<GenericDatum> defaultValues;
280 
281     for (const auto &it : v) {
282         Field f = makeField(it, st, ns);
283         fieldNames.add(f.name);
284         fieldValues.add(f.schema);
285         defaultValues.push_back(f.defaultValue);
286     }
287     NodeRecord *node;
288     if (doc == nullptr) {
289         node = new NodeRecord(asSingleAttribute(name), fieldValues, fieldNames,
290                               defaultValues);
291     } else {
292         node = new NodeRecord(asSingleAttribute(name), asSingleAttribute(*doc),
293                               fieldValues, fieldNames, defaultValues);
294     }
295     return NodePtr(node);
296 }
297 
makeLogicalType(const Entity & e,const Object & m)298 static LogicalType makeLogicalType(const Entity &e, const Object &m) {
299     if (!containsField(m, "logicalType")) {
300         return LogicalType(LogicalType::NONE);
301     }
302 
303     const std::string &typeField = getStringField(e, m, "logicalType");
304 
305     if (typeField == "decimal") {
306         LogicalType decimalType(LogicalType::DECIMAL);
307         try {
308             decimalType.setPrecision(getLongField(e, m, "precision"));
309             if (containsField(m, "scale")) {
310                 decimalType.setScale(getLongField(e, m, "scale"));
311             }
312         } catch (Exception &ex) {
313             // If any part of the logical type is malformed, per the standard we
314             // must ignore the whole attribute.
315             return LogicalType(LogicalType::NONE);
316         }
317         return decimalType;
318     }
319 
320     LogicalType::Type t = LogicalType::NONE;
321     if (typeField == "date")
322         t = LogicalType::DATE;
323     else if (typeField == "time-millis")
324         t = LogicalType::TIME_MILLIS;
325     else if (typeField == "time-micros")
326         t = LogicalType::TIME_MICROS;
327     else if (typeField == "timestamp-millis")
328         t = LogicalType::TIMESTAMP_MILLIS;
329     else if (typeField == "timestamp-micros")
330         t = LogicalType::TIMESTAMP_MICROS;
331     else if (typeField == "duration")
332         t = LogicalType::DURATION;
333     else if (typeField == "uuid")
334         t = LogicalType::UUID;
335     return LogicalType(t);
336 }
337 
makeEnumNode(const Entity & e,const Name & name,const Object & m)338 static NodePtr makeEnumNode(const Entity &e,
339                             const Name &name, const Object &m) {
340     const Array &v = getArrayField(e, m, "symbols");
341     concepts::MultiAttribute<string> symbols;
342     for (const auto &it : v) {
343         if (it.type() != json::EntityType::String) {
344             throw Exception(boost::format("Enum symbol not a string: %1%") % it.toString());
345         }
346         symbols.add(it.stringValue());
347     }
348     NodePtr node = NodePtr(new NodeEnum(asSingleAttribute(name), symbols));
349     if (containsField(m, "doc")) {
350         node->setDoc(getDocField(e, m));
351     }
352     return node;
353 }
354 
makeFixedNode(const Entity & e,const Name & name,const Object & m)355 static NodePtr makeFixedNode(const Entity &e,
356                              const Name &name, const Object &m) {
357     int v = static_cast<int>(getLongField(e, m, "size"));
358     if (v <= 0) {
359         throw Exception(boost::format("Size for fixed is not positive: %1%") % e.toString());
360     }
361     NodePtr node =
362         NodePtr(new NodeFixed(asSingleAttribute(name), asSingleAttribute(v)));
363     if (containsField(m, "doc")) {
364         node->setDoc(getDocField(e, m));
365     }
366     return node;
367 }
368 
makeArrayNode(const Entity & e,const Object & m,SymbolTable & st,const string & ns)369 static NodePtr makeArrayNode(const Entity &e, const Object &m,
370                              SymbolTable &st, const string &ns) {
371     auto it = findField(e, m, "items");
372     NodePtr node = NodePtr(new NodeArray(
373         asSingleAttribute(makeNode(it->second, st, ns))));
374     if (containsField(m, "doc")) {
375         node->setDoc(getDocField(e, m));
376     }
377     return node;
378 }
379 
makeMapNode(const Entity & e,const Object & m,SymbolTable & st,const string & ns)380 static NodePtr makeMapNode(const Entity &e, const Object &m,
381                            SymbolTable &st, const string &ns) {
382     auto it = findField(e, m, "values");
383 
384     NodePtr node = NodePtr(new NodeMap(
385         asSingleAttribute(makeNode(it->second, st, ns))));
386     if (containsField(m, "doc")) {
387         node->setDoc(getDocField(e, m));
388     }
389     return node;
390 }
391 
getName(const Entity & e,const Object & m,const string & ns)392 static Name getName(const Entity &e, const Object &m, const string &ns) {
393     const string &name = getStringField(e, m, "name");
394 
395     if (isFullName(name)) {
396         return Name(name);
397     } else {
398         auto it = m.find("namespace");
399         if (it != m.end()) {
400             if (it->second.type() != json::type_traits<string>::type()) {
401                 throw Exception(boost::format(
402                                     "Json field \"%1%\" is not a %2%: %3%")
403                                 % "namespace" % json::type_traits<string>::name() % it->second.toString());
404             }
405             Name result = Name(name, it->second.stringValue());
406             return result;
407         }
408         return Name(name, ns);
409     }
410 }
411 
makeNode(const Entity & e,const Object & m,SymbolTable & st,const string & ns)412 static NodePtr makeNode(const Entity &e, const Object &m,
413                         SymbolTable &st, const string &ns) {
414     const string &type = getStringField(e, m, "type");
415     NodePtr result;
416     if (type == "record" || type == "error" || type == "enum" || type == "fixed") {
417         Name nm = getName(e, m, ns);
418         if (type == "record" || type == "error") {
419             result = NodePtr(new NodeRecord());
420             st[nm] = result;
421             // Get field doc
422             if (containsField(m, "doc")) {
423                 string doc = getDocField(e, m);
424 
425                 NodePtr r = makeRecordNode(e, nm, &doc, m, st, nm.ns());
426                 (std::dynamic_pointer_cast<NodeRecord>(r))->swap(*std::dynamic_pointer_cast<NodeRecord>(result));
427             } else { // No doc
428                 NodePtr r =
429                     makeRecordNode(e, nm, nullptr, m, st, nm.ns());
430                 (std::dynamic_pointer_cast<NodeRecord>(r))
431                     ->swap(*std::dynamic_pointer_cast<NodeRecord>(result));
432             }
433         } else {
434             result = (type == "enum") ? makeEnumNode(e, nm, m) : makeFixedNode(e, nm, m);
435             st[nm] = result;
436         }
437     } else if (type == "array") {
438         result = makeArrayNode(e, m, st, ns);
439     } else if (type == "map") {
440         result = makeMapNode(e, m, st, ns);
441     } else {
442         result = makePrimitive(type);
443     }
444 
445     if (result) {
446         try {
447             result->setLogicalType(makeLogicalType(e, m));
448         } catch (Exception &ex) {
449             // Per the standard we must ignore the logical type attribute if it
450             // is malformed.
451         }
452         return result;
453     }
454 
455     throw Exception(boost::format("Unknown type definition: %1%")
456                     % e.toString());
457 }
458 
makeNode(const Entity & e,const Array & m,SymbolTable & st,const string & ns)459 static NodePtr makeNode(const Entity &e, const Array &m,
460                         SymbolTable &st, const string &ns) {
461     concepts::MultiAttribute<NodePtr> mm;
462     for (const auto &it : m) {
463         mm.add(makeNode(it, st, ns));
464     }
465     return NodePtr(new NodeUnion(mm));
466 }
467 
makeNode(const json::Entity & e,SymbolTable & st,const string & ns)468 static NodePtr makeNode(const json::Entity &e, SymbolTable &st, const string &ns) {
469     switch (e.type()) {
470         case json::EntityType::String: return makeNode(e.stringValue(), st, ns);
471         case json::EntityType::Obj: return makeNode(e, e.objectValue(), st, ns);
472         case json::EntityType::Arr: return makeNode(e, e.arrayValue(), st, ns);
473         default: throw Exception(boost::format("Invalid Avro type: %1%") % e.toString());
474     }
475 }
findField(const Entity & e,const Object & m,const string & fieldName)476 json::Object::const_iterator findField(const Entity &e, const Object &m, const string &fieldName) {
477     auto it = m.find(fieldName);
478     if (it == m.end()) {
479         throw Exception(boost::format("Missing Json field \"%1%\": %2%") % fieldName % e.toString());
480     } else {
481         return it;
482     }
483 }
getArrayField(const Entity & e,const Object & m,const string & fieldName)484 const Array &getArrayField(const Entity &e, const Object &m, const string &fieldName) {
485     auto it = findField(e, m, fieldName);
486     ensureType<Array>(it->second, fieldName);
487     return it->second.arrayValue();
488 }
489 
compileJsonSchemaFromStream(InputStream & is)490 ValidSchema compileJsonSchemaFromStream(InputStream &is) {
491     json::Entity e = json::loadEntity(is);
492     SymbolTable st;
493     NodePtr n = makeNode(e, st, "");
494     return ValidSchema(n);
495 }
496 
compileJsonSchemaFromFile(const char * filename)497 AVRO_DECL ValidSchema compileJsonSchemaFromFile(const char *filename) {
498     std::unique_ptr<InputStream> s = fileInputStream(filename);
499     return compileJsonSchemaFromStream(*s);
500 }
501 
compileJsonSchemaFromMemory(const uint8_t * input,size_t len)502 AVRO_DECL ValidSchema compileJsonSchemaFromMemory(const uint8_t *input, size_t len) {
503     return compileJsonSchemaFromStream(*memoryInputStream(input, len));
504 }
505 
compileJsonSchemaFromString(const char * input)506 AVRO_DECL ValidSchema compileJsonSchemaFromString(const char *input) {
507     return compileJsonSchemaFromMemory(reinterpret_cast<const uint8_t *>(input),
508                                        ::strlen(input));
509 }
510 
compileJsonSchemaFromString(const string & input)511 AVRO_DECL ValidSchema compileJsonSchemaFromString(const string &input) {
512     return compileJsonSchemaFromMemory(
513         reinterpret_cast<const uint8_t *>(input.data()), input.size());
514 }
515 
compile(std::istream & is)516 static ValidSchema compile(std::istream &is) {
517     std::unique_ptr<InputStream> in = istreamInputStream(is);
518     return compileJsonSchemaFromStream(*in);
519 }
520 
compileJsonSchema(std::istream & is,ValidSchema & schema)521 void compileJsonSchema(std::istream &is, ValidSchema &schema) {
522     if (!is.good()) {
523         throw Exception("Input stream is not good");
524     }
525 
526     schema = compile(is);
527 }
528 
compileJsonSchema(std::istream & is,ValidSchema & schema,string & error)529 AVRO_DECL bool compileJsonSchema(std::istream &is, ValidSchema &schema, string &error) {
530     try {
531         compileJsonSchema(is, schema);
532         return true;
533     } catch (const Exception &e) {
534         error = e.what();
535         return false;
536     }
537 }
538 
539 } // namespace avro
540