1 // *****************************************************************************
2 // * This file is part of the FreeFileSync project. It is distributed under    *
3 // * GNU General Public License: http://www.gnu.org/licenses/gpl-3.0           *
4 // * Copyright (C) Zenju (zenju AT freefilesync DOT org) - All Rights Reserved *
5 // *****************************************************************************
6 
7 #ifndef PARSER_H_81248670213764583021432
8 #define PARSER_H_81248670213764583021432
9 
10 #include <cstdio>
11 #include <cstddef> //ptrdiff_t; req. on Linux
12 #include <zen/string_tools.h>
13 #include "dom.h"
14 #include "error.h"
15 
16 
17 namespace zen
18 {
19 /**
20 \file
21 \brief Convert an XML document object model (class XmlDoc) to and from a byte stream representation.
22 */
23 
24 ///Save XML document as a byte stream
25 /**
26 \param doc Input XML document
27 \param lineBreak Line break, default: carriage return + new line
28 \param indent Indentation, default: four space characters
29 \return Output byte stream
30 */
31 std::string serialize(const XmlDoc& doc,
32                       const std::string& lineBreak = "\r\n",
33                       const std::string& indent = "    "); //throw ()
34 
35 ///Exception thrown due to an XML parsing error
36 struct XmlParsingError : public XmlError
37 {
XmlParsingErrorXmlParsingError38     XmlParsingError(size_t rowNo, size_t colNo) : row(rowNo), col(colNo) {}
39     ///Input file row where the parsing error occured (zero-based)
40     const size_t row; //beginning with 0
41     ///Input file column where the parsing error occured (zero-based)
42     const size_t col; //
43 };
44 
45 
46 ///Load XML document from a byte stream
47 /**
48 \param stream Input byte stream
49 \returns Output XML document
50 \throw XmlParsingError
51 */
52 XmlDoc parse(const std::string& stream); //throw XmlParsingError
53 
54 
55 
56 
57 
58 
59 
60 
61 
62 
63 
64 
65 
66 
67 
68 
69 
70 
71 
72 
73 //---------------------------- implementation ----------------------------
74 //see: http://www.w3.org/TR/xml/
75 
76 namespace implementation
77 {
78 template <class Predicate> inline
normalize(const std::string & str,Predicate pred)79 std::string normalize(const std::string& str, Predicate pred) //pred: unary function taking a char, return true if value shall be encoded as hex
80 {
81     std::string output;
82     for (const char c : str)
83     {
84         if (c == '&')      //
85             output += "&amp;";
86         else if (c == '<') //normalization mandatory: http://www.w3.org/TR/xml/#syntax
87             output += "&lt;";
88         else if (c == '>') //
89             output += "&gt;";
90         else if (pred(c))
91         {
92             if (c == '\'')
93                 output += "&apos;";
94             else if (c == '\"')
95                 output += "&quot;";
96             else
97             {
98                 output += "&#x";
99                 const auto hexDigits = hexify(c);
100                 output += hexDigits.first;
101                 output += hexDigits.second;
102                 output += ';';
103             }
104         }
105         else
106             output += c;
107     }
108     return output;
109 }
110 
111 inline
normalizeName(const std::string & str)112 std::string normalizeName(const std::string& str)
113 {
114     return normalize(str, [](char c) { return isWhiteSpace(c) || c == '=' || c == '/' || c == '\'' || c == '\"'; });
115 }
116 
117 inline
normalizeElementValue(const std::string & str)118 std::string normalizeElementValue(const std::string& str)
119 {
120     return normalize(str, [](char c) { return static_cast<unsigned char>(c) < 32; });
121 }
122 
123 inline
normalizeAttribValue(const std::string & str)124 std::string normalizeAttribValue(const std::string& str)
125 {
126     return normalize(str, [](char c) { return static_cast<unsigned char>(c) < 32 || c == '\'' || c == '\"'; });
127 }
128 
129 
//Test whether the range [first, last) begins with the null-terminated literal "placeholder".
//On a match "first" is moved to the LAST character of the match (not one past it), so that a
//caller's subsequent ++first steps over the whole placeholder; without a match "first" is left unchanged.
template <class CharIterator, size_t N> inline
bool checkEntity(CharIterator& first, CharIterator last, const char (&placeholder)[N])
{
    assert(placeholder[N - 1] == 0);
    const std::ptrdiff_t len = N - 1; //don't count null-terminator

    if (last - first < len) //not enough input left
        return false;

    if (!std::equal(placeholder, placeholder + len, first))
        return false;

    first += len - 1;
    return true;
}
142 
143 
144 namespace
145 {
denormalize(const std::string & str)146 std::string denormalize(const std::string& str)
147 {
148     std::string output;
149     for (auto it = str.begin(); it != str.end(); ++it)
150     {
151         const char c = *it;
152 
153         if (c == '&')
154         {
155             if (checkEntity(it, str.end(), "&amp;"))
156                 output += '&';
157             else if (checkEntity(it, str.end(), "&lt;"))
158                 output += '<';
159             else if (checkEntity(it, str.end(), "&gt;"))
160                 output += '>';
161             else if (checkEntity(it, str.end(), "&apos;"))
162                 output += '\'';
163             else if (checkEntity(it, str.end(), "&quot;"))
164                 output += '\"';
165             else if (str.end() - it >= 6 &&
166                      it[1] == '#' &&
167                      it[2] == 'x' &&
168                      it[5] == ';')
169             {
170                 output += unhexify(it[3], it[4]);
171                 it += 5;
172             }
173             else
174                 output += c; //unexpected char!
175         }
176         else if (c == '\r') //map all end-of-line characters to \n http://www.w3.org/TR/xml/#sec-line-ends
177         {
178             auto itNext = it + 1;
179             if (itNext != str.end() && *itNext == '\n')
180                 ++it;
181             output += '\n';
182         }
183         else
184             output += c;
185     }
186     return output;
187 }
188 
189 
serialize(const XmlElement & element,std::string & stream,const std::string & lineBreak,const std::string & indent,size_t indentLevel)190 void serialize(const XmlElement& element, std::string& stream,
191                const std::string& lineBreak,
192                const std::string& indent,
193                size_t indentLevel)
194 {
195     const std::string& nameFmt = normalizeName(element.getNameAs<std::string>());
196 
197     for (size_t i = 0; i < indentLevel; ++i)
198         stream += indent;
199 
200     stream += '<' + nameFmt;
201 
202     auto attr = element.getAttributes();
203     for (auto it = attr.first; it != attr.second; ++it)
204         stream += ' ' + normalizeName(it->first) + "=\"" + normalizeAttribValue(it->second) + '\"';
205 
206     //no support for mixed-mode content
207     auto iterPair = element.getChildren();
208     if (iterPair.first != iterPair.second) //structured element
209     {
210         stream += '>' + lineBreak;
211 
212         std::for_each(iterPair.first, iterPair.second,
213         [&](const XmlElement& el) { serialize(el, stream, lineBreak, indent, indentLevel + 1); });
214 
215         for (size_t i = 0; i < indentLevel; ++i)
216             stream += indent;
217         stream += "</" + nameFmt + '>' + lineBreak;
218     }
219     else
220     {
221         std::string value;
222         element.getValue(value);
223 
224         if (!value.empty()) //value element
225             stream += '>' + normalizeElementValue(value) + "</" + nameFmt + '>' + lineBreak;
226         else //empty element
227             stream += "/>" + lineBreak;
228     }
229 }
230 
serialize(const XmlDoc & doc,const std::string & lineBreak,const std::string & indent)231 std::string serialize(const XmlDoc& doc,
232                       const std::string& lineBreak,
233                       const std::string& indent)
234 {
235     std::string version = doc.getVersionAs<std::string>();
236     if (!version.empty())
237         version = " version=\"" + normalizeAttribValue(version) + '\"';
238 
239     std::string encoding = doc.getEncodingAs<std::string>();
240     if (!encoding.empty())
241         encoding = " encoding=\"" + normalizeAttribValue(encoding) + '\"';
242 
243     std::string standalone = doc.getStandaloneAs<std::string>();
244     if (!standalone.empty())
245         standalone = " standalone=\"" + normalizeAttribValue(standalone) + '\"';
246 
247     std::string output = "<?xml" + version + encoding + standalone + "?>" + lineBreak;
248     serialize(doc.root(), output, lineBreak, indent, 0);
249     return output;
250 }
251 }
252 }
253 
254 inline
serialize(const XmlDoc & doc,const std::string & lineBreak,const std::string & indent)255 std::string serialize(const XmlDoc& doc,
256                       const std::string& lineBreak,
257                       const std::string& indent) { return implementation::serialize(doc, lineBreak, indent); }
258 
259 /*
260 Grammar for XML parser
261 -------------------------------
262 document-expression:
263     <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
264     element-expression:
265 
266 element-expression:
267     <string attributes-expression/>
268     <string attributes-expression> pm-expression </string>
269 
270 element-list-expression:
271     <empty>
272     element-expression element-list-expression
273 
274 attributes-expression:
275     <empty>
276     string="string" attributes-expression
277 
278 pm-expression:
279     string
280     element-list-expression
281 */
282 
283 namespace implementation
284 {
//A single lexical unit produced by the scanner
struct Token
{
    enum Type
    {
        TK_LESS,          // <
        TK_GREATER,       // >
        TK_LESS_SLASH,    // </
        TK_SLASH_GREATER, // />
        TK_EQUAL,         // =
        TK_QUOTE,         // " or '
        TK_DECL_BEGIN,    // <?xml
        TK_DECL_END,      // ?>
        TK_NAME,          //any other text; see member "name"
        TK_END            //end of input
    };

    //both constructors are intentionally implicit: the scanner returns a Type or a string where a Token is expected
    Token(Type t) : type{t} {}
    Token(const std::string& txt) : type{TK_NAME}, name{txt} {}

    Type type;
    std::string name; //filled if type == TK_NAME
};
307 
308 class Scanner
309 {
310 public:
Scanner(const std::string & stream)311     Scanner(const std::string& stream) : stream_(stream), pos(stream_.begin())
312     {
313         if (zen::startsWith(stream_, BYTE_ORDER_MARK_UTF8))
314             pos += strLength(BYTE_ORDER_MARK_UTF8);
315     }
316 
nextToken()317     Token nextToken() //throw XmlParsingError
318     {
319         //skip whitespace
320         pos = std::find_if(pos, stream_.end(), [](char c) { return !zen::isWhiteSpace(c); });
321 
322         if (pos == stream_.end())
323             return Token::TK_END;
324 
325         //skip XML comments
326         if (startsWith(xmlCommentBegin))
327         {
328             auto it = std::search(pos + xmlCommentBegin.size(), stream_.end(), xmlCommentEnd.begin(), xmlCommentEnd.end());
329             if (it != stream_.end())
330             {
331                 pos = it + xmlCommentEnd.size();
332                 return nextToken();
333             }
334         }
335 
336         for (auto it = tokens.begin(); it != tokens.end(); ++it)
337             if (startsWith(it->first))
338             {
339                 pos += it->first.size();
340                 return it->second;
341             }
342 
343         auto nameEnd = std::find_if(pos, stream_.end(), [](char c)
344         {
345             return c == '<'  ||
346                    c == '>'  ||
347                    c == '='  ||
348                    c == '/'  ||
349                    c == '\'' ||
350                    c == '\"' ||
351                    zen::isWhiteSpace(c);
352         });
353 
354         if (nameEnd != pos)
355         {
356             std::string name(&*pos, nameEnd - pos);
357             pos = nameEnd;
358             return implementation::denormalize(name);
359         }
360 
361         //unknown token
362         throw XmlParsingError(posRow(), posCol());
363     }
364 
extractElementValue()365     std::string extractElementValue()
366     {
367         auto it = std::find_if(pos, stream_.end(), [](char c)
368         {
369             return c == '<'  ||
370                    c == '>';
371         });
372         std::string output(pos, it);
373         pos = it;
374         return implementation::denormalize(output);
375     }
376 
extractAttributeValue()377     std::string extractAttributeValue()
378     {
379         auto it = std::find_if(pos, stream_.end(), [](char c)
380         {
381             return c == '<'  ||
382                    c == '>'  ||
383                    c == '\'' ||
384                    c == '\"';
385         });
386         std::string output(pos, it);
387         pos = it;
388         return implementation::denormalize(output);
389     }
390 
posRow()391     size_t posRow() const //current row beginning with 0
392     {
393         const size_t crSum = std::count(stream_.begin(), pos, '\r'); //carriage returns
394         const size_t nlSum = std::count(stream_.begin(), pos, '\n'); //new lines
395         assert(crSum == 0 || nlSum == 0 || crSum == nlSum);
396         return std::max(crSum, nlSum); //be compatible with Linux/Mac/Win
397     }
398 
posCol()399     size_t posCol() const //current col beginning with 0
400     {
401         //seek beginning of line
402         for (auto it = pos; it != stream_.begin(); )
403         {
404             --it;
405             if (*it == '\r' || *it == '\n')
406                 return pos - it - 1;
407         }
408         return pos - stream_.begin();
409     }
410 
411 private:
412     Scanner           (const Scanner&) = delete;
413     Scanner& operator=(const Scanner&) = delete;
414 
startsWith(const std::string & prefix)415     bool startsWith(const std::string& prefix) const
416     {
417         if (stream_.end() - pos < static_cast<ptrdiff_t>(prefix.size()))
418             return false;
419         return std::equal(prefix.begin(), prefix.end(), pos);
420     }
421 
422     using TokenList = std::vector<std::pair<std::string, Token::Type>>;
423     const TokenList tokens
424     {
425         { "<?xml", Token::TK_DECL_BEGIN    },
426         { "?>",    Token::TK_DECL_END      },
427         { "</",    Token::TK_LESS_SLASH    },
428         { "/>",    Token::TK_SLASH_GREATER },
429         { "<" ,    Token::TK_LESS          }, //evaluate after TK_DECL_BEGIN!
430         { ">" ,    Token::TK_GREATER       },
431         { "=" ,    Token::TK_EQUAL         },
432         { "\"",    Token::TK_QUOTE         },
433         { "\'",    Token::TK_QUOTE         },
434     };
435 
436     const std::string xmlCommentBegin = "<!--";
437     const std::string xmlCommentEnd   = "-->";
438 
439     const std::string stream_;
440     std::string::const_iterator pos;
441 };
442 
443 
444 class XmlParser
445 {
446 public:
XmlParser(const std::string & stream)447     XmlParser(const std::string& stream) :
448         scn(stream),
449         tk(scn.nextToken()) {}
450 
parse()451     XmlDoc parse() //throw XmlParsingError
452     {
453         XmlDoc doc;
454 
455         //declaration (optional)
456         if (token().type == Token::TK_DECL_BEGIN)
457         {
458             nextToken();
459 
460             while (token().type == Token::TK_NAME)
461             {
462                 std::string attribName = token().name;
463                 nextToken();
464 
465                 consumeToken(Token::TK_EQUAL);
466                 expectToken(Token::TK_QUOTE);
467                 std::string attribValue = scn.extractAttributeValue();
468                 nextToken();
469 
470                 consumeToken(Token::TK_QUOTE);
471 
472                 if (attribName == "version")
473                     doc.setVersion(attribValue);
474                 else if (attribName == "encoding")
475                     doc.setEncoding(attribValue);
476                 else if (attribName == "standalone")
477                     doc.setStandalone(attribValue);
478             }
479             consumeToken(Token::TK_DECL_END);
480         }
481 
482         XmlElement dummy;
483         parseChildElements(dummy);
484 
485         auto itPair = dummy.getChildren();
486         if (itPair.first != itPair.second)
487             doc.root().swapSubtree(*itPair.first);
488 
489         expectToken(Token::TK_END);
490         return doc;
491     }
492 
493 private:
494     XmlParser           (const XmlParser&) = delete;
495     XmlParser& operator=(const XmlParser&) = delete;
496 
parseChildElements(XmlElement & parent)497     void parseChildElements(XmlElement& parent)
498     {
499         while (token().type == Token::TK_LESS)
500         {
501             nextToken();
502 
503             expectToken(Token::TK_NAME);
504             std::string elementName = token().name;
505             nextToken();
506 
507             XmlElement& newElement = parent.addChild(elementName);
508 
509             parseAttributes(newElement);
510 
511             if (token().type == Token::TK_SLASH_GREATER) //empty element
512             {
513                 nextToken();
514                 continue;
515             }
516 
517             expectToken(Token::TK_GREATER);
518             std::string elementValue = scn.extractElementValue();
519             nextToken();
520 
521             //no support for mixed-mode content
522             if (token().type == Token::TK_LESS) //structured element
523                 parseChildElements(newElement);
524             else //value element
525                 newElement.setValue(elementValue);
526 
527             consumeToken(Token::TK_LESS_SLASH);
528 
529             if (token().type != Token::TK_NAME ||
530                 elementName != token().name)
531                 throw XmlParsingError(scn.posRow(), scn.posCol());
532             nextToken();
533 
534             consumeToken(Token::TK_GREATER);
535         }
536     }
537 
parseAttributes(XmlElement & element)538     void parseAttributes(XmlElement& element)
539     {
540         while (token().type == Token::TK_NAME)
541         {
542             std::string attribName = token().name;
543             nextToken();
544 
545             consumeToken(Token::TK_EQUAL);
546             expectToken(Token::TK_QUOTE);
547             std::string attribValue = scn.extractAttributeValue();
548             nextToken();
549 
550             consumeToken(Token::TK_QUOTE);
551             element.setAttribute(attribName, attribValue);
552         }
553     }
554 
token()555     const Token& token() const { return tk; }
nextToken()556     void nextToken() { tk = scn.nextToken(); }
557 
consumeToken(Token::Type t)558     void consumeToken(Token::Type t) //throw XmlParsingError
559     {
560         expectToken(t); //throw XmlParsingError
561         nextToken();
562     }
563 
expectToken(Token::Type t)564     void expectToken(Token::Type t) //throw XmlParsingError
565     {
566         if (token().type != t)
567             throw XmlParsingError(scn.posRow(), scn.posCol());
568     }
569 
570     Scanner scn;
571     Token tk;
572 };
573 }
574 
575 inline
parse(const std::string & stream)576 XmlDoc parse(const std::string& stream) //throw XmlParsingError
577 {
578     return implementation::XmlParser(stream).parse();  //throw XmlParsingError
579 }
580 }
581 
582 #endif //PARSER_H_81248670213764583021432
583