1 // *****************************************************************************
2 // * This file is part of the FreeFileSync project. It is distributed under *
3 // * GNU General Public License: http://www.gnu.org/licenses/gpl-3.0 *
4 // * Copyright (C) Zenju (zenju AT freefilesync DOT org) - All Rights Reserved *
5 // *****************************************************************************
6
7 #ifndef PARSER_H_81248670213764583021432
8 #define PARSER_H_81248670213764583021432
9
10 #include <cstdio>
11 #include <cstddef> //ptrdiff_t; req. on Linux
12 #include <zen/string_tools.h>
13 #include "dom.h"
14 #include "error.h"
15
16
17 namespace zen
18 {
19 /**
20 \file
21 \brief Convert an XML document object model (class XmlDoc) to and from a byte stream representation.
22 */
23
24 ///Save XML document as a byte stream
25 /**
26 \param doc Input XML document
27 \param lineBreak Line break, default: carriage return + new line
28 \param indent Indentation, default: four space characters
29 \return Output byte stream
30 */
31 std::string serialize(const XmlDoc& doc,
32 const std::string& lineBreak = "\r\n",
33 const std::string& indent = " "); //throw ()
34
35 ///Exception thrown due to an XML parsing error
36 struct XmlParsingError : public XmlError
37 {
XmlParsingErrorXmlParsingError38 XmlParsingError(size_t rowNo, size_t colNo) : row(rowNo), col(colNo) {}
39 ///Input file row where the parsing error occured (zero-based)
40 const size_t row; //beginning with 0
41 ///Input file column where the parsing error occured (zero-based)
42 const size_t col; //
43 };
44
45
46 ///Load XML document from a byte stream
47 /**
48 \param stream Input byte stream
49 \returns Output XML document
50 \throw XmlParsingError
51 */
52 XmlDoc parse(const std::string& stream); //throw XmlParsingError
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73 //---------------------------- implementation ----------------------------
74 //see: http://www.w3.org/TR/xml/
75
76 namespace implementation
77 {
78 template <class Predicate> inline
normalize(const std::string & str,Predicate pred)79 std::string normalize(const std::string& str, Predicate pred) //pred: unary function taking a char, return true if value shall be encoded as hex
80 {
81 std::string output;
82 for (const char c : str)
83 {
84 if (c == '&') //
85 output += "&";
86 else if (c == '<') //normalization mandatory: http://www.w3.org/TR/xml/#syntax
87 output += "<";
88 else if (c == '>') //
89 output += ">";
90 else if (pred(c))
91 {
92 if (c == '\'')
93 output += "'";
94 else if (c == '\"')
95 output += """;
96 else
97 {
98 output += "&#x";
99 const auto hexDigits = hexify(c);
100 output += hexDigits.first;
101 output += hexDigits.second;
102 output += ';';
103 }
104 }
105 else
106 output += c;
107 }
108 return output;
109 }
110
111 inline
normalizeName(const std::string & str)112 std::string normalizeName(const std::string& str)
113 {
114 return normalize(str, [](char c) { return isWhiteSpace(c) || c == '=' || c == '/' || c == '\'' || c == '\"'; });
115 }
116
117 inline
normalizeElementValue(const std::string & str)118 std::string normalizeElementValue(const std::string& str)
119 {
120 return normalize(str, [](char c) { return static_cast<unsigned char>(c) < 32; });
121 }
122
123 inline
normalizeAttribValue(const std::string & str)124 std::string normalizeAttribValue(const std::string& str)
125 {
126 return normalize(str, [](char c) { return static_cast<unsigned char>(c) < 32 || c == '\'' || c == '\"'; });
127 }
128
129
///Test whether the range [first, last) begins with the null-terminated literal "placeholder"
/**
\param first Start of the range; on a match it is advanced to the LAST character of the matched text
             (not past it), so that a caller's subsequent increment steps behind the match. Unchanged otherwise.
\param last End of the range
\param placeholder String literal to compare against (the terminating null is not compared)
\return true if the literal was found at "first"
*/
template <class CharIterator, size_t N> inline
bool checkEntity(CharIterator& first, CharIterator last, const char (&placeholder)[N])
{
    assert(placeholder[N - 1] == 0);
    const ptrdiff_t len = N - 1; //don't count null-terminator

    if (last - first < len) //not enough characters left
        return false;

    if (!std::equal(first, first + len, placeholder))
        return false;

    first += len - 1;
    return true;
}
142
143
144 namespace
145 {
denormalize(const std::string & str)146 std::string denormalize(const std::string& str)
147 {
148 std::string output;
149 for (auto it = str.begin(); it != str.end(); ++it)
150 {
151 const char c = *it;
152
153 if (c == '&')
154 {
155 if (checkEntity(it, str.end(), "&"))
156 output += '&';
157 else if (checkEntity(it, str.end(), "<"))
158 output += '<';
159 else if (checkEntity(it, str.end(), ">"))
160 output += '>';
161 else if (checkEntity(it, str.end(), "'"))
162 output += '\'';
163 else if (checkEntity(it, str.end(), """))
164 output += '\"';
165 else if (str.end() - it >= 6 &&
166 it[1] == '#' &&
167 it[2] == 'x' &&
168 it[5] == ';')
169 {
170 output += unhexify(it[3], it[4]);
171 it += 5;
172 }
173 else
174 output += c; //unexpected char!
175 }
176 else if (c == '\r') //map all end-of-line characters to \n http://www.w3.org/TR/xml/#sec-line-ends
177 {
178 auto itNext = it + 1;
179 if (itNext != str.end() && *itNext == '\n')
180 ++it;
181 output += '\n';
182 }
183 else
184 output += c;
185 }
186 return output;
187 }
188
189
serialize(const XmlElement & element,std::string & stream,const std::string & lineBreak,const std::string & indent,size_t indentLevel)190 void serialize(const XmlElement& element, std::string& stream,
191 const std::string& lineBreak,
192 const std::string& indent,
193 size_t indentLevel)
194 {
195 const std::string& nameFmt = normalizeName(element.getNameAs<std::string>());
196
197 for (size_t i = 0; i < indentLevel; ++i)
198 stream += indent;
199
200 stream += '<' + nameFmt;
201
202 auto attr = element.getAttributes();
203 for (auto it = attr.first; it != attr.second; ++it)
204 stream += ' ' + normalizeName(it->first) + "=\"" + normalizeAttribValue(it->second) + '\"';
205
206 //no support for mixed-mode content
207 auto iterPair = element.getChildren();
208 if (iterPair.first != iterPair.second) //structured element
209 {
210 stream += '>' + lineBreak;
211
212 std::for_each(iterPair.first, iterPair.second,
213 [&](const XmlElement& el) { serialize(el, stream, lineBreak, indent, indentLevel + 1); });
214
215 for (size_t i = 0; i < indentLevel; ++i)
216 stream += indent;
217 stream += "</" + nameFmt + '>' + lineBreak;
218 }
219 else
220 {
221 std::string value;
222 element.getValue(value);
223
224 if (!value.empty()) //value element
225 stream += '>' + normalizeElementValue(value) + "</" + nameFmt + '>' + lineBreak;
226 else //empty element
227 stream += "/>" + lineBreak;
228 }
229 }
230
serialize(const XmlDoc & doc,const std::string & lineBreak,const std::string & indent)231 std::string serialize(const XmlDoc& doc,
232 const std::string& lineBreak,
233 const std::string& indent)
234 {
235 std::string version = doc.getVersionAs<std::string>();
236 if (!version.empty())
237 version = " version=\"" + normalizeAttribValue(version) + '\"';
238
239 std::string encoding = doc.getEncodingAs<std::string>();
240 if (!encoding.empty())
241 encoding = " encoding=\"" + normalizeAttribValue(encoding) + '\"';
242
243 std::string standalone = doc.getStandaloneAs<std::string>();
244 if (!standalone.empty())
245 standalone = " standalone=\"" + normalizeAttribValue(standalone) + '\"';
246
247 std::string output = "<?xml" + version + encoding + standalone + "?>" + lineBreak;
248 serialize(doc.root(), output, lineBreak, indent, 0);
249 return output;
250 }
251 }
252 }
253
254 inline
serialize(const XmlDoc & doc,const std::string & lineBreak,const std::string & indent)255 std::string serialize(const XmlDoc& doc,
256 const std::string& lineBreak,
257 const std::string& indent) { return implementation::serialize(doc, lineBreak, indent); }
258
259 /*
260 Grammar for XML parser
261 -------------------------------
262 document-expression:
263 <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
    element-expression
265
266 element-expression:
267 <string attributes-expression/>
268 <string attributes-expression> pm-expression </string>
269
270 element-list-expression:
271 <empty>
272 element-expression element-list-expression
273
274 attributes-expression:
275 <empty>
276 string="string" attributes-expression
277
278 pm-expression:
279 string
280 element-list-expression
281 */
282
283 namespace implementation
284 {
///Lexical unit produced by the Scanner
struct Token
{
    enum Type
    {
        TK_LESS,          // <
        TK_GREATER,       // >
        TK_LESS_SLASH,    // </
        TK_SLASH_GREATER, // />
        TK_EQUAL,         // =
        TK_QUOTE,         // " or '
        TK_DECL_BEGIN,    // <?xml
        TK_DECL_END,      // ?>
        TK_NAME,          // any other text; see "name"
        TK_END            // end of input
    };

    //both constructors are intentionally implicit: the scanner returns a bare Type or std::string where a Token is expected
    Token(Type t) :
        type(t) {}

    Token(const std::string& txt) :
        type(TK_NAME),
        name(txt) {}

    Type type;
    std::string name; //filled if type == TK_NAME
};
307
308 class Scanner
309 {
310 public:
Scanner(const std::string & stream)311 Scanner(const std::string& stream) : stream_(stream), pos(stream_.begin())
312 {
313 if (zen::startsWith(stream_, BYTE_ORDER_MARK_UTF8))
314 pos += strLength(BYTE_ORDER_MARK_UTF8);
315 }
316
nextToken()317 Token nextToken() //throw XmlParsingError
318 {
319 //skip whitespace
320 pos = std::find_if(pos, stream_.end(), [](char c) { return !zen::isWhiteSpace(c); });
321
322 if (pos == stream_.end())
323 return Token::TK_END;
324
325 //skip XML comments
326 if (startsWith(xmlCommentBegin))
327 {
328 auto it = std::search(pos + xmlCommentBegin.size(), stream_.end(), xmlCommentEnd.begin(), xmlCommentEnd.end());
329 if (it != stream_.end())
330 {
331 pos = it + xmlCommentEnd.size();
332 return nextToken();
333 }
334 }
335
336 for (auto it = tokens.begin(); it != tokens.end(); ++it)
337 if (startsWith(it->first))
338 {
339 pos += it->first.size();
340 return it->second;
341 }
342
343 auto nameEnd = std::find_if(pos, stream_.end(), [](char c)
344 {
345 return c == '<' ||
346 c == '>' ||
347 c == '=' ||
348 c == '/' ||
349 c == '\'' ||
350 c == '\"' ||
351 zen::isWhiteSpace(c);
352 });
353
354 if (nameEnd != pos)
355 {
356 std::string name(&*pos, nameEnd - pos);
357 pos = nameEnd;
358 return implementation::denormalize(name);
359 }
360
361 //unknown token
362 throw XmlParsingError(posRow(), posCol());
363 }
364
extractElementValue()365 std::string extractElementValue()
366 {
367 auto it = std::find_if(pos, stream_.end(), [](char c)
368 {
369 return c == '<' ||
370 c == '>';
371 });
372 std::string output(pos, it);
373 pos = it;
374 return implementation::denormalize(output);
375 }
376
extractAttributeValue()377 std::string extractAttributeValue()
378 {
379 auto it = std::find_if(pos, stream_.end(), [](char c)
380 {
381 return c == '<' ||
382 c == '>' ||
383 c == '\'' ||
384 c == '\"';
385 });
386 std::string output(pos, it);
387 pos = it;
388 return implementation::denormalize(output);
389 }
390
posRow()391 size_t posRow() const //current row beginning with 0
392 {
393 const size_t crSum = std::count(stream_.begin(), pos, '\r'); //carriage returns
394 const size_t nlSum = std::count(stream_.begin(), pos, '\n'); //new lines
395 assert(crSum == 0 || nlSum == 0 || crSum == nlSum);
396 return std::max(crSum, nlSum); //be compatible with Linux/Mac/Win
397 }
398
posCol()399 size_t posCol() const //current col beginning with 0
400 {
401 //seek beginning of line
402 for (auto it = pos; it != stream_.begin(); )
403 {
404 --it;
405 if (*it == '\r' || *it == '\n')
406 return pos - it - 1;
407 }
408 return pos - stream_.begin();
409 }
410
411 private:
412 Scanner (const Scanner&) = delete;
413 Scanner& operator=(const Scanner&) = delete;
414
startsWith(const std::string & prefix)415 bool startsWith(const std::string& prefix) const
416 {
417 if (stream_.end() - pos < static_cast<ptrdiff_t>(prefix.size()))
418 return false;
419 return std::equal(prefix.begin(), prefix.end(), pos);
420 }
421
422 using TokenList = std::vector<std::pair<std::string, Token::Type>>;
423 const TokenList tokens
424 {
425 { "<?xml", Token::TK_DECL_BEGIN },
426 { "?>", Token::TK_DECL_END },
427 { "</", Token::TK_LESS_SLASH },
428 { "/>", Token::TK_SLASH_GREATER },
429 { "<" , Token::TK_LESS }, //evaluate after TK_DECL_BEGIN!
430 { ">" , Token::TK_GREATER },
431 { "=" , Token::TK_EQUAL },
432 { "\"", Token::TK_QUOTE },
433 { "\'", Token::TK_QUOTE },
434 };
435
436 const std::string xmlCommentBegin = "<!--";
437 const std::string xmlCommentEnd = "-->";
438
439 const std::string stream_;
440 std::string::const_iterator pos;
441 };
442
443
444 class XmlParser
445 {
446 public:
XmlParser(const std::string & stream)447 XmlParser(const std::string& stream) :
448 scn(stream),
449 tk(scn.nextToken()) {}
450
parse()451 XmlDoc parse() //throw XmlParsingError
452 {
453 XmlDoc doc;
454
455 //declaration (optional)
456 if (token().type == Token::TK_DECL_BEGIN)
457 {
458 nextToken();
459
460 while (token().type == Token::TK_NAME)
461 {
462 std::string attribName = token().name;
463 nextToken();
464
465 consumeToken(Token::TK_EQUAL);
466 expectToken(Token::TK_QUOTE);
467 std::string attribValue = scn.extractAttributeValue();
468 nextToken();
469
470 consumeToken(Token::TK_QUOTE);
471
472 if (attribName == "version")
473 doc.setVersion(attribValue);
474 else if (attribName == "encoding")
475 doc.setEncoding(attribValue);
476 else if (attribName == "standalone")
477 doc.setStandalone(attribValue);
478 }
479 consumeToken(Token::TK_DECL_END);
480 }
481
482 XmlElement dummy;
483 parseChildElements(dummy);
484
485 auto itPair = dummy.getChildren();
486 if (itPair.first != itPair.second)
487 doc.root().swapSubtree(*itPair.first);
488
489 expectToken(Token::TK_END);
490 return doc;
491 }
492
493 private:
494 XmlParser (const XmlParser&) = delete;
495 XmlParser& operator=(const XmlParser&) = delete;
496
parseChildElements(XmlElement & parent)497 void parseChildElements(XmlElement& parent)
498 {
499 while (token().type == Token::TK_LESS)
500 {
501 nextToken();
502
503 expectToken(Token::TK_NAME);
504 std::string elementName = token().name;
505 nextToken();
506
507 XmlElement& newElement = parent.addChild(elementName);
508
509 parseAttributes(newElement);
510
511 if (token().type == Token::TK_SLASH_GREATER) //empty element
512 {
513 nextToken();
514 continue;
515 }
516
517 expectToken(Token::TK_GREATER);
518 std::string elementValue = scn.extractElementValue();
519 nextToken();
520
521 //no support for mixed-mode content
522 if (token().type == Token::TK_LESS) //structured element
523 parseChildElements(newElement);
524 else //value element
525 newElement.setValue(elementValue);
526
527 consumeToken(Token::TK_LESS_SLASH);
528
529 if (token().type != Token::TK_NAME ||
530 elementName != token().name)
531 throw XmlParsingError(scn.posRow(), scn.posCol());
532 nextToken();
533
534 consumeToken(Token::TK_GREATER);
535 }
536 }
537
parseAttributes(XmlElement & element)538 void parseAttributes(XmlElement& element)
539 {
540 while (token().type == Token::TK_NAME)
541 {
542 std::string attribName = token().name;
543 nextToken();
544
545 consumeToken(Token::TK_EQUAL);
546 expectToken(Token::TK_QUOTE);
547 std::string attribValue = scn.extractAttributeValue();
548 nextToken();
549
550 consumeToken(Token::TK_QUOTE);
551 element.setAttribute(attribName, attribValue);
552 }
553 }
554
token()555 const Token& token() const { return tk; }
nextToken()556 void nextToken() { tk = scn.nextToken(); }
557
consumeToken(Token::Type t)558 void consumeToken(Token::Type t) //throw XmlParsingError
559 {
560 expectToken(t); //throw XmlParsingError
561 nextToken();
562 }
563
expectToken(Token::Type t)564 void expectToken(Token::Type t) //throw XmlParsingError
565 {
566 if (token().type != t)
567 throw XmlParsingError(scn.posRow(), scn.posCol());
568 }
569
570 Scanner scn;
571 Token tk;
572 };
573 }
574
575 inline
parse(const std::string & stream)576 XmlDoc parse(const std::string& stream) //throw XmlParsingError
577 {
578 return implementation::XmlParser(stream).parse(); //throw XmlParsingError
579 }
580 }
581
582 #endif //PARSER_H_81248670213764583021432
583