1 /* Copyright (C) 2016 J.F.Dockes
2  *
3  * Redistribution and use in source and binary forms, with or without
4  * modification, are permitted provided that the following conditions are
5  * met:
6  *
7  *     (1) Redistributions of source code must retain the above copyright
8  *     notice, this list of conditions and the following disclaimer.
9  *
10  *     (2) Redistributions in binary form must reproduce the above copyright
11  *     notice, this list of conditions and the following disclaimer in
12  *     the documentation and/or other materials provided with the
13  *     distribution.
14  *
15  *     (3)The name of the author may not be used to
16  *     endorse or promote products derived from this software without
17  *     specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22  * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
23  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
27  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
28  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30 **********************************************************/
31 
32 #ifndef _PICOXML_H_INCLUDED_
33 #define _PICOXML_H_INCLUDED_
34 
35 /**
36  * PicoXMLParser: a single include file parser for an XML-like, but
37  * restricted language, adequate for config files, not for arbitrary
38  * externally generated data.
39  *
40  *  - The code depends on nothing but the C++ standard library
41  *  - The input to the parser is a single c++ string. Does not deal with
42  *    input in several pieces or files.
43  *  - SAX mode only. You have access to the tag stack. I've always
44  *    found DOM mode less usable.
45  *  - Checks for proper tag nesting and not much else.
46  *  - ! No CDATA
47  *  - ! Attributes should really really not contain XML special chars.
48  *
49  * A typical input would be like the following (you can add XML
50  * declarations, whitespace and newlines to taste).
51  *
52  * <top>top chrs1<sub attr="attrval">sub chrs</sub>top chrs2 <emptyelt /></top>
53  *
54  * Usage: subclass PicoXMLParser, overriding the methods in the
55  *  "protected:" section (look there for more details), call the
56  * constructor with your input, then call parse().
57  */
58 
59 #include <string>
60 #include <vector>
61 #include <map>
62 #include <sstream>
63 #include <iostream>
64 #include <algorithm>
65 
// Expat compat
typedef char XML_Char;

/**
 * Minimal SAX-style parser for a restricted XML-like language.
 *
 * The whole document is supplied to the constructor as one string. parse()
 * then walks it once, calling the virtual handlers declared in the
 * "protected:" section for each element start, element end and run of
 * character data. Only tag nesting is verified. CDATA sections, DOCTYPE
 * declarations and processing instructions inside the document body are not
 * supported, and attribute values are returned verbatim (entities are not
 * decoded in them, and a '>' inside a value ends the tag).
 */
class PicoXMLParser {
public:
    /**
     * @param input the complete document text. The text is copied, so the
     *     parser stays valid even when it is built from a temporary string.
     */
    PicoXMLParser(const std::string& input)
        : m_in(input), m_pos(0) {}

    virtual ~PicoXMLParser() {}

    /**
     * Parse the document, invoking the handlers as elements are met.
     * @return true on success (an empty or whitespace-only input is a
     *     success), false on error, in which case getReason() tells more.
     */
    virtual bool parse() {
        return _parse();
    }
    /** Expatmm compat: same as parse() */
    virtual bool Parse() {
        return _parse();
    }

    /** @return the text accumulated while diagnosing parse errors. */
    virtual std::string getReason() {
        return m_reason.str();
    }

protected:

    /* Methods to be overridden */

    /**
     * Tag open handler.
     * @param tagname the tag name
     * @param attrs a map of attribute name/value pairs
     */
    virtual void startElement(
        const std::string& /* nm */,
        const std::map<std::string, std::string>& /* attrs */) {}
    /**
     * Expatmm compat. We don't support attributes with this at the moment:
     * the second argument is always a null pointer.
     */
    virtual void StartElement(const XML_Char *, const XML_Char **) {}

    /**
     * Tag close handler.
     * You should probably have been accumulating text and stuff since
     * the tag opening.
     * @param tagname the tag name.
     */
    virtual void endElement(const std::string& /* nm */) {}
    /** Expatmm compat */
    virtual void EndElement(const XML_Char * /* nm */) {}

    /**
     * Non-tag data handler. Called with the entity-decoded text found
     * between two tags (whitespace-only runs included).
     * @param data the data.
     */
    virtual void characterData(const std::string& /*data*/) {}
    /** Expatmm compat: receives the same text as a pointer and a length */
    virtual void CharacterData(const XML_Char *, int) {}

    /**
     * Return current tag name stack. Deprecated, use m_path.
     * While startElement()/endElement() run, the element being opened or
     * closed is not on this stack.
     * Attributes are not kept in there, you'll have to do this yourself.
     * @return a const ref to a vector of tag names.
     */
    virtual const std::vector<std::string>& tagStack() {
        return m_tagstack;
    }

    /**
     * Current element stack entry.
     * Unlike tagStack(), m_path includes the element for which
     * startElement()/endElement() is being called.
     */
    class StackEl {
    public:
        StackEl(const std::string& nm) : name(nm) {}
        /** Element name */
        std::string name;
        /** Input offset just past the '>' closing the opening tag */
        std::string::size_type start_index{0};
        /** Attributes found in the opening tag */
        std::map<std::string,std::string> attributes;
        /** Free for derived class usage, never touched by the parser */
        std::string data;
    };
    std::vector<StackEl> m_path;

private:
    // Own copy of the input: holding a reference would dangle when the
    // parser is constructed from a temporary.
    const std::string m_in;
    // Current parse offset, std::string::npos once the input is exhausted.
    std::string::size_type m_pos{0};
    std::stringstream m_reason;
    std::vector<std::string> m_tagstack;

    /**
     * Record an element opening and run the start handlers.
     * @param empty true for a self-closed element (<x/>): the end handlers
     *     are then run immediately.
     */
    void _startelem(const std::string& tagname,
                    const std::map<std::string, std::string>& attrs, bool empty)
    {
        m_path.push_back(StackEl(tagname));
        StackEl& lastelt = m_path.back();
        lastelt.start_index = m_pos;
        lastelt.attributes = attrs;

        startElement(tagname, attrs);
        StartElement(tagname.c_str(), nullptr);

        // Compat: pushed only after the handlers ran, so that tagStack()
        // does not show the element being opened.
        m_tagstack.push_back(tagname);
        if (empty) {
            _endelem(tagname);
        }
    }

    /** Run the end handlers and pop the element from both stacks. */
    void _endelem(const std::string& tagname)
    {
        m_tagstack.pop_back();
        endElement(tagname);
        EndElement(tagname.c_str());
        m_path.pop_back();
    }

    /** Main loop: one iteration per tag, followed by its character data. */
    bool _parse() {
        // skip initial whitespace and XML decl. On success, returns with
        // current pos on first tag '<'
        if (!skipDecl()) {
            return false;
        }
        if (nomore()) {
            // empty file
            return true;
        }

        for (;;) {
            // Current char is '<' and the next char is not '?'.
            // skipComment also processes the character data which may
            // follow the comments.
            if (!skipComment()) {
                return false;
            }
            if (nomore()) {
                if (!m_tagstack.empty()) {
                    m_reason << "EOF hit inside open element";
                    return false;
                }
                return true;
            }
            m_pos++;
            if (nomore()) {
                m_reason << "EOF within tag";
                return false;
            }
            // spos: first character after the '<'
            const std::string::size_type spos = m_pos;
            const int isendtag = m_in[m_pos] == '/' ? 1 : 0;

            // Move just past the closing '>'
            skipStr(">");
            if (m_pos == std::string::npos || m_pos <= spos + 1) {
                m_reason << "Empty tag or EOF inside tag. pos " << spos;
                return false;
            }

            // Character right before the '>'
            const int emptyel = m_in[m_pos-2] == '/' ? 1 : 0;
            if (emptyel && isendtag) {
                m_reason << "Bad tag </xx/> at cpos " << spos;
                return false;
            }

            // Tag contents minus the leading '/' of an end tag and the
            // trailing '/' of an empty element.
            std::string tag =
                m_in.substr(spos + isendtag,
                            m_pos - (spos + 1 + isendtag + emptyel));
            trimtag(tag);
            std::map<std::string, std::string> attrs;
            if (!parseattrs(tag, attrs)) {
                return false;
            }
            if (isendtag) {
                if (m_tagstack.empty() || tag.compare(m_tagstack.back())) {
                    m_reason << "Closing not open tag " << tag <<
                        " at cpos " << m_pos;
                    return false;
                }
                _endelem(tag);
            } else {
                _startelem(tag, attrs, emptyel != 0);
            }
            if (!_chardata()) {
                return false;
            }
        }
    }

    /**
     * Deliver the text between the current position and the next '<' to
     * the character data handlers, and move to that '<'. Text with no '<'
     * after it (trailing data at end of input) is not delivered.
     */
    bool _chardata() {
        const std::string::size_type spos = m_pos;
        m_pos = m_in.find('<', m_pos);
        if (nomore()) {
            return true;
        }
        if (m_pos != spos) {
            const std::string data{unQuote(m_in.substr(spos, m_pos - spos))};
            characterData(data);
            CharacterData(data.c_str(), static_cast<int>(data.size()));
        }
        return true;
    }

    /** @return true if fewer than sz+1 characters remain at m_pos. */
    bool nomore(int sz = 0) const {
        return m_pos == std::string::npos ||
            m_pos + static_cast<std::string::size_type>(sz) >= m_in.size();
    }
    /**
     * Move pos to the next non-whitespace character of in.
     * @return false if there is none (pos is then npos).
     */
    bool skipWS(const std::string& in, std::string::size_type& pos) {
        if (pos == std::string::npos)
            return false;
        pos = in.find_first_not_of(" \t\n\r", pos);
        return pos != std::string::npos;
    }
    /**
     * Move m_pos just past the next occurrence of str.
     * @return false if str is not found (m_pos is then npos).
     */
    bool skipStr(const std::string& str) {
        if (m_pos == std::string::npos)
            return false;
        m_pos = m_in.find(str, m_pos);
        if (m_pos != std::string::npos)
            m_pos += str.size();
        return m_pos != std::string::npos;
    }
    /**
     * @return the character found sz+1 positions after the current one,
     *     or -1 if that position is beyond the end of the input.
     */
    int peek(int sz = 0) const {
        if (m_pos == std::string::npos)
            return -1;
        const std::string::size_type idx =
            m_pos + 1 + static_cast<std::string::size_type>(sz);
        if (idx >= m_in.size())
            return -1;
        return m_in[idx];
    }
    /** Remove trailing whitespace. An all-whitespace value is left alone */
    void trimtag(std::string& tagname) {
        std::string::size_type trimpos = tagname.find_last_not_of(" \t\n\r");
        if (trimpos != std::string::npos) {
            tagname = tagname.substr(0, trimpos+1);
        }
    }

    /**
     * Skip leading whitespace and any number of <? ... ?> declarations.
     * @return true with m_pos on the first '<' which does not begin a
     *     declaration, or with m_pos at npos if the input holds nothing
     *     but whitespace. false on error.
     */
    bool skipDecl() {
        for (;;) {
            if (!skipWS(m_in, m_pos)) {
                // Nothing but whitespace: an empty document, not an
                // error, so no reason is recorded.
                return true;
            }
            if (m_in[m_pos] != '<') {
                m_reason << "EOF file does not begin with decl/tag: m_pos " <<
                    m_pos << " char [" << m_in[m_pos] << "]\n";
                return false;
            }
            if (peek() == '?') {
                if (!skipStr("?>")) {
                    m_reason << "EOF while looking for end of xml decl";
                    return false;
                }
            } else {
                break;
            }
        }
        return true;
    }

    /**
     * Skip all comments beginning at the current position. The character
     * data found after each comment is delivered to the handlers.
     * @return true with m_pos on the '<' of a non-comment tag or at end
     *     of input, false on error.
     */
    bool skipComment() {
        // Loop: several comments may follow each other before the next tag.
        while (!nomore()) {
            if (m_in[m_pos] != '<') {
                m_reason << "Internal error: skipComment called with wrong "
                    "start: m_pos " <<
                    m_pos << " char [" << m_in[m_pos] << "]\n";
                return false;
            }
            if (!(peek() == '!' && peek(1) == '-' && peek(2) == '-')) {
                return true;
            }
            // Search for the terminator after the "<!--" opener, else the
            // dashes of "<!-->" would be taken for a complete comment.
            m_pos += 4;
            if (!skipStr("-->")) {
                m_reason << "EOF while looking for end of XML comment";
                return false;
            }
            // Process possible characters until next tag
            if (!_chardata()) {
                return false;
            }
        }
        return true;
    }

    /**
     * Split the contents of a tag into element name and attributes.
     * @param tag on input the tag text, without the angle brackets and
     *     slashes, and with trailing whitespace removed. On successful
     *     output, the element name only.
     * @param attrs output attribute name/value pairs. Values are the raw
     *     text found between the quotes (single or double).
     * @return false on syntax error.
     */
    bool parseattrs(std::string& tag,
                    std::map<std::string, std::string>& attrs) {
        attrs.clear();
        std::string::size_type spos = tag.find_first_of(" \t\n\r");
        if (spos == std::string::npos)
            return true;
        const std::string tagname = tag.substr(0, spos);
        skipWS(tag, spos);

        for (;;) {
            // spos is on the first character of an attribute name
            std::string::size_type epos = tag.find_first_of(" \t\n\r=", spos);
            if (epos == std::string::npos) {
                // Reported position is the one of the tag closing '>'
                m_reason << "Bad attributes syntax at cpos " << m_pos - 1;
                return false;
            }
            const std::string attrnm = tag.substr(spos, epos - spos);
            if (attrnm.empty()) {
                m_reason << "Empty attribute name ?? at cpos " << m_pos + epos;
                return false;
            }
            skipWS(tag, epos);
            if (epos == std::string::npos || epos == tag.size() - 1 ||
                tag[epos] != '=') {
                m_reason <<"Missing equal sign or value at cpos " << m_pos+epos;
                return false;
            }
            epos++;
            skipWS(tag, epos);
            // Check the position before looking at the character.
            if (epos == std::string::npos || epos == tag.size() - 1 ||
                (tag[epos] != '"' && tag[epos] != '\'')) {
                m_reason << "Missing quote or value at cpos " << m_pos+epos;
                return false;
            }
            const char qc = tag[epos];
            spos = epos + 1;
            epos = tag.find(qc, spos);
            if (epos == std::string::npos) {
                m_reason << "Missing closing quote at cpos " << m_pos+spos;
                return false;
            }
            attrs[attrnm] = tag.substr(spos, epos - spos);
            // Move past the closing quote. Anything left must be another
            // attribute: a stray token, even a single character, is
            // diagnosed at the top of the loop.
            epos++;
            if (epos >= tag.size()) {
                break;
            }
            skipWS(tag, epos);
            if (epos == std::string::npos) {
                break;
            }
            spos = epos;
        }
        tag = tagname;
        return true;
    }

    /**
     * Decode the five predefined entities (&quot; &amp; &apos; &lt; &gt;).
     * Anything else, including an '&' which does not begin one of these,
     * is copied unchanged so that no input text is lost.
     */
    std::string unQuote(const std::string &s) {
        std::string out;
        out.reserve(s.size());
        std::string::size_type pos = 0;
        while (pos < s.size()) {
            const std::string::size_type amp = s.find('&', pos);
            if (amp == std::string::npos) {
                out.append(s, pos, std::string::npos);
                break;
            }
            out.append(s, pos, amp - pos);

            char decoded = 0;
            const std::string::size_type semi = s.find(';', amp + 1);
            if (semi != std::string::npos) {
                const std::string code = s.substr(amp + 1, semi - amp - 1);
                if (code == "quot") {
                    decoded = '"';
                } else if (code == "amp") {
                    decoded = '&';
                } else if (code == "apos") {
                    decoded = '\'';
                } else if (code == "lt") {
                    decoded = '<';
                } else if (code == "gt") {
                    decoded = '>';
                }
            }
            if (decoded) {
                out += decoded;
                pos = semi + 1;
            } else {
                // Not a known entity: keep the '&' and rescan what
                // follows, which may still hold valid entities.
                out += '&';
                pos = amp + 1;
            }
        }
        return out;
    }
};
442 #endif /* _PICOXML_H_INCLUDED_ */
443