/* Copyright (C) 2016 J.F.Dockes
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *   (1) Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 *   (2) Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in
 *   the documentation and/or other materials provided with the
 *   distribution.
 *
 *   (3)The name of the author may not be used to
 *   endorse or promote products derived from this software without
 *   specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 **********************************************************/

#ifndef _PICOXML_H_INCLUDED_
#define _PICOXML_H_INCLUDED_

/**
 * PicoXMLParser: a single include file parser for an XML-like, but
 * restricted language, adequate for config files, not for arbitrary
 * externally generated data.
 *
 * - The code depends on nothing but the C++ standard library
 * - The input to the parser is a single c++ string. Does not deal with
 *   input in several pieces or files.
 * - SAX mode only. You have access to the tag stack. I've always
 *   found DOM mode less usable.
 * - Checks for proper tag nesting and not much else.
 * - ! No CDATA
 * - ! Attributes should really really not contain XML special chars.
 *
 * A typical input would be like the following (you can add XML
 * declarations, whitespace and newlines to taste).
 *
 * <top>top chrs1<sub attr="attrval">sub chrs</sub>top chrs2 <emptyelt /></top>
 *
 * Usage: subclass PicoXMLParser, overriding the methods in the
 * "protected:" section (look there for more details), call the
 * constructor with your input, then call parse().
 */

#include <string>
#include <vector>
#include <map>
#include <sstream>
#include <iostream>
#include <algorithm>

// Expat compat
typedef char XML_Char;

class PicoXMLParser {
public:
    /**
     * Build a parser for the given document.
     * The input is copied, so the caller's string (which may be a
     * temporary) does not need to outlive the parser.
     * @param input the complete document text.
     */
    PicoXMLParser(const std::string& input)
        : m_in(input), m_pos(0) {}

    virtual ~PicoXMLParser() {}

    /**
     * Parse the whole input, calling the handlers as elements and
     * character data are encountered.
     * @return true on success (including empty input), false on
     *   error, in which case getReason() describes the problem.
     */
    virtual bool parse() {
        return _parse();
    }
    /** Expatmm compat: same as parse() */
    virtual bool Parse() {
        return _parse();
    }

    /** @return the error text accumulated by the last parse, if any. */
    virtual std::string getReason() {
        return m_reason.str();
    }

protected:

    /* Methods to be overridden */

    /**
     * Tag open handler.
     * @param tagname the tag name
     * @param attrs a map of attribute name/value pairs
     */
    virtual void startElement(
        const std::string& /* nm */,
        const std::map<std::string, std::string>& /* attrs */) {}
    /** Expatmm compat. We don't support attributes with this at the
     *  moment: the second argument is always a null pointer. */
    virtual void StartElement(const XML_Char *, const XML_Char **) {}

    /**
     * Tag close handler.
     * You should probably have been accumulating text and stuff since
     * the tag opening.
     * @param tagname the tag name.
     */
    virtual void endElement(const std::string& /* nm */) {}
    /** Expatmm compat */
    virtual void EndElement(const XML_Char * /* nm */) {}

    /**
     * Non-tag data handler.
     * @param data the data.
     */
    virtual void characterData(const std::string& /*data*/) {}
    /** Expatmm compat */
    virtual void CharacterData(const XML_Char *, int) {}

    /**
     * Return current tag name stack. Deprecated, use m_path.
     * This does not include the current (bottom) tag.
     * Attributes are not kept in there, you'll have to do this yourself.
     * @return a const ref to a vector of tag names.
     */
    virtual const std::vector<std::string>& tagStack() {
        return m_tagstack;
    }

    /**
     * Current element stack, including the bottom one
     * Each entry includes the attributes and the starting character offset.
     * The stack includes the last element (the one open is called for).
     */
    class StackEl {
    public:
        StackEl(const std::string& nm) : name(nm) {}
        std::string name;
        // Value of the parse position when the element was pushed
        // (just past the '>' of its opening tag).
        std::string::size_type start_index{0};
        std::map<std::string,std::string> attributes;
        std::string data; // Derived class usage
    };
    std::vector<StackEl> m_path;

private:
    // Owned copy of the input: a reference member would dangle when the
    // parser is constructed from a temporary string.
    const std::string m_in;
    // Current parse offset in m_in, or npos once a search ran off the end.
    std::string::size_type m_pos{0};
    std::stringstream m_reason;
    std::vector<std::string> m_tagstack;

    /** Characters treated as whitespace inside tags and before the root. */
    static const char *whitespace() {
        return " \t\n\r";
    }

    /** Offset used in attribute error messages. Never adds npos to m_pos
     *  (which would wrap around and print a meaningless value). */
    std::string::size_type errpos(std::string::size_type off) const {
        return off == std::string::npos ? m_pos : m_pos + off;
    }

    /**
     * Record an element opening and call the start handlers.
     * The handlers run before the name is pushed on the compat tag
     * stack, so tagStack() does not include the element being opened.
     * @param empty true for <elt/>: the end handlers are called at once.
     */
    void _startelem(const std::string& tagname,
                    const std::map<std::string, std::string>& attrs,
                    bool empty)
    {
        m_path.push_back(StackEl(tagname));
        StackEl& lastelt = m_path.back();
        lastelt.start_index = m_pos;
        lastelt.attributes = attrs;

        startElement(tagname, attrs);
        StartElement(tagname.c_str(), nullptr);

        m_tagstack.push_back(tagname); // Compat
        if (empty) {
            _endelem(tagname);
        }
    }

    /** Call the end handlers and pop the element from both stacks.
     *  m_path still holds the element while the handlers run. */
    void _endelem(const std::string& tagname)
    {
        m_tagstack.pop_back();
        endElement(tagname);
        EndElement(tagname.c_str());
        m_path.pop_back();
    }

    /** Main loop: one iteration per tag, followed by the character
     *  data up to the next '<'. */
    bool _parse() {
        // Start from a clean state so that parse() can be called again.
        m_pos = 0;
        m_reason.str(std::string());
        m_reason.clear();
        m_tagstack.clear();
        m_path.clear();

        // skip initial whitespace and XML decl. On success, returns with
        // current pos on first tag '<'
        if (!skipDecl()) {
            return false;
        }
        if (nomore()) {
            // empty file
            return true;
        }

        for (;;) {
            // Current char is '<' and the next char is not '?'
            // skipComment also processes the character data following
            // the comment(s).
            if (!skipComment()) {
                return false;
            }
            if (nomore()) {
                if (!m_tagstack.empty()) {
                    m_reason << "EOF hit inside open element";
                    return false;
                }
                return true;
            }
            m_pos++;
            if (nomore()) {
                m_reason << "EOF within tag";
                return false;
            }
            // spos: first char after '<'
            const std::string::size_type spos = m_pos;
            const int isendtag = m_in[m_pos] == '/' ? 1 : 0;

            // On success m_pos is just past the closing '>'
            skipStr(">");
            if (m_pos == std::string::npos || m_pos <= spos + 1) {
                m_reason << "Empty tag or EOF inside tag. pos " << spos;
                return false;
            }

            // m_pos >= spos + 2 here, so m_pos - 2 is inside the tag
            const int emptyel = m_in[m_pos-2] == '/' ? 1 : 0;
            if (emptyel && isendtag) {
                m_reason << "Bad tag </xx/> at cpos " << spos;
                return false;
            }

            // Tag contents without the leading/trailing '/' and the '>'
            std::string tag =
                m_in.substr(spos + isendtag,
                            m_pos - (spos + 1 + isendtag + emptyel));
            trimtag(tag);
            std::map<std::string, std::string> attrs;
            if (!parseattrs(tag, attrs)) {
                return false;
            }
            if (isendtag) {
                if (m_tagstack.empty() || tag.compare(m_tagstack.back())) {
                    m_reason << "Closing not open tag " << tag <<
                        " at cpos " << m_pos;
                    return false;
                }
                _endelem(tag);
            } else {
                _startelem(tag, attrs, emptyel != 0);
            }
            if (!_chardata()) {
                return false;
            }
        }
    }

    /**
     * Report the character data between the current position and the
     * next '<', after entity decoding. Leaves m_pos on the '<', or at
     * npos if there is none (any trailing text is then not reported).
     */
    bool _chardata() {
        const std::string::size_type spos = m_pos;
        m_pos = m_in.find('<', m_pos);
        if (nomore()) {
            return true;
        }
        if (m_pos != spos) {
            const std::string data{unQuote(m_in.substr(spos, m_pos - spos))};
            characterData(data);
            CharacterData(data.c_str(), static_cast<int>(data.size()));
        }
        return true;
    }

    /** @return true if fewer than sz+1 characters remain at m_pos. */
    bool nomore(int sz = 0) const {
        if (m_pos == std::string::npos) {
            return true;
        }
        const std::string::size_type need =
            static_cast<std::string::size_type>(sz);
        // Test need first: size() - need would wrap if need > size()
        if (need >= m_in.size()) {
            return true;
        }
        return m_pos >= m_in.size() - need;
    }

    /** Move pos to the next non-whitespace character of in.
     *  @return false (pos set to npos) if there is none. */
    bool skipWS(const std::string& in, std::string::size_type& pos) {
        if (pos == std::string::npos)
            return false;
        pos = in.find_first_not_of(whitespace(), pos);
        return pos != std::string::npos;
    }

    /** Move m_pos just past the next occurrence of str.
     *  @return false (m_pos set to npos) if str is not found. */
    bool skipStr(const std::string& str) {
        if (m_pos == std::string::npos)
            return false;
        m_pos = m_in.find(str, m_pos);
        if (m_pos != std::string::npos)
            m_pos += str.size();
        return m_pos != std::string::npos;
    }

    /** @return the character at offset 1+sz after the current one,
     *  or -1 if this would be past the end of the input. */
    int peek(int sz = 0) const {
        if (nomore(sz))
            return -1;
        const std::string::size_type idx = m_pos + 1 + sz;
        if (idx >= m_in.size())
            return -1;
        return m_in[idx];
    }

    /** Remove trailing whitespace. An all-whitespace string is left
     *  unchanged. */
    void trimtag(std::string& tagname) {
        const std::string::size_type trimpos =
            tagname.find_last_not_of(whitespace());
        if (trimpos != std::string::npos) {
            tagname.erase(trimpos + 1);
        }
    }

    /**
     * Skip initial whitespace and any number of <?...?> sections.
     * @return true with m_pos on the first '<' not followed by '?', or
     *   with m_pos == npos if the input holds nothing else. false if
     *   the first non-whitespace character is not '<' or if a <? section
     *   is not closed.
     */
    bool skipDecl() {
        for (;;) {
            if (!skipWS(m_in, m_pos)) {
                // Nothing but whitespace/declarations: not an error, so
                // no reason is recorded.
                return true;
            }
            if (m_in[m_pos] != '<') {
                m_reason << "EOF file does not begin with decl/tag: m_pos " <<
                    m_pos << " char [" << m_in[m_pos] << "]\n";
                return false;
            }
            if (peek() != '?') {
                break;
            }
            if (!skipStr("?>")) {
                m_reason << "EOF while looking for end of xml decl";
                return false;
            }
        }
        return true;
    }

    /**
     * Skip all the comments found at the current position, reporting
     * the character data which follows each of them. Must be called
     * with m_pos on a '<' or at end of input.
     * @return true with m_pos on the '<' of a non-comment tag or at
     *   end of input, false for an unterminated comment.
     */
    bool skipComment() {
        // Loop: several comments may follow each other. Without this, the
        // second one would be parsed as a tag and rejected.
        while (!nomore()) {
            if (m_in[m_pos] != '<') {
                m_reason << "Internal error: skipComment called with wrong "
                    "start: m_pos " <<
                    m_pos << " char [" << m_in[m_pos] << "]\n";
                return false;
            }
            if (peek() != '!' || peek(1) != '-' || peek(2) != '-') {
                // Regular tag
                return true;
            }
            if (!skipStr("-->")) {
                m_reason << "EOF while looking for end of XML comment";
                return false;
            }
            // Process possible characters until next tag
            if (!_chardata()) {
                return false;
            }
        }
        return true;
    }

    /**
     * Split the contents of a tag into element name and attributes.
     * @param tag in: tag text with the surrounding markup removed and
     *   trailing whitespace trimmed. out: the element name alone
     *   (only modified if the text contains whitespace).
     * @param attrs out: attribute name/value pairs. Values are stored
     *   as found, without entity decoding.
     * @return false on syntax error (a reason is recorded).
     */
    bool parseattrs(std::string& tag,
                    std::map<std::string, std::string>& attrs) {
        attrs.clear();
        std::string::size_type spos = tag.find_first_of(whitespace());
        if (spos == std::string::npos) {
            // Name only, no attributes
            return true;
        }
        const std::string tagname = tag.substr(0, spos);
        skipWS(tag, spos);

        for (;;) {
            // spos is on the first character of an attribute name
            std::string::size_type epos =
                tag.find_first_of(" \t\n\r=", spos);
            if (epos == std::string::npos) {
                m_reason << "Bad attributes syntax at cpos " << errpos(epos);
                return false;
            }
            const std::string attrnm = tag.substr(spos, epos - spos);
            if (attrnm.empty()) {
                m_reason << "Empty attribute name ?? at cpos " <<
                    errpos(epos);
                return false;
            }
            skipWS(tag, epos);
            if (epos == std::string::npos || epos == tag.size() - 1 ||
                tag[epos] != '=') {
                m_reason << "Missing equal sign or value at cpos " <<
                    errpos(epos);
                return false;
            }
            ++epos;
            skipWS(tag, epos);
            if (epos == std::string::npos ||
                (tag[epos] != '"' && tag[epos] != '\'') ||
                epos == tag.size() - 1) {
                m_reason << "Missing quote or value at cpos " << errpos(epos);
                return false;
            }
            // The value extends to the next occurrence of the opening
            // quote character
            const char qc = tag[epos];
            spos = epos + 1;
            epos = tag.find(qc, spos);
            if (epos == std::string::npos) {
                m_reason << "Missing closing quote at cpos " << errpos(spos);
                return false;
            }
            attrs[attrnm] = tag.substr(spos, epos - spos);
            if (epos == tag.size() - 1) {
                break;
            }
            ++epos;
            skipWS(tag, epos);
            // NOTE(review): a single stray character after the last value
            // is silently accepted here. Kept for compatibility.
            if (epos == tag.size() - 1) {
                break;
            }
            spos = epos;
        }
        tag = tagname;
        return true;
    }

    /**
     * Replace the five predefined entities (&quot; &amp; &apos; &lt;
     * &gt;) with the characters they stand for. Anything else beginning
     * with '&' (unknown entity names, character references, a lone '&')
     * is copied unchanged instead of being dropped.
     * @param s the raw character data.
     * @return the decoded text.
     */
    std::string unQuote(const std::string &s) {
        static const std::string e_quot{"quot"};
        static const std::string e_amp{"amp"};
        static const std::string e_apos{"apos"};
        static const std::string e_lt{"lt"};
        static const std::string e_gt{"gt"};

        std::string out;
        out.reserve(s.size());
        std::string::size_type pos = 0;
        while (pos < s.size()) {
            const std::string::size_type amp = s.find('&', pos);
            if (amp == std::string::npos) {
                out.append(s, pos, std::string::npos);
                break;
            }
            out.append(s, pos, amp - pos);
            // An entity ends at the first ';'. Finding another '&' first,
            // or nothing, means that this '&' is just text.
            const std::string::size_type end = s.find_first_of("&;", amp + 1);
            if (end == std::string::npos || s[end] == '&') {
                out += '&';
                pos = amp + 1;
                continue;
            }
            const std::string code = s.substr(amp + 1, end - amp - 1);
            if (code == e_quot) {
                out += '"';
            } else if (code == e_amp) {
                out += '&';
            } else if (code == e_apos) {
                out += '\'';
            } else if (code == e_lt) {
                out += '<';
            } else if (code == e_gt) {
                out += '>';
            } else {
                // Unknown: keep the text as is
                out.append(s, amp, end - amp + 1);
            }
            pos = end + 1;
        }
        return out;
    }
};
#endif /* _PICOXML_H_INCLUDED_ */