1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ 2 /* 3 * This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. 6 */ 7 8 #ifndef INCLUDED_ORCUS_SAX_PARSER_BASE_HPP 9 #define INCLUDED_ORCUS_SAX_PARSER_BASE_HPP 10 11 #include "env.hpp" 12 #include "pstring.hpp" 13 #include "cell_buffer.hpp" 14 #include "parser_global.hpp" 15 #include "parser_base.hpp" 16 17 #include <cassert> 18 #include <cstdlib> 19 #include <exception> 20 #include <sstream> 21 #include <memory> 22 23 #define ORCUS_DEBUG_SAX_PARSER 0 24 25 #if ORCUS_DEBUG_SAX_PARSER 26 #include <iostream> 27 using std::cout; 28 using std::endl; 29 #endif 30 31 namespace orcus { namespace sax { 32 33 class ORCUS_PSR_DLLPUBLIC malformed_xml_error : public ::orcus::parse_error 34 { 35 public: 36 malformed_xml_error() = delete; 37 malformed_xml_error(const std::string& msg, std::ptrdiff_t offset); 38 virtual ~malformed_xml_error() throw(); 39 }; 40 41 /** 42 * Document type declaration passed by sax_parser to its handler's doctype() 43 * call. 44 */ 45 struct doctype_declaration 46 { 47 enum class keyword_type { dtd_public, dtd_private }; 48 49 keyword_type keyword; 50 pstring root_element; 51 pstring fpi; 52 pstring uri; 53 }; 54 55 /** 56 * Given an encoded name (such as 'quot' and 'amp'), return a single 57 * character that corresponds with the name. The name shouldn't include the 58 * leading '&' and trailing ';'. 59 * 60 * @param p pointer to the first character of encoded name 61 * @param n length of encoded name 62 * 63 * @return single character that corresponds with the encoded name. '\0' is 64 * returned if decoding fails. 65 */ 66 ORCUS_PSR_DLLPUBLIC char decode_xml_encoded_char(const char* p, size_t n); 67 68 /** 69 * Given an encoded unicode value (such as #20A9), return a UTF-8 string 70 * that corresponds with the unicode value. The value shouldn't include the 71 * leading '&' and trailing ';'. 72 * 73 * @param p pointer to the first character of encoded name 74 * @param n length of encoded name 75 * 76 * @return string that corresponds with the encoded value. An empty string 77 * is returned if decoding fails. 78 */ 79 ORCUS_PSR_DLLPUBLIC std::string decode_xml_unicode_char(const char* p, size_t n); 80 81 /** 82 * Element properties passed by sax_parser to its handler's open_element() 83 * and close_element() calls. 84 */ 85 struct parser_element 86 { 87 pstring ns; // element namespace (optional) 88 pstring name; // element name 89 std::ptrdiff_t begin_pos; // position of the opening brace '<'. 90 std::ptrdiff_t end_pos; // position of the char after the closing brace '>'. 91 }; 92 93 /** 94 * Attribute properties passed by sax_parser to its handler's attribute() 95 * call. When an attribute value is transient, it has been converted due to 96 * presence of encoded character(s) and stored in a temporary buffer. The 97 * handler must assume that the value will not survive beyond the scope of 98 * the callback. 99 */ 100 struct parser_attribute 101 { 102 pstring ns; // attribute namespace (optional) 103 pstring name; // attribute name 104 pstring value; // attribute value 105 bool transient; // whether or not the attribute value is on a temporary buffer. 106 }; 107 108 class ORCUS_PSR_DLLPUBLIC parser_base : public ::orcus::parser_base 109 { 110 struct impl; 111 std::unique_ptr<impl> mp_impl; 112 113 parser_base() = delete; 114 parser_base(const parser_base&) = delete; 115 parser_base& operator=(const parser_base&) = delete; 116 protected: 117 size_t m_nest_level; 118 size_t m_buffer_pos; 119 bool m_root_elem_open:1; 120 121 protected: 122 parser_base(const char* content, size_t size, bool transient_stream); 123 ~parser_base(); 124 next_check()125 void next_check() 126 { 127 next(); 128 if (!has_char()) 129 throw malformed_xml_error("xml stream ended prematurely.", offset()); 130 } 131 nest_up()132 void nest_up() { ++m_nest_level; } nest_down()133 void nest_down() 134 { 135 if (m_nest_level == 0) 136 throw malformed_xml_error("incorrect nesting in xml stream", offset()); 137 138 --m_nest_level; 139 } 140 141 void inc_buffer_pos(); reset_buffer_pos()142 void reset_buffer_pos() { m_buffer_pos = 0; } 143 has_char_throw(const char * msg) const144 void has_char_throw(const char* msg) const 145 { 146 if (!has_char()) 147 throw malformed_xml_error(msg, offset()); 148 } 149 150 /** 151 * Determine the number of remaining characters <strong>including</strong> 152 * the current character. 153 * 154 * 155 * @return number of remaining characters including the current character. 156 */ remains() const157 inline size_t remains() const 158 { 159 #if ORCUS_DEBUG_SAX_PARSER 160 if (mp_char >= mp_end) 161 throw malformed_xml_error("xml stream ended prematurely.", offset()); 162 #endif 163 return mp_end - mp_char; 164 } 165 cur_char_checked() const166 char cur_char_checked() const 167 { 168 if (!has_char()) 169 throw malformed_xml_error("xml stream ended prematurely.", offset()); 170 171 return *mp_char; 172 } 173 next_and_char()174 char next_and_char() 175 { 176 next(); 177 #if ORCUS_DEBUG_SAX_PARSER 178 if (mp_char >= mp_end) 179 throw malformed_xml_error("xml stream ended prematurely.", offset()); 180 #endif 181 return *mp_char; 182 } 183 next_char_checked()184 char next_char_checked() 185 { 186 next(); 187 if (!has_char()) 188 throw malformed_xml_error("xml stream ended prematurely.", offset()); 189 190 return *mp_char; 191 } 192 193 cell_buffer& get_cell_buffer(); 194 195 void comment(); 196 197 /** 198 * Skip an optional byte order mark at the begining of the xml stream. 199 */ 200 void skip_bom(); 201 202 void expects_next(const char* p, size_t n); 203 204 void parse_encoded_char(cell_buffer& buf); 205 void value_with_encoded_char(cell_buffer& buf, pstring& str, char quote_char); 206 207 /** 208 * Parse quoted value. Note that the retrieved string may be stored in 209 * the temporary cell buffer if the decode parameter is true. Use the 210 * string immediately after this call before the buffer becomes invalid. 211 * 212 * @return true if the value is stored in temporary buffer, false 213 * otherwise. 214 */ 215 bool value(pstring& str, bool decode); 216 217 void name(pstring& str); 218 void element_name(parser_element& elem, std::ptrdiff_t begin_pos); 219 void attribute_name(pstring& attr_ns, pstring& attr_name); 220 void characters_with_encoded_char(cell_buffer& buf); 221 }; 222 223 }} 224 225 #endif 226 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ 227