1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6  */
7 
8 #ifndef INCLUDED_ORCUS_SAX_PARSER_BASE_HPP
9 #define INCLUDED_ORCUS_SAX_PARSER_BASE_HPP
10 
11 #include "env.hpp"
12 #include "pstring.hpp"
13 #include "cell_buffer.hpp"
14 #include "parser_global.hpp"
15 #include "parser_base.hpp"
16 
17 #include <cassert>
18 #include <cstdlib>
19 #include <exception>
20 #include <sstream>
21 #include <memory>
22 
23 #define ORCUS_DEBUG_SAX_PARSER 0
24 
25 #if ORCUS_DEBUG_SAX_PARSER
26 #include <iostream>
27 using std::cout;
28 using std::endl;
29 #endif
30 
31 namespace orcus { namespace sax {
32 
33 class ORCUS_PSR_DLLPUBLIC malformed_xml_error : public ::orcus::parse_error
34 {
35 public:
36     malformed_xml_error() = delete;
37     malformed_xml_error(const std::string& msg, std::ptrdiff_t offset);
38     virtual ~malformed_xml_error() throw();
39 };
40 
41 /**
42  * Document type declaration passed by sax_parser to its handler's doctype()
43  * call.
44  */
45 struct doctype_declaration
46 {
47     enum class keyword_type { dtd_public, dtd_private };
48 
49     keyword_type keyword;
50     pstring root_element;
51     pstring fpi;
52     pstring uri;
53 };
54 
55 /**
56  * Given an encoded name (such as 'quot' and 'amp'), return a single
57  * character that corresponds with the name.  The name shouldn't include the
58  * leading '&' and trailing ';'.
59  *
60  * @param p pointer to the first character of encoded name
61  * @param n length of encoded name
62  *
63  * @return single character that corresponds with the encoded name.  '\0' is
64  *         returned if decoding fails.
65  */
66 ORCUS_PSR_DLLPUBLIC char decode_xml_encoded_char(const char* p, size_t n);
67 
68 /**
69  * Given an encoded unicode value (such as #20A9), return a UTF-8 string
70  * that corresponds with the unicode value.  The value shouldn't include the
71  * leading '&' and trailing ';'.
72  *
73  * @param p pointer to the first character of encoded name
74  * @param n length of encoded name
75  *
76  * @return string that corresponds with the encoded value.  An empty string
77  *         is returned if decoding fails.
78  */
79 ORCUS_PSR_DLLPUBLIC std::string decode_xml_unicode_char(const char* p, size_t n);
80 
81 /**
82  * Element properties passed by sax_parser to its handler's open_element()
83  * and close_element() calls.
84  */
85 struct parser_element
86 {
87     pstring ns;            // element namespace (optional)
88     pstring name;          // element name
89     std::ptrdiff_t begin_pos; // position of the opening brace '<'.
90     std::ptrdiff_t end_pos;   // position of the char after the closing brace '>'.
91 };
92 
93 /**
94  * Attribute properties passed by sax_parser to its handler's attribute()
95  * call. When an attribute value is transient, it has been converted due to
96  * presence of encoded character(s) and stored in a temporary buffer. The
97  * handler must assume that the value will not survive beyond the scope of
98  * the callback.
99  */
100 struct parser_attribute
101 {
102     pstring ns;      // attribute namespace (optional)
103     pstring name;    // attribute name
104     pstring value;   // attribute value
105     bool transient;  // whether or not the attribute value is on a temporary buffer.
106 };
107 
108 class ORCUS_PSR_DLLPUBLIC parser_base : public ::orcus::parser_base
109 {
110     struct impl;
111     std::unique_ptr<impl> mp_impl;
112 
113     parser_base() = delete;
114     parser_base(const parser_base&) = delete;
115     parser_base& operator=(const parser_base&) = delete;
116 protected:
117     size_t m_nest_level;
118     size_t m_buffer_pos;
119     bool m_root_elem_open:1;
120 
121 protected:
122     parser_base(const char* content, size_t size, bool transient_stream);
123     ~parser_base();
124 
next_check()125     void next_check()
126     {
127         next();
128         if (!has_char())
129             throw malformed_xml_error("xml stream ended prematurely.", offset());
130     }
131 
nest_up()132     void nest_up() { ++m_nest_level; }
nest_down()133     void nest_down()
134     {
135         if (m_nest_level == 0)
136             throw malformed_xml_error("incorrect nesting in xml stream", offset());
137 
138         --m_nest_level;
139     }
140 
141     void inc_buffer_pos();
reset_buffer_pos()142     void reset_buffer_pos() { m_buffer_pos = 0; }
143 
has_char_throw(const char * msg) const144     void has_char_throw(const char* msg) const
145     {
146         if (!has_char())
147             throw malformed_xml_error(msg, offset());
148     }
149 
150     /**
151      * Determine the number of remaining characters <strong>including</strong>
152      * the current character.
153      *
154      *
155      * @return number of remaining characters including the current character.
156      */
remains() const157     inline size_t remains() const
158     {
159 #if ORCUS_DEBUG_SAX_PARSER
160         if (mp_char >= mp_end)
161             throw malformed_xml_error("xml stream ended prematurely.", offset());
162 #endif
163         return mp_end - mp_char;
164     }
165 
cur_char_checked() const166     char cur_char_checked() const
167     {
168         if (!has_char())
169             throw malformed_xml_error("xml stream ended prematurely.", offset());
170 
171         return *mp_char;
172     }
173 
next_and_char()174     char next_and_char()
175     {
176         next();
177 #if ORCUS_DEBUG_SAX_PARSER
178         if (mp_char >= mp_end)
179             throw malformed_xml_error("xml stream ended prematurely.", offset());
180 #endif
181         return *mp_char;
182     }
183 
next_char_checked()184     char next_char_checked()
185     {
186         next();
187         if (!has_char())
188             throw malformed_xml_error("xml stream ended prematurely.", offset());
189 
190         return *mp_char;
191     }
192 
193     cell_buffer& get_cell_buffer();
194 
195     void comment();
196 
197     /**
198      * Skip an optional byte order mark at the begining of the xml stream.
199      */
200     void skip_bom();
201 
202     void expects_next(const char* p, size_t n);
203 
204     void parse_encoded_char(cell_buffer& buf);
205     void value_with_encoded_char(cell_buffer& buf, pstring& str, char quote_char);
206 
207     /**
208      * Parse quoted value.  Note that the retrieved string may be stored in
209      * the temporary cell buffer if the decode parameter is true. Use the
210      * string immediately after this call before the buffer becomes invalid.
211      *
212      * @return true if the value is stored in temporary buffer, false
213      *         otherwise.
214      */
215     bool value(pstring& str, bool decode);
216 
217     void name(pstring& str);
218     void element_name(parser_element& elem, std::ptrdiff_t begin_pos);
219     void attribute_name(pstring& attr_ns, pstring& attr_name);
220     void characters_with_encoded_char(cell_buffer& buf);
221 };
222 
223 }}
224 
225 #endif
226 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
227