1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6  */
7 
8 #include "orcus/sax_parser_base.hpp"
9 #include "orcus/global.hpp"
10 
11 #include <cstring>
12 #include <vector>
13 #include <memory>
14 
15 #ifdef __ORCUS_CPU_FEATURES
16 #include <immintrin.h>
17 #endif
18 
19 namespace orcus { namespace sax {
20 
malformed_xml_error(const std::string & msg,std::ptrdiff_t offset)21 malformed_xml_error::malformed_xml_error(const std::string& msg, std::ptrdiff_t offset) :
22     ::orcus::parse_error("malformed_xml_error", msg, offset) {}
23 
~malformed_xml_error()24 malformed_xml_error::~malformed_xml_error() throw() {}
25 
/**
 * Translate one of the five predefined XML entity names into the character
 * it stands for.
 *
 * @param p pointer to the entity name (no '&' prefix, no ';' suffix).
 * @param n number of characters at p that make up the name.
 *
 * @return '<', '>', '&', '\'' or '"' for "lt", "gt", "amp", "apos" and
 *         "quot" respectively; '\0' for any other name.
 */
char decode_xml_encoded_char(const char* p, size_t n)
{
    struct entity_entry
    {
        const char* name;
        size_t length;
        char decoded;
    };

    static const entity_entry known_entities[] = {
        { "lt",   2, '<'  },
        { "gt",   2, '>'  },
        { "amp",  3, '&'  },
        { "apos", 4, '\'' },
        { "quot", 4, '"'  },
    };

    for (const entity_entry& entry : known_entities)
    {
        // The length must match exactly; only then are the bytes compared.
        if (entry.length != n)
            continue;

        if (std::strncmp(p, entry.name, n) == 0)
            return entry.decoded;
    }

    return '\0';
}
56 
decode_xml_unicode_char(const char * p,size_t n)57 std::string decode_xml_unicode_char(const char* p, size_t n)
58 {
59     if (*p == '#' && n >= 2)
60     {
61         uint32_t point = 0;
62         if (p[1] == 'x')
63         {
64             if (n == 2)
65                 throw orcus::xml_structure_error(
66                     "invalid number of characters for hexadecimal unicode reference");
67 
68             point = std::stoi(std::string(p + 2, n - 2), nullptr, 16);
69         }
70         else
71             point = std::stoi(std::string(p + 1, n - 1), nullptr, 10);
72 
73         if (point < 0x80)
74         {
75             // is it really necessary to do the bit manipulation here?
76             std::string s(1, static_cast<char>(point & 0x7F));
77             return s;
78         }
79         else if (point < 0x0800)
80         {
81             std::string s(1, static_cast<char>((point >> 6 & 0x1F) | 0xC0));
82             s += static_cast<char>((point & 0x3F) | 0x80);
83             return s;
84         }
85         else if (point < 0x010000)
86         {
87             std::string s(1, static_cast<char>((point >> 12 & 0x0F) | 0xE0));
88             s += static_cast<char>((point >> 6 & 0x3F) | 0x80);
89             s += static_cast<char>((point & 0x3F) | 0x80);
90             return s;
91         }
92         else if (point < 0x110000)
93         {
94             std::string s(1, static_cast<char>((point >> 18 & 0x07) | 0xF0));
95             s += static_cast<char>((point >> 12 & 0x3F) | 0x80);
96             s += static_cast<char>((point >> 6 & 0x3F) | 0x80);
97             s += static_cast<char>((point & 0x3F) | 0x80);
98             return s;
99         }
100         else
101         {
102             // should not happen as that is not represented by utf-8
103             assert(false);
104         }
105     }
106 
107     return std::string();
108 }
109 
110 struct parser_base::impl
111 {
112     std::vector<std::unique_ptr<cell_buffer>> m_cell_buffers;
113 };
114 
parser_base(const char * content,size_t size,bool transient_stream)115 parser_base::parser_base(const char* content, size_t size, bool transient_stream) :
116     ::orcus::parser_base(content, size, transient_stream),
117     mp_impl(orcus::make_unique<impl>()),
118     m_nest_level(0),
119     m_buffer_pos(0),
120     m_root_elem_open(true)
121 {
122     mp_impl->m_cell_buffers.push_back(orcus::make_unique<cell_buffer>());
123 }
124 
~parser_base()125 parser_base::~parser_base() {}
126 
inc_buffer_pos()127 void parser_base::inc_buffer_pos()
128 {
129     ++m_buffer_pos;
130     if (m_buffer_pos == mp_impl->m_cell_buffers.size())
131         mp_impl->m_cell_buffers.push_back(orcus::make_unique<cell_buffer>());
132 }
133 
get_cell_buffer()134 cell_buffer& parser_base::get_cell_buffer()
135 {
136     return *mp_impl->m_cell_buffers[m_buffer_pos];
137 }
138 
comment()139 void parser_base::comment()
140 {
141     // Parse until we reach '-->'.
142     size_t len = remains();
143     assert(len > 3);
144     char c = cur_char();
145     size_t i = 0;
146     bool hyphen = false;
147     for (; i < len; ++i, c = next_and_char())
148     {
149         if (c == '-')
150         {
151             if (!hyphen)
152                 // first hyphen.
153                 hyphen = true;
154             else
155                 // second hyphen.
156                 break;
157         }
158         else
159             hyphen = false;
160     }
161 
162     if (len - i < 2 || next_and_char() != '>')
163         throw malformed_xml_error(
164             "'--' should not occur in comment other than in the closing tag.", offset());
165 
166     next();
167 }
168 
skip_bom()169 void parser_base::skip_bom()
170 {
171     if (remains() < 4)
172         // Stream too short to have a byte order mark.
173         return;
174 
175     if (is_blank(cur_char()))
176         // Allow leading whitespace in the XML stream.
177         // TODO : Make this configurable since strictly speaking such an XML
178         // sttream is invalid.
179         return;
180 
181     // 0xef 0xbb 0 xbf is the UTF-8 byte order mark
182     unsigned char c = static_cast<unsigned char>(cur_char());
183     if (c != '<')
184     {
185         if (c != 0xef || static_cast<unsigned char>(next_and_char()) != 0xbb ||
186             static_cast<unsigned char>(next_and_char()) != 0xbf || next_and_char() != '<')
187             throw malformed_xml_error(
188                 "unsupported encoding. only 8 bit encodings are supported", offset());
189     }
190 }
191 
expects_next(const char * p,size_t n)192 void parser_base::expects_next(const char* p, size_t n)
193 {
194     if (remains() < n+1)
195         throw malformed_xml_error(
196             "not enough stream left to check for an expected string segment.", offset());
197 
198     const char* p0 = p;
199     const char* p_end = p + n;
200     char c = next_and_char();
201     for (; p != p_end; ++p, c = next_and_char())
202     {
203         if (c == *p)
204             continue;
205 
206         std::ostringstream os;
207         os << "'" << std::string(p0, n) << "' was expected, but not found.";
208         throw malformed_xml_error(os.str(), offset());
209     }
210 }
211 
parse_encoded_char(cell_buffer & buf)212 void parser_base::parse_encoded_char(cell_buffer& buf)
213 {
214     assert(cur_char() == '&');
215     next();
216     const char* p0 = mp_char;
217     for (; has_char(); next())
218     {
219         if (cur_char() != ';')
220             continue;
221 
222         size_t n = mp_char - p0;
223         if (!n)
224             throw malformed_xml_error("empty encoded character.", offset());
225 
226 #if ORCUS_DEBUG_SAX_PARSER
227         cout << "sax_parser::parse_encoded_char: raw='" << std::string(p0, n) << "'" << endl;
228 #endif
229 
230         char c = decode_xml_encoded_char(p0, n);
231         if (c)
232             buf.append(&c, 1);
233         else
234         {
235             std::string utf8 = decode_xml_unicode_char(p0, n);
236 
237             if (!utf8.empty())
238             {
239                 buf.append(utf8.data(), utf8.size());
240                 c = '1'; // just to avoid hitting the !c case below
241             }
242         }
243 
244         // Move to the character past ';' before returning to the parent call.
245         next();
246 
247         if (!c)
248         {
249 #if ORCUS_DEBUG_SAX_PARSER
250             cout << "sax_parser::parse_encoded_char: not a known encoding name. Use the original." << endl;
251 #endif
252             // Unexpected encoding name. Use the original text.
253             buf.append(p0, mp_char-p0);
254         }
255 
256         return;
257     }
258 
259     throw malformed_xml_error(
260         "error parsing encoded character: terminating character is not found.", offset());
261 }
262 
value_with_encoded_char(cell_buffer & buf,pstring & str,char quote_char)263 void parser_base::value_with_encoded_char(cell_buffer& buf, pstring& str, char quote_char)
264 {
265     assert(cur_char() == '&');
266     parse_encoded_char(buf);
267 
268     const char* p0 = mp_char;
269 
270     while (has_char())
271     {
272         if (cur_char() == '&')
273         {
274             if (mp_char > p0)
275                 buf.append(p0, mp_char-p0);
276 
277             parse_encoded_char(buf);
278             p0 = mp_char;
279         }
280 
281         if (cur_char() == quote_char)
282             break;
283 
284         if (cur_char() != '&')
285             next();
286     }
287 
288     if (mp_char > p0)
289         buf.append(p0, mp_char-p0);
290 
291     if (!buf.empty())
292         str = pstring(buf.get(), buf.size());
293 
294     // Skip the closing quote.
295     assert(!has_char() || cur_char() == quote_char);
296     next();
297 }
298 
value(pstring & str,bool decode)299 bool parser_base::value(pstring& str, bool decode)
300 {
301     char c = cur_char();
302     if (c != '"' && c != '\'')
303         throw malformed_xml_error("value must be quoted", offset());
304 
305     char quote_char = c;
306 
307     c = next_char_checked();
308 
309     const char* p0 = mp_char;
310     for (; c != quote_char; c = next_char_checked())
311     {
312         if (decode && c == '&')
313         {
314             // This value contains one or more encoded characters.
315             cell_buffer& buf = get_cell_buffer();
316             buf.reset();
317             buf.append(p0, mp_char-p0);
318             value_with_encoded_char(buf, str, quote_char);
319             return true;
320         }
321     }
322 
323     str = pstring(p0, mp_char-p0);
324 
325     // Skip the closing quote.
326     next();
327 
328     return transient_stream();
329 }
330 
name(pstring & str)331 void parser_base::name(pstring& str)
332 {
333     const char* p0 = mp_char;
334     char c = cur_char();
335     if (!is_alpha(c) && c != '_')
336     {
337         ::std::ostringstream os;
338         os << "name must begin with an alphabet, but got this instead '" << c << "'";
339         throw malformed_xml_error(os.str(), offset());
340     }
341 
342 #if defined(__ORCUS_CPU_FEATURES) && defined(__SSE4_2__)
343 
344     const __m128i match = _mm_loadu_si128((const __m128i*)"azAZ09--__");
345     const int mode = _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES | _SIDD_UBYTE_OPS | _SIDD_NEGATIVE_POLARITY;
346 
347     size_t n_total = available_size();
348 
349     while (n_total)
350     {
351         __m128i char_block = _mm_loadu_si128((const __m128i*)mp_char);
352 
353         int n = std::min<size_t>(16u, n_total);
354         int r = _mm_cmpestri(match, 10, char_block, n, mode);
355         mp_char += r; // Move the current char position.
356 
357         if (r < 16)
358             // No need to move to the next segment. Stop here.
359             break;
360 
361         // Skip 16 chars to the next segment.
362         n_total -= 16;
363     }
364 
365 #else
366     while (is_alpha(c) || is_numeric(c) || is_name_char(c))
367         c = next_char_checked();
368 #endif
369 
370     str = pstring(p0, mp_char-p0);
371 }
372 
element_name(parser_element & elem,std::ptrdiff_t begin_pos)373 void parser_base::element_name(parser_element& elem, std::ptrdiff_t begin_pos)
374 {
375     elem.begin_pos = begin_pos;
376     name(elem.name);
377     if (cur_char() == ':')
378     {
379         elem.ns = elem.name;
380         next_check();
381         name(elem.name);
382     }
383 }
384 
attribute_name(pstring & attr_ns,pstring & attr_name)385 void parser_base::attribute_name(pstring& attr_ns, pstring& attr_name)
386 {
387     name(attr_name);
388     if (cur_char() == ':')
389     {
390         // Attribute name is namespaced.
391         attr_ns = attr_name;
392         next_check();
393         name(attr_name);
394     }
395 }
396 
characters_with_encoded_char(cell_buffer & buf)397 void parser_base::characters_with_encoded_char(cell_buffer& buf)
398 {
399     assert(cur_char() == '&');
400     parse_encoded_char(buf);
401 
402     const char* p0 = mp_char;
403 
404     while (has_char())
405     {
406         if (cur_char() == '&')
407         {
408             if (mp_char > p0)
409                 buf.append(p0, mp_char-p0);
410 
411             parse_encoded_char(buf);
412             p0 = mp_char;
413         }
414 
415         if (cur_char() == '<')
416             break;
417 
418         if (cur_char() != '&')
419             next();
420     }
421 
422     if (mp_char > p0)
423         buf.append(p0, mp_char-p0);
424 }
425 
426 }}
427 
428 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
429