1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 */
7
8 #include "orcus/sax_parser_base.hpp"
9 #include "orcus/global.hpp"
10
11 #include <cstring>
12 #include <vector>
13 #include <memory>
14
15 #ifdef __ORCUS_CPU_FEATURES
16 #include <immintrin.h>
17 #endif
18
19 namespace orcus { namespace sax {
20
malformed_xml_error(const std::string & msg,std::ptrdiff_t offset)21 malformed_xml_error::malformed_xml_error(const std::string& msg, std::ptrdiff_t offset) :
22 ::orcus::parse_error("malformed_xml_error", msg, offset) {}
23
~malformed_xml_error()24 malformed_xml_error::~malformed_xml_error() throw() {}
25
/**
 * Translate the name of one of the five predefined XML entities into the
 * character it stands for.
 *
 * @param p pointer to the entity name, i.e. the text between '&' and ';'.
 * @param n number of characters in the name.
 *
 * @return '<', '>', '&', '\'' or '"' for "lt", "gt", "amp", "apos" and
 *         "quot" respectively; '\0' when the name is none of these.
 */
char decode_xml_encoded_char(const char* p, size_t n)
{
    struct entity
    {
        const char* name;
        size_t length;
        char value;
    };

    static const entity predefined[] = {
        { "lt",   2, '<'  },
        { "gt",   2, '>'  },
        { "amp",  3, '&'  },
        { "apos", 4, '\'' },
        { "quot", 4, '"'  },
    };

    for (const entity& e : predefined)
    {
        // The length must match exactly; only then compare the characters.
        if (e.length == n && std::strncmp(p, e.name, n) == 0)
            return e.value;
    }

    return '\0';
}
56
decode_xml_unicode_char(const char * p,size_t n)57 std::string decode_xml_unicode_char(const char* p, size_t n)
58 {
59 if (*p == '#' && n >= 2)
60 {
61 uint32_t point = 0;
62 if (p[1] == 'x')
63 {
64 if (n == 2)
65 throw orcus::xml_structure_error(
66 "invalid number of characters for hexadecimal unicode reference");
67
68 point = std::stoi(std::string(p + 2, n - 2), nullptr, 16);
69 }
70 else
71 point = std::stoi(std::string(p + 1, n - 1), nullptr, 10);
72
73 if (point < 0x80)
74 {
75 // is it really necessary to do the bit manipulation here?
76 std::string s(1, static_cast<char>(point & 0x7F));
77 return s;
78 }
79 else if (point < 0x0800)
80 {
81 std::string s(1, static_cast<char>((point >> 6 & 0x1F) | 0xC0));
82 s += static_cast<char>((point & 0x3F) | 0x80);
83 return s;
84 }
85 else if (point < 0x010000)
86 {
87 std::string s(1, static_cast<char>((point >> 12 & 0x0F) | 0xE0));
88 s += static_cast<char>((point >> 6 & 0x3F) | 0x80);
89 s += static_cast<char>((point & 0x3F) | 0x80);
90 return s;
91 }
92 else if (point < 0x110000)
93 {
94 std::string s(1, static_cast<char>((point >> 18 & 0x07) | 0xF0));
95 s += static_cast<char>((point >> 12 & 0x3F) | 0x80);
96 s += static_cast<char>((point >> 6 & 0x3F) | 0x80);
97 s += static_cast<char>((point & 0x3F) | 0x80);
98 return s;
99 }
100 else
101 {
102 // should not happen as that is not represented by utf-8
103 assert(false);
104 }
105 }
106
107 return std::string();
108 }
109
110 struct parser_base::impl
111 {
112 std::vector<std::unique_ptr<cell_buffer>> m_cell_buffers;
113 };
114
parser_base(const char * content,size_t size,bool transient_stream)115 parser_base::parser_base(const char* content, size_t size, bool transient_stream) :
116 ::orcus::parser_base(content, size, transient_stream),
117 mp_impl(orcus::make_unique<impl>()),
118 m_nest_level(0),
119 m_buffer_pos(0),
120 m_root_elem_open(true)
121 {
122 mp_impl->m_cell_buffers.push_back(orcus::make_unique<cell_buffer>());
123 }
124
~parser_base()125 parser_base::~parser_base() {}
126
inc_buffer_pos()127 void parser_base::inc_buffer_pos()
128 {
129 ++m_buffer_pos;
130 if (m_buffer_pos == mp_impl->m_cell_buffers.size())
131 mp_impl->m_cell_buffers.push_back(orcus::make_unique<cell_buffer>());
132 }
133
get_cell_buffer()134 cell_buffer& parser_base::get_cell_buffer()
135 {
136 return *mp_impl->m_cell_buffers[m_buffer_pos];
137 }
138
comment()139 void parser_base::comment()
140 {
141 // Parse until we reach '-->'.
142 size_t len = remains();
143 assert(len > 3);
144 char c = cur_char();
145 size_t i = 0;
146 bool hyphen = false;
147 for (; i < len; ++i, c = next_and_char())
148 {
149 if (c == '-')
150 {
151 if (!hyphen)
152 // first hyphen.
153 hyphen = true;
154 else
155 // second hyphen.
156 break;
157 }
158 else
159 hyphen = false;
160 }
161
162 if (len - i < 2 || next_and_char() != '>')
163 throw malformed_xml_error(
164 "'--' should not occur in comment other than in the closing tag.", offset());
165
166 next();
167 }
168
skip_bom()169 void parser_base::skip_bom()
170 {
171 if (remains() < 4)
172 // Stream too short to have a byte order mark.
173 return;
174
175 if (is_blank(cur_char()))
176 // Allow leading whitespace in the XML stream.
177 // TODO : Make this configurable since strictly speaking such an XML
178 // sttream is invalid.
179 return;
180
181 // 0xef 0xbb 0 xbf is the UTF-8 byte order mark
182 unsigned char c = static_cast<unsigned char>(cur_char());
183 if (c != '<')
184 {
185 if (c != 0xef || static_cast<unsigned char>(next_and_char()) != 0xbb ||
186 static_cast<unsigned char>(next_and_char()) != 0xbf || next_and_char() != '<')
187 throw malformed_xml_error(
188 "unsupported encoding. only 8 bit encodings are supported", offset());
189 }
190 }
191
expects_next(const char * p,size_t n)192 void parser_base::expects_next(const char* p, size_t n)
193 {
194 if (remains() < n+1)
195 throw malformed_xml_error(
196 "not enough stream left to check for an expected string segment.", offset());
197
198 const char* p0 = p;
199 const char* p_end = p + n;
200 char c = next_and_char();
201 for (; p != p_end; ++p, c = next_and_char())
202 {
203 if (c == *p)
204 continue;
205
206 std::ostringstream os;
207 os << "'" << std::string(p0, n) << "' was expected, but not found.";
208 throw malformed_xml_error(os.str(), offset());
209 }
210 }
211
parse_encoded_char(cell_buffer & buf)212 void parser_base::parse_encoded_char(cell_buffer& buf)
213 {
214 assert(cur_char() == '&');
215 next();
216 const char* p0 = mp_char;
217 for (; has_char(); next())
218 {
219 if (cur_char() != ';')
220 continue;
221
222 size_t n = mp_char - p0;
223 if (!n)
224 throw malformed_xml_error("empty encoded character.", offset());
225
226 #if ORCUS_DEBUG_SAX_PARSER
227 cout << "sax_parser::parse_encoded_char: raw='" << std::string(p0, n) << "'" << endl;
228 #endif
229
230 char c = decode_xml_encoded_char(p0, n);
231 if (c)
232 buf.append(&c, 1);
233 else
234 {
235 std::string utf8 = decode_xml_unicode_char(p0, n);
236
237 if (!utf8.empty())
238 {
239 buf.append(utf8.data(), utf8.size());
240 c = '1'; // just to avoid hitting the !c case below
241 }
242 }
243
244 // Move to the character past ';' before returning to the parent call.
245 next();
246
247 if (!c)
248 {
249 #if ORCUS_DEBUG_SAX_PARSER
250 cout << "sax_parser::parse_encoded_char: not a known encoding name. Use the original." << endl;
251 #endif
252 // Unexpected encoding name. Use the original text.
253 buf.append(p0, mp_char-p0);
254 }
255
256 return;
257 }
258
259 throw malformed_xml_error(
260 "error parsing encoded character: terminating character is not found.", offset());
261 }
262
value_with_encoded_char(cell_buffer & buf,pstring & str,char quote_char)263 void parser_base::value_with_encoded_char(cell_buffer& buf, pstring& str, char quote_char)
264 {
265 assert(cur_char() == '&');
266 parse_encoded_char(buf);
267
268 const char* p0 = mp_char;
269
270 while (has_char())
271 {
272 if (cur_char() == '&')
273 {
274 if (mp_char > p0)
275 buf.append(p0, mp_char-p0);
276
277 parse_encoded_char(buf);
278 p0 = mp_char;
279 }
280
281 if (cur_char() == quote_char)
282 break;
283
284 if (cur_char() != '&')
285 next();
286 }
287
288 if (mp_char > p0)
289 buf.append(p0, mp_char-p0);
290
291 if (!buf.empty())
292 str = pstring(buf.get(), buf.size());
293
294 // Skip the closing quote.
295 assert(!has_char() || cur_char() == quote_char);
296 next();
297 }
298
value(pstring & str,bool decode)299 bool parser_base::value(pstring& str, bool decode)
300 {
301 char c = cur_char();
302 if (c != '"' && c != '\'')
303 throw malformed_xml_error("value must be quoted", offset());
304
305 char quote_char = c;
306
307 c = next_char_checked();
308
309 const char* p0 = mp_char;
310 for (; c != quote_char; c = next_char_checked())
311 {
312 if (decode && c == '&')
313 {
314 // This value contains one or more encoded characters.
315 cell_buffer& buf = get_cell_buffer();
316 buf.reset();
317 buf.append(p0, mp_char-p0);
318 value_with_encoded_char(buf, str, quote_char);
319 return true;
320 }
321 }
322
323 str = pstring(p0, mp_char-p0);
324
325 // Skip the closing quote.
326 next();
327
328 return transient_stream();
329 }
330
name(pstring & str)331 void parser_base::name(pstring& str)
332 {
333 const char* p0 = mp_char;
334 char c = cur_char();
335 if (!is_alpha(c) && c != '_')
336 {
337 ::std::ostringstream os;
338 os << "name must begin with an alphabet, but got this instead '" << c << "'";
339 throw malformed_xml_error(os.str(), offset());
340 }
341
342 #if defined(__ORCUS_CPU_FEATURES) && defined(__SSE4_2__)
343
344 const __m128i match = _mm_loadu_si128((const __m128i*)"azAZ09--__");
345 const int mode = _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES | _SIDD_UBYTE_OPS | _SIDD_NEGATIVE_POLARITY;
346
347 size_t n_total = available_size();
348
349 while (n_total)
350 {
351 __m128i char_block = _mm_loadu_si128((const __m128i*)mp_char);
352
353 int n = std::min<size_t>(16u, n_total);
354 int r = _mm_cmpestri(match, 10, char_block, n, mode);
355 mp_char += r; // Move the current char position.
356
357 if (r < 16)
358 // No need to move to the next segment. Stop here.
359 break;
360
361 // Skip 16 chars to the next segment.
362 n_total -= 16;
363 }
364
365 #else
366 while (is_alpha(c) || is_numeric(c) || is_name_char(c))
367 c = next_char_checked();
368 #endif
369
370 str = pstring(p0, mp_char-p0);
371 }
372
element_name(parser_element & elem,std::ptrdiff_t begin_pos)373 void parser_base::element_name(parser_element& elem, std::ptrdiff_t begin_pos)
374 {
375 elem.begin_pos = begin_pos;
376 name(elem.name);
377 if (cur_char() == ':')
378 {
379 elem.ns = elem.name;
380 next_check();
381 name(elem.name);
382 }
383 }
384
attribute_name(pstring & attr_ns,pstring & attr_name)385 void parser_base::attribute_name(pstring& attr_ns, pstring& attr_name)
386 {
387 name(attr_name);
388 if (cur_char() == ':')
389 {
390 // Attribute name is namespaced.
391 attr_ns = attr_name;
392 next_check();
393 name(attr_name);
394 }
395 }
396
characters_with_encoded_char(cell_buffer & buf)397 void parser_base::characters_with_encoded_char(cell_buffer& buf)
398 {
399 assert(cur_char() == '&');
400 parse_encoded_char(buf);
401
402 const char* p0 = mp_char;
403
404 while (has_char())
405 {
406 if (cur_char() == '&')
407 {
408 if (mp_char > p0)
409 buf.append(p0, mp_char-p0);
410
411 parse_encoded_char(buf);
412 p0 = mp_char;
413 }
414
415 if (cur_char() == '<')
416 break;
417
418 if (cur_char() != '&')
419 next();
420 }
421
422 if (mp_char > p0)
423 buf.append(p0, mp_char-p0);
424 }
425
426 }}
427
428 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
429