1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 */
7
8 #include "orcus/yaml_parser_base.hpp"
9 #include "orcus/global.hpp"
10 #include "orcus/cell_buffer.hpp"
11 #include "orcus/parser_global.hpp"
12
13 #include <mdds/sorted_string_map.hpp>
14
15 #include <limits>
16 #include <vector>
17 #include <deque>
18 #include <sstream>
19 #include <algorithm>
20
21 namespace orcus { namespace yaml {
22
parse_error(const std::string & msg,std::ptrdiff_t offset)23 parse_error::parse_error(const std::string& msg, std::ptrdiff_t offset) :
24 ::orcus::parse_error(msg, offset) {}
25
throw_with(const char * msg_before,char c,const char * msg_after,std::ptrdiff_t offset)26 void parse_error::throw_with(const char* msg_before, char c, const char* msg_after, std::ptrdiff_t offset)
27 {
28 throw parse_error(build_message(msg_before, c, msg_after), offset);
29 }
30
throw_with(const char * msg_before,const char * p,size_t n,const char * msg_after,std::ptrdiff_t offset)31 void parse_error::throw_with(
32 const char* msg_before, const char* p, size_t n, const char* msg_after, std::ptrdiff_t offset)
33 {
34 throw parse_error(build_message(msg_before, p, n, msg_after), offset);
35 }
36
37 struct scope
38 {
39 size_t width;
40 detail::scope_t type;
41
scopeorcus::yaml::scope42 scope(size_t _width) : width(_width), type(detail::scope_t::unset) {}
43 };
44
45 struct parser_base::impl
46 {
47 cell_buffer m_buffer;
48 std::vector<scope> m_scopes;
49 std::deque<pstring> m_line_buffer;
50 const char* m_document;
51
52 size_t m_comment_length;
53
54 bool m_in_literal_block;
55 bool m_parsed_to_end_of_line;
56
57 detail::parse_token_t m_last_token;
58
implorcus::yaml::parser_base::impl59 impl() :
60 m_document(nullptr),
61 m_comment_length(0),
62 m_in_literal_block(false),
63 m_parsed_to_end_of_line(false),
64 m_last_token(detail::parse_token_t::unknown) {}
65 };
66
67 const size_t parser_base::parse_indent_blank_line = std::numeric_limits<size_t>::max();
68 const size_t parser_base::parse_indent_end_of_stream = std::numeric_limits<size_t>::max() - 1;
69 const size_t parser_base::scope_empty = std::numeric_limits<size_t>::max() - 2;
70
parser_base(const char * p,size_t n)71 parser_base::parser_base(const char* p, size_t n) :
72 ::orcus::parser_base(p, n, false), mp_impl(orcus::make_unique<impl>()) {}
73
~parser_base()74 parser_base::~parser_base() {}
75
push_parse_token(detail::parse_token_t t)76 void parser_base::push_parse_token(detail::parse_token_t t)
77 {
78 mp_impl->m_last_token = t;
79 }
80
get_last_parse_token() const81 detail::parse_token_t parser_base::get_last_parse_token() const
82 {
83 return mp_impl->m_last_token;
84 }
85
offset_last_char_of_line() const86 size_t parser_base::offset_last_char_of_line() const
87 {
88 // The current parser position should be on the linefeed char after
89 // calling parse_to_end_of_line().
90 assert(mp_impl->m_parsed_to_end_of_line);
91
92 size_t pos = offset(); // character past the '\n'.
93 pos -= 1; // move back to the '\n'.
94
95 if (mp_impl->m_comment_length)
96 {
97 assert(mp_impl->m_comment_length < pos);
98 pos -= mp_impl->m_comment_length; // should be on the '#' character.
99 }
100
101 pos -= 1;
102
103 // Ignore any trailing whitespaces.
104 const char* p = mp_begin + pos;
105 for (; mp_begin < p && *p == ' '; --p, --pos)
106 ;
107
108 return pos;
109 }
110
parse_indent()111 size_t parser_base::parse_indent()
112 {
113 for (size_t indent = 0; has_char(); next(), ++indent)
114 {
115 char c = cur_char();
116 switch (c)
117 {
118 case '#':
119 skip_comment();
120 return parse_indent_blank_line;
121 case '\n':
122 next();
123 return parse_indent_blank_line;
124 case ' ':
125 continue;
126 default:
127 return indent;
128 }
129 }
130
131 return parse_indent_end_of_stream;
132 }
133
parse_to_end_of_line()134 pstring parser_base::parse_to_end_of_line()
135 {
136 const char* p = mp_char;
137 size_t len = 0;
138 for (; has_char(); next(), ++len)
139 {
140 switch (cur_char())
141 {
142 case '#':
143 skip_comment();
144 break;
145 case '\'':
146 {
147 const char* p_open_quote = mp_char;
148
149 // character immediately after the closing quote.
150 const char* p_end =
151 parse_to_closing_single_quote(mp_char, remaining_size());
152
153 if (!p_end)
154 throw parse_error("parse_to_end_of_line: closing single quote was expected but not found.", offset());
155
156 size_t diff = p_end - p_open_quote - 1;
157
158 // Move the cursor to the closing quote.
159 next(diff);
160 len += diff;
161 assert(cur_char() == '\'');
162 continue;
163 }
164 break;
165 case '"':
166 {
167 const char* p_open_quote = mp_char;
168
169 // character immediately after the closing quote.
170 const char* p_end =
171 parse_to_closing_double_quote(mp_char, remaining_size());
172
173 if (!p_end)
174 throw parse_error("parse_to_end_of_line: closing double quote was expected but not found.", offset());
175
176 size_t diff = p_end - p_open_quote - 1;
177
178 // Move the cursor to the closing quote.
179 next(diff);
180 len += diff;
181 assert(cur_char() == '"');
182 continue;
183 }
184 break;
185 case '\n':
186 next();
187 break;
188 default:
189 continue;
190 }
191 break;
192 }
193
194 pstring ret(p, len);
195 mp_impl->m_parsed_to_end_of_line = true;
196 return ret;
197 }
198
skip_comment()199 void parser_base::skip_comment()
200 {
201 assert(cur_char() == '#');
202
203 size_t n = 1;
204
205 for (; has_char(); next(), ++n)
206 {
207 if (cur_char() == '\n')
208 {
209 next();
210 break;
211 }
212 }
213
214 mp_impl->m_comment_length = n;
215 }
216
reset_on_new_line()217 void parser_base::reset_on_new_line()
218 {
219 mp_impl->m_comment_length = 0;
220 mp_impl->m_parsed_to_end_of_line = false;
221 }
222
get_scope() const223 size_t parser_base::get_scope() const
224 {
225 return (mp_impl->m_scopes.empty()) ? scope_empty : mp_impl->m_scopes.back().width;
226 }
227
push_scope(size_t scope_width)228 void parser_base::push_scope(size_t scope_width)
229 {
230 mp_impl->m_scopes.emplace_back(scope_width);
231 }
232
clear_scopes()233 void parser_base::clear_scopes()
234 {
235 mp_impl->m_scopes.clear();
236 }
237
get_scope_type() const238 detail::scope_t parser_base::get_scope_type() const
239 {
240 assert(!mp_impl->m_scopes.empty());
241 return mp_impl->m_scopes.back().type;
242 }
243
set_scope_type(detail::scope_t type)244 void parser_base::set_scope_type(detail::scope_t type)
245 {
246 assert(!mp_impl->m_scopes.empty());
247 mp_impl->m_scopes.back().type = type;
248 }
249
pop_scope()250 size_t parser_base::pop_scope()
251 {
252 assert(!mp_impl->m_scopes.empty());
253 mp_impl->m_scopes.pop_back();
254 return get_scope();
255 }
256
push_line_back(const char * p,size_t n)257 void parser_base::push_line_back(const char* p, size_t n)
258 {
259 mp_impl->m_line_buffer.emplace_back(p, n);
260 }
261
pop_line_front()262 pstring parser_base::pop_line_front()
263 {
264 assert(!mp_impl->m_line_buffer.empty());
265
266 pstring ret = mp_impl->m_line_buffer.front();
267 mp_impl->m_line_buffer.pop_front();
268 return ret;
269 }
270
has_line_buffer() const271 bool parser_base::has_line_buffer() const
272 {
273 return !mp_impl->m_line_buffer.empty();
274 }
275
get_line_buffer_count() const276 size_t parser_base::get_line_buffer_count() const
277 {
278 return mp_impl->m_line_buffer.size();
279 }
280
merge_line_buffer()281 pstring parser_base::merge_line_buffer()
282 {
283 assert(!mp_impl->m_line_buffer.empty());
284
285 char sep = mp_impl->m_in_literal_block ? '\n' : ' ';
286
287 cell_buffer& buf = mp_impl->m_buffer;
288 buf.reset();
289
290 auto it = mp_impl->m_line_buffer.begin();
291 buf.append(it->get(), it->size());
292 ++it;
293
294 std::for_each(it, mp_impl->m_line_buffer.end(),
295 [&](const pstring& line)
296 {
297 buf.append(&sep, 1);
298 buf.append(line.get(), line.size());
299 }
300 );
301
302 mp_impl->m_line_buffer.clear();
303 mp_impl->m_in_literal_block = false;
304
305 return pstring(buf.get(), buf.size());
306 }
307
get_doc_hash() const308 const char* parser_base::get_doc_hash() const
309 {
310 return mp_impl->m_document;
311 }
312
set_doc_hash(const char * hash)313 void parser_base::set_doc_hash(const char* hash)
314 {
315 mp_impl->m_document = hash;
316 }
317
318 namespace {
319
320 mdds::sorted_string_map<detail::keyword_t>::entry keyword_entries[] = {
321 { ORCUS_ASCII("FALSE"), detail::keyword_t::boolean_false },
322 { ORCUS_ASCII("False"), detail::keyword_t::boolean_false },
323 { ORCUS_ASCII("N"), detail::keyword_t::boolean_false },
324 { ORCUS_ASCII("NO"), detail::keyword_t::boolean_false },
325 { ORCUS_ASCII("NULL"), detail::keyword_t::null },
326 { ORCUS_ASCII("No"), detail::keyword_t::boolean_false },
327 { ORCUS_ASCII("Null"), detail::keyword_t::null },
328 { ORCUS_ASCII("OFF"), detail::keyword_t::boolean_false },
329 { ORCUS_ASCII("ON"), detail::keyword_t::boolean_true },
330 { ORCUS_ASCII("Off"), detail::keyword_t::boolean_false },
331 { ORCUS_ASCII("On"), detail::keyword_t::boolean_true },
332 { ORCUS_ASCII("TRUE"), detail::keyword_t::boolean_true },
333 { ORCUS_ASCII("True"), detail::keyword_t::boolean_true },
334 { ORCUS_ASCII("Y"), detail::keyword_t::boolean_true },
335 { ORCUS_ASCII("YES"), detail::keyword_t::boolean_true },
336 { ORCUS_ASCII("Yes"), detail::keyword_t::boolean_true },
337 { ORCUS_ASCII("false"), detail::keyword_t::boolean_false },
338 { ORCUS_ASCII("n"), detail::keyword_t::boolean_false },
339 { ORCUS_ASCII("no"), detail::keyword_t::boolean_false },
340 { ORCUS_ASCII("null"), detail::keyword_t::null },
341 { ORCUS_ASCII("off"), detail::keyword_t::boolean_false },
342 { ORCUS_ASCII("on"), detail::keyword_t::boolean_true },
343 { ORCUS_ASCII("true"), detail::keyword_t::boolean_true },
344 { ORCUS_ASCII("y"), detail::keyword_t::boolean_true },
345 { ORCUS_ASCII("yes"), detail::keyword_t::boolean_true },
346 { ORCUS_ASCII("~"), detail::keyword_t::null },
347 };
348
throw_quoted_string_parse_error(const char * func_name,const parse_quoted_string_state & ret,std::ptrdiff_t offset)349 void throw_quoted_string_parse_error(
350 const char* func_name, const parse_quoted_string_state& ret, std::ptrdiff_t offset)
351 {
352 std::ostringstream os;
353 os << func_name << ": failed to parse ";
354 if (ret.length == parse_quoted_string_state::error_illegal_escape_char)
355 os << "due to the presence of illegal escape character.";
356 else if (ret.length == parse_quoted_string_state::error_no_closing_quote)
357 os << "because the closing quote was not found.";
358 else
359 os << "due to unknown reason.";
360
361 throw parse_error(os.str(), offset);
362 }
363
364 }
365
parse_keyword(const char * p,size_t len)366 detail::keyword_t parser_base::parse_keyword(const char* p, size_t len)
367 {
368 static mdds::sorted_string_map<detail::keyword_t> map(
369 keyword_entries,
370 ORCUS_N_ELEMENTS(keyword_entries),
371 detail::keyword_t::unknown);
372
373 detail::keyword_t value = map.find(p, len);
374 return value;
375 }
376
parse_key_value(const char * p,size_t len)377 parser_base::key_value parser_base::parse_key_value(const char* p, size_t len)
378 {
379 size_t scope = get_scope();
380 assert(scope != scope_empty);
381
382 assert(*p != ' ');
383 assert(len);
384
385 const char* p_end = p + len;
386
387 key_value kv;
388
389 char last = 0;
390 bool key_found = false;
391
392 const char* p_head = p;
393
394 for (; p != p_end; ++p)
395 {
396 if (*p == ' ')
397 {
398 if (!key_found)
399 {
400 if (last == ':')
401 {
402 // Key found.
403 kv.key = pstring(p_head, p-p_head-1).trim();
404 key_found = true;
405 p_head = nullptr;
406 }
407 }
408 }
409 else
410 {
411 if (!p_head)
412 p_head = p;
413 }
414
415 last = *p;
416 }
417
418 assert(p_head);
419
420 if (key_found)
421 {
422 // Key has already been found and the value comes after the ':'.
423 kv.value = pstring(p_head, p-p_head);
424 }
425 else if (last == ':')
426 {
427 // Line only contains a key and ends with ':'.
428 kv.key = pstring(p_head, p-p_head-1).trim();
429 }
430 else
431 {
432 // Key has not been found.
433 detail::scope_t st = get_scope_type();
434 if (st == detail::scope_t::map)
435 throw yaml::parse_error("key was expected, but not found.", offset_last_char_of_line());
436 }
437
438 return kv;
439 }
440
parse_single_quoted_string_value(const char * & p,size_t max_length)441 pstring parser_base::parse_single_quoted_string_value(const char*& p, size_t max_length)
442 {
443 parse_quoted_string_state ret =
444 parse_single_quoted_string(p, max_length, mp_impl->m_buffer);
445
446 if (!ret.str)
447 throw_quoted_string_parse_error("parse_single_quoted_string_value", ret, offset());
448
449 return pstring(ret.str, ret.length);
450 }
451
parse_double_quoted_string_value(const char * & p,size_t max_length)452 pstring parser_base::parse_double_quoted_string_value(const char*& p, size_t max_length)
453 {
454 parse_quoted_string_state ret =
455 parse_double_quoted_string(p, max_length, mp_impl->m_buffer);
456
457 if (!ret.str)
458 throw_quoted_string_parse_error("parse_double_quoted_string_value", ret, offset());
459
460 return pstring(ret.str, ret.length);
461 }
462
skip_blanks(const char * & p,size_t len)463 void parser_base::skip_blanks(const char*& p, size_t len)
464 {
465 const char* p_end = p + len;
466 for (; p != p_end && *p == ' '; ++p)
467 ;
468 }
469
start_literal_block()470 void parser_base::start_literal_block()
471 {
472 mp_impl->m_in_literal_block = true;
473 }
474
in_literal_block() const475 bool parser_base::in_literal_block() const
476 {
477 return mp_impl->m_in_literal_block;
478 }
479
handle_line_in_literal(size_t indent)480 void parser_base::handle_line_in_literal(size_t indent)
481 {
482 size_t cur_scope = get_scope();
483
484 if (!has_line_buffer())
485 {
486 // Start a new multi-line string scope.
487
488 if (indent == cur_scope)
489 throw yaml::parse_error("parse: first line of a literal block must be indented.", offset());
490
491 push_scope(indent);
492 set_scope_type(yaml::detail::scope_t::multi_line_string);
493 }
494 else
495 {
496 // The current scope is already a multi-line scope.
497 assert(get_scope_type() == yaml::detail::scope_t::multi_line_string);
498 size_t leading_indent = indent - cur_scope;
499 prev(leading_indent);
500 }
501
502 pstring line = parse_to_end_of_line();
503 push_line_back(line.get(), line.size());
504 }
505
handle_line_in_multi_line_string()506 void parser_base::handle_line_in_multi_line_string()
507 {
508 if (get_scope_type() != yaml::detail::scope_t::multi_line_string)
509 set_scope_type(yaml::detail::scope_t::multi_line_string);
510
511 pstring line = parse_to_end_of_line();
512 line = line.trim();
513 assert(!line.empty());
514 push_line_back(line.get(), line.size());
515 }
516
517 }}
518
519 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
520