1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6  */
7 
8 #include "orcus/yaml_parser_base.hpp"
9 #include "orcus/global.hpp"
10 #include "orcus/cell_buffer.hpp"
11 #include "orcus/parser_global.hpp"
12 
13 #include <mdds/sorted_string_map.hpp>
14 
15 #include <limits>
16 #include <vector>
17 #include <deque>
18 #include <sstream>
19 #include <algorithm>
20 
21 namespace orcus { namespace yaml {
22 
parse_error(const std::string & msg,std::ptrdiff_t offset)23 parse_error::parse_error(const std::string& msg, std::ptrdiff_t offset) :
24     ::orcus::parse_error(msg, offset) {}
25 
throw_with(const char * msg_before,char c,const char * msg_after,std::ptrdiff_t offset)26 void parse_error::throw_with(const char* msg_before, char c, const char* msg_after, std::ptrdiff_t offset)
27 {
28     throw parse_error(build_message(msg_before, c, msg_after), offset);
29 }
30 
throw_with(const char * msg_before,const char * p,size_t n,const char * msg_after,std::ptrdiff_t offset)31 void parse_error::throw_with(
32     const char* msg_before, const char* p, size_t n, const char* msg_after, std::ptrdiff_t offset)
33 {
34     throw parse_error(build_message(msg_before, p, n, msg_after), offset);
35 }
36 
37 struct scope
38 {
39     size_t width;
40     detail::scope_t type;
41 
scopeorcus::yaml::scope42     scope(size_t _width) : width(_width), type(detail::scope_t::unset) {}
43 };
44 
45 struct parser_base::impl
46 {
47     cell_buffer m_buffer;
48     std::vector<scope> m_scopes;
49     std::deque<pstring> m_line_buffer;
50     const char* m_document;
51 
52     size_t m_comment_length;
53 
54     bool m_in_literal_block;
55     bool m_parsed_to_end_of_line;
56 
57     detail::parse_token_t m_last_token;
58 
implorcus::yaml::parser_base::impl59     impl() :
60         m_document(nullptr),
61         m_comment_length(0),
62         m_in_literal_block(false),
63         m_parsed_to_end_of_line(false),
64         m_last_token(detail::parse_token_t::unknown) {}
65 };
66 
// Sentinel values taken from the top of the size_t range.

// Returned by parse_indent() when the line holds only a comment or a line feed.
const size_t parser_base::parse_indent_blank_line    = std::numeric_limits<size_t>::max();
// Returned by parse_indent() when the input runs out before any other result.
const size_t parser_base::parse_indent_end_of_stream = std::numeric_limits<size_t>::max() - 1;
// Returned by get_scope() when the scope stack is empty.
const size_t parser_base::scope_empty = std::numeric_limits<size_t>::max() - 2;
70 
parser_base(const char * p,size_t n)71 parser_base::parser_base(const char* p, size_t n) :
72     ::orcus::parser_base(p, n, false), mp_impl(orcus::make_unique<impl>()) {}
73 
~parser_base()74 parser_base::~parser_base() {}
75 
push_parse_token(detail::parse_token_t t)76 void parser_base::push_parse_token(detail::parse_token_t t)
77 {
78     mp_impl->m_last_token = t;
79 }
80 
get_last_parse_token() const81 detail::parse_token_t parser_base::get_last_parse_token() const
82 {
83     return mp_impl->m_last_token;
84 }
85 
offset_last_char_of_line() const86 size_t parser_base::offset_last_char_of_line() const
87 {
88     // The current parser position should be on the linefeed char after
89     // calling parse_to_end_of_line().
90     assert(mp_impl->m_parsed_to_end_of_line);
91 
92     size_t pos = offset(); // character past the '\n'.
93     pos -= 1; // move back to the '\n'.
94 
95     if (mp_impl->m_comment_length)
96     {
97         assert(mp_impl->m_comment_length < pos);
98         pos -= mp_impl->m_comment_length; // should be on the '#' character.
99     }
100 
101     pos -= 1;
102 
103     // Ignore any trailing whitespaces.
104     const char* p = mp_begin + pos;
105     for (; mp_begin < p && *p == ' '; --p, --pos)
106         ;
107 
108     return pos;
109 }
110 
parse_indent()111 size_t parser_base::parse_indent()
112 {
113     for (size_t indent = 0; has_char(); next(), ++indent)
114     {
115         char c = cur_char();
116         switch (c)
117         {
118             case '#':
119                 skip_comment();
120                 return parse_indent_blank_line;
121             case '\n':
122                 next();
123                 return parse_indent_blank_line;
124             case ' ':
125                 continue;
126             default:
127                 return indent;
128         }
129     }
130 
131     return parse_indent_end_of_stream;
132 }
133 
parse_to_end_of_line()134 pstring parser_base::parse_to_end_of_line()
135 {
136     const char* p = mp_char;
137     size_t len = 0;
138     for (; has_char(); next(), ++len)
139     {
140         switch (cur_char())
141         {
142             case '#':
143                 skip_comment();
144             break;
145             case '\'':
146             {
147                 const char* p_open_quote = mp_char;
148 
149                 // character immediately after the closing quote.
150                 const char* p_end =
151                     parse_to_closing_single_quote(mp_char, remaining_size());
152 
153                 if (!p_end)
154                     throw parse_error("parse_to_end_of_line: closing single quote was expected but not found.", offset());
155 
156                 size_t diff = p_end - p_open_quote - 1;
157 
158                 // Move the cursor to the closing quote.
159                 next(diff);
160                 len += diff;
161                 assert(cur_char() == '\'');
162                 continue;
163             }
164             break;
165             case '"':
166             {
167                 const char* p_open_quote = mp_char;
168 
169                 // character immediately after the closing quote.
170                 const char* p_end =
171                     parse_to_closing_double_quote(mp_char, remaining_size());
172 
173                 if (!p_end)
174                     throw parse_error("parse_to_end_of_line: closing double quote was expected but not found.", offset());
175 
176                 size_t diff = p_end - p_open_quote - 1;
177 
178                 // Move the cursor to the closing quote.
179                 next(diff);
180                 len += diff;
181                 assert(cur_char() == '"');
182                 continue;
183             }
184             break;
185             case '\n':
186                 next();
187             break;
188             default:
189                 continue;
190         }
191         break;
192     }
193 
194     pstring ret(p, len);
195     mp_impl->m_parsed_to_end_of_line = true;
196     return ret;
197 }
198 
skip_comment()199 void parser_base::skip_comment()
200 {
201     assert(cur_char() == '#');
202 
203     size_t n = 1;
204 
205     for (; has_char(); next(), ++n)
206     {
207         if (cur_char() == '\n')
208         {
209             next();
210             break;
211         }
212     }
213 
214     mp_impl->m_comment_length = n;
215 }
216 
reset_on_new_line()217 void parser_base::reset_on_new_line()
218 {
219     mp_impl->m_comment_length = 0;
220     mp_impl->m_parsed_to_end_of_line = false;
221 }
222 
get_scope() const223 size_t parser_base::get_scope() const
224 {
225     return (mp_impl->m_scopes.empty()) ? scope_empty : mp_impl->m_scopes.back().width;
226 }
227 
push_scope(size_t scope_width)228 void parser_base::push_scope(size_t scope_width)
229 {
230     mp_impl->m_scopes.emplace_back(scope_width);
231 }
232 
clear_scopes()233 void parser_base::clear_scopes()
234 {
235     mp_impl->m_scopes.clear();
236 }
237 
get_scope_type() const238 detail::scope_t parser_base::get_scope_type() const
239 {
240     assert(!mp_impl->m_scopes.empty());
241     return mp_impl->m_scopes.back().type;
242 }
243 
set_scope_type(detail::scope_t type)244 void parser_base::set_scope_type(detail::scope_t type)
245 {
246     assert(!mp_impl->m_scopes.empty());
247     mp_impl->m_scopes.back().type = type;
248 }
249 
pop_scope()250 size_t parser_base::pop_scope()
251 {
252     assert(!mp_impl->m_scopes.empty());
253     mp_impl->m_scopes.pop_back();
254     return get_scope();
255 }
256 
push_line_back(const char * p,size_t n)257 void parser_base::push_line_back(const char* p, size_t n)
258 {
259     mp_impl->m_line_buffer.emplace_back(p, n);
260 }
261 
pop_line_front()262 pstring parser_base::pop_line_front()
263 {
264     assert(!mp_impl->m_line_buffer.empty());
265 
266     pstring ret = mp_impl->m_line_buffer.front();
267     mp_impl->m_line_buffer.pop_front();
268     return ret;
269 }
270 
has_line_buffer() const271 bool parser_base::has_line_buffer() const
272 {
273     return !mp_impl->m_line_buffer.empty();
274 }
275 
get_line_buffer_count() const276 size_t parser_base::get_line_buffer_count() const
277 {
278     return mp_impl->m_line_buffer.size();
279 }
280 
merge_line_buffer()281 pstring parser_base::merge_line_buffer()
282 {
283     assert(!mp_impl->m_line_buffer.empty());
284 
285     char sep = mp_impl->m_in_literal_block ? '\n' : ' ';
286 
287     cell_buffer& buf = mp_impl->m_buffer;
288     buf.reset();
289 
290     auto it = mp_impl->m_line_buffer.begin();
291     buf.append(it->get(), it->size());
292     ++it;
293 
294     std::for_each(it, mp_impl->m_line_buffer.end(),
295         [&](const pstring& line)
296         {
297             buf.append(&sep, 1);
298             buf.append(line.get(), line.size());
299         }
300     );
301 
302     mp_impl->m_line_buffer.clear();
303     mp_impl->m_in_literal_block = false;
304 
305     return pstring(buf.get(), buf.size());
306 }
307 
get_doc_hash() const308 const char* parser_base::get_doc_hash() const
309 {
310     return mp_impl->m_document;
311 }
312 
set_doc_hash(const char * hash)313 void parser_base::set_doc_hash(const char* hash)
314 {
315     mp_impl->m_document = hash;
316 }
317 
318 namespace {
319 
// Lookup table mapping keyword spellings to boolean and null keyword types;
// parse_keyword() builds its map from it.  Only the exact spellings listed
// here are present (e.g. "FALSE", "False" and "false", but no other casing).
//
// The entries are listed in ascending byte order of the key (upper case
// before lower case, '~' last).
// NOTE(review): presumably mdds::sorted_string_map requires this ordering -
// confirm against the mdds documentation before inserting new entries.
mdds::sorted_string_map<detail::keyword_t>::entry keyword_entries[] = {
    { ORCUS_ASCII("FALSE"), detail::keyword_t::boolean_false },
    { ORCUS_ASCII("False"), detail::keyword_t::boolean_false },
    { ORCUS_ASCII("N"),     detail::keyword_t::boolean_false },
    { ORCUS_ASCII("NO"),    detail::keyword_t::boolean_false },
    { ORCUS_ASCII("NULL"),  detail::keyword_t::null          },
    { ORCUS_ASCII("No"),    detail::keyword_t::boolean_false },
    { ORCUS_ASCII("Null"),  detail::keyword_t::null          },
    { ORCUS_ASCII("OFF"),   detail::keyword_t::boolean_false },
    { ORCUS_ASCII("ON"),    detail::keyword_t::boolean_true  },
    { ORCUS_ASCII("Off"),   detail::keyword_t::boolean_false },
    { ORCUS_ASCII("On"),    detail::keyword_t::boolean_true  },
    { ORCUS_ASCII("TRUE"),  detail::keyword_t::boolean_true  },
    { ORCUS_ASCII("True"),  detail::keyword_t::boolean_true  },
    { ORCUS_ASCII("Y"),     detail::keyword_t::boolean_true  },
    { ORCUS_ASCII("YES"),   detail::keyword_t::boolean_true  },
    { ORCUS_ASCII("Yes"),   detail::keyword_t::boolean_true  },
    { ORCUS_ASCII("false"), detail::keyword_t::boolean_false },
    { ORCUS_ASCII("n"),     detail::keyword_t::boolean_false },
    { ORCUS_ASCII("no"),    detail::keyword_t::boolean_false },
    { ORCUS_ASCII("null"),  detail::keyword_t::null          },
    { ORCUS_ASCII("off"),   detail::keyword_t::boolean_false },
    { ORCUS_ASCII("on"),    detail::keyword_t::boolean_true  },
    { ORCUS_ASCII("true"),  detail::keyword_t::boolean_true  },
    { ORCUS_ASCII("y"),     detail::keyword_t::boolean_true  },
    { ORCUS_ASCII("yes"),   detail::keyword_t::boolean_true  },
    { ORCUS_ASCII("~"),     detail::keyword_t::null          },
};
348 
throw_quoted_string_parse_error(const char * func_name,const parse_quoted_string_state & ret,std::ptrdiff_t offset)349 void throw_quoted_string_parse_error(
350     const char* func_name, const parse_quoted_string_state& ret, std::ptrdiff_t offset)
351 {
352     std::ostringstream os;
353     os << func_name << ": failed to parse ";
354     if (ret.length == parse_quoted_string_state::error_illegal_escape_char)
355         os << "due to the presence of illegal escape character.";
356     else if (ret.length == parse_quoted_string_state::error_no_closing_quote)
357         os << "because the closing quote was not found.";
358     else
359         os << "due to unknown reason.";
360 
361     throw parse_error(os.str(), offset);
362 }
363 
364 }
365 
parse_keyword(const char * p,size_t len)366 detail::keyword_t parser_base::parse_keyword(const char* p, size_t len)
367 {
368     static mdds::sorted_string_map<detail::keyword_t> map(
369         keyword_entries,
370         ORCUS_N_ELEMENTS(keyword_entries),
371         detail::keyword_t::unknown);
372 
373     detail::keyword_t value = map.find(p, len);
374     return value;
375 }
376 
parse_key_value(const char * p,size_t len)377 parser_base::key_value parser_base::parse_key_value(const char* p, size_t len)
378 {
379     size_t scope = get_scope();
380     assert(scope != scope_empty);
381 
382     assert(*p != ' ');
383     assert(len);
384 
385     const char* p_end = p + len;
386 
387     key_value kv;
388 
389     char last = 0;
390     bool key_found = false;
391 
392     const char* p_head = p;
393 
394     for (; p != p_end; ++p)
395     {
396         if (*p == ' ')
397         {
398             if (!key_found)
399             {
400                 if (last == ':')
401                 {
402                     // Key found.
403                     kv.key = pstring(p_head, p-p_head-1).trim();
404                     key_found = true;
405                     p_head = nullptr;
406                 }
407             }
408         }
409         else
410         {
411             if (!p_head)
412                 p_head = p;
413         }
414 
415         last = *p;
416     }
417 
418     assert(p_head);
419 
420     if (key_found)
421     {
422         // Key has already been found and the value comes after the ':'.
423         kv.value = pstring(p_head, p-p_head);
424     }
425     else if (last == ':')
426     {
427         // Line only contains a key and ends with ':'.
428         kv.key = pstring(p_head, p-p_head-1).trim();
429     }
430     else
431     {
432         // Key has not been found.
433         detail::scope_t st = get_scope_type();
434         if (st == detail::scope_t::map)
435             throw yaml::parse_error("key was expected, but not found.", offset_last_char_of_line());
436     }
437 
438     return kv;
439 }
440 
parse_single_quoted_string_value(const char * & p,size_t max_length)441 pstring parser_base::parse_single_quoted_string_value(const char*& p, size_t max_length)
442 {
443     parse_quoted_string_state ret =
444         parse_single_quoted_string(p, max_length, mp_impl->m_buffer);
445 
446     if (!ret.str)
447         throw_quoted_string_parse_error("parse_single_quoted_string_value", ret, offset());
448 
449     return pstring(ret.str, ret.length);
450 }
451 
parse_double_quoted_string_value(const char * & p,size_t max_length)452 pstring parser_base::parse_double_quoted_string_value(const char*& p, size_t max_length)
453 {
454     parse_quoted_string_state ret =
455         parse_double_quoted_string(p, max_length, mp_impl->m_buffer);
456 
457     if (!ret.str)
458         throw_quoted_string_parse_error("parse_double_quoted_string_value", ret, offset());
459 
460     return pstring(ret.str, ret.length);
461 }
462 
skip_blanks(const char * & p,size_t len)463 void parser_base::skip_blanks(const char*& p, size_t len)
464 {
465     const char* p_end = p + len;
466     for (; p != p_end && *p == ' '; ++p)
467         ;
468 }
469 
start_literal_block()470 void parser_base::start_literal_block()
471 {
472     mp_impl->m_in_literal_block = true;
473 }
474 
in_literal_block() const475 bool parser_base::in_literal_block() const
476 {
477     return mp_impl->m_in_literal_block;
478 }
479 
handle_line_in_literal(size_t indent)480 void parser_base::handle_line_in_literal(size_t indent)
481 {
482     size_t cur_scope = get_scope();
483 
484     if (!has_line_buffer())
485     {
486         // Start a new multi-line string scope.
487 
488         if (indent == cur_scope)
489             throw yaml::parse_error("parse: first line of a literal block must be indented.", offset());
490 
491         push_scope(indent);
492         set_scope_type(yaml::detail::scope_t::multi_line_string);
493     }
494     else
495     {
496         // The current scope is already a multi-line scope.
497         assert(get_scope_type() == yaml::detail::scope_t::multi_line_string);
498         size_t leading_indent = indent - cur_scope;
499         prev(leading_indent);
500     }
501 
502     pstring line = parse_to_end_of_line();
503     push_line_back(line.get(), line.size());
504 }
505 
handle_line_in_multi_line_string()506 void parser_base::handle_line_in_multi_line_string()
507 {
508     if (get_scope_type() != yaml::detail::scope_t::multi_line_string)
509         set_scope_type(yaml::detail::scope_t::multi_line_string);
510 
511     pstring line = parse_to_end_of_line();
512     line = line.trim();
513     assert(!line.empty());
514     push_line_back(line.get(), line.size());
515 }
516 
517 }}
518 
519 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
520