#include <catch.hpp>
#include <internal/tokenizer.hpp>
#include "test_utils.hpp"
#include <iostream>

using namespace std;
using namespace hocon;
using namespace hocon::test_utils;

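// Helper: runs the tokenizer over a source string and collects every token it produces into a vector.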
token_list tokenize_as_list(string const& source) {
    token_iterator iter(fake_origin(), unique_ptr<istringstream>(new istringstream(source)), true);
    // get all the tokens from the string and put them in a vector
    token_list tokens;
    while (iter.has_next()) {
        tokens.push_back(iter.next());
    }
    return tokens;
}

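// Helper: tokenizes the source and asserts that each token matches the expected list,
// printing the offending pair to stderr before the assertion fails.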
void tokenizer_test(string const& source, token_list expected) {
    token_list result = tokenize_as_list(source);
    for (size_t i = 0; i < expected.size() && i < result.size(); i++) {
        if (!(*expected[i] == *result[i])) {
            // Debugging info
            cerr << result[i]->to_string() << " but expected " << expected[i]->to_string()
                 << " -- i = " << i << endl;
        }
        REQUIRE(*expected[i] == *result[i]);
    }
    // Guard against the tokenizer producing more or fewer tokens than expected.
    REQUIRE(expected.size() == result.size());
}

TEST_CASE("tokenize basic strings", "[tokenizer]") {
    SECTION("tokenize empty string") {
        string source = "";
        token_list expected {
            tokens::start_token(),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("tokenize newlines") {
        string source = "\n\n";
        token_list expected {
            tokens::start_token(),
            make_shared<line>(fake_origin()->with_line_number(1)),
            make_shared<line>(fake_origin()->with_line_number(2)),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }
}

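// Exercises every token type, first packed together with no separators and then separated by
// whitespace; runs of spaces between value tokens are expected back as unquoted text, while
// leading and trailing runs come back as whitespace tokens.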
TEST_CASE("tokenize all types", "[tokenizer]") {
    SECTION("tokenize all types with no spaces") {
        string source = ",:=}{][+=\"foo\"\"\"\"bar\"\"\"true3.14false42null${a.b}${?x.y}${\"c.d\"}\n";
        token_list expected {
            tokens::start_token(),
            tokens::comma_token(),
            tokens::colon_token(),
            tokens::equals_token(),
            tokens::close_curly_token(),
            tokens::open_curly_token(),
            tokens::close_square_token(),
            tokens::open_square_token(),
            tokens::plus_equals_token(),
            string_token("foo"),
            string_token("bar"),
            bool_token(true),
            double_token(3.14, "3.14"),
            bool_token(false),
            int_token(42, "42"),
            null_token(),
            substitution_token(unquoted_text_token("a.b"), false),
            substitution_token(unquoted_text_token("x.y"), true),
            substitution_token(string_token("\"c.d\""), false),
            line_token(1),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("tokenize all types with spaces") {
        string source = " , : = } { ] [ += \"foo\" \"\"\"bar\"\"\" 42 true 3.14 false null ${a.b} ${?x.y} ${\"c.d\"} \n ";
        token_list expected {
            tokens::start_token(),
            whitespace_token(" "),
            tokens::comma_token(),
            whitespace_token(" "),
            tokens::colon_token(),
            whitespace_token(" "),
            tokens::equals_token(),
            whitespace_token(" "),
            tokens::close_curly_token(),
            whitespace_token(" "),
            tokens::open_curly_token(),
            whitespace_token(" "),
            tokens::close_square_token(),
            whitespace_token(" "),
            tokens::open_square_token(),
            whitespace_token(" "),
            tokens::plus_equals_token(),
            whitespace_token(" "),
            string_token("foo"),
            unquoted_text_token(" "),
            string_token("bar"),
            unquoted_text_token(" "),
            int_token(42, "42"),
            unquoted_text_token(" "),
            bool_token(true),
            unquoted_text_token(" "),
            double_token(3.14, "3.14"),
            unquoted_text_token(" "),
            bool_token(false),
            unquoted_text_token(" "),
            null_token(),
            unquoted_text_token(" "),
            substitution_token(unquoted_text_token("a.b"), false),
            unquoted_text_token(" "),
            substitution_token(unquoted_text_token("x.y"), true),
            unquoted_text_token(" "),
            substitution_token(string_token("\"c.d\""), false),
            whitespace_token(" "),
            line_token(1),
            whitespace_token(" "),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("tokenize all types with multiple spaces") {
        string source = "  ,  :  =  }  {  ]  [  +=  \"foo\"  \"\"\"bar\"\"\"  42  true  3.14  false  null  ${a.b}  ${?x.y}  ${\"c.d\"}  \n  ";
        token_list expected {
            tokens::start_token(),
            whitespace_token("  "),
            tokens::comma_token(),
            whitespace_token("  "),
            tokens::colon_token(),
            whitespace_token("  "),
            tokens::equals_token(),
            whitespace_token("  "),
            tokens::close_curly_token(),
            whitespace_token("  "),
            tokens::open_curly_token(),
            whitespace_token("  "),
            tokens::close_square_token(),
            whitespace_token("  "),
            tokens::open_square_token(),
            whitespace_token("  "),
            tokens::plus_equals_token(),
            whitespace_token("  "),
            string_token("foo"),
            unquoted_text_token("  "),
            string_token("bar"),
            unquoted_text_token("  "),
            int_token(42, "42"),
            unquoted_text_token("  "),
            bool_token(true),
            unquoted_text_token("  "),
            double_token(3.14, "3.14"),
            unquoted_text_token("  "),
            bool_token(false),
            unquoted_text_token("  "),
            null_token(),
            unquoted_text_token("  "),
            substitution_token(unquoted_text_token("a.b"), false),
            unquoted_text_token("  "),
            substitution_token(unquoted_text_token("x.y"), true),
            unquoted_text_token("  "),
            substitution_token(string_token("\"c.d\""), false),
            whitespace_token("  "),
            line_token(1),
            whitespace_token("  "),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }
}

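// The keywords true/false/null should only become value tokens when they start a value;
// text in front of them ("footrue") is expected to stay a single unquoted-text token.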
TEST_CASE("unquoted text and booleans", "[tokenizer]") {
    SECTION("true and unquoted text") {
        string source = "truefoo";
        token_list expected {
            tokens::start_token(),
            bool_token(true),
            unquoted_text_token("foo"),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("false and unquoted text") {
        string source = "falsefoo";
        token_list expected {
            tokens::start_token(),
            bool_token(false),
            unquoted_text_token("foo"),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("null and unquoted text") {
        string source = "nullfoo";
        token_list expected {
            tokens::start_token(),
            null_token(),
            unquoted_text_token("foo"),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("unquoted text containing true") {
        string source = "footrue";
        token_list expected {
            tokens::start_token(),
            unquoted_text_token("footrue"),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("unquoted text containing space true") {
        string source = "foo true";
        token_list expected {
            tokens::start_token(),
            unquoted_text_token("foo"),
            unquoted_text_token(" "),
            bool_token(true),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("true and space and unquoted text") {
        string source = "true foo";
        token_list expected {
            tokens::start_token(),
            bool_token(true),
            unquoted_text_token(" "),
            unquoted_text_token("foo"),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }
}

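// Slashes are legal inside unquoted text (only "//" starts a comment), and spacing around
// unquoted text is expected back as whitespace tokens at the edges but as unquoted-text
// tokens between words.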
TEST_CASE("unquoted strings with special cases", "[tokenizer]") {
    SECTION("unquoted text containing slash") {
        string source = "a/b/c";
        token_list expected {
            tokens::start_token(),
            unquoted_text_token("a/b/c"),
            tokens::end_token()
        };
        tokenizer_test(source, expected);

        source = "/";
        expected = {
            tokens::start_token(),
            unquoted_text_token("/"),
            tokens::end_token()
        };
        tokenizer_test(source, expected);

        source = "/ /";
        expected = {
            tokens::start_token(),
            unquoted_text_token("/"),
            unquoted_text_token(" "),
            unquoted_text_token("/"),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("unquoted text discards external spaces") {
        string source = " foo \n";
        token_list expected {
            tokens::start_token(),
            whitespace_token(" "),
            unquoted_text_token("foo"),
            whitespace_token(" "),
            line_token(1),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("unquoted text keeps internal spaces") {
        string source = " foo bar baz \n";
        token_list expected {
            tokens::start_token(),
            whitespace_token(" "),
            unquoted_text_token("foo"),
            unquoted_text_token(" "),
            unquoted_text_token("bar"),
            unquoted_text_token(" "),
            unquoted_text_token("baz"),
            whitespace_token(" "),
            line_token(1),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("mix quoted and unquoted") {
        string source = " foo\"bar\"baz \n";
        token_list expected {
            tokens::start_token(),
            whitespace_token(" "),
            unquoted_text_token("foo"),
            string_token("bar"),
            unquoted_text_token("baz"),
            whitespace_token(" "),
            line_token(1),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }
}

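// Quoted strings should process JSON-style escape sequences, including \uXXXX escapes
// decoded into the resulting string value.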
TEST_CASE("escape sequence", "[tokenizer]") {
    SECTION("unicode infinity symbol") {
        string source = "\"\\u221E\"";
        token_list expected {
            tokens::start_token(),
            string_token(u8"\u221E"),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("null byte") {
        string source = " \"\\u0000\" ";
        token_list expected {
            tokens::start_token(),
            whitespace_token(" "),
            string_token(""),
            whitespace_token(" "),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("various escape codes") {
        string source = " \"\\\"\\\\/\\b\\f\\n\\r\\t\" ";
        token_list expected {
            tokens::start_token(),
            whitespace_token(" "),
            string_token("\"\\/\b\f\n\r\t"),
            whitespace_token(" "),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("unicode F") {
        string source = " \"\\u0046\" ";
        token_list expected {
            tokens::start_token(),
            whitespace_token(" "),
            string_token("F"),
            whitespace_token(" "),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("two unicode F's") {
        string source = " \"\\u0046\\u0046\" ";
        token_list expected {
            tokens::start_token(),
            whitespace_token(" "),
            string_token("FF"),
            whitespace_token(" "),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }
}

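// Triple-quoted strings are expected to be literal: escape sequences are not processed and
// embedded quotes and newlines are kept as-is.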
TEST_CASE("triple quoted strings", "[tokenizer]") {
    SECTION("trivial triple quoted string") {
        string source = "\"\"\"bar\"\"\"";
        token_list expected {
            tokens::start_token(),
            string_token("bar"),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("trailing quotes in triple quoted string") {
        string source = "\"\"\"\"\"\"\"\"";
        token_list expected {
            tokens::start_token(),
            string_token("\"\""),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("no escape in triple quoted strings") {
        string source = "\"\"\"\\n\"\"\"";
        token_list expected {
            tokens::start_token(),
            string_token("\\n"),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("new line in triple quoted string") {
        string source = "\"\"\"foo\nbar\"\"\"";
        token_list expected {
            tokens::start_token(),
            string_token("foo\nbar"),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }
}

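// Both "//" and "#" should start a comment that runs to the end of the line; the same
// characters inside a quoted string are just part of the string value.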
TEST_CASE("comments", "[tokenizer]") {
    SECTION("double slash comment") {
        string source = "//";
        token_list expected {
            tokens::start_token(),
            double_slash_comment_token(""),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("hash comment") {
        string source = "#";
        token_list expected {
            tokens::start_token(),
            hash_comment_token(""),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("two slashes in quoted string is string") {
        string source = "\"//bar\"";
        token_list expected {
            tokens::start_token(),
            string_token("//bar"),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("hash in quoted string is string") {
        string source = "\"#bar\"";
        token_list expected {
            tokens::start_token(),
            string_token("#bar"),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("slash comment after unquoted text") {
        string source = "bar//comment";
        token_list expected {
            tokens::start_token(),
            unquoted_text_token("bar"),
            double_slash_comment_token("comment"),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("hash comment after unquoted text") {
        string source = "bar#comment";
        token_list expected {
            tokens::start_token(),
            unquoted_text_token("bar"),
            hash_comment_token("comment"),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("slash comment after int") {
        string source = "10//comment";
        token_list expected {
            tokens::start_token(),
            int_token(10, "10"),
            double_slash_comment_token("comment"),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("hash comment after int") {
        string source = "10#comment";
        token_list expected {
            tokens::start_token(),
            int_token(10, "10"),
            hash_comment_token("comment"),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("slash comment with newline") {
        string source = "10//comment\n12";
        token_list expected {
            tokens::start_token(),
            int_token(10, "10"),
            double_slash_comment_token("comment"),
            line_token(1),
            int_token(12, "12"),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("hash comment with newline") {
        string source = "10#comment\n12";
        token_list expected {
            tokens::start_token(),
            int_token(10, "10"),
            hash_comment_token("comment"),
            line_token(1),
            int_token(12, "12"),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("slash comment on multiple lines with whitespace") {
        string source = " //comment\r\n //comment2 \n//comment3 \n\n//comment4";
        token_list expected {
            tokens::start_token(),
            whitespace_token(" "),
            double_slash_comment_token("comment\r"),
            line_token(1),
            whitespace_token(" "),
            double_slash_comment_token("comment2 "),
            line_token(2),
            double_slash_comment_token("comment3 "),
            line_token(3),
            line_token(4),
            double_slash_comment_token("comment4"),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("hash comment on multiple lines with whitespace") {
        string source = " #comment\r\n #comment2 \n#comment3 \n\n#comment4";
        token_list expected {
            tokens::start_token(),
            whitespace_token(" "),
            hash_comment_token("comment\r"),
            line_token(1),
            whitespace_token(" "),
            hash_comment_token("comment2 "),
            line_token(2),
            hash_comment_token("comment3 "),
            line_token(3),
            line_token(4),
            hash_comment_token("comment4"),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }
}

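// Braces and brackets should each come back as individual structural tokens; the tokenizer
// itself does not require them to be balanced.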
TEST_CASE("brackets and braces", "[tokenizer]") {
    SECTION("open curlies") {
        string source = "{{";
        token_list expected {
            tokens::start_token(),
            tokens::open_curly_token(),
            tokens::open_curly_token(),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("close curlies") {
        string source = "}}";
        token_list expected {
            tokens::start_token(),
            tokens::close_curly_token(),
            tokens::close_curly_token(),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("open and close curlies") {
        string source = "{}";
        token_list expected {
            tokens::start_token(),
            tokens::open_curly_token(),
            tokens::close_curly_token(),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("open squares") {
        string source = "[[";
        token_list expected {
            tokens::start_token(),
            tokens::open_square_token(),
            tokens::open_square_token(),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("close squares") {
        string source = "]]";
        token_list expected {
            tokens::start_token(),
            tokens::close_square_token(),
            tokens::close_square_token(),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("open and close squares") {
        string source = "[]";
        token_list expected {
            tokens::start_token(),
            tokens::open_square_token(),
            tokens::close_square_token(),
            tokens::end_token()
        };
        tokenizer_test(source, expected);
    }
}

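// Helper: drains the token iterator so that any tokenizer error surfaces as an exception.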
void test_for_config_error(string source) {
    token_iterator iter(fake_origin(), unique_ptr<istringstream>(new istringstream(source)), true);
    while (iter.has_next()) {
        iter.next();
    }
}

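// Malformed escapes, truncated \u sequences, unterminated strings, and a lone "$" or "${"
// should all cause the tokenizer to throw.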
TEST_CASE("catch syntax errors", "[tokenizer]") {
    SECTION("nothing after backslash") {
        string source = " \"\\\" ";
        REQUIRE_THROWS(test_for_config_error(source));
    }

    SECTION("\\q is not a valid escape sequence") {
        string source = " \"\\q\" ";
        REQUIRE_THROWS(test_for_config_error(source));
    }

    SECTION("unicode byte sequence missing bytes") {
        string source = " \"\\u012\" ";
        REQUIRE_THROWS(test_for_config_error(source));

        source = " \"\\u01\" ";
        REQUIRE_THROWS(test_for_config_error(source));

        source = " \"\\u1\" ";
        REQUIRE_THROWS(test_for_config_error(source));

        source = " \"\\u\" ";
        REQUIRE_THROWS(test_for_config_error(source));
    }

    SECTION("missing closing quotes") {
        string source = "\"";
        REQUIRE_THROWS(test_for_config_error(source));

        source = "\"abc";
        REQUIRE_THROWS(test_for_config_error(source));
    }

    SECTION("invalid lone characters") {
        string source = "\"\\\"";
        REQUIRE_THROWS(test_for_config_error(source));

        source = "$";
        REQUIRE_THROWS(test_for_config_error(source));

        source = "${";
        REQUIRE_THROWS(test_for_config_error(source));
    }
}