#include <catch.hpp>
#include <internal/tokenizer.hpp>
#include "test_utils.hpp"
#include <iostream>

using namespace std;
using namespace hocon;
using namespace hocon::test_utils;

token_list tokenize_as_list(string const& source) {
    token_iterator iter(fake_origin(), unique_ptr<istringstream>(new istringstream(source)), true);
    // get all the tokens from the string and put them in a vector
    token_list tokens;
    while (iter.has_next()) {
        tokens.push_back(iter.next());
    }
    return tokens;
}

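// Tokenize the source and require each token to match the expected list,
// printing which token differed before failing.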
void tokenizer_test(string const& source, token_list expected) {
    token_list result = tokenize_as_list(source);
    REQUIRE(expected.size() == result.size());
    for (size_t i = 0; i < expected.size(); i++) {
        if (!(*expected[i] == *result[i])) {
            // Debugging info
            cerr << result[i]->to_string() << " but expected " << expected[i]->to_string() << " -- ";
            cerr << "i = " << i << endl;
        }
        REQUIRE(*expected[i] == *result[i]);
    }
}

TEST_CASE("tokenize basic strings", "[tokenizer]") {
    SECTION("tokenize empty string") {
        string source = "";
        token_list expected {
                tokens::start_token(),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("tokenize newlines") {
        string source = "\n\n";
        token_list expected {
                tokens::start_token(),
                make_shared<line>(fake_origin()->with_line_number(1)),
                make_shared<line>(fake_origin()->with_line_number(2)),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }
}

TEST_CASE("tokenize all types", "[tokenizer]") {
    SECTION("tokenize all types with no spaces") {
        string source = ",:=}{][+=\"foo\"\"\"\"bar\"\"\"true3.14false42null${a.b}${?x.y}${\"c.d\"}\n";
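        // ${?x.y} uses the optional-substitution form, hence the 'true' flag on its expected token.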
        token_list expected {
                tokens::start_token(),
                tokens::comma_token(),
                tokens::colon_token(),
                tokens::equals_token(),
                tokens::close_curly_token(),
                tokens::open_curly_token(),
                tokens::close_square_token(),
                tokens::open_square_token(),
                tokens::plus_equals_token(),
                string_token("foo"),
                string_token("bar"),
                bool_token(true),
                double_token(3.14, "3.14"),
                bool_token(false),
                int_token(42, "42"),
                null_token(),
                substitution_token(unquoted_text_token("a.b"), false),
                substitution_token(unquoted_text_token("x.y"), true),
                substitution_token(string_token("\"c.d\""), false),
                line_token(1),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("tokenize all types with spaces") {
        string source = " , : = } { ] [ += \"foo\" \"\"\"bar\"\"\" 42 true 3.14 false null ${a.b} ${?x.y} ${\"c.d\"} \n ";
        token_list expected {
                tokens::start_token(),
                whitespace_token(" "),
                tokens::comma_token(),
                whitespace_token(" "),
                tokens::colon_token(),
                whitespace_token(" "),
                tokens::equals_token(),
                whitespace_token(" "),
                tokens::close_curly_token(),
                whitespace_token(" "),
                tokens::open_curly_token(),
                whitespace_token(" "),
                tokens::close_square_token(),
                whitespace_token(" "),
                tokens::open_square_token(),
                whitespace_token(" "),
                tokens::plus_equals_token(),
                whitespace_token(" "),
                string_token("foo"),
                unquoted_text_token(" "),
                string_token("bar"),
                unquoted_text_token(" "),
                int_token(42, "42"),
                unquoted_text_token(" "),
                bool_token(true),
                unquoted_text_token(" "),
                double_token(3.14, "3.14"),
                unquoted_text_token(" "),
                bool_token(false),
                unquoted_text_token(" "),
                null_token(),
                unquoted_text_token(" "),
                substitution_token(unquoted_text_token("a.b"), false),
                unquoted_text_token(" "),
                substitution_token(unquoted_text_token("x.y"), true),
                unquoted_text_token(" "),
                substitution_token(string_token("\"c.d\""), false),
                whitespace_token(" "),
                line_token(1),
                whitespace_token(" "),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("tokenize all types with multiple spaces") {
        string source = "   ,   :   =   }   {   ]   [   +=   \"foo\"   \"\"\"bar\"\"\"   42   true   3.14   false   null   ${a.b}   ${?x.y}   ${\"c.d\"}  \n   ";
        token_list expected {
                tokens::start_token(),
                whitespace_token("   "),
                tokens::comma_token(),
                whitespace_token("   "),
                tokens::colon_token(),
                whitespace_token("   "),
                tokens::equals_token(),
                whitespace_token("   "),
                tokens::close_curly_token(),
                whitespace_token("   "),
                tokens::open_curly_token(),
                whitespace_token("   "),
                tokens::close_square_token(),
                whitespace_token("   "),
                tokens::open_square_token(),
                whitespace_token("   "),
                tokens::plus_equals_token(),
                whitespace_token("   "),
                string_token("foo"),
                unquoted_text_token("   "),
                string_token("bar"),
                unquoted_text_token("   "),
                int_token(42, "42"),
                unquoted_text_token("   "),
                bool_token(true),
                unquoted_text_token("   "),
                double_token(3.14, "3.14"),
                unquoted_text_token("   "),
                bool_token(false),
                unquoted_text_token("   "),
                null_token(),
                unquoted_text_token("   "),
                substitution_token(unquoted_text_token("a.b"), false),
                unquoted_text_token("   "),
                substitution_token(unquoted_text_token("x.y"), true),
                unquoted_text_token("   "),
                substitution_token(string_token("\"c.d\""), false),
                whitespace_token("  "),
                line_token(1),
                whitespace_token("   "),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }
}

TEST_CASE("unquoted text and booleans", "[tokenizer]") {
    SECTION("true and unquoted text") {
        string source = "truefoo";
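        // 'true' is tokenized as a boolean even when text follows it immediately.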
        token_list expected {
                tokens::start_token(),
                bool_token(true),
                unquoted_text_token("foo"),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("false and unquoted text") {
        string source = "falsefoo";
        token_list expected {
                tokens::start_token(),
                bool_token(false),
                unquoted_text_token("foo"),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("null and unquoted text") {
        string source = "nullfoo";
        token_list expected {
                tokens::start_token(),
                null_token(),
                unquoted_text_token("foo"),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("unquoted text containing true") {
        string source = "footrue";
        token_list expected {
                tokens::start_token(),
                unquoted_text_token("footrue"),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("unquoted text containing space true") {
        string source = "foo true";
        token_list expected {
                tokens::start_token(),
                unquoted_text_token("foo"),
                unquoted_text_token(" "),
                bool_token(true),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("true and space and unquoted text") {
        string source = "true foo";
        token_list expected {
                tokens::start_token(),
                bool_token(true),
                unquoted_text_token(" "),
                unquoted_text_token("foo"),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }
}

TEST_CASE("unquoted strings with special cases", "[tokenizer]") {
    SECTION("unquoted text containing slash") {
        string source = "a/b/c";
        token_list expected {
                tokens::start_token(),
                unquoted_text_token("a/b/c"),
                tokens::end_token()
        };
        tokenizer_test(source, expected);

        source = "/";
        expected = {
                tokens::start_token(),
                unquoted_text_token("/"),
                tokens::end_token()
        };
        tokenizer_test(source, expected);

        source = "/ /";
        expected = {
                tokens::start_token(),
                unquoted_text_token("/"),
                unquoted_text_token(" "),
                unquoted_text_token("/"),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("unquoted text discards external spaces") {
        string source = "   foo   \n";
        token_list expected {
                tokens::start_token(),
                whitespace_token("   "),
                unquoted_text_token("foo"),
                whitespace_token("   "),
                line_token(1),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("unquoted text keeps internal spaces") {
        string source = "    foo  bar baz   \n";
        token_list expected {
                tokens::start_token(),
                whitespace_token("    "),
                unquoted_text_token("foo"),
                unquoted_text_token("  "),
                unquoted_text_token("bar"),
                unquoted_text_token(" "),
                unquoted_text_token("baz"),
                whitespace_token("   "),
                line_token(1),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("mix quoted and unquoted") {
        string source = "   foo\"bar\"baz   \n";
        token_list expected {
                tokens::start_token(),
                whitespace_token("   "),
                unquoted_text_token("foo"),
                string_token("bar"),
                unquoted_text_token("baz"),
                whitespace_token("   "),
                line_token(1),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }
}

TEST_CASE("escape sequence", "[tokenizer]") {
    SECTION("unicode infinity symbol") {
        string source = "\"\\u221E\"";
        token_list expected {
                tokens::start_token(),
                string_token(u8"\u221E"),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("null byte") {
        string source = " \"\\u0000\" ";
        token_list expected {
                tokens::start_token(),
                whitespace_token(" "),
                string_token(""),
                whitespace_token(" "),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("various escape codes") {
        string source = " \"\\\"\\\\/\\b\\f\\n\\r\\t\" ";
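        // exercises the escapes \" and \\ plus \b \f \n \r \t; the '/' passes through unescaped.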
        token_list expected {
                tokens::start_token(),
                whitespace_token(" "),
                string_token("\"\\/\b\f\n\r\t"),
                whitespace_token(" "),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("unicode F") {
        string source = " \"\\u0046\" ";
        token_list expected {
                tokens::start_token(),
                whitespace_token(" "),
                string_token("F"),
                whitespace_token(" "),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("two unicode F's") {
        string source = " \"\\u0046\\u0046\" ";
        token_list expected {
                tokens::start_token(),
                whitespace_token(" "),
                string_token("FF"),
                whitespace_token(" "),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }
}

TEST_CASE("triple quoted strings", "[tokenizer]") {
    SECTION("trivial triple quoted string") {
        string source = "\"\"\"bar\"\"\"";
        token_list expected {
                tokens::start_token(),
                string_token("bar"),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("trailing quotes in triple quoted string") {
        string source = "\"\"\"\"\"\"\"\"";
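        // eight quotes: three open the string, the last three close it, leaving "" as content.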
        token_list expected {
                tokens::start_token(),
                string_token("\"\""),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

405     SECTION("no esacpe in triple quoted strings") {
406         string source = "\"\"\"\\n\"\"\"";
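        // escape sequences are not interpreted inside triple quotes, so \n stays two literal characters.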
        token_list expected {
                tokens::start_token(),
                string_token("\\n"),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("new line in triple quoted string") {
        string source = "\"\"\"foo\nbar\"\"\"";
        token_list expected {
                tokens::start_token(),
                string_token("foo\nbar"),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }
}

TEST_CASE("comments", "[tokenizer]") {
    SECTION("double slash comment") {
        string source = "//";
        token_list expected {
                tokens::start_token(),
                double_slash_comment_token(""),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("hash comment") {
        string source = "#";
        token_list expected {
                tokens::start_token(),
                hash_comment_token(""),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("two slashes in quoted string is string") {
        string source = "\"//bar\"";
        token_list expected {
                tokens::start_token(),
                string_token("//bar"),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("hash in quoted string is string") {
        string source = "\"#bar\"";
        token_list expected {
                tokens::start_token(),
                string_token("#bar"),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("slash comment after unquoted text") {
        string source = "bar//comment";
        token_list expected {
                tokens::start_token(),
                unquoted_text_token("bar"),
                double_slash_comment_token("comment"),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("hash comment after unquoted text") {
        string source = "bar#comment";
        token_list expected {
                tokens::start_token(),
                unquoted_text_token("bar"),
                hash_comment_token("comment"),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("slash comment after int") {
        string source = "10//comment";
        token_list expected {
                tokens::start_token(),
                int_token(10, "10"),
                double_slash_comment_token("comment"),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("hash comment after int") {
        string source = "10#comment";
        token_list expected {
                tokens::start_token(),
                int_token(10, "10"),
                hash_comment_token("comment"),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("slash comment with newline") {
        string source = "10//comment\n12";
        token_list expected {
                tokens::start_token(),
                int_token(10, "10"),
                double_slash_comment_token("comment"),
                line_token(1),
                int_token(12, "12"),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("hash comment with newline") {
        string source = "10#comment\n12";
        token_list expected {
                tokens::start_token(),
                int_token(10, "10"),
                hash_comment_token("comment"),
                line_token(1),
                int_token(12, "12"),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

537     SECTION("slash comment on multiples lines with whitespace") {
538         string source = "   //comment\r\n   //comment2   \n//comment3   \n\n//comment4";
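        // only '\n' terminates a comment, so the '\r' of the CRLF stays in the comment text.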
        token_list expected {
                tokens::start_token(),
                whitespace_token("   "),
                double_slash_comment_token("comment\r"),
                line_token(1),
                whitespace_token("   "),
                double_slash_comment_token("comment2   "),
                line_token(2),
                double_slash_comment_token("comment3   "),
                line_token(3),
                line_token(4),
                double_slash_comment_token("comment4"),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

556     SECTION("hash comment on multiples lines with whitespace") {
        string source = "   #comment\r\n   #comment2   \n#comment3   \n\n#comment4";
        token_list expected {
                tokens::start_token(),
                whitespace_token("   "),
                hash_comment_token("comment\r"),
                line_token(1),
                whitespace_token("   "),
                hash_comment_token("comment2   "),
                line_token(2),
                hash_comment_token("comment3   "),
                line_token(3),
                line_token(4),
                hash_comment_token("comment4"),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }
}

TEST_CASE("brackets and braces", "[tokenizer]") {
    SECTION("open curlies") {
        string source = "{{";
        token_list expected{
                tokens::start_token(),
                tokens::open_curly_token(),
                tokens::open_curly_token(),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("close curlies") {
        string source = "}}";
        token_list expected{
                tokens::start_token(),
                tokens::close_curly_token(),
                tokens::close_curly_token(),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("open and close curlies") {
        string source = "{}";
        token_list expected{
                tokens::start_token(),
                tokens::open_curly_token(),
                tokens::close_curly_token(),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

    SECTION("open squares") {
        string source = "[[";
        token_list expected{
                tokens::start_token(),
                tokens::open_square_token(),
                tokens::open_square_token(),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

621     SECTION("close curlies") {
        string source = "]]";
        token_list expected{
                tokens::start_token(),
                tokens::close_square_token(),
                tokens::close_square_token(),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }

632     SECTION("open and close curlies") {
        string source = "[]";
        token_list expected{
                tokens::start_token(),
                tokens::open_square_token(),
                tokens::close_square_token(),
                tokens::end_token()
        };
        tokenizer_test(source, expected);
    }
}

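// Tokenize the entire source, discarding the tokens; callers wrap this in
// REQUIRE_THROWS to assert that invalid input raises an error.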
void test_for_config_error(string const& source) {
    token_iterator iter(fake_origin(), unique_ptr<istringstream>(new istringstream(source)), true);
    while (iter.has_next()) {
        iter.next();
    }
}

651 TEST_CASE("catch syntax erros", "[tokenizer]") {
    SECTION("nothing after backslash") {
        string source = " \"\\\" ";
        REQUIRE_THROWS(test_for_config_error(source));
    }

    SECTION("\\q is not a valid escape sequence") {
        string source = " \"\\q\" ";
        REQUIRE_THROWS(test_for_config_error(source));
    }

662     SECTION("unicode byte sequence missing bytes") {
        string source = " \"\\u012\" ";
        REQUIRE_THROWS(test_for_config_error(source));

        source = " \"\\u01\" ";
        REQUIRE_THROWS(test_for_config_error(source));

        source = " \"\\u1\" ";
        REQUIRE_THROWS(test_for_config_error(source));

        source = " \"\\u\" ";
        REQUIRE_THROWS(test_for_config_error(source));
    }

    SECTION("missing closing quotes") {
        string source = "\"";
        REQUIRE_THROWS(test_for_config_error(source));

        source = "\"abc";
        REQUIRE_THROWS(test_for_config_error(source));
    }

    SECTION("invalid lone characters") {
        string source = "\"\\\"";
        REQUIRE_THROWS(test_for_config_error(source));

        source = "$";
        REQUIRE_THROWS(test_for_config_error(source));

        source = "${";
        REQUIRE_THROWS(test_for_config_error(source));
    }
}