1 /*
2 __ _____ _____ _____
3 __| | __| | | | JSON for Modern C++ (test suite)
4 | | |__ | | | | | | version 3.7.3
5 |_____|_____|_____|_|___| https://github.com/nlohmann/json
6
7 Licensed under the MIT License <http://opensource.org/licenses/MIT>.
8 SPDX-License-Identifier: MIT
9 Copyright (c) 2013-2019 Niels Lohmann <http://nlohmann.me>.
10
11 Permission is hereby granted, free of charge, to any person obtaining a copy
12 of this software and associated documentation files (the "Software"), to deal
13 in the Software without restriction, including without limitation the rights
14 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15 copies of the Software, and to permit persons to whom the Software is
16 furnished to do so, subject to the following conditions:
17
18 The above copyright notice and this permission notice shall be included in all
19 copies or substantial portions of the Software.
20
21 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27 SOFTWARE.
28 */
29
30 #include "doctest_compatibility.h"
31
32 // for some reason including this after the json header leads to linker errors with VS 2017...
33 #include <locale>
34
35 #define private public
36 #include <nlohmann/json.hpp>
37 using nlohmann::json;
38 #undef private
39
40 #include <fstream>
41 #include <sstream>
42 #include <iostream>
43 #include <iomanip>
44
45 namespace
46 {
// running count of check_utf8string() invocations; used for its progress output
extern std::size_t calls;
std::size_t calls{0};
49
50 void check_utf8dump(bool success_expected, int byte1, int byte2, int byte3, int byte4);
51
check_utf8dump(bool success_expected,int byte1,int byte2=-1,int byte3=-1,int byte4=-1)52 void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 = -1, int byte4 = -1)
53 {
54 std::string json_string;
55
56 CAPTURE(byte1)
57 CAPTURE(byte2)
58 CAPTURE(byte3)
59 CAPTURE(byte4)
60
61 json_string += std::string(1, static_cast<char>(byte1));
62
63 if (byte2 != -1)
64 {
65 json_string += std::string(1, static_cast<char>(byte2));
66 }
67
68 if (byte3 != -1)
69 {
70 json_string += std::string(1, static_cast<char>(byte3));
71 }
72
73 if (byte4 != -1)
74 {
75 json_string += std::string(1, static_cast<char>(byte4));
76 }
77
78 CAPTURE(json_string)
79
80 // store the string in a JSON value
81 json j = json_string;
82 json j2 = "abc" + json_string + "xyz";
83
84 // dumping with ignore/replace must not throw in any case
85 auto s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore);
86 auto s_ignored2 = j2.dump(-1, ' ', false, json::error_handler_t::ignore);
87 auto s_ignored_ascii = j.dump(-1, ' ', true, json::error_handler_t::ignore);
88 auto s_ignored2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::ignore);
89 auto s_replaced = j.dump(-1, ' ', false, json::error_handler_t::replace);
90 auto s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace);
91 auto s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace);
92 auto s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace);
93
94 if (success_expected)
95 {
96 // strict mode must not throw if success is expected
97 auto s_strict = j.dump();
98 // all dumps should agree on the string
99 CHECK(s_strict == s_ignored);
100 CHECK(s_strict == s_replaced);
101 }
102 else
103 {
104 // strict mode must throw if success is not expected
105 CHECK_THROWS_AS(j.dump(), json::type_error&);
106 // ignore and replace must create different dumps
107 CHECK(s_ignored != s_replaced);
108
109 // check that replace string contains a replacement character
110 CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos);
111 }
112
113 // check that prefix and suffix are preserved
114 CHECK(s_ignored2.substr(1, 3) == "abc");
115 CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz");
116 CHECK(s_ignored2_ascii.substr(1, 3) == "abc");
117 CHECK(s_ignored2_ascii.substr(s_ignored2_ascii.size() - 4, 3) == "xyz");
118 CHECK(s_replaced2.substr(1, 3) == "abc");
119 CHECK(s_replaced2.substr(s_replaced2.size() - 4, 3) == "xyz");
120 CHECK(s_replaced2_ascii.substr(1, 3) == "abc");
121 CHECK(s_replaced2_ascii.substr(s_replaced2_ascii.size() - 4, 3) == "xyz");
122 }
123
124 void check_utf8string(bool success_expected, int byte1, int byte2, int byte3, int byte4);
125
126 // create and check a JSON string with up to four UTF-8 bytes
check_utf8string(bool success_expected,int byte1,int byte2=-1,int byte3=-1,int byte4=-1)127 void check_utf8string(bool success_expected, int byte1, int byte2 = -1, int byte3 = -1, int byte4 = -1)
128 {
129 if (++calls % 100000 == 0)
130 {
131 std::cout << calls << " of 8860608 UTF-8 strings checked" << std::endl;
132 }
133
134 std::string json_string = "\"";
135
136 CAPTURE(byte1)
137 json_string += std::string(1, static_cast<char>(byte1));
138
139 if (byte2 != -1)
140 {
141 CAPTURE(byte2)
142 json_string += std::string(1, static_cast<char>(byte2));
143 }
144
145 if (byte3 != -1)
146 {
147 CAPTURE(byte3)
148 json_string += std::string(1, static_cast<char>(byte3));
149 }
150
151 if (byte4 != -1)
152 {
153 CAPTURE(byte4)
154 json_string += std::string(1, static_cast<char>(byte4));
155 }
156
157 json_string += "\"";
158
159 CAPTURE(json_string)
160
161 json _;
162 if (success_expected)
163 {
164 CHECK_NOTHROW(_ = json::parse(json_string));
165 }
166 else
167 {
168 CHECK_THROWS_AS(_ = json::parse(json_string), json::parse_error&);
169 }
170 }
171 }
172
skip()173 TEST_CASE("Unicode" * doctest::skip())
174 {
175 SECTION("RFC 3629")
176 {
177 /*
178 RFC 3629 describes in Sect. 4 the syntax of UTF-8 byte sequences as
179 follows:
180
181 A UTF-8 string is a sequence of octets representing a sequence of UCS
182 characters. An octet sequence is valid UTF-8 only if it matches the
183 following syntax, which is derived from the rules for encoding UTF-8
184 and is expressed in the ABNF of [RFC2234].
185
186 UTF8-octets = *( UTF8-char )
187 UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
188 UTF8-1 = %x00-7F
189 UTF8-2 = %xC2-DF UTF8-tail
190 UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
191 %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
192 UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
193 %xF4 %x80-8F 2( UTF8-tail )
194 UTF8-tail = %x80-BF
195 */
196
197 SECTION("ill-formed first byte")
198 {
199 for (int byte1 = 0x80; byte1 <= 0xC1; ++byte1)
200 {
201 check_utf8string(false, byte1);
202 check_utf8dump(false, byte1);
203 }
204
205 for (int byte1 = 0xF5; byte1 <= 0xFF; ++byte1)
206 {
207 check_utf8string(false, byte1);
208 check_utf8dump(false, byte1);
209 }
210 }
211
212 SECTION("UTF8-1 (x00-x7F)")
213 {
214 SECTION("well-formed")
215 {
216 for (int byte1 = 0x00; byte1 <= 0x7F; ++byte1)
217 {
218 // unescaped control characters are parse errors in JSON
219 if (0x00 <= byte1 and byte1 <= 0x1F)
220 {
221 check_utf8string(false, byte1);
222 continue;
223 }
224
225 // a single quote is a parse error in JSON
226 if (byte1 == 0x22)
227 {
228 check_utf8string(false, byte1);
229 continue;
230 }
231
232 // a single backslash is a parse error in JSON
233 if (byte1 == 0x5C)
234 {
235 check_utf8string(false, byte1);
236 continue;
237 }
238
239 // all other characters are OK
240 check_utf8string(true, byte1);
241 check_utf8dump(true, byte1);
242 }
243 }
244 }
245
246 SECTION("UTF8-2 (xC2-xDF UTF8-tail)")
247 {
248 SECTION("well-formed")
249 {
250 for (int byte1 = 0xC2; byte1 <= 0xDF; ++byte1)
251 {
252 for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
253 {
254 check_utf8string(true, byte1, byte2);
255 check_utf8dump(true, byte1, byte2);
256 }
257 }
258 }
259
260 SECTION("ill-formed: missing second byte")
261 {
262 for (int byte1 = 0xC2; byte1 <= 0xDF; ++byte1)
263 {
264 check_utf8string(false, byte1);
265 check_utf8dump(false, byte1);
266 }
267 }
268
269 SECTION("ill-formed: wrong second byte")
270 {
271 for (int byte1 = 0xC2; byte1 <= 0xDF; ++byte1)
272 {
273 for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
274 {
275 // skip correct second byte
276 if (0x80 <= byte2 and byte2 <= 0xBF)
277 {
278 continue;
279 }
280
281 check_utf8string(false, byte1, byte2);
282 check_utf8dump(false, byte1, byte2);
283 }
284 }
285 }
286 }
287
288 SECTION("UTF8-3 (xE0 xA0-BF UTF8-tail)")
289 {
290 SECTION("well-formed")
291 {
292 for (int byte1 = 0xE0; byte1 <= 0xE0; ++byte1)
293 {
294 for (int byte2 = 0xA0; byte2 <= 0xBF; ++byte2)
295 {
296 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
297 {
298 check_utf8string(true, byte1, byte2, byte3);
299 check_utf8dump(true, byte1, byte2, byte3);
300 }
301 }
302 }
303 }
304
305 SECTION("ill-formed: missing second byte")
306 {
307 for (int byte1 = 0xE0; byte1 <= 0xE0; ++byte1)
308 {
309 check_utf8string(false, byte1);
310 check_utf8dump(false, byte1);
311 }
312 }
313
314 SECTION("ill-formed: missing third byte")
315 {
316 for (int byte1 = 0xE0; byte1 <= 0xE0; ++byte1)
317 {
318 for (int byte2 = 0xA0; byte2 <= 0xBF; ++byte2)
319 {
320 check_utf8string(false, byte1, byte2);
321 check_utf8dump(false, byte1, byte2);
322 }
323 }
324 }
325
326 SECTION("ill-formed: wrong second byte")
327 {
328 for (int byte1 = 0xE0; byte1 <= 0xE0; ++byte1)
329 {
330 for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
331 {
332 // skip correct second byte
333 if (0xA0 <= byte2 and byte2 <= 0xBF)
334 {
335 continue;
336 }
337
338 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
339 {
340 check_utf8string(false, byte1, byte2, byte3);
341 check_utf8dump(false, byte1, byte2, byte3);
342 }
343 }
344 }
345 }
346
347 SECTION("ill-formed: wrong third byte")
348 {
349 for (int byte1 = 0xE0; byte1 <= 0xE0; ++byte1)
350 {
351 for (int byte2 = 0xA0; byte2 <= 0xBF; ++byte2)
352 {
353 for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
354 {
355 // skip correct third byte
356 if (0x80 <= byte3 and byte3 <= 0xBF)
357 {
358 continue;
359 }
360
361 check_utf8string(false, byte1, byte2, byte3);
362 check_utf8dump(false, byte1, byte2, byte3);
363 }
364 }
365 }
366 }
367 }
368
369 SECTION("UTF8-3 (xE1-xEC UTF8-tail UTF8-tail)")
370 {
371 SECTION("well-formed")
372 {
373 for (int byte1 = 0xE1; byte1 <= 0xEC; ++byte1)
374 {
375 for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
376 {
377 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
378 {
379 check_utf8string(true, byte1, byte2, byte3);
380 check_utf8dump(true, byte1, byte2, byte3);
381 }
382 }
383 }
384 }
385
386 SECTION("ill-formed: missing second byte")
387 {
388 for (int byte1 = 0xE1; byte1 <= 0xEC; ++byte1)
389 {
390 check_utf8string(false, byte1);
391 check_utf8dump(false, byte1);
392 }
393 }
394
395 SECTION("ill-formed: missing third byte")
396 {
397 for (int byte1 = 0xE1; byte1 <= 0xEC; ++byte1)
398 {
399 for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
400 {
401 check_utf8string(false, byte1, byte2);
402 check_utf8dump(false, byte1, byte2);
403 }
404 }
405 }
406
407 SECTION("ill-formed: wrong second byte")
408 {
409 for (int byte1 = 0xE1; byte1 <= 0xEC; ++byte1)
410 {
411 for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
412 {
413 // skip correct second byte
414 if (0x80 <= byte2 and byte2 <= 0xBF)
415 {
416 continue;
417 }
418
419 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
420 {
421 check_utf8string(false, byte1, byte2, byte3);
422 check_utf8dump(false, byte1, byte2, byte3);
423 }
424 }
425 }
426 }
427
428 SECTION("ill-formed: wrong third byte")
429 {
430 for (int byte1 = 0xE1; byte1 <= 0xEC; ++byte1)
431 {
432 for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
433 {
434 for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
435 {
436 // skip correct third byte
437 if (0x80 <= byte3 and byte3 <= 0xBF)
438 {
439 continue;
440 }
441
442 check_utf8string(false, byte1, byte2, byte3);
443 check_utf8dump(false, byte1, byte2, byte3);
444 }
445 }
446 }
447 }
448 }
449
450 SECTION("UTF8-3 (xED x80-9F UTF8-tail)")
451 {
452 SECTION("well-formed")
453 {
454 for (int byte1 = 0xED; byte1 <= 0xED; ++byte1)
455 {
456 for (int byte2 = 0x80; byte2 <= 0x9F; ++byte2)
457 {
458 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
459 {
460 check_utf8string(true, byte1, byte2, byte3);
461 check_utf8dump(true, byte1, byte2, byte3);
462 }
463 }
464 }
465 }
466
467 SECTION("ill-formed: missing second byte")
468 {
469 for (int byte1 = 0xED; byte1 <= 0xED; ++byte1)
470 {
471 check_utf8string(false, byte1);
472 check_utf8dump(false, byte1);
473 }
474 }
475
476 SECTION("ill-formed: missing third byte")
477 {
478 for (int byte1 = 0xED; byte1 <= 0xED; ++byte1)
479 {
480 for (int byte2 = 0x80; byte2 <= 0x9F; ++byte2)
481 {
482 check_utf8string(false, byte1, byte2);
483 check_utf8dump(false, byte1, byte2);
484 }
485 }
486 }
487
488 SECTION("ill-formed: wrong second byte")
489 {
490 for (int byte1 = 0xED; byte1 <= 0xED; ++byte1)
491 {
492 for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
493 {
494 // skip correct second byte
495 if (0x80 <= byte2 and byte2 <= 0x9F)
496 {
497 continue;
498 }
499
500 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
501 {
502 check_utf8string(false, byte1, byte2, byte3);
503 check_utf8dump(false, byte1, byte2, byte3);
504 }
505 }
506 }
507 }
508
509 SECTION("ill-formed: wrong third byte")
510 {
511 for (int byte1 = 0xED; byte1 <= 0xED; ++byte1)
512 {
513 for (int byte2 = 0x80; byte2 <= 0x9F; ++byte2)
514 {
515 for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
516 {
517 // skip correct third byte
518 if (0x80 <= byte3 and byte3 <= 0xBF)
519 {
520 continue;
521 }
522
523 check_utf8string(false, byte1, byte2, byte3);
524 check_utf8dump(false, byte1, byte2, byte3);
525 }
526 }
527 }
528 }
529 }
530
531 SECTION("UTF8-3 (xEE-xEF UTF8-tail UTF8-tail)")
532 {
533 SECTION("well-formed")
534 {
535 for (int byte1 = 0xEE; byte1 <= 0xEF; ++byte1)
536 {
537 for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
538 {
539 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
540 {
541 check_utf8string(true, byte1, byte2, byte3);
542 check_utf8dump(true, byte1, byte2, byte3);
543 }
544 }
545 }
546 }
547
548 SECTION("ill-formed: missing second byte")
549 {
550 for (int byte1 = 0xEE; byte1 <= 0xEF; ++byte1)
551 {
552 check_utf8string(false, byte1);
553 check_utf8dump(false, byte1);
554 }
555 }
556
557 SECTION("ill-formed: missing third byte")
558 {
559 for (int byte1 = 0xEE; byte1 <= 0xEF; ++byte1)
560 {
561 for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
562 {
563 check_utf8string(false, byte1, byte2);
564 check_utf8dump(false, byte1, byte2);
565 }
566 }
567 }
568
569 SECTION("ill-formed: wrong second byte")
570 {
571 for (int byte1 = 0xEE; byte1 <= 0xEF; ++byte1)
572 {
573 for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
574 {
575 // skip correct second byte
576 if (0x80 <= byte2 and byte2 <= 0xBF)
577 {
578 continue;
579 }
580
581 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
582 {
583 check_utf8string(false, byte1, byte2, byte3);
584 check_utf8dump(false, byte1, byte2, byte3);
585 }
586 }
587 }
588 }
589
590 SECTION("ill-formed: wrong third byte")
591 {
592 for (int byte1 = 0xEE; byte1 <= 0xEF; ++byte1)
593 {
594 for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
595 {
596 for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
597 {
598 // skip correct third byte
599 if (0x80 <= byte3 and byte3 <= 0xBF)
600 {
601 continue;
602 }
603
604 check_utf8string(false, byte1, byte2, byte3);
605 check_utf8dump(false, byte1, byte2, byte3);
606 }
607 }
608 }
609 }
610 }
611
612 SECTION("UTF8-4 (xF0 x90-BF UTF8-tail UTF8-tail)")
613 {
614 SECTION("well-formed")
615 {
616 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
617 {
618 for (int byte2 = 0x90; byte2 <= 0xBF; ++byte2)
619 {
620 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
621 {
622 for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
623 {
624 check_utf8string(true, byte1, byte2, byte3, byte4);
625 check_utf8dump(true, byte1, byte2, byte3, byte4);
626 }
627 }
628 }
629 }
630 }
631
632 SECTION("ill-formed: missing second byte")
633 {
634 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
635 {
636 check_utf8string(false, byte1);
637 check_utf8dump(false, byte1);
638 }
639 }
640
641 SECTION("ill-formed: missing third byte")
642 {
643 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
644 {
645 for (int byte2 = 0x90; byte2 <= 0xBF; ++byte2)
646 {
647 check_utf8string(false, byte1, byte2);
648 check_utf8dump(false, byte1, byte2);
649 }
650 }
651 }
652
653 SECTION("ill-formed: missing fourth byte")
654 {
655 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
656 {
657 for (int byte2 = 0x90; byte2 <= 0xBF; ++byte2)
658 {
659 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
660 {
661 check_utf8string(false, byte1, byte2, byte3);
662 check_utf8dump(false, byte1, byte2, byte3);
663 }
664 }
665 }
666 }
667
668 SECTION("ill-formed: wrong second byte")
669 {
670 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
671 {
672 for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
673 {
674 // skip correct second byte
675 if (0x90 <= byte2 and byte2 <= 0xBF)
676 {
677 continue;
678 }
679
680 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
681 {
682 for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
683 {
684 check_utf8string(false, byte1, byte2, byte3, byte4);
685 check_utf8dump(false, byte1, byte2, byte3, byte4);
686 }
687 }
688 }
689 }
690 }
691
692 SECTION("ill-formed: wrong third byte")
693 {
694 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
695 {
696 for (int byte2 = 0x90; byte2 <= 0xBF; ++byte2)
697 {
698 for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
699 {
700 // skip correct third byte
701 if (0x80 <= byte3 and byte3 <= 0xBF)
702 {
703 continue;
704 }
705
706 for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
707 {
708 check_utf8string(false, byte1, byte2, byte3, byte4);
709 check_utf8dump(false, byte1, byte2, byte3, byte4);
710 }
711 }
712 }
713 }
714 }
715
716 SECTION("ill-formed: wrong fourth byte")
717 {
718 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
719 {
720 for (int byte2 = 0x90; byte2 <= 0xBF; ++byte2)
721 {
722 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
723 {
724 for (int byte4 = 0x00; byte4 <= 0xFF; ++byte4)
725 {
726 // skip fourth second byte
727 if (0x80 <= byte3 and byte3 <= 0xBF)
728 {
729 continue;
730 }
731
732 check_utf8string(false, byte1, byte2, byte3, byte4);
733 check_utf8dump(false, byte1, byte2, byte3, byte4);
734 }
735 }
736 }
737 }
738 }
739 }
740
741 SECTION("UTF8-4 (xF1-F3 UTF8-tail UTF8-tail UTF8-tail)")
742 {
743 SECTION("well-formed")
744 {
745 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
746 {
747 for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
748 {
749 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
750 {
751 for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
752 {
753 check_utf8string(true, byte1, byte2, byte3, byte4);
754 check_utf8dump(true, byte1, byte2, byte3, byte4);
755 }
756 }
757 }
758 }
759 }
760
761 SECTION("ill-formed: missing second byte")
762 {
763 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
764 {
765 check_utf8string(false, byte1);
766 check_utf8dump(false, byte1);
767 }
768 }
769
770 SECTION("ill-formed: missing third byte")
771 {
772 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
773 {
774 for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
775 {
776 check_utf8string(false, byte1, byte2);
777 check_utf8dump(false, byte1, byte2);
778 }
779 }
780 }
781
782 SECTION("ill-formed: missing fourth byte")
783 {
784 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
785 {
786 for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
787 {
788 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
789 {
790 check_utf8string(false, byte1, byte2, byte3);
791 check_utf8dump(false, byte1, byte2, byte3);
792 }
793 }
794 }
795 }
796
797 SECTION("ill-formed: wrong second byte")
798 {
799 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
800 {
801 for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
802 {
803 // skip correct second byte
804 if (0x80 <= byte2 and byte2 <= 0xBF)
805 {
806 continue;
807 }
808
809 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
810 {
811 for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
812 {
813 check_utf8string(false, byte1, byte2, byte3, byte4);
814 check_utf8dump(false, byte1, byte2, byte3, byte4);
815 }
816 }
817 }
818 }
819 }
820
821 SECTION("ill-formed: wrong third byte")
822 {
823 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
824 {
825 for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
826 {
827 for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
828 {
829 // skip correct third byte
830 if (0x80 <= byte3 and byte3 <= 0xBF)
831 {
832 continue;
833 }
834
835 for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
836 {
837 check_utf8string(false, byte1, byte2, byte3, byte4);
838 check_utf8dump(false, byte1, byte2, byte3, byte4);
839 }
840 }
841 }
842 }
843 }
844
845 SECTION("ill-formed: wrong fourth byte")
846 {
847 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
848 {
849 for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
850 {
851 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
852 {
853 for (int byte4 = 0x00; byte4 <= 0xFF; ++byte4)
854 {
855 // skip correct fourth byte
856 if (0x80 <= byte3 and byte3 <= 0xBF)
857 {
858 continue;
859 }
860
861 check_utf8string(false, byte1, byte2, byte3, byte4);
862 check_utf8dump(false, byte1, byte2, byte3, byte4);
863 }
864 }
865 }
866 }
867 }
868 }
869
870 SECTION("UTF8-4 (xF4 x80-8F UTF8-tail UTF8-tail)")
871 {
872 SECTION("well-formed")
873 {
874 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
875 {
876 for (int byte2 = 0x80; byte2 <= 0x8F; ++byte2)
877 {
878 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
879 {
880 for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
881 {
882 check_utf8string(true, byte1, byte2, byte3, byte4);
883 check_utf8dump(true, byte1, byte2, byte3, byte4);
884 }
885 }
886 }
887 }
888 }
889
890 SECTION("ill-formed: missing second byte")
891 {
892 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
893 {
894 check_utf8string(false, byte1);
895 check_utf8dump(false, byte1);
896 }
897 }
898
899 SECTION("ill-formed: missing third byte")
900 {
901 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
902 {
903 for (int byte2 = 0x80; byte2 <= 0x8F; ++byte2)
904 {
905 check_utf8string(false, byte1, byte2);
906 check_utf8dump(false, byte1, byte2);
907 }
908 }
909 }
910
911 SECTION("ill-formed: missing fourth byte")
912 {
913 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
914 {
915 for (int byte2 = 0x80; byte2 <= 0x8F; ++byte2)
916 {
917 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
918 {
919 check_utf8string(false, byte1, byte2, byte3);
920 check_utf8dump(false, byte1, byte2, byte3);
921 }
922 }
923 }
924 }
925
926 SECTION("ill-formed: wrong second byte")
927 {
928 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
929 {
930 for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
931 {
932 // skip correct second byte
933 if (0x80 <= byte2 and byte2 <= 0x8F)
934 {
935 continue;
936 }
937
938 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
939 {
940 for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
941 {
942 check_utf8string(false, byte1, byte2, byte3, byte4);
943 check_utf8dump(false, byte1, byte2, byte3, byte4);
944 }
945 }
946 }
947 }
948 }
949
950 SECTION("ill-formed: wrong third byte")
951 {
952 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
953 {
954 for (int byte2 = 0x80; byte2 <= 0x8F; ++byte2)
955 {
956 for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
957 {
958 // skip correct third byte
959 if (0x80 <= byte3 and byte3 <= 0xBF)
960 {
961 continue;
962 }
963
964 for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
965 {
966 check_utf8string(false, byte1, byte2, byte3, byte4);
967 check_utf8dump(false, byte1, byte2, byte3, byte4);
968 }
969 }
970 }
971 }
972 }
973
974 SECTION("ill-formed: wrong fourth byte")
975 {
976 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
977 {
978 for (int byte2 = 0x80; byte2 <= 0x8F; ++byte2)
979 {
980 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
981 {
982 for (int byte4 = 0x00; byte4 <= 0xFF; ++byte4)
983 {
984 // skip correct fourth byte
985 if (0x80 <= byte3 and byte3 <= 0xBF)
986 {
987 continue;
988 }
989
990 check_utf8string(false, byte1, byte2, byte3, byte4);
991 check_utf8dump(false, byte1, byte2, byte3, byte4);
992 }
993 }
994 }
995 }
996 }
997 }
998 }
999
1000 SECTION("\\uxxxx sequences")
1001 {
// Turn a number into a JSON escape sequence: a reverse solidus, the lowercase
// letter u, and the number in lowercase hexadecimal, zero-padded to a width of
// at least four digits.
const auto codepoint_to_unicode = [](std::size_t cp)
{
    std::ostringstream escape;
    escape << "\\u";
    escape << std::hex << std::setfill('0') << std::setw(4) << cp;
    return escape.str();
};
1013
1014 SECTION("correct sequences")
1015 {
1016 // generate all UTF-8 code points; in total, 1112064 code points are
1017 // generated: 0x1FFFFF code points - 2048 invalid values between
1018 // 0xD800 and 0xDFFF.
1019 for (std::size_t cp = 0; cp <= 0x10FFFFu; ++cp)
1020 {
1021 // string to store the code point as in \uxxxx format
1022 std::string json_text = "\"";
1023
1024 // decide whether to use one or two \uxxxx sequences
1025 if (cp < 0x10000u)
1026 {
1027 // The Unicode standard permanently reserves these code point
1028 // values for UTF-16 encoding of the high and low surrogates, and
1029 // they will never be assigned a character, so there should be no
1030 // reason to encode them. The official Unicode standard says that
1031 // no UTF forms, including UTF-16, can encode these code points.
1032 if (cp >= 0xD800u and cp <= 0xDFFFu)
1033 {
1034 // if we would not skip these code points, we would get a
1035 // "missing low surrogate" exception
1036 continue;
1037 }
1038
1039 // code points in the Basic Multilingual Plane can be
1040 // represented with one \uxxxx sequence
1041 json_text += codepoint_to_unicode(cp);
1042 }
1043 else
1044 {
1045 // To escape an extended character that is not in the Basic
1046 // Multilingual Plane, the character is represented as a
1047 // 12-character sequence, encoding the UTF-16 surrogate pair
1048 const auto codepoint1 = 0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu);
1049 const auto codepoint2 = 0xdc00u + ((cp - 0x10000u) & 0x3ffu);
1050 json_text += codepoint_to_unicode(codepoint1) + codepoint_to_unicode(codepoint2);
1051 }
1052
1053 json_text += "\"";
1054 CAPTURE(json_text)
1055 json _;
1056 CHECK_NOTHROW(_ = json::parse(json_text));
1057 }
1058 }
1059
SECTION("incorrect sequences")
{
    // Each case below is parsed twice: once to check the exception type and
    // once to check the exact exception message.
    SECTION("incorrect surrogate values")
    {
        // receives the (never produced) parse results
        json _;

        // \uDC00 at the start of the string, with nothing in U+D800..U+DBFF before it
        CHECK_THROWS_AS(_ = json::parse("\"\\uDC00\\uDC00\""), json::parse_error&);
        CHECK_THROWS_WITH(_ = json::parse("\"\\uDC00\\uDC00\""),
                          "[json.exception.parse_error.101] parse error at line 1, column 7: syntax error while parsing value - invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF; last read: '\"\\uDC00'");

        // \uDC00 preceded by \uD7FF, which lies just below U+D800
        CHECK_THROWS_AS(_ = json::parse("\"\\uD7FF\\uDC00\""), json::parse_error&);
        CHECK_THROWS_WITH(_ = json::parse("\"\\uD7FF\\uDC00\""),
                          "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF; last read: '\"\\uD7FF\\uDC00'");

        // \uD800 followed by a plain character
        CHECK_THROWS_AS(_ = json::parse("\"\\uD800]\""), json::parse_error&);
        CHECK_THROWS_WITH(_ = json::parse("\"\\uD800]\""),
                          "[json.exception.parse_error.101] parse error at line 1, column 8: syntax error while parsing value - invalid string: surrogate U+DC00..U+DFFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800]'");

        // \uD800 followed by a backslash that does not start a \u escape
        CHECK_THROWS_AS(_ = json::parse("\"\\uD800\\v\""), json::parse_error&);
        CHECK_THROWS_WITH(_ = json::parse("\"\\uD800\\v\""),
                          "[json.exception.parse_error.101] parse error at line 1, column 9: syntax error while parsing value - invalid string: surrogate U+DC00..U+DFFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800\\v'");

        // \uD800 followed by a \u escape with only three hex digits
        CHECK_THROWS_AS(_ = json::parse("\"\\uD800\\u123\""), json::parse_error&);
        CHECK_THROWS_WITH(_ = json::parse("\"\\uD800\\u123\""),
                          "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: '\\u' must be followed by 4 hex digits; last read: '\"\\uD800\\u123\"'");

        // \uD800 followed by \uDBFF, which lies just below U+DC00
        CHECK_THROWS_AS(_ = json::parse("\"\\uD800\\uDBFF\""), json::parse_error&);
        CHECK_THROWS_WITH(_ = json::parse("\"\\uD800\\uDBFF\""),
                          "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: surrogate U+DC00..U+DFFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800\\uDBFF'");

        // \uD800 followed by \uE000, which lies just above U+DFFF
        CHECK_THROWS_AS(_ = json::parse("\"\\uD800\\uE000\""), json::parse_error&);
        CHECK_THROWS_WITH(_ = json::parse("\"\\uD800\\uE000\""),
                          "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: surrogate U+DC00..U+DFFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800\\uE000'");
    }
}
1095
1096 #if 0
1097 SECTION("incorrect sequences")
1098 {
1099 SECTION("high surrogate without low surrogate")
1100 {
1101 // D800..DBFF are high surrogates and must be followed by low
1102 // surrogates DC00..DFFF; here, nothing follows
1103 for (std::size_t cp = 0xD800u; cp <= 0xDBFFu; ++cp)
1104 {
1105 std::string json_text = "\"" + codepoint_to_unicode(cp) + "\"";
1106 CAPTURE(json_text)
1107 CHECK_THROWS_AS(json::parse(json_text), json::parse_error&);
1108 }
1109 }
1110
1111 SECTION("high surrogate with wrong low surrogate")
1112 {
1113 // D800..DBFF are high surrogates and must be followed by low
1114 // surrogates DC00..DFFF; here a different sequence follows
1115 for (std::size_t cp1 = 0xD800u; cp1 <= 0xDBFFu; ++cp1)
1116 {
1117 for (std::size_t cp2 = 0x0000u; cp2 <= 0xFFFFu; ++cp2)
1118 {
1119 if (0xDC00u <= cp2 and cp2 <= 0xDFFFu)
1120 {
1121 continue;
1122 }
1123
1124 std::string json_text = "\"" + codepoint_to_unicode(cp1) + codepoint_to_unicode(cp2) + "\"";
1125 CAPTURE(json_text)
1126 CHECK_THROWS_AS(json::parse(json_text), json::parse_error&);
1127 }
1128 }
1129 }
1130
1131 SECTION("low surrogate without high surrogate")
1132 {
1133 // low surrogates DC00..DFFF must follow high surrogates; here,
1134 // they occur alone
1135 for (std::size_t cp = 0xDC00u; cp <= 0xDFFFu; ++cp)
1136 {
1137 std::string json_text = "\"" + codepoint_to_unicode(cp) + "\"";
1138 CAPTURE(json_text)
1139 CHECK_THROWS_AS(json::parse(json_text), json::parse_error&);
1140 }
1141 }
1142
1143 }
1144 #endif
1145 }
1146
1147 SECTION("read all unicode characters")
1148 {
1149 // read a file with all unicode characters stored as single-character
1150 // strings in a JSON array
1151 std::ifstream f("test/data/json_nlohmann_tests/all_unicode.json");
1152 json j;
1153 CHECK_NOTHROW(f >> j);
1154
1155 // the array has 1112064 + 1 elements (a terminating "null" value)
1156 // Note: 1112064 = 0x1FFFFF code points - 2048 invalid values between
1157 // 0xD800 and 0xDFFF.
1158 CHECK(j.size() == 1112065);
1159
1160 SECTION("check JSON Pointers")
1161 {
1162 for (auto s : j)
1163 {
1164 // skip non-string JSON values
1165 if (not s.is_string())
1166 {
1167 continue;
1168 }
1169
1170 std::string ptr = s;
1171
1172 // tilde must be followed by 0 or 1
1173 if (ptr == "~")
1174 {
1175 ptr += "0";
1176 }
1177
1178 // JSON Pointers must begin with "/"
1179 ptr = "/" + ptr;
1180
1181 CHECK_NOTHROW(json::json_pointer("/" + ptr));
1182
1183 // check escape/unescape roundtrip
1184 auto escaped = json::json_pointer::escape(ptr);
1185 json::json_pointer::unescape(escaped);
1186 CHECK(escaped == ptr);
1187 }
1188 }
1189 }
1190
1191 SECTION("ignore byte-order-mark")
1192 {
1193 SECTION("in a stream")
1194 {
1195 // read a file with a UTF-8 BOM
1196 std::ifstream f("test/data/json_nlohmann_tests/bom.json");
1197 json j;
1198 CHECK_NOTHROW(f >> j);
1199 }
1200
1201 SECTION("with an iterator")
1202 {
1203 std::string i = "\xef\xbb\xbf{\n \"foo\": true\n}";
1204 CHECK_NOTHROW(json::parse(i.begin(), i.end()));
1205 }
1206 }
1207
1208 SECTION("error for incomplete/wrong BOM")
1209 {
1210 json _;
1211 CHECK_THROWS_AS(_ = json::parse("\xef\xbb"), json::parse_error&);
1212 CHECK_THROWS_AS(_ = json::parse("\xef\xbb\xbb"), json::parse_error&);
1213 }
1214 }
1215
1216 namespace
1217 {
1218 void roundtrip(bool success_expected, const std::string& s);
1219
roundtrip(bool success_expected,const std::string & s)1220 void roundtrip(bool success_expected, const std::string& s)
1221 {
1222 CAPTURE(s)
1223 json _;
1224
1225 // create JSON string value
1226 json j = s;
1227 // create JSON text
1228 std::string ps = std::string("\"") + s + "\"";
1229
1230 if (success_expected)
1231 {
1232 // serialization succeeds
1233 CHECK_NOTHROW(j.dump());
1234
1235 // exclude parse test for U+0000
1236 if (s[0] != '\0')
1237 {
1238 // parsing JSON text succeeds
1239 CHECK_NOTHROW(_ = json::parse(ps));
1240 }
1241
1242 // roundtrip succeeds
1243 CHECK_NOTHROW(_ = json::parse(j.dump()));
1244
1245 // after roundtrip, the same string is stored
1246 json jr = json::parse(j.dump());
1247 CHECK(jr.get<std::string>() == s);
1248 }
1249 else
1250 {
1251 // serialization fails
1252 CHECK_THROWS_AS(j.dump(), json::type_error&);
1253
1254 // parsing JSON text fails
1255 CHECK_THROWS_AS(_ = json::parse(ps), json::parse_error&);
1256 }
1257 }
1258 }
1259
1260 TEST_CASE("Markus Kuhn's UTF-8 decoder capability and stress test")
1261 {
1262 // Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> - 2015-08-28 - CC BY 4.0
1263 // http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
1264
1265 SECTION("1 Some correct UTF-8 text")
1266 {
1267 roundtrip(true, "κόσμε");
1268 }
1269
1270 SECTION("2 Boundary condition test cases")
1271 {
1272 SECTION("2.1 First possible sequence of a certain length")
1273 {
1274 // 2.1.1 1 byte (U-00000000)
1275 roundtrip(true, std::string("\0", 1));
1276 // 2.1.2 2 bytes (U-00000080)
1277 roundtrip(true, "\xc2\x80");
1278 // 2.1.3 3 bytes (U-00000800)
1279 roundtrip(true, "\xe0\xa0\x80");
1280 // 2.1.4 4 bytes (U-00010000)
1281 roundtrip(true, "\xf0\x90\x80\x80");
1282
1283 // 2.1.5 5 bytes (U-00200000)
1284 roundtrip(false, "\xF8\x88\x80\x80\x80");
1285 // 2.1.6 6 bytes (U-04000000)
1286 roundtrip(false, "\xFC\x84\x80\x80\x80\x80");
1287 }
1288
1289 SECTION("2.2 Last possible sequence of a certain length")
1290 {
1291 // 2.2.1 1 byte (U-0000007F)
1292 roundtrip(true, "\x7f");
1293 // 2.2.2 2 bytes (U-000007FF)
1294 roundtrip(true, "\xdf\xbf");
1295 // 2.2.3 3 bytes (U-0000FFFF)
1296 roundtrip(true, "\xef\xbf\xbf");
1297
1298 // 2.2.4 4 bytes (U-001FFFFF)
1299 roundtrip(false, "\xF7\xBF\xBF\xBF");
1300 // 2.2.5 5 bytes (U-03FFFFFF)
1301 roundtrip(false, "\xFB\xBF\xBF\xBF\xBF");
1302 // 2.2.6 6 bytes (U-7FFFFFFF)
1303 roundtrip(false, "\xFD\xBF\xBF\xBF\xBF\xBF");
1304 }
1305
1306 SECTION("2.3 Other boundary conditions")
1307 {
1308 // 2.3.1 U-0000D7FF = ed 9f bf
1309 roundtrip(true, "\xed\x9f\xbf");
1310 // 2.3.2 U-0000E000 = ee 80 80
1311 roundtrip(true, "\xee\x80\x80");
1312 // 2.3.3 U-0000FFFD = ef bf bd
1313 roundtrip(true, "\xef\xbf\xbd");
1314 // 2.3.4 U-0010FFFF = f4 8f bf bf
1315 roundtrip(true, "\xf4\x8f\xbf\xbf");
1316
1317 // 2.3.5 U-00110000 = f4 90 80 80
1318 roundtrip(false, "\xf4\x90\x80\x80");
1319 }
1320 }
1321
1322 SECTION("3 Malformed sequences")
1323 {
1324 SECTION("3.1 Unexpected continuation bytes")
1325 {
1326 // Each unexpected continuation byte should be separately signalled as a
1327 // malformed sequence of its own.
1328
1329 // 3.1.1 First continuation byte 0x80
1330 roundtrip(false, "\x80");
1331 // 3.1.2 Last continuation byte 0xbf
1332 roundtrip(false, "\xbf");
1333
1334 // 3.1.3 2 continuation bytes
1335 roundtrip(false, "\x80\xbf");
1336 // 3.1.4 3 continuation bytes
1337 roundtrip(false, "\x80\xbf\x80");
1338 // 3.1.5 4 continuation bytes
1339 roundtrip(false, "\x80\xbf\x80\xbf");
1340 // 3.1.6 5 continuation bytes
1341 roundtrip(false, "\x80\xbf\x80\xbf\x80");
1342 // 3.1.7 6 continuation bytes
1343 roundtrip(false, "\x80\xbf\x80\xbf\x80\xbf");
1344 // 3.1.8 7 continuation bytes
1345 roundtrip(false, "\x80\xbf\x80\xbf\x80\xbf\x80");
1346
1347 // 3.1.9 Sequence of all 64 possible continuation bytes (0x80-0xbf)
1348 roundtrip(false, "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf");
1349 }
1350
1351 SECTION("3.2 Lonely start characters")
1352 {
1353 // 3.2.1 All 32 first bytes of 2-byte sequences (0xc0-0xdf)
1354 roundtrip(false, "\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf");
1355 // 3.2.2 All 16 first bytes of 3-byte sequences (0xe0-0xef)
1356 roundtrip(false, "\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef");
1357 // 3.2.3 All 8 first bytes of 4-byte sequences (0xf0-0xf7)
1358 roundtrip(false, "\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7");
1359 // 3.2.4 All 4 first bytes of 5-byte sequences (0xf8-0xfb)
1360 roundtrip(false, "\xf8 \xf9 \xfa \xfb");
1361 // 3.2.5 All 2 first bytes of 6-byte sequences (0xfc-0xfd)
1362 roundtrip(false, "\xfc \xfd");
1363 }
1364
1365 SECTION("3.3 Sequences with last continuation byte missing")
1366 {
1367 // All bytes of an incomplete sequence should be signalled as a single
1368 // malformed sequence, i.e., you should see only a single replacement
1369 // character in each of the next 10 tests. (Characters as in section 2)
1370
1371 // 3.3.1 2-byte sequence with last byte missing (U+0000)
1372 roundtrip(false, "\xc0");
1373 // 3.3.2 3-byte sequence with last byte missing (U+0000)
1374 roundtrip(false, "\xe0\x80");
1375 // 3.3.3 4-byte sequence with last byte missing (U+0000)
1376 roundtrip(false, "\xf0\x80\x80");
1377 // 3.3.4 5-byte sequence with last byte missing (U+0000)
1378 roundtrip(false, "\xf8\x80\x80\x80");
1379 // 3.3.5 6-byte sequence with last byte missing (U+0000)
1380 roundtrip(false, "\xfc\x80\x80\x80\x80");
1381 // 3.3.6 2-byte sequence with last byte missing (U-000007FF)
1382 roundtrip(false, "\xdf");
1383 // 3.3.7 3-byte sequence with last byte missing (U-0000FFFF)
1384 roundtrip(false, "\xef\xbf");
1385 // 3.3.8 4-byte sequence with last byte missing (U-001FFFFF)
1386 roundtrip(false, "\xf7\xbf\xbf");
1387 // 3.3.9 5-byte sequence with last byte missing (U-03FFFFFF)
1388 roundtrip(false, "\xfb\xbf\xbf\xbf");
1389 // 3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF)
1390 roundtrip(false, "\xfd\xbf\xbf\xbf\xbf");
1391 }
1392
1393 SECTION("3.4 Concatenation of incomplete sequences")
1394 {
1395 // All the 10 sequences of 3.3 concatenated, you should see 10 malformed
1396 // sequences being signalled:
1397 roundtrip(false, "\xc0\xe0\x80\xf0\x80\x80\xf8\x80\x80\x80\xfc\x80\x80\x80\x80\xdf\xef\xbf\xf7\xbf\xbf\xfb\xbf\xbf\xbf\xfd\xbf\xbf\xbf\xbf");
1398 }
1399
1400 SECTION("3.5 Impossible bytes")
1401 {
1402 // The following two bytes cannot appear in a correct UTF-8 string
1403
1404 // 3.5.1 fe
1405 roundtrip(false, "\xfe");
1406 // 3.5.2 ff
1407 roundtrip(false, "\xff");
1408 // 3.5.3 fe fe ff ff
1409 roundtrip(false, "\xfe\xfe\xff\xff");
1410 }
1411 }
1412
1413 SECTION("4 Overlong sequences")
1414 {
1415 // The following sequences are not malformed according to the letter of
1416 // the Unicode 2.0 standard. However, they are longer then necessary and
1417 // a correct UTF-8 encoder is not allowed to produce them. A "safe UTF-8
1418 // decoder" should reject them just like malformed sequences for two
1419 // reasons: (1) It helps to debug applications if overlong sequences are
1420 // not treated as valid representations of characters, because this helps
1421 // to spot problems more quickly. (2) Overlong sequences provide
1422 // alternative representations of characters, that could maliciously be
1423 // used to bypass filters that check only for ASCII characters. For
1424 // instance, a 2-byte encoded line feed (LF) would not be caught by a
1425 // line counter that counts only 0x0a bytes, but it would still be
1426 // processed as a line feed by an unsafe UTF-8 decoder later in the
1427 // pipeline. From a security point of view, ASCII compatibility of UTF-8
1428 // sequences means also, that ASCII characters are *only* allowed to be
1429 // represented by ASCII bytes in the range 0x00-0x7f. To ensure this
1430 // aspect of ASCII compatibility, use only "safe UTF-8 decoders" that
1431 // reject overlong UTF-8 sequences for which a shorter encoding exists.
1432
1433 SECTION("4.1 Examples of an overlong ASCII character")
1434 {
1435 // With a safe UTF-8 decoder, all of the following five overlong
1436 // representations of the ASCII character slash ("/") should be rejected
1437 // like a malformed UTF-8 sequence, for instance by substituting it with
1438 // a replacement character. If you see a slash below, you do not have a
1439 // safe UTF-8 decoder!
1440
1441 // 4.1.1 U+002F = c0 af
1442 roundtrip(false, "\xc0\xaf");
1443 // 4.1.2 U+002F = e0 80 af
1444 roundtrip(false, "\xe0\x80\xaf");
1445 // 4.1.3 U+002F = f0 80 80 af
1446 roundtrip(false, "\xf0\x80\x80\xaf");
1447 // 4.1.4 U+002F = f8 80 80 80 af
1448 roundtrip(false, "\xf8\x80\x80\x80\xaf");
1449 // 4.1.5 U+002F = fc 80 80 80 80 af
1450 roundtrip(false, "\xfc\x80\x80\x80\x80\xaf");
1451 }
1452
1453 SECTION("4.2 Maximum overlong sequences")
1454 {
1455 // Below you see the highest Unicode value that is still resulting in an
1456 // overlong sequence if represented with the given number of bytes. This
1457 // is a boundary test for safe UTF-8 decoders. All five characters should
1458 // be rejected like malformed UTF-8 sequences.
1459
1460 // 4.2.1 U-0000007F = c1 bf
1461 roundtrip(false, "\xc1\xbf");
1462 // 4.2.2 U-000007FF = e0 9f bf
1463 roundtrip(false, "\xe0\x9f\xbf");
1464 // 4.2.3 U-0000FFFF = f0 8f bf bf
1465 roundtrip(false, "\xf0\x8f\xbf\xbf");
1466 // 4.2.4 U-001FFFFF = f8 87 bf bf bf
1467 roundtrip(false, "\xf8\x87\xbf\xbf\xbf");
1468 // 4.2.5 U-03FFFFFF = fc 83 bf bf bf bf
1469 roundtrip(false, "\xfc\x83\xbf\xbf\xbf\xbf");
1470 }
1471
1472 SECTION("4.3 Overlong representation of the NUL character")
1473 {
1474 // The following five sequences should also be rejected like malformed
1475 // UTF-8 sequences and should not be treated like the ASCII NUL
1476 // character.
1477
1478 // 4.3.1 U+0000 = c0 80
1479 roundtrip(false, "\xc0\x80");
1480 // 4.3.2 U+0000 = e0 80 80
1481 roundtrip(false, "\xe0\x80\x80");
1482 // 4.3.3 U+0000 = f0 80 80 80
1483 roundtrip(false, "\xf0\x80\x80\x80");
1484 // 4.3.4 U+0000 = f8 80 80 80 80
1485 roundtrip(false, "\xf8\x80\x80\x80\x80");
1486 // 4.3.5 U+0000 = fc 80 80 80 80 80
1487 roundtrip(false, "\xfc\x80\x80\x80\x80\x80");
1488 }
1489 }
1490
1491 SECTION("5 Illegal code positions")
1492 {
1493 // The following UTF-8 sequences should be rejected like malformed
1494 // sequences, because they never represent valid ISO 10646 characters and
1495 // a UTF-8 decoder that accepts them might introduce security problems
1496 // comparable to overlong UTF-8 sequences.
1497
1498 SECTION("5.1 Single UTF-16 surrogates")
1499 {
1500 // 5.1.1 U+D800 = ed a0 80
1501 roundtrip(false, "\xed\xa0\x80");
1502 // 5.1.2 U+DB7F = ed ad bf
1503 roundtrip(false, "\xed\xad\xbf");
1504 // 5.1.3 U+DB80 = ed ae 80
1505 roundtrip(false, "\xed\xae\x80");
1506 // 5.1.4 U+DBFF = ed af bf
1507 roundtrip(false, "\xed\xaf\xbf");
1508 // 5.1.5 U+DC00 = ed b0 80
1509 roundtrip(false, "\xed\xb0\x80");
1510 // 5.1.6 U+DF80 = ed be 80
1511 roundtrip(false, "\xed\xbe\x80");
1512 // 5.1.7 U+DFFF = ed bf bf
1513 roundtrip(false, "\xed\xbf\xbf");
1514 }
1515
1516 SECTION("5.2 Paired UTF-16 surrogates")
1517 {
1518 // 5.2.1 U+D800 U+DC00 = ed a0 80 ed b0 80
1519 roundtrip(false, "\xed\xa0\x80\xed\xb0\x80");
1520 // 5.2.2 U+D800 U+DFFF = ed a0 80 ed bf bf
1521 roundtrip(false, "\xed\xa0\x80\xed\xbf\xbf");
1522 // 5.2.3 U+DB7F U+DC00 = ed ad bf ed b0 80
1523 roundtrip(false, "\xed\xad\xbf\xed\xb0\x80");
1524 // 5.2.4 U+DB7F U+DFFF = ed ad bf ed bf bf
1525 roundtrip(false, "\xed\xad\xbf\xed\xbf\xbf");
1526 // 5.2.5 U+DB80 U+DC00 = ed ae 80 ed b0 80
1527 roundtrip(false, "\xed\xae\x80\xed\xb0\x80");
1528 // 5.2.6 U+DB80 U+DFFF = ed ae 80 ed bf bf
1529 roundtrip(false, "\xed\xae\x80\xed\xbf\xbf");
1530 // 5.2.7 U+DBFF U+DC00 = ed af bf ed b0 80
1531 roundtrip(false, "\xed\xaf\xbf\xed\xb0\x80");
1532 // 5.2.8 U+DBFF U+DFFF = ed af bf ed bf bf
1533 roundtrip(false, "\xed\xaf\xbf\xed\xbf\xbf");
1534 }
1535
1536 SECTION("5.3 Noncharacter code positions")
1537 {
1538 // The following "noncharacters" are "reserved for internal use" by
1539 // applications, and according to older versions of the Unicode Standard
1540 // "should never be interchanged". Unicode Corrigendum #9 dropped the
1541 // latter restriction. Nevertheless, their presence in incoming UTF-8 data
1542 // can remain a potential security risk, depending on what use is made of
1543 // these codes subsequently. Examples of such internal use:
1544 //
1545 // - Some file APIs with 16-bit characters may use the integer value -1
1546 // = U+FFFF to signal an end-of-file (EOF) or error condition.
1547 //
1548 // - In some UTF-16 receivers, code point U+FFFE might trigger a
1549 // byte-swap operation (to convert between UTF-16LE and UTF-16BE).
1550 //
1551 // With such internal use of noncharacters, it may be desirable and safer
1552 // to block those code points in UTF-8 decoders, as they should never
1553 // occur legitimately in incoming UTF-8 data, and could trigger unsafe
1554 // behaviour in subsequent processing.
1555
1556 // Particularly problematic noncharacters in 16-bit applications:
1557
1558 // 5.3.1 U+FFFE = ef bf be
1559 roundtrip(true, "\xef\xbf\xbe");
1560 // 5.3.2 U+FFFF = ef bf bf
1561 roundtrip(true, "\xef\xbf\xbf");
1562
1563 // 5.3.3 U+FDD0 .. U+FDEF
1564 roundtrip(true, "\xEF\xB7\x90");
1565 roundtrip(true, "\xEF\xB7\x91");
1566 roundtrip(true, "\xEF\xB7\x92");
1567 roundtrip(true, "\xEF\xB7\x93");
1568 roundtrip(true, "\xEF\xB7\x94");
1569 roundtrip(true, "\xEF\xB7\x95");
1570 roundtrip(true, "\xEF\xB7\x96");
1571 roundtrip(true, "\xEF\xB7\x97");
1572 roundtrip(true, "\xEF\xB7\x98");
1573 roundtrip(true, "\xEF\xB7\x99");
1574 roundtrip(true, "\xEF\xB7\x9A");
1575 roundtrip(true, "\xEF\xB7\x9B");
1576 roundtrip(true, "\xEF\xB7\x9C");
1577 roundtrip(true, "\xEF\xB7\x9D");
1578 roundtrip(true, "\xEF\xB7\x9E");
1579 roundtrip(true, "\xEF\xB7\x9F");
1580 roundtrip(true, "\xEF\xB7\xA0");
1581 roundtrip(true, "\xEF\xB7\xA1");
1582 roundtrip(true, "\xEF\xB7\xA2");
1583 roundtrip(true, "\xEF\xB7\xA3");
1584 roundtrip(true, "\xEF\xB7\xA4");
1585 roundtrip(true, "\xEF\xB7\xA5");
1586 roundtrip(true, "\xEF\xB7\xA6");
1587 roundtrip(true, "\xEF\xB7\xA7");
1588 roundtrip(true, "\xEF\xB7\xA8");
1589 roundtrip(true, "\xEF\xB7\xA9");
1590 roundtrip(true, "\xEF\xB7\xAA");
1591 roundtrip(true, "\xEF\xB7\xAB");
1592 roundtrip(true, "\xEF\xB7\xAC");
1593 roundtrip(true, "\xEF\xB7\xAD");
1594 roundtrip(true, "\xEF\xB7\xAE");
1595 roundtrip(true, "\xEF\xB7\xAF");
1596
1597 // 5.3.4 U+nFFFE U+nFFFF (for n = 1..10)
1598 roundtrip(true, "\xF0\x9F\xBF\xBF");
1599 roundtrip(true, "\xF0\xAF\xBF\xBF");
1600 roundtrip(true, "\xF0\xBF\xBF\xBF");
1601 roundtrip(true, "\xF1\x8F\xBF\xBF");
1602 roundtrip(true, "\xF1\x9F\xBF\xBF");
1603 roundtrip(true, "\xF1\xAF\xBF\xBF");
1604 roundtrip(true, "\xF1\xBF\xBF\xBF");
1605 roundtrip(true, "\xF2\x8F\xBF\xBF");
1606 roundtrip(true, "\xF2\x9F\xBF\xBF");
1607 roundtrip(true, "\xF2\xAF\xBF\xBF");
1608 }
1609 }
1610 }
1611