1 /*
2     __ _____ _____ _____
3  __|  |   __|     |   | |  JSON for Modern C++ (test suite)
4 |  |  |__   |  |  | | | |  version 3.7.3
5 |_____|_____|_____|_|___|  https://github.com/nlohmann/json
6 
7 Licensed under the MIT License <http://opensource.org/licenses/MIT>.
8 SPDX-License-Identifier: MIT
9 Copyright (c) 2013-2019 Niels Lohmann <http://nlohmann.me>.
10 
11 Permission is hereby  granted, free of charge, to any  person obtaining a copy
12 of this software and associated  documentation files (the "Software"), to deal
13 in the Software  without restriction, including without  limitation the rights
14 to  use, copy,  modify, merge,  publish, distribute,  sublicense, and/or  sell
15 copies  of  the Software,  and  to  permit persons  to  whom  the Software  is
16 furnished to do so, subject to the following conditions:
17 
18 The above copyright notice and this permission notice shall be included in all
19 copies or substantial portions of the Software.
20 
21 THE SOFTWARE  IS PROVIDED "AS  IS", WITHOUT WARRANTY  OF ANY KIND,  EXPRESS OR
22 IMPLIED,  INCLUDING BUT  NOT  LIMITED TO  THE  WARRANTIES OF  MERCHANTABILITY,
23 FITNESS FOR  A PARTICULAR PURPOSE AND  NONINFRINGEMENT. IN NO EVENT  SHALL THE
24 AUTHORS  OR COPYRIGHT  HOLDERS  BE  LIABLE FOR  ANY  CLAIM,  DAMAGES OR  OTHER
25 LIABILITY, WHETHER IN AN ACTION OF  CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26 OUT OF OR IN CONNECTION WITH THE SOFTWARE  OR THE USE OR OTHER DEALINGS IN THE
27 SOFTWARE.
28 */
29 
30 #include "doctest_compatibility.h"
31 
32 // for some reason including this after the json header leads to linker errors with VS 2017...
33 #include <locale>
34 
35 #define private public
36 #include <nlohmann/json.hpp>
37 using nlohmann::json;
38 #undef private
39 
40 #include <fstream>
41 #include <sstream>
42 #include <iostream>
43 #include <iomanip>
44 
45 namespace
46 {
// number of check_utf8string invocations so far; that function increments it
// on every call and prints it in its periodic progress message
extern size_t calls;
size_t calls{0};
49 
50 void check_utf8dump(bool success_expected, int byte1, int byte2, int byte3, int byte4);
51 
check_utf8dump(bool success_expected,int byte1,int byte2=-1,int byte3=-1,int byte4=-1)52 void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 = -1, int byte4 = -1)
53 {
54     std::string json_string;
55 
56     CAPTURE(byte1)
57     CAPTURE(byte2)
58     CAPTURE(byte3)
59     CAPTURE(byte4)
60 
61     json_string += std::string(1, static_cast<char>(byte1));
62 
63     if (byte2 != -1)
64     {
65         json_string += std::string(1, static_cast<char>(byte2));
66     }
67 
68     if (byte3 != -1)
69     {
70         json_string += std::string(1, static_cast<char>(byte3));
71     }
72 
73     if (byte4 != -1)
74     {
75         json_string += std::string(1, static_cast<char>(byte4));
76     }
77 
78     CAPTURE(json_string)
79 
80     // store the string in a JSON value
81     json j = json_string;
82     json j2 = "abc" + json_string + "xyz";
83 
84     // dumping with ignore/replace must not throw in any case
85     auto s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore);
86     auto s_ignored2 = j2.dump(-1, ' ', false, json::error_handler_t::ignore);
87     auto s_ignored_ascii = j.dump(-1, ' ', true, json::error_handler_t::ignore);
88     auto s_ignored2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::ignore);
89     auto s_replaced = j.dump(-1, ' ', false, json::error_handler_t::replace);
90     auto s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace);
91     auto s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace);
92     auto s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace);
93 
94     if (success_expected)
95     {
96         // strict mode must not throw if success is expected
97         auto s_strict = j.dump();
98         // all dumps should agree on the string
99         CHECK(s_strict == s_ignored);
100         CHECK(s_strict == s_replaced);
101     }
102     else
103     {
104         // strict mode must throw if success is not expected
105         CHECK_THROWS_AS(j.dump(), json::type_error&);
106         // ignore and replace must create different dumps
107         CHECK(s_ignored != s_replaced);
108 
109         // check that replace string contains a replacement character
110         CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos);
111     }
112 
113     // check that prefix and suffix are preserved
114     CHECK(s_ignored2.substr(1, 3) == "abc");
115     CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz");
116     CHECK(s_ignored2_ascii.substr(1, 3) == "abc");
117     CHECK(s_ignored2_ascii.substr(s_ignored2_ascii.size() - 4, 3) == "xyz");
118     CHECK(s_replaced2.substr(1, 3) == "abc");
119     CHECK(s_replaced2.substr(s_replaced2.size() - 4, 3) == "xyz");
120     CHECK(s_replaced2_ascii.substr(1, 3) == "abc");
121     CHECK(s_replaced2_ascii.substr(s_replaced2_ascii.size() - 4, 3) == "xyz");
122 }
123 
124 void check_utf8string(bool success_expected, int byte1, int byte2, int byte3, int byte4);
125 
126 // create and check a JSON string with up to four UTF-8 bytes
check_utf8string(bool success_expected,int byte1,int byte2=-1,int byte3=-1,int byte4=-1)127 void check_utf8string(bool success_expected, int byte1, int byte2 = -1, int byte3 = -1, int byte4 = -1)
128 {
129     if (++calls % 100000 == 0)
130     {
131         std::cout << calls << " of 8860608 UTF-8 strings checked" << std::endl;
132     }
133 
134     std::string json_string = "\"";
135 
136     CAPTURE(byte1)
137     json_string += std::string(1, static_cast<char>(byte1));
138 
139     if (byte2 != -1)
140     {
141         CAPTURE(byte2)
142         json_string += std::string(1, static_cast<char>(byte2));
143     }
144 
145     if (byte3 != -1)
146     {
147         CAPTURE(byte3)
148         json_string += std::string(1, static_cast<char>(byte3));
149     }
150 
151     if (byte4 != -1)
152     {
153         CAPTURE(byte4)
154         json_string += std::string(1, static_cast<char>(byte4));
155     }
156 
157     json_string += "\"";
158 
159     CAPTURE(json_string)
160 
161     json _;
162     if (success_expected)
163     {
164         CHECK_NOTHROW(_ = json::parse(json_string));
165     }
166     else
167     {
168         CHECK_THROWS_AS(_ = json::parse(json_string), json::parse_error&);
169     }
170 }
171 }
172 
skip()173 TEST_CASE("Unicode" * doctest::skip())
174 {
175     SECTION("RFC 3629")
176     {
177         /*
178         RFC 3629 describes in Sect. 4 the syntax of UTF-8 byte sequences as
179         follows:
180 
181             A UTF-8 string is a sequence of octets representing a sequence of UCS
182             characters.  An octet sequence is valid UTF-8 only if it matches the
183             following syntax, which is derived from the rules for encoding UTF-8
184             and is expressed in the ABNF of [RFC2234].
185 
186             UTF8-octets = *( UTF8-char )
187             UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
188             UTF8-1      = %x00-7F
189             UTF8-2      = %xC2-DF UTF8-tail
190             UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
191                           %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
192             UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
193                           %xF4 %x80-8F 2( UTF8-tail )
194             UTF8-tail   = %x80-BF
195         */
196 
197         SECTION("ill-formed first byte")
198         {
199             for (int byte1 = 0x80; byte1 <= 0xC1; ++byte1)
200             {
201                 check_utf8string(false, byte1);
202                 check_utf8dump(false, byte1);
203             }
204 
205             for (int byte1 = 0xF5; byte1 <= 0xFF; ++byte1)
206             {
207                 check_utf8string(false, byte1);
208                 check_utf8dump(false, byte1);
209             }
210         }
211 
212         SECTION("UTF8-1 (x00-x7F)")
213         {
214             SECTION("well-formed")
215             {
216                 for (int byte1 = 0x00; byte1 <= 0x7F; ++byte1)
217                 {
218                     // unescaped control characters are parse errors in JSON
219                     if (0x00 <= byte1 and byte1 <= 0x1F)
220                     {
221                         check_utf8string(false, byte1);
222                         continue;
223                     }
224 
                    // a lone double quote (0x22) is a parse error in JSON
226                     if (byte1 == 0x22)
227                     {
228                         check_utf8string(false, byte1);
229                         continue;
230                     }
231 
232                     // a single backslash is a parse error in JSON
233                     if (byte1 == 0x5C)
234                     {
235                         check_utf8string(false, byte1);
236                         continue;
237                     }
238 
239                     // all other characters are OK
240                     check_utf8string(true, byte1);
241                     check_utf8dump(true, byte1);
242                 }
243             }
244         }
245 
246         SECTION("UTF8-2 (xC2-xDF UTF8-tail)")
247         {
248             SECTION("well-formed")
249             {
250                 for (int byte1 = 0xC2; byte1 <= 0xDF; ++byte1)
251                 {
252                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
253                     {
254                         check_utf8string(true, byte1, byte2);
255                         check_utf8dump(true, byte1, byte2);
256                     }
257                 }
258             }
259 
260             SECTION("ill-formed: missing second byte")
261             {
262                 for (int byte1 = 0xC2; byte1 <= 0xDF; ++byte1)
263                 {
264                     check_utf8string(false, byte1);
265                     check_utf8dump(false, byte1);
266                 }
267             }
268 
269             SECTION("ill-formed: wrong second byte")
270             {
271                 for (int byte1 = 0xC2; byte1 <= 0xDF; ++byte1)
272                 {
273                     for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
274                     {
275                         // skip correct second byte
276                         if (0x80 <= byte2 and byte2 <= 0xBF)
277                         {
278                             continue;
279                         }
280 
281                         check_utf8string(false, byte1, byte2);
282                         check_utf8dump(false, byte1, byte2);
283                     }
284                 }
285             }
286         }
287 
288         SECTION("UTF8-3 (xE0 xA0-BF UTF8-tail)")
289         {
290             SECTION("well-formed")
291             {
292                 for (int byte1 = 0xE0; byte1 <= 0xE0; ++byte1)
293                 {
294                     for (int byte2 = 0xA0; byte2 <= 0xBF; ++byte2)
295                     {
296                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
297                         {
298                             check_utf8string(true, byte1, byte2, byte3);
299                             check_utf8dump(true, byte1, byte2, byte3);
300                         }
301                     }
302                 }
303             }
304 
305             SECTION("ill-formed: missing second byte")
306             {
307                 for (int byte1 = 0xE0; byte1 <= 0xE0; ++byte1)
308                 {
309                     check_utf8string(false, byte1);
310                     check_utf8dump(false, byte1);
311                 }
312             }
313 
314             SECTION("ill-formed: missing third byte")
315             {
316                 for (int byte1 = 0xE0; byte1 <= 0xE0; ++byte1)
317                 {
318                     for (int byte2 = 0xA0; byte2 <= 0xBF; ++byte2)
319                     {
320                         check_utf8string(false, byte1, byte2);
321                         check_utf8dump(false, byte1, byte2);
322                     }
323                 }
324             }
325 
326             SECTION("ill-formed: wrong second byte")
327             {
328                 for (int byte1 = 0xE0; byte1 <= 0xE0; ++byte1)
329                 {
330                     for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
331                     {
332                         // skip correct second byte
333                         if (0xA0 <= byte2 and byte2 <= 0xBF)
334                         {
335                             continue;
336                         }
337 
338                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
339                         {
340                             check_utf8string(false, byte1, byte2, byte3);
341                             check_utf8dump(false, byte1, byte2, byte3);
342                         }
343                     }
344                 }
345             }
346 
347             SECTION("ill-formed: wrong third byte")
348             {
349                 for (int byte1 = 0xE0; byte1 <= 0xE0; ++byte1)
350                 {
351                     for (int byte2 = 0xA0; byte2 <= 0xBF; ++byte2)
352                     {
353                         for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
354                         {
355                             // skip correct third byte
356                             if (0x80 <= byte3 and byte3 <= 0xBF)
357                             {
358                                 continue;
359                             }
360 
361                             check_utf8string(false, byte1, byte2, byte3);
362                             check_utf8dump(false, byte1, byte2, byte3);
363                         }
364                     }
365                 }
366             }
367         }
368 
369         SECTION("UTF8-3 (xE1-xEC UTF8-tail UTF8-tail)")
370         {
371             SECTION("well-formed")
372             {
373                 for (int byte1 = 0xE1; byte1 <= 0xEC; ++byte1)
374                 {
375                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
376                     {
377                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
378                         {
379                             check_utf8string(true, byte1, byte2, byte3);
380                             check_utf8dump(true, byte1, byte2, byte3);
381                         }
382                     }
383                 }
384             }
385 
386             SECTION("ill-formed: missing second byte")
387             {
388                 for (int byte1 = 0xE1; byte1 <= 0xEC; ++byte1)
389                 {
390                     check_utf8string(false, byte1);
391                     check_utf8dump(false, byte1);
392                 }
393             }
394 
395             SECTION("ill-formed: missing third byte")
396             {
397                 for (int byte1 = 0xE1; byte1 <= 0xEC; ++byte1)
398                 {
399                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
400                     {
401                         check_utf8string(false, byte1, byte2);
402                         check_utf8dump(false, byte1, byte2);
403                     }
404                 }
405             }
406 
407             SECTION("ill-formed: wrong second byte")
408             {
409                 for (int byte1 = 0xE1; byte1 <= 0xEC; ++byte1)
410                 {
411                     for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
412                     {
413                         // skip correct second byte
414                         if (0x80 <= byte2 and byte2 <= 0xBF)
415                         {
416                             continue;
417                         }
418 
419                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
420                         {
421                             check_utf8string(false, byte1, byte2, byte3);
422                             check_utf8dump(false, byte1, byte2, byte3);
423                         }
424                     }
425                 }
426             }
427 
428             SECTION("ill-formed: wrong third byte")
429             {
430                 for (int byte1 = 0xE1; byte1 <= 0xEC; ++byte1)
431                 {
432                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
433                     {
434                         for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
435                         {
436                             // skip correct third byte
437                             if (0x80 <= byte3 and byte3 <= 0xBF)
438                             {
439                                 continue;
440                             }
441 
442                             check_utf8string(false, byte1, byte2, byte3);
443                             check_utf8dump(false, byte1, byte2, byte3);
444                         }
445                     }
446                 }
447             }
448         }
449 
450         SECTION("UTF8-3 (xED x80-9F UTF8-tail)")
451         {
452             SECTION("well-formed")
453             {
454                 for (int byte1 = 0xED; byte1 <= 0xED; ++byte1)
455                 {
456                     for (int byte2 = 0x80; byte2 <= 0x9F; ++byte2)
457                     {
458                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
459                         {
460                             check_utf8string(true, byte1, byte2, byte3);
461                             check_utf8dump(true, byte1, byte2, byte3);
462                         }
463                     }
464                 }
465             }
466 
467             SECTION("ill-formed: missing second byte")
468             {
469                 for (int byte1 = 0xED; byte1 <= 0xED; ++byte1)
470                 {
471                     check_utf8string(false, byte1);
472                     check_utf8dump(false, byte1);
473                 }
474             }
475 
476             SECTION("ill-formed: missing third byte")
477             {
478                 for (int byte1 = 0xED; byte1 <= 0xED; ++byte1)
479                 {
480                     for (int byte2 = 0x80; byte2 <= 0x9F; ++byte2)
481                     {
482                         check_utf8string(false, byte1, byte2);
483                         check_utf8dump(false, byte1, byte2);
484                     }
485                 }
486             }
487 
488             SECTION("ill-formed: wrong second byte")
489             {
490                 for (int byte1 = 0xED; byte1 <= 0xED; ++byte1)
491                 {
492                     for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
493                     {
494                         // skip correct second byte
495                         if (0x80 <= byte2 and byte2 <= 0x9F)
496                         {
497                             continue;
498                         }
499 
500                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
501                         {
502                             check_utf8string(false, byte1, byte2, byte3);
503                             check_utf8dump(false, byte1, byte2, byte3);
504                         }
505                     }
506                 }
507             }
508 
509             SECTION("ill-formed: wrong third byte")
510             {
511                 for (int byte1 = 0xED; byte1 <= 0xED; ++byte1)
512                 {
513                     for (int byte2 = 0x80; byte2 <= 0x9F; ++byte2)
514                     {
515                         for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
516                         {
517                             // skip correct third byte
518                             if (0x80 <= byte3 and byte3 <= 0xBF)
519                             {
520                                 continue;
521                             }
522 
523                             check_utf8string(false, byte1, byte2, byte3);
524                             check_utf8dump(false, byte1, byte2, byte3);
525                         }
526                     }
527                 }
528             }
529         }
530 
531         SECTION("UTF8-3 (xEE-xEF UTF8-tail UTF8-tail)")
532         {
533             SECTION("well-formed")
534             {
535                 for (int byte1 = 0xEE; byte1 <= 0xEF; ++byte1)
536                 {
537                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
538                     {
539                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
540                         {
541                             check_utf8string(true, byte1, byte2, byte3);
542                             check_utf8dump(true, byte1, byte2, byte3);
543                         }
544                     }
545                 }
546             }
547 
548             SECTION("ill-formed: missing second byte")
549             {
550                 for (int byte1 = 0xEE; byte1 <= 0xEF; ++byte1)
551                 {
552                     check_utf8string(false, byte1);
553                     check_utf8dump(false, byte1);
554                 }
555             }
556 
557             SECTION("ill-formed: missing third byte")
558             {
559                 for (int byte1 = 0xEE; byte1 <= 0xEF; ++byte1)
560                 {
561                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
562                     {
563                         check_utf8string(false, byte1, byte2);
564                         check_utf8dump(false, byte1, byte2);
565                     }
566                 }
567             }
568 
569             SECTION("ill-formed: wrong second byte")
570             {
571                 for (int byte1 = 0xEE; byte1 <= 0xEF; ++byte1)
572                 {
573                     for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
574                     {
575                         // skip correct second byte
576                         if (0x80 <= byte2 and byte2 <= 0xBF)
577                         {
578                             continue;
579                         }
580 
581                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
582                         {
583                             check_utf8string(false, byte1, byte2, byte3);
584                             check_utf8dump(false, byte1, byte2, byte3);
585                         }
586                     }
587                 }
588             }
589 
590             SECTION("ill-formed: wrong third byte")
591             {
592                 for (int byte1 = 0xEE; byte1 <= 0xEF; ++byte1)
593                 {
594                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
595                     {
596                         for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
597                         {
598                             // skip correct third byte
599                             if (0x80 <= byte3 and byte3 <= 0xBF)
600                             {
601                                 continue;
602                             }
603 
604                             check_utf8string(false, byte1, byte2, byte3);
605                             check_utf8dump(false, byte1, byte2, byte3);
606                         }
607                     }
608                 }
609             }
610         }
611 
612         SECTION("UTF8-4 (xF0 x90-BF UTF8-tail UTF8-tail)")
613         {
614             SECTION("well-formed")
615             {
616                 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
617                 {
618                     for (int byte2 = 0x90; byte2 <= 0xBF; ++byte2)
619                     {
620                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
621                         {
622                             for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
623                             {
624                                 check_utf8string(true, byte1, byte2, byte3, byte4);
625                                 check_utf8dump(true, byte1, byte2, byte3, byte4);
626                             }
627                         }
628                     }
629                 }
630             }
631 
632             SECTION("ill-formed: missing second byte")
633             {
634                 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
635                 {
636                     check_utf8string(false, byte1);
637                     check_utf8dump(false, byte1);
638                 }
639             }
640 
641             SECTION("ill-formed: missing third byte")
642             {
643                 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
644                 {
645                     for (int byte2 = 0x90; byte2 <= 0xBF; ++byte2)
646                     {
647                         check_utf8string(false, byte1, byte2);
648                         check_utf8dump(false, byte1, byte2);
649                     }
650                 }
651             }
652 
653             SECTION("ill-formed: missing fourth byte")
654             {
655                 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
656                 {
657                     for (int byte2 = 0x90; byte2 <= 0xBF; ++byte2)
658                     {
659                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
660                         {
661                             check_utf8string(false, byte1, byte2, byte3);
662                             check_utf8dump(false, byte1, byte2, byte3);
663                         }
664                     }
665                 }
666             }
667 
668             SECTION("ill-formed: wrong second byte")
669             {
670                 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
671                 {
672                     for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
673                     {
674                         // skip correct second byte
675                         if (0x90 <= byte2 and byte2 <= 0xBF)
676                         {
677                             continue;
678                         }
679 
680                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
681                         {
682                             for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
683                             {
684                                 check_utf8string(false, byte1, byte2, byte3, byte4);
685                                 check_utf8dump(false, byte1, byte2, byte3, byte4);
686                             }
687                         }
688                     }
689                 }
690             }
691 
692             SECTION("ill-formed: wrong third byte")
693             {
694                 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
695                 {
696                     for (int byte2 = 0x90; byte2 <= 0xBF; ++byte2)
697                     {
698                         for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
699                         {
700                             // skip correct third byte
701                             if (0x80 <= byte3 and byte3 <= 0xBF)
702                             {
703                                 continue;
704                             }
705 
706                             for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
707                             {
708                                 check_utf8string(false, byte1, byte2, byte3, byte4);
709                                 check_utf8dump(false, byte1, byte2, byte3, byte4);
710                             }
711                         }
712                     }
713                 }
714             }
715 
716             SECTION("ill-formed: wrong fourth byte")
717             {
718                 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
719                 {
720                     for (int byte2 = 0x90; byte2 <= 0xBF; ++byte2)
721                     {
722                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
723                         {
724                             for (int byte4 = 0x00; byte4 <= 0xFF; ++byte4)
725                             {
726                                 // skip fourth second byte
727                                 if (0x80 <= byte3 and byte3 <= 0xBF)
728                                 {
729                                     continue;
730                                 }
731 
732                                 check_utf8string(false, byte1, byte2, byte3, byte4);
733                                 check_utf8dump(false, byte1, byte2, byte3, byte4);
734                             }
735                         }
736                     }
737                 }
738             }
739         }
740 
741         SECTION("UTF8-4 (xF1-F3 UTF8-tail UTF8-tail UTF8-tail)")
742         {
743             SECTION("well-formed")
744             {
745                 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
746                 {
747                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
748                     {
749                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
750                         {
751                             for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
752                             {
753                                 check_utf8string(true, byte1, byte2, byte3, byte4);
754                                 check_utf8dump(true, byte1, byte2, byte3, byte4);
755                             }
756                         }
757                     }
758                 }
759             }
760 
761             SECTION("ill-formed: missing second byte")
762             {
763                 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
764                 {
765                     check_utf8string(false, byte1);
766                     check_utf8dump(false, byte1);
767                 }
768             }
769 
770             SECTION("ill-formed: missing third byte")
771             {
772                 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
773                 {
774                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
775                     {
776                         check_utf8string(false, byte1, byte2);
777                         check_utf8dump(false, byte1, byte2);
778                     }
779                 }
780             }
781 
782             SECTION("ill-formed: missing fourth byte")
783             {
784                 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
785                 {
786                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
787                     {
788                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
789                         {
790                             check_utf8string(false, byte1, byte2, byte3);
791                             check_utf8dump(false, byte1, byte2, byte3);
792                         }
793                     }
794                 }
795             }
796 
797             SECTION("ill-formed: wrong second byte")
798             {
799                 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
800                 {
801                     for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
802                     {
803                         // skip correct second byte
804                         if (0x80 <= byte2 and byte2 <= 0xBF)
805                         {
806                             continue;
807                         }
808 
809                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
810                         {
811                             for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
812                             {
813                                 check_utf8string(false, byte1, byte2, byte3, byte4);
814                                 check_utf8dump(false, byte1, byte2, byte3, byte4);
815                             }
816                         }
817                     }
818                 }
819             }
820 
821             SECTION("ill-formed: wrong third byte")
822             {
823                 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
824                 {
825                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
826                     {
827                         for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
828                         {
829                             // skip correct third byte
830                             if (0x80 <= byte3 and byte3 <= 0xBF)
831                             {
832                                 continue;
833                             }
834 
835                             for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
836                             {
837                                 check_utf8string(false, byte1, byte2, byte3, byte4);
838                                 check_utf8dump(false, byte1, byte2, byte3, byte4);
839                             }
840                         }
841                     }
842                 }
843             }
844 
845             SECTION("ill-formed: wrong fourth byte")
846             {
847                 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
848                 {
849                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
850                     {
851                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
852                         {
853                             for (int byte4 = 0x00; byte4 <= 0xFF; ++byte4)
854                             {
855                                 // skip correct fourth byte
856                                 if (0x80 <= byte3 and byte3 <= 0xBF)
857                                 {
858                                     continue;
859                                 }
860 
861                                 check_utf8string(false, byte1, byte2, byte3, byte4);
862                                 check_utf8dump(false, byte1, byte2, byte3, byte4);
863                             }
864                         }
865                     }
866                 }
867             }
868         }
869 
870         SECTION("UTF8-4 (xF4 x80-8F UTF8-tail UTF8-tail)")
871         {
872             SECTION("well-formed")
873             {
874                 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
875                 {
876                     for (int byte2 = 0x80; byte2 <= 0x8F; ++byte2)
877                     {
878                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
879                         {
880                             for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
881                             {
882                                 check_utf8string(true, byte1, byte2, byte3, byte4);
883                                 check_utf8dump(true, byte1, byte2, byte3, byte4);
884                             }
885                         }
886                     }
887                 }
888             }
889 
890             SECTION("ill-formed: missing second byte")
891             {
892                 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
893                 {
894                     check_utf8string(false, byte1);
895                     check_utf8dump(false, byte1);
896                 }
897             }
898 
899             SECTION("ill-formed: missing third byte")
900             {
901                 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
902                 {
903                     for (int byte2 = 0x80; byte2 <= 0x8F; ++byte2)
904                     {
905                         check_utf8string(false, byte1, byte2);
906                         check_utf8dump(false, byte1, byte2);
907                     }
908                 }
909             }
910 
911             SECTION("ill-formed: missing fourth byte")
912             {
913                 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
914                 {
915                     for (int byte2 = 0x80; byte2 <= 0x8F; ++byte2)
916                     {
917                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
918                         {
919                             check_utf8string(false, byte1, byte2, byte3);
920                             check_utf8dump(false, byte1, byte2, byte3);
921                         }
922                     }
923                 }
924             }
925 
926             SECTION("ill-formed: wrong second byte")
927             {
928                 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
929                 {
930                     for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
931                     {
932                         // skip correct second byte
933                         if (0x80 <= byte2 and byte2 <= 0x8F)
934                         {
935                             continue;
936                         }
937 
938                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
939                         {
940                             for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
941                             {
942                                 check_utf8string(false, byte1, byte2, byte3, byte4);
943                                 check_utf8dump(false, byte1, byte2, byte3, byte4);
944                             }
945                         }
946                     }
947                 }
948             }
949 
950             SECTION("ill-formed: wrong third byte")
951             {
952                 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
953                 {
954                     for (int byte2 = 0x80; byte2 <= 0x8F; ++byte2)
955                     {
956                         for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
957                         {
958                             // skip correct third byte
959                             if (0x80 <= byte3 and byte3 <= 0xBF)
960                             {
961                                 continue;
962                             }
963 
964                             for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
965                             {
966                                 check_utf8string(false, byte1, byte2, byte3, byte4);
967                                 check_utf8dump(false, byte1, byte2, byte3, byte4);
968                             }
969                         }
970                     }
971                 }
972             }
973 
974             SECTION("ill-formed: wrong fourth byte")
975             {
976                 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
977                 {
978                     for (int byte2 = 0x80; byte2 <= 0x8F; ++byte2)
979                     {
980                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
981                         {
982                             for (int byte4 = 0x00; byte4 <= 0xFF; ++byte4)
983                             {
984                                 // skip correct fourth byte
985                                 if (0x80 <= byte3 and byte3 <= 0xBF)
986                                 {
987                                     continue;
988                                 }
989 
990                                 check_utf8string(false, byte1, byte2, byte3, byte4);
991                                 check_utf8dump(false, byte1, byte2, byte3, byte4);
992                             }
993                         }
994                     }
995                 }
996             }
997         }
998     }
999 
1000     SECTION("\\uxxxx sequences")
1001     {
1002         // create an escaped string from a code point
1003         const auto codepoint_to_unicode = [](std::size_t cp)
1004         {
1005             // code points are represented as a six-character sequence: a
1006             // reverse solidus, followed by the lowercase letter u, followed
1007             // by four hexadecimal digits that encode the character's code
1008             // point
1009             std::stringstream ss;
1010             ss << "\\u" << std::setw(4) << std::setfill('0') << std::hex << cp;
1011             return ss.str();
1012         };
1013 
1014         SECTION("correct sequences")
1015         {
1016             // generate all UTF-8 code points; in total, 1112064 code points are
1017             // generated: 0x1FFFFF code points - 2048 invalid values between
1018             // 0xD800 and 0xDFFF.
1019             for (std::size_t cp = 0; cp <= 0x10FFFFu; ++cp)
1020             {
1021                 // string to store the code point as in \uxxxx format
1022                 std::string json_text = "\"";
1023 
1024                 // decide whether to use one or two \uxxxx sequences
1025                 if (cp < 0x10000u)
1026                 {
1027                     // The Unicode standard permanently reserves these code point
1028                     // values for UTF-16 encoding of the high and low surrogates, and
1029                     // they will never be assigned a character, so there should be no
1030                     // reason to encode them. The official Unicode standard says that
1031                     // no UTF forms, including UTF-16, can encode these code points.
1032                     if (cp >= 0xD800u and cp <= 0xDFFFu)
1033                     {
1034                         // if we would not skip these code points, we would get a
1035                         // "missing low surrogate" exception
1036                         continue;
1037                     }
1038 
1039                     // code points in the Basic Multilingual Plane can be
1040                     // represented with one \uxxxx sequence
1041                     json_text += codepoint_to_unicode(cp);
1042                 }
1043                 else
1044                 {
1045                     // To escape an extended character that is not in the Basic
1046                     // Multilingual Plane, the character is represented as a
1047                     // 12-character sequence, encoding the UTF-16 surrogate pair
1048                     const auto codepoint1 = 0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu);
1049                     const auto codepoint2 = 0xdc00u + ((cp - 0x10000u) & 0x3ffu);
1050                     json_text += codepoint_to_unicode(codepoint1) + codepoint_to_unicode(codepoint2);
1051                 }
1052 
1053                 json_text += "\"";
1054                 CAPTURE(json_text)
1055                 json _;
1056                 CHECK_NOTHROW(_ = json::parse(json_text));
1057             }
1058         }
1059 
1060         SECTION("incorrect sequences")
1061         {
1062             SECTION("incorrect surrogate values")
1063             {
1064                 json _;
1065 
1066                 CHECK_THROWS_AS(_ = json::parse("\"\\uDC00\\uDC00\""), json::parse_error&);
1067                 CHECK_THROWS_WITH(_ = json::parse("\"\\uDC00\\uDC00\""),
1068                                   "[json.exception.parse_error.101] parse error at line 1, column 7: syntax error while parsing value - invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF; last read: '\"\\uDC00'");
1069 
1070                 CHECK_THROWS_AS(_ = json::parse("\"\\uD7FF\\uDC00\""), json::parse_error&);
1071                 CHECK_THROWS_WITH(_ = json::parse("\"\\uD7FF\\uDC00\""),
1072                                   "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF; last read: '\"\\uD7FF\\uDC00'");
1073 
1074                 CHECK_THROWS_AS(_ = json::parse("\"\\uD800]\""), json::parse_error&);
1075                 CHECK_THROWS_WITH(_ = json::parse("\"\\uD800]\""),
1076                                   "[json.exception.parse_error.101] parse error at line 1, column 8: syntax error while parsing value - invalid string: surrogate U+DC00..U+DFFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800]'");
1077 
1078                 CHECK_THROWS_AS(_ = json::parse("\"\\uD800\\v\""), json::parse_error&);
1079                 CHECK_THROWS_WITH(_ = json::parse("\"\\uD800\\v\""),
1080                                   "[json.exception.parse_error.101] parse error at line 1, column 9: syntax error while parsing value - invalid string: surrogate U+DC00..U+DFFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800\\v'");
1081 
1082                 CHECK_THROWS_AS(_ = json::parse("\"\\uD800\\u123\""), json::parse_error&);
1083                 CHECK_THROWS_WITH(_ = json::parse("\"\\uD800\\u123\""),
1084                                   "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: '\\u' must be followed by 4 hex digits; last read: '\"\\uD800\\u123\"'");
1085 
1086                 CHECK_THROWS_AS(_ = json::parse("\"\\uD800\\uDBFF\""), json::parse_error&);
1087                 CHECK_THROWS_WITH(_ = json::parse("\"\\uD800\\uDBFF\""),
1088                                   "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: surrogate U+DC00..U+DFFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800\\uDBFF'");
1089 
1090                 CHECK_THROWS_AS(_ = json::parse("\"\\uD800\\uE000\""), json::parse_error&);
1091                 CHECK_THROWS_WITH(_ = json::parse("\"\\uD800\\uE000\""),
1092                                   "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: surrogate U+DC00..U+DFFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800\\uE000'");
1093             }
1094         }
1095 
1096 #if 0
1097         SECTION("incorrect sequences")
1098         {
1099             SECTION("high surrogate without low surrogate")
1100             {
1101                 // D800..DBFF are high surrogates and must be followed by low
1102                 // surrogates DC00..DFFF; here, nothing follows
1103                 for (std::size_t cp = 0xD800u; cp <= 0xDBFFu; ++cp)
1104                 {
1105                     std::string json_text = "\"" + codepoint_to_unicode(cp) + "\"";
1106                     CAPTURE(json_text)
1107                     CHECK_THROWS_AS(json::parse(json_text), json::parse_error&);
1108                 }
1109             }
1110 
1111             SECTION("high surrogate with wrong low surrogate")
1112             {
1113                 // D800..DBFF are high surrogates and must be followed by low
1114                 // surrogates DC00..DFFF; here a different sequence follows
1115                 for (std::size_t cp1 = 0xD800u; cp1 <= 0xDBFFu; ++cp1)
1116                 {
1117                     for (std::size_t cp2 = 0x0000u; cp2 <= 0xFFFFu; ++cp2)
1118                     {
1119                         if (0xDC00u <= cp2 and cp2 <= 0xDFFFu)
1120                         {
1121                             continue;
1122                         }
1123 
1124                         std::string json_text = "\"" + codepoint_to_unicode(cp1) + codepoint_to_unicode(cp2) + "\"";
1125                         CAPTURE(json_text)
1126                         CHECK_THROWS_AS(json::parse(json_text), json::parse_error&);
1127                     }
1128                 }
1129             }
1130 
1131             SECTION("low surrogate without high surrogate")
1132             {
1133                 // low surrogates DC00..DFFF must follow high surrogates; here,
1134                 // they occur alone
1135                 for (std::size_t cp = 0xDC00u; cp <= 0xDFFFu; ++cp)
1136                 {
1137                     std::string json_text = "\"" + codepoint_to_unicode(cp) + "\"";
1138                     CAPTURE(json_text)
1139                     CHECK_THROWS_AS(json::parse(json_text), json::parse_error&);
1140                 }
1141             }
1142 
1143         }
1144 #endif
1145     }
1146 
1147     SECTION("read all unicode characters")
1148     {
1149         // read a file with all unicode characters stored as single-character
1150         // strings in a JSON array
1151         std::ifstream f("test/data/json_nlohmann_tests/all_unicode.json");
1152         json j;
1153         CHECK_NOTHROW(f >> j);
1154 
1155         // the array has 1112064 + 1 elements (a terminating "null" value)
1156         // Note: 1112064 = 0x1FFFFF code points - 2048 invalid values between
1157         // 0xD800 and 0xDFFF.
1158         CHECK(j.size() == 1112065);
1159 
1160         SECTION("check JSON Pointers")
1161         {
1162             for (auto s : j)
1163             {
1164                 // skip non-string JSON values
1165                 if (not s.is_string())
1166                 {
1167                     continue;
1168                 }
1169 
1170                 std::string ptr = s;
1171 
1172                 // tilde must be followed by 0 or 1
1173                 if (ptr == "~")
1174                 {
1175                     ptr += "0";
1176                 }
1177 
1178                 // JSON Pointers must begin with "/"
1179                 ptr = "/" + ptr;
1180 
1181                 CHECK_NOTHROW(json::json_pointer("/" + ptr));
1182 
1183                 // check escape/unescape roundtrip
1184                 auto escaped = json::json_pointer::escape(ptr);
1185                 json::json_pointer::unescape(escaped);
1186                 CHECK(escaped == ptr);
1187             }
1188         }
1189     }
1190 
1191     SECTION("ignore byte-order-mark")
1192     {
1193         SECTION("in a stream")
1194         {
1195             // read a file with a UTF-8 BOM
1196             std::ifstream f("test/data/json_nlohmann_tests/bom.json");
1197             json j;
1198             CHECK_NOTHROW(f >> j);
1199         }
1200 
1201         SECTION("with an iterator")
1202         {
1203             std::string i = "\xef\xbb\xbf{\n   \"foo\": true\n}";
1204             CHECK_NOTHROW(json::parse(i.begin(), i.end()));
1205         }
1206     }
1207 
1208     SECTION("error for incomplete/wrong BOM")
1209     {
1210         json _;
1211         CHECK_THROWS_AS(_ = json::parse("\xef\xbb"), json::parse_error&);
1212         CHECK_THROWS_AS(_ = json::parse("\xef\xbb\xbb"), json::parse_error&);
1213     }
1214 }
1215 
1216 namespace
1217 {
1218 void roundtrip(bool success_expected, const std::string& s);
1219 
roundtrip(bool success_expected,const std::string & s)1220 void roundtrip(bool success_expected, const std::string& s)
1221 {
1222     CAPTURE(s)
1223     json _;
1224 
1225     // create JSON string value
1226     json j = s;
1227     // create JSON text
1228     std::string ps = std::string("\"") + s + "\"";
1229 
1230     if (success_expected)
1231     {
1232         // serialization succeeds
1233         CHECK_NOTHROW(j.dump());
1234 
1235         // exclude parse test for U+0000
1236         if (s[0] != '\0')
1237         {
1238             // parsing JSON text succeeds
1239             CHECK_NOTHROW(_ = json::parse(ps));
1240         }
1241 
1242         // roundtrip succeeds
1243         CHECK_NOTHROW(_ = json::parse(j.dump()));
1244 
1245         // after roundtrip, the same string is stored
1246         json jr = json::parse(j.dump());
1247         CHECK(jr.get<std::string>() == s);
1248     }
1249     else
1250     {
1251         // serialization fails
1252         CHECK_THROWS_AS(j.dump(), json::type_error&);
1253 
1254         // parsing JSON text fails
1255         CHECK_THROWS_AS(_ = json::parse(ps), json::parse_error&);
1256     }
1257 }
1258 }
1259 
1260 TEST_CASE("Markus Kuhn's UTF-8 decoder capability and stress test")
1261 {
1262     // Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> - 2015-08-28 - CC BY 4.0
1263     // http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
1264 
1265     SECTION("1  Some correct UTF-8 text")
1266     {
1267         roundtrip(true, "κόσμε");
1268     }
1269 
1270     SECTION("2  Boundary condition test cases")
1271     {
1272         SECTION("2.1  First possible sequence of a certain length")
1273         {
1274             // 2.1.1  1 byte  (U-00000000)
1275             roundtrip(true, std::string("\0", 1));
1276             // 2.1.2  2 bytes (U-00000080)
1277             roundtrip(true, "\xc2\x80");
1278             // 2.1.3  3 bytes (U-00000800)
1279             roundtrip(true, "\xe0\xa0\x80");
1280             // 2.1.4  4 bytes (U-00010000)
1281             roundtrip(true, "\xf0\x90\x80\x80");
1282 
1283             // 2.1.5  5 bytes (U-00200000)
1284             roundtrip(false, "\xF8\x88\x80\x80\x80");
1285             // 2.1.6  6 bytes (U-04000000)
1286             roundtrip(false, "\xFC\x84\x80\x80\x80\x80");
1287         }
1288 
1289         SECTION("2.2  Last possible sequence of a certain length")
1290         {
1291             // 2.2.1  1 byte  (U-0000007F)
1292             roundtrip(true, "\x7f");
1293             // 2.2.2  2 bytes (U-000007FF)
1294             roundtrip(true, "\xdf\xbf");
1295             // 2.2.3  3 bytes (U-0000FFFF)
1296             roundtrip(true, "\xef\xbf\xbf");
1297 
1298             // 2.2.4  4 bytes (U-001FFFFF)
1299             roundtrip(false, "\xF7\xBF\xBF\xBF");
1300             // 2.2.5  5 bytes (U-03FFFFFF)
1301             roundtrip(false, "\xFB\xBF\xBF\xBF\xBF");
1302             // 2.2.6  6 bytes (U-7FFFFFFF)
1303             roundtrip(false, "\xFD\xBF\xBF\xBF\xBF\xBF");
1304         }
1305 
1306         SECTION("2.3  Other boundary conditions")
1307         {
1308             // 2.3.1  U-0000D7FF = ed 9f bf
1309             roundtrip(true, "\xed\x9f\xbf");
1310             // 2.3.2  U-0000E000 = ee 80 80
1311             roundtrip(true, "\xee\x80\x80");
1312             // 2.3.3  U-0000FFFD = ef bf bd
1313             roundtrip(true, "\xef\xbf\xbd");
1314             // 2.3.4  U-0010FFFF = f4 8f bf bf
1315             roundtrip(true, "\xf4\x8f\xbf\xbf");
1316 
1317             // 2.3.5  U-00110000 = f4 90 80 80
1318             roundtrip(false, "\xf4\x90\x80\x80");
1319         }
1320     }
1321 
1322     SECTION("3  Malformed sequences")
1323     {
1324         SECTION("3.1  Unexpected continuation bytes")
1325         {
1326             // Each unexpected continuation byte should be separately signalled as a
1327             // malformed sequence of its own.
1328 
1329             // 3.1.1  First continuation byte 0x80
1330             roundtrip(false, "\x80");
1331             // 3.1.2  Last  continuation byte 0xbf
1332             roundtrip(false, "\xbf");
1333 
1334             // 3.1.3  2 continuation bytes
1335             roundtrip(false, "\x80\xbf");
1336             // 3.1.4  3 continuation bytes
1337             roundtrip(false, "\x80\xbf\x80");
1338             // 3.1.5  4 continuation bytes
1339             roundtrip(false, "\x80\xbf\x80\xbf");
1340             // 3.1.6  5 continuation bytes
1341             roundtrip(false, "\x80\xbf\x80\xbf\x80");
1342             // 3.1.7  6 continuation bytes
1343             roundtrip(false, "\x80\xbf\x80\xbf\x80\xbf");
1344             // 3.1.8  7 continuation bytes
1345             roundtrip(false, "\x80\xbf\x80\xbf\x80\xbf\x80");
1346 
1347             // 3.1.9  Sequence of all 64 possible continuation bytes (0x80-0xbf)
1348             roundtrip(false, "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf");
1349         }
1350 
1351         SECTION("3.2  Lonely start characters")
1352         {
1353             // 3.2.1  All 32 first bytes of 2-byte sequences (0xc0-0xdf)
1354             roundtrip(false, "\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf");
1355             // 3.2.2  All 16 first bytes of 3-byte sequences (0xe0-0xef)
1356             roundtrip(false, "\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef");
1357             // 3.2.3  All 8 first bytes of 4-byte sequences (0xf0-0xf7)
1358             roundtrip(false, "\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7");
1359             // 3.2.4  All 4 first bytes of 5-byte sequences (0xf8-0xfb)
1360             roundtrip(false, "\xf8 \xf9 \xfa \xfb");
1361             // 3.2.5  All 2 first bytes of 6-byte sequences (0xfc-0xfd)
1362             roundtrip(false, "\xfc \xfd");
1363         }
1364 
1365         SECTION("3.3  Sequences with last continuation byte missing")
1366         {
1367             // All bytes of an incomplete sequence should be signalled as a single
1368             // malformed sequence, i.e., you should see only a single replacement
1369             // character in each of the next 10 tests. (Characters as in section 2)
1370 
1371             // 3.3.1  2-byte sequence with last byte missing (U+0000)
1372             roundtrip(false, "\xc0");
1373             // 3.3.2  3-byte sequence with last byte missing (U+0000)
1374             roundtrip(false, "\xe0\x80");
1375             // 3.3.3  4-byte sequence with last byte missing (U+0000)
1376             roundtrip(false, "\xf0\x80\x80");
1377             // 3.3.4  5-byte sequence with last byte missing (U+0000)
1378             roundtrip(false, "\xf8\x80\x80\x80");
1379             // 3.3.5  6-byte sequence with last byte missing (U+0000)
1380             roundtrip(false, "\xfc\x80\x80\x80\x80");
1381             // 3.3.6  2-byte sequence with last byte missing (U-000007FF)
1382             roundtrip(false, "\xdf");
1383             // 3.3.7  3-byte sequence with last byte missing (U-0000FFFF)
1384             roundtrip(false, "\xef\xbf");
1385             // 3.3.8  4-byte sequence with last byte missing (U-001FFFFF)
1386             roundtrip(false, "\xf7\xbf\xbf");
1387             // 3.3.9  5-byte sequence with last byte missing (U-03FFFFFF)
1388             roundtrip(false, "\xfb\xbf\xbf\xbf");
1389             // 3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF)
1390             roundtrip(false, "\xfd\xbf\xbf\xbf\xbf");
1391         }
1392 
1393         SECTION("3.4  Concatenation of incomplete sequences")
1394         {
1395             // All the 10 sequences of 3.3 concatenated, you should see 10 malformed
1396             // sequences being signalled:
1397             roundtrip(false, "\xc0\xe0\x80\xf0\x80\x80\xf8\x80\x80\x80\xfc\x80\x80\x80\x80\xdf\xef\xbf\xf7\xbf\xbf\xfb\xbf\xbf\xbf\xfd\xbf\xbf\xbf\xbf");
1398         }
1399 
1400         SECTION("3.5  Impossible bytes")
1401         {
1402             // The following two bytes cannot appear in a correct UTF-8 string
1403 
1404             // 3.5.1  fe
1405             roundtrip(false, "\xfe");
1406             // 3.5.2  ff
1407             roundtrip(false, "\xff");
1408             // 3.5.3  fe fe ff ff
1409             roundtrip(false, "\xfe\xfe\xff\xff");
1410         }
1411     }
1412 
1413     SECTION("4  Overlong sequences")
1414     {
1415         // The following sequences are not malformed according to the letter of
1416         // the Unicode 2.0 standard. However, they are longer than necessary and
1417         // a correct UTF-8 encoder is not allowed to produce them. A "safe UTF-8
1418         // decoder" should reject them just like malformed sequences for two
1419         // reasons: (1) It helps to debug applications if overlong sequences are
1420         // not treated as valid representations of characters, because this helps
1421         // to spot problems more quickly. (2) Overlong sequences provide
1422         // alternative representations of characters, that could maliciously be
1423         // used to bypass filters that check only for ASCII characters. For
1424         // instance, a 2-byte encoded line feed (LF) would not be caught by a
1425         // line counter that counts only 0x0a bytes, but it would still be
1426         // processed as a line feed by an unsafe UTF-8 decoder later in the
1427         // pipeline. From a security point of view, ASCII compatibility of UTF-8
1428         // sequences means also, that ASCII characters are *only* allowed to be
1429         // represented by ASCII bytes in the range 0x00-0x7f. To ensure this
1430         // aspect of ASCII compatibility, use only "safe UTF-8 decoders" that
1431         // reject overlong UTF-8 sequences for which a shorter encoding exists.
1432 
1433         SECTION("4.1  Examples of an overlong ASCII character")
1434         {
1435             // With a safe UTF-8 decoder, all of the following five overlong
1436             // representations of the ASCII character slash ("/") should be rejected
1437             // like a malformed UTF-8 sequence, for instance by substituting it with
1438             // a replacement character. If you see a slash below, you do not have a
1439             // safe UTF-8 decoder!
1440 
1441             // 4.1.1 U+002F = c0 af
1442             roundtrip(false, "\xc0\xaf");
1443             // 4.1.2 U+002F = e0 80 af
1444             roundtrip(false, "\xe0\x80\xaf");
1445             // 4.1.3 U+002F = f0 80 80 af
1446             roundtrip(false, "\xf0\x80\x80\xaf");
1447             // 4.1.4 U+002F = f8 80 80 80 af
1448             roundtrip(false, "\xf8\x80\x80\x80\xaf");
1449             // 4.1.5 U+002F = fc 80 80 80 80 af
1450             roundtrip(false, "\xfc\x80\x80\x80\x80\xaf");
1451         }
1452 
1453         SECTION("4.2  Maximum overlong sequences")
1454         {
1455             // Below you see the highest Unicode value that is still resulting in an
1456             // overlong sequence if represented with the given number of bytes. This
1457             // is a boundary test for safe UTF-8 decoders. All five characters should
1458             // be rejected like malformed UTF-8 sequences.
1459 
1460             // 4.2.1  U-0000007F = c1 bf
1461             roundtrip(false, "\xc1\xbf");
1462             // 4.2.2  U-000007FF = e0 9f bf
1463             roundtrip(false, "\xe0\x9f\xbf");
1464             // 4.2.3  U-0000FFFF = f0 8f bf bf
1465             roundtrip(false, "\xf0\x8f\xbf\xbf");
1466             // 4.2.4  U-001FFFFF = f8 87 bf bf bf
1467             roundtrip(false, "\xf8\x87\xbf\xbf\xbf");
1468             // 4.2.5  U-03FFFFFF = fc 83 bf bf bf bf
1469             roundtrip(false, "\xfc\x83\xbf\xbf\xbf\xbf");
1470         }
1471 
1472         SECTION("4.3  Overlong representation of the NUL character")
1473         {
1474             // The following five sequences should also be rejected like malformed
1475             // UTF-8 sequences and should not be treated like the ASCII NUL
1476             // character.
1477 
1478             // 4.3.1  U+0000 = c0 80
1479             roundtrip(false, "\xc0\x80");
1480             // 4.3.2  U+0000 = e0 80 80
1481             roundtrip(false, "\xe0\x80\x80");
1482             // 4.3.3  U+0000 = f0 80 80 80
1483             roundtrip(false, "\xf0\x80\x80\x80");
1484             // 4.3.4  U+0000 = f8 80 80 80 80
1485             roundtrip(false, "\xf8\x80\x80\x80\x80");
1486             // 4.3.5  U+0000 = fc 80 80 80 80 80
1487             roundtrip(false, "\xfc\x80\x80\x80\x80\x80");
1488         }
1489     }
1490 
1491     SECTION("5  Illegal code positions")
1492     {
1493         // The following UTF-8 sequences should be rejected like malformed
1494         // sequences, because they never represent valid ISO 10646 characters and
1495         // a UTF-8 decoder that accepts them might introduce security problems
1496         // comparable to overlong UTF-8 sequences.
1497 
1498         SECTION("5.1 Single UTF-16 surrogates")
1499         {
1500             // 5.1.1  U+D800 = ed a0 80
1501             roundtrip(false, "\xed\xa0\x80");
1502             // 5.1.2  U+DB7F = ed ad bf
1503             roundtrip(false, "\xed\xad\xbf");
1504             // 5.1.3  U+DB80 = ed ae 80
1505             roundtrip(false, "\xed\xae\x80");
1506             // 5.1.4  U+DBFF = ed af bf
1507             roundtrip(false, "\xed\xaf\xbf");
1508             // 5.1.5  U+DC00 = ed b0 80
1509             roundtrip(false, "\xed\xb0\x80");
1510             // 5.1.6  U+DF80 = ed be 80
1511             roundtrip(false, "\xed\xbe\x80");
1512             // 5.1.7  U+DFFF = ed bf bf
1513             roundtrip(false, "\xed\xbf\xbf");
1514         }
1515 
1516         SECTION("5.2 Paired UTF-16 surrogates")
1517         {
1518             // 5.2.1  U+D800 U+DC00 = ed a0 80 ed b0 80
1519             roundtrip(false, "\xed\xa0\x80\xed\xb0\x80");
1520             // 5.2.2  U+D800 U+DFFF = ed a0 80 ed bf bf
1521             roundtrip(false, "\xed\xa0\x80\xed\xbf\xbf");
1522             // 5.2.3  U+DB7F U+DC00 = ed ad bf ed b0 80
1523             roundtrip(false, "\xed\xad\xbf\xed\xb0\x80");
1524             // 5.2.4  U+DB7F U+DFFF = ed ad bf ed bf bf
1525             roundtrip(false, "\xed\xad\xbf\xed\xbf\xbf");
1526             // 5.2.5  U+DB80 U+DC00 = ed ae 80 ed b0 80
1527             roundtrip(false, "\xed\xae\x80\xed\xb0\x80");
1528             // 5.2.6  U+DB80 U+DFFF = ed ae 80 ed bf bf
1529             roundtrip(false, "\xed\xae\x80\xed\xbf\xbf");
1530             // 5.2.7  U+DBFF U+DC00 = ed af bf ed b0 80
1531             roundtrip(false, "\xed\xaf\xbf\xed\xb0\x80");
1532             // 5.2.8  U+DBFF U+DFFF = ed af bf ed bf bf
1533             roundtrip(false, "\xed\xaf\xbf\xed\xbf\xbf");
1534         }
1535 
1536         SECTION("5.3 Noncharacter code positions")
1537         {
1538             // The following "noncharacters" are "reserved for internal use" by
1539             // applications, and according to older versions of the Unicode Standard
1540             // "should never be interchanged". Unicode Corrigendum #9 dropped the
1541             // latter restriction. Nevertheless, their presence in incoming UTF-8 data
1542             // can remain a potential security risk, depending on what use is made of
1543             // these codes subsequently. Examples of such internal use:
1544             //
1545             //  - Some file APIs with 16-bit characters may use the integer value -1
1546             //    = U+FFFF to signal an end-of-file (EOF) or error condition.
1547             //
1548             //  - In some UTF-16 receivers, code point U+FFFE might trigger a
1549             //    byte-swap operation (to convert between UTF-16LE and UTF-16BE).
1550             //
1551             // With such internal use of noncharacters, it may be desirable and safer
1552             // to block those code points in UTF-8 decoders, as they should never
1553             // occur legitimately in incoming UTF-8 data, and could trigger unsafe
1554             // behaviour in subsequent processing.
1555 
1556             // Particularly problematic noncharacters in 16-bit applications:
1557 
1558             // 5.3.1  U+FFFE = ef bf be
1559             roundtrip(true, "\xef\xbf\xbe");
1560             // 5.3.2  U+FFFF = ef bf bf
1561             roundtrip(true, "\xef\xbf\xbf");
1562 
1563             // 5.3.3  U+FDD0 .. U+FDEF
1564             roundtrip(true, "\xEF\xB7\x90");
1565             roundtrip(true, "\xEF\xB7\x91");
1566             roundtrip(true, "\xEF\xB7\x92");
1567             roundtrip(true, "\xEF\xB7\x93");
1568             roundtrip(true, "\xEF\xB7\x94");
1569             roundtrip(true, "\xEF\xB7\x95");
1570             roundtrip(true, "\xEF\xB7\x96");
1571             roundtrip(true, "\xEF\xB7\x97");
1572             roundtrip(true, "\xEF\xB7\x98");
1573             roundtrip(true, "\xEF\xB7\x99");
1574             roundtrip(true, "\xEF\xB7\x9A");
1575             roundtrip(true, "\xEF\xB7\x9B");
1576             roundtrip(true, "\xEF\xB7\x9C");
1577             roundtrip(true, "\xEF\xB7\x9D");
1578             roundtrip(true, "\xEF\xB7\x9E");
1579             roundtrip(true, "\xEF\xB7\x9F");
1580             roundtrip(true, "\xEF\xB7\xA0");
1581             roundtrip(true, "\xEF\xB7\xA1");
1582             roundtrip(true, "\xEF\xB7\xA2");
1583             roundtrip(true, "\xEF\xB7\xA3");
1584             roundtrip(true, "\xEF\xB7\xA4");
1585             roundtrip(true, "\xEF\xB7\xA5");
1586             roundtrip(true, "\xEF\xB7\xA6");
1587             roundtrip(true, "\xEF\xB7\xA7");
1588             roundtrip(true, "\xEF\xB7\xA8");
1589             roundtrip(true, "\xEF\xB7\xA9");
1590             roundtrip(true, "\xEF\xB7\xAA");
1591             roundtrip(true, "\xEF\xB7\xAB");
1592             roundtrip(true, "\xEF\xB7\xAC");
1593             roundtrip(true, "\xEF\xB7\xAD");
1594             roundtrip(true, "\xEF\xB7\xAE");
1595             roundtrip(true, "\xEF\xB7\xAF");
1596 
1597             // 5.3.4  U+nFFFE U+nFFFF (for n = 1..10)
1598             roundtrip(true, "\xF0\x9F\xBF\xBF");
1599             roundtrip(true, "\xF0\xAF\xBF\xBF");
1600             roundtrip(true, "\xF0\xBF\xBF\xBF");
1601             roundtrip(true, "\xF1\x8F\xBF\xBF");
1602             roundtrip(true, "\xF1\x9F\xBF\xBF");
1603             roundtrip(true, "\xF1\xAF\xBF\xBF");
1604             roundtrip(true, "\xF1\xBF\xBF\xBF");
1605             roundtrip(true, "\xF2\x8F\xBF\xBF");
1606             roundtrip(true, "\xF2\x9F\xBF\xBF");
1607             roundtrip(true, "\xF2\xAF\xBF\xBF");
1608         }
1609     }
1610 }
1611