1 /* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* util/support/t_utf8.c - test UTF-8 boundary conditions */
3 /*
4 * Copyright (C) 2015 by the Massachusetts Institute of Technology.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 *
14 * * Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the
17 * distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
23 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
24 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
25 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
28 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
30 * OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 #include <stdio.h>
34 #include <string.h>
35
36 #include "k5-platform.h"
37 #include "k5-utf8.h"
38
39 /*
40 * Convenience macro to allow testing of old encodings.
41 *
42 * "Old" means ISO/IEC 10646 prior to 2011, when the highest valid code point
43 * was U+7FFFFFFF instead of U+10FFFF.
44 */
#ifndef OLDENCODINGS
/* Default build: the expected length collapses to 0. */
#define L(x) 0
#else
/* Old-encoding build: keep the expected length as written. */
#define L(x) (x)
#endif
50
51 /*
52 * len is 0 for invalid encoding prefixes (krb5int_utf8_charlen2() partially
53 * enforces the validity of the first two bytes, based on masking the second
54 * byte. It doesn't check whether bit 6 is 0, though, and doesn't catch the
55 * range between U+110000 and U+13FFFF).
56 *
57 * ucs is 0 for invalid encodings (including ones with valid prefixes according
58 * to krb5int_utf8_charlen2(); krb5int_utf8_to_ucs4() will still fail on them
59 * because it checks more things.) Code points above U+10FFFF are excluded by
60 * the actual test code and remain in the table for possibly testing the old
61 * implementation that didn't exclude them.
62 *
63 * Neither krb5int_ucs4_to_utf8() nor krb5int_utf8_to_ucs4() excludes the
64 * surrogate pair range.
65 */
66 struct testcase {
67 const char *p;
68 krb5_ucs4 ucs;
69 int len;
70 } testcases[] = {
71 { "\x7f", 0x0000007f, 1 }, /* Lowest 1-byte encoding */
72 { "\xc0\x80", 0x00000000, 0 }, /* Invalid 2-byte encoding */
73 { "\xc2\x80", 0x00000080, 2 }, /* Lowest valid 2-byte encoding */
74 { "\xdf\xbf", 0x000007ff, 2 }, /* Highest valid 2-byte encoding*/
75 { "\xdf\xff", 0x00000000, 2 }, /* Invalid 2-byte encoding*/
76 { "\xe0\x80\x80", 0x00000000, 0 }, /* Invalid 3-byte encoding */
77 { "\xe0\xa0\x80", 0x00000800, 3 }, /* Lowest valid 3-byte encoding */
78 { "\xef\xbf\xbf", 0x0000ffff, 3 }, /* Highest valid 3-byte encoding */
79 { "\xef\xff\xff", 0x00000000, 3 }, /* Invalid 3-byte encoding */
80 { "\xf0\x80\x80\x80", 0x00000000, 0 }, /* Invalid 4-byte encoding */
81 { "\xf0\x90\x80\x80", 0x00010000, 4 }, /* Lowest valid 4-byte encoding */
82 { "\xf4\x8f\xbf\xbf", 0x0010ffff, 4 }, /* Highest valid 4-byte encoding */
83 /* Next higher 4-byte encoding (old) */
84 { "\xf4\x90\x80\x80", 0x00110000, 4 },
85 /* Highest 4-byte encoding starting with 0xf4 (old) */
86 { "\xf4\xbf\xbf\xbf", 0x0013ffff, 4 },
87 /* Next higher 4-byte prefix byte (old) */
88 { "\xf5\x80\x80\x80", 0x00140000, L(4) },
89 /* Highest valid 4-byte encoding (old) */
90 { "\xf7\xbf\xbf\xbf", 0x001fffff, L(4) },
91 /* Invalid 4-byte encoding */
92 { "\xf7\xff\xff\xff", 0x00000000, L(4) },
93 /* Invalid 5-byte encoding */
94 { "\xf8\x80\x80\x80\x80", 0x00000000, 0 },
95 /* Lowest valid 5-byte encoding (old) */
96 { "\xf8\x88\x80\x80\x80", 0x00200000, L(5) },
97 /* Highest valid 5-byte encoding (old) */
98 { "\xfb\xbf\xbf\xbf\xbf", 0x03ffffff, L(5) },
99 /* Invalid 5-byte encoding */
100 { "\xfb\xff\xff\xff\xff", 0x00000000, L(5) },
101 /* Invalid 6-byte encoding */
102 { "\xfc\x80\x80\x80\x80\x80", 0x00000000, 0 },
103 /* Lowest valid 6-byte encoding (old) */
104 { "\xfc\x84\x80\x80\x80\x80", 0x04000000, L(6) },
105 /* Highest valid 6-byte encoding (old) */
106 { "\xfd\xbf\xbf\xbf\xbf\xbf", 0x7fffffff, L(6) },
107 /* Invalid 6-byte encoding */
108 { "\xfd\xff\xff\xff\xff\xff", 0x00000000, L(6) },
109 };
110
/*
 * Write each byte of the NUL-terminated string p to stdout as two lowercase
 * hex digits followed by a space.  No trailing newline is written.
 */
static void
printhex(const char *p)
{
    const unsigned char *c;

    /* Walk the bytes as unsigned so values >= 0x80 print as two digits. */
    for (c = (const unsigned char *)p; *c != '\0'; c++) {
        printf("%02x ", (unsigned int)*c);
    }
}
118
119 static void
printtest(struct testcase * t)120 printtest(struct testcase *t)
121 {
122 printhex(t->p);
123 printf("0x%08lx, %d\n", (unsigned long)t->ucs, t->len);
124 }
125
126 static int
test_decode(struct testcase * t,int high4)127 test_decode(struct testcase *t, int high4)
128 {
129 int len, status = 0;
130 krb5_ucs4 u = 0;
131
132 len = krb5int_utf8_charlen2(t->p);
133 if (len != t->len) {
134 printf("expected len=%d, got len=%d\n", t->len, len);
135 status = 1;
136 }
137 if ((t->len == 0 || high4) && krb5int_utf8_to_ucs4(t->p, &u) != -1) {
138 printf("unexpected success in utf8_to_ucs4\n");
139 status = 1;
140 }
141 if (krb5int_utf8_to_ucs4(t->p, &u) != 0 && t->ucs != 0 && !high4) {
142 printf("unexpected failure in utf8_to_ucs4\n");
143 status = 1;
144 }
145 if (t->ucs != u && !high4) {
146 printf("expected 0x%08lx, got 0x%08lx\n", (unsigned long)t->ucs,
147 (unsigned long)u);
148 status = 1;
149 }
150 return status;
151 }
152
153 static int
test_encode(struct testcase * t,int high4)154 test_encode(struct testcase *t, int high4)
155 {
156 size_t size;
157 char buf[7];
158
159 memset(buf, 0, sizeof(buf));
160 size = krb5int_ucs4_to_utf8(t->ucs, buf);
161 if (high4 && size != 0) {
162 printf("unexpected success beyond U+10FFFF\n");
163 return 1;
164 }
165 if (!high4 && size == 0) {
166 printf("unexpected zero size on encode\n");
167 return 1;
168 }
169 if (size != 0 && strcmp(t->p, buf) != 0) {
170 printf("expected ");
171 printhex(t->p);
172 printf("got ");
173 printhex(buf);
174 printf("\n");
175 return 1;
176 }
177 return 0;
178 }
179
180 int
main(int argc,char ** argv)181 main(int argc, char **argv)
182 {
183 size_t ncases = sizeof(testcases) / sizeof(testcases[0]);
184 size_t i;
185 struct testcase *t;
186 int status = 0, verbose = 0;
187 /* Is this a "high" 4-byte encoding above U+10FFFF? */
188 int high4;
189
190 if (argc == 2 && strcmp(argv[1], "-v") == 0)
191 verbose = 1;
192 for (i = 0; i < ncases; i++) {
193 t = &testcases[i];
194 if (verbose)
195 printtest(t);
196 #ifndef OLDENCODINGS
197 high4 = t->ucs > 0x10ffff;
198 #else
199 high4 = 0;
200 #endif
201 if (test_decode(t, high4) != 0)
202 status = 1;
203 if (t->ucs == 0)
204 continue;
205 if (test_encode(t, high4) != 0)
206 status = 1;
207 }
208 return status;
209 }
210