1 /*
2  * encoding.c - UTF-8 and UTF-16LE codecs and utility functions
3  *
4  * Copyright (C) 2012-2016 Eric Biggers
5  *
6  * This file is free software; you can redistribute it and/or modify it under
7  * the terms of the GNU Lesser General Public License as published by the Free
8  * Software Foundation; either version 3 of the License, or (at your option) any
9  * later version.
10  *
11  * This file is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13  * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
14  * details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this file; if not, see http://www.gnu.org/licenses/.
18  */
19 
20 #ifdef HAVE_CONFIG_H
21 #  include "config.h"
22 #endif
23 
24 #include <errno.h>
25 #include <string.h>
26 
27 #include "wimlib/encoding.h"
28 #include "wimlib/endianness.h"
29 #include "wimlib/error.h"
30 #include "wimlib/unaligned.h"
31 #include "wimlib/util.h"
32 
33 /*
34  * Allow unpaired surrogates, such as might exist in Windows-style filenames ---
35  * which are normally valid UTF-16LE, but are actually treated as opaque
36  * sequences of 16-bit WCHARs by Windows.  When decoding "UTF-16LE", unpaired
37  * surrogates will be decoded as their surrogate codepoints; and when encoding
38  * to and from "UTF-8", the encoding will actually be WTF-8 ("Wobbly
39  * Transformation Format - 8-bit"), a superset of UTF-8 which permits the
40  * surrogate codepoints.
41  *
42  * In combination with also allowing the "non-character" codepoints U+FFFE and
43  * U+FFFF, the result is that every Windows-style filename can be translated to
44  * a UNIX-style filename.
45  *
46  * Unfortunately, the converse is not true: not every UNIX filename can be
47  * translated to a Windows filename.  Only UNIX filenames that are valid "WTF-8"
48  * can be translated.  I considered ways to define a bijective mapping, but
49  * there did not seem to be a straightforward way.  The "UTF-8b" scheme, for
50  * example, would map each invalid byte 'b' to a surrogate "escape code" 'U+DC00
51  * + b'.  The problem with this was that surrogate escape codes can be combined
52  * to create a valid UTF-8 sequence, thus breaking the bijection by mapping
53  * multiple Windows filenames to a single UNIX filename.
54  */
55 #define ALLOW_UNPAIRED_SURROGATES	1
56 
57 #define INVALID_CODEPOINT	0xFFFFFFFF
58 #define VALIDATE(expr)		if (validate && unlikely(!(expr))) goto invalid
59 #define IS_SURROGATE(c)		((c) >= 0xD800 && (c) < 0xE000)
60 #define IS_HIGH_SURROGATE(c)	((c) >= 0xD800 && (c) < 0xDC00)
61 #define IS_LOW_SURROGATE(c)	((c) >= 0xDC00 && (c) < 0xE000)
62 #define IS_UTF8_TAIL(c)		(((c) & 0xC0) == 0x80)
63 
64 /*
65  * Decode the next Unicode codepoint from the string at @in, which has
66  * @remaining >= 1 bytes remaining.  Return the number of bytes consumed and
67  * write the decoded codepoint to *c_ret.
68  *
69  * If the input might not be a valid string in the source encoding, then
70  * @validate must be specified as %true, and then on invalid input the function
71  * consumes at least one byte and sets *c_ret to INVALID_CODEPOINT.  If the
72  * input is guaranteed to be valid, then @validate may be specified as %false.
73  */
74 typedef unsigned (*decode_codepoint_fn)(const u8 *in, size_t remaining,
75 					bool validate, u32 *c_ret);
76 
77 /* Encode the Unicode codepoint @c and return the number of bytes used. */
78 typedef unsigned (*encode_codepoint_fn)(u32 c, u8 *out);
79 
80 static forceinline unsigned
utf8_decode_codepoint(const u8 * in,size_t remaining,bool validate,u32 * c_ret)81 utf8_decode_codepoint(const u8 *in, size_t remaining, bool validate, u32 *c_ret)
82 {
83 	if (likely(in[0] < 0x80)) { /* U+0...U+7F */
84 		*c_ret = in[0];
85 		return 1;
86 	}
87 
88 	if (in[0] < 0xE0) { /* U+80...U+7FF */
89 		VALIDATE(in[0] >= 0xC2 && remaining >= 2 &&
90 			 IS_UTF8_TAIL(in[1]));
91 		*c_ret = ((u32)(in[0] & 0x1F) << 6) |
92 			 ((u32)(in[1] & 0x3F) << 0);
93 		return 2;
94 	}
95 
96 	if (in[0] < 0xF0) { /* U+800...U+FFFF, possibly excluding surrogates */
97 		VALIDATE(remaining >= 3 &&
98 			 IS_UTF8_TAIL(in[1]) &&
99 			 IS_UTF8_TAIL(in[2]));
100 		*c_ret = ((u32)(in[0] & 0x0F) << 12) |
101 			 ((u32)(in[1] & 0x3F) << 6) |
102 			 ((u32)(in[2] & 0x3F) << 0);
103 		VALIDATE(*c_ret >= 0x800);
104 	#if !ALLOW_UNPAIRED_SURROGATES
105 		VALIDATE(!IS_SURROGATE(*c_ret));
106 	#endif
107 		return 3;
108 	}
109 
110 	/* U+10000...U+10FFFF */
111 	VALIDATE(in[0] < 0xF8 && remaining >= 4 &&
112 		 IS_UTF8_TAIL(in[1]) &&
113 		 IS_UTF8_TAIL(in[2]) &&
114 		 IS_UTF8_TAIL(in[3]));
115 	*c_ret = ((u32)(in[0] & 0x07) << 18) |
116 		 ((u32)(in[1] & 0x3F) << 12) |
117 		 ((u32)(in[2] & 0x3F) << 6) |
118 		 ((u32)(in[3] & 0x3F) << 0);
119 	VALIDATE(*c_ret >= 0x10000 && *c_ret <= 0x10FFFF);
120 	return 4;
121 
122 invalid:
123 	*c_ret = INVALID_CODEPOINT;
124 	return 1;
125 }
126 
127 static forceinline unsigned
utf8_encode_codepoint(u32 c,u8 * out)128 utf8_encode_codepoint(u32 c, u8 *out)
129 {
130 	if (likely(c < 0x80)) {
131 		out[0] = c;
132 		return 1;
133 	}
134 
135 	if (c < 0x800) {
136 		out[0] = 0xC0 | (c >> 6);
137 		out[1] = 0x80 | (c & 0x3F);
138 		return 2;
139 	}
140 
141 	if (c < 0x10000) {
142 		out[0] = 0xE0 | (c >> 12);
143 		out[1] = 0x80 | ((c >> 6) & 0x3F);
144 		out[2] = 0x80 | (c & 0x3F);
145 		return 3;
146 	}
147 
148 	out[0] = 0xF0 | (c >> 18);
149 	out[1] = 0x80 | ((c >> 12) & 0x3F);
150 	out[2] = 0x80 | ((c >> 6) & 0x3F);
151 	out[3] = 0x80 | (c & 0x3F);
152 	return 4;
153 }
154 
155 static forceinline unsigned
utf16le_decode_codepoint(const u8 * in,size_t remaining,bool validate,u32 * c_ret)156 utf16le_decode_codepoint(const u8 *in, size_t remaining, bool validate,
157 			 u32 *c_ret)
158 {
159 	u32 h, l;
160 
161 	VALIDATE(remaining >= 2);
162 	h = get_unaligned_le16(in);
163 	if (unlikely(IS_SURROGATE(h))) {
164 		/* Surrogate pairs are U+10000...U+10FFFF.
165 		 * Unpaired surrogates are U+D800...U+DFFF. */
166 	#if ALLOW_UNPAIRED_SURROGATES
167 		if (unlikely(!IS_HIGH_SURROGATE(h) || remaining < 4))
168 			goto unpaired;
169 		l = get_unaligned_le16(in + 2);
170 		if (unlikely(!IS_LOW_SURROGATE(l)))
171 			goto unpaired;
172 	#else
173 		VALIDATE(IS_HIGH_SURROGATE(h) && remaining >= 4);
174 		l = get_unaligned_le16(in + 2);
175 		VALIDATE(IS_LOW_SURROGATE(l));
176 	#endif
177 		*c_ret = 0x10000 + ((h - 0xD800) << 10) + (l - 0xDC00);
178 		return 4;
179 	}
180 #if ALLOW_UNPAIRED_SURROGATES
181 unpaired:
182 #endif
183 	*c_ret = h;
184 	return 2;
185 
186 invalid:
187 	*c_ret = INVALID_CODEPOINT;
188 	return min(remaining, 2);
189 }
190 
191 static forceinline unsigned
utf16le_encode_codepoint(u32 c,u8 * out)192 utf16le_encode_codepoint(u32 c, u8 *out)
193 {
194 	if (likely(c < 0x10000)) {
195 		put_unaligned_le16(c, out);
196 		return 2;
197 	}
198 	c -= 0x10000;
199 	put_unaligned_le16(0xD800 + (c >> 10), out);
200 	put_unaligned_le16(0xDC00 + (c & 0x3FF), out + 2);
201 	return 4;
202 }
203 
204 /*
205  * Convert the string @in of size @in_nbytes from the encoding given by the
206  * @decode_codepoint function to the encoding given by the @encode_codepoint
207  * function.  @in does not need to be null-terminated, but a null terminator
208  * will be added to the output string.
209  *
210  * On success, write the allocated output string to @out_ret (must not be NULL)
211  * and its size excluding the null terminator to @out_nbytes_ret (may be NULL).
212  *
213  * If the input string is malformed, return @ilseq_err with errno set to EILSEQ.
214  * If out of memory, return WIMLIB_ERR_NOMEM with errno set to ENOMEM.
215  */
216 static forceinline int
convert_string(const u8 * const in,const size_t in_nbytes,u8 ** out_ret,size_t * out_nbytes_ret,int ilseq_err,decode_codepoint_fn decode_codepoint,encode_codepoint_fn encode_codepoint)217 convert_string(const u8 * const in, const size_t in_nbytes,
218 	       u8 **out_ret, size_t *out_nbytes_ret,
219 	       int ilseq_err,
220 	       decode_codepoint_fn decode_codepoint,
221 	       encode_codepoint_fn encode_codepoint)
222 {
223 	const u8 * const in_end = in + in_nbytes;
224 	const u8 *p_in;
225 	u8 *p_out;
226 	size_t out_nbytes = 0;
227 	u8 *out;
228 	u8 tmp[8]; /* assuming no codepoint requires > 8 bytes to encode */
229 	u32 c;
230 
231 	/* Validate the input string and compute the output size. */
232 	for (p_in = in; p_in != in_end; ) {
233 		p_in += (*decode_codepoint)(p_in, in_end - p_in, true, &c);
234 		if (unlikely(c == INVALID_CODEPOINT)) {
235 			errno = EILSEQ;
236 			return ilseq_err;
237 		}
238 		out_nbytes += (*encode_codepoint)(c, tmp);
239 	}
240 
241 	/* Allocate the output string, including space for a null terminator. */
242 	out = MALLOC(out_nbytes + (*encode_codepoint)(0, tmp));
243 	if (unlikely(!out))
244 		return WIMLIB_ERR_NOMEM;
245 
246 	/* Do the conversion. */
247 	for (p_in = in, p_out = out; p_in != in_end; ) {
248 		p_in += (*decode_codepoint)(p_in, in_end - p_in, false, &c);
249 		p_out += (*encode_codepoint)(c, p_out);
250 	}
251 
252 	/* Add a null terminator. */
253 	(*encode_codepoint)(0, p_out);
254 
255 	/* Return the output string and its size (by reference). */
256 	*out_ret = out;
257 	if (out_nbytes_ret)
258 		*out_nbytes_ret = out_nbytes;
259 	return 0;
260 }
261 
262 int
utf8_to_utf16le(const char * in,size_t in_nbytes,utf16lechar ** out_ret,size_t * out_nbytes_ret)263 utf8_to_utf16le(const char *in, size_t in_nbytes,
264 		utf16lechar **out_ret, size_t *out_nbytes_ret)
265 {
266 	return convert_string((const u8 *)in, in_nbytes,
267 			      (u8 **)out_ret, out_nbytes_ret,
268 			      WIMLIB_ERR_INVALID_UTF8_STRING,
269 			      utf8_decode_codepoint, utf16le_encode_codepoint);
270 }
271 
272 int
utf16le_to_utf8(const utf16lechar * in,size_t in_nbytes,char ** out_ret,size_t * out_nbytes_ret)273 utf16le_to_utf8(const utf16lechar *in, size_t in_nbytes,
274 		char **out_ret, size_t *out_nbytes_ret)
275 {
276 	return convert_string((const u8 *)in, in_nbytes,
277 			      (u8 **)out_ret, out_nbytes_ret,
278 			      WIMLIB_ERR_INVALID_UTF16_STRING,
279 			      utf16le_decode_codepoint, utf8_encode_codepoint);
280 }
281 
282 /*
283  * A table that maps from UCS-2 characters to their upper case equivalents.
284  * Index and array values are both CPU endian.
285  * Note: this is only an *approximation* of real UTF-16 case folding.
286  */
287 u16 upcase[65536];
288 
289 void
init_upcase(void)290 init_upcase(void)
291 {
292 	/* This is the table used in NTFS volumes formatted by Windows 10.
293 	 * It was compressed by tools/compress_upcase_table.c.  */
294 	static const u16 upcase_compressed[] = {
295 		0x0000, 0x0000, 0x0060, 0x0000, 0x0000, 0xffe0, 0x0019, 0x0061,
296 		0x0061, 0x0000, 0x001b, 0x005d, 0x0008, 0x0060, 0x0000, 0x0079,
297 		0x0000, 0x0000, 0x0000, 0xffff, 0x002f, 0x0100, 0x0002, 0x0000,
298 		0x0007, 0x012b, 0x0011, 0x0121, 0x002f, 0x0103, 0x0006, 0x0101,
299 		0x0000, 0x00c3, 0x0006, 0x0131, 0x0007, 0x012e, 0x0004, 0x0000,
300 		0x0003, 0x012f, 0x0000, 0x0061, 0x0004, 0x0130, 0x0000, 0x00a3,
301 		0x0003, 0x0000, 0x0000, 0x0082, 0x000b, 0x0131, 0x0006, 0x0189,
302 		0x0008, 0x012f, 0x0007, 0x012e, 0x0000, 0x0038, 0x0006, 0x0000,
303 		0x0000, 0xfffe, 0x0007, 0x01c4, 0x000f, 0x0101, 0x0000, 0xffb1,
304 		0x0015, 0x011e, 0x0004, 0x01cc, 0x002a, 0x0149, 0x0014, 0x0149,
305 		0x0007, 0x0000, 0x0009, 0x018c, 0x000b, 0x0138, 0x0000, 0x2a1f,
306 		0x0000, 0x2a1c, 0x0000, 0x0000, 0x0000, 0xff2e, 0x0000, 0xff32,
307 		0x0000, 0x0000, 0x0000, 0xff33, 0x0000, 0xff33, 0x0000, 0x0000,
308 		0x0000, 0xff36, 0x0000, 0x0000, 0x0000, 0xff35, 0x0004, 0x0000,
309 		0x0002, 0x0257, 0x0000, 0x0000, 0x0000, 0xff31, 0x0004, 0x0000,
310 		0x0000, 0xff2f, 0x0000, 0xff2d, 0x0000, 0x0000, 0x0000, 0x29f7,
311 		0x0003, 0x0000, 0x0002, 0x0269, 0x0000, 0x29fd, 0x0000, 0xff2b,
312 		0x0002, 0x0000, 0x0000, 0xff2a, 0x0007, 0x0000, 0x0000, 0x29e7,
313 		0x0002, 0x0000, 0x0000, 0xff26, 0x0005, 0x027e, 0x0003, 0x027e,
314 		0x0000, 0xffbb, 0x0000, 0xff27, 0x0000, 0xff27, 0x0000, 0xffb9,
315 		0x0005, 0x0000, 0x0000, 0xff25, 0x0065, 0x007b, 0x0079, 0x0293,
316 		0x0008, 0x012d, 0x0003, 0x019c, 0x0002, 0x037b, 0x002e, 0x0000,
317 		0x0000, 0xffda, 0x0000, 0xffdb, 0x0002, 0x03ad, 0x0012, 0x0060,
318 		0x000a, 0x0060, 0x0000, 0xffc0, 0x0000, 0xffc1, 0x0000, 0xffc1,
319 		0x0008, 0x0000, 0x0000, 0xfff8, 0x001a, 0x0118, 0x0000, 0x0007,
320 		0x0008, 0x018d, 0x0009, 0x0233, 0x0046, 0x0035, 0x0006, 0x0061,
321 		0x0000, 0xffb0, 0x000f, 0x0450, 0x0025, 0x010e, 0x000a, 0x036b,
322 		0x0032, 0x048b, 0x000e, 0x0100, 0x0000, 0xfff1, 0x0037, 0x048a,
323 		0x0026, 0x0465, 0x0034, 0x0000, 0x0000, 0xffd0, 0x0025, 0x0561,
324 		0x00de, 0x0293, 0x1714, 0x0587, 0x0000, 0x8a04, 0x0003, 0x0000,
325 		0x0000, 0x0ee6, 0x0087, 0x02ee, 0x0092, 0x1e01, 0x0069, 0x1df7,
326 		0x0000, 0x0008, 0x0007, 0x1f00, 0x0008, 0x0000, 0x000e, 0x1f02,
327 		0x0008, 0x1f0e, 0x0010, 0x1f06, 0x001a, 0x1f06, 0x0002, 0x1f0f,
328 		0x0007, 0x1f50, 0x0017, 0x1f19, 0x0000, 0x004a, 0x0000, 0x004a,
329 		0x0000, 0x0056, 0x0003, 0x1f72, 0x0000, 0x0064, 0x0000, 0x0064,
330 		0x0000, 0x0080, 0x0000, 0x0080, 0x0000, 0x0070, 0x0000, 0x0070,
331 		0x0000, 0x007e, 0x0000, 0x007e, 0x0028, 0x1f1e, 0x000c, 0x1f06,
332 		0x0000, 0x0000, 0x0000, 0x0009, 0x000f, 0x0000, 0x000d, 0x1fb3,
333 		0x000d, 0x1f44, 0x0008, 0x1fcd, 0x0006, 0x03f2, 0x0015, 0x1fbb,
334 		0x014e, 0x0587, 0x0000, 0xffe4, 0x0021, 0x0000, 0x0000, 0xfff0,
335 		0x000f, 0x2170, 0x000a, 0x0238, 0x0346, 0x0587, 0x0000, 0xffe6,
336 		0x0019, 0x24d0, 0x0746, 0x0587, 0x0026, 0x0561, 0x000b, 0x057e,
337 		0x0004, 0x012f, 0x0000, 0xd5d5, 0x0000, 0xd5d8, 0x000c, 0x022e,
338 		0x000e, 0x03f8, 0x006e, 0x1e33, 0x0011, 0x0000, 0x0000, 0xe3a0,
339 		0x0025, 0x2d00, 0x17f2, 0x0587, 0x6129, 0x2d26, 0x002e, 0x0201,
340 		0x002a, 0x1def, 0x0098, 0xa5b7, 0x0040, 0x1dff, 0x000e, 0x0368,
341 		0x000d, 0x022b, 0x034c, 0x2184, 0x5469, 0x2d26, 0x007f, 0x0061,
342 		0x0040, 0x0000,
343 	};
344 
345 	/* Simple LZ decoder  */
346 	const u16 *in_next = upcase_compressed;
347 	for (u32 i = 0; i < ARRAY_LEN(upcase); ) {
348 		u16 length = *in_next++;
349 		u16 src_pos = *in_next++;
350 		if (length == 0) {
351 			/* Literal */
352 			upcase[i++] = src_pos;
353 		} else {
354 			/* Match */
355 			do {
356 				upcase[i++] = upcase[src_pos++];
357 			} while (--length);
358 		}
359 	}
360 
361 	/* Delta filter  */
362 	for (u32 i = 0; i < ARRAY_LEN(upcase); i++)
363 		upcase[i] += i;
364 }
365 
366 /*
367  * Compare UTF-16LE strings case-sensitively (%ignore_case == false) or
368  * case-insensitively (%ignore_case == true).
369  *
370  * This is implemented using the default upper-case table used by NTFS.  It does
371  * not handle all possible cases allowed by UTF-16LE.  For example, different
372  * normalizations of the same sequence of "characters" are not considered equal.
373  * It hopefully does the right thing most of the time though.
374  */
375 int
cmp_utf16le_strings(const utf16lechar * s1,size_t n1,const utf16lechar * s2,size_t n2,bool ignore_case)376 cmp_utf16le_strings(const utf16lechar *s1, size_t n1,
377 		    const utf16lechar *s2, size_t n2,
378 		    bool ignore_case)
379 {
380 	size_t n = min(n1, n2);
381 
382 	if (ignore_case) {
383 		for (size_t i = 0; i < n; i++) {
384 			u16 c1 = upcase[le16_to_cpu(s1[i])];
385 			u16 c2 = upcase[le16_to_cpu(s2[i])];
386 			if (c1 != c2)
387 				return (c1 < c2) ? -1 : 1;
388 		}
389 	} else {
390 		for (size_t i = 0; i < n; i++) {
391 			u16 c1 = le16_to_cpu(s1[i]);
392 			u16 c2 = le16_to_cpu(s2[i]);
393 			if (c1 != c2)
394 				return (c1 < c2) ? -1 : 1;
395 		}
396 	}
397 	if (n1 == n2)
398 		return 0;
399 	return (n1 < n2) ? -1 : 1;
400 }
401 
402 /* Like cmp_utf16le_strings(), but assumes the strings are null terminated.  */
403 int
cmp_utf16le_strings_z(const utf16lechar * s1,const utf16lechar * s2,bool ignore_case)404 cmp_utf16le_strings_z(const utf16lechar *s1, const utf16lechar *s2,
405 		      bool ignore_case)
406 {
407 	if (ignore_case) {
408 		for (;;) {
409 			u16 c1 = upcase[le16_to_cpu(*s1)];
410 			u16 c2 = upcase[le16_to_cpu(*s2)];
411 			if (c1 != c2)
412 				return (c1 < c2) ? -1 : 1;
413 			if (c1 == 0)
414 				return 0;
415 			s1++, s2++;
416 		}
417 	} else {
418 		while (*s1 && *s1 == *s2)
419 			s1++, s2++;
420 		if (*s1 == *s2)
421 			return 0;
422 		return (le16_to_cpu(*s1) < le16_to_cpu(*s2)) ? -1 : 1;
423 	}
424 }
425 
426 /* Duplicate a UTF-16 string.  The input string might not be null terminated and
427  * might be misaligned, but the returned string is guaranteed to be null
428  * terminated and properly aligned.  */
429 utf16lechar *
utf16le_dupz(const void * s,size_t size)430 utf16le_dupz(const void *s, size_t size)
431 {
432 	utf16lechar *dup = MALLOC(size + sizeof(utf16lechar));
433 	if (dup) {
434 		memcpy(dup, s, size);
435 		dup[size / sizeof(utf16lechar)] = 0;
436 	}
437 	return dup;
438 }
439 
440 /* Duplicate a null-terminated UTF-16 string.  */
441 utf16lechar *
utf16le_dup(const utf16lechar * s)442 utf16le_dup(const utf16lechar *s)
443 {
444 	return memdup(s, utf16le_len_bytes(s) + sizeof(utf16lechar));
445 }
446 
447 /* Return the length, in bytes, of a null terminated UTF-16 string, excluding
448  * the null terminator.  */
449 size_t
utf16le_len_bytes(const utf16lechar * s)450 utf16le_len_bytes(const utf16lechar *s)
451 {
452 	const utf16lechar *p = s;
453 	while (*p)
454 		p++;
455 	return (p - s) * sizeof(utf16lechar);
456 }
457 
458 /* Return the length, in UTF-16 coding units, of a null terminated UTF-16
459  * string, excluding the null terminator.  */
460 size_t
utf16le_len_chars(const utf16lechar * s)461 utf16le_len_chars(const utf16lechar *s)
462 {
463 	return utf16le_len_bytes(s) / sizeof(utf16lechar);
464 }
465