1 /*-
2  * Copyright (c) 2003-2007 Tim Kientzle
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17  * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 
26 #include "archive_platform.h"
27 __FBSDID("$FreeBSD: src/lib/libarchive/archive_string.c,v 1.17 2008/12/06 05:56:43 kientzle Exp $");
28 
29 /*
30  * Basic resizable string support, to simplify manipulating arbitrary-sized
31  * strings while minimizing heap activity.
32  */
33 
34 #ifdef HAVE_STDLIB_H
35 #include <stdlib.h>
36 #endif
37 #ifdef HAVE_STRING_H
38 #include <string.h>
39 #endif
40 #ifdef HAVE_WCHAR_H
41 #include <wchar.h>
42 #endif
43 #if defined(_WIN32) && !defined(__CYGWIN__)
44 #include <windows.h>
45 #endif
46 
47 #include "archive_private.h"
48 #include "archive_string.h"
49 
50 struct archive_string *
__archive_string_append(struct archive_string * as,const char * p,size_t s)51 __archive_string_append(struct archive_string *as, const char *p, size_t s)
52 {
53 	if (__archive_string_ensure(as, as->length + s + 1) == NULL)
54 		__archive_errx(1, "Out of memory");
55 	memcpy(as->s + as->length, p, s);
56 	as->s[as->length + s] = 0;
57 	as->length += s;
58 	return (as);
59 }
60 
61 void
__archive_string_copy(struct archive_string * dest,struct archive_string * src)62 __archive_string_copy(struct archive_string *dest, struct archive_string *src)
63 {
64 	if (src->length == 0)
65 		dest->length = 0;
66 	else {
67 		if (__archive_string_ensure(dest, src->length + 1) == NULL)
68 			__archive_errx(1, "Out of memory");
69 		memcpy(dest->s, src->s, src->length);
70 		dest->length = src->length;
71 		dest->s[dest->length] = 0;
72 	}
73 }
74 
75 void
__archive_string_concat(struct archive_string * dest,struct archive_string * src)76 __archive_string_concat(struct archive_string *dest, struct archive_string *src)
77 {
78 	if (src->length > 0) {
79 		if (__archive_string_ensure(dest, dest->length + src->length + 1) == NULL)
80 			__archive_errx(1, "Out of memory");
81 		memcpy(dest->s + dest->length, src->s, src->length);
82 		dest->length += src->length;
83 		dest->s[dest->length] = 0;
84 	}
85 }
86 
87 void
__archive_string_free(struct archive_string * as)88 __archive_string_free(struct archive_string *as)
89 {
90 	as->length = 0;
91 	as->buffer_length = 0;
92 	if (as->s != NULL) {
93 		free(as->s);
94 		as->s = NULL;
95 	}
96 }
97 
98 /* Returns NULL on any allocation failure. */
99 struct archive_string *
__archive_string_ensure(struct archive_string * as,size_t s)100 __archive_string_ensure(struct archive_string *as, size_t s)
101 {
102 	char *p;
103 	size_t new_length;
104 
105 	/* If buffer is already big enough, don't reallocate. */
106 	if (as->s && (s <= as->buffer_length))
107 		return (as);
108 
109 	/*
110 	 * Growing the buffer at least exponentially ensures that
111 	 * append operations are always linear in the number of
112 	 * characters appended.  Using a smaller growth rate for
113 	 * larger buffers reduces memory waste somewhat at the cost of
114 	 * a larger constant factor.
115 	 */
116 	if (as->buffer_length < 32)
117 		/* Start with a minimum 32-character buffer. */
118 		new_length = 32;
119 	else if (as->buffer_length < 8192)
120 		/* Buffers under 8k are doubled for speed. */
121 		new_length = as->buffer_length + as->buffer_length;
122 	else {
123 		/* Buffers 8k and over grow by at least 25% each time. */
124 		new_length = as->buffer_length + as->buffer_length / 4;
125 		/* Be safe: If size wraps, fail. */
126 		if (new_length < as->buffer_length) {
127 			/* On failure, wipe the string and return NULL. */
128 			__archive_string_free(as);
129 			return (NULL);
130 		}
131 	}
132 	/*
133 	 * The computation above is a lower limit to how much we'll
134 	 * grow the buffer.  In any case, we have to grow it enough to
135 	 * hold the request.
136 	 */
137 	if (new_length < s)
138 		new_length = s;
139 	/* Now we can reallocate the buffer. */
140 	p = (char *)realloc(as->s, new_length);
141 	if (p == NULL) {
142 		/* On failure, wipe the string and return NULL. */
143 		__archive_string_free(as);
144 		return (NULL);
145 	}
146 
147 	as->s = p;
148 	as->buffer_length = new_length;
149 	return (as);
150 }
151 
/*
 * Append to 'as' the bytes of '_p' up to, but not including, the first
 * NUL byte, and never more than 'n' bytes.  Returns the result of
 * __archive_string_append().
 *
 * At most the first 'n' bytes of '_p' are examined, so '_p' may be an
 * unterminated buffer of exactly 'n' bytes.
 */
struct archive_string *
__archive_strncat(struct archive_string *as, const void *_p, size_t n)
{
	const char *p = (const char *)_p;
	size_t s = 0;

	/*
	 * Like strlen(p), but bounded.  The bound is tested before the
	 * byte is read so that p[n] is never touched.
	 */
	while (s < n && p[s] != '\0')
		s++;
	return (__archive_string_append(as, p, s));
}
169 
/*
 * Append the single character 'c' to 'as'.  Returns the result of
 * __archive_string_append().
 */
struct archive_string *
__archive_strappend_char(struct archive_string *as, char c)
{
	char one[1];

	one[0] = c;
	return (__archive_string_append(as, one, 1));
}
175 
/*
 * Encode the wide-character string 'w' as UTF-8 and append it to 'as'.
 *
 * Returns 'as' when every character could be encoded.  If any value is
 * too large for a four-byte sequence, a '?' is emitted in its place,
 * conversion continues, and NULL is returned at the end; 'as' still
 * receives the best-effort output in that case.
 */
struct archive_string *
__archive_strappend_w_utf8(struct archive_string *as, const wchar_t *w)
{
	char chunk[256];
	char *out = chunk;
	unsigned cp;	/* Not wchar_t: a combined pair needs more than 16 bits. */
	struct archive_string *result = as;

	/*
	 * Characters are encoded one at a time into 'chunk'; the chunk
	 * is appended to the string whenever it gets close to full.
	 */
	while (*w != L'\0') {
		/* Flush once 16 or fewer bytes remain free in the chunk. */
		if ((size_t)(out - chunk) >= (size_t)(sizeof(chunk) - 16)) {
			*out = '\0';
			archive_strcat(as, chunk);
			out = chunk;
		}

		cp = *w++;
		/* Leading surrogate followed by a trailing one: combine them. */
		if (cp >= 0xD800 && cp <= 0xDBff
		    && *w >= 0xDC00 && *w <= 0xDFFF) {
			cp = (cp - 0xD800) * 0x400 + (*w - 0xDC00) + 0x10000;
			++w;
		}

		/* Emit 1 to 4 bytes depending on the magnitude of the value. */
		if (cp < 0x80) {
			*out++ = (char)cp;
		} else if (cp < 0x800) {
			*out++ = (char)(0xc0 | ((cp >> 6) & 0x1f));
			*out++ = (char)(0x80 | (cp & 0x3f));
		} else if (cp < 0x10000) {
			*out++ = (char)(0xe0 | ((cp >> 12) & 0x0f));
			*out++ = (char)(0x80 | ((cp >> 6) & 0x3f));
			*out++ = (char)(0x80 | (cp & 0x3f));
		} else if (cp < 0x200000) {
			*out++ = (char)(0xf0 | ((cp >> 18) & 0x07));
			*out++ = (char)(0x80 | ((cp >> 12) & 0x3f));
			*out++ = (char)(0x80 | ((cp >> 6) & 0x3f));
			*out++ = (char)(0x80 | (cp & 0x3f));
		} else {
			/* Does not fit in four bytes: substitute and flag it. */
			/* TODO: use \uXXXX escape here instead of ? */
			*out++ = '?';
			result = NULL;
		}
	}

	/* Append whatever is left in the chunk. */
	*out = '\0';
	archive_strcat(as, chunk);
	return (result);
}
240 
/*
 * Decode one UTF-8 sequence of 1 to 4 bytes starting at 's'; 'n' is the
 * number of bytes available for the sequence.
 *
 * Returns the number of bytes consumed and stores the decoded value in
 * *pwc.  Returns 0 (leaving *pwc untouched) when the first byte is NUL.
 * Returns -1 when the sequence would need more than 'n' bytes, when a
 * continuation byte is not of the form 10xxxxxx, or when the first byte
 * cannot start a 1-4 byte sequence.
 */
static int
utf8_to_unicode(int *pwc, const char *s, size_t n)
{
	const unsigned char *u = (const unsigned char *)s;
	int lead = u[0];
	int len, value, i;

	/* Standard:  return 0 for end-of-string. */
	if (lead == 0)
		return (0);

	/* Plain ASCII. */
	if (lead < 0x80) {
		*pwc = lead;
		return (1);
	}

	/* The lead byte fixes the sequence length and its payload bits. */
	if ((lead & 0xe0) == 0xc0) {
		len = 2;
		value = lead & 0x1f;
	} else if ((lead & 0xf0) == 0xe0) {
		len = 3;
		value = lead & 0x0f;
	} else if ((lead & 0xf8) == 0xf0) {
		len = 4;
		value = lead & 0x07;
	} else {
		/* Invalid first byte. */
		return (-1);
	}

	/* Truncated sequence. */
	if (n < (size_t)len)
		return (-1);

	/* Each continuation byte contributes its low six bits. */
	for (i = 1; i < len; i++) {
		if ((u[i] & 0xc0) != 0x80)
			return (-1);
		value = (value << 6) | (u[i] & 0x3f);
	}

	*pwc = value;
	return (len);
}
289 
290 /*
291  * Return a wide-character Unicode string by converting this archive_string
292  * from UTF-8.  We assume that systems with 16-bit wchar_t always use
293  * UTF16 and systems with 32-bit wchar_t can accept UCS4.
294  */
295 wchar_t *
__archive_string_utf8_w(struct archive_string * as)296 __archive_string_utf8_w(struct archive_string *as)
297 {
298 	wchar_t *ws, *dest;
299 	int wc, wc2;/* Must be large enough for a 21-bit Unicode code point. */
300 	const char *src, *end;
301 	int n;
302 
303 	ws = (wchar_t *)malloc((as->length + 1) * sizeof(wchar_t));
304 	if (ws == NULL)
305 		__archive_errx(1, "Out of memory");
306 	dest = ws;
307 	src = as->s;
308 	end = as->s + as->buffer_length;
309 	while (*src != '\0') {
310 		n = utf8_to_unicode(&wc, src, end - src);
311 		if (n == 0)
312 			break;
313 		if (n < 0) {
314 			free(ws);
315 			return (NULL);
316 		}
317 		src += n;
318 		if (wc >= 0xDC00 && wc <= 0xDBFF) {
319 			/* This is a leading surrogate; some idiot
320 			 * has translated UTF16 to UTF8 without combining
321 			 * surrogates; rebuild the full code point before
322 			 * continuing. */
323 			n = utf8_to_unicode(&wc2, src, end - src);
324 			if (n < 0) {
325 				free(ws);
326 				return (NULL);
327 			}
328 			if (n == 0) /* Ignore the leading surrogate */
329 				break;
330 			if (wc2 < 0xDC00 || wc2 > 0xDFFF) {
331 				/* If the second character isn't a
332 				 * trailing surrogate, then someone
333 				 * has really screwed up and this is
334 				 * invalid. */
335 				free(ws);
336 				return (NULL);
337 			} else {
338 				src += n;
339 				wc -= 0xD800;
340 				wc *= 0x400;
341 				wc += wc2 - 0xDC00;
342 				wc += 0x10000;
343 			}
344 		}
345 		if ((sizeof(wchar_t) < 4) && (wc > 0xffff)) {
346 			/* We have a code point that won't fit into a
347 			 * wchar_t; convert it to a surrogate pair. */
348 			wc -= 0x10000;
349 			*dest++ = ((wc >> 10) & 0x3ff) + 0xD800;
350 			*dest++ = (wc & 0x3ff) + 0xDC00;
351 		} else
352 			*dest++ = wc;
353 	}
354 	*dest++ = L'\0';
355 	return (ws);
356 }
357 
358 #if defined(_WIN32) && !defined(__CYGWIN__)
359 
360 /*
361  * Translates a wide character string into current locale character set
362  * and appends to the archive_string.  Note: returns NULL if conversion
363  * fails.
364  *
365  * Win32 builds use WideCharToMultiByte from the Windows API.
366  * (Maybe Cygwin should too?  WideCharToMultiByte will know a
367  * lot more about local character encodings than the wcrtomb()
368  * wrapper is going to know.)
369  */
370 struct archive_string *
__archive_strappend_w_mbs(struct archive_string * as,const wchar_t * w)371 __archive_strappend_w_mbs(struct archive_string *as, const wchar_t *w)
372 {
373 	char *p;
374 	int l, wl;
375 	BOOL useDefaultChar = FALSE;
376 
377 	wl = (int)wcslen(w);
378 	l = wl * 4 + 4;
379 	p = malloc(l);
380 	if (p == NULL)
381 		__archive_errx(1, "Out of memory");
382 	/* To check a useDefaultChar is to simulate error handling of
383 	 * the my_wcstombs() which is running on non Windows system with
384 	 * wctomb().
385 	 * And to set NULL for last argument is necessary when a codepage
386 	 * is not CP_ACP(current locale).
387 	 */
388 	l = WideCharToMultiByte(CP_ACP, 0, w, wl, p, l, NULL, &useDefaultChar);
389 	if (l == 0) {
390 		free(p);
391 		return (NULL);
392 	}
393 	__archive_string_append(as, p, l);
394 	free(p);
395 	return (as);
396 }
397 
398 #else
399 
/*
 * Convert the wide-character string 'w' to the current locale's
 * multibyte encoding and append the result to 'as'.  Returns 'as' on
 * success, or NULL as soon as a character cannot be converted.
 *
 * The conversion is done one character at a time with ISO C wcrtomb()
 * when it is available, otherwise with wctomb().  Platforms that have
 * neither fall back to the built-in UTF-8 conversion.
 */
struct archive_string *
__archive_strappend_w_mbs(struct archive_string *as, const wchar_t *w)
{
#if !defined(HAVE_WCTOMB) && !defined(HAVE_WCRTOMB)
	/* No locale support at all: always produce UTF-8. */
	return __archive_strappend_w_utf8(as, w);
#else
	/*
	 * The standard wcstombs() cannot report how large the output
	 * needs to be, so characters are converted individually into a
	 * small chunk that is appended to the string whenever it gets
	 * close to full.  wcrtomb() is preferred because it is
	 * thread-safe.
	 */
	char chunk[256];
	char *out = chunk;
	int written;
#if HAVE_WCRTOMB
	mbstate_t state;

	/* Start from the initial conversion state. */
	memset(&state, 0, sizeof(state));
#else
	/* Reset wctomb()'s internal shift state before starting. */
	wctomb(NULL, L'\0');
#endif

	for (; *w != L'\0'; ++w) {
		/* Flush once MB_CUR_MAX or fewer bytes remain free. */
		if ((size_t)(out - chunk) >= (size_t)(sizeof(chunk) - MB_CUR_MAX)) {
			*out = '\0';
			archive_strcat(as, chunk);
			out = chunk;
		}
#if HAVE_WCRTOMB
		written = wcrtomb(out, *w, &state);
#else
		written = wctomb(out, *w);
#endif
		/* Unconvertible character: give up. */
		if (written == -1)
			return (NULL);
		out += written;
	}

	/* Append whatever is left in the chunk. */
	*out = '\0';
	archive_strcat(as, chunk);
	return (as);
#endif
}
461 
462 #endif /* _WIN32 && ! __CYGWIN__ */
463