xref: /openbsd/usr.bin/tmux/utf8.c (revision 7e151e3f)
1 /* $OpenBSD: utf8.c,v 1.65 2024/05/24 12:41:24 nicm Exp $ */
2 
3 /*
4  * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/types.h>
20 
21 #include <ctype.h>
22 #include <errno.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <vis.h>
26 
27 #include "tmux.h"
28 
/*
 * Code points that utf8_width() always treats as two columns wide, without
 * consulting wcwidth(). The table is searched with bsearch() through
 * utf8_in_table(), so entries must be kept in ascending order.
 */
static const wchar_t utf8_force_wide[] = {
	/* U+261D - U+270D */
	0x0261D, 0x026F9, 0x0270A, 0x0270B, 0x0270C, 0x0270D,

	/* U+1F1E6 - U+1F1FF */
	0x1F1E6, 0x1F1E7, 0x1F1E8, 0x1F1E9, 0x1F1EA, 0x1F1EB, 0x1F1EC,
	0x1F1ED, 0x1F1EE, 0x1F1EF, 0x1F1F0, 0x1F1F1, 0x1F1F2, 0x1F1F3,
	0x1F1F4, 0x1F1F5, 0x1F1F6, 0x1F1F7, 0x1F1F8, 0x1F1F9, 0x1F1FA,
	0x1F1FB, 0x1F1FC, 0x1F1FD, 0x1F1FE, 0x1F1FF,

	/* U+1F385 - U+1F3FF */
	0x1F385,
	0x1F3C2, 0x1F3C3, 0x1F3C4, 0x1F3C7, 0x1F3CA, 0x1F3CB, 0x1F3CC,
	0x1F3FB, 0x1F3FC, 0x1F3FD, 0x1F3FE, 0x1F3FF,

	/* U+1F442 - U+1F4AA */
	0x1F442, 0x1F443, 0x1F446, 0x1F447, 0x1F448, 0x1F449, 0x1F44A,
	0x1F44B, 0x1F44C, 0x1F44D, 0x1F44E, 0x1F44F, 0x1F450,
	0x1F466, 0x1F467, 0x1F468, 0x1F469, 0x1F46B, 0x1F46C, 0x1F46D,
	0x1F46E,
	0x1F470, 0x1F471, 0x1F472, 0x1F473, 0x1F474, 0x1F475, 0x1F476,
	0x1F477, 0x1F478, 0x1F47C,
	0x1F481, 0x1F482, 0x1F483, 0x1F485, 0x1F486, 0x1F487, 0x1F48F,
	0x1F491,
	0x1F4AA,

	/* U+1F574 - U+1F596 */
	0x1F574, 0x1F575, 0x1F57A, 0x1F590, 0x1F595, 0x1F596,

	/* U+1F645 - U+1F6CC */
	0x1F645, 0x1F646, 0x1F647, 0x1F64B, 0x1F64C, 0x1F64D, 0x1F64E,
	0x1F64F,
	0x1F6A3, 0x1F6B4, 0x1F6B5, 0x1F6B6, 0x1F6C0, 0x1F6CC,

	/* U+1F90C - U+1F977 */
	0x1F90C, 0x1F90F,
	0x1F918, 0x1F919, 0x1F91A, 0x1F91B, 0x1F91C, 0x1F91D, 0x1F91E,
	0x1F91F,
	0x1F926,
	0x1F930, 0x1F931, 0x1F932, 0x1F933, 0x1F934, 0x1F935, 0x1F936,
	0x1F937, 0x1F938, 0x1F939, 0x1F93D, 0x1F93E,
	0x1F977,

	/* U+1F9B5 - U+1F9DD */
	0x1F9B5, 0x1F9B6, 0x1F9B8, 0x1F9B9, 0x1F9BB,
	0x1F9CD, 0x1F9CE, 0x1F9CF,
	0x1F9D1, 0x1F9D2, 0x1F9D3, 0x1F9D4, 0x1F9D5, 0x1F9D6, 0x1F9D7,
	0x1F9D8, 0x1F9D9, 0x1F9DA, 0x1F9DB, 0x1F9DC, 0x1F9DD,

	/* U+1FAC3 - U+1FAF8 */
	0x1FAC3, 0x1FAC4, 0x1FAC5,
	0x1FAF0, 0x1FAF1, 0x1FAF2, 0x1FAF3, 0x1FAF4, 0x1FAF5, 0x1FAF6,
	0x1FAF7, 0x1FAF8
};
193 
194 struct utf8_item {
195 	RB_ENTRY(utf8_item)	index_entry;
196 	u_int			index;
197 
198 	RB_ENTRY(utf8_item)	data_entry;
199 	char			data[UTF8_SIZE];
200 	u_char			size;
201 };
202 
203 static int
utf8_data_cmp(struct utf8_item * ui1,struct utf8_item * ui2)204 utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
205 {
206 	if (ui1->size < ui2->size)
207 		return (-1);
208 	if (ui1->size > ui2->size)
209 		return (1);
210 	return (memcmp(ui1->data, ui2->data, ui1->size));
211 }
212 RB_HEAD(utf8_data_tree, utf8_item);
213 RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp);
214 static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree);
215 
216 static int
utf8_index_cmp(struct utf8_item * ui1,struct utf8_item * ui2)217 utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
218 {
219 	if (ui1->index < ui2->index)
220 		return (-1);
221 	if (ui1->index > ui2->index)
222 		return (1);
223 	return (0);
224 }
225 RB_HEAD(utf8_index_tree, utf8_item);
226 RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp);
227 static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree);
228 
229 static u_int utf8_next_index;
230 
/*
 * Accessors for the header bits of a packed utf8_char:
 *
 *   bits 29 and up  display width plus one
 *   bits 24-28      size of the sequence in bytes
 *
 * The low 24 bits are the payload and are not touched by these macros.
 */
#define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f)
#define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1)

#define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24)
#define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29)
236 
237 /* Get a UTF-8 item from data. */
238 static struct utf8_item *
utf8_item_by_data(const u_char * data,size_t size)239 utf8_item_by_data(const u_char *data, size_t size)
240 {
241 	struct utf8_item	ui;
242 
243 	memcpy(ui.data, data, size);
244 	ui.size = size;
245 
246 	return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui));
247 }
248 
249 /* Get a UTF-8 item from data. */
250 static struct utf8_item *
utf8_item_by_index(u_int index)251 utf8_item_by_index(u_int index)
252 {
253 	struct utf8_item	ui;
254 
255 	ui.index = index;
256 
257 	return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui));
258 }
259 
260 /* Add a UTF-8 item. */
261 static int
utf8_put_item(const u_char * data,size_t size,u_int * index)262 utf8_put_item(const u_char *data, size_t size, u_int *index)
263 {
264 	struct utf8_item	*ui;
265 
266 	ui = utf8_item_by_data(data, size);
267 	if (ui != NULL) {
268 		*index = ui->index;
269 		log_debug("%s: found %.*s = %u", __func__, (int)size, data,
270 		    *index);
271 		return (0);
272 	}
273 
274 	if (utf8_next_index == 0xffffff + 1)
275 		return (-1);
276 
277 	ui = xcalloc(1, sizeof *ui);
278 	ui->index = utf8_next_index++;
279 	RB_INSERT(utf8_index_tree, &utf8_index_tree, ui);
280 
281 	memcpy(ui->data, data, size);
282 	ui->size = size;
283 	RB_INSERT(utf8_data_tree, &utf8_data_tree, ui);
284 
285 	*index = ui->index;
286 	log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index);
287 	return (0);
288 }
289 
/*
 * Comparison callback for bsearch() over an array of wchar_t. Returns -1, 0
 * or 1 as the first value is less than, equal to or greater than the second.
 */
static int
utf8_table_cmp(const void *vp1, const void *vp2)
{
	const wchar_t	lhs = *(const wchar_t *)vp1;
	const wchar_t	rhs = *(const wchar_t *)vp2;

	if (lhs == rhs)
		return (0);
	return (lhs < rhs ? -1 : 1);
}
301 
302 /* Check if character in table. */
303 int
utf8_in_table(wchar_t find,const wchar_t * table,u_int count)304 utf8_in_table(wchar_t find, const wchar_t *table, u_int count)
305 {
306 	wchar_t	*found;
307 
308 	found = bsearch(&find, table, count, sizeof *table, utf8_table_cmp);
309 	return (found != NULL);
310 }
311 
312 /* Get UTF-8 character from data. */
313 enum utf8_state
utf8_from_data(const struct utf8_data * ud,utf8_char * uc)314 utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
315 {
316 	u_int	index;
317 
318 	if (ud->width > 2)
319 		fatalx("invalid UTF-8 width: %u", ud->width);
320 
321 	if (ud->size > UTF8_SIZE)
322 		goto fail;
323 	if (ud->size <= 3) {
324 		index = (((utf8_char)ud->data[2] << 16)|
325 			  ((utf8_char)ud->data[1] << 8)|
326 			  ((utf8_char)ud->data[0]));
327 	} else if (utf8_put_item(ud->data, ud->size, &index) != 0)
328 		goto fail;
329 	*uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index;
330 	log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size,
331 	    (int)ud->size, ud->data, *uc);
332 	return (UTF8_DONE);
333 
334 fail:
335 	if (ud->width == 0)
336 		*uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0);
337 	else if (ud->width == 1)
338 		*uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20;
339 	else
340 		*uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020;
341 	return (UTF8_ERROR);
342 }
343 
344 /* Get UTF-8 data from character. */
345 void
utf8_to_data(utf8_char uc,struct utf8_data * ud)346 utf8_to_data(utf8_char uc, struct utf8_data *ud)
347 {
348 	struct utf8_item	*ui;
349 	u_int			 index;
350 
351 	memset(ud, 0, sizeof *ud);
352 	ud->size = ud->have = UTF8_GET_SIZE(uc);
353 	ud->width = UTF8_GET_WIDTH(uc);
354 
355 	if (ud->size <= 3) {
356 		ud->data[2] = (uc >> 16);
357 		ud->data[1] = ((uc >> 8) & 0xff);
358 		ud->data[0] = (uc & 0xff);
359 	} else {
360 		index = (uc & 0xffffff);
361 		if ((ui = utf8_item_by_index(index)) == NULL)
362 			memset(ud->data, ' ', ud->size);
363 		else
364 			memcpy(ud->data, ui->data, ud->size);
365 	}
366 
367 	log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size,
368 	    (int)ud->size, ud->data);
369 }
370 
371 /* Get UTF-8 character from a single ASCII character. */
372 u_int
utf8_build_one(u_char ch)373 utf8_build_one(u_char ch)
374 {
375 	return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch);
376 }
377 
378 /* Set a single character. */
379 void
utf8_set(struct utf8_data * ud,u_char ch)380 utf8_set(struct utf8_data *ud, u_char ch)
381 {
382 	static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
383 
384 	memcpy(ud, &empty, sizeof *ud);
385 	*ud->data = ch;
386 }
387 
388 /* Copy UTF-8 character. */
389 void
utf8_copy(struct utf8_data * to,const struct utf8_data * from)390 utf8_copy(struct utf8_data *to, const struct utf8_data *from)
391 {
392 	u_int	i;
393 
394 	memcpy(to, from, sizeof *to);
395 
396 	for (i = to->size; i < sizeof to->data; i++)
397 		to->data[i] = '\0';
398 }
399 
400 /* Get width of Unicode character. */
401 static enum utf8_state
utf8_width(struct utf8_data * ud,int * width)402 utf8_width(struct utf8_data *ud, int *width)
403 {
404 	wchar_t	wc;
405 
406 	if (utf8_towc(ud, &wc) != UTF8_DONE)
407 		return (UTF8_ERROR);
408 	if (utf8_in_table(wc, utf8_force_wide, nitems(utf8_force_wide))) {
409 		*width = 2;
410 		return (UTF8_DONE);
411 	}
412 
413 	*width = wcwidth(wc);
414 	log_debug("wcwidth(%05X) returned %d", (u_int)wc, *width);
415 	if (*width < 0) {
416 		/*
417 		 * C1 control characters are nonprintable, so they are always
418 		 * zero width.
419 		 */
420 		*width = (wc >= 0x80 && wc <= 0x9f) ? 0 : 1;
421 	}
422 	if (*width >= 0 && *width <= 0xff)
423 		return (UTF8_DONE);
424 	return (UTF8_ERROR);
425 }
426 
427 /* Convert UTF-8 character to wide character. */
428 enum utf8_state
utf8_towc(const struct utf8_data * ud,wchar_t * wc)429 utf8_towc(const struct utf8_data *ud, wchar_t *wc)
430 {
431 	switch (mbtowc(wc, ud->data, ud->size)) {
432 	case -1:
433 		log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
434 		    errno);
435 		mbtowc(NULL, NULL, MB_CUR_MAX);
436 		return (UTF8_ERROR);
437 	case 0:
438 		return (UTF8_ERROR);
439 	}
440 	log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)*wc);
441 	return (UTF8_DONE);
442 }
443 
444 /*
445  * Open UTF-8 sequence.
446  *
447  * 11000010-11011111 C2-DF start of 2-byte sequence
448  * 11100000-11101111 E0-EF start of 3-byte sequence
449  * 11110000-11110100 F0-F4 start of 4-byte sequence
450  */
451 enum utf8_state
utf8_open(struct utf8_data * ud,u_char ch)452 utf8_open(struct utf8_data *ud, u_char ch)
453 {
454 	memset(ud, 0, sizeof *ud);
455 	if (ch >= 0xc2 && ch <= 0xdf)
456 		ud->size = 2;
457 	else if (ch >= 0xe0 && ch <= 0xef)
458 		ud->size = 3;
459 	else if (ch >= 0xf0 && ch <= 0xf4)
460 		ud->size = 4;
461 	else
462 		return (UTF8_ERROR);
463 	utf8_append(ud, ch);
464 	return (UTF8_MORE);
465 }
466 
467 /* Append character to UTF-8, closing if finished. */
468 enum utf8_state
utf8_append(struct utf8_data * ud,u_char ch)469 utf8_append(struct utf8_data *ud, u_char ch)
470 {
471 	int	width;
472 
473 	if (ud->have >= ud->size)
474 		fatalx("UTF-8 character overflow");
475 	if (ud->size > sizeof ud->data)
476 		fatalx("UTF-8 character size too large");
477 
478 	if (ud->have != 0 && (ch & 0xc0) != 0x80)
479 		ud->width = 0xff;
480 
481 	ud->data[ud->have++] = ch;
482 	if (ud->have != ud->size)
483 		return (UTF8_MORE);
484 
485 	if (ud->width == 0xff)
486 		return (UTF8_ERROR);
487 	if (utf8_width(ud, &width) != UTF8_DONE)
488 		return (UTF8_ERROR);
489 	ud->width = width;
490 
491 	return (UTF8_DONE);
492 }
493 
494 /*
495  * Encode len characters from src into dst, which is guaranteed to have four
496  * bytes available for each character from src (for \abc or UTF-8) plus space
497  * for \0.
498  */
499 int
utf8_strvis(char * dst,const char * src,size_t len,int flag)500 utf8_strvis(char *dst, const char *src, size_t len, int flag)
501 {
502 	struct utf8_data	 ud;
503 	const char		*start = dst, *end = src + len;
504 	enum utf8_state		 more;
505 	size_t			 i;
506 
507 	while (src < end) {
508 		if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
509 			while (++src < end && more == UTF8_MORE)
510 				more = utf8_append(&ud, *src);
511 			if (more == UTF8_DONE) {
512 				/* UTF-8 character finished. */
513 				for (i = 0; i < ud.size; i++)
514 					*dst++ = ud.data[i];
515 				continue;
516 			}
517 			/* Not a complete, valid UTF-8 character. */
518 			src -= ud.have;
519 		}
520 		if ((flag & VIS_DQ) && src[0] == '$' && src < end - 1) {
521 			if (isalpha((u_char)src[1]) ||
522 			    src[1] == '_' ||
523 			    src[1] == '{')
524 				*dst++ = '\\';
525 			*dst++ = '$';
526 		} else if (src < end - 1)
527 			dst = vis(dst, src[0], flag, src[1]);
528 		else if (src < end)
529 			dst = vis(dst, src[0], flag, '\0');
530 		src++;
531 	}
532 	*dst = '\0';
533 	return (dst - start);
534 }
535 
/*
 * Encode a \0-terminated string like utf8_strvis() but into a newly allocated
 * buffer, which is stored in *dst. Four bytes per source byte are allocated
 * for the encoding and the buffer is then resized to the encoded length plus
 * the \0. Returns the encoded length.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen = strlen(src);
	char	*buf;
	int	 len;

	buf = xreallocarray(NULL, 4, srclen + 1);
	len = utf8_strvis(buf, src, srclen, flag);

	*dst = xrealloc(buf, len + 1);
	return (len);
}
549 
550 /* Same as utf8_strvis but allocate the buffer. */
551 int
utf8_stravisx(char ** dst,const char * src,size_t srclen,int flag)552 utf8_stravisx(char **dst, const char *src, size_t srclen, int flag)
553 {
554 	char	*buf;
555 	int	 len;
556 
557 	buf = xreallocarray(NULL, 4, srclen + 1);
558 	len = utf8_strvis(buf, src, srclen, flag);
559 
560 	*dst = xrealloc(buf, len + 1);
561 	return (len);
562 }
563 
564 /* Does this string contain anything that isn't valid UTF-8? */
565 int
utf8_isvalid(const char * s)566 utf8_isvalid(const char *s)
567 {
568 	struct utf8_data ud;
569 	const char	*end;
570 	enum utf8_state	 more;
571 
572 	end = s + strlen(s);
573 	while (s < end) {
574 		if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
575 			while (++s < end && more == UTF8_MORE)
576 				more = utf8_append(&ud, *s);
577 			if (more == UTF8_DONE)
578 				continue;
579 			return (0);
580 		}
581 		if (*s < 0x20 || *s > 0x7e)
582 			return (0);
583 		s++;
584 	}
585 	return (1);
586 }
587 
588 /*
589  * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
590  * the returned string. Anything not valid printable ASCII or UTF-8 is
591  * stripped.
592  */
593 char *
utf8_sanitize(const char * src)594 utf8_sanitize(const char *src)
595 {
596 	char		*dst = NULL;
597 	size_t		 n = 0;
598 	enum utf8_state	 more;
599 	struct utf8_data ud;
600 	u_int		 i;
601 
602 	while (*src != '\0') {
603 		dst = xreallocarray(dst, n + 1, sizeof *dst);
604 		if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
605 			while (*++src != '\0' && more == UTF8_MORE)
606 				more = utf8_append(&ud, *src);
607 			if (more == UTF8_DONE) {
608 				dst = xreallocarray(dst, n + ud.width,
609 				    sizeof *dst);
610 				for (i = 0; i < ud.width; i++)
611 					dst[n++] = '_';
612 				continue;
613 			}
614 			src -= ud.have;
615 		}
616 		if (*src > 0x1f && *src < 0x7f)
617 			dst[n++] = *src;
618 		else
619 			dst[n++] = '_';
620 		src++;
621 	}
622 	dst = xreallocarray(dst, n + 1, sizeof *dst);
623 	dst[n] = '\0';
624 	return (dst);
625 }
626 
627 /* Get UTF-8 buffer length. */
628 size_t
utf8_strlen(const struct utf8_data * s)629 utf8_strlen(const struct utf8_data *s)
630 {
631 	size_t	i;
632 
633 	for (i = 0; s[i].size != 0; i++)
634 		/* nothing */;
635 	return (i);
636 }
637 
638 /* Get UTF-8 string width. */
639 u_int
utf8_strwidth(const struct utf8_data * s,ssize_t n)640 utf8_strwidth(const struct utf8_data *s, ssize_t n)
641 {
642 	ssize_t	i;
643 	u_int	width = 0;
644 
645 	for (i = 0; s[i].size != 0; i++) {
646 		if (n != -1 && n == i)
647 			break;
648 		width += s[i].width;
649 	}
650 	return (width);
651 }
652 
653 /*
654  * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
655  * Caller frees.
656  */
657 struct utf8_data *
utf8_fromcstr(const char * src)658 utf8_fromcstr(const char *src)
659 {
660 	struct utf8_data	*dst = NULL;
661 	size_t			 n = 0;
662 	enum utf8_state		 more;
663 
664 	while (*src != '\0') {
665 		dst = xreallocarray(dst, n + 1, sizeof *dst);
666 		if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
667 			while (*++src != '\0' && more == UTF8_MORE)
668 				more = utf8_append(&dst[n], *src);
669 			if (more == UTF8_DONE) {
670 				n++;
671 				continue;
672 			}
673 			src -= dst[n].have;
674 		}
675 		utf8_set(&dst[n], *src);
676 		n++;
677 		src++;
678 	}
679 	dst = xreallocarray(dst, n + 1, sizeof *dst);
680 	dst[n].size = 0;
681 	return (dst);
682 }
683 
684 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
685 char *
utf8_tocstr(struct utf8_data * src)686 utf8_tocstr(struct utf8_data *src)
687 {
688 	char	*dst = NULL;
689 	size_t	 n = 0;
690 
691 	for(; src->size != 0; src++) {
692 		dst = xreallocarray(dst, n + src->size, 1);
693 		memcpy(dst + n, src->data, src->size);
694 		n += src->size;
695 	}
696 	dst = xreallocarray(dst, n + 1, 1);
697 	dst[n] = '\0';
698 	return (dst);
699 }
700 
701 /* Get width of UTF-8 string. */
702 u_int
utf8_cstrwidth(const char * s)703 utf8_cstrwidth(const char *s)
704 {
705 	struct utf8_data	tmp;
706 	u_int			width;
707 	enum utf8_state		more;
708 
709 	width = 0;
710 	while (*s != '\0') {
711 		if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
712 			while (*++s != '\0' && more == UTF8_MORE)
713 				more = utf8_append(&tmp, *s);
714 			if (more == UTF8_DONE) {
715 				width += tmp.width;
716 				continue;
717 			}
718 			s -= tmp.have;
719 		}
720 		if (*s > 0x1f && *s != 0x7f)
721 			width++;
722 		s++;
723 	}
724 	return (width);
725 }
726 
727 /* Pad UTF-8 string to width on the left. Caller frees. */
728 char *
utf8_padcstr(const char * s,u_int width)729 utf8_padcstr(const char *s, u_int width)
730 {
731 	size_t	 slen;
732 	char	*out;
733 	u_int	 n, i;
734 
735 	n = utf8_cstrwidth(s);
736 	if (n >= width)
737 		return (xstrdup(s));
738 
739 	slen = strlen(s);
740 	out = xmalloc(slen + 1 + (width - n));
741 	memcpy(out, s, slen);
742 	for (i = n; i < width; i++)
743 		out[slen++] = ' ';
744 	out[slen] = '\0';
745 	return (out);
746 }
747 
748 /* Pad UTF-8 string to width on the right. Caller frees. */
749 char *
utf8_rpadcstr(const char * s,u_int width)750 utf8_rpadcstr(const char *s, u_int width)
751 {
752 	size_t	 slen;
753 	char	*out;
754 	u_int	 n, i;
755 
756 	n = utf8_cstrwidth(s);
757 	if (n >= width)
758 		return (xstrdup(s));
759 
760 	slen = strlen(s);
761 	out = xmalloc(slen + 1 + (width - n));
762 	for (i = 0; i < width - n; i++)
763 		out[i] = ' ';
764 	memcpy(out + i, s, slen);
765 	out[i + slen] = '\0';
766 	return (out);
767 }
768 
769 int
utf8_cstrhas(const char * s,const struct utf8_data * ud)770 utf8_cstrhas(const char *s, const struct utf8_data *ud)
771 {
772 	struct utf8_data	*copy, *loop;
773 	int			 found = 0;
774 
775 	copy = utf8_fromcstr(s);
776 	for (loop = copy; loop->size != 0; loop++) {
777 		if (loop->size != ud->size)
778 			continue;
779 		if (memcmp(loop->data, ud->data, loop->size) == 0) {
780 			found = 1;
781 			break;
782 		}
783 	}
784 	free(copy);
785 
786 	return (found);
787 }
788