1 /*
2  * libtsm - Unicode Handling
3  *
4  * Copyright (c) 2011-2013 David Herrmann <dh.herrmann@gmail.com>
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining
7  * a copy of this software and associated documentation files
8  * (the "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sublicense, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included
15  * in all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
20  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
21  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
22  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
23  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24  */
25 
26 /*
27  * The tsm-utf8-state-machine is based on the wayland-compositor demos:
28  *
29  * Copyright © 2008 Kristian Høgsberg
30  *
31  * Permission to use, copy, modify, distribute, and sell this software and
32  * its documentation for any purpose is hereby granted without fee, provided
33  * that the above copyright notice appear in all copies and that both that
34  * copyright notice and this permission notice appear in supporting
35  * documentation, and that the name of the copyright holders not be used in
36  * advertising or publicity pertaining to distribution of the software
37  * without specific, written prior permission.  The copyright holders make
38  * no representations about the suitability of this software for any
39  * purpose.  It is provided "as is" without express or implied warranty.
40  *
41  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
42  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
43  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
44  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
45  * RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
46  * CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
47  * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
48  */
49 
50 /*
51  * Unicode Helpers
52  * This implements several helpers for Unicode/UTF8/UCS4 input and output. See
53  * below for comments on each helper.
54  */
55 
56 #include <errno.h>
57 #include <inttypes.h>
58 #include <stdlib.h>
59 #include <string.h>
60 #include <stdint.h>
61 #include <stdbool.h>
62 #include "wcwidth.h"
63 #include "arcan_shmif.h"
64 #include "arcan_tui.h"
65 #include "arcan_tuisym.h"
66 #include "libtsm.h"
67 #include "libtsm_int.h"
68 #include "shl_array.h"
69 #include "shl_htable.h"
70 
71 /*
72  * Unicode Symbol Handling
73  * The main goal of the tsm_symbol_* functions is to provide a datatype which
74  * can contain the representation of any printable character. This includes all
75  * basic Unicode characters but also combined characters.
76  * To avoid all the memory management we still represent a character as a single
77  * integer value (tsm_symbol_t) but internally we allocate a string which is
78  * represented by this value.
79  *
80  * A tsm_symbol_t is an integer which represents a single character point.
81  * For most Unicode characters this is simply the UCS4 representation. In fact,
82  * every UCS4 characters is a valid tsm_symbol_t object.
83  * However, Unicode standard allows combining marks. Therefore, some characters
84  * consists of more than one Unicode character.
85  * A global symbol-table provides all those combined characters as single
86  * integers. You simply create a valid base character and append your combining
87  * marks and the table will return a new valid tsm_symbol_t. It is no longer
88  * a valid UCS4 value, though. But no memory management is needed as all
89  * tsm_symbol_t objects are simple integers.
90  *
91  * The symbol table contains two-way
92  * references. The Hash Table contains all the symbols with the symbol ucs4
93  * string as key and the symbol ID as value.
94  * The index array contains the symbol ID as key and a pointer to the ucs4
95  * string as value. But the hash table owns the ucs4 string.
96  * This allows fast implementations of *_get() and *_append() without long
97  * search intervals.
98  *
99  * When creating a new symbol, we simply return the UCS4 value as new symbol. We
100  * do not add it to our symbol table as it is only one character. However, if a
101  * character is appended to an existing symbol, we create a new ucs4 string and
102  * push the new symbol into the symbol table.
103  */
104 
105 const tsm_symbol_t tsm_symbol_default = 0;
106 
107 struct tsm_symbol_table {
108 	unsigned long ref;
109 	uint32_t next_id;
110 	struct shl_array *index;
111 	struct shl_htable symbols;
112 };
113 
hash_ucs4(const void * key,void * priv)114 static size_t hash_ucs4(const void *key, void *priv)
115 {
116 	size_t i, val = 5381;
117 	const uint32_t *ucs4 = key;
118 
119 	for (i = 0; ucs4[i] <= TSM_UCS4_MAX; ++i)
120 		val = val * 33 + ucs4[i];
121 
122 	return val;
123 }
124 
cmp_ucs4(const void * a,const void * b)125 static bool cmp_ucs4(const void *a, const void *b)
126 {
127 	size_t i;
128 	const uint32_t *v1, *v2;
129 
130 	v1 = a;
131 	v2 = b;
132 
133 	for (i = 0; ; ++i) {
134 		if (v1[i] > TSM_UCS4_MAX && v2[i] > TSM_UCS4_MAX)
135 			return true;
136 		if (v1[i] != v2[i])
137 			return false;
138 	}
139 }
140 
free_ucs4(void * elem,void * priv)141 static void free_ucs4(void *elem, void *priv)
142 {
143 	uint32_t *v = elem;
144 
145 	/* key is prefix with actual value so pass correct pointer */
146 	free(--v);
147 }
148 
tsm_symbol_table_new(struct tsm_symbol_table ** out)149 int tsm_symbol_table_new(struct tsm_symbol_table **out)
150 {
151 	struct tsm_symbol_table *tbl;
152 	int ret;
153 	static const uint32_t *val = NULL; /* we need a valid lvalue */
154 
155 	if (!out)
156 		return -EINVAL;
157 
158 	tbl = malloc(sizeof(*tbl));
159 	if (!tbl)
160 		return -ENOMEM;
161 	memset(tbl, 0, sizeof(*tbl));
162 	tbl->ref = 1;
163 	tbl->next_id = TSM_UCS4_MAX + 2;
164 	shl_htable_init(&tbl->symbols, cmp_ucs4, hash_ucs4, NULL);
165 
166 	ret = shl_array_new(&tbl->index, sizeof(uint32_t*), 4);
167 	if (ret)
168 		goto err_free;
169 
170 	/* first entry is not used so add dummy */
171 	shl_array_push(tbl->index, &val);
172 
173 	*out = tbl;
174 	return 0;
175 
176 err_free:
177 	free(tbl);
178 	return ret;
179 }
180 
tsm_symbol_table_ref(struct tsm_symbol_table * tbl)181 void tsm_symbol_table_ref(struct tsm_symbol_table *tbl)
182 {
183 	if (!tbl || !tbl->ref)
184 		return;
185 
186 	++tbl->ref;
187 }
188 
tsm_symbol_table_unref(struct tsm_symbol_table * tbl)189 void tsm_symbol_table_unref(struct tsm_symbol_table *tbl)
190 {
191 	if (!tbl || !tbl->ref || --tbl->ref)
192 		return;
193 
194 	shl_htable_clear(&tbl->symbols, free_ucs4, NULL);
195 	shl_array_free(tbl->index);
196 	free(tbl);
197 }
198 
tsm_symbol_make(uint32_t ucs4)199 tsm_symbol_t tsm_symbol_make(uint32_t ucs4)
200 {
201 	if (ucs4 > TSM_UCS4_MAX)
202 		return 0;
203 	else
204 		return ucs4;
205 }
206 
207 /*
208  * This decomposes a symbol into a ucs4 string and a size value. If \sym is a
209  * valid UCS4 character, this returns a pointer to \sym and writes 1 into \size.
210  * Therefore, the returned value may get destroyed if your \sym argument gets
211  * destroyed.
212  * If \sym is a composed ucs4 string, then the returned value points into the
213  * hash table of the symbol table and lives as long as the symbol table does.
214  *
215  * This always returns a valid value. If an error happens, the default character
216  * is returned. If \size is NULL, then the size value is omitted.
217  */
tsm_symbol_get(struct tsm_symbol_table * tbl,tsm_symbol_t * sym,size_t * size)218 const uint32_t *tsm_symbol_get(struct tsm_symbol_table *tbl,
219 			       tsm_symbol_t *sym, size_t *size)
220 {
221 	uint32_t *ucs4, idx;
222 
223 	if (*sym <= TSM_UCS4_MAX) {
224 		if (size)
225 			*size = 1;
226 		return sym;
227 	}
228 
229 	if (!tbl)
230 		return sym;
231 
232 	idx = *sym - (TSM_UCS4_MAX + 1);
233 	if (idx >= shl_array_get_length(tbl->index))
234 		ucs4 = NULL;
235 	else
236 		ucs4 = *SHL_ARRAY_AT(tbl->index, uint32_t*, idx);
237 
238 	if (!ucs4) {
239 		if (size)
240 			*size = 1;
241 		return &tsm_symbol_default;
242 	}
243 
244 	if (size) {
245 		*size = 0;
246 		while (ucs4[*size] <= TSM_UCS4_MAX)
247 			++*size;
248 	}
249 
250 	return ucs4;
251 }
252 
tsm_symbol_append(struct tsm_symbol_table * tbl,tsm_symbol_t sym,uint32_t ucs4)253 tsm_symbol_t tsm_symbol_append(struct tsm_symbol_table *tbl,
254 			       tsm_symbol_t sym, uint32_t ucs4)
255 {
256 	uint32_t buf[TSM_UCS4_MAXLEN + 1], nsym, *nval;
257 	const uint32_t *ptr;
258 	size_t s;
259 	bool res;
260 	int ret;
261 
262 	if (!tbl)
263 		return sym;
264 
265 	if (ucs4 > TSM_UCS4_MAX)
266 		return sym;
267 
268 	ptr = tsm_symbol_get(tbl, &sym, &s);
269 	if (s >= TSM_UCS4_MAXLEN)
270 		return sym;
271 
272 	memcpy(buf, ptr, s * sizeof(uint32_t));
273 	buf[s++] = ucs4;
274 	buf[s++] = TSM_UCS4_MAX + 1;
275 
276 	res = shl_htable_lookup(&tbl->symbols, buf, hash_ucs4(buf, NULL),
277 				(void**)&nval);
278 	if (res) {
279 		/* key is prefixed with actual value */
280 		return *--nval;
281 	}
282 
283 	/* We save the key in nval and prefix it with the new ID. Note that
284 	 * the prefix is hidden, we actually store "++nval" in the htable. */
285 	nval = malloc(sizeof(uint32_t) * (s + 1));
286 	if (!nval)
287 		return sym;
288 
289 	++nval;
290 	memcpy(nval, buf, s * sizeof(uint32_t));
291 
292 	nsym = tbl->next_id + 1;
293 	/* Out of IDs; we actually have 2 Billion IDs so this seems
294 	 * very unlikely but lets be safe here */
295 	if (nsym <= tbl->next_id++)
296 		goto err_id;
297 
298 	/* store ID hidden before the key */
299 	*(nval - 1) = nsym;
300 
301 	ret = shl_htable_insert(&tbl->symbols, nval, hash_ucs4(nval, NULL));
302 	if (ret)
303 		goto err_id;
304 
305 	ret = shl_array_push(tbl->index, &nval);
306 	if (ret)
307 		goto err_symbol;
308 
309 	return nsym;
310 
311 err_symbol:
312 	shl_htable_remove(&tbl->symbols, nval, hash_ucs4(nval, NULL), NULL);
313 err_id:
314 	--tbl->next_id;
315 	free(nval);
316 	return sym;
317 }
318 
tsm_symbol_get_width(struct tsm_symbol_table * tbl,tsm_symbol_t sym)319 unsigned int tsm_symbol_get_width(struct tsm_symbol_table *tbl,
320 				  tsm_symbol_t sym)
321 {
322 	const uint32_t *ch;
323 	size_t len;
324 
325 	if (!tbl)
326 		return 0;
327 
328 	ch = tsm_symbol_get(tbl, &sym, &len);
329 	if (len == 0)
330 		return 0;
331 
332 	return tsm_ucs4_get_width(*ch);
333 }
334 
335 /*
336  * Convert UCS4 character to UTF-8. This creates one of:
337  *   0xxxxxxx
338  *   110xxxxx 10xxxxxx
339  *   1110xxxx 10xxxxxx 10xxxxxx
340  *   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
341  * This is based on the same function from "terminology" from the Enlightenment
342  * project. See COPYING for more information.
343  *
344  * @txt must point to a 4 byte-buffer. A number between 0 and 4 is returned and
345  * indicates how long the written UTF8 string is.
346  *
347  * Please note @g is a real UCS4 code and not a tsm_symbol_t object!
348  *
349  * Unicode symbols between 0xD800 and 0xDFFF are not assigned and reserved for
350  * UTF16 compatibility. It is an error to encode them. Same applies to numbers
351  * greater than 0x10FFFF, the range 0xFDD0-0xFDEF and codepoints ending with
352  * 0xFFFF or 0xFFFE.
353  */
354 
355 SHL_EXPORT
tsm_ucs4_get_width(uint32_t ucs4)356 unsigned int tsm_ucs4_get_width(uint32_t ucs4)
357 {
358 	int ret;
359 
360 	ret = mk_wcwidth(ucs4);
361 	if (ret <= 0)
362 		return 0;
363 
364 	return ret;
365 }
366 
367 SHL_EXPORT
tsm_ucs4_to_utf8(uint32_t g,char * txt)368 size_t tsm_ucs4_to_utf8(uint32_t g, char *txt)
369 {
370 	if (g >= 0xd800 && g <= 0xdfff)
371 		return 0;
372 	if (g > 0x10ffff || (g & 0xffff) == 0xffff || (g & 0xffff) == 0xfffe)
373 		return 0;
374 	if (g >= 0xfdd0 && g <= 0xfdef)
375 		return 0;
376 
377 	if (g < (1 << 7)) {
378 		txt[0] = g & 0x7f;
379 		return 1;
380 	} else if (g < (1 << (5 + 6))) {
381 		txt[0] = 0xc0 | ((g >> 6) & 0x1f);
382 		txt[1] = 0x80 | ((g     ) & 0x3f);
383 		return 2;
384 	} else if (g < (1 << (4 + 6 + 6))) {
385 		txt[0] = 0xe0 | ((g >> 12) & 0x0f);
386 		txt[1] = 0x80 | ((g >>  6) & 0x3f);
387 		txt[2] = 0x80 | ((g      ) & 0x3f);
388 		return 3;
389 	} else if (g < (1 << (3 + 6 + 6 + 6))) {
390 		txt[0] = 0xf0 | ((g >> 18) & 0x07);
391 		txt[1] = 0x80 | ((g >> 12) & 0x3f);
392 		txt[2] = 0x80 | ((g >>  6) & 0x3f);
393 		txt[3] = 0x80 | ((g      ) & 0x3f);
394 		return 4;
395 	} else {
396 		return 0;
397 	}
398 }
399 
400 SHL_EXPORT
tsm_ucs4_to_utf8_alloc(const uint32_t * ucs4,size_t len,size_t * len_out)401 char *tsm_ucs4_to_utf8_alloc(const uint32_t *ucs4, size_t len, size_t *len_out)
402 {
403 	char *val;
404 	size_t i, pos;
405 
406 	val = malloc(4 * len);
407 	if (!val)
408 		return NULL;
409 
410 	pos = 0;
411 	for (i = 0; i < len; ++i)
412 		pos += tsm_ucs4_to_utf8(ucs4[i], &val[pos]);
413 
414 	if (!pos) {
415 		free(val);
416 		return NULL;
417 	}
418 
419 	if (len_out)
420 		*len_out = pos;
421 	return val;
422 }
423 
424 /*
425  * UTF8 State Machine
426  * This state machine parses UTF8 and converts it into a stream of Unicode
427  * characters (UCS4 values). A state-machine is represented by a
428  * "struct tsm_utf8_mach" object. It has no global state and all functions are
429  * re-entrant if called with different state-machine objects.
430  *
431  * tsm_utf8_mach_new(): This creates a new state-machine and resets it to its
432  * initial state. Returns 0 on success.
433  *
434  * tsm_uft8_mach_free(): This destroys a state-machine and frees all internally
435  * allocated memory.
436  *
437  * tsm_utf8_mach_reset(): Reset a given state-machine to its initial state. This
438  * is the same state the machine is in after it got created.
439  *
440  * tsm_uft8_mach_feed(): Feed one byte of the UTF8 input stream into the
441  * state-machine. This function returns the new state of the state-machine after
442  * this character has been parsed. If it is TSM_UTF8_ACCEPT or TSM_UTF8_REJECT,
443  * then there is a pending UCS4 character that you should retrieve via
444  * tsm_utf8_mach_get(). If it is TSM_UTF8_ACCEPT, then a character was
445  * successfully parsed. If it is TSM_UTF8_REJECT, the input was invalid UTF8 and
446  * some error recovery was tried or a replacement character was choosen. All
447  * other states mean that the machine needs more input to parse the stream.
448  *
449  * tsm_utf8_mach_get(): Returns the last parsed character. It has no effect on
450  * the state machine so you can call it multiple times.
451  *
452  * Internally, we use TSM_UTF8_START whenever the state-machine is reset. This
453  * can be used to ignore the last read input or to simply reset the machine.
454  * TSM_UTF8_EXPECT* is used to remember how many bytes are still to be read to
455  * get a full UTF8 sequence.
456  * If an error occurs during reading, we go to state TSM_UTF8_REJECT and the
457  * user will read a replacement character. If further errors occur, we go to
458  * state TSM_UTF8_START to avoid printing multiple replacement characters for a
459  * single misinterpreted UTF8 sequence. However, under some circumstances it may
460  * happen that we stay in TSM_UTF8_REJECT and a next replacement character is
461  * returned.
462  * It is difficult to decide how to interpret wrong input but this machine seems
463  * to be quite good at deciding what to do. Generally, we prefer discarding or
464  * replacing input instead of trying to decipher ASCII values from the invalid
465  * data. This guarantees that we do not send wrong values to the terminal
466  * emulator. Some might argue that an ASCII fallback would be better. However,
467  * this means that we might send very weird escape-sequences to the VTE layer.
468  * Especially with C1 codes applications can really break many terminal features
469  * so we avoid any non-ASCII+non-UTF8 input to prevent this.
470  */
471 
472 struct tsm_utf8_mach {
473 	int state;
474 	uint32_t ch;
475 };
476 
tsm_utf8_mach_new(struct tsm_utf8_mach ** out)477 int tsm_utf8_mach_new(struct tsm_utf8_mach **out)
478 {
479 	struct tsm_utf8_mach *mach;
480 
481 	if (!out)
482 		return -EINVAL;
483 
484 	mach = malloc(sizeof(*mach));
485 	if (!mach)
486 		return -ENOMEM;
487 
488 	memset(mach, 0, sizeof(*mach));
489 	mach->state = TSM_UTF8_START;
490 
491 	*out = mach;
492 	return 0;
493 }
494 
tsm_utf8_mach_free(struct tsm_utf8_mach * mach)495 void tsm_utf8_mach_free(struct tsm_utf8_mach *mach)
496 {
497 	if (!mach)
498 		return;
499 
500 	free(mach);
501 }
502 
tsm_utf8_mach_feed(struct tsm_utf8_mach * mach,char ci)503 int tsm_utf8_mach_feed(struct tsm_utf8_mach *mach, char ci)
504 {
505 	uint32_t c;
506 
507 	if (!mach)
508 		return TSM_UTF8_START;
509 
510 	c = ci;
511 
512 	switch (mach->state) {
513 	case TSM_UTF8_START:
514 	case TSM_UTF8_ACCEPT:
515 	case TSM_UTF8_REJECT:
516 		if (c == 0xC0 || c == 0xC1) {
517 			/* overlong encoding for ASCII, reject */
518 			mach->state = TSM_UTF8_REJECT;
519 		} else if ((c & 0x80) == 0) {
520 			/* single byte, accept */
521 			mach->ch = c;
522 			mach->state = TSM_UTF8_ACCEPT;
523 		} else if ((c & 0xC0) == 0x80) {
524 			/* parser out of sync, ignore byte */
525 			mach->state = TSM_UTF8_START;
526 		} else if ((c & 0xE0) == 0xC0) {
527 			/* start of two byte sequence */
528 			mach->ch = (c & 0x1F) << 6;
529 			mach->state = TSM_UTF8_EXPECT1;
530 		} else if ((c & 0xF0) == 0xE0) {
531 			/* start of three byte sequence */
532 			mach->ch = (c & 0x0F) << 12;
533 			mach->state = TSM_UTF8_EXPECT2;
534 		} else if ((c & 0xF8) == 0xF0) {
535 			/* start of four byte sequence */
536 			mach->ch = (c & 0x07) << 18;
537 			mach->state = TSM_UTF8_EXPECT3;
538 		} else {
539 			/* overlong encoding, reject */
540 			mach->state = TSM_UTF8_REJECT;
541 		}
542 		break;
543 	case TSM_UTF8_EXPECT3:
544 		mach->ch |= (c & 0x3F) << 12;
545 		if ((c & 0xC0) == 0x80)
546 			mach->state = TSM_UTF8_EXPECT2;
547 		else
548 			mach->state = TSM_UTF8_REJECT;
549 		break;
550 	case TSM_UTF8_EXPECT2:
551 		mach->ch |= (c & 0x3F) << 6;
552 		if ((c & 0xC0) == 0x80)
553 			mach->state = TSM_UTF8_EXPECT1;
554 		else
555 			mach->state = TSM_UTF8_REJECT;
556 		break;
557 	case TSM_UTF8_EXPECT1:
558 		mach->ch |= c & 0x3F;
559 		if ((c & 0xC0) == 0x80)
560 			mach->state = TSM_UTF8_ACCEPT;
561 		else
562 			mach->state = TSM_UTF8_REJECT;
563 		break;
564 	default:
565 		mach->state = TSM_UTF8_REJECT;
566 		break;
567 	}
568 
569 	return mach->state;
570 }
571 
tsm_utf8_mach_get(struct tsm_utf8_mach * mach)572 uint32_t tsm_utf8_mach_get(struct tsm_utf8_mach *mach)
573 {
574 	if (!mach || mach->state != TSM_UTF8_ACCEPT)
575 		return TSM_UCS4_REPLACEMENT;
576 
577 	return mach->ch;
578 }
579 
tsm_utf8_mach_reset(struct tsm_utf8_mach * mach)580 void tsm_utf8_mach_reset(struct tsm_utf8_mach *mach)
581 {
582 	if (!mach)
583 		return;
584 
585 	mach->state = TSM_UTF8_START;
586 }
587