1 /*
2 * libtsm - Unicode Handling
3 *
4 * Copyright (c) 2011-2013 David Herrmann <dh.herrmann@gmail.com>
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files
8 * (the "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
20 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
21 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
22 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
23 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 */
25
26 /*
27 * The tsm-utf8-state-machine is based on the wayland-compositor demos:
28 *
29 * Copyright © 2008 Kristian Høgsberg
30 *
31 * Permission to use, copy, modify, distribute, and sell this software and
32 * its documentation for any purpose is hereby granted without fee, provided
33 * that the above copyright notice appear in all copies and that both that
34 * copyright notice and this permission notice appear in supporting
35 * documentation, and that the name of the copyright holders not be used in
36 * advertising or publicity pertaining to distribution of the software
37 * without specific, written prior permission. The copyright holders make
38 * no representations about the suitability of this software for any
39 * purpose. It is provided "as is" without express or implied warranty.
40 *
41 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
42 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
43 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
44 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
45 * RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
46 * CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
47 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
48 */
49
50 /*
51 * Unicode Helpers
52 * This implements several helpers for Unicode/UTF8/UCS4 input and output. See
53 * below for comments on each helper.
54 */
55
56 #include <errno.h>
57 #include <inttypes.h>
58 #include <stdlib.h>
59 #include <string.h>
60 #include <stdint.h>
61 #include <stdbool.h>
62 #include "wcwidth.h"
63 #include "arcan_shmif.h"
64 #include "arcan_tui.h"
65 #include "arcan_tuisym.h"
66 #include "libtsm.h"
67 #include "libtsm_int.h"
68 #include "shl_array.h"
69 #include "shl_htable.h"
70
71 /*
72 * Unicode Symbol Handling
73 * The main goal of the tsm_symbol_* functions is to provide a datatype which
74 * can contain the representation of any printable character. This includes all
75 * basic Unicode characters but also combined characters.
76 * To avoid all the memory management we still represent a character as a single
77 * integer value (tsm_symbol_t) but internally we allocate a string which is
78 * represented by this value.
79 *
80 * A tsm_symbol_t is an integer which represents a single character point.
81 * For most Unicode characters this is simply the UCS4 representation. In fact,
82 * every UCS4 characters is a valid tsm_symbol_t object.
83 * However, Unicode standard allows combining marks. Therefore, some characters
84 * consists of more than one Unicode character.
85 * A global symbol-table provides all those combined characters as single
86 * integers. You simply create a valid base character and append your combining
87 * marks and the table will return a new valid tsm_symbol_t. It is no longer
88 * a valid UCS4 value, though. But no memory management is needed as all
89 * tsm_symbol_t objects are simple integers.
90 *
91 * The symbol table contains two-way
92 * references. The Hash Table contains all the symbols with the symbol ucs4
93 * string as key and the symbol ID as value.
94 * The index array contains the symbol ID as key and a pointer to the ucs4
95 * string as value. But the hash table owns the ucs4 string.
96 * This allows fast implementations of *_get() and *_append() without long
97 * search intervals.
98 *
99 * When creating a new symbol, we simply return the UCS4 value as new symbol. We
100 * do not add it to our symbol table as it is only one character. However, if a
101 * character is appended to an existing symbol, we create a new ucs4 string and
102 * push the new symbol into the symbol table.
103 */
104
105 const tsm_symbol_t tsm_symbol_default = 0;
106
107 struct tsm_symbol_table {
108 unsigned long ref;
109 uint32_t next_id;
110 struct shl_array *index;
111 struct shl_htable symbols;
112 };
113
hash_ucs4(const void * key,void * priv)114 static size_t hash_ucs4(const void *key, void *priv)
115 {
116 size_t i, val = 5381;
117 const uint32_t *ucs4 = key;
118
119 for (i = 0; ucs4[i] <= TSM_UCS4_MAX; ++i)
120 val = val * 33 + ucs4[i];
121
122 return val;
123 }
124
cmp_ucs4(const void * a,const void * b)125 static bool cmp_ucs4(const void *a, const void *b)
126 {
127 size_t i;
128 const uint32_t *v1, *v2;
129
130 v1 = a;
131 v2 = b;
132
133 for (i = 0; ; ++i) {
134 if (v1[i] > TSM_UCS4_MAX && v2[i] > TSM_UCS4_MAX)
135 return true;
136 if (v1[i] != v2[i])
137 return false;
138 }
139 }
140
free_ucs4(void * elem,void * priv)141 static void free_ucs4(void *elem, void *priv)
142 {
143 uint32_t *v = elem;
144
145 /* key is prefix with actual value so pass correct pointer */
146 free(--v);
147 }
148
tsm_symbol_table_new(struct tsm_symbol_table ** out)149 int tsm_symbol_table_new(struct tsm_symbol_table **out)
150 {
151 struct tsm_symbol_table *tbl;
152 int ret;
153 static const uint32_t *val = NULL; /* we need a valid lvalue */
154
155 if (!out)
156 return -EINVAL;
157
158 tbl = malloc(sizeof(*tbl));
159 if (!tbl)
160 return -ENOMEM;
161 memset(tbl, 0, sizeof(*tbl));
162 tbl->ref = 1;
163 tbl->next_id = TSM_UCS4_MAX + 2;
164 shl_htable_init(&tbl->symbols, cmp_ucs4, hash_ucs4, NULL);
165
166 ret = shl_array_new(&tbl->index, sizeof(uint32_t*), 4);
167 if (ret)
168 goto err_free;
169
170 /* first entry is not used so add dummy */
171 shl_array_push(tbl->index, &val);
172
173 *out = tbl;
174 return 0;
175
176 err_free:
177 free(tbl);
178 return ret;
179 }
180
tsm_symbol_table_ref(struct tsm_symbol_table * tbl)181 void tsm_symbol_table_ref(struct tsm_symbol_table *tbl)
182 {
183 if (!tbl || !tbl->ref)
184 return;
185
186 ++tbl->ref;
187 }
188
tsm_symbol_table_unref(struct tsm_symbol_table * tbl)189 void tsm_symbol_table_unref(struct tsm_symbol_table *tbl)
190 {
191 if (!tbl || !tbl->ref || --tbl->ref)
192 return;
193
194 shl_htable_clear(&tbl->symbols, free_ucs4, NULL);
195 shl_array_free(tbl->index);
196 free(tbl);
197 }
198
tsm_symbol_make(uint32_t ucs4)199 tsm_symbol_t tsm_symbol_make(uint32_t ucs4)
200 {
201 if (ucs4 > TSM_UCS4_MAX)
202 return 0;
203 else
204 return ucs4;
205 }
206
207 /*
208 * This decomposes a symbol into a ucs4 string and a size value. If \sym is a
209 * valid UCS4 character, this returns a pointer to \sym and writes 1 into \size.
210 * Therefore, the returned value may get destroyed if your \sym argument gets
211 * destroyed.
212 * If \sym is a composed ucs4 string, then the returned value points into the
213 * hash table of the symbol table and lives as long as the symbol table does.
214 *
215 * This always returns a valid value. If an error happens, the default character
216 * is returned. If \size is NULL, then the size value is omitted.
217 */
tsm_symbol_get(struct tsm_symbol_table * tbl,tsm_symbol_t * sym,size_t * size)218 const uint32_t *tsm_symbol_get(struct tsm_symbol_table *tbl,
219 tsm_symbol_t *sym, size_t *size)
220 {
221 uint32_t *ucs4, idx;
222
223 if (*sym <= TSM_UCS4_MAX) {
224 if (size)
225 *size = 1;
226 return sym;
227 }
228
229 if (!tbl)
230 return sym;
231
232 idx = *sym - (TSM_UCS4_MAX + 1);
233 if (idx >= shl_array_get_length(tbl->index))
234 ucs4 = NULL;
235 else
236 ucs4 = *SHL_ARRAY_AT(tbl->index, uint32_t*, idx);
237
238 if (!ucs4) {
239 if (size)
240 *size = 1;
241 return &tsm_symbol_default;
242 }
243
244 if (size) {
245 *size = 0;
246 while (ucs4[*size] <= TSM_UCS4_MAX)
247 ++*size;
248 }
249
250 return ucs4;
251 }
252
tsm_symbol_append(struct tsm_symbol_table * tbl,tsm_symbol_t sym,uint32_t ucs4)253 tsm_symbol_t tsm_symbol_append(struct tsm_symbol_table *tbl,
254 tsm_symbol_t sym, uint32_t ucs4)
255 {
256 uint32_t buf[TSM_UCS4_MAXLEN + 1], nsym, *nval;
257 const uint32_t *ptr;
258 size_t s;
259 bool res;
260 int ret;
261
262 if (!tbl)
263 return sym;
264
265 if (ucs4 > TSM_UCS4_MAX)
266 return sym;
267
268 ptr = tsm_symbol_get(tbl, &sym, &s);
269 if (s >= TSM_UCS4_MAXLEN)
270 return sym;
271
272 memcpy(buf, ptr, s * sizeof(uint32_t));
273 buf[s++] = ucs4;
274 buf[s++] = TSM_UCS4_MAX + 1;
275
276 res = shl_htable_lookup(&tbl->symbols, buf, hash_ucs4(buf, NULL),
277 (void**)&nval);
278 if (res) {
279 /* key is prefixed with actual value */
280 return *--nval;
281 }
282
283 /* We save the key in nval and prefix it with the new ID. Note that
284 * the prefix is hidden, we actually store "++nval" in the htable. */
285 nval = malloc(sizeof(uint32_t) * (s + 1));
286 if (!nval)
287 return sym;
288
289 ++nval;
290 memcpy(nval, buf, s * sizeof(uint32_t));
291
292 nsym = tbl->next_id + 1;
293 /* Out of IDs; we actually have 2 Billion IDs so this seems
294 * very unlikely but lets be safe here */
295 if (nsym <= tbl->next_id++)
296 goto err_id;
297
298 /* store ID hidden before the key */
299 *(nval - 1) = nsym;
300
301 ret = shl_htable_insert(&tbl->symbols, nval, hash_ucs4(nval, NULL));
302 if (ret)
303 goto err_id;
304
305 ret = shl_array_push(tbl->index, &nval);
306 if (ret)
307 goto err_symbol;
308
309 return nsym;
310
311 err_symbol:
312 shl_htable_remove(&tbl->symbols, nval, hash_ucs4(nval, NULL), NULL);
313 err_id:
314 --tbl->next_id;
315 free(nval);
316 return sym;
317 }
318
tsm_symbol_get_width(struct tsm_symbol_table * tbl,tsm_symbol_t sym)319 unsigned int tsm_symbol_get_width(struct tsm_symbol_table *tbl,
320 tsm_symbol_t sym)
321 {
322 const uint32_t *ch;
323 size_t len;
324
325 if (!tbl)
326 return 0;
327
328 ch = tsm_symbol_get(tbl, &sym, &len);
329 if (len == 0)
330 return 0;
331
332 return tsm_ucs4_get_width(*ch);
333 }
334
335 /*
336 * Convert UCS4 character to UTF-8. This creates one of:
337 * 0xxxxxxx
338 * 110xxxxx 10xxxxxx
339 * 1110xxxx 10xxxxxx 10xxxxxx
340 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
341 * This is based on the same function from "terminology" from the Enlightenment
342 * project. See COPYING for more information.
343 *
344 * @txt must point to a 4 byte-buffer. A number between 0 and 4 is returned and
345 * indicates how long the written UTF8 string is.
346 *
347 * Please note @g is a real UCS4 code and not a tsm_symbol_t object!
348 *
349 * Unicode symbols between 0xD800 and 0xDFFF are not assigned and reserved for
350 * UTF16 compatibility. It is an error to encode them. Same applies to numbers
351 * greater than 0x10FFFF, the range 0xFDD0-0xFDEF and codepoints ending with
352 * 0xFFFF or 0xFFFE.
353 */
354
355 SHL_EXPORT
tsm_ucs4_get_width(uint32_t ucs4)356 unsigned int tsm_ucs4_get_width(uint32_t ucs4)
357 {
358 int ret;
359
360 ret = mk_wcwidth(ucs4);
361 if (ret <= 0)
362 return 0;
363
364 return ret;
365 }
366
367 SHL_EXPORT
tsm_ucs4_to_utf8(uint32_t g,char * txt)368 size_t tsm_ucs4_to_utf8(uint32_t g, char *txt)
369 {
370 if (g >= 0xd800 && g <= 0xdfff)
371 return 0;
372 if (g > 0x10ffff || (g & 0xffff) == 0xffff || (g & 0xffff) == 0xfffe)
373 return 0;
374 if (g >= 0xfdd0 && g <= 0xfdef)
375 return 0;
376
377 if (g < (1 << 7)) {
378 txt[0] = g & 0x7f;
379 return 1;
380 } else if (g < (1 << (5 + 6))) {
381 txt[0] = 0xc0 | ((g >> 6) & 0x1f);
382 txt[1] = 0x80 | ((g ) & 0x3f);
383 return 2;
384 } else if (g < (1 << (4 + 6 + 6))) {
385 txt[0] = 0xe0 | ((g >> 12) & 0x0f);
386 txt[1] = 0x80 | ((g >> 6) & 0x3f);
387 txt[2] = 0x80 | ((g ) & 0x3f);
388 return 3;
389 } else if (g < (1 << (3 + 6 + 6 + 6))) {
390 txt[0] = 0xf0 | ((g >> 18) & 0x07);
391 txt[1] = 0x80 | ((g >> 12) & 0x3f);
392 txt[2] = 0x80 | ((g >> 6) & 0x3f);
393 txt[3] = 0x80 | ((g ) & 0x3f);
394 return 4;
395 } else {
396 return 0;
397 }
398 }
399
400 SHL_EXPORT
tsm_ucs4_to_utf8_alloc(const uint32_t * ucs4,size_t len,size_t * len_out)401 char *tsm_ucs4_to_utf8_alloc(const uint32_t *ucs4, size_t len, size_t *len_out)
402 {
403 char *val;
404 size_t i, pos;
405
406 val = malloc(4 * len);
407 if (!val)
408 return NULL;
409
410 pos = 0;
411 for (i = 0; i < len; ++i)
412 pos += tsm_ucs4_to_utf8(ucs4[i], &val[pos]);
413
414 if (!pos) {
415 free(val);
416 return NULL;
417 }
418
419 if (len_out)
420 *len_out = pos;
421 return val;
422 }
423
424 /*
425 * UTF8 State Machine
426 * This state machine parses UTF8 and converts it into a stream of Unicode
427 * characters (UCS4 values). A state-machine is represented by a
428 * "struct tsm_utf8_mach" object. It has no global state and all functions are
429 * re-entrant if called with different state-machine objects.
430 *
431 * tsm_utf8_mach_new(): This creates a new state-machine and resets it to its
432 * initial state. Returns 0 on success.
433 *
434 * tsm_uft8_mach_free(): This destroys a state-machine and frees all internally
435 * allocated memory.
436 *
437 * tsm_utf8_mach_reset(): Reset a given state-machine to its initial state. This
438 * is the same state the machine is in after it got created.
439 *
440 * tsm_uft8_mach_feed(): Feed one byte of the UTF8 input stream into the
441 * state-machine. This function returns the new state of the state-machine after
442 * this character has been parsed. If it is TSM_UTF8_ACCEPT or TSM_UTF8_REJECT,
443 * then there is a pending UCS4 character that you should retrieve via
444 * tsm_utf8_mach_get(). If it is TSM_UTF8_ACCEPT, then a character was
445 * successfully parsed. If it is TSM_UTF8_REJECT, the input was invalid UTF8 and
446 * some error recovery was tried or a replacement character was choosen. All
447 * other states mean that the machine needs more input to parse the stream.
448 *
449 * tsm_utf8_mach_get(): Returns the last parsed character. It has no effect on
450 * the state machine so you can call it multiple times.
451 *
452 * Internally, we use TSM_UTF8_START whenever the state-machine is reset. This
453 * can be used to ignore the last read input or to simply reset the machine.
454 * TSM_UTF8_EXPECT* is used to remember how many bytes are still to be read to
455 * get a full UTF8 sequence.
456 * If an error occurs during reading, we go to state TSM_UTF8_REJECT and the
457 * user will read a replacement character. If further errors occur, we go to
458 * state TSM_UTF8_START to avoid printing multiple replacement characters for a
459 * single misinterpreted UTF8 sequence. However, under some circumstances it may
460 * happen that we stay in TSM_UTF8_REJECT and a next replacement character is
461 * returned.
462 * It is difficult to decide how to interpret wrong input but this machine seems
463 * to be quite good at deciding what to do. Generally, we prefer discarding or
464 * replacing input instead of trying to decipher ASCII values from the invalid
465 * data. This guarantees that we do not send wrong values to the terminal
466 * emulator. Some might argue that an ASCII fallback would be better. However,
467 * this means that we might send very weird escape-sequences to the VTE layer.
468 * Especially with C1 codes applications can really break many terminal features
469 * so we avoid any non-ASCII+non-UTF8 input to prevent this.
470 */
471
472 struct tsm_utf8_mach {
473 int state;
474 uint32_t ch;
475 };
476
tsm_utf8_mach_new(struct tsm_utf8_mach ** out)477 int tsm_utf8_mach_new(struct tsm_utf8_mach **out)
478 {
479 struct tsm_utf8_mach *mach;
480
481 if (!out)
482 return -EINVAL;
483
484 mach = malloc(sizeof(*mach));
485 if (!mach)
486 return -ENOMEM;
487
488 memset(mach, 0, sizeof(*mach));
489 mach->state = TSM_UTF8_START;
490
491 *out = mach;
492 return 0;
493 }
494
tsm_utf8_mach_free(struct tsm_utf8_mach * mach)495 void tsm_utf8_mach_free(struct tsm_utf8_mach *mach)
496 {
497 if (!mach)
498 return;
499
500 free(mach);
501 }
502
tsm_utf8_mach_feed(struct tsm_utf8_mach * mach,char ci)503 int tsm_utf8_mach_feed(struct tsm_utf8_mach *mach, char ci)
504 {
505 uint32_t c;
506
507 if (!mach)
508 return TSM_UTF8_START;
509
510 c = ci;
511
512 switch (mach->state) {
513 case TSM_UTF8_START:
514 case TSM_UTF8_ACCEPT:
515 case TSM_UTF8_REJECT:
516 if (c == 0xC0 || c == 0xC1) {
517 /* overlong encoding for ASCII, reject */
518 mach->state = TSM_UTF8_REJECT;
519 } else if ((c & 0x80) == 0) {
520 /* single byte, accept */
521 mach->ch = c;
522 mach->state = TSM_UTF8_ACCEPT;
523 } else if ((c & 0xC0) == 0x80) {
524 /* parser out of sync, ignore byte */
525 mach->state = TSM_UTF8_START;
526 } else if ((c & 0xE0) == 0xC0) {
527 /* start of two byte sequence */
528 mach->ch = (c & 0x1F) << 6;
529 mach->state = TSM_UTF8_EXPECT1;
530 } else if ((c & 0xF0) == 0xE0) {
531 /* start of three byte sequence */
532 mach->ch = (c & 0x0F) << 12;
533 mach->state = TSM_UTF8_EXPECT2;
534 } else if ((c & 0xF8) == 0xF0) {
535 /* start of four byte sequence */
536 mach->ch = (c & 0x07) << 18;
537 mach->state = TSM_UTF8_EXPECT3;
538 } else {
539 /* overlong encoding, reject */
540 mach->state = TSM_UTF8_REJECT;
541 }
542 break;
543 case TSM_UTF8_EXPECT3:
544 mach->ch |= (c & 0x3F) << 12;
545 if ((c & 0xC0) == 0x80)
546 mach->state = TSM_UTF8_EXPECT2;
547 else
548 mach->state = TSM_UTF8_REJECT;
549 break;
550 case TSM_UTF8_EXPECT2:
551 mach->ch |= (c & 0x3F) << 6;
552 if ((c & 0xC0) == 0x80)
553 mach->state = TSM_UTF8_EXPECT1;
554 else
555 mach->state = TSM_UTF8_REJECT;
556 break;
557 case TSM_UTF8_EXPECT1:
558 mach->ch |= c & 0x3F;
559 if ((c & 0xC0) == 0x80)
560 mach->state = TSM_UTF8_ACCEPT;
561 else
562 mach->state = TSM_UTF8_REJECT;
563 break;
564 default:
565 mach->state = TSM_UTF8_REJECT;
566 break;
567 }
568
569 return mach->state;
570 }
571
tsm_utf8_mach_get(struct tsm_utf8_mach * mach)572 uint32_t tsm_utf8_mach_get(struct tsm_utf8_mach *mach)
573 {
574 if (!mach || mach->state != TSM_UTF8_ACCEPT)
575 return TSM_UCS4_REPLACEMENT;
576
577 return mach->ch;
578 }
579
tsm_utf8_mach_reset(struct tsm_utf8_mach * mach)580 void tsm_utf8_mach_reset(struct tsm_utf8_mach *mach)
581 {
582 if (!mach)
583 return;
584
585 mach->state = TSM_UTF8_START;
586 }
587