1 /* $XTermId: ptydata.c,v 1.150 2020/10/12 18:46:28 tom Exp $ */
2
3 /*
4 * Copyright 1999-2019,2020 by Thomas E. Dickey
5 *
6 * All Rights Reserved
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the
10 * "Software"), to deal in the Software without restriction, including
11 * without limitation the rights to use, copy, modify, merge, publish,
12 * distribute, sublicense, and/or sell copies of the Software, and to
13 * permit persons to whom the Software is furnished to do so, subject to
14 * the following conditions:
15 *
16 * The above copyright notice and this permission notice shall be included
17 * in all copies or substantial portions of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 * IN NO EVENT SHALL THE ABOVE LISTED COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
23 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 * Except as contained in this notice, the name(s) of the above copyright
28 * holders shall not be used in advertising or otherwise to promote the
29 * sale, use or other dealings in this Software without prior written
30 * authorization.
31 */
32
33 #include <data.h>
34
35 #if OPT_WIDE_CHARS
36 #include <menu.h>
37 #include <wcwidth.h>
38 #endif
39
#ifdef TEST_DRIVER
/*
 * Stand-alone test driver: send trace output to stdout using printf.
 *
 * Each macro expands to a single do/while(0) statement.  A bare
 * "if (1) printf p" would silently capture the "else" of an enclosing
 * unbraced "if", e.g.
 *	if (cond) TRACE(("x")); else other();
 * TRACE2 keeps its printf call (under "if (0)") so that the arguments are
 * still compiled, but never printed.
 */
#undef TRACE
#define TRACE(p) do { if (1) printf p; } while (0)
#undef TRACE2
#define TRACE2(p) do { if (0) printf p; } while (0)
#define visibleChars(buf, len) "buffer"
#endif
47
/*
 * E_TEST(err) tests an errno value for "try again".  Where the system
 * defines both EAGAIN and EWOULDBLOCK, both are accepted, because some
 * supposedly POSIX systems are broken and return EWOULDBLOCK when they
 * should return EAGAIN.  Otherwise whichever one is available is used.
 *
 * Note that this macro may evaluate its argument more than once.
 */
#if defined(EAGAIN) && defined(EWOULDBLOCK)
#define E_TEST(err) ((err) == EAGAIN || (err) == EWOULDBLOCK)
#elif defined(EAGAIN)
#define E_TEST(err) ((err) == EAGAIN)
#else
#define E_TEST(err) ((err) == EWOULDBLOCK)
#endif
62
63 #if OPT_WIDE_CHARS
64 /*
65 * Convert the 8-bit codes in data->buffer[] into Unicode in data->utf_data.
66 * The number of bytes converted will be nonzero iff there is data.
67 */
68 Bool
decodeUtf8(TScreen * screen,PtyData * data)69 decodeUtf8(TScreen *screen, PtyData *data)
70 {
71 int i;
72 int length = (int) (data->last - data->next);
73 int utf_count = 0;
74 unsigned utf_char = 0;
75
76 data->utf_size = 0;
77 for (i = 0; i < length; i++) {
78 unsigned c = data->next[i];
79
80 /* Combine UTF-8 into Unicode */
81 if (c < 0x80) {
82 /* We received an ASCII character */
83 if (utf_count > 0) {
84 data->utf_data = UCS_REPL; /* prev. sequence incomplete */
85 data->utf_size = i;
86 } else {
87 data->utf_data = (IChar) c;
88 data->utf_size = 1;
89 }
90 break;
91 } else if (screen->vt100_graphics
92 && (c < 0x100)
93 && (utf_count == 0)
94 && screen->gsets[(int) screen->curgr] != nrc_ASCII) {
95 data->utf_data = (IChar) c;
96 data->utf_size = 1;
97 break;
98 } else if (c < 0xc0) {
99 /* We received a continuation byte */
100 if (utf_count < 1) {
101 /*
102 * We received a continuation byte before receiving a sequence
103 * state. Or an attempt to use a C1 control string. Either
104 * way, it is mapped to the replacement character, unless
105 * allowed by optional feature.
106 */
107 data->utf_data = (IChar) (screen->c1_printable ? c : UCS_REPL);
108 data->utf_size = (i + 1);
109 break;
110 } else if (screen->utf8_weblike
111 && (utf_count == 3
112 && utf_char == 0x04
113 && c >= 0x90)) {
114 /* The encoding would form a code point beyond U+10FFFF. */
115 data->utf_size = i;
116 data->utf_data = UCS_REPL;
117 break;
118 } else if (screen->utf8_weblike
119 && (utf_count == 2
120 && utf_char == 0x0d
121 && c >= 0xa0)) {
122 /* The encoding would form a surrogate code point. */
123 data->utf_size = i;
124 data->utf_data = UCS_REPL;
125 break;
126 } else {
127 /* Check for overlong UTF-8 sequences for which a shorter
128 * encoding would exist and replace them with UCS_REPL.
129 * An overlong UTF-8 sequence can have any of the following
130 * forms:
131 * 1100000x 10xxxxxx
132 * 11100000 100xxxxx 10xxxxxx
133 * 11110000 1000xxxx 10xxxxxx 10xxxxxx
134 * 11111000 10000xxx 10xxxxxx 10xxxxxx 10xxxxxx
135 * 11111100 100000xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
136 */
137 if (!utf_char && !((c & 0x7f) >> (7 - utf_count))) {
138 if (screen->utf8_weblike) {
139 /* overlong sequence continued */
140 data->utf_data = UCS_REPL;
141 data->utf_size = i;
142 break;
143 } else {
144 utf_char = UCS_REPL;
145 }
146 }
147 utf_char <<= 6;
148 utf_char |= (c & 0x3f);
149 if ((utf_char >= 0xd800 &&
150 utf_char <= 0xdfff) ||
151 (utf_char == 0xfffe) ||
152 (utf_char == HIDDEN_CHAR)) {
153 utf_char = UCS_REPL;
154 }
155 utf_count--;
156 if (utf_count == 0) {
157 #if !OPT_WIDER_ICHAR
158 /* characters outside UCS-2 become UCS_REPL */
159 if (utf_char > NARROW_ICHAR) {
160 TRACE(("using replacement for %#x\n", utf_char));
161 utf_char = UCS_REPL;
162 }
163 #endif
164 data->utf_data = (IChar) utf_char;
165 data->utf_size = (i + 1);
166 break;
167 }
168 }
169 } else {
170 /* We received a sequence start byte */
171 if (utf_count > 0) {
172 /* previous sequence is incomplete */
173 data->utf_data = UCS_REPL;
174 data->utf_size = i;
175 break;
176 }
177 if (screen->utf8_weblike) {
178 if (c < 0xe0) {
179 if (!(c & 0x1e)) {
180 /* overlong sequence start */
181 data->utf_data = UCS_REPL;
182 data->utf_size = (i + 1);
183 break;
184 }
185 utf_count = 1;
186 utf_char = (c & 0x1f);
187 } else if (c < 0xf0) {
188 utf_count = 2;
189 utf_char = (c & 0x0f);
190 } else if (c < 0xf5) {
191 utf_count = 3;
192 utf_char = (c & 0x07);
193 } else {
194 data->utf_data = UCS_REPL;
195 data->utf_size = (i + 1);
196 break;
197 }
198 } else {
199 if (c < 0xe0) {
200 utf_count = 1;
201 utf_char = (c & 0x1f);
202 if (!(c & 0x1e)) {
203 /* overlong sequence */
204 utf_char = UCS_REPL;
205 }
206 } else if (c < 0xf0) {
207 utf_count = 2;
208 utf_char = (c & 0x0f);
209 } else if (c < 0xf8) {
210 utf_count = 3;
211 utf_char = (c & 0x07);
212 } else if (c < 0xfc) {
213 utf_count = 4;
214 utf_char = (c & 0x03);
215 } else if (c < 0xfe) {
216 utf_count = 5;
217 utf_char = (c & 0x01);
218 } else {
219 data->utf_data = UCS_REPL;
220 data->utf_size = (i + 1);
221 break;
222 }
223 }
224 }
225 }
226 #if OPT_TRACE > 1
227 TRACE(("UTF-8 char %04X [%d..%d]\n",
228 data->utf_data,
229 (int) (data->next - data->buffer),
230 (int) (data->next - data->buffer + data->utf_size - 1)));
231 #endif
232
233 return (data->utf_size != 0);
234 }
235 #endif
236
237 int
readPtyData(XtermWidget xw,PtySelect * select_mask,PtyData * data)238 readPtyData(XtermWidget xw, PtySelect * select_mask, PtyData *data)
239 {
240 TScreen *screen = TScreenOf(xw);
241 int size = 0;
242
243 #ifdef VMS
244 if (*select_mask & pty_mask) {
245 trimPtyData(xw, data);
246 if (read_queue.flink != 0) {
247 size = tt_read(data->next);
248 if (size == 0) {
249 Panic("input: read returned zero\n", 0);
250 }
251 } else {
252 sys$hiber();
253 }
254 }
255 #else /* !VMS */
256 if (FD_ISSET(screen->respond, select_mask)) {
257 int save_err;
258 trimPtyData(xw, data);
259
260 size = (int) read(screen->respond, (char *) data->last, (size_t) FRG_SIZE);
261 save_err = errno;
262 #if (defined(i386) && defined(SVR4) && defined(sun)) || defined(__CYGWIN__)
263 /*
264 * Yes, I know this is a majorly f*ugly hack, however it seems to
265 * be necessary for Solaris x86. DWH 11/15/94
266 * Dunno why though..
267 * (and now CYGWIN, alanh@xfree86.org 08/15/01
268 */
269 if (size <= 0) {
270 if (save_err == EIO || save_err == 0)
271 NormalExit();
272 else if (!E_TEST(save_err))
273 Panic("input: read returned unexpected error (%d)\n", save_err);
274 size = 0;
275 }
276 #else /* !f*ugly */
277 if (size < 0) {
278 if (save_err == EIO)
279 NormalExit();
280 else if (!E_TEST(save_err))
281 Panic("input: read returned unexpected error (%d)\n", save_err);
282 size = 0;
283 } else if (size == 0) {
284 #if defined(__FreeBSD__)
285 NormalExit();
286 #else
287 Panic("input: read returned zero\n", 0);
288 #endif
289 }
290 #endif /* f*ugly */
291 }
292 #endif /* VMS */
293
294 if (size) {
295 #if OPT_TRACE
296 int i;
297
298 TRACE(("read %d bytes from pty\n", size));
299 for (i = 0; i < size; i++) {
300 if (!(i % 16))
301 TRACE(("%s", i ? "\n " : "READ"));
302 TRACE((" %02X", data->last[i]));
303 }
304 TRACE(("\n"));
305 #endif
306 data->last += size;
307 #ifdef ALLOWLOGGING
308 TScreenOf(term)->logstart = VTbuffer->next;
309 #endif
310 }
311
312 return (size);
313 }
314
/*
 * Return the next value from the input buffer.  morePtyData() is always
 * called before this function, so the UTF-8 input conversion is done there
 * and the result is simply fetched here via skipPtyData().
 *
 * Outside of UTF-8 parsing, one byte is taken from the buffer; it is reduced
 * to seven bits unless screen->output_eight_bits is set.
 */
#if OPT_WIDE_CHARS
IChar
nextPtyData(TScreen *screen, PtyData *data)
{
    IChar ch;

    if (!screen->utf8_inparse) {
	ch = *((data)->next++);
	if (!screen->output_eight_bits)
	    ch = (IChar) (ch & 0x7f);
    } else {
	skipPtyData(data, ch);
    }
    TRACE2(("nextPtyData returns %#x\n", ch));
    return ch;
}
#endif
337
#if OPT_WIDE_CHARS
/*
 * Called when UTF-8 mode has been turned on/off.  Nothing happens unless
 * 'flag' differs from the current screen->utf8_mode; in that case the mode
 * and the parsing flag are updated, the character-width tables are
 * reinitialized and the fonts are notified.
 */
void
switchPtyData(TScreen *screen, int flag)
{
    if (screen->utf8_mode == flag)
	return;

    screen->utf8_mode = flag;
    screen->utf8_inparse = (Boolean) (flag != 0);
    mk_wcwidth_init(screen->utf8_mode);

    TRACE(("turning UTF-8 mode %s\n", BtoS(flag)));
    update_font_utf8_mode();
}
#endif
355
356 /*
357 * Allocate a buffer.
358 */
359 void
initPtyData(PtyData ** result)360 initPtyData(PtyData **result)
361 {
362 PtyData *data;
363
364 TRACE2(("initPtyData given minBufSize %d, maxBufSize %d\n",
365 FRG_SIZE, BUF_SIZE));
366
367 if (FRG_SIZE < 64)
368 FRG_SIZE = 64;
369 if (BUF_SIZE < FRG_SIZE)
370 BUF_SIZE = FRG_SIZE;
371 if (BUF_SIZE % FRG_SIZE)
372 BUF_SIZE = BUF_SIZE + FRG_SIZE - (BUF_SIZE % FRG_SIZE);
373
374 TRACE2(("initPtyData using minBufSize %d, maxBufSize %d\n",
375 FRG_SIZE, BUF_SIZE));
376
377 data = TypeXtMallocX(PtyData, (BUF_SIZE + FRG_SIZE));
378
379 memset(data, 0, sizeof(*data));
380 data->next = data->buffer;
381 data->last = data->buffer;
382 *result = data;
383 }
384
/*
 * Initialize a caller-supplied PtyData so that it describes the caller's own
 * bytes: the structure is zeroed, then 'next' and 'last' are stored as the
 * limits of the unread data.  Returns 'result'.
 */
#if OPT_WIDE_CHARS
PtyData *
fakePtyData(PtyData *result, Char *next, Char *last)
{
    memset(result, 0, sizeof(*result));
    result->next = next;
    result->last = last;

    return result;
}
#endif
401
402 /*
403 * Remove used data by shifting the buffer down, to make room for more data,
404 * e.g., a continuation-read.
405 */
406 void
trimPtyData(XtermWidget xw,PtyData * data)407 trimPtyData(XtermWidget xw, PtyData *data)
408 {
409 (void) xw;
410 FlushLog(xw);
411
412 if (data->next != data->buffer) {
413 int i;
414 int n = (int) (data->last - data->next);
415
416 TRACE(("shifting buffer down by %d\n", n));
417 for (i = 0; i < n; ++i) {
418 data->buffer[i] = data->next[i];
419 }
420 data->next = data->buffer;
421 data->last = data->next + n;
422 }
423
424 }
425
426 /*
427 * Insert new data into the input buffer so the next calls to morePtyData()
428 * and nextPtyData() will return that.
429 */
430 void
fillPtyData(XtermWidget xw,PtyData * data,const char * value,int length)431 fillPtyData(XtermWidget xw, PtyData *data, const char *value, int length)
432 {
433 int size;
434 int n;
435
436 /* remove the used portion of the buffer */
437 trimPtyData(xw, data);
438
439 VTbuffer->last += length;
440 size = (int) (VTbuffer->last - VTbuffer->next);
441
442 /* shift the unused portion up to make room */
443 for (n = size; n >= length; --n)
444 VTbuffer->next[n] = VTbuffer->next[n - length];
445
446 /* insert the new bytes to interpret */
447 for (n = 0; n < length; n++)
448 VTbuffer->next[n] = CharOf(value[n]);
449 }
450
451 #if OPT_WIDE_CHARS
452 /*
453 * Convert an ISO-8859-1 code 'c' to UTF-8, storing the result in the target
454 * 'lp', and returning a pointer past the converted character.
455 */
456 Char *
convertToUTF8(Char * lp,unsigned c)457 convertToUTF8(Char *lp, unsigned c)
458 {
459 #define CH(n) (Char)((c) >> ((n) * 8))
460 if (c < 0x80) {
461 /* 0******* */
462 *lp++ = (Char) CH(0);
463 } else if (c < 0x800) {
464 /* 110***** 10****** */
465 *lp++ = (Char) (0xc0 | (CH(0) >> 6) | ((CH(1) & 0x07) << 2));
466 *lp++ = (Char) (0x80 | (CH(0) & 0x3f));
467 } else if (c < 0x00010000) {
468 /* 1110**** 10****** 10****** */
469 *lp++ = (Char) (0xe0 | ((int) (CH(1) & 0xf0) >> 4));
470 *lp++ = (Char) (0x80 | (CH(0) >> 6) | ((CH(1) & 0x0f) << 2));
471 *lp++ = (Char) (0x80 | (CH(0) & 0x3f));
472 } else if (c < 0x00200000) {
473 *lp++ = (Char) (0xf0 | ((int) (CH(2) & 0x1f) >> 2));
474 *lp++ = (Char) (0x80 |
475 ((int) (CH(1) & 0xf0) >> 4) |
476 ((int) (CH(2) & 0x03) << 4));
477 *lp++ = (Char) (0x80 | (CH(0) >> 6) | ((CH(1) & 0x0f) << 2));
478 *lp++ = (Char) (0x80 | (CH(0) & 0x3f));
479 } else if (c < 0x04000000) {
480 *lp++ = (Char) (0xf8 | (CH(3) & 0x03));
481 *lp++ = (Char) (0x80 | (CH(2) >> 2));
482 *lp++ = (Char) (0x80 |
483 ((int) (CH(1) & 0xf0) >> 4) |
484 ((int) (CH(2) & 0x03) << 4));
485 *lp++ = (Char) (0x80 | (CH(0) >> 6) | ((CH(1) & 0x0f) << 2));
486 *lp++ = (Char) (0x80 | (CH(0) & 0x3f));
487 } else {
488 *lp++ = (Char) (0xfc | ((int) (CH(3) & 0x40) >> 6));
489 *lp++ = (Char) (0x80 | (CH(3) & 0x3f));
490 *lp++ = (Char) (0x80 | (CH(2) >> 2));
491 *lp++ = (Char) (0x80 | (CH(1) >> 4) | ((CH(2) & 0x03) << 4));
492 *lp++ = (Char) (0x80 | (CH(0) >> 6) | ((CH(1) & 0x0f) << 2));
493 *lp++ = (Char) (0x80 | (CH(0) & 0x3f));
494 }
495 return lp;
496 #undef CH
497 }
498
499 /*
500 * Convert a UTF-8 multibyte character to an Unicode value, returning a pointer
501 * past the converted UTF-8 input. The first 256 values align with ISO-8859-1,
502 * making it possible to use this to convert to Latin-1.
503 *
504 * If the conversion fails, return null.
505 */
506 Char *
convertFromUTF8(Char * lp,unsigned * cp)507 convertFromUTF8(Char *lp, unsigned *cp)
508 {
509 int want;
510
511 /*
512 * Find the number of bytes we will need from the source.
513 */
514 if ((*lp & 0x80) == 0) {
515 want = 1;
516 } else if ((*lp & 0xe0) == 0xc0) {
517 want = 2;
518 } else if ((*lp & 0xf0) == 0xe0) {
519 want = 3;
520 } else if ((*lp & 0xf8) == 0xf0) {
521 want = 4;
522 } else if ((*lp & 0xfc) == 0xf8) {
523 want = 5;
524 } else if ((*lp & 0xfe) == 0xfc) {
525 want = 6;
526 } else {
527 want = 0;
528 }
529
530 if (want) {
531 int have = 1;
532
533 while (lp[have] != '\0') {
534 if ((lp[have] & 0xc0) != 0x80)
535 break;
536 ++have;
537 }
538 if (want == have) {
539 unsigned mask = 0;
540 int j;
541 int shift = 0;
542
543 *cp = 0;
544 switch (want) {
545 case 1:
546 mask = (*lp);
547 break;
548 case 2:
549 mask = (*lp & 0x1f);
550 break;
551 case 3:
552 mask = (*lp & 0x0f);
553 break;
554 case 4:
555 mask = (*lp & 0x07);
556 break;
557 case 5:
558 mask = (*lp & 0x03);
559 break;
560 case 6:
561 mask = (*lp & 0x01);
562 break;
563 default:
564 mask = 0;
565 break;
566 }
567
568 for (j = 1; j < want; j++) {
569 *cp |= (unsigned) ((lp[want - j] & 0x3f) << shift);
570 shift += 6;
571 }
572 *cp |= mask << shift;
573 lp += want;
574 } else {
575 *cp = BAD_ASCII;
576 lp = NULL;
577 }
578 } else {
579 *cp = BAD_ASCII;
580 lp = NULL;
581 }
582 return lp;
583 }
584
585 /*
586 * Returns true if the entire string is valid UTF-8.
587 */
588 Boolean
isValidUTF8(Char * lp)589 isValidUTF8(Char *lp)
590 {
591 Boolean result = True;
592 while (*lp) {
593 unsigned ch;
594 Char *next = convertFromUTF8(lp, &ch);
595 if (next == NULL || ch == 0) {
596 result = False;
597 break;
598 }
599 lp = next;
600 }
601 return result;
602 }
603
604 /*
605 * Write data back to the PTY
606 */
607 void
writePtyData(int f,IChar * d,unsigned len)608 writePtyData(int f, IChar *d, unsigned len)
609 {
610 unsigned n = (len << 1);
611
612 if (VTbuffer->write_len <= len) {
613 VTbuffer->write_len = n;
614 VTbuffer->write_buf = (Char *) XtRealloc((char *)
615 VTbuffer->write_buf, VTbuffer->write_len);
616 }
617
618 for (n = 0; n < len; n++)
619 VTbuffer->write_buf[n] = (Char) d[n];
620
621 TRACE(("writePtyData %u:%s\n", n,
622 visibleChars(VTbuffer->write_buf, n)));
623 v_write(f, VTbuffer->write_buf, n);
624 }
625 #endif /* OPT_WIDE_CHARS */
626
#ifdef NO_LEAKS
/*
 * Release VTbuffer (and, with wide-character support, its write buffer) so
 * that leak checkers see no outstanding allocation from this module.
 */
void
noleaks_ptydata(void)
{
    if (VTbuffer == 0)
	return;

#if OPT_WIDE_CHARS
    free(VTbuffer->write_buf);
#endif
    FreeAndNull(VTbuffer);
}
#endif
639
640 #ifdef TEST_DRIVER
641
642 #include "data.c"
643
/*
 * Test-driver replacement: announce the call on stderr, then exit with a
 * successful status.
 */
void
NormalExit(void)
{
    fputs("NormalExit!\n", stderr);
    exit(EXIT_SUCCESS);
}
650
/*
 * Test-driver replacement: the message and its argument are ignored; a
 * fixed notice is written to stderr and the program exits with a failure
 * status.
 */
void
Panic(const char *s, int a)
{
    (void) a;
    (void) s;
    fputs("Panic!\n", stderr);
    exit(EXIT_FAILURE);
}
659
660 #if OPT_WIDE_CHARS
661
#ifdef ALLOWLOGGING
/*
 * Test-driver stub: there is no log to flush, so this does nothing.
 */
void
FlushLog(XtermWidget xw)
{
    (void) xw;			/* unused */
}
#endif
669
670 void
v_write(int f,const Char * data,unsigned len)671 v_write(int f, const Char *data, unsigned len)
672 {
673 (void) f;
674 (void) data;
675 (void) len;
676 }
677
/*
 * Test-driver stub: no width tables are used here, so 'mode' is ignored.
 */
void
mk_wcwidth_init(int mode)
{
    (void) mode;		/* unused */
}
683
/*
 * Test-driver stub: there are no fonts to update, so this does nothing.
 */
void
update_font_utf8_mode(void)
{
    /* intentionally empty */
}
688
/*
 * Test-driver state.  Objects with static storage start at zero, so no
 * explicit initializers are needed.
 */
static int message_level;	/* lowered by -q, raised by -v */
static int opt_all;		/* set by -a */
static int opt_illegal;		/* set by -i */
static int opt_convert;		/* set by -c */
static int opt_reverse;		/* set by -r */
static long total_test;		/* tests counted by do_range() */
static long total_errs;		/* errors counted by do_range() */
696
/*
 * Print the command-line summary on stderr, one line per table entry, and
 * exit with a failure status.
 */
static void
usage(void)
{
    static const char *lines[] =
    {
	"Usage: test_ptydata [options] [c1[-c1b] [c2-[c2b] [...]]]",
	"",
	"Options:",
	" -a exercise all legal encode/decode to/from UTF-8",
	" -c call convertFromUTF8 rather than decodeUTF8",
	" -i ignore illegal UTF-8 when testing -r option",
	" -q quieter",
	" -r reverse/decode from UTF-8 byte-string to/from Unicode",
	" -v more verbose"
    };
    const size_t count = sizeof(lines) / sizeof(lines[0]);
    size_t k = 0;

    while (k < count) {
	fprintf(stderr, "%s\n", lines[k]);
	++k;
    }
    exit(EXIT_FAILURE);
}
718
719 /*
720 * http://www.unicode.org/versions/corrigendum1.html, table 3.1B
721 */
722 #define OkRange(n,lo,hi) \
723 if (value[n] < lo || value[n] > hi) { \
724 result = False; \
725 break; \
726 }
727 static Bool
is_legal_utf8(const Char * value)728 is_legal_utf8(const Char *value)
729 {
730 Bool result = True;
731 Char ch;
732 while ((ch = *value) != '\0') {
733 if (ch <= 0x7f) {
734 ++value;
735 } else if (ch >= 0xc2 && ch <= 0xdf) {
736 OkRange(1, 0x80, 0xbf);
737 value += 2;
738 } else if (ch == 0xe0) {
739 OkRange(1, 0xa0, 0xbf);
740 OkRange(2, 0x80, 0xbf);
741 value += 3;
742 } else if (ch >= 0xe1 && ch <= 0xef) {
743 OkRange(1, 0x80, 0xbf);
744 OkRange(2, 0x80, 0xbf);
745 value += 3;
746 } else if (ch == 0xf0) {
747 OkRange(1, 0x90, 0xbf);
748 OkRange(2, 0x80, 0xbf);
749 OkRange(3, 0x80, 0xbf);
750 value += 4;
751 } else if (ch >= 0xf1 && ch <= 0xf3) {
752 OkRange(1, 0x80, 0xbf);
753 OkRange(2, 0x80, 0xbf);
754 OkRange(3, 0x80, 0xbf);
755 value += 4;
756 } else if (ch == 0xf4) {
757 OkRange(1, 0x80, 0x8f);
758 OkRange(2, 0x80, 0xbf);
759 OkRange(3, 0x80, 0xbf);
760 value += 4;
761 } else {
762 result = False;
763 break;
764 }
765 }
766 return result;
767 }
768
769 static void
test_utf8_convert(void)770 test_utf8_convert(void)
771 {
772 unsigned c_in, c_out;
773 Char buffer[10];
774 Char *result;
775 unsigned limit = 0x110000;
776 unsigned success = 0;
777 unsigned bucket[256];
778
779 memset(bucket, 0, sizeof(bucket));
780 for (c_in = 0; c_in < limit; ++c_in) {
781 memset(buffer, 0, sizeof(buffer));
782 if ((result = convertToUTF8(buffer, c_in)) == 0) {
783 TRACE(("conversion of U+%04X to UTF-8 failed\n", c_in));
784 } else {
785 if ((result = convertFromUTF8(buffer, &c_out)) == 0) {
786 TRACE(("conversion of U+%04X from UTF-8 failed\n", c_in));
787 } else if (c_in != c_out) {
788 TRACE(("conversion of U+%04X to/from UTF-8 gave U+%04X\n",
789 c_in, c_out));
790 } else {
791 while (result-- != buffer) {
792 bucket[*result]++;
793 }
794 ++success;
795 }
796 }
797 }
798 TRACE(("%u/%u successful\n", success, limit));
799 for (c_in = 0; c_in < 256; ++c_in) {
800 if ((c_in % 8) == 0) {
801 TRACE((" %02X:", c_in));
802 }
803 TRACE((" %8X", bucket[c_in]));
804 if (((c_in + 1) % 8) == 0) {
805 TRACE(("\n"));
806 }
807 }
808 }
809
/*
 * Parse one number from 'source'.  A "U+" or "u+" prefix selects
 * hexadecimal and "0b" selects binary; otherwise strtol() picks the radix
 * from the text itself.  '*target' is set to the first unparsed character.
 *
 * Returns the value converted to int, or -1 if no digits were found.
 */
static int
decode_one(const char *source, char **target)
{
    const char *digits = source;
    int radix = 0;
    long check;

    if ((digits[0] == 'u' || digits[0] == 'U') && digits[1] == '+') {
	digits += 2;
	radix = 16;
    } else if (digits[0] == '0' && digits[1] == 'b') {
	digits += 2;
	radix = 2;
    }

    check = strtol(digits, target, radix);
    if (*target == NULL || *target == digits)
	return -1;

    return (int) check;
}
828
/*
 * Parse "lo" or "lo<sep>hi" from 'source', where the separator is any run
 * of the characters ':', '-', '.', tab and space.  If no second number can
 * be parsed, '*hi' is set equal to '*lo'.
 *
 * Returns 1 on success, or 0 if the first number could not be parsed.
 */
static int
decode_range(const char *source, int *lo, int *hi)
{
    char *rest;
    char *ignored;

    *lo = decode_one(source, &rest);
    if (*lo < 0)
	return 0;

    rest += strspn(rest, ":-.\t ");
    *hi = decode_one(rest, &ignored);
    if (*hi < 0)
	*hi = *lo;

    return 1;
}
844
845 #define MAX_BYTES 6
846
847 static void
do_range(const char * source)848 do_range(const char *source)
849 {
850 int lo, hi;
851
852 TScreen screen;
853 memset(&screen, 0, sizeof(screen));
854
855 if (decode_range(source, &lo, &hi)) {
856 while (lo <= hi) {
857 unsigned c_in = (unsigned) lo++;
858 PtyData *data;
859 Char *next;
860 Char buffer[MAX_BYTES + 1];
861
862 if (opt_reverse) {
863 Bool skip = False;
864 Bool first = True;
865 int j, k;
866 for (j = 0; j < MAX_BYTES; ++j) {
867 unsigned long bits = ((unsigned long) c_in >> (8 * j));
868 if ((buffer[j] = (Char) bits) == 0) {
869 skip = (bits != 0);
870 break;
871 }
872 }
873 if (skip)
874 continue;
875 initPtyData(&data);
876 for (k = 0; k <= j; ++k) {
877 data->buffer[k] = buffer[j - k - 1];
878 }
879 if (opt_illegal && !is_legal_utf8(data->buffer)) {
880 free(data);
881 continue;
882 }
883 if (message_level > 1) {
884 printf("TEST ");
885 for (k = 0; k < j; ++k) {
886 printf("%02X", data->buffer[k]);
887 }
888 }
889 data->next = data->buffer;
890 data->last = data->buffer + j;
891 while (decodeUtf8(&screen, data)) {
892 total_test++;
893 if (data->utf_data == UCS_REPL)
894 total_errs++;
895 data->next += data->utf_size;
896 if (message_level > 1) {
897 printf("%s%04X", first ? " ->" : ", ", data->utf_data);
898 }
899 first = False;
900 }
901 if (!first)
902 total_test--;
903 if (message_level > 1) {
904 printf("\n");
905 fflush(stdout);
906 }
907 free(data);
908 } else if (opt_convert) {
909 unsigned c_out;
910 Char *result;
911
912 memset(buffer, 0, sizeof(buffer));
913 if ((result = next = convertToUTF8(buffer, c_in)) == 0) {
914 fprintf(stderr,
915 "conversion of U+%04X to UTF-8 failed\n", c_in);
916 } else if ((result = convertFromUTF8(buffer, &c_out)) == 0) {
917 fprintf(stderr,
918 "conversion of U+%04X from UTF-8 failed\n", c_in);
919 total_errs++;
920 } else if (c_in != c_out) {
921 fprintf(stderr,
922 "conversion of U+%04X to/from UTF-8 gave U+%04X\n",
923 c_in, c_out);
924 } else if (message_level > 1) {
925 *next = '\0';
926 printf("TEST %04X (%d:%s) ->%04X\n", c_in,
927 (int) (next - buffer),
928 buffer,
929 c_out);
930 fflush(stdout);
931 }
932 } else {
933 initPtyData(&data);
934 next = convertToUTF8(data->buffer, c_in);
935 *next = 0;
936 data->next = data->buffer;
937 data->last = next;
938 decodeUtf8(&screen, data);
939 if (message_level > 1) {
940 printf("TEST %04X (%d:%s) ->%04X\n", c_in,
941 (int) (next - data->buffer),
942 data->buffer,
943 data->utf_data);
944 fflush(stdout);
945 }
946 if (c_in != data->utf_data) {
947 fprintf(stderr, "Mismatch: %04X vs %04X\n", c_in, data->utf_data);
948 total_errs++;
949 }
950 free(data);
951 }
952 total_test++;
953 }
954 }
955 }
956
957 int
main(int argc,char ** argv)958 main(int argc, char **argv)
959 {
960 int ch;
961
962 setlocale(LC_ALL, "");
963 while ((ch = getopt(argc, argv, "aciqrv")) != -1) {
964 switch (ch) {
965 case 'a':
966 opt_all = 1;
967 break;
968 case 'c':
969 opt_convert = 1;
970 break;
971 case 'i':
972 opt_illegal = 1;
973 break;
974 case 'q':
975 message_level--;
976 break;
977 case 'r':
978 opt_reverse = 1;
979 break;
980 case 'v':
981 message_level++;
982 break;
983 default:
984 usage();
985 }
986 }
987 if (opt_all) {
988 test_utf8_convert();
989 } else {
990 if (optind >= argc)
991 usage();
992 while (optind < argc) {
993 do_range(argv[optind++]);
994 }
995 if (total_test) {
996 printf("%ld/%ld mismatches (%.0f%%)\n",
997 total_errs,
998 total_test,
999 (100.0 * (double) total_errs) / (double) total_test);
1000 }
1001 }
1002 return EXIT_SUCCESS;
1003 }
1004 #else
/*
 * Test driver entry point when wide-character support is not compiled in:
 * there is nothing to test, so just say so.
 */
int
main(int argc, char **argv)
{
    (void) argv;
    (void) argc;
    puts("Nothing to be done here...");
    return EXIT_SUCCESS;
}
1013 #endif /* OPT_WIDE_CHARS */
1014 #endif
1015