1 /* 2 * Copyright (C) 1984-2014 Mark Nudelman 3 * 4 * You may distribute under the terms of either the GNU General Public 5 * License or the Less License, as specified in the README file. 6 * 7 * For more information, see the README file. 8 */ 9 10 11 /* 12 * Functions to define the character set 13 * and do things specific to the character set. 14 */ 15 16 #include "less.h" 17 #if HAVE_LOCALE 18 #include <locale.h> 19 #include <ctype.h> 20 #include <langinfo.h> 21 #endif 22 23 #include "charset.h" 24 25 public int utf_mode = 0; 26 27 /* 28 * Predefined character sets, 29 * selected by the LESSCHARSET environment variable. 30 */ 31 struct charset { 32 char *name; 33 int *p_flag; 34 char *desc; 35 } charsets[] = { 36 { "ascii", NULL, "8bcccbcc18b95.b" }, 37 { "utf-8", &utf_mode, "8bcccbcc18b95.b126.bb" }, 38 { "iso8859", NULL, "8bcccbcc18b95.33b." }, 39 { "latin3", NULL, "8bcccbcc18b95.33b5.b8.b15.b4.b12.b18.b12.b." }, 40 { "arabic", NULL, "8bcccbcc18b95.33b.3b.7b2.13b.3b.b26.5b19.b" }, 41 { "greek", NULL, "8bcccbcc18b95.33b4.2b4.b3.b35.b44.b" }, 42 { "greek2005", NULL, "8bcccbcc18b95.33b14.b35.b44.b" }, 43 { "hebrew", NULL, "8bcccbcc18b95.33b.b29.32b28.2b2.b" }, 44 { "koi8-r", NULL, "8bcccbcc18b95.b." }, 45 { "KOI8-T", NULL, "8bcccbcc18b95.b8.b6.b8.b.b.5b7.3b4.b4.b3.b.b.3b." }, 46 { "georgianps", NULL, "8bcccbcc18b95.3b11.4b12.2b." }, 47 { "tcvn", NULL, "b..b...bcccbccbbb7.8b95.b48.5b." }, 48 { "TIS-620", NULL, "8bcccbcc18b95.b.4b.11b7.8b." }, 49 { "next", NULL, "8bcccbcc18b95.bb125.bb" }, 50 { "dos", NULL, "8bcccbcc12bc5b95.b." }, 51 { "windows-1251", NULL, "8bcccbcc12bc5b95.b24.b." }, 52 { "windows-1252", NULL, "8bcccbcc12bc5b95.b.b11.b.2b12.b." }, 53 { "windows-1255", NULL, "8bcccbcc12bc5b95.b.b8.b.5b9.b.4b." }, 54 { "ebcdic", NULL, "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." }, 55 { "IBM-1047", NULL, "4cbcbc3b9cbccbccbb4c6bcc5b3cbbc4bc4bccbc191.b" }, 56 { NULL, NULL, NULL } 57 }; 58 59 /* 60 * Support "locale charmap"/nl_langinfo(CODESET) values, as well as others. 61 */ 62 struct cs_alias { 63 char *name; 64 char *oname; 65 } cs_aliases[] = { 66 { "UTF-8", "utf-8" }, 67 { "ANSI_X3.4-1968", "ascii" }, 68 { "US-ASCII", "ascii" }, 69 { "latin1", "iso8859" }, 70 { "ISO-8859-1", "iso8859" }, 71 { "latin9", "iso8859" }, 72 { "ISO-8859-15", "iso8859" }, 73 { "latin2", "iso8859" }, 74 { "ISO-8859-2", "iso8859" }, 75 { "ISO-8859-3", "latin3" }, 76 { "latin4", "iso8859" }, 77 { "ISO-8859-4", "iso8859" }, 78 { "cyrillic", "iso8859" }, 79 { "ISO-8859-5", "iso8859" }, 80 { "ISO-8859-6", "arabic" }, 81 { "ISO-8859-7", "greek" }, 82 { "IBM9005", "greek2005" }, 83 { "ISO-8859-8", "hebrew" }, 84 { "latin5", "iso8859" }, 85 { "ISO-8859-9", "iso8859" }, 86 { "latin6", "iso8859" }, 87 { "ISO-8859-10", "iso8859" }, 88 { "latin7", "iso8859" }, 89 { "ISO-8859-13", "iso8859" }, 90 { "latin8", "iso8859" }, 91 { "ISO-8859-14", "iso8859" }, 92 { "latin10", "iso8859" }, 93 { "ISO-8859-16", "iso8859" }, 94 { "IBM437", "dos" }, 95 { "EBCDIC-US", "ebcdic" }, 96 { "IBM1047", "IBM-1047" }, 97 { "KOI8-R", "koi8-r" }, 98 { "KOI8-U", "koi8-r" }, 99 { "GEORGIAN-PS", "georgianps" }, 100 { "TCVN5712-1", "tcvn" }, 101 { "NEXTSTEP", "next" }, 102 { "windows", "windows-1252" }, /* backward compatibility */ 103 { "CP1251", "windows-1251" }, 104 { "CP1252", "windows-1252" }, 105 { "CP1255", "windows-1255" }, 106 { NULL, NULL } 107 }; 108 109 #define IS_BINARY_CHAR 01 110 #define IS_CONTROL_CHAR 02 111 112 static char chardef[256]; 113 static char *binfmt = NULL; 114 static char *utfbinfmt = NULL; 115 public int binattr = AT_STANDOUT; 116 117 118 /* 119 * Define a charset, given a description string. 120 * The string consists of 256 letters, 121 * one for each character in the charset. 122 * If the string is shorter than 256 letters, missing letters 123 * are taken to be identical to the last one. 124 * A decimal number followed by a letter is taken to be a 125 * repetition of the letter. 126 * 127 * Each letter is one of: 128 * . normal character 129 * b binary character 130 * c control character 131 */ 132 static void 133 ichardef(s) 134 char *s; 135 { 136 register char *cp; 137 register int n; 138 register char v; 139 140 n = 0; 141 v = 0; 142 cp = chardef; 143 while (*s != '\0') 144 { 145 switch (*s++) 146 { 147 case '.': 148 v = 0; 149 break; 150 case 'c': 151 v = IS_CONTROL_CHAR; 152 break; 153 case 'b': 154 v = IS_BINARY_CHAR|IS_CONTROL_CHAR; 155 break; 156 157 case '0': case '1': case '2': case '3': case '4': 158 case '5': case '6': case '7': case '8': case '9': 159 n = (10 * n) + (s[-1] - '0'); 160 continue; 161 162 default: 163 error("invalid chardef", NULL_PARG); 164 quit(QUIT_ERROR); 165 /*NOTREACHED*/ 166 } 167 168 do 169 { 170 if (cp >= chardef + sizeof(chardef)) 171 { 172 error("chardef longer than 256", NULL_PARG); 173 quit(QUIT_ERROR); 174 /*NOTREACHED*/ 175 } 176 *cp++ = v; 177 } while (--n > 0); 178 n = 0; 179 } 180 181 while (cp < chardef + sizeof(chardef)) 182 *cp++ = v; 183 } 184 185 /* 186 * Define a charset, given a charset name. 187 * The valid charset names are listed in the "charsets" array. 188 */ 189 static int 190 icharset(name, no_error) 191 register char *name; 192 int no_error; 193 { 194 register struct charset *p; 195 register struct cs_alias *a; 196 197 if (name == NULL || *name == '\0') 198 return (0); 199 200 /* First see if the name is an alias. */ 201 for (a = cs_aliases; a->name != NULL; a++) 202 { 203 if (strcmp(name, a->name) == 0) 204 { 205 name = a->oname; 206 break; 207 } 208 } 209 210 for (p = charsets; p->name != NULL; p++) 211 { 212 if (strcmp(name, p->name) == 0) 213 { 214 ichardef(p->desc); 215 if (p->p_flag != NULL) 216 *(p->p_flag) = 1; 217 return (1); 218 } 219 } 220 221 if (!no_error) { 222 error("invalid charset name", NULL_PARG); 223 quit(QUIT_ERROR); 224 } 225 return (0); 226 } 227 228 #if HAVE_LOCALE 229 /* 230 * Define a charset, given a locale name. 231 */ 232 static void 233 ilocale() 234 { 235 register int c; 236 237 for (c = 0; c < (int) sizeof(chardef); c++) 238 { 239 if (isprint(c)) 240 chardef[c] = 0; 241 else if (iscntrl(c)) 242 chardef[c] = IS_CONTROL_CHAR; 243 else 244 chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR; 245 } 246 } 247 #endif 248 249 /* 250 * Define the printing format for control (or binary utf) chars. 251 */ 252 static void 253 setbinfmt(s, fmtvarptr, default_fmt) 254 char *s; 255 char **fmtvarptr; 256 char *default_fmt; 257 { 258 if (s && utf_mode) 259 { 260 /* It would be too hard to account for width otherwise. */ 261 char *t = s; 262 while (*t) 263 { 264 if (*t < ' ' || *t > '~') 265 { 266 s = default_fmt; 267 goto attr; 268 } 269 t++; 270 } 271 } 272 273 /* %n is evil */ 274 if (s == NULL || *s == '\0' || 275 (*s == '*' && (s[1] == '\0' || s[2] == '\0' || strchr(s + 2, 'n'))) || 276 (*s != '*' && strchr(s, 'n'))) 277 s = default_fmt; 278 279 /* 280 * Select the attributes if it starts with "*". 281 */ 282 attr: 283 if (*s == '*') 284 { 285 switch (s[1]) 286 { 287 case 'd': binattr = AT_BOLD; break; 288 case 'k': binattr = AT_BLINK; break; 289 case 's': binattr = AT_STANDOUT; break; 290 case 'u': binattr = AT_UNDERLINE; break; 291 default: binattr = AT_NORMAL; break; 292 } 293 s += 2; 294 } 295 *fmtvarptr = s; 296 } 297 298 /* 299 * 300 */ 301 static void 302 set_charset() 303 { 304 char *s; 305 306 /* 307 * See if environment variable LESSCHARSET is defined. 308 */ 309 s = lgetenv("LESSCHARSET"); 310 if (icharset(s, 0)) 311 return; 312 313 /* 314 * LESSCHARSET is not defined: try LESSCHARDEF. 315 */ 316 s = lgetenv("LESSCHARDEF"); 317 if (s != NULL && *s != '\0') 318 { 319 ichardef(s); 320 return; 321 } 322 323 #if HAVE_LOCALE 324 #ifdef CODESET 325 /* 326 * Try using the codeset name as the charset name. 327 */ 328 s = nl_langinfo(CODESET); 329 if (icharset(s, 1)) 330 return; 331 #endif 332 #endif 333 334 #if HAVE_STRSTR 335 /* 336 * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used. 337 */ 338 if ((s = lgetenv("LC_ALL")) != NULL || 339 (s = lgetenv("LC_CTYPE")) != NULL || 340 (s = lgetenv("LANG")) != NULL) 341 { 342 if ( strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL 343 || strstr(s, "UTF8") != NULL || strstr(s, "utf8") != NULL) 344 if (icharset("utf-8", 1)) 345 return; 346 } 347 #endif 348 349 #if HAVE_LOCALE 350 /* 351 * Get character definitions from locale functions, 352 * rather than from predefined charset entry. 353 */ 354 ilocale(); 355 #if MSDOS_COMPILER 356 /* 357 * Default to "dos". 358 */ 359 (void) icharset("dos", 1); 360 #else 361 /* 362 * Default to "latin1". 363 */ 364 (void) icharset("latin1", 1); 365 #endif 366 #endif 367 } 368 369 /* 370 * Initialize charset data structures. 371 */ 372 public void 373 init_charset() 374 { 375 char *s; 376 377 #if HAVE_LOCALE 378 setlocale(LC_ALL, ""); 379 #endif 380 381 set_charset(); 382 383 s = lgetenv("LESSBINFMT"); 384 setbinfmt(s, &binfmt, "*s<%02X>"); 385 386 s = lgetenv("LESSUTFBINFMT"); 387 setbinfmt(s, &utfbinfmt, "<U+%04lX>"); 388 } 389 390 /* 391 * Is a given character a "binary" character? 392 */ 393 public int 394 binary_char(c) 395 LWCHAR c; 396 { 397 if (utf_mode) 398 return (is_ubin_char(c)); 399 c &= 0377; 400 return (chardef[c] & IS_BINARY_CHAR); 401 } 402 403 /* 404 * Is a given character a "control" character? 405 */ 406 public int 407 control_char(c) 408 LWCHAR c; 409 { 410 c &= 0377; 411 return (chardef[c] & IS_CONTROL_CHAR); 412 } 413 414 /* 415 * Return the printable form of a character. 416 * For example, in the "ascii" charset '\3' is printed as "^C". 417 */ 418 public char * 419 prchar(c) 420 LWCHAR c; 421 { 422 /* {{ This buffer can be overrun if LESSBINFMT is a long string. }} */ 423 static char buf[32]; 424 425 c &= 0377; 426 if ((c < 128 || !utf_mode) && !control_char(c)) 427 SNPRINTF1(buf, sizeof(buf), "%c", (int) c); 428 else if (c == ESC) 429 strcpy(buf, "ESC"); 430 #if IS_EBCDIC_HOST 431 else if (!binary_char(c) && c < 64) 432 SNPRINTF1(buf, sizeof(buf), "^%c", 433 /* 434 * This array roughly inverts CONTROL() #defined in less.h, 435 * and should be kept in sync with CONTROL() and IBM-1047. 436 */ 437 "@ABC.I.?...KLMNO" 438 "PQRS.JH.XY.." 439 "\\]^_" 440 "......W[.....EFG" 441 "..V....D....TU.Z"[c]); 442 #else 443 else if (c < 128 && !control_char(c ^ 0100)) 444 SNPRINTF1(buf, sizeof(buf), "^%c", (int) (c ^ 0100)); 445 #endif 446 else 447 SNPRINTF1(buf, sizeof(buf), binfmt, c); 448 return (buf); 449 } 450 451 /* 452 * Return the printable form of a UTF-8 character. 453 */ 454 public char * 455 prutfchar(ch) 456 LWCHAR ch; 457 { 458 static char buf[32]; 459 460 if (ch == ESC) 461 strcpy(buf, "ESC"); 462 else if (ch < 128 && control_char(ch)) 463 { 464 if (!control_char(ch ^ 0100)) 465 SNPRINTF1(buf, sizeof(buf), "^%c", ((char) ch) ^ 0100); 466 else 467 SNPRINTF1(buf, sizeof(buf), binfmt, (char) ch); 468 } else if (is_ubin_char(ch)) 469 { 470 SNPRINTF1(buf, sizeof(buf), utfbinfmt, ch); 471 } else 472 { 473 char *p = buf; 474 if (ch >= 0x80000000) 475 ch = 0xFFFD; /* REPLACEMENT CHARACTER */ 476 put_wchar(&p, ch); 477 *p = '\0'; 478 } 479 return (buf); 480 } 481 482 /* 483 * Get the length of a UTF-8 character in bytes. 484 */ 485 public int 486 utf_len(ch) 487 char ch; 488 { 489 if ((ch & 0x80) == 0) 490 return 1; 491 if ((ch & 0xE0) == 0xC0) 492 return 2; 493 if ((ch & 0xF0) == 0xE0) 494 return 3; 495 if ((ch & 0xF8) == 0xF0) 496 return 4; 497 if ((ch & 0xFC) == 0xF8) 498 return 5; 499 if ((ch & 0xFE) == 0xFC) 500 return 6; 501 /* Invalid UTF-8 encoding. */ 502 return 1; 503 } 504 505 /* 506 * Does the parameter point to the lead byte of a well-formed UTF-8 character? 507 */ 508 public int 509 is_utf8_well_formed(s) 510 unsigned char *s; 511 { 512 int i; 513 int len; 514 515 if (IS_UTF8_INVALID(s[0])) 516 return (0); 517 518 len = utf_len((char) s[0]); 519 if (len == 1) 520 return (1); 521 if (len == 2) 522 { 523 if (s[0] < 0xC2) 524 return (0); 525 } else 526 { 527 unsigned char mask; 528 mask = (~((1 << (8-len)) - 1)) & 0xFF; 529 if (s[0] == mask && (s[1] & mask) == 0x80) 530 return (0); 531 } 532 533 for (i = 1; i < len; i++) 534 if (!IS_UTF8_TRAIL(s[i])) 535 return (0); 536 return (1); 537 } 538 539 /* 540 * Return number of invalid UTF-8 sequences found in a buffer. 541 */ 542 public int 543 utf_bin_count(data, len) 544 unsigned char *data; 545 int len; 546 { 547 int bin_count = 0; 548 while (len > 0) 549 { 550 if (is_utf8_well_formed(data)) 551 { 552 int clen = utf_len(*data); 553 data += clen; 554 len -= clen; 555 } else 556 { 557 /* Skip to next lead byte. */ 558 bin_count++; 559 do { 560 ++data; 561 --len; 562 } while (len > 0 && !IS_UTF8_LEAD(*data)); 563 } 564 } 565 return (bin_count); 566 } 567 568 /* 569 * Get the value of a UTF-8 character. 570 */ 571 public LWCHAR 572 get_wchar(p) 573 char *p; 574 { 575 switch (utf_len(p[0])) 576 { 577 case 1: 578 default: 579 /* 0xxxxxxx */ 580 return (LWCHAR) 581 (p[0] & 0xFF); 582 case 2: 583 /* 110xxxxx 10xxxxxx */ 584 return (LWCHAR) ( 585 ((p[0] & 0x1F) << 6) | 586 (p[1] & 0x3F)); 587 case 3: 588 /* 1110xxxx 10xxxxxx 10xxxxxx */ 589 return (LWCHAR) ( 590 ((p[0] & 0x0F) << 12) | 591 ((p[1] & 0x3F) << 6) | 592 (p[2] & 0x3F)); 593 case 4: 594 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 595 return (LWCHAR) ( 596 ((p[0] & 0x07) << 18) | 597 ((p[1] & 0x3F) << 12) | 598 ((p[2] & 0x3F) << 6) | 599 (p[3] & 0x3F)); 600 case 5: 601 /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ 602 return (LWCHAR) ( 603 ((p[0] & 0x03) << 24) | 604 ((p[1] & 0x3F) << 18) | 605 ((p[2] & 0x3F) << 12) | 606 ((p[3] & 0x3F) << 6) | 607 (p[4] & 0x3F)); 608 case 6: 609 /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ 610 return (LWCHAR) ( 611 ((p[0] & 0x01) << 30) | 612 ((p[1] & 0x3F) << 24) | 613 ((p[2] & 0x3F) << 18) | 614 ((p[3] & 0x3F) << 12) | 615 ((p[4] & 0x3F) << 6) | 616 (p[5] & 0x3F)); 617 } 618 } 619 620 /* 621 * Store a character into a UTF-8 string. 622 */ 623 public void 624 put_wchar(pp, ch) 625 char **pp; 626 LWCHAR ch; 627 { 628 if (!utf_mode || ch < 0x80) 629 { 630 /* 0xxxxxxx */ 631 *(*pp)++ = (char) ch; 632 } else if (ch < 0x800) 633 { 634 /* 110xxxxx 10xxxxxx */ 635 *(*pp)++ = (char) (0xC0 | ((ch >> 6) & 0x1F)); 636 *(*pp)++ = (char) (0x80 | (ch & 0x3F)); 637 } else if (ch < 0x10000) 638 { 639 /* 1110xxxx 10xxxxxx 10xxxxxx */ 640 *(*pp)++ = (char) (0xE0 | ((ch >> 12) & 0x0F)); 641 *(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F)); 642 *(*pp)++ = (char) (0x80 | (ch & 0x3F)); 643 } else if (ch < 0x200000) 644 { 645 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 646 *(*pp)++ = (char) (0xF0 | ((ch >> 18) & 0x07)); 647 *(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F)); 648 *(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F)); 649 *(*pp)++ = (char) (0x80 | (ch & 0x3F)); 650 } else if (ch < 0x4000000) 651 { 652 /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ 653 *(*pp)++ = (char) (0xF0 | ((ch >> 24) & 0x03)); 654 *(*pp)++ = (char) (0x80 | ((ch >> 18) & 0x3F)); 655 *(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F)); 656 *(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F)); 657 *(*pp)++ = (char) (0x80 | (ch & 0x3F)); 658 } else 659 { 660 /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ 661 *(*pp)++ = (char) (0xF0 | ((ch >> 30) & 0x01)); 662 *(*pp)++ = (char) (0x80 | ((ch >> 24) & 0x3F)); 663 *(*pp)++ = (char) (0x80 | ((ch >> 18) & 0x3F)); 664 *(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F)); 665 *(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F)); 666 *(*pp)++ = (char) (0x80 | (ch & 0x3F)); 667 } 668 } 669 670 /* 671 * Step forward or backward one character in a string. 672 */ 673 public LWCHAR 674 step_char(pp, dir, limit) 675 char **pp; 676 signed int dir; 677 char *limit; 678 { 679 LWCHAR ch; 680 int len; 681 char *p = *pp; 682 683 if (!utf_mode) 684 { 685 /* It's easy if chars are one byte. */ 686 if (dir > 0) 687 ch = (LWCHAR) ((p < limit) ? *p++ : 0); 688 else 689 ch = (LWCHAR) ((p > limit) ? *--p : 0); 690 } else if (dir > 0) 691 { 692 len = utf_len(*p); 693 if (p + len > limit) 694 { 695 ch = 0; 696 p = limit; 697 } else 698 { 699 ch = get_wchar(p); 700 p += len; 701 } 702 } else 703 { 704 while (p > limit && IS_UTF8_TRAIL(p[-1])) 705 p--; 706 if (p > limit) 707 ch = get_wchar(--p); 708 else 709 ch = 0; 710 } 711 *pp = p; 712 return ch; 713 } 714 715 /* 716 * Unicode characters data 717 * Actual data is in the generated *.uni files. 718 */ 719 720 #define DECLARE_RANGE_TABLE_START(name) \ 721 static struct wchar_range name##_array[] = { 722 #define DECLARE_RANGE_TABLE_END(name) \ 723 }; struct wchar_range_table name##_table = { name##_array, sizeof(name##_array)/sizeof(*name##_array) }; 724 725 DECLARE_RANGE_TABLE_START(compose) 726 #include "compose.uni" 727 DECLARE_RANGE_TABLE_END(compose) 728 729 DECLARE_RANGE_TABLE_START(ubin) 730 #include "ubin.uni" 731 DECLARE_RANGE_TABLE_END(ubin) 732 733 DECLARE_RANGE_TABLE_START(wide) 734 #include "wide.uni" 735 DECLARE_RANGE_TABLE_END(wide) 736 737 /* comb_table is special pairs, not ranges. */ 738 static struct wchar_range comb_table[] = { 739 {0x0644,0x0622}, {0x0644,0x0623}, {0x0644,0x0625}, {0x0644,0x0627}, 740 }; 741 742 743 static int 744 is_in_table(ch, table) 745 LWCHAR ch; 746 struct wchar_range_table *table; 747 { 748 int hi; 749 int lo; 750 751 /* Binary search in the table. */ 752 if (ch < table->table[0].first) 753 return 0; 754 lo = 0; 755 hi = table->count - 1; 756 while (lo <= hi) 757 { 758 int mid = (lo + hi) / 2; 759 if (ch > table->table[mid].last) 760 lo = mid + 1; 761 else if (ch < table->table[mid].first) 762 hi = mid - 1; 763 else 764 return 1; 765 } 766 return 0; 767 } 768 769 /* 770 * Is a character a UTF-8 composing character? 771 * If a composing character follows any char, the two combine into one glyph. 772 */ 773 public int 774 is_composing_char(ch) 775 LWCHAR ch; 776 { 777 return is_in_table(ch, &compose_table); 778 } 779 780 /* 781 * Should this UTF-8 character be treated as binary? 782 */ 783 public int 784 is_ubin_char(ch) 785 LWCHAR ch; 786 { 787 return is_in_table(ch, &ubin_table); 788 } 789 790 /* 791 * Is this a double width UTF-8 character? 792 */ 793 public int 794 is_wide_char(ch) 795 LWCHAR ch; 796 { 797 return is_in_table(ch, &wide_table); 798 } 799 800 /* 801 * Is a character a UTF-8 combining character? 802 * A combining char acts like an ordinary char, but if it follows 803 * a specific char (not any char), the two combine into one glyph. 804 */ 805 public int 806 is_combining_char(ch1, ch2) 807 LWCHAR ch1; 808 LWCHAR ch2; 809 { 810 /* The table is small; use linear search. */ 811 int i; 812 for (i = 0; i < sizeof(comb_table)/sizeof(*comb_table); i++) 813 { 814 if (ch1 == comb_table[i].first && 815 ch2 == comb_table[i].last) 816 return 1; 817 } 818 return 0; 819 } 820 821