1 /* $XTermId: charclass.c,v 1.44 2021/02/02 00:19:32 tom Exp $ */
2 
3 /*
4  * Copyright 2002-2020,2021 by Thomas E. Dickey
5  *
6  *                         All Rights Reserved
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a
9  * copy of this software and associated documentation files (the
10  * "Software"), to deal in the Software without restriction, including
11  * without limitation the rights to use, copy, modify, merge, publish,
12  * distribute, sublicense, and/or sell copies of the Software, and to
13  * permit persons to whom the Software is furnished to do so, subject to
14  * the following conditions:
15  *
16  * The above copyright notice and this permission notice shall be included
17  * in all copies or substantial portions of the Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22  * IN NO EVENT SHALL THE ABOVE LISTED COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
23  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26  *
27  * Except as contained in this notice, the name(s) of the above copyright
28  * holders shall not be used in advertising or otherwise to promote the
29  * sale, use or other dealings in this Software without prior written
30  * authorization.
31  *
32  *----------------------------------------------------------------------------
33  * Compact and efficient reimplementation of the
34  * xterm character class mechanism for large character sets
35  *
36  * Markus Kuhn -- mkuhn@acm.org -- 2000-07-03
37  *
38  * xterm allows users to select entire words with a double-click on the left
39  * mouse button.  Opinions might differ on what type of characters are part of
40  * separate words, therefore xterm allows users to configure a class code for
41  * each 8-bit character.  Words are maximum length sequences of neighboring
 * characters with identical class code.  Naively extending this mechanism to
 * Unicode would require a class code table of at least 2^16 entries
 * (128 kB).
45  *
46  * Instead, we transform the character class table into a list of intervals,
47  * that will be accessed via a linear search.  Changes made to the table by the
 * user will be appended.  A special class code IDENT (the default) marks
 * characters whose class code is their own code number.
50  *
51  * We could alternatively use a sorted table of non-overlapping intervals that
52  * can be accessed via binary search, but merging in new intervals is
53  * significantly more hassle and not worth the effort here.
54  */
55 
56 #include <xterm.h>
57 #include <charclass.h>
58 
59 #if OPT_WIDE_CHARS
60 
61 #ifdef TEST_DRIVER
62 
63 #include <ctype.h>
64 #include <wchar.h>
65 #include <wctype.h>
66 
67 #if OPT_TRACE
68 #define Trace if (opt_v) printf
69 #endif
70 
71 #undef OPT_REPORT_CCLASS
72 #define OPT_REPORT_CCLASS 1
73 #endif /* TEST_DRIVER */
74 
/*
 * One interval of the class table: the character codes first..last
 * (inclusive) are assigned the class code cclass.
 *
 * Entry 0 of classtab is not an interval.  It holds the bookkeeping for the
 * table: cclass is the number of allocated entries, first is the index of the
 * first used entry (always 1), and last is the index of the last used entry.
 */
static struct classentry {
    int cclass;			/* class code (negative: IDENT); table size in entry 0 */
    int first;			/* lowest character code in the interval */
    int last;			/* highest character code in the interval */
} *classtab;
80 
/*
 * Class codes used by the built-in table.  report_wide_char_class() describes
 * the named values as "the first character code in a class".
 */
typedef enum {
    IDENT = -1,			/* character is its own class; CharacterClass() treats any negative class this way */
    OTHER = 0,			/* named by class_name(), not assigned by init_classtab() */
    CNTRL = 1,			/* control characters */
    ALNUM = 48,			/* same value as '0' in ASCII */
    BLANK = 32,			/* same value as ' ' in ASCII */
    U_CJK = 0x4e00,		/* CJK Ideographs */
    U_SUP = 0x2070,		/* superscript */
    U_SUB = 0x2080,		/* subscript */
    U_HIR = 0x3040,		/* Hiragana */
    U_KAT = 0x30a0,		/* Katakana */
    U_HAN = 0xac00		/* Hangul Syllables */
} Classes;
94 
#ifdef TEST_DRIVER
/* command-line flags of the test driver; main() sets them from getopt() */
static int opt_all;		/* -a: do_range() prints the class of every code */
static int opt_check;		/* -c: do_range() prints codes whose class differs from expected_class() */
static int opt_quiet;		/* -s: with -a, suppress the per-code output lines */
static int opt_v;		/* -v: enables the Trace macro when OPT_TRACE is set */
#endif
101 
102 void
init_classtab(void)103 init_classtab(void)
104 {
105     const int size = 50;
106 
107     TRACE(("init_classtab " TRACE_L "\n"));
108 
109     classtab = TypeMallocN(struct classentry, (unsigned) size);
110     if (!classtab)
111 	abort();
112     classtab[0].cclass = size;
113     classtab[0].first = 1;
114     classtab[0].last = 0;
115 
116     /* old xterm default classes */
117     SetCharacterClassRange(0, 0, BLANK);
118     SetCharacterClassRange(1, 31, CNTRL);
119     SetCharacterClassRange('\t', '\t', BLANK);
120     SetCharacterClassRange('0', '9', ALNUM);
121     SetCharacterClassRange('A', 'Z', ALNUM);
122     SetCharacterClassRange('_', '_', ALNUM);
123     SetCharacterClassRange('a', 'z', ALNUM);
124     SetCharacterClassRange(127, 159, CNTRL);
125     SetCharacterClassRange(160, 191, IDENT);
126     SetCharacterClassRange(192, 255, ALNUM);
127     SetCharacterClassRange(215, 215, IDENT);
128     SetCharacterClassRange(247, 247, IDENT);
129 
130     /* added Unicode classes */
131     SetCharacterClassRange(0x0100, 0xffdf, ALNUM);	/* mostly characters */
132     SetCharacterClassRange(0x037e, 0x037e, IDENT);	/* Greek question mark */
133     SetCharacterClassRange(0x0387, 0x0387, IDENT);	/* Greek ano teleia */
134     SetCharacterClassRange(0x055a, 0x055f, IDENT);	/* Armenian punctuation */
135     SetCharacterClassRange(0x0589, 0x0589, IDENT);	/* Armenian full stop */
136     SetCharacterClassRange(0x0700, 0x070d, IDENT);	/* Syriac punctuation */
137     SetCharacterClassRange(0x104a, 0x104f, IDENT);	/* Myanmar punctuation */
138     SetCharacterClassRange(0x10fb, 0x10fb, IDENT);	/* Georgian punctuation */
139     SetCharacterClassRange(0x1361, 0x1368, IDENT);	/* Ethiopic punctuation */
140     SetCharacterClassRange(0x166d, 0x166e, IDENT);	/* Canadian Syl. punctuation */
141     SetCharacterClassRange(0x17d4, 0x17dc, IDENT);	/* Khmer punctuation */
142     SetCharacterClassRange(0x1800, 0x180a, IDENT);	/* Mongolian punctuation */
143     SetCharacterClassRange(0x2000, 0x200a, BLANK);	/* spaces */
144     SetCharacterClassRange(0x200b, 0x27ff, IDENT);	/* punctuation and symbols */
145     SetCharacterClassRange(0x2070, 0x207f, U_SUP);	/* superscript */
146     SetCharacterClassRange(0x2080, 0x208f, U_SUB);	/* subscript */
147     SetCharacterClassRange(0x3000, 0x3000, BLANK);	/* ideographic space */
148     SetCharacterClassRange(0x3001, 0x3020, IDENT);	/* ideographic punctuation */
149     SetCharacterClassRange(0x3040, 0x309f, U_HIR);	/* Hiragana */
150     SetCharacterClassRange(0x30a0, 0x30ff, U_KAT);	/* Katakana */
151     SetCharacterClassRange(0x3300, 0x9fff, U_CJK);	/* CJK Ideographs */
152     SetCharacterClassRange(0xac00, 0xd7a3, U_HAN);	/* Hangul Syllables */
153     SetCharacterClassRange(0xf900, 0xfaff, U_CJK);	/* CJK Ideographs */
154     SetCharacterClassRange(0xfe30, 0xfe6b, IDENT);	/* punctuation forms */
155     SetCharacterClassRange(0xff00, 0xff0f, IDENT);	/* half/fullwidth ASCII */
156     SetCharacterClassRange(0xff1a, 0xff20, IDENT);	/* half/fullwidth ASCII */
157     SetCharacterClassRange(0xff3b, 0xff40, IDENT);	/* half/fullwidth ASCII */
158     SetCharacterClassRange(0xff5b, 0xff64, IDENT);	/* half/fullwidth ASCII */
159 
160     TRACE((TRACE_R " init_classtab\n"));
161     return;
162 }
163 
164 int
CharacterClass(int c)165 CharacterClass(int c)
166 {
167     int i, cclass = IDENT;
168 
169     for (i = classtab[0].first; i <= classtab[0].last; i++)
170 	if (classtab[i].first <= c && classtab[i].last >= c)
171 	    cclass = classtab[i].cclass;
172 
173     if (cclass < 0)
174 	cclass = c;
175 
176     return cclass;
177 }
178 
179 #if OPT_REPORT_CCLASS
180 #define charFormat(code) ((code) > 255 ? "0x%04X" : "%d")
181 static const char *
class_name(Classes code)182 class_name(Classes code)
183 {
184     static char buffer[80];
185     const char *result = "?";
186     switch (code) {
187     case ALNUM:
188 	result = "ALNUM";
189 	break;
190     case BLANK:
191 	result = "BLANK";
192 	break;
193     case CNTRL:
194 	result = "CNTRL";
195 	break;
196     case OTHER:
197 	result = "OTHER";
198 	break;
199     case IDENT:
200 	result = "IDENT";
201 	break;
202     case U_SUP:
203 	result = "superscript";
204 	break;
205     case U_SUB:
206 	result = "subscript";
207 	break;
208     case U_CJK:
209 	result = "CJK Ideographs";
210 	break;
211     case U_HIR:
212 	result = "Hiragana";
213 	break;
214     case U_KAT:
215 	result = "Katakana";
216 	break;
217     case U_HAN:
218 	result = "Hangul Syllables";
219 	break;
220     default:
221 	sprintf(buffer, charFormat(code), code);
222 	result = buffer;
223 	break;
224     }
225     return result;
226 }
227 
228 /*
229  * Special convention for classtab[0]:
230  * - classtab[0].cclass is the allocated number of entries in classtab
231  * - classtab[0].first = 1 (first used entry in classtab)
232  * - classtab[0].last is the last used entry in classtab
233  */
234 
235 int
SetCharacterClassRange(int low,int high,int value)236 SetCharacterClassRange(int low, int high, int value)
237 {
238     TRACE(("...SetCharacterClassRange (U+%04X .. U+%04X) = %s\n",
239 	   low, high, class_name(value)));
240 
241     if (high < low)
242 	return -1;		/* nothing to do */
243 
244     /* make sure we have at least one free entry left at table end */
245     if (classtab[0].last > classtab[0].cclass - 2) {
246 	classtab[0].cclass += 5 + classtab[0].cclass / 4;
247 	classtab = TypeRealloc(struct classentry,
248 			         (unsigned) classtab[0].cclass, classtab);
249 	if (!classtab)
250 	    abort();
251     }
252 
253     /* simply append new interval to end of interval array */
254     classtab[0].last++;
255     classtab[classtab[0].last].first = low;
256     classtab[classtab[0].last].last = high;
257     classtab[classtab[0].last].cclass = value;
258 
259     return 0;
260 }
261 
262 void
report_wide_char_class(void)263 report_wide_char_class(void)
264 {
265     static const Classes known_classes[] =
266     {IDENT, ALNUM, CNTRL, BLANK, U_SUP, U_SUB, U_HIR, U_KAT, U_CJK, U_HAN};
267     int i;
268 
269     printf("\n");
270     printf("Unicode charClass data uses the last match\n");
271     printf("from these overlapping intervals of character codes:\n");
272     for (i = classtab[0].first; i <= classtab[0].last; i++) {
273 	printf("\tU+%04X .. U+%04X %s\n",
274 	       classtab[i].first,
275 	       classtab[i].last,
276 	       class_name((Classes) classtab[i].cclass));
277     }
278     printf("\n");
279     printf("These class-names are used internally (the first character code in a class):\n");
280     for (i = 0; i < (int) XtNumber(known_classes); ++i) {
281 	printf("\t");
282 	printf(charFormat(known_classes[i]), known_classes[i]);
283 	printf(" = %s\n", class_name(known_classes[i]));
284     }
285 }
286 #endif /* OPT_REPORT_CCLASS */
287 
288 #ifdef NO_LEAKS
/*
 * Leak-checking hook, compiled only when NO_LEAKS is defined: hands the class
 * table to FreeAndNull().  That macro is defined outside this file; going by
 * its name it releases the table and clears the pointer.
 */
void
noleaks_CharacterClass(void)
{
    FreeAndNull(classtab);
}
294 #endif
295 #endif /* OPT_WIDE_CHARS */
296 
297 #ifdef TEST_DRIVER
298 #if OPT_WIDE_CHARS
/*
 * Print the command-line help of the test driver to stderr and exit with
 * EXIT_FAILURE.  The option list mirrors the getopt() string used by main().
 */
static void
usage(void)
{
    static const char *msg[] =
    {
	"Usage: test_charclass [options] [c1[-c1b] [c2[-c2b] [...]]]",
	"",
	"Options:",
	" -a  show all data",
	" -c  report differences from the locale's character classification",
	" -s  show only summary",
	" -v  verbose"
    };
    size_t n;
    for (n = 0; n < sizeof(msg) / sizeof(msg[0]); ++n) {
	fprintf(stderr, "%s\n", msg[n]);
    }
    exit(EXIT_FAILURE);
}
317 
318 static int
expected_class(int wch)319 expected_class(int wch)
320 {
321     int result = wch;
322     wint_t ch = (wint_t) wch;
323     if (ch == '\0' || ch == '\t') {
324 	result = BLANK;
325     } else if (iswcntrl(ch)) {
326 	result = CNTRL;
327     } else if (iswspace(ch)) {
328 	result = BLANK;
329     } else if (ch < 127) {
330 	if (isalnum(ch) || ch == '_') {
331 	    result = ALNUM;
332 	}
333     } else if (ch == 170 || ch == 181 || ch == 186) {
334 	;
335     } else if (iswalnum(ch)) {
336 	result = ALNUM;
337     }
338     return result;
339 }
340 
/*
 * Print one item of a "charClass" resource for the character codes lo..hi.
 *
 * The class is taken from CharacterClass(lo).  If it equals lo, the range is
 * treated as an identity range, provided that every code in lo+1..hi is also
 * its own class.
 *
 * An identity range ending below 255 is held back (nothing is printed) when
 * the following code hi+1 is its own class too and either hi+1 is 255 or the
 * class of hi+2 is not hi+1.
 *
 * Returns nonzero if the item was printed, zero if it was held back so that
 * the caller can retry with a longer range.
 */
static int
show_cclass_range(int lo, int hi)
{
    int cclass = CharacterClass(lo);
    int ident = (cclass == lo);
    int more = 0;
    if (ident) {
	int ch;
	/* an identity range requires every member to be its own class */
	for (ch = lo + 1; ch <= hi; ch++) {
	    if (CharacterClass(ch) != ch) {
		ident = 0;
		break;
	    }
	}
	if (ident && (hi < 255)) {
	    ch = hi + 1;
	    if (CharacterClass(ch) == ch) {
		/*
		 * NOTE(review): presumably the test on ch + 1 tells a true
		 * identity code apart from the first code of a named class
		 * whose value equals that code (such as '0' for ALNUM) --
		 * confirm.
		 */
		if (ch >= 255 || CharacterClass(ch + 1) != ch) {
		    more = 1;
		}
	    }
	}
    }
    if (!more) {
	if (lo == hi) {
	    printf("\t%d", lo);
	} else {
	    printf("\t%d-%d", lo, hi);
	}
	/* identity ranges are written without an explicit ":class" */
	if (!ident)
	    printf(":%d", cclass);
	/* every item whose range ends below 255 gets a continuation */
	if (hi < 255)
	    printf(", \\");
	printf("\n");
    }
    return !more;
}
378 
/*
 * Print the class assignments of the character codes first..last-1 in the
 * form of a "charClass" resource value.
 *
 * first is inclusive, last is exclusive (do_range() passes hi + 1).  The
 * codes are split into runs of equal class; each completed run is handed to
 * show_cclass_range().  When that function holds a run back (returns zero),
 * the start of the run is kept so that the next attempt covers a longer
 * range.
 */
static void
report_resource(int first, int last)
{
    int class_p;
    int ch;
    int dh;

    class_p = CharacterClass(dh = first);
    for (ch = first; ch < last; ++ch) {
	int class_c = CharacterClass(ch);
	if (class_c != class_p) {
	    if (show_cclass_range(dh, ch - 1)) {
		dh = ch;
		class_p = class_c;
	    }
	}
    }
    /*
     * Flush the pending run dh..last-1.  Comparing against "last" rather
     * than "last - 1" also covers a final run of exactly one character,
     * e.g., a range consisting of a single code.
     */
    if (dh < last) {
	show_cclass_range(dh, last - 1);
    }
}
400 
/*
 * Parse one character code from the start of source.
 *
 * A leading "U+" or "u+" selects hexadecimal; otherwise strtol() determines
 * the radix from the usual prefixes (0x for hexadecimal, 0 for octal).
 * *target is set to the first character after the number.
 *
 * Returns the parsed value, or -1 if no digits were found or the value does
 * not fit into an int.  Negative inputs yield a negative result, which the
 * callers treat as a failure as well.
 */
static int
decode_one(const char *source, char **target)
{
    int result = -1;
    long check;
    int radix = 0;

    if ((source[0] == 'u' || source[0] == 'U') && source[1] == '+') {
	source += 2;
	radix = 16;
    }
    check = strtol(source, target, radix);
    if (*target != NULL && *target != source) {
	result = (int) check;
	/* a value changed by the narrowing to int is out of range */
	if ((long) result != check)
	    result = -1;
    }
    return result;
}
416 
/*
 * Parse a range of character codes "lo[<sep>hi]" from source, where <sep> is
 * any run of the characters ':', '-', '.', tab and space.
 *
 * On success, *lo and *hi hold the bounds (a missing or unparsable upper
 * bound yields *hi == *lo) and 1 is returned.  If the lower bound cannot be
 * parsed, 0 is returned.
 */
static int
decode_range(const char *source, int *lo, int *hi)
{
    char *rest;
    char *ignored;

    *lo = decode_one(source, &rest);
    if (*lo < 0)
	return 0;

    /* skip the separator between the two bounds */
    rest += strspn(rest, ":-.\t ");

    *hi = decode_one(rest, &ignored);
    if (*hi < 0)
	*hi = *lo;

    return 1;
}
432 
433 static void
do_range(const char * source)434 do_range(const char *source)
435 {
436     int lo, hi;
437     if (decode_range(source, &lo, &hi)) {
438 	if (opt_all) {
439 	    while (lo <= hi) {
440 		int other_rc = CharacterClass(lo);
441 		if (!opt_quiet)
442 		    printf("U+%04X\t%s\n", lo, class_name(other_rc));
443 		++lo;
444 	    }
445 	} else if (opt_check) {
446 	    while (lo <= hi) {
447 		int expect = expected_class(lo);
448 		int actual = CharacterClass(lo);
449 		if (actual != expect)
450 		    printf("U+%04X\t%s ->%s\n", lo,
451 			   class_name(expect),
452 			   class_name(actual));
453 		++lo;
454 	    }
455 	} else {
456 	    printf("\"charClass\" resource for [%d..%d]:\n", lo, hi);
457 	    report_resource(lo, hi + 1);
458 	}
459     }
460 }
461 #endif /* OPT_WIDE_CHARS */
462 
463 /*
464  * TODO: add option to show do_range in hex
465  */
466 int
main(int argc,char ** argv ENVP_ARG)467 main(int argc, char **argv ENVP_ARG)
468 {
469 #if OPT_WIDE_CHARS
470     int ch;
471 #endif
472 
473     (void) argc;
474     (void) argv;
475 
476 #if OPT_WIDE_CHARS
477     setlocale(LC_ALL, "");
478     while ((ch = getopt(argc, argv, "acsv")) != -1) {
479 	switch (ch) {
480 	case 'a':
481 	    opt_all = 1;
482 	    break;
483 	case 'c':
484 	    opt_check = 1;
485 	    break;
486 	case 's':
487 	    opt_quiet = 1;
488 	    break;
489 	case 'v':
490 	    opt_v = 1;
491 	    break;
492 	default:
493 	    usage();
494 	}
495     }
496     init_classtab();
497 
498     if (optind >= argc) {
499 	do_range("0-255");
500     } else {
501 	while (optind < argc) {
502 	    do_range(argv[optind++]);
503 	}
504     }
505     report_wide_char_class();
506 #else
507     printf("wide-character support is not configured\n");
508 #endif /* OPT_WIDE_CHARS */
509     return 0;
510 }
511 #endif /* TEST_DRIVER */
512