1 /*======================================================================*\
2 |* Editor mined *|
3 |* Character properties *|
4 \*======================================================================*/
5
6 #include "mined.h"
7 #include "charprop.h"
8
9
/*
   Platform switches for the optional tables of this file:
   NOCHARNAMES leaves out "charname.t" and "charseqs.t",
   NODECOMPOSE leaves out "decompos.t".
   Turbo C and VAX set both, DJGPP sets NOCHARNAMES only.
 */
#if defined (__TURBOC__) || defined (VAX)
#define NOCHARNAMES
#define NODECOMPOSE
#endif
#ifdef __DJGPP__
#define NOCHARNAMES
#endif
19
20
21 /*======================================================================*\
22 |* Character properties *|
23 \*======================================================================*/
24
25 #include "scriptdf.t"
26
27 static struct scriptentry scripttable [] = {
28 /* this is checked also for word moves and identifier searches... */
29 #include "scripts.t"
30 };
31
32
33 #define compact_charname_t
34
35 #ifdef compact_charname_t
36 struct charnameentry {
37 char * charname_e;
38 };
39 #else
40 struct charnameentry {
41 unsigned long u;
42 char * charname;
43 };
44 #endif
45
46 static struct charnameentry charnametable [] = {
47 #ifdef NOCHARNAMES
48 # ifdef compact_charname_t
49 {"\x00\x00\x00"},
50 # else
51 {0, ""},
52 # endif
53 #else
54 #include "charname.t"
55 #endif
56 };
57
58 static struct charseqentry {
59 char * name;
60 unsigned long u [4];
61 } charseqtable [] = {
62 #ifdef NOCHARNAMES
63 {"", 0},
64 #else
65 #include "charseqs.t"
66 #endif
67 };
68
69
/*
   Kinds of character decomposition.
   The values are used as index into decomposition_type [] below,
   so both lists must be kept in the same order.
 */
typedef enum {
  decomp_canonical = 0,
  decomp_circle,
  decomp_compat,
  decomp_final,
  decomp_font,
  decomp_fraction,
  decomp_initial,
  decomp_isolated,
  decomp_medial,
  decomp_narrow,
  decomp_noBreak,
  decomp_small,
  decomp_square,
  decomp_sub,
  decomp_super,
  decomp_vertical,
  decomp_wide
} decomposetype;

/*
   Decomposition table entry: character, kind of decomposition,
   and the decomposition itself (0-terminated unless all 18 slots are used).
 */
struct decomposeentry {
  unsigned long u;
  decomposetype decomposition_type;
  unsigned long decomposition_mapping [18];
};

/*
   Display prefix for each decomposetype value;
   used by decomposition_string ().
 */
static char * decomposition_type [] = {
  "",                           /* decomp_canonical */
  " <encircled>",               /* decomp_circle */
  " <compatibility>",           /* decomp_compat */
  " <final form>",              /* decomp_final */
  " <font variant>",            /* decomp_font */
  " <fraction>",                /* decomp_fraction */
  " <initial form>",            /* decomp_initial */
  " <isolated form>",           /* decomp_isolated */
  " <medial form>",             /* decomp_medial */
  " <narrow/hankaku>",          /* decomp_narrow */
  " <no-break>",                /* decomp_noBreak */
  " <small variant>",           /* decomp_small */
  " <squared font variant>",    /* decomp_square */
  " <subscript>",               /* decomp_sub */
  " <superscript>",             /* decomp_super */
  " <vertical layout form>",    /* decomp_vertical */
  " <wide/zenkaku>",            /* decomp_wide */
};
113 static struct decomposeentry decomposetable [] = {
114 #ifdef NODECOMPOSE
115 {0, 0, 0},
116 #else
117 #include "decompos.t"
118 #endif
119 };
120
121
122 static char decomposition_str [maxMSGlen];
123
124 /*
125 Lookup character decomposition of Unicode character.
126 */
127 static
128 unsigned long *
decomposition_lookup(ucs,typepoi)129 decomposition_lookup (ucs, typepoi)
130 unsigned long ucs;
131 decomposetype * typepoi;
132 {
133 int min = 0;
134 int max = sizeof (decomposetable) / sizeof (struct decomposeentry) - 1;
135 int mid;
136
137 /* binary search in table */
138 while (max >= min) {
139 mid = (min + max) / 2;
140 if (decomposetable [mid].u < ucs) {
141 min = mid + 1;
142 } else if (decomposetable [mid].u > ucs) {
143 max = mid - 1;
144 } else {
145 decomposetype t = decomposetable [mid].decomposition_type;
146 if (t >= arrlen (decomposition_type)
147 #ifndef __clang__
148 /* clang would whine about 'tautological-compare' here
149 but that is not true for all C compilers (e.g. Sun Studio),
150 so let's be defensive */
151 || t < 0
152 #endif
153 ) {
154 return 0;
155 } else {
156 * typepoi = t;
157 return decomposetable [mid].decomposition_mapping;
158 }
159 }
160 }
161
162 return 0;
163 }
164
165 /*
166 Determine character decomposition of Unicode character.
167 Return as string.
168 */
169 char *
decomposition_string(ucs)170 decomposition_string (ucs)
171 unsigned long ucs;
172 {
173 decomposetype type;
174 unsigned long * decomp = decomposition_lookup (ucs, & type);
175 if (decomp == 0) {
176 return 0;
177 } else {
178 int i = 0;
179
180 strcpy (decomposition_str, decomposition_type [type]);
181
182 while (i < arrlen (decomposetable [0].decomposition_mapping) && decomp [i]) {
183 char su [9];
184 build_string (su, " U+%04lX", decomp [i]);
185 strcat (decomposition_str, su);
186 i ++;
187 }
188
189 return decomposition_str;
190 }
191 }
192
193 /*
194 Determine character decomposition base character of Unicode character.
195 */
196 unsigned long
decomposition_base(ucs)197 decomposition_base (ucs)
198 unsigned long ucs;
199 {
200 decomposetype type;
201 unsigned long * decomp = decomposition_lookup (ucs, & type);
202 if (decomp == 0) {
203 if (ucs == (unsigned char) '�') {
204 return 's';
205 } else {
206 return 0;
207 }
208 } else {
209 if (decomp [0] == 0x28 && decomp [1]) {
210 /* PARENTHESIZED form */
211 return decomp [1];
212 } else {
213 return decomp [0];
214 }
215 }
216 }
217
218
219 /*
220 Determine Unicode named sequence.
221 */
222 char *
charseqname(u0,follow,lenpoi,seqpoi)223 charseqname (u0, follow, lenpoi, seqpoi)
224 unsigned long u0;
225 char * follow;
226 int * lenpoi;
227 unsigned long * * seqpoi;
228 {
229 int i;
230 unsigned long u1 = CHAR_UNKNOWN;
231 unsigned long u2 = CHAR_UNKNOWN;
232 unsigned long u3 = CHAR_UNKNOWN;
233 for (i = 0; i < arrlen (charseqtable); i ++) {
234 if (u0 == charseqtable [i].u [0]) {
235 * seqpoi = charseqtable [i].u;
236 if (u1 == CHAR_UNKNOWN && * follow && * follow != '\n') {
237 u1 = unicodevalue (follow);
238 advance_char (& follow);
239 if (* follow && * follow != '\n') {
240 u2 = unicodevalue (follow);
241 advance_char (& follow);
242 }
243 if (* follow && * follow != '\n') {
244 u3 = unicodevalue (follow);
245 advance_char (& follow);
246 }
247 }
248 if (u1 == charseqtable [i].u [1]) {
249 if (! charseqtable [i].u [2]) {
250 * lenpoi = 2;
251 return charseqtable [i].name;
252 } else if (u2 == charseqtable [i].u [2]) {
253 if (! charseqtable [i].u [3]) {
254 * lenpoi = 3;
255 return charseqtable [i].name;
256 } else if (u3 == charseqtable [i].u [3]) {
257 * lenpoi = 4;
258 return charseqtable [i].name;
259 }
260 }
261 }
262 }
263 }
264 return NIL_PTR;
265 }
266
267 /*
268 Determine character name of Unicode character.
269 */
270 char *
charname(ucs)271 charname (ucs)
272 unsigned long ucs;
273 {
274 int min = 0;
275 int max = sizeof (charnametable) / sizeof (struct charnameentry) - 1;
276 int mid;
277
278 /* binary search in table */
279 while (max >= min) {
280 unsigned long midu;
281 #ifdef compact_charname_t
282 unsigned char * mide;
283 mid = (min + max) / 2;
284 mide = (unsigned char *) charnametable [mid].charname_e;
285 midu = (mide [0] << 16) + (mide [1] << 8) + mide [2];
286 #else
287 mid = (min + max) / 2;
288 midu = charnametable [mid].u;
289 #endif
290 if (midu < ucs) {
291 min = mid + 1;
292 } else if (midu > ucs) {
293 max = mid - 1;
294 } else {
295 #ifdef compact_charname_t
296 return (char *) mide + 3;
297 #else
298 return charnametable [mid].charname;
299 #endif
300 }
301 }
302
303 return 0;
304 }
305
306 /*
307 Determine script info of Unicode character according to script range table.
308 */
309 struct scriptentry *
scriptinfo(ucs)310 scriptinfo (ucs)
311 unsigned long ucs;
312 {
313 int min = 0;
314 int max = sizeof (scripttable) / sizeof (struct scriptentry) - 1;
315 int mid;
316
317 /* binary search in table */
318 while (max >= min) {
319 mid = (min + max) / 2;
320 if (scripttable [mid].last < ucs) {
321 min = mid + 1;
322 } else if (scripttable [mid].first > ucs) {
323 max = mid - 1;
324 } else if (scripttable [mid].first <= ucs && scripttable [mid].last >= ucs) {
325 return & scripttable [mid];
326 }
327 }
328
329 return 0;
330 }
331
332 char *
script(ucs)333 script (ucs)
334 unsigned long ucs;
335 {
336 struct scriptentry * se = scriptinfo (ucs);
337 if (se) {
338 return category_names [se->scriptname];
339 } else {
340 return "";
341 }
342 }
343
344 char *
category(ucs)345 category (ucs)
346 unsigned long ucs;
347 {
348 struct scriptentry * se = scriptinfo (ucs);
349 if (se) {
350 return category_names [se->categoryname];
351 } else {
352 return "";
353 }
354 }
355
356
/*
   Check whether a Unicode character lies in one of the listed
   right-to-left ranges or is a right-to-left mark.
   Returns 1 if so, 0 otherwise.
   With RLmarks defined, the right-to-left embedding and override
   characters are included.
 */
int
is_right_to_left (ucs)
  unsigned long ucs;
{
  /* quick exit: no listed range starts below the Hebrew block */
  if (ucs < 0x0590) {
    return 0;
  }

  if (ucs >= 0x0590 && ucs <= 0x05FF) {	/* Hebrew */
    return 1;
  }
  if (ucs >= 0xFB1D && ucs <= 0xFB4F) {	/* Hebrew presentation forms */
    return 1;
  }
  if (ucs >= 0x0600 && ucs <= 0x07FF) {	/* Arabic, Syriac, Thaana, NKo */
    return 1;
  }
  if (ucs >= 0xFB50 && ucs <= 0xFDFF) {	/* Arabic presentation forms A */
    return 1;
  }
  if (ucs >= 0xFE70 && ucs <= 0xFEFF) {	/* Arabic presentation forms B */
    return 1;
  }
  if (ucs >= 0x0800 && ucs <= 0x08FF) {	/* Samaritan, Mandaic, ..., Arabic Ext-A */
    return 1;
  }
  if (ucs == 0x200F) {	/* right-to-left mark */
    return 1;
  }
#ifdef RLmarks
  if (ucs == 0x202B) {	/* right-to-left embedding */
    return 1;
  }
  if (ucs == 0x202E) {	/* right-to-left override */
    return 1;
  }
#endif
  if (ucs >= 0x10800 && ucs <= 0x10FFF) {
    return 1;
  }
  if (ucs >= 0x1E800 && ucs <= 0x1EFFF) {	/* ..., Arabic Mathematical, ... */
    return 1;
  }

  return 0;
}
380
381
382 struct hanentry *
lookup_handescr(unichar)383 lookup_handescr (unichar)
384 unsigned long unichar;
385 {
386 int min = 0;
387 int max = hantable_len - 1;
388 int mid;
389
390 /* binary search in table */
391 while (max >= min) {
392 mid = (min + max) / 2;
393 if (hantable [mid].unicode < unichar) {
394 min = mid + 1;
395 } else if (hantable [mid].unicode > unichar) {
396 max = mid - 1;
397 } else {
398 /* construct struct hanentry from raw_hanentry */
399 static struct hanentry han;
400 char * text = hantable [mid].text;
401
402 han.unicode = hantable [mid].unicode;
403 han.Mandarin = text;
404 text += strlen (text) + 1;
405 han.Cantonese = text;
406 text += strlen (text) + 1;
407 han.Japanese = text;
408 text += strlen (text) + 1;
409 han.Sino_Japanese = text;
410 text += strlen (text) + 1;
411 han.Hangul = text;
412 text += strlen (text) + 1;
413 han.Korean = text;
414 text += strlen (text) + 1;
415 han.Vietnamese = text;
416 text += strlen (text) + 1;
417 han.HanyuPinlu = text;
418 text += strlen (text) + 1;
419 han.HanyuPinyin = text;
420 text += strlen (text) + 1;
421 han.XHCHanyuPinyin = text;
422 text += strlen (text) + 1;
423 han.Tang = text;
424 text += strlen (text) + 1;
425 han.Definition = text;
426
427 return & han;
428 }
429 }
430 return 0;
431 }
432
433
434 FLAG
is_bullet_or_dash(unich)435 is_bullet_or_dash (unich)
436 unsigned long unich;
437 {
438 char * chname = charname (unich);
439 if (unich == 0xB7) { /* MIDDLE DOT */
440 return True;
441 }
442 if (unich == 0x2015) { /* HORIZONTAL BAR / QUOTATION DASH */
443 return True;
444 }
445 if (chname != NIL_PTR) {
446 char * bull = strstr (chname, "BULLET");
447 if (bull != NIL_PTR && strlen (bull) == 6) {
448 return True;
449 }
450 bull = strstr (chname, "DASH");
451 if (bull != NIL_PTR && strlen (bull) == 4) {
452 return True;
453 }
454 }
455 return False;
456 }
457
458
459 /*======================================================================*\
460 |* Some Unicode character properties *|
461 \*======================================================================*/
462
/* Closed range first..last of character codes. */
struct interval {
  unsigned long first;
  unsigned long last;
};

/*
   Check whether a character is contained in a table of intervals.
   ucs:    the character
   table:  intervals; the binary search requires them to be sorted
           ascending and non-overlapping
   len:    number of intervals in table
   Returns 1 if ucs lies within one of the intervals, 0 otherwise
   (also for an empty table).
 */
static
int
lookup (ucs, table, len)
  unsigned long ucs;
  struct interval * table;
  int len;
{
  int min = 0;
  int mid;
  int max = len - 1;

  /* an empty table contains nothing; without this check,
     the range test below would access table [-1] */
  if (len <= 0) {
    return 0;
  }

  /* quick exit for characters outside the overall range */
  if (ucs < table [0].first || ucs > table [max].last) {
    return 0;
  }

  /* binary search in table */
  while (max >= min) {
    mid = min + (max - min) / 2;
    if (ucs > table [mid].last) {
      min = mid + 1;
    } else if (ucs < table [mid].first) {
      max = mid - 1;
    } else {
      return 1;
    }
  }

  return 0;
}
495
496
497 FLAG
isLetter(unichar)498 isLetter (unichar)
499 unsigned long unichar;
500 {
501 char * cat = category (unichar);
502 return streq (cat, "Letter");
503 }
504
505 /* struct interval list_wide [] */
506 #include "wide.t"
507
508 FLAG
is_wideunichar(ucs)509 is_wideunichar (ucs)
510 unsigned long ucs;
511 {
512 return lookup (ucs, list_wide, arrlen (list_wide));
513 }
514
515
516 struct caseconv_entry caseconv_table [] = {
517 #include "casetabl.t"
518 };
519
520 #define caseconv_table_size (sizeof (caseconv_table) / sizeof (* caseconv_table))
521
522
523 struct caseconv_special_entry caseconv_special [] = {
524 #include "casespec.t"
525 };
526
527 #define caseconv_special_size (sizeof (caseconv_special) / sizeof (* caseconv_special))
528
529
530 /* struct interval list_Soft_Dotted [] */
531 #include "softdot.t"
532
533
534 static struct {
535 unsigned long first;
536 unsigned long last;
537 char category;
538 short combining_class;
539 } combining_classes [] = {
540 #include "combin.t"
541 };
542
543 /**
544 Look up combining class;
545 return: if not found, -1
546 if category is "Spacing Combining" (Mc): -1 - combining class
547 else combining class
548 */
549 static
550 int
combining_class(ucs)551 combining_class (ucs)
552 unsigned long ucs;
553 {
554 int min = 0;
555 int mid;
556 int max = arrlen (combining_classes) - 1;
557
558 if (ucs < combining_classes [0].first) {
559 return -1;
560 }
561 while (max >= min) {
562 mid = (min + max) / 2;
563 if (ucs > combining_classes [mid].last) {
564 min = mid + 1;
565 } else if (ucs < combining_classes [mid].first) {
566 max = mid - 1;
567 } else {
568 if (combining_classes [mid].category == 'c') {
569 return -2 - combining_classes [mid].combining_class;
570 } else {
571 return combining_classes [mid].combining_class;
572 }
573 }
574 }
575 return -1;
576 }
577
578 FLAG
iscombining_unichar(ucs)579 iscombining_unichar (ucs)
580 unsigned long ucs;
581 {
582 #ifdef spacingcombining_isnt_combining
583 return combining_class (ucs) >= 0;
584 #else
585 return combining_class (ucs) != -1;
586 #endif
587 }
588
589 FLAG
isspacingcombining_unichar(ucs)590 isspacingcombining_unichar (ucs)
591 unsigned long ucs;
592 {
593 return combining_class (ucs) <= -2;
594 }
595
596 int
soft_dotted(ucs)597 soft_dotted (ucs)
598 unsigned long ucs;
599 {
600 return lookup (ucs, list_Soft_Dotted, arrlen (list_Soft_Dotted));
601 }
602
603
604 int
lookup_caseconv(basechar)605 lookup_caseconv (basechar)
606 unsigned long basechar;
607 {
608 int low = 0;
609 int high = caseconv_table_size - 1;
610 int i;
611
612 while (low <= high) {
613 i = (low + high) / 2;
614 if (caseconv_table [i].base == basechar) {
615 return i;
616 } else if (caseconv_table [i].base >= basechar) {
617 high = i - 1;
618 } else {
619 low = i + 1;
620 }
621 }
622 /* notify "not found" */
623 return -1;
624 }
625
626 /**
627 case_convert converts a Unicode character to
628 +2: title case
629 +1: upper case
630 -1: lower case
631 */
632 unsigned long
case_convert(unichar,dir)633 case_convert (unichar, dir)
634 unsigned long unichar;
635 int dir;
636 {
637 int tabix = lookup_caseconv (unichar);
638
639 if (tabix >= 0) {
640 if (dir == 2 && caseconv_table [tabix].title != 0) {
641 return caseconv_table [tabix].title;
642 } else if (dir > 0 && caseconv_table [tabix].toupper != 0) {
643 return unichar + caseconv_table [tabix].toupper;
644 } else if (dir < 0 && caseconv_table [tabix].tolower != 0) {
645 return unichar + caseconv_table [tabix].tolower;
646 }
647 }
648
649 return unichar;
650 }
651
652 int
lookup_caseconv_special(basechar,langcond)653 lookup_caseconv_special (basechar, langcond)
654 unsigned long basechar;
655 short langcond;
656 {
657 int i;
658 #ifdef caseconvsearch_uncond
659 int low = 0;
660 int high = caseconv_special_size - 1;
661
662 /* plain binary search is not applicable as keys are ambiguous */
663 while (low <= high) {
664 i = (low + high) / 2;
665 if (caseconv_special [i].base == basechar) {
666 return i;
667 } else if (caseconv_special [i].base >= basechar) {
668 high = i - 1;
669 } else {
670 low = i + 1;
671 }
672 }
673 #else
674 for (i = 0; i < caseconv_special_size; i ++) {
675 if (caseconv_special [i].base == basechar) {
676 short langcondi = caseconv_special [i].condition & U_conds_lang;
677 if (langcondi == 0 || (langcondi & langcond)) {
678 return i;
679 }
680 }
681 }
682 #endif
683 /* notify "not found" */
684 return -1;
685 }
686
/*
   Check whether a character has a positive combining class
   other than 230.
 */
int
iscombining_notabove (unichar)
  unsigned long unichar;
{
  int cc = combining_class (unichar);

  if (cc <= 0) {
    return 0;
  }
  return cc != 230;
}
694
/*
   Check whether a character has combining class 230.
 */
int
iscombining_above (unichar)
  unsigned long unichar;
{
  int cc = combining_class (unichar);

  return cc == 230;
}
701
702
703 /*======================================================================*\
704 |* End *|
705 \*======================================================================*/
706