1 /*======================================================================*\
2 |*		Editor mined						*|
3 |*		Character properties					*|
4 \*======================================================================*/
5 
6 #include "mined.h"
7 #include "charprop.h"
8 
9 
10 /* #include "charname.t" ? */
11 #if defined (__TURBOC__) || defined (VAX) || defined (__DJGPP__)
12 #define NOCHARNAMES
13 #endif
14 
15 /* #include "decompos.t" ? */
16 #if defined (__TURBOC__) || defined (VAX)
17 #define NODECOMPOSE
18 #endif
19 
20 
21 /*======================================================================*\
22 |*			Character properties				*|
23 \*======================================================================*/
24 
25 #include "scriptdf.t"
26 
/* Table of script ranges; its entries come from the included file
   "scripts.t". scriptinfo () binary-searches it by the first/last
   members of each entry, so the entries have to be sorted ascending
   and must not overlap. */
static struct scriptentry scripttable [] = {
/* this is checked also for word moves and identifier searches... */
#include "scripts.t"
};
31 
32 
/* Select the compact layout of the character name table:
   each entry is one string whose first 3 bytes hold the code point
   (most significant byte first) and whose remainder is the name;
   this is how charname () decodes it. */
#define compact_charname_t

#ifdef compact_charname_t
struct charnameentry {
	/* 3 bytes of code point followed by the NUL-terminated name */
	char * charname_e;
};
#else
struct charnameentry {
	/* code point */
	unsigned long u;
	/* character name */
	char * charname;
};
#endif

/* Character name table, binary-searched by charname ();
   reduced to a single placeholder entry for code point 0
   if NOCHARNAMES is defined. */
static struct charnameentry charnametable [] = {
#ifdef NOCHARNAMES
# ifdef compact_charname_t
	{"\x00\x00\x00"},
# else
	{0, ""},
# endif
#else
#include "charname.t"
#endif
};
57 
/* Table of named character sequences, scanned linearly by charseqname ().
   name: name of the sequence
   u: up to 4 code points of the sequence; charseqname () treats a zero
      in u [2] or u [3] as the end of a shorter sequence.
   Reduced to a single placeholder entry if NOCHARNAMES is defined. */
static struct charseqentry {
	char * name;
	unsigned long u [4];
} charseqtable [] = {
#ifdef NOCHARNAMES
	{"", 0},
#else
#include "charseqs.t"
#endif
};
68 
69 
/* Kind of a character decomposition.
   The values are used as index into the decomposition_type [] string
   table, so the order here must stay in sync with that table. */
typedef enum {
	decomp_canonical,
	decomp_circle,
	decomp_compat,
	decomp_final,
	decomp_font,
	decomp_fraction,
	decomp_initial,
	decomp_isolated,
	decomp_medial,
	decomp_narrow,
	decomp_noBreak,
	decomp_small,
	decomp_square,
	decomp_sub,
	decomp_super,
	decomp_vertical,
	decomp_wide
} decomposetype;
/* One entry of the decomposition table. */
struct decomposeentry {
	/* code point of the decomposable character (search key) */
	unsigned long u;
	/* kind of decomposition */
	decomposetype decomposition_type;
	/* code points of the decomposition; readers stop at the first
	   zero element or at the end of the array */
	unsigned long decomposition_mapping [18];
};
/* Display prefix for each decomposition kind, indexed by decomposetype;
   decomposition_string () puts it in front of the code point list. */
static char * decomposition_type [] = {
	/* decomp_canonical */	"",
	/* decomp_circle */	" <encircled>",
	/* decomp_compat */	" <compatibility>",
	/* decomp_final */	" <final form>",
	/* decomp_font */	" <font variant>",
	/* decomp_fraction */	" <fraction>",
	/* decomp_initial */	" <initial form>",
	/* decomp_isolated */	" <isolated form>",
	/* decomp_medial */	" <medial form>",
	/* decomp_narrow */	" <narrow/hankaku>",
	/* decomp_noBreak */	" <no-break>",
	/* decomp_small */	" <small variant>",
	/* decomp_square */	" <squared font variant>",
	/* decomp_sub */	" <subscript>",
	/* decomp_super */	" <superscript>",
	/* decomp_vertical */	" <vertical layout form>",
	/* decomp_wide */	" <wide/zenkaku>",
};
/* Decomposition table, binary-searched by decomposition_lookup ()
   on the member u, so the included entries must be sorted by u;
   reduced to a single all-zero placeholder entry if NODECOMPOSE
   is defined. */
static struct decomposeentry decomposetable [] = {
#ifdef NODECOMPOSE
	{0, 0, 0},
#else
#include "decompos.t"
#endif
};
120 
121 
/* static result buffer of decomposition_string ();
   overwritten by each call that finds a decomposition */
static char decomposition_str [maxMSGlen];
123 
124 /*
125    Lookup character decomposition of Unicode character.
126  */
127 static
128 unsigned long *
decomposition_lookup(ucs,typepoi)129 decomposition_lookup (ucs, typepoi)
130   unsigned long ucs;
131   decomposetype * typepoi;
132 {
133   int min = 0;
134   int max = sizeof (decomposetable) / sizeof (struct decomposeentry) - 1;
135   int mid;
136 
137   /* binary search in table */
138   while (max >= min) {
139     mid = (min + max) / 2;
140     if (decomposetable [mid].u < ucs) {
141       min = mid + 1;
142     } else if (decomposetable [mid].u > ucs) {
143       max = mid - 1;
144     } else {
145 	decomposetype t = decomposetable [mid].decomposition_type;
146 	if (t >= arrlen (decomposition_type)
147 #ifndef __clang__
148 	/* clang would whine about 'tautological-compare' here
149 	   but that is not true for all C compilers (e.g. Sun Studio),
150 	   so let's be defensive */
151 	   || t < 0
152 #endif
153 	   ) {
154 		return 0;
155 	} else {
156 		* typepoi = t;
157 		return decomposetable [mid].decomposition_mapping;
158 	}
159     }
160   }
161 
162   return 0;
163 }
164 
165 /*
166    Determine character decomposition of Unicode character.
167    Return as string.
168  */
169 char *
decomposition_string(ucs)170 decomposition_string (ucs)
171   unsigned long ucs;
172 {
173   decomposetype type;
174   unsigned long * decomp = decomposition_lookup (ucs, & type);
175   if (decomp == 0) {
176 	return 0;
177   } else {
178 	int i = 0;
179 
180 	strcpy (decomposition_str, decomposition_type [type]);
181 
182 	while (i < arrlen (decomposetable [0].decomposition_mapping) && decomp [i]) {
183 		char su [9];
184 		build_string (su, " U+%04lX", decomp [i]);
185 		strcat (decomposition_str, su);
186 		i ++;
187 	}
188 
189 	return decomposition_str;
190   }
191 }
192 
193 /*
194    Determine character decomposition base character of Unicode character.
195  */
196 unsigned long
decomposition_base(ucs)197 decomposition_base (ucs)
198   unsigned long ucs;
199 {
200   decomposetype type;
201   unsigned long * decomp = decomposition_lookup (ucs, & type);
202   if (decomp == 0) {
203 	if (ucs == (unsigned char) '�') {
204 		return 's';
205 	} else {
206 		return 0;
207 	}
208   } else {
209 	if (decomp [0] == 0x28 && decomp [1]) {
210 		/* PARENTHESIZED form */
211 		return decomp [1];
212 	} else {
213 		return decomp [0];
214 	}
215   }
216 }
217 
218 
219 /*
220    Determine Unicode named sequence.
221  */
222 char *
charseqname(u0,follow,lenpoi,seqpoi)223 charseqname (u0, follow, lenpoi, seqpoi)
224   unsigned long u0;
225   char * follow;
226   int * lenpoi;
227   unsigned long * * seqpoi;
228 {
229   int i;
230   unsigned long u1 = CHAR_UNKNOWN;
231   unsigned long u2 = CHAR_UNKNOWN;
232   unsigned long u3 = CHAR_UNKNOWN;
233   for (i = 0; i < arrlen (charseqtable); i ++) {
234 	if (u0 == charseqtable [i].u [0]) {
235 		* seqpoi = charseqtable [i].u;
236 		if (u1 == CHAR_UNKNOWN && * follow && * follow != '\n') {
237 			u1 = unicodevalue (follow);
238 			advance_char (& follow);
239 			if (* follow && * follow != '\n') {
240 				u2 = unicodevalue (follow);
241 				advance_char (& follow);
242 			}
243 			if (* follow && * follow != '\n') {
244 				u3 = unicodevalue (follow);
245 				advance_char (& follow);
246 			}
247 		}
248 		if (u1 == charseqtable [i].u [1]) {
249 			if (! charseqtable [i].u [2]) {
250 				* lenpoi = 2;
251 				return charseqtable [i].name;
252 			} else if (u2 == charseqtable [i].u [2]) {
253 				if (! charseqtable [i].u [3]) {
254 					* lenpoi = 3;
255 					return charseqtable [i].name;
256 				} else if (u3 == charseqtable [i].u [3]) {
257 					* lenpoi = 4;
258 					return charseqtable [i].name;
259 				}
260 			}
261 		}
262 	}
263   }
264   return NIL_PTR;
265 }
266 
267 /*
268    Determine character name of Unicode character.
269  */
270 char *
charname(ucs)271 charname (ucs)
272   unsigned long ucs;
273 {
274   int min = 0;
275   int max = sizeof (charnametable) / sizeof (struct charnameentry) - 1;
276   int mid;
277 
278   /* binary search in table */
279   while (max >= min) {
280     unsigned long midu;
281 #ifdef compact_charname_t
282     unsigned char * mide;
283     mid = (min + max) / 2;
284     mide = (unsigned char *) charnametable [mid].charname_e;
285     midu = (mide [0] << 16) + (mide [1] << 8) + mide [2];
286 #else
287     mid = (min + max) / 2;
288     midu = charnametable [mid].u;
289 #endif
290     if (midu < ucs) {
291       min = mid + 1;
292     } else if (midu > ucs) {
293       max = mid - 1;
294     } else {
295 #ifdef compact_charname_t
296       return (char *) mide + 3;
297 #else
298       return charnametable [mid].charname;
299 #endif
300     }
301   }
302 
303   return 0;
304 }
305 
306 /*
307    Determine script info of Unicode character according to script range table.
308  */
309 struct scriptentry *
scriptinfo(ucs)310 scriptinfo (ucs)
311   unsigned long ucs;
312 {
313   int min = 0;
314   int max = sizeof (scripttable) / sizeof (struct scriptentry) - 1;
315   int mid;
316 
317   /* binary search in table */
318   while (max >= min) {
319     mid = (min + max) / 2;
320     if (scripttable [mid].last < ucs) {
321       min = mid + 1;
322     } else if (scripttable [mid].first > ucs) {
323       max = mid - 1;
324     } else if (scripttable [mid].first <= ucs && scripttable [mid].last >= ucs) {
325       return & scripttable [mid];
326     }
327   }
328 
329   return 0;
330 }
331 
332 char *
script(ucs)333 script (ucs)
334   unsigned long ucs;
335 {
336   struct scriptentry * se = scriptinfo (ucs);
337   if (se) {
338 	return category_names [se->scriptname];
339   } else {
340 	return "";
341   }
342 }
343 
344 char *
category(ucs)345 category (ucs)
346   unsigned long ucs;
347 {
348   struct scriptentry * se = scriptinfo (ucs);
349   if (se) {
350 	return category_names [se->categoryname];
351   } else {
352 	return "";
353   }
354 }
355 
356 
/*
   Check whether a Unicode character lies in one of the code ranges
   that are handled as right-to-left.
   return: 1 if so, 0 otherwise
 */
int
is_right_to_left (ucs)
  unsigned long ucs;
{
  static struct {
	unsigned long first;
	unsigned long last;
  } rtl_ranges [] = {
	{0x0590, 0x05FF},	/* Hebrew */
	{0xFB1D, 0xFB4F},	/* Hebrew presentation forms */
	{0x0600, 0x07FF},	/* Arabic, Syriac, Thaana, NKo */
	{0xFB50, 0xFDFF},	/* Arabic presentation forms A */
	{0xFE70, 0xFEFF},	/* Arabic presentation forms B */
	{0x0800, 0x08FF},	/* Samaritan, Mandaic, ..., Arabic Ext-A */
	{0x200F, 0x200F},	/* right-to-left mark */
#ifdef RLmarks
	{0x202B, 0x202B},	/* right-to-left embedding */
	{0x202E, 0x202E},	/* right-to-left override */
#endif
	{0x10800, 0x10FFF},
	{0x1E800, 0x1EFFF},	/* ..., Arabic Mathematical, ... */
  };
  unsigned int r;

  /* quick exit for everything below the first range */
  if (ucs < 0x0590) {
	return 0;
  }

  for (r = 0; r < sizeof (rtl_ranges) / sizeof (* rtl_ranges); r ++) {
	if (ucs >= rtl_ranges [r].first && ucs <= rtl_ranges [r].last) {
		return 1;
	}
  }
  return 0;
}
380 
381 
382 struct hanentry *
lookup_handescr(unichar)383 lookup_handescr (unichar)
384   unsigned long unichar;
385 {
386   int min = 0;
387   int max = hantable_len - 1;
388   int mid;
389 
390   /* binary search in table */
391   while (max >= min) {
392 	mid = (min + max) / 2;
393 	if (hantable [mid].unicode < unichar) {
394 		min = mid + 1;
395 	} else if (hantable [mid].unicode > unichar) {
396 		max = mid - 1;
397 	} else {
398 		/* construct struct hanentry from raw_hanentry */
399 		static struct hanentry han;
400 		char * text = hantable [mid].text;
401 
402 		han.unicode = hantable [mid].unicode;
403 		han.Mandarin = text;
404 		text += strlen (text) + 1;
405 		han.Cantonese = text;
406 		text += strlen (text) + 1;
407 		han.Japanese = text;
408 		text += strlen (text) + 1;
409 		han.Sino_Japanese = text;
410 		text += strlen (text) + 1;
411 		han.Hangul = text;
412 		text += strlen (text) + 1;
413 		han.Korean = text;
414 		text += strlen (text) + 1;
415 		han.Vietnamese = text;
416 		text += strlen (text) + 1;
417 		han.HanyuPinlu = text;
418 		text += strlen (text) + 1;
419 		han.HanyuPinyin = text;
420 		text += strlen (text) + 1;
421 		han.XHCHanyuPinyin = text;
422 		text += strlen (text) + 1;
423 		han.Tang = text;
424 		text += strlen (text) + 1;
425 		han.Definition = text;
426 
427 		return & han;
428 	}
429   }
430   return 0;
431 }
432 
433 
434 FLAG
is_bullet_or_dash(unich)435 is_bullet_or_dash (unich)
436   unsigned long unich;
437 {
438   char * chname = charname (unich);
439   if (unich == 0xB7) {	/* MIDDLE DOT */
440 	return True;
441   }
442   if (unich == 0x2015) {	/* HORIZONTAL BAR / QUOTATION DASH */
443 	return True;
444   }
445   if (chname != NIL_PTR) {
446 	char * bull = strstr (chname, "BULLET");
447 	if (bull != NIL_PTR && strlen (bull) == 6) {
448 		return True;
449 	}
450 	bull = strstr (chname, "DASH");
451 	if (bull != NIL_PTR && strlen (bull) == 4) {
452 		return True;
453 	}
454   }
455   return False;
456 }
457 
458 
459 /*======================================================================*\
460 |*			Some Unicode character properties		*|
461 \*======================================================================*/
462 
/* closed range of code points first..last */
struct interval {
    unsigned long first;
    unsigned long last;
};

/*
   Check whether a code point is contained in a table of intervals.
   ucs: code point to look up
   table: intervals, sorted ascending and not overlapping
          (required by the binary search)
   len: number of intervals in table
   return: 1 if ucs lies within one of the intervals, 0 otherwise
           (also for an empty table)
 */
static
int
lookup (ucs, table, len)
  unsigned long ucs;
  struct interval * table;
  int len;
{
  int min = 0;
  int mid;
  int max = len - 1;

  /* an empty table contains nothing; this also keeps the range
     check below from reading table [-1] */
  if (len <= 0) {
	return 0;
  }

  /* quick exit if outside the overall range of the table */
  if (ucs < table [0].first || ucs > table [max].last) {
	return 0;
  }
  /* binary search in table */
  while (max >= min) {
	mid = (min + max) / 2;
	if (ucs > table [mid].last) {
		min = mid + 1;
	} else if (ucs < table [mid].first) {
		max = mid - 1;
	} else {
		return 1;
	}
  }

  return 0;
}
495 
496 
497 FLAG
isLetter(unichar)498 isLetter (unichar)
499   unsigned long unichar;
500 {
501 	char * cat = category (unichar);
502 	return streq (cat, "Letter");
503 }
504 
505 /* struct interval list_wide [] */
506 #include "wide.t"
507 
508 FLAG
is_wideunichar(ucs)509 is_wideunichar (ucs)
510   unsigned long ucs;
511 {
512   return lookup (ucs, list_wide, arrlen (list_wide));
513 }
514 
515 
/* Case conversion table; its entries come from the included file
   "casetabl.t". lookup_caseconv () binary-searches it on the member
   base, so the entries must be sorted by base. */
struct caseconv_entry caseconv_table [] = {
#include "casetabl.t"
};

/* number of entries in caseconv_table */
#define caseconv_table_size	(sizeof (caseconv_table) / sizeof (* caseconv_table))
521 
522 
/* Table of special case conversions; its entries come from the
   included file "casespec.t". A base character may occur in more than
   one entry (see lookup_caseconv_special ()). */
struct caseconv_special_entry caseconv_special [] = {
#include "casespec.t"
};

/* number of entries in caseconv_special */
#define caseconv_special_size	(sizeof (caseconv_special) / sizeof (* caseconv_special))
528 
529 
530 /* struct interval list_Soft_Dotted [] */
531 #include "softdot.t"
532 
533 
/* Table of combining character ranges; its entries come from the
   included file "combin.t". combining_class () binary-searches it,
   so the ranges must be sorted ascending and must not overlap. */
static struct {
    /* range of code points first..last */
    unsigned long first;
    unsigned long last;
    /* category tag; combining_class () handles 'c' specially
       (presumably marking Spacing Combining, Mc) */
    char category;
    /* combining class of the characters in the range */
    short combining_class;
} combining_classes [] = {
#include "combin.t"
};
542 
/**
   Look up combining class;
   return: if not found, -1
           if category is "Spacing Combining" (Mc): -2 - combining class
           else combining class
 */
549 static
550 int
combining_class(ucs)551 combining_class (ucs)
552   unsigned long ucs;
553 {
554   int min = 0;
555   int mid;
556   int max = arrlen (combining_classes) - 1;
557 
558   if (ucs < combining_classes [0].first) {
559 	return -1;
560   }
561   while (max >= min) {
562 	mid = (min + max) / 2;
563 	if (ucs > combining_classes [mid].last) {
564 		min = mid + 1;
565 	} else if (ucs < combining_classes [mid].first) {
566 		max = mid - 1;
567 	} else {
568 		if (combining_classes [mid].category == 'c') {
569 			return -2 - combining_classes [mid].combining_class;
570 		} else {
571 			return combining_classes [mid].combining_class;
572 		}
573 	}
574   }
575   return -1;
576 }
577 
578 FLAG
iscombining_unichar(ucs)579 iscombining_unichar (ucs)
580   unsigned long ucs;
581 {
582 #ifdef spacingcombining_isnt_combining
583   return combining_class (ucs) >= 0;
584 #else
585   return combining_class (ucs) != -1;
586 #endif
587 }
588 
589 FLAG
isspacingcombining_unichar(ucs)590 isspacingcombining_unichar (ucs)
591   unsigned long ucs;
592 {
593   return combining_class (ucs) <= -2;
594 }
595 
596 int
soft_dotted(ucs)597 soft_dotted (ucs)
598   unsigned long ucs;
599 {
600   return lookup (ucs, list_Soft_Dotted, arrlen (list_Soft_Dotted));
601 }
602 
603 
604 int
lookup_caseconv(basechar)605 lookup_caseconv (basechar)
606   unsigned long basechar;
607 {
608 	int low = 0;
609 	int high = caseconv_table_size - 1;
610 	int i;
611 
612 	while (low <= high) {
613 		i = (low + high) / 2;
614 		if (caseconv_table [i].base == basechar) {
615 			return i;
616 		} else if (caseconv_table [i].base >= basechar) {
617 			high = i - 1;
618 		} else {
619 			low = i + 1;
620 		}
621 	}
622 	/* notify "not found" */
623 	return -1;
624 }
625 
626 /**
627    case_convert converts a Unicode character to
628    +2: title case
629    +1: upper case
630    -1: lower case
631  */
632 unsigned long
case_convert(unichar,dir)633 case_convert (unichar, dir)
634   unsigned long unichar;
635   int dir;
636 {
637   int tabix = lookup_caseconv (unichar);
638 
639   if (tabix >= 0) {
640 	if (dir == 2 && caseconv_table [tabix].title != 0) {
641 		return caseconv_table [tabix].title;
642 	} else if (dir > 0 && caseconv_table [tabix].toupper != 0) {
643 		return unichar + caseconv_table [tabix].toupper;
644 	} else if (dir < 0 && caseconv_table [tabix].tolower != 0) {
645 		return unichar + caseconv_table [tabix].tolower;
646 	}
647   }
648 
649   return unichar;
650 }
651 
652 int
lookup_caseconv_special(basechar,langcond)653 lookup_caseconv_special (basechar, langcond)
654   unsigned long basechar;
655   short langcond;
656 {
657   int i;
658 #ifdef caseconvsearch_uncond
659   int low = 0;
660   int high = caseconv_special_size - 1;
661 
662 	/* plain binary search is not applicable as keys are ambiguous */
663 	while (low <= high) {
664 		i = (low + high) / 2;
665 		if (caseconv_special [i].base == basechar) {
666 			return i;
667 		} else if (caseconv_special [i].base >= basechar) {
668 			high = i - 1;
669 		} else {
670 			low = i + 1;
671 		}
672 	}
673 #else
674 	for (i = 0; i < caseconv_special_size; i ++) {
675 		if (caseconv_special [i].base == basechar) {
676 			short langcondi = caseconv_special [i].condition & U_conds_lang;
677 			if (langcondi == 0 || (langcondi & langcond)) {
678 				return i;
679 			}
680 		}
681 	}
682 #endif
683 	/* notify "not found" */
684 	return -1;
685 }
686 
/*
   Check whether a Unicode character has a positive combining class
   other than 230.
   return: 1 if so, 0 otherwise
 */
int
iscombining_notabove (unichar)
  unsigned long unichar;
{
  int cc = combining_class (unichar);

  if (cc <= 0) {
	return 0;
  }
  return cc != 230;
}
694 
/*
   Check whether a Unicode character has the combining class 230.
   return: 1 if so, 0 otherwise
 */
int
iscombining_above (unichar)
  unsigned long unichar;
{
  int cc = combining_class (unichar);

  return cc == 230;
}
701 
702 
703 /*======================================================================*\
704 |*				End					*|
705 \*======================================================================*/
706