1 /*======================================================================*\
2 |* Editor mined *|
3 |* CJK character set <-> Unicode mapping tables *|
4 \*======================================================================*/
5
6 #include "mined.h"
7 #include "charcode.h"
8 #include "charprop.h"
9 #include "termprop.h"
10
11
12 /*======================================================================*\
13 |* Character values *|
14 \*======================================================================*/
15
/* Encoded values of a few basic characters in the current text encoding.
   They start out as plain ASCII values and are re-assigned via
   encodedchar () each time set_text_encoding () runs. */
unsigned char code_SPACE = ' ';
unsigned char code_TAB = '\t';
unsigned long code_LF = '\n';
/* set from encodedchar (0x85) by set_text_encoding (); invalid until then */
unsigned long code_NL = CHAR_INVALID;
20
21
22 /*======================================================================*\
23 |* Character properties *|
24 \*======================================================================*/
25
26 FLAG
no_char(c)27 no_char (c)
28 unsigned long c;
29 {
30 return c == CHAR_UNKNOWN || c == CHAR_INVALID;
31 }
32
33 FLAG
no_unichar(u)34 no_unichar (u)
35 unsigned long u;
36 {
37 return u == CHAR_UNKNOWN || u == CHAR_INVALID;
38 }
39
40 /**
41 Check if character is a control character in current encoding.
42 (Should be more generic...)
43 */
44 int
iscontrol(c)45 iscontrol (c)
46 unsigned long c;
47 {
48 if (mapped_text) {
49 unsigned long u = lookup_encodedchar (c);
50 return u == '\177' || (! no_unichar (u) && u < ' ');
51 } else if (utf8_text) {
52 if (unassigned_single_width) {
53 if (rxvt_version > 0) {
54 /* handle weird mapping of non-Unicode ranges */
55 if (c < 0x80000000) {
56 c &= 0x1FFFFF;
57 }
58 }
59 }
60 return c == '\177' || c < ' ';
61 } else if (cjk_text) {
62 return c == '\177' || c < ' ';
63 } else {
64 return c == '\177' || (c & '\177') < ' ';
65 }
66 }
67
/**
   Check if character is any of the following white space characters:
	U+0009;<control>;Cc;0;S;;;;;N;CHARACTER TABULATION;;;;
	U+0020;SPACE;Zs;0;WS;;;;;N;;;;;
	U+00A0;NO-BREAK SPACE;Zs;0;CS;<noBreak> 0020;;;;N;NON-BREAKING SPACE;;;;
	U+2002;EN SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
	U+2003;EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
	U+2004;THREE-PER-EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
	U+2005;FOUR-PER-EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
	U+2006;SIX-PER-EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
	U+2007;FIGURE SPACE;Zs;0;WS;<noBreak> 0020;;;;N;;;;;
	U+2008;PUNCTUATION SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
	U+2009;THIN SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
	U+200A;HAIR SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
	U+200B;ZERO WIDTH SPACE;Cf;0;BN;;;;;N;;;;;
	U+202F;NARROW NO-BREAK SPACE;Zs;0;CS;<noBreak> 0020;;;;N;;;;;
	U+205F;MEDIUM MATHEMATICAL SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
	U+3000;IDEOGRAPHIC SPACE;Zs;0;WS;<wide> 0020;;;;N;;;;;
	U+FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;;
   The parameter is compared against these Unicode values directly.
   Returns 1 if the character is in the list, 0 otherwise.
 */
int
iswhitespace (c)
  unsigned long c;
{
  /* contiguous run EN SPACE .. ZERO WIDTH SPACE */
  if (c >= 0x2002 && c <= 0x200B) {
    return 1;
  }

  switch (c) {
    case '\t':
    case ' ':
    case 0xA0:
    case 0x202F:
    case 0x205F:
    case 0x3000:
    case 0xFEFF:
      return 1;
    default:
      return 0;
  }
}
96
/* Closed range of character values [first, last];
   element type of the tables searched by lookup (). */
struct interval {
  unsigned long first;	/* lowest value in the range (inclusive) */
  unsigned long last;	/* highest value in the range (inclusive) */
};
101 /*
102 static struct interval list_Quotation_Mark [] =
103 static struct interval list_Dash [] =
104 ...
105 */
106 #include "typoprop.t"
107
108 static
109 int
lookup(ucs,table,length)110 lookup (ucs, table, length)
111 unsigned long ucs;
112 struct interval * table;
113 int length;
114 {
115 int min = 0;
116 int mid;
117 int max = length - 1;
118
119 if (ucs < table [0].first || ucs > table [max].last) {
120 return 0;
121 }
122 while (max >= min) {
123 mid = (min + max) / 2;
124 if (ucs > table [mid].last) {
125 min = mid + 1;
126 } else if (ucs < table [mid].first) {
127 max = mid - 1;
128 } else {
129 return 1;
130 }
131 }
132
133 return 0;
134 }
135
136 /**
137 Check if character is a quotation mark
138 */
139 int
isquotationmark(unichar)140 isquotationmark (unichar)
141 unsigned long unichar;
142 {
143 return lookup (unichar, list_Quotation_Mark, arrlen (list_Quotation_Mark));
144 }
145
146 /**
147 Check if character is a dash
148 */
149 int
isdash(unichar)150 isdash (unichar)
151 unsigned long unichar;
152 {
153 return lookup (unichar, list_Dash, arrlen (list_Dash));
154 }
155
156 /**
157 Check if character is an opening parenthesis
158 */
159 int
isopeningparenthesis(unichar)160 isopeningparenthesis (unichar)
161 unsigned long unichar;
162 {
163 return lookup (unichar, list_Ps, arrlen (list_Ps));
164 }
165
166 /**
167 Return display indication for a control character.
168 */
169 character
controlchar(c)170 controlchar (c)
171 character c;
172 {
173 if (c == '\177') {
174 return '?';
175 } else {
176 return c + '@';
177 }
178 }
179
180
/**
   Return the isolated form of an ALEF character.
   Any value other than the three marked ALEF variants yields the
   isolated form of plain ALEF (0xFE8D).
 */
unsigned long
isolated_alef (unichar)
  unsigned long unichar;
{
  switch (unichar) {
    case 0x0622:	/* ALEF WITH MADDA ABOVE */
      return 0xFE81;
    case 0x0623:	/* ALEF WITH HAMZA ABOVE */
      return 0xFE83;
    case 0x0625:	/* ALEF WITH HAMZA BELOW */
      return 0xFE87;
    case 0x0627:	/* ALEF */
    default:
      return 0xFE8D;
  }
}
204
/**
   Return the ligature with LAM of an ALEF character.
   Use the ISOLATED FORM.
   Any value other than the three marked ALEF variants yields the
   ligature of LAM with plain ALEF (0xFEFB).
 */
unsigned long
ligature_lam_alef (unichar)
  unsigned long unichar;
{
  switch (unichar) {
    case 0x0622:	/* ALEF WITH MADDA ABOVE */
      return 0xFEF5;
    case 0x0623:	/* ALEF WITH HAMZA ABOVE */
      return 0xFEF7;
    case 0x0625:	/* ALEF WITH HAMZA BELOW */
      return 0xFEF9;
    case 0x0627:	/* ALEF */
    default:
      return 0xFEFB;
  }
}
229
230
231 /**
232 Return max value in current encoding.
233 */
234 unsigned long
max_char_value()235 max_char_value ()
236 {
237 if (cjk_text) switch (text_encoding_tag) {
238 case 'G': return 0xFF39FF39;
239 case 'C': return 0x8EFFFFFF;
240 case 'J': return 0x8FFFFF;
241 case 'X': return 0x8FFFFF;
242 default: return 0xFFFF;
243 } else if (utf8_text) {
244 return 0x7FFFFFFF;
245 } else {
246 return 0xFF;
247 }
248 }
249
250
251 /**
252 Convert CJK character in current text encoding to byte sequence.
253 Terminate with NUL byte.
254 Return byte count.
255 */
256 int
cjkencode(cjkchar,buf)257 cjkencode (cjkchar, buf)
258 unsigned long cjkchar;
259 character * buf;
260 {
261 return cjkencode_char (False, cjkchar, buf);
262 }
263
264 static
265 int
multi_char(term,c)266 multi_char (term, c)
267 FLAG term;
268 character c;
269 {
270 if (term) {
271 return (character) c >= 0x80
272 && (! cjk_term
273 || (term_encoding_tag != 'S' && term_encoding_tag != 'x')
274 || (character) c < 0xA1
275 || (character) c > 0xDF);
276 } else {
277 return multichar (c);
278 }
279 }
280
281 /**
282 Convert CJK character in terminal or text encoding to byte sequence.
283 */
284 int
cjkencode_char(term,cjkchar,buf)285 cjkencode_char (term, cjkchar, buf)
286 FLAG term;
287 unsigned long cjkchar;
288 character * buf;
289 {
290 int len = 0;
291 int i;
292 char encoding_tag = term ? term_encoding_tag : text_encoding_tag;
293
294 if (cjkchar >= 0x1000000) {
295 i = (cjkchar >> 16) & 0xFF;
296 if (encoding_tag == 'G' && cjkchar >= 0x80000000
297 && i >= '0' && i <= '9') {
298 len = 4;
299 } else if (encoding_tag == 'C' && (cjkchar >> 24) == 0x8E) {
300 len = 4;
301 }
302 } else if (cjkchar >= 0x10000) {
303 if ((encoding_tag == 'J' || encoding_tag == 'X')
304 && (cjkchar >> 16) == 0x8F) {
305 len = 3;
306 }
307 } else if (cjkchar >= 0x8000 && (cjkchar & 0xFF) > 0 &&
308 multi_char (term, (character) (cjkchar >> 8))) {
309 len = 2;
310 } else if (cjkchar < 0x100 && ! multi_char (term, cjkchar)) {
311 len = 1;
312 }
313
314 for (i = len - 1; i >= 0; i --) {
315 buf [i] = cjkchar & 0xFF;
316 cjkchar = cjkchar >> 8;
317 if (buf [i] == '\0') {
318 len = 0;
319 }
320 }
321 buf [len] = '\0';
322
323 return len;
324 }
325
326 /**
327 Convert Unicode character to UTF-8.
328 Terminate with NUL byte.
329 Return byte count.
330 */
331 int
utfencode(unichar,buf)332 utfencode (unichar, buf)
333 unsigned long unichar;
334 character * buf;
335 {
336 int len;
337
338 if (unichar < 0x80) {
339 len = 1;
340 * buf ++ = unichar;
341 } else if (unichar < 0x800) {
342 len = 2;
343 * buf ++ = 0xC0 | (unichar >> 6);
344 * buf ++ = 0x80 | (unichar & 0x3F);
345 } else if (unichar < 0x10000) {
346 len = 3;
347 * buf ++ = 0xE0 | (unichar >> 12);
348 * buf ++ = 0x80 | ((unichar >> 6) & 0x3F);
349 * buf ++ = 0x80 | (unichar & 0x3F);
350 } else if (unichar < 0x200000) {
351 len = 4;
352 * buf ++ = 0xF0 | (unichar >> 18);
353 * buf ++ = 0x80 | ((unichar >> 12) & 0x3F);
354 * buf ++ = 0x80 | ((unichar >> 6) & 0x3F);
355 * buf ++ = 0x80 | (unichar & 0x3F);
356 } else if (unichar < 0x4000000) {
357 len = 5;
358 * buf ++ = 0xF8 | (unichar >> 24);
359 * buf ++ = 0x80 | ((unichar >> 18) & 0x3F);
360 * buf ++ = 0x80 | ((unichar >> 12) & 0x3F);
361 * buf ++ = 0x80 | ((unichar >> 6) & 0x3F);
362 * buf ++ = 0x80 | (unichar & 0x3F);
363 } else if (unichar < 0x80000000) {
364 len = 6;
365 * buf ++ = 0xFC | (unichar >> 30);
366 * buf ++ = 0x80 | ((unichar >> 24) & 0x3F);
367 * buf ++ = 0x80 | ((unichar >> 18) & 0x3F);
368 * buf ++ = 0x80 | ((unichar >> 12) & 0x3F);
369 * buf ++ = 0x80 | ((unichar >> 6) & 0x3F);
370 * buf ++ = 0x80 | (unichar & 0x3F);
371 } else {
372 len = 0;
373 }
374 * buf = '\0';
375 return len;
376 }
377
378 /**
379 Convert character to byte sequence.
380 Terminate with NUL byte.
381 Return buffer pointer.
382 */
383 char *
encode_char(c)384 encode_char (c)
385 unsigned long c;
386 {
387 static char buf [7];
388 if (utf8_text) {
389 (void) utfencode (c, buf);
390 } else if (cjk_text) {
391 (void) cjkencode (c, buf);
392 } else {
393 buf [0] = c;
394 buf [1] = '\0';
395 }
396 return buf;
397 }
398
399
400 /**
401 Check if a character code is a valid CJK encoding pattern
402 (not necessarily in the assigned ranges)
403 of the currently active text encoding.
404 */
405 FLAG
valid_cjk(cjkchar,cjkbytes)406 valid_cjk (cjkchar, cjkbytes)
407 unsigned long cjkchar;
408 character * cjkbytes;
409 {
410 return valid_cjkchar (False, cjkchar, cjkbytes);
411 }
412
413 /**
414 Check if a character code is a valid CJK encoding pattern
415 (not necessarily in the assigned ranges)
416 of the currently active terminal or text encoding.
417 */
418 FLAG
valid_cjkchar(term,cjkchar,cjkbytes)419 valid_cjkchar (term, cjkchar, cjkbytes)
420 FLAG term;
421 unsigned long cjkchar;
422 character * cjkbytes;
423 {
424 character cjkbuf [5];
425 char encoding_tag = term ? term_encoding_tag : text_encoding_tag;
426
427 if (cjkchar < 0x80) {
428 return True;
429 }
430
431 if (! cjkbytes) {
432 cjkbytes = cjkbuf;
433 (void) cjkencode_char (term, cjkchar, cjkbytes);
434 }
435
436 switch (encoding_tag) {
437 /*
438 GB GBK 81-FE 40-7E, 80-FE
439 GB18030 81-FE 30-39 81-FE 30-39
440 Big5 Big5-HKSCS 87-FE 40-7E, A1-FE
441 CNS EUC-TW A1-FE A1-FE
442 8E A1-A7 A1-FE A1-FE
443 */
444 case 'G': if (cjkchar > 0xFFFF) {
445 return cjkbytes [0] >= 0x81 && cjkbytes [0] <= 0xFE
446 && cjkbytes [1] >= '0' && cjkbytes [1] <= '9'
447 && cjkbytes [2] >= 0x81 && cjkbytes [2] <= 0xFE
448 && cjkbytes [3] >= '0' && cjkbytes [3] <= '9';
449 } else {
450 return cjkbytes [0] >= 0x81 && cjkbytes [0] <= 0xFE
451 && cjkbytes [1] >= 0x40 && cjkbytes [1] <= 0xFE
452 && cjkbytes [1] != 0x7F;
453 }
454 case 'B': return cjkbytes [0] >= 0x87 && cjkbytes [0] <= 0xFE
455 && ((cjkbytes [1] >= 0x40 && cjkbytes [1] <= 0x7E)
456 ||
457 (cjkbytes [1] >= 0xA1 && cjkbytes [1] <= 0xFE)
458 )
459 && cjkbytes [2] == 0;
460 case 'C': return (cjkbytes [0] >= 0xA1 && cjkbytes [0] <= 0xFE
461 && cjkbytes [1] >= 0xA1 && cjkbytes [1] <= 0xFE
462 && cjkbytes [2] == 0)
463 ||
464 (cjkbytes [0] == 0x8E
465 && cjkbytes [1] >= 0xA1 && cjkbytes [1] <= 0xAF
466 && cjkbytes [2] >= 0xA1 && cjkbytes [2] <= 0xFE
467 && cjkbytes [3] >= 0xA1 && cjkbytes [3] <= 0xFE);
468 /*
469 EUC-JP 8E A1-DF
470 A1-A8 A1-FE
471 B0-F4 A1-FE
472 8F A2,A6,A7,A9-AB,B0-ED A1-FE
473 8F A1-FE A1-FE
474 EUC-JIS X 0213 8E A1-DF
475 A1-FE A1-FE
476 8F A1,A3-A5,A8,AC-AF,EE-FE A1-FE
477 */
478 case 'X':
479 case 'J': return (cjkbytes [0] >= 0xA1 && cjkbytes [0] <= 0xFE
480 && cjkbytes [1] >= 0xA1 && cjkbytes [1] <= 0xFE
481 && cjkbytes [2] == 0
482 )
483 ||
484 (cjkbytes [0] == 0x8E
485 && cjkbytes [1] >= 0xA1 && cjkbytes [1] <= 0xDF
486 && cjkbytes [2] == 0
487 )
488 ||
489 (cjkbytes [0] == 0x8F
490 && cjkbytes [1] >= 0xA1 && cjkbytes [1] <= 0xFE
491 && cjkbytes [2] >= 0xA1 && cjkbytes [2] <= 0xFE
492 && cjkbytes [3] == 0
493 );
494 /*
495 Shift-JIS A1-DF
496 81-84, 87-9F 40-7E, 80-FC
497 E0-EA, ED-EE, FA-FC 40-7E, 80-FC
498 Shift-JIS X 0213 A1-DF
499 81-9F 40-7E, 80-FC
500 E0-FC 40-7E, 80-FC
501 */
502 case 'x':
503 case 'S': return (cjkchar >= 0xA1 && cjkchar <= 0xDF)
504 || (cjkbytes [1] >= 0x40 && cjkbytes [1] <= 0xFC
505 && cjkbytes [1] != 0x7F && cjkbytes [2] == 0
506 && (
507 (cjkbytes [0] >= 0x81 && cjkbytes [0] <= 0x9F)
508 || (cjkbytes [0] >= 0xE0 && cjkbytes [0] <= 0xFC)
509 )
510 );
511 /*
512 UHC UHC 81-FE 41-5A, 61-7A, 81-FE
513 Johab 84-DE 31-7E, 81-FE
514 E0-F9 31-7E, 81-FE
515 */
516 case 'K': return cjkbytes [0] >= 0x81 && cjkbytes [0] <= 0xFE
517 && ((cjkbytes [1] >= 0x41 && cjkbytes [1] <= 0x5A)
518 ||
519 (cjkbytes [1] >= 0x61 && cjkbytes [1] <= 0x7A)
520 ||
521 (cjkbytes [1] >= 0x81 && cjkbytes [1] <= 0xFE)
522 )
523 && cjkbytes [2] == 0;
524 case 'H': return ((cjkbytes [0] >= 0x84 && cjkbytes [0] <= 0xDE)
525 ||
526 (cjkbytes [0] >= 0xE0 && cjkbytes [0] <= 0xF9)
527 )
528 &&
529 ((cjkbytes [1] >= 0x31 && cjkbytes [1] <= 0x7E)
530 ||
531 (cjkbytes [1] >= 0x81 && cjkbytes [1] <= 0xFE)
532 )
533 && cjkbytes [2] == 0;
534 default: return False;
535 }
536 }
537
538
539 /*======================================================================*\
540 Conversion tables mapping various CJK encodings to Unicode
541 \*======================================================================*/
542
543 #include "charmaps.h"
544
/* Descriptor of one available character mapping;
   element type of charmaps_table, evaluated by set_char_encoding (). */
struct charmap_table_entry {
	struct encoding_table_entry * table;	/* the mapping table */
	unsigned int * table_len;	/* points to the number of table entries */
	char * charmap;	/* charmap name, matched with matchpart () */
	char * tag2;	/* 2-letter flag, selected by ":xx" notation;
			   becomes text_encoding_flag */
	char tag1;	/* 1-letter encoding tag */
};
552
/* List of all available character mappings, searched in order by
   set_char_encoding ().
   With Turbo-C only two codepages are compiled in, otherwise the
   entries are taken from the included file charmaps.t. */
static struct charmap_table_entry charmaps_table [] = {
# ifdef __TURBOC__
	{cp437_table, & cp437_table_len, "CP437", "PC", 'p'},
	{cp850_table, & cp850_table_len, "CP850", "PL", 'P'},
# else
# include "charmaps.t"
# endif
};
561
562
563 /*======================================================================*\
564 |* Configuration string matching *|
565 \*======================================================================*/
566
567 /**
568 matchprefix determines whether its first parameter contains its
569 second parameter matching approximately as an initial prefix,
570 at word boundaries!
571 The match ignores separating '-', '_', and space characters,
572 and does not match case.
573 The algorithm assumes that letters are ASCII as this is used for
574 configuration strings only.
575 */
576 static
577 int
matchprefix(s,m)578 matchprefix (s, m)
579 char * s;
580 char * m;
581 {
582 do {
583 char cs, cm;
584 while (* m == '-' || * m == '_' || * m == ' ') {
585 m ++;
586 }
587 while (* s == '-' || * s == '_' || * s == ' ') {
588 s ++;
589 }
590 if (! * m) {
591 return True;
592 #ifdef koi8_ru_fix
593 /* approx. prefix match found; check word boundary */
594 if ( (* s >= 'a' && * s <= 'z')
595 || (* s >= 'A' && * s <= 'Z')
596 || (* s >= '0' && * s <= '9')
597 ) {
598 /* continue */
599 } else {
600 return True;
601 }
602 #endif
603 }
604 if (! * s) {
605 return False;
606 }
607 cs = * s;
608 if (cs >= 'a' && cs <= 'z') {
609 cs = cs - 'a' + 'A';
610 }
611 cm = * m;
612 if (cm >= 'a' && cm <= 'z') {
613 cm = cm - 'a' + 'A';
614 }
615 if (cm != cs) {
616 return False;
617 }
618 s ++;
619 m ++;
620 } while (True);
621 }
622
623 /**
624 matchpart determines whether its first parameter contains its
625 non-empty second parameter matching approximately as an initial
626 prefix or as a prefix of any part after a '/' or '>' separator,
627 at word boundaries!
628 The match ignores separating '-', '_', and space characters,
629 and does not match case.
630 The algorithm assumes that letters are ASCII as this is used for
631 configuration strings only.
632 */
633 static
634 int
matchpart(s,m)635 matchpart (s, m)
636 char * s;
637 char * m;
638 {
639 char * p;
640 if (! * m) {
641 return False;
642 }
643 if (matchprefix (s, m)) {
644 return True;
645 } else {
646 p = strpbrk (s, ">/");
647 if (p) {
648 p ++;
649 return matchpart (p, m);
650 } else {
651 return False;
652 }
653 }
654 }
655
656
657 /*======================================================================*\
658 |* Mapping tables and functions *|
659 \*======================================================================*/
660
/**
   Terminal character mapping table and its length;
   assigned by setup_mapping () when called for the terminal
 */
static struct encoding_table_entry * terminal_table = (struct encoding_table_entry *) 0;
static unsigned int terminal_table_len = 0;

/**
   Current CJK/Unicode mapping table and its length;
   assigned by setup_mapping () when called for the text
 */
static struct encoding_table_entry * text_table = (struct encoding_table_entry *) 0;
static unsigned int text_table_len = 0;
672
673
674 /**
675 Are mapped text and terminal encodings different?
676 */
677 FLAG
remapping_chars()678 remapping_chars ()
679 {
680 return text_table != terminal_table;
681 }
682
683
/**
   List of 2nd characters of 2 Unicode character mappings (mostly accents)
   for certain 2-character CJK mappings (JIS or HKSCS);
   must be consistent with range and order of according #defines in charcode.h
   Indexed by map_char () with the low 7 bits of the high part of a
   table entry's Unicode value if that part has its top bit set.
 */
static unsigned int uni2_accents [] =
	{0x309A, 0x0300, 0x0301, 0x02E5, 0x02E9, 0x0304, 0x030C};
691
/**
   Current encoding indications
 */
/* 1-letter tag of the text encoding; '-' until an encoding is set */
char text_encoding_tag = '-';
char * text_encoding_flag = "??";	/* for display in flags menu area */
/* 1-letter tag of the terminal encoding; '-' until an encoding is set */
char term_encoding_tag = '-';
/* charmap names reported by get_text_encoding () and get_term_encoding ()
   for CJK or mapped encodings */
static char * current_text_encoding = "";
static char * term_encoding = "";
700
701 /**
702 Return charmap name of current text encoding.
703 */
704 char *
get_text_encoding()705 get_text_encoding ()
706 {
707 if (utf8_text) {
708 if (utf16_file) {
709 if (utf16_little_endian) {
710 return "UTF-16LE";
711 } else {
712 return "UTF-16BE";
713 }
714 } else {
715 return "UTF-8";
716 }
717 } else if (! cjk_text && ! mapped_text) {
718 if (ebcdic_file) {
719 return "CP1047";
720 } else {
721 return "ISO 8859-1";
722 }
723 } else {
724 return current_text_encoding;
725 }
726 }
727
728 /**
729 Return charmap name of terminal encoding.
730 */
731 char *
get_term_encoding()732 get_term_encoding ()
733 {
734 if (utf8_screen) {
735 return "UTF-8";
736 } else if (! cjk_term && ! mapped_term) {
737 return "ISO 8859-1";
738 } else {
739 return term_encoding;
740 }
741 }
742
743
/* whether the current CJK or mapped text encoding has combining
   characters; assigned by setup_mapping () */
static FLAG combined_text;
745
746 /**
747 Return True if active encoding has combining characters.
748 */
749 FLAG
encoding_has_combining()750 encoding_has_combining ()
751 {
752 return utf8_text
753 || (mapped_text && combined_text)
754 || (cjk_text && combined_text);
755 }
756
757 /**
758 Determine if active encoding has combining characters.
759 */
760 static
761 FLAG
mapping_has_combining(term)762 mapping_has_combining (term)
763 FLAG term;
764 {
765 unsigned long i;
766 for (i = 0; i < 0x100; i ++) {
767 unsigned long unichar;
768 if (term) {
769 unichar = lookup_mappedtermchar (i);
770 } else {
771 unichar = lookup_encodedchar (i);
772 }
773 if (term ? term_iscombining (unichar) : iscombining_unichar (unichar)) {
774 return True;
775 }
776 }
777 return False;
778 }
779
#ifdef split_map_entries
/*
   Decode CJK character value from split table entry.
   entrypoi   the table entry, holding the value split into the
              fields cjk_base and cjk_ext
   map_table  the table the entry belongs to; entries of gb_table
              use a different packing
   gb_table: cjk_ext == 0xFF marks a plain entry (value is cjk_base);
   otherwise a 4-byte code is assembled: low byte of cjk_base becomes
   the 1st byte, high byte of cjk_base the 3rd byte, and the two
   nibbles of cjk_ext are added to '0' as 2nd and 4th byte.
   Other tables: cjk_ext provides the byte above cjk_base, prefixed
   by a byte 0x8E if cjk_ext >= 0x90.
 */
static
unsigned long
decode_cjk (entrypoi, map_table)
  struct encoding_table_entry * entrypoi;
  struct encoding_table_entry * map_table;
{
#ifdef use_CJKcharmaps
  if (map_table == gb_table) {
	if ((unsigned int) entrypoi->cjk_ext == 0xFF) {
		return entrypoi->cjk_base;
	} else {
		/* widen before shifting by 24 so that a lead byte >= 0x80
		   is not shifted into the sign bit, in case cjk_base
		   promotes to int */
		return (((unsigned long) (entrypoi->cjk_base & 0x00FF)) << 24)
			| (entrypoi->cjk_base & 0xFF00)
			| 0x00300030
			| ((((unsigned int) entrypoi->cjk_ext) & 0xF0) << 12)
			| (((unsigned int) entrypoi->cjk_ext) & 0x0F);
	}
  }
#endif

  if ((unsigned int) entrypoi->cjk_ext >= 0x90) {
	return 0x8E000000 | (((unsigned int) entrypoi->cjk_ext) << 16) | entrypoi->cjk_base;
  } else {
	return (((unsigned int) entrypoi->cjk_ext) << 16) | entrypoi->cjk_base;
  }
}
#endif
811
812 static
813 void
setup_mapping(term,map_table,map_table_len,tag1,tag2)814 setup_mapping (term, map_table, map_table_len, tag1, tag2)
815 FLAG term;
816 struct encoding_table_entry * map_table;
817 unsigned int map_table_len;
818 char tag1;
819 char * tag2;
820 {
821 FLAG multi_byte = False;
822 unsigned int j;
823
824 if (term) {
825 terminal_table = map_table;
826 terminal_table_len = map_table_len;
827 term_encoding_tag = tag1;
828 } else {
829 text_table = map_table;
830 text_table_len = map_table_len;
831 text_encoding_tag = tag1;
832 text_encoding_flag = tag2;
833 }
834
835 /* check if it is a multi-byte mapping table */
836 for (j = 0; j < map_table_len; j ++) {
837 unsigned long cjki;
838 #ifdef split_map_entries
839 cjki = decode_cjk (& map_table [j], map_table);
840 #else
841 cjki = map_table [j].cjk;
842 #endif
843 if (cjki > 0xFF) {
844 multi_byte = True;
845 break;
846 }
847 }
848
849 if (term) {
850 if (multi_byte) {
851 cjk_term = True;
852 mapped_term = False;
853 /* combining_screen is auto-detected */
854 } else {
855 mapped_term = True;
856 cjk_term = False;
857 /* combining_screen is auto-detected */
858 }
859 } else {
860 if (multi_byte) {
861 cjk_text = True;
862 mapped_text = False;
863 combined_text = text_encoding_tag == 'G'
864 || text_encoding_tag == 'X'
865 || text_encoding_tag == 'x';
866 } else {
867 mapped_text = True;
868 cjk_text = False;
869 combined_text = mapping_has_combining (term);
870 }
871 }
872 }
873
/**
   Set either text or terminal character mapping table.
   Return True on success, False if tag unknown.
   term     if set, configure the terminal encoding, otherwise the
            text encoding
   charmap  name of the encoding; a leading ':' selects by the 2-letter
            flag instead of the charmap name; if null, the encoding
            is selected by tag
   tag      1-letter encoding tag, considered only if charmap is null
 */
static
FLAG
set_char_encoding (term, charmap, tag)
  FLAG term;
  char * charmap;
  char tag;
{
  if (term) {
	/* reset; set again below if the "ASCII" charmap is selected */
	ascii_screen = False;
  }
  /* UTF-16 and the generic CJK mode are available for text only */
  if (charmap && ! term
      && (streq (":16", charmap) || matchpart ("UTF-16BE", charmap))) {
	utf8_text = True;
	utf16_file = True;
	utf16_little_endian = False;
	cjk_text = False;
	mapped_text = False;
	current_text_encoding = "UTF-16BE";
	text_encoding_flag = "16";
	return True;
  } else if (charmap && ! term
      && (streq (":61", charmap) || matchpart ("UTF-16LE", charmap))) {
	utf8_text = True;
	utf16_file = True;
	utf16_little_endian = True;
	cjk_text = False;
	mapped_text = False;
	current_text_encoding = "UTF-16LE";
	text_encoding_flag = "61";
	return True;
  } else if (charmap && ! term && streq (":??", charmap)) {
	/* CJK mode without a mapping table */
	text_table_len = 0;
	text_encoding_tag = ' ';
	text_encoding_flag = "??";
	utf8_text = False;
	utf16_file = False;
	cjk_text = True;
	mapped_text = False;
	current_text_encoding = "[CJK]";
	return True;
  } else if (charmap ? strisprefix ("UTF-8", charmap) : tag == 'U') {
	if (term) {
		utf8_screen = True;
		utf8_input = True;
		cjk_term = False;
		mapped_term = False;
		term_encoding = "UTF-8";
		term_encoding_tag = 'U';
	} else {
		utf8_text = True;
		utf16_file = False;
		cjk_text = False;
		mapped_text = False;
		current_text_encoding = "UTF-8";
		text_encoding_flag = "U8";
	}
	return True;
  } else if (charmap ? matchpart ("ISO 8859-1", charmap) : tag == 'L') {
	if (term) {
		utf8_screen = False;
		utf8_input = False;
		cjk_term = False;
		mapped_term = False;
		term_encoding = "ISO 8859-1";
		term_encoding_tag = 'L';
	} else {
		utf8_text = False;
		utf16_file = False;
		cjk_text = False;
		mapped_text = False;
		current_text_encoding = "ISO 8859-1";
		text_encoding_flag = "L1";
	}
	return True;
  } else {
	/* search the table of available mappings:
	   by 2-letter flag (":xx"), by charmap name, or by tag */
	int i;
	for (i = 0; i < arrlen (charmaps_table); i ++) {
		if (charmap ? (charmap [0] == ':' ?
				  streq (& charmap [1], charmaps_table [i].tag2)
				: matchpart (charmaps_table [i].charmap, charmap)
			      )
			    : charmaps_table [i].tag1 == tag) {
			if (term) {
				if (streq (charmaps_table [i].charmap, "CP1047")) {
					/* not supporting EBCDIC terminal */
					break;
				}
				utf8_screen = False;
				utf8_input = False;
				term_encoding = charmaps_table [i].charmap;
				if (streq (term_encoding, "ASCII")) {
					ascii_screen = True;
				}
			} else {
				utf8_text = False;
				utf16_file = False;
				current_text_encoding = charmaps_table [i].charmap;
			}
			/* install the table and derive the CJK/mapped flags */
			setup_mapping (term,
				charmaps_table [i].table,
				* charmaps_table [i].table_len,
				charmaps_table [i].tag1,
				charmaps_table [i].tag2);
			return True;
		}
	}
  }
  /* no matching encoding (or EBCDIC requested for the terminal) */
  return False;
}
987
/* Codepage aliases ("CPnnn" names) and the charmap names they stand for;
   consulted by set_term_encoding () if the codepage name itself
   is not a known charmap. */
static struct {
	char * alias;		/* codepage notation */
	char * codepage;	/* charmap name to use instead */
} cpaliases [] = {
	{"CP819", "ISO-8859-1"},
	{"CP912", "ISO-8859-2"},
	{"CP913", "ISO-8859-3"},
	{"CP914", "ISO-8859-4"},
	{"CP915", "ISO-8859-5"},
	{"CP1089", "ISO-8859-6"},
	{"CP813", "ISO-8859-7"},
	{"CP916", "ISO-8859-8"},
	{"CP920", "ISO-8859-9"},
	{"CP919", "ISO-8859-10"},
	{"CP923", "ISO-8859-15"},
	{"CP28591", "ISO-8859-1"},
	{"CP28592", "ISO-8859-2"},
	{"CP28593", "ISO-8859-3"},
	{"CP28594", "ISO-8859-4"},
	{"CP28595", "ISO-8859-5"},
	{"CP28596", "ISO-8859-6"},
	{"CP28597", "ISO-8859-7"},
	{"CP28598", "ISO-8859-8"},	/* indicates visual ordering ... */
	{"CP28599", "ISO-8859-9"},
	{"CP28603", "ISO-8859-13"},
	{"CP28605", "ISO-8859-15"},
	{"CP38598", "ISO-8859-8"},	/* indicates logical ordering ... */
	{"CP20000", "CNS"},
	{"CP20127", "ASCII"},
	{"CP20866", "KOI8-R"},
	{"CP20936", "GB2312"},
	{"CP21866", "KOI8-U"},
	{"CP51949", "EUC-KR"},
	{"CP54936", "GB18030"},
	{"CP65001", "UTF-8"},
	{"CP932", "Shift-JIS"},
	{"CP936", "GBK"},
	{"CP949", "EUC-KR"},
	{"CP950", "Big5"},
	{"CP20932", "EUC-JP"},
};
1029
1030 /**
1031 Set terminal character code mapping table according to encoding tag.
1032 Return True on success, False if tag unknown.
1033 */
1034 FLAG
set_term_encoding(charmap,tag)1035 set_term_encoding (charmap, tag)
1036 char * charmap;
1037 char tag;
1038 {
1039 /* handle generic codepage notation (cygwin 1.7 / DOS support) */
1040 if (charmap && strisprefix ("CP", charmap)) {
1041 if (set_char_encoding (True, charmap, tag)) {
1042 return True;
1043 } else {
1044 int i;
1045 /* check DOS/Windows ISO-8859 codepage aliases */
1046 for (i = 0; i < arrlen (cpaliases); i ++) {
1047 if (streq (charmap, cpaliases [i].alias)) {
1048 if (set_char_encoding (True, cpaliases [i].codepage, tag)) {
1049 return True;
1050 }
1051 }
1052 }
1053 (void) set_char_encoding (True, "ASCII", ' ');
1054 return False;
1055 }
1056 }
1057
1058 return set_char_encoding (True, charmap, tag);
1059 }
1060
1061 #define dont_debug_set_text_encoding
1062
1063 /**
1064 Set character mapping table and text encoding variables according
1065 to encoding tag.
1066 Return True on success, False if tag unknown.
1067 */
1068 FLAG
set_text_encoding(charmap,tag,debug_tag)1069 set_text_encoding (charmap, tag, debug_tag)
1070 char * charmap;
1071 char tag;
1072 char * debug_tag;
1073 {
1074 FLAG ret = set_char_encoding (False, charmap, tag);
1075
1076 #ifdef debug_set_text_encoding
1077 printf ("set_text_encoding [%s] %s [%c] -> %d: <%s>\n", debug_tag, charmap, tag, ret, get_text_encoding ());
1078 #endif
1079
1080 /* EBCDIC kludge */
1081 code_SPACE = encodedchar (' ');
1082 code_TAB = encodedchar ('\t');
1083 code_LF = encodedchar ('\n');
1084 code_NL = encodedchar (0x85);
1085 if (code_SPACE == ' ') {
1086 ebcdic_text = False;
1087 ebcdic_file = False;
1088 } else {
1089 ebcdic_text = True;
1090 /* or rather transform than map: */
1091 ebcdic_text = False;
1092 ebcdic_file = True;
1093 mapped_text = False;
1094 }
1095
1096 return ret;
1097 }
1098
1099 /*
1100 Look up a Unicode value in a character set mapping table.
1101 @return CJK value, or CHAR_INVALID if not found
1102 */
1103 static
1104 unsigned long
unmap_char(unichar,map_table,map_table_len,term)1105 unmap_char (unichar, map_table, map_table_len, term)
1106 unsigned long unichar;
1107 struct encoding_table_entry * map_table;
1108 unsigned int map_table_len;
1109 FLAG term;
1110 {
1111 #ifdef split_map_entries
1112 unsigned char unichar_high = unichar >> 16;
1113 unsigned short unichar_low = unichar & 0xFFFF;
1114 #endif
1115 int i;
1116
1117 /* workaround for handling ambiguous mappings:
1118 for terminal output, prefer longer code point:
1119 term ? scan table downwards : scan table upwards
1120 */
1121 term = False; /* don't use this quirk-around */
1122
1123 for (i = 0; i < map_table_len; i ++) {
1124 struct encoding_table_entry * map_table_poi =
1125 & map_table [term ? map_table_len - 1 - i : i];
1126 #ifdef split_map_entries
1127 if (
1128 unichar_low == map_table_poi->unicode_low
1129 && unichar_high == map_table_poi->unicode_high
1130 ) {
1131 return decode_cjk (map_table_poi, map_table);
1132 }
1133 #else
1134 if (
1135 unichar == map_table_poi->unicode
1136 ) {
1137 return map_table_poi->cjk;
1138 }
1139 #endif
1140 }
1141 return CHAR_INVALID;
1142 }
1143
1144 /*
1145 Map a character in a character set mapping table.
1146 @return Unicode value, or CHAR_INVALID if not found
1147 */
1148 static
1149 unsigned long
map_char(cjk,map_table,map_table_len)1150 map_char (cjk, map_table, map_table_len)
1151 unsigned long cjk;
1152 struct encoding_table_entry * map_table;
1153 unsigned int map_table_len;
1154 {
1155 int low = 0;
1156 int high = map_table_len - 1;
1157 int i;
1158
1159 unsigned long cjki;
1160
1161 while (low <= high) {
1162 i = (low + high) / 2;
1163 #ifdef split_map_entries
1164 cjki = decode_cjk (& map_table [i], map_table);
1165 #else
1166 cjki = map_table [i].cjk;
1167 #endif
1168 if (cjki == cjk) {
1169 #ifdef split_map_entries
1170 if (map_table [i].unicode_high & 0x80) {
1171 return 0x80000000 | (uni2_accents [map_table [i].unicode_high & 0x7F] << 16) | (map_table [i].unicode_low);
1172 } else {
1173 return (((unsigned long) map_table [i].unicode_high) << 16) | (map_table [i].unicode_low);
1174 }
1175 #else
1176 if (map_table [i].unicode & 0x800000) {
1177 return 0x80000000 | (uni2_accents [(map_table [i].unicode >> 16) & 0x7F] << 16) | (map_table [i].unicode & 0xFFFF);
1178 } else {
1179 return map_table [i].unicode;
1180 }
1181 #endif
1182 } else if (cjki >= cjk) {
1183 high = i - 1;
1184 } else {
1185 low = i + 1;
1186 }
1187 }
1188 return CHAR_INVALID;
1189 }
1190
1191
1192 /*======================================================================*\
1193 |* Conversion functions *|
1194 \*======================================================================*/
1195
1196 /**
1197 GB18030 algorithmic mapping part
1198 */
1199 static
1200 unsigned long
gb_to_unicode(gb)1201 gb_to_unicode (gb)
1202 unsigned long gb;
1203 {
1204 unsigned int byte2 = (gb >> 16) & 0xFF;
1205 unsigned int byte3 = (gb >> 8) & 0xFF;
1206 unsigned int byte4 = gb & 0xFF;
1207
1208 if (byte2 < '0' || byte2 > '9' || byte3 < 0x81 || byte4 < '0' || byte4 > '9') {
1209 return CHAR_INVALID;
1210 }
1211
1212 return (((((gb >> 24) & 0xFF) - 0x90) * 10
1213 + (byte2 - 0x30)) * 126L
1214 + (byte3 - 0x81)) * 10L
1215 + (byte4 - 0x30)
1216 + 0x10000;
1217 }
1218
1219 static
1220 unsigned long
unicode_to_gb(uc)1221 unicode_to_gb (uc)
1222 unsigned long uc;
1223 {
1224 unsigned int a, b, c, d;
1225
1226 if (uc >= 0x200000) {
1227 return CHAR_INVALID;
1228 }
1229
1230 uc -= 0x10000;
1231 d = 0x30 + uc % 10;
1232 uc /= 10;
1233 c = 0x81 + uc % 126;
1234 uc /= 126;
1235 b = 0x30 + uc % 10;
1236 uc /= 10;
1237 a = 0x90 + uc;
1238
1239 return (a << 24) | (b << 16) | (c << 8) | d;
1240 }
1241
1242
1243 /*
1244 mapped_char () converts a Unicode value into an encoded character,
1245 using the table given as parameter.
1246 */
1247 static
1248 unsigned long
mapped_char(unichar,map_table,map_table_len,term)1249 mapped_char (unichar, map_table, map_table_len, term)
1250 unsigned long unichar;
1251 struct encoding_table_entry * map_table;
1252 unsigned int map_table_len;
1253 FLAG term;
1254 {
1255 unsigned long cjkchar;
1256
1257 #ifdef use_CJKcharmaps
1258 if (map_table == gb_table && unichar >= 0x10000) {
1259 return unicode_to_gb (unichar);
1260 }
1261 #endif
1262
1263 cjkchar = unmap_char (unichar, map_table, map_table_len, term);
1264 if (cjkchar != CHAR_INVALID) {
1265 return cjkchar;
1266 }
1267
1268 if (unichar < 0x20) {
1269 /* transparently return control range (for commands) */
1270 return unichar;
1271 } else if (unichar < 0x80) {
1272 /* transparently map ASCII range unless mapped already */
1273 cjkchar = unichar;
1274 unichar = map_char (cjkchar, map_table, map_table_len);
1275 if (! no_unichar (unichar) && unichar != cjkchar) {
1276 return CHAR_INVALID;
1277 } else {
1278 return cjkchar;
1279 }
1280 } else {
1281 /* notify "not found" */
1282 return CHAR_INVALID;
1283 }
1284 }
1285
1286 /*
1287 mappedtermchar () converts a Unicode value into an encoded character,
1288 using the terminal encoding (terminal_table).
1289 */
1290 unsigned long
mappedtermchar(unichar)1291 mappedtermchar (unichar)
1292 unsigned long unichar;
1293 {
1294 return mapped_char (unichar, terminal_table, terminal_table_len, True);
1295 }
1296
1297 /*
1298 encodedchar () converts a Unicode value into an encoded character,
1299 using the current text encoding (text_table).
1300 */
1301 unsigned long
encodedchar(unichar)1302 encodedchar (unichar)
1303 unsigned long unichar;
1304 {
1305 if (cjk_text || mapped_text) {
1306 return mapped_char (unichar, text_table, text_table_len, False);
1307 } else if (utf8_text || unichar < 0x100) {
1308 return unichar;
1309 } else {
1310 return CHAR_INVALID;
1311 }
1312 }
1313
1314 /*
1315 encodedchar2 () converts two Unicode values into one JIS character,
1316 using the current text encoding (text_table).
1317 */
1318 unsigned long
encodedchar2(uc1,uc2)1319 encodedchar2 (uc1, uc2)
1320 unsigned long uc1;
1321 unsigned long uc2;
1322 {
1323 int i;
1324 for (i = 0; i < arrlen (uni2_accents); i ++) {
1325 if (uni2_accents [i] == uc2) {
1326 unsigned long unichar = uc1 | ((0x80 + i) << uni2tag_shift);
1327 return mapped_char (unichar, text_table, text_table_len, False);
1328 }
1329 }
1330 return CHAR_INVALID;
1331 }
1332
1333
1334 /*
1335 lookup_mapped_char () converts an encoded character to Unicode,
1336 using the table given as parameter.
1337 */
1338 static
1339 unsigned long
lookup_mapped_char(cjk,map_table,map_table_len)1340 lookup_mapped_char (cjk, map_table, map_table_len)
1341 unsigned long cjk;
1342 struct encoding_table_entry * map_table;
1343 unsigned int map_table_len;
1344 {
1345 unsigned long unichar;
1346
1347 #ifdef use_CJKcharmaps
1348 if (map_table == gb_table && cjk >= 0x90000000) {
1349 return gb_to_unicode (cjk);
1350 }
1351 #endif
1352
1353 unichar = map_char (cjk, map_table, map_table_len);
1354 if (! no_unichar (unichar)) {
1355 return unichar;
1356 } else if (cjk < 0x80) {
1357 /* transparently map ASCII range unless mapped already */
1358 return cjk;
1359 } else {
1360 /* notify "not found" */
1361 return CHAR_INVALID;
1362 }
1363 }
1364
1365 /*
1366 lookup_mappedtermchar () converts an encoded character to Unicode,
1367 using the terminal encoding (terminal_table).
1368 */
1369 unsigned long
lookup_mappedtermchar(cjk)1370 lookup_mappedtermchar (cjk)
1371 unsigned long cjk;
1372 {
1373 return lookup_mapped_char (cjk, terminal_table, terminal_table_len);
1374 }
1375
1376 /*
1377 lookup_encodedchar () converts an encoded character to Unicode,
1378 using the current text encoding (text_table).
1379 */
1380 unsigned long
lookup_encodedchar(cjk)1381 lookup_encodedchar (cjk)
1382 unsigned long cjk;
1383 {
1384 if (cjk_text || mapped_text) {
1385 return lookup_mapped_char (cjk, text_table, text_table_len);
1386 } else if (utf8_text || cjk < 0x100) {
1387 return cjk;
1388 } else {
1389 return CHAR_INVALID;
1390 }
1391 }
1392
1393
1394 /*======================================================================*\
1395 |* End *|
1396 \*======================================================================*/
1397