1 /*======================================================================*\
2 |*		Editor mined						*|
3 |*		CJK character set <-> Unicode mapping tables		*|
4 \*======================================================================*/
5 
6 #include "mined.h"
7 #include "charcode.h"
8 #include "charprop.h"
9 #include "termprop.h"
10 
11 
12 /*======================================================================*\
13 |*			Character values				*|
14 \*======================================================================*/
15 
/* Code values of some basic characters in the current text encoding.
   They start out with their ASCII values (code_NL: CHAR_INVALID) and are
   reassigned from encodedchar () in set_text_encoding (EBCDIC kludge). */
unsigned char code_SPACE = ' ';
unsigned char code_TAB = '\t';
unsigned long code_LF = '\n';
unsigned long code_NL = CHAR_INVALID;
20 
21 
22 /*======================================================================*\
23 |*			Character properties				*|
24 \*======================================================================*/
25 
26 FLAG
no_char(c)27 no_char (c)
28   unsigned long c;
29 {
30   return c == CHAR_UNKNOWN || c == CHAR_INVALID;
31 }
32 
33 FLAG
no_unichar(u)34 no_unichar (u)
35   unsigned long u;
36 {
37   return u == CHAR_UNKNOWN || u == CHAR_INVALID;
38 }
39 
40 /**
41    Check if character is a control character in current encoding.
42    (Should be more generic...)
43  */
44 int
iscontrol(c)45 iscontrol (c)
46   unsigned long c;
47 {
48   if (mapped_text) {
49 	unsigned long u = lookup_encodedchar (c);
50 	return u == '\177' || (! no_unichar (u) && u < ' ');
51   } else if (utf8_text) {
52 	if (unassigned_single_width) {
53 		if (rxvt_version > 0) {
54 			/* handle weird mapping of non-Unicode ranges */
55 			if (c < 0x80000000) {
56 				c &= 0x1FFFFF;
57 			}
58 		}
59 	}
60 	return c == '\177' || c < ' ';
61   } else if (cjk_text) {
62 	return c == '\177' || c < ' ';
63   } else {
64 	return c == '\177' || (c & '\177') < ' ';
65   }
66 }
67 
/**
   Check if character is any of the following white space characters
   (Unicode values):
	U+0009 CHARACTER TABULATION
	U+0020 SPACE
	U+00A0 NO-BREAK SPACE
	U+2002 EN SPACE .. U+200A HAIR SPACE
	U+200B ZERO WIDTH SPACE
	U+202F NARROW NO-BREAK SPACE
	U+205F MEDIUM MATHEMATICAL SPACE
	U+3000 IDEOGRAPHIC SPACE
	U+FEFF ZERO WIDTH NO-BREAK SPACE
   Returns 1 for these characters, 0 otherwise.
 */
int
iswhitespace (c)
  unsigned long c;
{
  /* contiguous block EN SPACE .. ZERO WIDTH SPACE */
  if (c >= 0x2002 && c <= 0x200B) {
	return 1;
  }

  switch (c) {
	case ' ':
	case '\t':
	case 0xA0:
	case 0x202F:
	case 0x205F:
	case 0x3000:
	case 0xFEFF:
		return 1;
	default:
		return 0;
  }
}
96 
/* Closed range of character values [first, last];
   element type of the range tables searched by lookup (). */
struct interval {
    unsigned long first;	/* lowest value in the range */
    unsigned long last;		/* highest value in the range (inclusive) */
};
101 /*
102 static struct interval list_Quotation_Mark [] =
103 static struct interval list_Dash [] =
104 ...
105 */
106 #include "typoprop.t"
107 
108 static
109 int
lookup(ucs,table,length)110 lookup (ucs, table, length)
111   unsigned long ucs;
112   struct interval * table;
113   int length;
114 {
115   int min = 0;
116   int mid;
117   int max = length - 1;
118 
119   if (ucs < table [0].first || ucs > table [max].last) {
120 	return 0;
121   }
122   while (max >= min) {
123 	mid = (min + max) / 2;
124 	if (ucs > table [mid].last) {
125 		min = mid + 1;
126 	} else if (ucs < table [mid].first) {
127 		max = mid - 1;
128 	} else {
129 		return 1;
130 	}
131   }
132 
133   return 0;
134 }
135 
/**
   Check if character is a quotation mark,
   i.e. whether it is contained in the table list_Quotation_Mark.
   Returns 1 if so, 0 otherwise.
 */
int
isquotationmark (unichar)
  unsigned long unichar;
{
  return lookup (unichar, list_Quotation_Mark, arrlen (list_Quotation_Mark));
}
145 
/**
   Check if character is a dash,
   i.e. whether it is contained in the table list_Dash.
   Returns 1 if so, 0 otherwise.
 */
int
isdash (unichar)
  unsigned long unichar;
{
  return lookup (unichar, list_Dash, arrlen (list_Dash));
}
155 
/**
   Check if character is an opening parenthesis,
   i.e. whether it is contained in the table list_Ps.
   Returns 1 if so, 0 otherwise.
 */
int
isopeningparenthesis (unichar)
  unsigned long unichar;
{
  return lookup (unichar, list_Ps, arrlen (list_Ps));
}
165 
166 /**
167    Return display indication for a control character.
168  */
169 character
controlchar(c)170 controlchar (c)
171   character c;
172 {
173   if (c == '\177') {
174 	return '?';
175   } else {
176 	return c + '@';
177   }
178 }
179 
180 
/**
   Return the isolated presentation form of an ALEF character.
   Any value other than the three ALEF variants with marks
   yields the isolated form of plain ALEF (U+FE8D).
 */
unsigned long
isolated_alef (unichar)
  unsigned long unichar;
{
	switch (unichar) {
	case 0x0622:	/* ALEF WITH MADDA ABOVE */
		return 0xFE81;
	case 0x0623:	/* ALEF WITH HAMZA ABOVE */
		return 0xFE83;
	case 0x0625:	/* ALEF WITH HAMZA BELOW */
		return 0xFE87;
	case 0x0627:	/* ALEF */
	default:
		return 0xFE8D;
	}
}
204 
/**
   Return the ligature of LAM with an ALEF character,
   using the ISOLATED FORM of the ligature.
   Any value other than the three ALEF variants with marks
   yields the ligature with plain ALEF (U+FEFB).
 */
unsigned long
ligature_lam_alef (unichar)
  unsigned long unichar;
{
	switch (unichar) {
	case 0x0622:	/* ALEF WITH MADDA ABOVE */
		return 0xFEF5;
	case 0x0623:	/* ALEF WITH HAMZA ABOVE */
		return 0xFEF7;
	case 0x0625:	/* ALEF WITH HAMZA BELOW */
		return 0xFEF9;
	case 0x0627:	/* ALEF */
	default:
		return 0xFEFB;
	}
}
229 
230 
231 /**
232    Return max value in current encoding.
233  */
234 unsigned long
max_char_value()235 max_char_value ()
236 {
237   if (cjk_text) switch (text_encoding_tag) {
238 	case 'G': return 0xFF39FF39;
239 	case 'C': return 0x8EFFFFFF;
240 	case 'J': return 0x8FFFFF;
241 	case 'X': return 0x8FFFFF;
242 	default: return 0xFFFF;
243   } else if (utf8_text) {
244 	return 0x7FFFFFFF;
245   } else {
246 	return 0xFF;
247   }
248 }
249 
250 
/**
   Convert CJK character in current text encoding to byte sequence.
   Terminate with NUL byte.
   Return byte count.
   (Shorthand for cjkencode_char with the text encoding selected.)
 */
int
cjkencode (cjkchar, buf)
  unsigned long cjkchar;
  character * buf;
{
  return cjkencode_char (False, cjkchar, buf);
}
263 
264 static
265 int
multi_char(term,c)266 multi_char (term, c)
267   FLAG term;
268   character c;
269 {
270   if (term) {
271 	return (character) c >= 0x80
272 		&& (! cjk_term
273 		 || (term_encoding_tag != 'S' && term_encoding_tag != 'x')
274 		 || (character) c < 0xA1
275 		 || (character) c > 0xDF);
276   } else {
277 	return multichar (c);
278   }
279 }
280 
/**
   Convert CJK character in terminal or text encoding to byte sequence.
   The character value holds the bytes of the sequence, first byte in
   the most significant position.
   @param term	True: use terminal encoding, False: use text encoding
   @param cjkchar	character value to encode
   @param buf	receives the bytes, terminated with a NUL byte
		(up to 4 bytes plus terminator are written)
   @return byte count; 0 if the value does not match a byte length
	accepted for the encoding or if it contains a NUL byte
 */
int
cjkencode_char (term, cjkchar, buf)
  FLAG term;
  unsigned long cjkchar;
  character * buf;
{
  int len = 0;
  int i;
  char encoding_tag = term ? term_encoding_tag : text_encoding_tag;

  /* determine the length of the byte sequence; len stays 0 if invalid */
  if (cjkchar >= 0x1000000) {
	/* 4-byte values: tag 'G' with a digit as 2nd byte, or tag 'C' with lead byte 0x8E */
	i = (cjkchar >> 16) & 0xFF;
	if (encoding_tag == 'G' && cjkchar >= 0x80000000
	 && i >= '0' && i <= '9') {
		len = 4;
	} else if (encoding_tag == 'C' && (cjkchar >> 24) == 0x8E) {
		len = 4;
	}
  } else if (cjkchar >= 0x10000) {
	/* 3-byte values: tags 'J' and 'X' with lead byte 0x8F */
	if ((encoding_tag == 'J' || encoding_tag == 'X')
	    && (cjkchar >> 16) == 0x8F) {
		len = 3;
	}
  } else if (cjkchar >= 0x8000 && (cjkchar & 0xFF) > 0 &&
		multi_char (term, (character) (cjkchar >> 8))) {
	/* 2-byte values: high byte must be a multi-byte lead byte */
	len = 2;
  } else if (cjkchar < 0x100 && ! multi_char (term, cjkchar)) {
	/* single byte that is not a multi-byte lead byte */
	len = 1;
  }

  /* store the bytes, last byte first; a NUL byte invalidates the result */
  for (i = len - 1; i >= 0; i --) {
	buf [i] = cjkchar & 0xFF;
	cjkchar = cjkchar >> 8;
	if (buf [i] == '\0') {
		len = 0;
	}
  }
  buf [len] = '\0';

  return len;
}
325 
326 /**
327    Convert Unicode character to UTF-8.
328    Terminate with NUL byte.
329    Return byte count.
330  */
331 int
utfencode(unichar,buf)332 utfencode (unichar, buf)
333   unsigned long unichar;
334   character * buf;
335 {
336   int len;
337 
338   if (unichar < 0x80) {
339 	len = 1;
340 	* buf ++ = unichar;
341   } else if (unichar < 0x800) {
342 	len = 2;
343 	* buf ++ = 0xC0 | (unichar >> 6);
344 	* buf ++ = 0x80 | (unichar & 0x3F);
345   } else if (unichar < 0x10000) {
346 	len = 3;
347 	* buf ++ = 0xE0 | (unichar >> 12);
348 	* buf ++ = 0x80 | ((unichar >> 6) & 0x3F);
349 	* buf ++ = 0x80 | (unichar & 0x3F);
350   } else if (unichar < 0x200000) {
351 	len = 4;
352 	* buf ++ = 0xF0 | (unichar >> 18);
353 	* buf ++ = 0x80 | ((unichar >> 12) & 0x3F);
354 	* buf ++ = 0x80 | ((unichar >> 6) & 0x3F);
355 	* buf ++ = 0x80 | (unichar & 0x3F);
356   } else if (unichar < 0x4000000) {
357 	len = 5;
358 	* buf ++ = 0xF8 | (unichar >> 24);
359 	* buf ++ = 0x80 | ((unichar >> 18) & 0x3F);
360 	* buf ++ = 0x80 | ((unichar >> 12) & 0x3F);
361 	* buf ++ = 0x80 | ((unichar >> 6) & 0x3F);
362 	* buf ++ = 0x80 | (unichar & 0x3F);
363   } else if (unichar < 0x80000000) {
364 	len = 6;
365 	* buf ++ = 0xFC | (unichar >> 30);
366 	* buf ++ = 0x80 | ((unichar >> 24) & 0x3F);
367 	* buf ++ = 0x80 | ((unichar >> 18) & 0x3F);
368 	* buf ++ = 0x80 | ((unichar >> 12) & 0x3F);
369 	* buf ++ = 0x80 | ((unichar >> 6) & 0x3F);
370 	* buf ++ = 0x80 | (unichar & 0x3F);
371   } else {
372 	len = 0;
373   }
374   * buf = '\0';
375   return len;
376 }
377 
/**
   Convert character to byte sequence in the current text encoding
   (UTF-8, CJK, or a single byte otherwise).
   Terminate with NUL byte.
   Return buffer pointer.
   The buffer is static: it is overwritten by the next call.
 */
char *
encode_char (c)
  unsigned long c;
{
  /* room for up to 6 bytes plus the terminating NUL */
  static char buf [7];
  if (utf8_text) {
	(void) utfencode (c, buf);
  } else if (cjk_text) {
	(void) cjkencode (c, buf);
  } else {
	buf [0] = c;
	buf [1] = '\0';
  }
  return buf;
}
398 
399 
/**
   Check if a character code is a valid CJK encoding pattern
   (not necessarily in the assigned ranges)
   of the currently active text encoding.
   (Shorthand for valid_cjkchar with the text encoding selected;
   cjkbytes may be a null pointer, see valid_cjkchar.)
 */
FLAG
valid_cjk (cjkchar, cjkbytes)
  unsigned long cjkchar;
  character * cjkbytes;
{
  return valid_cjkchar (False, cjkchar, cjkbytes);
}
412 
413 /**
414    Check if a character code is a valid CJK encoding pattern
415    (not necessarily in the assigned ranges)
416    of the currently active terminal or text encoding.
417  */
418 FLAG
valid_cjkchar(term,cjkchar,cjkbytes)419 valid_cjkchar (term, cjkchar, cjkbytes)
420   FLAG term;
421   unsigned long cjkchar;
422   character * cjkbytes;
423 {
424   character cjkbuf [5];
425   char encoding_tag = term ? term_encoding_tag : text_encoding_tag;
426 
427   if (cjkchar < 0x80) {
428 	return True;
429   }
430 
431   if (! cjkbytes) {
432 	cjkbytes = cjkbuf;
433 	(void) cjkencode_char (term, cjkchar, cjkbytes);
434   }
435 
436   switch (encoding_tag) {
437 /*
438 		GB	GBK		81-FE	40-7E, 80-FE
439 			GB18030		81-FE	30-39	81-FE	30-39
440 		Big5	Big5-HKSCS	87-FE	40-7E, A1-FE
441 		CNS	EUC-TW		A1-FE	A1-FE
442 					8E	A1-A7	A1-FE	A1-FE
443 */
444     case 'G':	if (cjkchar > 0xFFFF) {
445 			return cjkbytes [0] >= 0x81 && cjkbytes [0] <= 0xFE
446 				&& cjkbytes [1] >= '0' && cjkbytes [1] <= '9'
447 				&& cjkbytes [2] >= 0x81 && cjkbytes [2] <= 0xFE
448 				&& cjkbytes [3] >= '0' && cjkbytes [3] <= '9';
449 		} else {
450 			return cjkbytes [0] >= 0x81 && cjkbytes [0] <= 0xFE
451 				&& cjkbytes [1] >= 0x40 && cjkbytes [1] <= 0xFE
452 				&& cjkbytes [1] != 0x7F;
453 		}
454     case 'B':	return cjkbytes [0] >= 0x87 && cjkbytes [0] <= 0xFE
455 				&& ((cjkbytes [1] >= 0x40 && cjkbytes [1] <= 0x7E)
456 				    ||
457 				    (cjkbytes [1] >= 0xA1 && cjkbytes [1] <= 0xFE)
458 				   )
459 				&& cjkbytes [2] == 0;
460     case 'C':	return (cjkbytes [0] >= 0xA1 && cjkbytes [0] <= 0xFE
461 			&& cjkbytes [1] >= 0xA1 && cjkbytes [1] <= 0xFE
462 			&& cjkbytes [2] == 0)
463 			||
464 			(cjkbytes [0] == 0x8E
465 			&& cjkbytes [1] >= 0xA1 && cjkbytes [1] <= 0xAF
466 			&& cjkbytes [2] >= 0xA1 && cjkbytes [2] <= 0xFE
467 			&& cjkbytes [3] >= 0xA1 && cjkbytes [3] <= 0xFE);
468 /*
469 		EUC-JP			8E	A1-DF
470 					A1-A8	A1-FE
471 					B0-F4	A1-FE
472 					8F	A2,A6,A7,A9-AB,B0-ED	A1-FE
473 					8F	A1-FE	A1-FE
474 		EUC-JIS X 0213		8E	A1-DF
475 					A1-FE	A1-FE
476 					8F	A1,A3-A5,A8,AC-AF,EE-FE	A1-FE
477 */
478     case 'X':
479     case 'J':	return  (cjkbytes [0] >= 0xA1 && cjkbytes [0] <= 0xFE
480 			 && cjkbytes [1] >= 0xA1 && cjkbytes [1] <= 0xFE
481 			 && cjkbytes [2] == 0
482 			)
483 			||
484 			(cjkbytes [0] == 0x8E
485 			 && cjkbytes [1] >= 0xA1 && cjkbytes [1] <= 0xDF
486 			 && cjkbytes [2] == 0
487 			)
488 			||
489 			(cjkbytes [0] == 0x8F
490 			 && cjkbytes [1] >= 0xA1 && cjkbytes [1] <= 0xFE
491 			 && cjkbytes [2] >= 0xA1 && cjkbytes [2] <= 0xFE
492 			 && cjkbytes [3] == 0
493 			);
494 /*
495 		Shift-JIS		A1-DF
496 					81-84, 87-9F		40-7E, 80-FC
497 					E0-EA, ED-EE, FA-FC	40-7E, 80-FC
498 		Shift-JIS X 0213	A1-DF
499 					81-9F		40-7E, 80-FC
500 					E0-FC		40-7E, 80-FC
501 */
502     case 'x':
503     case 'S':	return (cjkchar >= 0xA1 && cjkchar <= 0xDF)
504 			|| (cjkbytes [1] >= 0x40 && cjkbytes [1] <= 0xFC
505 			    && cjkbytes [1] != 0x7F && cjkbytes [2] == 0
506 			    && (
507 				(cjkbytes [0] >= 0x81 && cjkbytes [0] <= 0x9F)
508 			     || (cjkbytes [0] >= 0xE0 && cjkbytes [0] <= 0xFC)
509 			       )
510 			   );
511 /*
512 		UHC	UHC		81-FE	41-5A, 61-7A, 81-FE
513 		Johab			84-DE	31-7E, 81-FE
514 					E0-F9	31-7E, 81-FE
515 */
516     case 'K':	return cjkbytes [0] >= 0x81 && cjkbytes [0] <= 0xFE
517 			&& ((cjkbytes [1] >= 0x41 && cjkbytes [1] <= 0x5A)
518 			    ||
519 			    (cjkbytes [1] >= 0x61 && cjkbytes [1] <= 0x7A)
520 			    ||
521 			    (cjkbytes [1] >= 0x81 && cjkbytes [1] <= 0xFE)
522 			   )
523 			&& cjkbytes [2] == 0;
524     case 'H':	return ((cjkbytes [0] >= 0x84 && cjkbytes [0] <= 0xDE)
525 			 ||
526 			(cjkbytes [0] >= 0xE0 && cjkbytes [0] <= 0xF9)
527 			)
528 			&&
529 			((cjkbytes [1] >= 0x31 && cjkbytes [1] <= 0x7E)
530 			 ||
531 			 (cjkbytes [1] >= 0x81 && cjkbytes [1] <= 0xFE)
532 			)
533 			&& cjkbytes [2] == 0;
534     default:	return False;
535   }
536 }
537 
538 
539 /*======================================================================*\
540 	Conversion tables mapping various CJK encodings to Unicode
541 \*======================================================================*/
542 
543 #include "charmaps.h"
544 
/* Entry of the list of available character mapping tables,
   as searched by set_char_encoding (). */
struct charmap_table_entry {
	struct encoding_table_entry * table;	/* the mapping table */
	unsigned int * table_len;	/* pointer to its number of entries */
	char * charmap;		/* charmap name, matched approximately */
	char * tag2;		/* 2-letter tag, selected by ":xx"; shown as encoding flag */
	char tag1;		/* 1-letter encoding tag */
};
552 
/* List of available character mapping tables:
   two fixed code pages when compiled with Turbo C,
   otherwise the list included from charmaps.t */
static struct charmap_table_entry charmaps_table [] = {
# ifdef __TURBOC__
	{cp437_table, & cp437_table_len, "CP437", "PC", 'p'},
	{cp850_table, & cp850_table_len, "CP850", "PL", 'P'},
# else
# include "charmaps.t"
# endif
};
561 
562 
563 /*======================================================================*\
564 |*			Configuration string matching			*|
565 \*======================================================================*/
566 
567 /**
568    matchprefix determines whether its first parameter contains its
569    second parameter matching approximately as an initial prefix,
570    at word boundaries!
571    The match ignores separating '-', '_', and space characters,
572    and does not match case.
573    The algorithm assumes that letters are ASCII as this is used for
574    configuration strings only.
575  */
576 static
577 int
matchprefix(s,m)578 matchprefix (s, m)
579   char * s;
580   char * m;
581 {
582   do {
583 	char cs, cm;
584 	while (* m == '-' || * m == '_' || * m == ' ') {
585 		m ++;
586 	}
587 	while (* s == '-' || * s == '_' || * s == ' ') {
588 		s ++;
589 	}
590 	if (! * m) {
591 		return True;
592 #ifdef koi8_ru_fix
593 		/* approx. prefix match found; check word boundary */
594 		if ( (* s >= 'a' && * s <= 'z')
595 		  || (* s >= 'A' && * s <= 'Z')
596 		  || (* s >= '0' && * s <= '9')
597 		   ) {
598 			/* continue */
599 		} else {
600 			return True;
601 		}
602 #endif
603 	}
604 	if (! * s) {
605 		return False;
606 	}
607 	cs = * s;
608 	if (cs >= 'a' && cs <= 'z') {
609 		cs = cs - 'a' + 'A';
610 	}
611 	cm = * m;
612 	if (cm >= 'a' && cm <= 'z') {
613 		cm = cm - 'a' + 'A';
614 	}
615 	if (cm != cs) {
616 		return False;
617 	}
618 	s ++;
619 	m ++;
620   } while (True);
621 }
622 
623 /**
624    matchpart determines whether its first parameter contains its
625    non-empty second parameter matching approximately as an initial
626    prefix or as a prefix of any part after a '/' or '>' separator,
627    at word boundaries!
628    The match ignores separating '-', '_', and space characters,
629    and does not match case.
630    The algorithm assumes that letters are ASCII as this is used for
631    configuration strings only.
632  */
633 static
634 int
matchpart(s,m)635 matchpart (s, m)
636   char * s;
637   char * m;
638 {
639   char * p;
640   if (! * m) {
641 	return False;
642   }
643   if (matchprefix (s, m)) {
644 	return True;
645   } else {
646 	p = strpbrk (s, ">/");
647 	if (p) {
648 		p ++;
649 		return matchpart (p, m);
650 	} else {
651 		return False;
652 	}
653   }
654 }
655 
656 
657 /*======================================================================*\
658 |*			Mapping tables and functions			*|
659 \*======================================================================*/
660 
/**
   Terminal character mapping table and its length
   (null pointer / 0 until set by setup_mapping)
 */
static struct encoding_table_entry * terminal_table = (struct encoding_table_entry *) 0;
static unsigned int terminal_table_len = 0;

/**
   Current CJK/Unicode mapping table of the text and its length
   (null pointer / 0 until set by setup_mapping)
 */
static struct encoding_table_entry * text_table = (struct encoding_table_entry *) 0;
static unsigned int text_table_len = 0;
672 
673 
/**
   Are mapped text and terminal encodings different?
   (Compares the mapping table pointers only.)
 */
FLAG
remapping_chars ()
{
  return text_table != terminal_table;
}
682 
683 
/**
   List of 2nd characters of 2 Unicode character mappings (mostly accents)
   for certain 2-character CJK mappings (JIS or HKSCS);
   must be consistent with range and order of the corresponding #defines
   in charcode.h.
   Indexed by the low 7 bits of the flagged high part of a table entry
   (see map_char).
 */
static unsigned int uni2_accents [] =
	{0x309A, 0x0300, 0x0301, 0x02E5, 0x02E9, 0x0304, 0x030C};
691 
/**
   Current encoding indications:
   1-letter tags and charmap names of the text and terminal encodings,
   as set by set_char_encoding / setup_mapping.
 */
char text_encoding_tag = '-';
char * text_encoding_flag = "??";	/* for display in flags menu area */
char term_encoding_tag = '-';
static char * current_text_encoding = "";	/* reported by get_text_encoding */
static char * term_encoding = "";	/* reported by get_term_encoding */
700 
701 /**
702    Return charmap name of current text encoding.
703  */
704 char *
get_text_encoding()705 get_text_encoding ()
706 {
707   if (utf8_text) {
708 	if (utf16_file) {
709 		if (utf16_little_endian) {
710 			return "UTF-16LE";
711 		} else {
712 			return "UTF-16BE";
713 		}
714 	} else {
715 		return "UTF-8";
716 	}
717   } else if (! cjk_text && ! mapped_text) {
718 	if (ebcdic_file) {
719 		return "CP1047";
720 	} else {
721 		return "ISO 8859-1";
722 	}
723   } else {
724 	return current_text_encoding;
725   }
726 }
727 
728 /**
729    Return charmap name of terminal encoding.
730  */
731 char *
get_term_encoding()732 get_term_encoding ()
733 {
734   if (utf8_screen) {
735 	return "UTF-8";
736   } else if (! cjk_term && ! mapped_term) {
737 	return "ISO 8859-1";
738   } else {
739 	return term_encoding;
740   }
741 }
742 
743 
744 static FLAG combined_text;
745 
746 /**
747    Return True if active encoding has combining characters.
748  */
749 FLAG
encoding_has_combining()750 encoding_has_combining ()
751 {
752   return utf8_text
753 	|| (mapped_text && combined_text)
754 	|| (cjk_text && combined_text);
755 }
756 
757 /**
758    Determine if active encoding has combining characters.
759  */
760 static
761 FLAG
mapping_has_combining(term)762 mapping_has_combining (term)
763   FLAG term;
764 {
765   unsigned long i;
766   for (i = 0; i < 0x100; i ++) {
767 	unsigned long unichar;
768 	if (term) {
769 		unichar = lookup_mappedtermchar (i);
770 	} else {
771 		unichar = lookup_encodedchar (i);
772 	}
773 	if (term ? term_iscombining (unichar) : iscombining_unichar (unichar)) {
774 		return True;
775 	}
776   }
777   return False;
778 }
779 
#ifdef split_map_entries
/*
   Decode CJK character value from split table entry
   (fields cjk_base and cjk_ext).
   @param entrypoi	the table entry
   @param map_table	the table it belongs to (selects the decoding scheme)
   @return the CJK character value
 */
static
unsigned long
decode_cjk (entrypoi, map_table)
  struct encoding_table_entry * entrypoi;
  struct encoding_table_entry * map_table;
{
#ifdef use_CJKcharmaps
  if (map_table == gb_table) {
	if ((unsigned int) entrypoi->cjk_ext == 0xFF) {
		/* marker 0xFF: cjk_base is the complete value */
		return entrypoi->cjk_base;
	} else {
		/* 4-byte value:
		   byte 1 = low byte of cjk_base,
		   byte 2 = '0' + high nibble of cjk_ext,
		   byte 3 = high byte of cjk_base,
		   byte 4 = '0' + low nibble of cjk_ext
		 */
		return ((entrypoi->cjk_base & 0x00FF) << 24)
			| (entrypoi->cjk_base & 0xFF00)
			| 0x00300030
			| ((((unsigned int) entrypoi->cjk_ext) & 0xF0) << 12)
			| (((unsigned int) entrypoi->cjk_ext) & 0x0F);
	}
  }
#endif

  if ((unsigned int) entrypoi->cjk_ext >= 0x90) {
	/* 4-byte value with lead byte 0x8E, cjk_ext as 2nd byte */
	return 0x8E000000 | (((unsigned int) entrypoi->cjk_ext) << 16) | entrypoi->cjk_base;
  } else {
	/* cjk_ext is the 3rd-lowest byte (0 for values up to 2 bytes) */
	return (((unsigned int) entrypoi->cjk_ext) << 16) | entrypoi->cjk_base;
  }
}
#endif
811 
/**
   Activate a character mapping table for either terminal or text
   and derive the dependent encoding mode flags.
   @param term	True: set up terminal encoding, False: text encoding
   @param map_table	the mapping table
   @param map_table_len	its number of entries
   @param tag1	1-letter encoding tag
   @param tag2	2-letter encoding flag (used for text encoding only)
 */
static
void
setup_mapping (term, map_table, map_table_len, tag1, tag2)
  FLAG term;
  struct encoding_table_entry * map_table;
  unsigned int map_table_len;
  char tag1;
  char * tag2;
{
  FLAG multi_byte = False;
  unsigned int j;

  if (term) {
	terminal_table = map_table;
	terminal_table_len = map_table_len;
	term_encoding_tag = tag1;
  } else {
	text_table = map_table;
	text_table_len = map_table_len;
	text_encoding_tag = tag1;
	text_encoding_flag = tag2;
  }

  /* check if it is a multi-byte mapping table */
  for (j = 0; j < map_table_len; j ++) {
	unsigned long cjki;
#ifdef split_map_entries
		cjki = decode_cjk (& map_table [j], map_table);
#else
		cjki = map_table [j].cjk;
#endif
	if (cjki > 0xFF) {
		multi_byte = True;
		break;
	}
  }

  /* multi-byte table: CJK mode; otherwise: 8-bit mapped mode */
  if (term) {
	if (multi_byte) {
		cjk_term = True;
		mapped_term = False;
		/* combining_screen is auto-detected */
	} else {
		mapped_term = True;
		cjk_term = False;
		/* combining_screen is auto-detected */
	}
  } else {
	if (multi_byte) {
		cjk_text = True;
		mapped_text = False;
		/* CJK encodings with combining characters are known by tag */
		combined_text = text_encoding_tag == 'G'
				|| text_encoding_tag == 'X'
				|| text_encoding_tag == 'x';
	} else {
		mapped_text = True;
		cjk_text = False;
		/* scan the mapping; called after the table and mode flags
		   have been set (term is False here) */
		combined_text = mapping_has_combining (term);
	}
  }
}
873 
/**
   Set either text or terminal character mapping table.
   The encoding is selected by charmap if given, else by tag:
   - charmap ":16" / ":61" / ":??" or a name matching "UTF-16BE" /
     "UTF-16LE" selects UTF-16 or generic CJK (text only)
   - charmap starting with ':' otherwise selects by 2-letter tag
   - other charmap names are matched approximately (see matchpart)
   @param term	True: set terminal encoding, False: set text encoding
   @param charmap	charmap name or ":" + 2-letter tag, or null pointer
   @param tag	1-letter encoding tag, used if charmap is a null pointer
   Return True on success, False if tag unknown
   (also False for the EBCDIC charmap CP1047 as terminal encoding).
 */
static
FLAG
set_char_encoding (term, charmap, tag)
  FLAG term;
  char * charmap;
  char tag;
{
  if (term) {
	/* set again below if the selected charmap is "ASCII" */
	ascii_screen = False;
  }
  if (charmap && ! term
	&& (streq (":16", charmap) || matchpart ("UTF-16BE", charmap))) {
	/* UTF-16 big-endian file, handled as Unicode text */
	utf8_text = True;
	utf16_file = True;
	utf16_little_endian = False;
	cjk_text = False;
	mapped_text = False;
	current_text_encoding = "UTF-16BE";
	text_encoding_flag = "16";
	return True;
  } else if (charmap && ! term
	&& (streq (":61", charmap) || matchpart ("UTF-16LE", charmap))) {
	/* UTF-16 little-endian file, handled as Unicode text */
	utf8_text = True;
	utf16_file = True;
	utf16_little_endian = True;
	cjk_text = False;
	mapped_text = False;
	current_text_encoding = "UTF-16LE";
	text_encoding_flag = "61";
	return True;
  } else if (charmap && ! term && streq (":??", charmap)) {
	/* unspecific CJK encoding without mapping table entries */
	text_table_len = 0;
	text_encoding_tag = ' ';
	text_encoding_flag = "??";
	utf8_text = False;
	utf16_file = False;
	cjk_text = True;
	mapped_text = False;
	current_text_encoding = "[CJK]";
	return True;
  } else if (charmap ? strisprefix ("UTF-8", charmap) : tag == 'U') {
	if (term) {
		utf8_screen = True;
		utf8_input = True;
		cjk_term = False;
		mapped_term = False;
		term_encoding = "UTF-8";
		term_encoding_tag = 'U';
	} else {
		utf8_text = True;
		utf16_file = False;
		cjk_text = False;
		mapped_text = False;
		current_text_encoding = "UTF-8";
		text_encoding_flag = "U8";
	}
	return True;
  } else if (charmap ? matchpart ("ISO 8859-1", charmap) : tag == 'L') {
	if (term) {
		utf8_screen = False;
		utf8_input = False;
		cjk_term = False;
		mapped_term = False;
		term_encoding = "ISO 8859-1";
		term_encoding_tag = 'L';
	} else {
		utf8_text = False;
		utf16_file = False;
		cjk_text = False;
		mapped_text = False;
		current_text_encoding = "ISO 8859-1";
		text_encoding_flag = "L1";
	}
	return True;
  } else {
    /* search the list of mapping tables; the first match is used */
    int i;
    for (i = 0; i < arrlen (charmaps_table); i ++) {
	if (charmap ? (charmap [0] == ':' ?
			  streq (& charmap [1], charmaps_table [i].tag2)
			: matchpart (charmaps_table [i].charmap, charmap)
		      )
		    : charmaps_table [i].tag1 == tag) {
		if (term) {
			if (streq (charmaps_table [i].charmap, "CP1047")) {
				/* not supporting EBCDIC terminal */
				break;
			}
			utf8_screen = False;
			utf8_input = False;
			term_encoding = charmaps_table [i].charmap;
			if (streq (term_encoding, "ASCII")) {
				ascii_screen = True;
			}
		} else {
			utf8_text = False;
			utf16_file = False;
			current_text_encoding = charmaps_table [i].charmap;
		}
		/* activate the table; this also sets the CJK/mapped mode flags */
		setup_mapping (term,
				charmaps_table [i].table,
				* charmaps_table [i].table_len,
				charmaps_table [i].tag1,
				charmaps_table [i].tag2);
		return True;
	}
    }
  }
  return False;
}
987 
/* Codepage aliases: "CP..." names and the charmap names they stand for;
   consulted by set_term_encoding if a "CP..." name is not known directly. */
static struct {
	char * alias;		/* codepage name, compared exactly */
	char * codepage;	/* charmap name to use instead */
} cpaliases [] = {
	{"CP819", "ISO-8859-1"},
	{"CP912", "ISO-8859-2"},
	{"CP913", "ISO-8859-3"},
	{"CP914", "ISO-8859-4"},
	{"CP915", "ISO-8859-5"},
	{"CP1089", "ISO-8859-6"},
	{"CP813", "ISO-8859-7"},
	{"CP916", "ISO-8859-8"},
	{"CP920", "ISO-8859-9"},
	{"CP919", "ISO-8859-10"},
	{"CP923", "ISO-8859-15"},
	{"CP28591", "ISO-8859-1"},
	{"CP28592", "ISO-8859-2"},
	{"CP28593", "ISO-8859-3"},
	{"CP28594", "ISO-8859-4"},
	{"CP28595", "ISO-8859-5"},
	{"CP28596", "ISO-8859-6"},
	{"CP28597", "ISO-8859-7"},
	{"CP28598", "ISO-8859-8"},	/* indicates visual ordering ... */
	{"CP28599", "ISO-8859-9"},
	{"CP28603", "ISO-8859-13"},
	{"CP28605", "ISO-8859-15"},
	{"CP38598", "ISO-8859-8"},	/* indicates logical ordering ... */
	{"CP20000", "CNS"},
	{"CP20127", "ASCII"},
	{"CP20866", "KOI8-R"},
	{"CP20936", "GB2312"},
	{"CP21866", "KOI8-U"},
	{"CP51949", "EUC-KR"},
	{"CP54936", "GB18030"},
	{"CP65001", "UTF-8"},
	{"CP932", "Shift-JIS"},
	{"CP936", "GBK"},
	{"CP949", "EUC-KR"},
	{"CP950", "Big5"},
	{"CP20932", "EUC-JP"},
};
1029 
1030 /**
1031    Set terminal character code mapping table according to encoding tag.
1032    Return True on success, False if tag unknown.
1033  */
1034 FLAG
set_term_encoding(charmap,tag)1035 set_term_encoding (charmap, tag)
1036   char * charmap;
1037   char tag;
1038 {
1039   /* handle generic codepage notation (cygwin 1.7 / DOS support) */
1040   if (charmap && strisprefix ("CP", charmap)) {
1041 	if (set_char_encoding (True, charmap, tag)) {
1042 		return True;
1043 	} else {
1044 		int i;
1045 		/* check DOS/Windows ISO-8859 codepage aliases */
1046 		for (i = 0; i < arrlen (cpaliases); i ++) {
1047 			if (streq (charmap, cpaliases [i].alias)) {
1048 				if (set_char_encoding (True, cpaliases [i].codepage, tag)) {
1049 					return True;
1050 				}
1051 			}
1052 		}
1053 		(void) set_char_encoding (True, "ASCII", ' ');
1054 		return False;
1055 	}
1056   }
1057 
1058   return set_char_encoding (True, charmap, tag);
1059 }
1060 
1061 #define dont_debug_set_text_encoding
1062 
/**
   Set character mapping table and text encoding variables according
   to charmap name or encoding tag (see set_char_encoding).
   Also updates the code values code_SPACE, code_TAB, code_LF, code_NL
   for the selected encoding and the EBCDIC flags; this is done
   regardless of whether the encoding was found.
   @param debug_tag	only used for debug output
		(if debug_set_text_encoding is defined)
   Return True on success, False if tag unknown.
 */
FLAG
set_text_encoding (charmap, tag, debug_tag)
  char * charmap;
  char tag;
  char * debug_tag;
{
  FLAG ret = set_char_encoding (False, charmap, tag);

#ifdef debug_set_text_encoding
  printf ("set_text_encoding [%s] %s [%c] -> %d: <%s>\n", debug_tag, charmap, tag, ret, get_text_encoding ());
#endif

  /* EBCDIC kludge */
  code_SPACE = encodedchar (' ');
  code_TAB = encodedchar ('\t');
  code_LF = encodedchar ('\n');
  code_NL = encodedchar (0x85);
  /* an encoding in which SPACE is not ' ' is taken to be EBCDIC */
  if (code_SPACE == ' ') {
	ebcdic_text = False;
	ebcdic_file = False;
  } else {
	ebcdic_text = True;
	/* or rather transform than map: */
	ebcdic_text = False;
	ebcdic_file = True;
	mapped_text = False;
  }

  return ret;
}
1098 
/*
   Look up a Unicode value in a character set mapping table
   (linear search; the first matching entry is used).
   @param unichar	Unicode value to look up
   @param map_table	the mapping table
   @param map_table_len	its number of entries
   @param term	meant to select the scan direction; currently ignored
   @return CJK value, or CHAR_INVALID if not found
 */
static
unsigned long
unmap_char (unichar, map_table, map_table_len, term)
  unsigned long unichar;
  struct encoding_table_entry * map_table;
  unsigned int map_table_len;
  FLAG term;
{
#ifdef split_map_entries
	/* split the value like the table entries for comparison */
	unsigned char unichar_high = unichar >> 16;
	unsigned short unichar_low = unichar & 0xFFFF;
#endif
	int i;

	/* workaround for handling ambiguous mappings:
	   for terminal output, prefer longer code point:
	   term ? scan table downwards : scan table upwards
	 */
	term = False;	/* don't use this quirk-around */

	for (i = 0; i < map_table_len; i ++) {
		struct encoding_table_entry * map_table_poi =
			& map_table [term ? map_table_len - 1 - i : i];
#ifdef split_map_entries
		if (
		    unichar_low == map_table_poi->unicode_low
		 && unichar_high == map_table_poi->unicode_high
		   ) {
			return decode_cjk (map_table_poi, map_table);
		}
#else
		if (
		    unichar == map_table_poi->unicode
		   ) {
			return map_table_poi->cjk;
		}
#endif
	}
	return CHAR_INVALID;
}
1143 
/*
   Map a character in a character set mapping table (binary search;
   this presumes the table is sorted by ascending CJK value).
   For entries flagged as 2-character mappings (bit 7 of the high
   part of the Unicode field), the result is 0x80000000 combined with
   the 2nd character (from uni2_accents) in bits 16.. and the
   first character in the low 16 bits.
   @param cjk	character value to map
   @param map_table	the mapping table
   @param map_table_len	its number of entries
   @return Unicode value, or CHAR_INVALID if not found
 */
static
unsigned long
map_char (cjk, map_table, map_table_len)
  unsigned long cjk;
  struct encoding_table_entry * map_table;
  unsigned int map_table_len;
{
	int low = 0;
	int high = map_table_len - 1;
	int i;

	unsigned long cjki;

	while (low <= high) {
		i = (low + high) / 2;
#ifdef split_map_entries
		cjki = decode_cjk (& map_table [i], map_table);
#else
		cjki = map_table [i].cjk;
#endif
		if (cjki == cjk) {
			/* NOTE(review): the index into uni2_accents is only
			   masked to 7 bits; the table data must keep it
			   within the 7 entries of uni2_accents -- confirm */
#ifdef split_map_entries
			if (map_table [i].unicode_high & 0x80) {
				return 0x80000000 | (uni2_accents [map_table [i].unicode_high & 0x7F] << 16) | (map_table [i].unicode_low);
			} else {
				return (((unsigned long) map_table [i].unicode_high) << 16) | (map_table [i].unicode_low);
			}
#else
			if (map_table [i].unicode & 0x800000) {
				return 0x80000000 | (uni2_accents [(map_table [i].unicode >> 16) & 0x7F] << 16) | (map_table [i].unicode & 0xFFFF);
			} else {
				return map_table [i].unicode;
			}
#endif
		} else if (cjki >= cjk) {
			high = i - 1;
		} else {
			low = i + 1;
		}
	}
	return CHAR_INVALID;
}
1190 
1191 
1192 /*======================================================================*\
1193 |*		Conversion functions					*|
1194 \*======================================================================*/
1195 
1196 /**
1197    GB18030 algorithmic mapping part
1198  */
1199 static
1200 unsigned long
gb_to_unicode(gb)1201 gb_to_unicode (gb)
1202   unsigned long gb;
1203 {
1204 	unsigned int byte2 = (gb >> 16) & 0xFF;
1205 	unsigned int byte3 = (gb >> 8) & 0xFF;
1206 	unsigned int byte4 = gb & 0xFF;
1207 
1208 	if (byte2 < '0' || byte2 > '9' || byte3 < 0x81 || byte4 < '0' || byte4 > '9') {
1209 		return CHAR_INVALID;
1210 	}
1211 
1212 	return (((((gb >> 24) & 0xFF) - 0x90) * 10
1213 		+ (byte2 - 0x30)) * 126L
1214 		+ (byte3 - 0x81)) * 10L
1215 		+ (byte4 - 0x30)
1216 		+ 0x10000;
1217 }
1218 
1219 static
1220 unsigned long
unicode_to_gb(uc)1221 unicode_to_gb (uc)
1222   unsigned long uc;
1223 {
1224 	unsigned int a, b, c, d;
1225 
1226 	if (uc >= 0x200000) {
1227 		return CHAR_INVALID;
1228 	}
1229 
1230 	uc -= 0x10000;
1231 	d = 0x30 + uc % 10;
1232 	uc /= 10;
1233 	c = 0x81 + uc % 126;
1234 	uc /= 126;
1235 	b = 0x30 + uc % 10;
1236 	uc /= 10;
1237 	a = 0x90 + uc;
1238 
1239 	return (a << 24) | (b << 16) | (c << 8) | d;
1240 }
1241 
1242 
1243 /*
1244    mapped_char () converts a Unicode value into an encoded character,
1245    using the table given as parameter.
1246  */
1247 static
1248 unsigned long
mapped_char(unichar,map_table,map_table_len,term)1249 mapped_char (unichar, map_table, map_table_len, term)
1250   unsigned long unichar;
1251   struct encoding_table_entry * map_table;
1252   unsigned int map_table_len;
1253   FLAG term;
1254 {
1255 	unsigned long cjkchar;
1256 
1257 #ifdef use_CJKcharmaps
1258 	if (map_table == gb_table && unichar >= 0x10000) {
1259 		return unicode_to_gb (unichar);
1260 	}
1261 #endif
1262 
1263 	cjkchar = unmap_char (unichar, map_table, map_table_len, term);
1264 	if (cjkchar != CHAR_INVALID) {
1265 		return cjkchar;
1266 	}
1267 
1268 	if (unichar < 0x20) {
1269 		/* transparently return control range (for commands) */
1270 		return unichar;
1271 	} else if (unichar < 0x80) {
1272 		/* transparently map ASCII range unless mapped already */
1273 		cjkchar = unichar;
1274 		unichar = map_char (cjkchar, map_table, map_table_len);
1275 		if (! no_unichar (unichar) && unichar != cjkchar) {
1276 			return CHAR_INVALID;
1277 		} else {
1278 			return cjkchar;
1279 		}
1280 	} else {
1281 		/* notify "not found" */
1282 		return CHAR_INVALID;
1283 	}
1284 }
1285 
1286 /*
1287    mappedtermchar () converts a Unicode value into an encoded character,
1288    using the terminal encoding (terminal_table).
1289  */
1290 unsigned long
mappedtermchar(unichar)1291 mappedtermchar (unichar)
1292   unsigned long unichar;
1293 {
1294 	return mapped_char (unichar, terminal_table, terminal_table_len, True);
1295 }
1296 
1297 /*
1298    encodedchar () converts a Unicode value into an encoded character,
1299    using the current text encoding (text_table).
1300  */
1301 unsigned long
encodedchar(unichar)1302 encodedchar (unichar)
1303   unsigned long unichar;
1304 {
1305   if (cjk_text || mapped_text) {
1306 	return mapped_char (unichar, text_table, text_table_len, False);
1307   } else if (utf8_text || unichar < 0x100) {
1308 	return unichar;
1309   } else {
1310 	return CHAR_INVALID;
1311   }
1312 }
1313 
1314 /*
1315    encodedchar2 () converts two Unicode values into one JIS character,
1316    using the current text encoding (text_table).
1317  */
1318 unsigned long
encodedchar2(uc1,uc2)1319 encodedchar2 (uc1, uc2)
1320   unsigned long uc1;
1321   unsigned long uc2;
1322 {
1323   int i;
1324   for (i = 0; i < arrlen (uni2_accents); i ++) {
1325 	if (uni2_accents [i] == uc2) {
1326 		unsigned long unichar = uc1 | ((0x80 + i) << uni2tag_shift);
1327 		return mapped_char (unichar, text_table, text_table_len, False);
1328 	}
1329   }
1330   return CHAR_INVALID;
1331 }
1332 
1333 
1334 /*
1335    lookup_mapped_char () converts an encoded character to Unicode,
1336    using the table given as parameter.
1337  */
1338 static
1339 unsigned long
lookup_mapped_char(cjk,map_table,map_table_len)1340 lookup_mapped_char (cjk, map_table, map_table_len)
1341   unsigned long cjk;
1342   struct encoding_table_entry * map_table;
1343   unsigned int map_table_len;
1344 {
1345 	unsigned long unichar;
1346 
1347 #ifdef use_CJKcharmaps
1348 	if (map_table == gb_table && cjk >= 0x90000000) {
1349 		return gb_to_unicode (cjk);
1350 	}
1351 #endif
1352 
1353 	unichar = map_char (cjk, map_table, map_table_len);
1354 	if (! no_unichar (unichar)) {
1355 		return unichar;
1356 	} else if (cjk < 0x80) {
1357 		/* transparently map ASCII range unless mapped already */
1358 		return cjk;
1359 	} else {
1360 		/* notify "not found" */
1361 		return CHAR_INVALID;
1362 	}
1363 }
1364 
1365 /*
1366    lookup_mappedtermchar () converts an encoded character to Unicode,
1367    using the terminal encoding (terminal_table).
1368  */
1369 unsigned long
lookup_mappedtermchar(cjk)1370 lookup_mappedtermchar (cjk)
1371   unsigned long cjk;
1372 {
1373 	return lookup_mapped_char (cjk, terminal_table, terminal_table_len);
1374 }
1375 
1376 /*
1377    lookup_encodedchar () converts an encoded character to Unicode,
1378    using the current text encoding (text_table).
1379  */
1380 unsigned long
lookup_encodedchar(cjk)1381 lookup_encodedchar (cjk)
1382   unsigned long cjk;
1383 {
1384   if (cjk_text || mapped_text) {
1385 	return lookup_mapped_char (cjk, text_table, text_table_len);
1386   } else if (utf8_text || cjk < 0x100) {
1387 	return cjk;
1388   } else {
1389 	return CHAR_INVALID;
1390   }
1391 }
1392 
1393 
1394 /*======================================================================*\
1395 |*				End					*|
1396 \*======================================================================*/
1397