1 /* ========================================================================
2  * Copyright 2008-2010 Mark Crispin
3  * ========================================================================
4  */
5 
6 /*
7  * Program:	UTF-8 routines
8  *
9  * Author:	Mark Crispin
10  *
11  * Date:	11 June 1997
12  * Last Edited:	28 May 2010
13  *
14  * Previous versions of this file were
15  *
16  * Copyright 1988-2008 University of Washington
17  *
18  * Licensed under the Apache License, Version 2.0 (the "License");
19  * you may not use this file except in compliance with the License.
20  * You may obtain a copy of the License at
21  *
22  *     http://www.apache.org/licenses/LICENSE-2.0
23  *
24  */
25 
26 
27 #include <stdio.h>
28 #include <ctype.h>
29 #include "c-client.h"
30 
31 /*	*** IMPORTANT ***
32  *
33  *  There is a very important difference between "character set" and "charset",
34  * and the comments in this file reflect these differences.  A "character set"
35  * (also known as "coded character set") is a mapping between codepoints and
36  * characters.  A "charset" is as defined in MIME, and incorporates one or more
37  * coded character sets in a character encoding scheme.  See RFC 2130 for more
38  * details.
39  */
40 
/* File-scope slot of type ucs4width_t, initially NIL (nothing installed).
 * NOTE(review): presumably holds an optional UCS-4 character width routine;
 * the code that sets or calls it is not visible in this part of the file --
 * confirm before relying on this.
 */
41 static ucs4width_t ucs4width = NIL;
42 
43 /* Character set conversion tables */
44 
45 #include "iso_8859.c"		/* 8-bit single-byte coded graphic */
46 #include "koi8_r.c"		/* Cyrillic - Russia */
47 #include "koi8_u.c"		/* Cyrillic - Ukraine */
48 #include "tis_620.c"		/* Thai */
49 #include "viscii.c"		/* Vietnamese */
50 #include "windows.c"		/* Windows */
51 #include "ibm.c"		/* IBM */
52 #include "gb_2312.c"		/* Chinese (PRC) - simplified */
53 #include "gb_12345.c"		/* Chinese (PRC) - traditional */
54 #include "jis_0208.c"		/* Japanese - basic */
55 #include "jis_0212.c"		/* Japanese - supplementary */
56 #include "ksc_5601.c"		/* Korean */
57 #include "big5.c"		/* Taiwanese (ROC) - industrial standard */
58 #include "cns11643.c"		/* Taiwanese (ROC) - national standard */
59 
60 
61 #include "widths.c"		/* Unicode character widths */
62 #include "tmap.c"		/* Unicode titlecase mapping */
63 #include "decomtab.c"		/* Unicode decompositions */
64 
65 /* EUC parameters */
66 
67 #ifdef GBTOUNICODE		/* PRC simplified Chinese */
/* Parameter block for the GB 2312 table, used by the CT_DBYTE charset entries
 * that point at &gb_param.  The values are, presumably in field order, the
 * base ku, base ten, number of ku rows, number of ten columns, and the
 * mapping table itself.
 */
68 static const struct utf8_eucparam gb_param = {
69   BASE_GB2312_KU,BASE_GB2312_TEN,MAX_GB2312_KU,MAX_GB2312_TEN,
70   (void *) gb2312tab};
71 #endif
72 
73 
74 #ifdef GB12345TOUNICODE		/* PRC traditional Chinese */
/* Parameter block for the GB 12345 table, used by the CN-GB-12345 charset
 * entry.  The values are, presumably in field order, the base ku, base ten,
 * number of ku rows, number of ten columns, and the mapping table itself.
 */
75 static const struct utf8_eucparam gbt_param = {
76   BASE_GB12345_KU,BASE_GB12345_TEN,MAX_GB12345_KU,MAX_GB12345_TEN,
77   (void *) gb12345tab};
78 #endif
79 
80 
81 #ifdef BIG5TOUNICODE		/* ROC traditional Chinese */
/* Two-entry parameter array for the CT_DBYTE2 Big5 charset entries.  Entry 0
 * and entry 1 describe the two ten ranges of each ku.  utf8_rmap_gen()
 * requires both entries to have the same base ku and max ku, and reads both
 * ten ranges out of entry 0's table, which is why entry 1 has a NIL table.
 */
82 static const struct utf8_eucparam big5_param[] = {
83   {BASE_BIG5_KU,BASE_BIG5_TEN_0,MAX_BIG5_KU,MAX_BIG5_TEN_0,(void *) big5tab},
84   {BASE_BIG5_KU,BASE_BIG5_TEN_1,MAX_BIG5_KU,MAX_BIG5_TEN_1,NIL}
85 };
86 #endif
87 
88 
89 #ifdef JISTOUNICODE		/* Japanese */
/* Three-entry parameter array for the EUC-JP (CT_EUC) charset entry:
 *   [0] JIS X 0208 table parameters
 *   [1] 8-bit kana range (MIN_KANA_8 .. MAX_KANA_8)
 *   [2] JIS X 0212 table parameters when JIS0212TOUNICODE is defined,
 *	 otherwise an all-zero placeholder
 * utf8_rmap_gen() only consults entry [0] when building a reverse map.
 * NOTE(review): entry [1] stores KANA_8 in the table slot rather than a
 * table pointer; confirm what the EUC decoder expects there.
 */
90 static const struct utf8_eucparam jis_param[] = {
91   {BASE_JIS0208_KU,BASE_JIS0208_TEN,MAX_JIS0208_KU,MAX_JIS0208_TEN,
92      (void *) jis0208tab},
93   {MIN_KANA_8,0,MAX_KANA_8,0,(void *) KANA_8},
94 #ifdef JIS0212TOUNICODE		/* Japanese extended */
95   {BASE_JIS0212_KU,BASE_JIS0212_TEN,MAX_JIS0212_KU,MAX_JIS0212_TEN,
96      (void *) jis0212tab}
97 #else
98   {0,0,0,0,NIL}
99 #endif
100 };
101 #endif
102 
103 
104 #ifdef KSCTOUNICODE		/* Korean */
/* Parameter block for the KS C 5601 table, used by the CT_DBYTE Korean
 * charset entries that point at &ksc_param.  The values are, presumably in
 * field order, the base ku, base ten, number of ku rows, number of ten
 * columns, and the mapping table itself.
 */
105 static const struct utf8_eucparam ksc_param = {
106   BASE_KSC5601_KU,BASE_KSC5601_TEN,MAX_KSC5601_KU,MAX_KSC5601_TEN,
107   (void *) ksc5601tab};
108 #endif
109 
110 /* List of supported charsets */
111 
/* Each row is one CHARSET entry.  As used elsewhere in this file:
 *   field 1  charset name, matched case-independently by utf8_charset()
 *   field 2  CT_* type, which selects the conversion routine in
 *	      utf8_text_cs() and the reverse map builder in utf8_rmap_gen()
 *   field 3  CF_* flag bits
 *   field 4  type-specific table: a mapping table, a utf8_eucparam block, or
 *	      NIL when the type needs no table
 *   field 5  SC_* script bits
 *   field 6  a charset name or NIL.  NOTE(review): this looks like the
 *	      preferred charset to use in place of this one; confirm against
 *	      the CHARSET declaration.
 * Rows for the East Asian charsets are only present when the corresponding
 * *TOUNICODE symbol is defined.  The table ends with an all-NIL row, and
 * utf8_charset() returns a pointer to the first row when asked for the
 * whole table.
 */
112 static const CHARSET utf8_csvalid[] = {
113   {"US-ASCII",CT_ASCII,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
114    NIL,NIL,NIL},
115   {"UTF-8",CT_UTF8,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
116    NIL,SC_UNICODE,NIL},
117   {"UTF-7",CT_UTF7,CF_PRIMARY | CF_POSTING | CF_UNSUPRT,
118    NIL,SC_UNICODE,"UTF-8"},
119   {"ISO-8859-1",CT_1BYTE0,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
120    NIL,SC_LATIN_1,NIL},
121   {"ISO-8859-2",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
122    (void *) iso8859_2tab,SC_LATIN_2,NIL},
123   {"ISO-8859-3",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
124    (void *) iso8859_3tab,SC_LATIN_3,NIL},
125   {"ISO-8859-4",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
126    (void *) iso8859_4tab,SC_LATIN_4,NIL},
127   {"ISO-8859-5",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
128    (void *) iso8859_5tab,SC_CYRILLIC,"KOI8-R"},
129   {"ISO-8859-6",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
130    (void *) iso8859_6tab,SC_ARABIC,NIL},
131   {"ISO-8859-7",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
132    (void *) iso8859_7tab,SC_GREEK,NIL},
133   {"ISO-8859-8",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
134    (void *) iso8859_8tab,SC_HEBREW,NIL},
135   {"ISO-8859-9",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
136    (void *) iso8859_9tab,SC_LATIN_5,NIL},
137   {"ISO-8859-10",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
138    (void *) iso8859_10tab,SC_LATIN_6,NIL},
139   {"ISO-8859-11",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
140    (void *) iso8859_11tab,SC_THAI,NIL},
141 #if 0				/* ISO 8859-12 reserved for ISCII(?) */
142   {"ISO-8859-12",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
143    (void *) iso8859_12tab,NIL,NIL},
144 #endif
145   {"ISO-8859-13",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
146    (void *) iso8859_13tab,SC_LATIN_7,NIL},
147   {"ISO-8859-14",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
148    (void *) iso8859_14tab,SC_LATIN_8,NIL},
149   {"ISO-8859-15",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
150    (void *) iso8859_15tab,SC_LATIN_9,NIL},
151   {"ISO-8859-16",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
152    (void *) iso8859_16tab,SC_LATIN_10,NIL},
153   {"KOI8-R",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
154    (void *) koi8rtab,SC_CYRILLIC,NIL},
155   {"KOI8-U",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
156    (void *) koi8utab,SC_CYRILLIC | SC_UKRANIAN,NIL},
157   {"KOI8-RU",CT_1BYTE,CF_DISPLAY,
158    (void *) koi8utab,SC_CYRILLIC | SC_UKRANIAN,"KOI8-U"},
159   {"TIS-620",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
160    (void *) tis620tab,SC_THAI,"ISO-8859-11"},
161   {"VISCII",CT_1BYTE8,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
162    (void *) visciitab,SC_VIETNAMESE,NIL},
163 
164 #ifdef GBTOUNICODE
165   {"GBK",CT_DBYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
166      (void *) &gb_param,SC_CHINESE_SIMPLIFIED,NIL},
167   {"GB2312",CT_DBYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
168    (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"},
169   {"CN-GB",CT_DBYTE,CF_DISPLAY,
170      (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"},
171 #ifdef CNS1TOUNICODE
172   {"ISO-2022-CN",CT_2022,CF_PRIMARY | CF_UNSUPRT,
173      NIL,SC_CHINESE_SIMPLIFIED | SC_CHINESE_TRADITIONAL,
174    NIL},
175 #endif
176 #endif
177 #ifdef GB12345TOUNICODE
178   {"CN-GB-12345",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
179      (void *) &gbt_param,SC_CHINESE_TRADITIONAL,"BIG5"},
180 #endif
181 #ifdef BIG5TOUNICODE
182   {"BIG5",CT_DBYTE2,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
183      (void *) big5_param,SC_CHINESE_TRADITIONAL,NIL},
184   {"CN-BIG5",CT_DBYTE2,CF_DISPLAY,
185      (void *) big5_param,SC_CHINESE_TRADITIONAL,"BIG5"},
186   {"BIG-5",CT_DBYTE2,CF_DISPLAY,
187      (void *) big5_param,SC_CHINESE_TRADITIONAL,"BIG5"},
188 #endif
189 #ifdef JISTOUNICODE
190   {"ISO-2022-JP",CT_2022,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
191      NIL,SC_JAPANESE,NIL},
192   {"EUC-JP",CT_EUC,CF_PRIMARY | CF_DISPLAY,
193      (void *) jis_param,SC_JAPANESE,"ISO-2022-JP"},
194   {"SHIFT_JIS",CT_SJIS,CF_PRIMARY | CF_DISPLAY,
195      NIL,SC_JAPANESE,"ISO-2022-JP"},
196   {"SHIFT-JIS",CT_SJIS,CF_PRIMARY | CF_DISPLAY,
197      NIL,SC_JAPANESE,"ISO-2022-JP"},
198 #ifdef JIS0212TOUNICODE
199   {"ISO-2022-JP-1",CT_2022,CF_UNSUPRT,
200      NIL,SC_JAPANESE,"ISO-2022-JP"},
201 #ifdef GBTOUNICODE
202 #ifdef KSCTOUNICODE
203   {"ISO-2022-JP-2",CT_2022,CF_UNSUPRT,
204      NIL,
205      SC_LATIN_1 | SC_LATIN_2 | SC_LATIN_3 | SC_LATIN_4 | SC_LATIN_5 |
206        SC_LATIN_6 | SC_LATIN_7 | SC_LATIN_8 | SC_LATIN_9 | SC_LATIN_10 |
207 	 SC_ARABIC | SC_CYRILLIC | SC_GREEK | SC_HEBREW | SC_THAI |
208 	   SC_VIETNAMESE | SC_CHINESE_SIMPLIFIED | SC_JAPANESE | SC_KOREAN
209 #ifdef CNS1TOUNICODE
210 	     | SC_CHINESE_TRADITIONAL
211 #endif
212 	       ,"UTF-8"},
213 #endif
214 #endif
215 #endif
216 #endif
217 
218 #ifdef KSCTOUNICODE
219   {"ISO-2022-KR",CT_2022,CF_PRIMARY | CF_DISPLAY | CF_UNSUPRT,
220      NIL,SC_KOREAN,"EUC-KR"},
221   {"EUC-KR",CT_DBYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
222      (void *) &ksc_param,SC_KOREAN,NIL},
223   {"KSC5601",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
224      (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
225   {"KSC_5601",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
226      (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
227   {"KS_C_5601-1987",CT_DBYTE,CF_DISPLAY,
228      (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
229   {"KS_C_5601-1989",CT_DBYTE,CF_DISPLAY,
230      (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
231   {"KS_C_5601-1992",CT_DBYTE,CF_DISPLAY,
232      (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
233   {"KS_C_5601-1997",CT_DBYTE,CF_DISPLAY,
234      (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
235 #endif
236 
237 				/* deep sigh */
238   {"WINDOWS-874",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
239      (void *) windows_874tab,SC_THAI,"ISO-8859-11"},
240   {"CP874",CT_1BYTE,CF_DISPLAY,
241      (void *) windows_874tab,SC_THAI,"ISO-8859-11"},
242 #ifdef GBTOUNICODE
243   {"WINDOWS-936",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
244      (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"},
245   {"CP936",CT_DBYTE,CF_DISPLAY,
246      (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"},
247 #endif
248 #ifdef KSCTOUNICODE
249   {"WINDOWS-949",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
250      (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
251   {"CP949",CT_DBYTE,CF_DISPLAY,
252      (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
253   {"X-WINDOWS-949",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
254      (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
255 #endif
256   {"WINDOWS-1250",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
257      (void *) windows_1250tab,SC_LATIN_2,"ISO-8859-2"},
258   {"CP1250",CT_1BYTE,CF_DISPLAY,
259      (void *) windows_1250tab,SC_LATIN_2,"ISO-8859-2"},
260   {"WINDOWS-1251",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
261      (void *) windows_1251tab,SC_CYRILLIC,"KOI8-R"},
262   {"CP1251",CT_1BYTE,CF_DISPLAY,
263      (void *) windows_1251tab,SC_CYRILLIC,"KOI8-R"},
264   {"WINDOWS-1252",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
265      (void *) windows_1252tab,SC_LATIN_1,"ISO-8859-1"},
266   {"CP1252",CT_1BYTE,CF_DISPLAY,
267      (void *) windows_1252tab,SC_LATIN_1,"ISO-8859-1"},
268   {"WINDOWS-1253",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
269      (void *) windows_1253tab,SC_GREEK,"ISO-8859-7"},
270   {"CP1253",CT_1BYTE,CF_DISPLAY,
271      (void *) windows_1253tab,SC_GREEK,"ISO-8859-7"},
272   {"WINDOWS-1254",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
273      (void *) windows_1254tab,SC_LATIN_5,"ISO-8859-9"},
274   {"CP1254",CT_1BYTE,CF_DISPLAY,
275      (void *) windows_1254tab,SC_LATIN_5,"ISO-8859-9"},
276   {"WINDOWS-1255",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
277      (void *) windows_1255tab,SC_HEBREW,"ISO-8859-8"},
278   {"CP1255",CT_1BYTE,CF_DISPLAY,
279      (void *) windows_1255tab,SC_HEBREW,"ISO-8859-8"},
280   {"WINDOWS-1256",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
281      (void *) windows_1256tab,SC_ARABIC,"ISO-8859-6"},
282   {"CP1256",CT_1BYTE,CF_DISPLAY,
283      (void *) windows_1256tab,SC_ARABIC,"ISO-8859-6"},
284   {"WINDOWS-1257",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
285      (void *) windows_1257tab,SC_LATIN_7,"ISO-8859-13"},
286   {"CP1257",CT_1BYTE,CF_DISPLAY,
287      (void *) windows_1257tab,SC_LATIN_7,"ISO-8859-13"},
288   {"WINDOWS-1258",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
289      (void *) windows_1258tab,SC_VIETNAMESE,"VISCII"},
290   {"CP1258",CT_1BYTE,CF_DISPLAY,
291      (void *) windows_1258tab,SC_VIETNAMESE,"VISCII"},
292 
293 				/* deeper sigh */
294   {"IBM367",CT_ASCII,CF_PRIMARY | CF_DISPLAY,
295      NIL,NIL,"US-ASCII"},
296   {"IBM437",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
297      (void *) ibm_437tab,SC_LATIN_1,"ISO-8859-1"},
298   {"IBM737",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
299      (void *) ibm_737tab,SC_GREEK,"ISO-8859-7"},
300   {"IBM775",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
301      (void *) ibm_775tab,SC_LATIN_7,"ISO-8859-13"},
302   {"IBM850",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
303      (void *) ibm_850tab,SC_LATIN_1,"ISO-8859-1"},
304   {"IBM852",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
305      (void *) ibm_852tab,SC_LATIN_2,"ISO-8859-2"},
306   {"IBM855",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
307      (void *) ibm_855tab,SC_CYRILLIC,"ISO-8859-5"},
308   {"IBM857",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
309      (void *) ibm_857tab,SC_LATIN_5,"ISO-8859-9"},
310   {"IBM860",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
311      (void *) ibm_860tab,SC_LATIN_1,"ISO-8859-1"},
312   {"IBM861",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
313      (void *) ibm_861tab,SC_LATIN_6,"ISO-8859-10"},
314   {"IBM862",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
315      (void *) ibm_862tab,SC_HEBREW,"ISO-8859-8"},
316   {"IBM863",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
317      (void *) ibm_863tab,SC_LATIN_1,"ISO-8859-1"},
318   {"IBM864",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
319      (void *) ibm_864tab,SC_ARABIC,"ISO-8859-6"},
320   {"IBM865",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
321      (void *) ibm_865tab,SC_LATIN_6,"ISO-8859-10"},
322   {"IBM866",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
323      (void *) ibm_866tab,SC_CYRILLIC,"KOI8-R"},
324   {"IBM869",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
325      (void *) ibm_869tab,SC_GREEK,"ISO-8859-7"},
326   {"IBM874",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
327      (void *) ibm_874tab,SC_THAI,"ISO-8859-11"},
328   {"ROMAN8",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
329      (void *) ibm_1051tab,SC_LATIN_1,"ISO-8859-1"},
330 				/* deepest sigh */
331   {"ANSI_X3.4-1968",CT_ASCII,CF_DISPLAY,
332      NIL,NIL,"US-ASCII"},
333   {"UNICODE-1-1-UTF-7",CT_UTF7,CF_UNSUPRT,
334      NIL,SC_UNICODE,"UTF-8"},
335 				/* these should never appear in email */
336   {"UCS-2",CT_UCS2,CF_PRIMARY | CF_DISPLAY | CF_NOEMAIL,
337      NIL,SC_UNICODE,"UTF-8"},
338   {"UCS-4",CT_UCS4,CF_PRIMARY | CF_DISPLAY | CF_NOEMAIL,
339      NIL,SC_UNICODE,"UTF-8"},
340   {"UTF-16",CT_UTF16,CF_PRIMARY | CF_DISPLAY | CF_NOEMAIL,
341      NIL,SC_UNICODE,"UTF-8"},
342   {NIL, NIL, NIL, NIL, NIL, NIL}
343 };
344 
345 /* Non-Unicode Script table */
346 
/* Each row gives a script name, an optional description string (NIL when
 * there is none), and the SC_* bit for that script.  utf8_script() matches
 * its argument against the name and returns a pointer to the first row when
 * asked for the whole table.  The table ends with an all-NIL row.
 */
347 static const SCRIPT utf8_scvalid[] = {
348   {"Arabic",NIL,SC_ARABIC},
349   {"Chinese Simplified","China, Singapore",SC_CHINESE_SIMPLIFIED},
350   {"Chinese Traditional","Taiwan, Hong Kong, Macao",SC_CHINESE_TRADITIONAL},
351   {"Cyrillic",NIL,SC_CYRILLIC},
352   {"Cyrillic Ukrainian",NIL,SC_UKRANIAN},
353   {"Greek",NIL,SC_GREEK},
354   {"Hebrew",NIL,SC_HEBREW},
355   {"Japanese",NIL,SC_JAPANESE},
356   {"Korean",NIL,SC_KOREAN},
357   {"Latin-1","Western Europe",SC_LATIN_1},
358   {"Latin-2","Eastern Europe",SC_LATIN_2},
359   {"Latin-3","Southern Europe",SC_LATIN_3},
360   {"Latin-4","Northern Europe",SC_LATIN_4},
361   {"Latin-5","Turkish",SC_LATIN_5},
362   {"Latin-6","Nordic",SC_LATIN_6},
363   {"Latin-7","Baltic",SC_LATIN_7},
364   {"Latin-8","Celtic",SC_LATIN_8},
365   {"Latin-9","Euro",SC_LATIN_9},
366   {"Latin-10","Balkan",SC_LATIN_10},
367   {"Thai",NIL,SC_THAI},
368   {"Vietnamese",NIL,SC_VIETNAMESE},
369   {NIL, NIL, NIL}
370 };
371 
372 /* Look up script name or return entire table
373  * Accepts: script name or NIL
374  * Returns: pointer to script table entry or NIL if unknown
375  */
376 
utf8_script(char * script)377 SCRIPT *utf8_script (char *script)
378 {
379   unsigned long i;
380   if (!script) return (SCRIPT *) &utf8_scvalid[0];
381   else if (*script && (strlen (script) < 128))
382     for (i = 0; utf8_scvalid[i].name; i++)
383       if (!compare_cstring (script,utf8_scvalid[i].name))
384 	return (SCRIPT *) &utf8_scvalid[i];
385   return NIL;			/* failed */
386 }
387 
388 
389 /* Look up charset name or return entire table
390  * Accepts: charset name or NIL
391  * Returns: charset table entry or NIL if unknown
392  */
393 
utf8_charset(char * charset)394 const CHARSET *utf8_charset (char *charset)
395 {
396   unsigned long i;
397   if (!charset) return (CHARSET *) &utf8_csvalid[0];
398   else if (*charset && (strlen (charset) < 128))
399     for (i = 0; utf8_csvalid[i].name; i++)
400       if (!compare_cstring (charset,utf8_csvalid[i].name))
401 	return (CHARSET *) &utf8_csvalid[i];
402   return NIL;			/* failed */
403 }
404 
405 /* Validate charset and generate error message if invalid
406  * Accepts: bad character set
407  * Returns: NIL if good charset, else error message string
408  */
409 
410 #define BADCSS "[BADCHARSET ("
411 #define BADCSE ")] Unknown charset: "
412 
utf8_badcharset(char * charset)413 char *utf8_badcharset (char *charset)
414 {
415   char *msg = NIL;
416   if (!utf8_charset (charset)) {
417     char *s,*t;
418     unsigned long i,j;
419 				/* calculate size of header, trailer, and bad
420 				 * charset plus charset names */
421     for (i = 0, j = sizeof (BADCSS) + sizeof (BADCSE) + strlen (charset) - 2;
422 	 utf8_csvalid[i].name; i++)
423       j += strlen (utf8_csvalid[i].name) + 1;
424 				/* not built right */
425     if (!i) fatal ("No valid charsets!");
426 				/* header */
427     for (s = msg = (char *) fs_get (j), t = BADCSS; *t; *s++ = *t++);
428 				/* each charset */
429     for (i = 0; utf8_csvalid[i].name; *s++ = ' ', i++)
430       for (t = utf8_csvalid[i].name; *t; *s++ = *t++);
431 				/* back over last space, trailer */
432     for (t = BADCSE, --s; *t; *s++ = *t++);
433 				/* finally bogus charset */
434     for (t = charset; *t; *s++ = *t++);
435     *s++ = '\0';		/* finally tie off string */
436     if (s != (msg + j)) fatal ("charset msg botch");
437   }
438   return msg;
439 }
440 
441 /* Convert charset labelled sized text to UTF-8
442  * Accepts: source sized text
443  *	    charset
444  *	    pointer to returned sized text if non-NIL
445  *	    flags
446  * Returns: T if successful, NIL if failure
447  */
448 
utf8_text(SIZEDTEXT * text,char * charset,SIZEDTEXT * ret,long flags)449 long utf8_text (SIZEDTEXT *text,char *charset,SIZEDTEXT *ret,long flags)
450 {
451   ucs4cn_t cv = (flags & U8T_CASECANON) ? ucs4_titlecase : NIL;
452   ucs4de_t de = (flags & U8T_DECOMPOSE) ? ucs4_decompose_recursive : NIL;
453   const CHARSET *cs = (charset && *charset) ?
454     utf8_charset (charset) : utf8_infercharset (text);
455   if (cs) return (text && ret) ? utf8_text_cs (text,cs,ret,cv,de) : LONGT;
456   if (ret) {			/* no conversion possible */
457     ret->data = text->data;	/* so return source */
458     ret->size = text->size;
459   }
460   return NIL;			/* failure */
461 }
462 
463 
464 /* Operations used in converting data */
465 
/* UTF8_COUNT_BMP: add to count the UTF-8 size of c as given by
 * UTF8_SIZE_BMP, after applying the canonicalization function cv and the
 * decomposition function de when they are non-NIL.  While de leaves "more"
 * set, de is called again with U8G_ERROR to fetch each further character,
 * which is counted too, until de returns zero.
 * The macro assigns to c and expands to a braced block, not an expression.
 */
466 #define UTF8_COUNT_BMP(count,c,cv,de) {		\
467   void *more = NIL;				\
468   if (cv) c = (*cv) (c);			\
469   if (de) c = (*de) (c,&more);			\
470   do count += UTF8_SIZE_BMP(c);			\
471   while (more && (c = (*de) (U8G_ERROR,&more)));\
472 }
473 
/* UTF8_WRITE_BMP: emit c through UTF8_PUT_BMP using b, after applying the
 * canonicalization function cv and the decomposition function de when they
 * are non-NIL.  While de leaves "more" set, de is called again with
 * U8G_ERROR to fetch each further character, which is emitted too, until de
 * returns zero.
 * The macro assigns to c and expands to a braced block, not an expression.
 */
474 #define UTF8_WRITE_BMP(b,c,cv,de) {		\
475   void *more = NIL;				\
476   if (cv) c = (*cv) (c);			\
477   if (de) c = (*de) (c,&more);			\
478   do UTF8_PUT_BMP (b,c)				\
479   while (more && (c = (*de) (U8G_ERROR,&more)));\
480 }
481 
/* UTF8_COUNT: add to count the value of utf8_size() for c, after applying
 * the canonicalization function cv and the decomposition function de when
 * they are non-NIL.  While de leaves "more" set, de is called again with
 * U8G_ERROR to fetch each further character, which is counted too, until de
 * returns zero.
 * The macro assigns to c and expands to a braced block, not an expression.
 */
482 #define UTF8_COUNT(count,c,cv,de) {		\
483   void *more = NIL;				\
484   if (cv) c = (*cv) (c);			\
485   if (de) c = (*de) (c,&more);			\
486   do count += utf8_size (c);			\
487   while (more && (c = (*de) (U8G_ERROR,&more)));\
488 }
489 
/* UTF8_WRITE: emit c with utf8_put(), storing its return value back into b,
 * after applying the canonicalization function cv and the decomposition
 * function de when they are non-NIL.  While de leaves "more" set, de is
 * called again with U8G_ERROR to fetch each further character, which is
 * emitted too, until de returns zero.
 * The macro assigns to b and c and expands to a braced block, not an
 * expression.
 */
490 #define UTF8_WRITE(b,c,cv,de) {			\
491   void *more = NIL;				\
492   if (cv) c = (*cv) (c);			\
493   if (de) c = (*de) (c,&more);			\
494   do b = utf8_put (b,c);			\
495   while (more && (c = (*de) (U8G_ERROR,&more)));\
496 }
497 
498 /* Convert sized text to UTF-8 given CHARSET block
499  * Accepts: source sized text
500  *	    CHARSET block
501  *	    pointer to returned sized text
502  *	    canonicalization function
503  *	    decomposition function
504  * Returns: T if successful, NIL if failure
505  */
506 
utf8_text_cs(SIZEDTEXT * text,const CHARSET * cs,SIZEDTEXT * ret,ucs4cn_t cv,ucs4de_t de)507 long utf8_text_cs (SIZEDTEXT *text,const CHARSET *cs,SIZEDTEXT *ret,
508 		   ucs4cn_t cv,ucs4de_t de)
509 {
510   ret->data = text->data;	/* default to source */
511   ret->size = text->size;
512   switch (cs->type) {		/* convert if type known */
513   case CT_ASCII:		/* 7-bit ASCII no table */
514   case CT_UTF8:			/* variable UTF-8 encoded Unicode no table */
515     if (cv || de) utf8_text_utf8 (text,ret,cv,de);
516     break;
517   case CT_1BYTE0:		/* 1 byte no table */
518     utf8_text_1byte0 (text,ret,cv,de);
519     break;
520   case CT_1BYTE:		/* 1 byte ASCII + table 0x80-0xff */
521     utf8_text_1byte (text,ret,cs->tab,cv,de);
522     break;
523   case CT_1BYTE8:		/* 1 byte table 0x00 - 0xff */
524     utf8_text_1byte8 (text,ret,cs->tab,cv,de);
525     break;
526   case CT_EUC:			/* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
527     utf8_text_euc (text,ret,cs->tab,cv,de);
528     break;
529   case CT_DBYTE:		/* 2 byte ASCII + utf8_eucparam */
530     utf8_text_dbyte (text,ret,cs->tab,cv,de);
531     break;
532   case CT_DBYTE2:		/* 2 byte ASCII + utf8_eucparam plane1/2 */
533     utf8_text_dbyte2 (text,ret,cs->tab,cv,de);
534     break;
535   case CT_UTF7:			/* variable UTF-7 encoded Unicode no table */
536     utf8_text_utf7 (text,ret,cv,de);
537     break;
538   case CT_UCS2:			/* 2 byte 16-bit Unicode no table */
539     utf8_text_ucs2 (text,ret,cv,de);
540     break;
541   case CT_UCS4:			/* 4 byte 32-bit Unicode no table */
542     utf8_text_ucs4 (text,ret,cv,de);
543     break;
544   case CT_UTF16:		/* variable UTF-16 encoded Unicode no table */
545     utf8_text_utf16 (text,ret,cv,de);
546     break;
547   case CT_2022:			/* variable ISO-2022 encoded no table*/
548     utf8_text_2022 (text,ret,cv,de);
549     break;
550   case CT_SJIS:			/* 2 byte Shift-JIS encoded JIS no table */
551     utf8_text_sjis (text,ret,cv,de);
552     break;
553   default:			/* unknown character set type */
554     return NIL;
555   }
556   return LONGT;			/* return success */
557 }
558 
559 /* Reverse mapping routines
560  *
561  * These routines only support character sets, not all possible charsets.  In
562  * particular, they do not support any Unicode encodings or ISO 2022.
563  *
564  * As a special dispensation, utf8_cstext() and utf8_cstocstext() support
565  * ISO-2022-JP if EUC-JP can be reverse mapped; and utf8_rmaptext()
566  * will generate ISO-2022-JP using an EUC-JP rmap if flagged to do so.
567  *
568  * No attempt is made to map "equivalent" Unicode characters or Unicode
569  * characters that have the same glyph; nor is there any attempt to handle
570  * combining characters or otherwise do any stringprep.  Maybe later.
571  */
572 
573 
574 /* Convert UTF-8 sized text to charset
575  * Accepts: source sized text
576  *	    destination charset
577  *	    pointer to returned sized text
578  *	    substitute character if not in cs, else NIL to return failure
579  * Returns: T if successful, NIL if failure
580  */
581 
582 
utf8_cstext(SIZEDTEXT * text,char * charset,SIZEDTEXT * ret,unsigned long errch)583 long utf8_cstext (SIZEDTEXT *text,char *charset,SIZEDTEXT *ret,
584 		  unsigned long errch)
585 {
586   short iso2022jp = !compare_cstring (charset,"ISO-2022-JP");
587   unsigned short *rmap = utf8_rmap (iso2022jp ? "EUC-JP" : charset);
588   return rmap ? utf8_rmaptext (text,rmap,ret,errch,iso2022jp) : NIL;
589 }
590 
591 /* Convert charset labelled sized text to another charset
592  * Accepts: source sized text
593  *	    source charset
594  *	    pointer to returned sized text
595  *	    destination charset
596  *	    substitute character if not in dest cs, else NIL to return failure
597  * Returns: T if successful, NIL if failure
598  *
599  * This routine has the same restricts as utf8_cstext().
600  */
601 
utf8_cstocstext(SIZEDTEXT * src,char * sc,SIZEDTEXT * dst,char * dc,unsigned long errch)602 long utf8_cstocstext (SIZEDTEXT *src,char *sc,SIZEDTEXT *dst,char *dc,
603 		      unsigned long errch)
604 {
605   SIZEDTEXT utf8;
606   const CHARSET *scs,*dcs;
607   unsigned short *rmap;
608   long ret = NIL;
609   long iso2022jp;
610 				/* lookup charsets and reverse map */
611   if ((dc && (dcs = utf8_charset (dc))) &&
612       (rmap = (iso2022jp = ((dcs->type == CT_2022) &&
613 			    !compare_cstring (dcs->name,"ISO-2022-JP"))) ?
614        utf8_rmap ("EUC-JP") : utf8_rmap_cs (dcs)) &&
615       (scs = (sc && *sc) ? utf8_charset (sc) : utf8_infercharset (src))) {
616 				/* init temporary buffer */
617     memset (&utf8,NIL,sizeof (SIZEDTEXT));
618 				/* source cs equivalent to dest cs? */
619     if ((scs->type == dcs->type) && (scs->tab == dcs->tab)) {
620       dst->data = src->data;	/* yes, just copy pointers */
621       dst->size = src->size;
622       ret = LONGT;
623     }
624 				/* otherwise do the conversion */
625     else ret = (utf8_text_cs (src,scs,&utf8,NIL,NIL) &&
626 		utf8_rmaptext (&utf8,rmap,dst,errch,iso2022jp));
627 				/* flush temporary buffer */
628     if (utf8.data && (utf8.data != src->data) && (utf8.data != dst->data))
629       fs_give ((void **) &utf8.data);
630   }
631   return ret;
632 }
633 
634 /* Cached rmap */
635 
/* Single-entry reverse map cache: currmapcs is the charset the cached map
 * was built for and currmap is that map.  Both are set together by
 * utf8_rmap_cs(), which also passes currmap to utf8_rmap_gen() so the same
 * storage is reused when a map for a different charset is built.
 */
636 static const CHARSET *currmapcs = NIL;
637 static unsigned short *currmap = NIL;
638 
639 
640 /* Cache and return map for UTF-8 -> character set
641  * Accepts: character set name
642  * Returns: cached map if character set found, else NIL
643  */
644 
utf8_rmap(char * charset)645 unsigned short *utf8_rmap (char *charset)
646 {
647   return (currmapcs && !compare_cstring (charset,currmapcs->name)) ? currmap :
648     utf8_rmap_cs (utf8_charset (charset));
649 }
650 
651 
652 /* Cache and return map for UTF-8 -> character set given CHARSET block
653  * Accepts: CHARSET block
654  * Returns: cached map if character set found, else NIL
655  */
656 
utf8_rmap_cs(const CHARSET * cs)657 unsigned short *utf8_rmap_cs (const CHARSET *cs)
658 {
659   unsigned short *ret = NIL;
660   if (!cs);			/* have charset? */
661   else if (cs == currmapcs) ret = currmap;
662   else if ((ret = utf8_rmap_gen (cs,currmap)) != NULL) {
663     currmapcs = cs;
664     currmap = ret;
665   }
666   return ret;
667 }
668 
669 /* Return map for UTF-8 -> character set given CHARSET block
670  * Accepts: CHARSET block
671  *	    old map to recycle
672  * Returns: map if character set found, else NIL
673  */
674 
utf8_rmap_gen(const CHARSET * cs,unsigned short * oldmap)675 unsigned short *utf8_rmap_gen (const CHARSET *cs,unsigned short *oldmap)
676 {
677   unsigned short u,*tab,*rmap;
678   unsigned int i,m,ku,ten;
679   struct utf8_eucparam *param,*p2;
680   switch (cs->type) {		/* is a character set? */
681   case CT_ASCII:		/* 7-bit ASCII no table */
682   case CT_1BYTE0:		/* 1 byte no table */
683   case CT_1BYTE:		/* 1 byte ASCII + table 0x80-0xff */
684   case CT_1BYTE8:		/* 1 byte table 0x00 - 0xff */
685   case CT_EUC:			/* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
686   case CT_DBYTE:		/* 2 byte ASCII + utf8_eucparam */
687   case CT_DBYTE2:		/* 2 byte ASCII + utf8_eucparam plane1/2 */
688   case CT_SJIS:			/* 2 byte Shift-JIS */
689     rmap = oldmap ? oldmap :	/* recycle old map if supplied else make new */
690       (unsigned short *) fs_get (65536 * sizeof (unsigned short));
691 				/* initialize table for ASCII */
692     for (i = 0; i < 128; i++) rmap[i] = (unsigned short) i;
693 				/* populate remainder of table with NOCHAR */
694 #define NOCHARBYTE (NOCHAR & 0xff)
695 #if NOCHAR - ((NOCHARBYTE << 8) | NOCHARBYTE)
696     while (i < 65536) rmap[i++] = NOCHAR;
697 #else
698     memset (rmap + 128,NOCHARBYTE,(65536 - 128) * sizeof (unsigned short));
699 #endif
700     break;
701   default:			/* unsupported charset type */
702     rmap = NIL;			/* no map possible */
703   }
704   if (rmap) {			/* have a map? */
705     switch (cs->type) {		/* additional reverse map actions */
706     case CT_1BYTE0:		/* 1 byte no table */
707       for (i = 128; i < 256; i++) rmap[i] = (unsigned short) i;
708       break;
709     case CT_1BYTE:		/* 1 byte ASCII + table 0x80-0xff */
710       for (tab = (unsigned short *) cs->tab,i = 128; i < 256; i++)
711 	if (tab[i & BITS7] != UBOGON) rmap[tab[i & BITS7]] = (unsigned short)i;
712       break;
713     case CT_1BYTE8:		/* 1 byte table 0x00 - 0xff */
714       for (tab = (unsigned short *) cs->tab,i = 0; i < 256; i++)
715 	if (tab[i] != UBOGON) rmap[tab[i]] = (unsigned short) i;
716       break;
717     case CT_EUC:		/* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
718       for (param = (struct utf8_eucparam *) cs->tab,
719 	     tab = (unsigned short *) param->tab, ku = 0;
720 	   ku < param->max_ku; ku++)
721 	for (ten = 0; ten < param->max_ten; ten++)
722 	  if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON)
723 	    rmap[u] = ((ku + param->base_ku) << 8) +
724 	      (ten + param->base_ten) + 0x8080;
725       break;
726 
727     case CT_DBYTE:		/* 2 byte ASCII + utf8_eucparam */
728       for (param = (struct utf8_eucparam *) cs->tab,
729 	     tab = (unsigned short *) param->tab, ku = 0;
730 	   ku < param->max_ku; ku++)
731 	for (ten = 0; ten < param->max_ten; ten++)
732 	  if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON)
733 	    rmap[u] = ((ku + param->base_ku) << 8) + (ten + param->base_ten);
734       break;
735     case CT_DBYTE2:		/* 2 byte ASCII + utf8_eucparam plane1/2 */
736       param = (struct utf8_eucparam *) cs->tab;
737       p2 = param + 1;		/* plane 2 parameters */
738 				/* only ten parameters should differ */
739       if ((param->base_ku != p2->base_ku) || (param->max_ku != p2->max_ku))
740 	fatal ("ku definition error for CT_DBYTE2 charset");
741 				/* total codepoints in each ku */
742       m = param->max_ten + p2->max_ten;
743       tab = (unsigned short *) param->tab;
744       for (ku = 0; ku < param->max_ku; ku++) {
745 	for (ten = 0; ten < param->max_ten; ten++)
746 	  if ((u = tab[(ku * m) + ten]) != UBOGON)
747 	    rmap[u] = ((ku + param->base_ku) << 8) + (ten + param->base_ten);
748 	for (ten = 0; ten < p2->max_ten; ten++)
749 	  if ((u = tab[(ku * m) + param->max_ten + ten]) != UBOGON)
750 	    rmap[u] = ((ku + param->base_ku) << 8) + (ten + p2->base_ten);
751       }
752       break;
753     case CT_SJIS:		/* 2 byte Shift-JIS */
754       for (ku = 0; ku < MAX_JIS0208_KU; ku++)
755 	for (ten = 0; ten < MAX_JIS0208_TEN; ten++)
756 	  if ((u = jis0208tab[ku][ten]) != UBOGON) {
757 	    int sku = ku + BASE_JIS0208_KU;
758 	    int sten = ten + BASE_JIS0208_TEN;
759 	    rmap[u] = ((((sku + 1) >> 1) + ((sku < 95) ? 112 : 176)) << 8) +
760 	      sten + ((sku % 2) ? ((sten > 95) ? 32 : 31) : 126);
761 	  }
762 				/* JIS Roman */
763       rmap[UCS2_YEN] = JISROMAN_YEN;
764       rmap[UCS2_OVERLINE] = JISROMAN_OVERLINE;
765 				/* JIS hankaku katakana */
766       for (u = 0; u < (MAX_KANA_8 - MIN_KANA_8); u++)
767 	rmap[UCS2_KATAKANA + u] = MIN_KANA_8 + u;
768       break;
769     }
770 				/* hack: map NBSP to SP if otherwise no map */
771     if (rmap[0x00a0] == NOCHAR) rmap[0x00a0] = rmap[0x0020];
772   }
773   return rmap;			/* return map */
774 }
775 
776 /* Convert UTF-8 sized text to charset using rmap
777  * Accepts: source sized text
778  *	    conversion rmap
779  *	    pointer to returned sized text
780  *	    substitute character if not in rmap, else NIL to return failure
781  *	    ISO-2022-JP conversion flag
782  * Returns T if successful, NIL if failure
783  *
784  * This routine doesn't try to convert to all possible charsets; in particular
785  * it doesn't support other Unicode encodings or any ISO 2022 other than
786  * ISO-2022-JP.
787  */
788 
utf8_rmaptext(SIZEDTEXT * text,unsigned short * rmap,SIZEDTEXT * ret,unsigned long errch,long iso2022jp)789 long utf8_rmaptext (SIZEDTEXT *text,unsigned short *rmap,SIZEDTEXT *ret,
790 		    unsigned long errch,long iso2022jp)
791 {
792   unsigned long i,u,c;
793 				/* get size of buffer */
794   if ((i = utf8_rmapsize (text,rmap,errch,iso2022jp)) != 0L) {
795     unsigned char *s = text->data;
796     unsigned char *t = ret->data = (unsigned char *) fs_get (i);
797     ret->size = i - 1;		/* number of octets in destination buffer */
798 				/* start non-zero ISO-2022-JP state at 1 */
799     if (iso2022jp) iso2022jp = 1;
800 				/* convert string, ignore BOM */
801     for (i = text->size; i;) if ((u = utf8_get (&s,&i)) != UCS2_BOM) {
802 				/* substitute error character for NOCHAR */
803       if ((u & U8GM_NONBMP) || ((c = rmap[u]) == NOCHAR)) c = errch;
804       switch (iso2022jp) {	/* depends upon ISO 2022 mode */
805       case 0:			/* ISO 2022 not in effect */
806 				/* two-byte character */
807 	if (c > 0xff) *t++ = (unsigned char) (c >> 8);
808 				/* single-byte or low-byte of two-byte */
809 	*t++ = (unsigned char) (c & 0xff);
810 	break;
811       case 1:			/* ISO 2022 Roman */
812 				/* <ch> */
813 	if (c < 0x80) *t++ = (unsigned char) c;
814 	else {			/* JIS character */
815 	  *t++ = I2C_ESC;	/* ESC $ B <hi> <lo> */
816 	  *t++ = I2C_MULTI;
817 	  *t++ = I2CS_94x94_JIS_NEW;
818 	  *t++ = (unsigned char) (c >> 8) & 0x7f;
819 	  *t++ = (unsigned char) c & 0x7f;
820 	  iso2022jp = 2;	/* shift to ISO 2022 JIS */
821 	}
822 	break;
823       case 2:			/* ISO 2022 JIS */
824 	if (c > 0x7f) {		/* <hi> <lo> */
825 	  *t++ = (unsigned char) (c >> 8) & 0x7f;
826 	  *t++ = (unsigned char) c & 0x7f;
827 	}
828 	else {			/* ASCII character */
829 	  *t++ = I2C_ESC;	/* ESC ( J <ch> */
830 	  *t++ = I2C_G0_94;
831 	  *t++ = I2CS_94_JIS_ROMAN;
832 	  *t++ = (unsigned char) c;
833 	  iso2022jp = 1;	/* shift to ISO 2022 Roman */
834 	}
835 	break;
836       }
837     }
838     if (iso2022jp == 2) {	/* ISO-2022-JP string must end in Roman */
839       *t++ = I2C_ESC;		/* ESC ( J */
840       *t++ = I2C_G0_94;
841       *t++ = I2CS_94_JIS_ROMAN;
842     }
843     *t++ = NIL;			/* tie off returned data */
844     return LONGT;		/* return success */
845   }
846   ret->data = NIL;
847   ret->size = 0;
848   return NIL;			/* failure */
849 }
850 
851 /* Calculate size of convertsion of UTF-8 sized text to charset using rmap
852  * Accepts: source sized text
853  *	    conversion rmap
854  *	    pointer to returned sized text
855  *	    substitute character if not in rmap, else NIL to return failure
856  *	    ISO-2022-JP conversion flag
857  * Returns size+1 if successful, NIL if failure
858  *
859  * This routine doesn't try to handle to all possible charsets; in particular
860  * it doesn't support other Unicode encodings or any ISO 2022 other than
861  * ISO-2022-JP.
862  */
863 
utf8_rmapsize(SIZEDTEXT * text,unsigned short * rmap,unsigned long errch,long iso2022jp)864 unsigned long utf8_rmapsize (SIZEDTEXT *text,unsigned short *rmap,
865 			     unsigned long errch,long iso2022jp)
866 {
867   unsigned long i,u,c;
868   unsigned long ret = 1;	/* terminating NUL */
869   unsigned char *s = text->data;
870   if (iso2022jp) iso2022jp = 1;	/* start non-zero ISO-2022-JP state at 1 */
871   for (i = text->size; i;) if ((u = utf8_get (&s,&i)) != UCS2_BOM) {
872     if ((u & U8GM_NONBMP) || (((c = rmap[u]) == NOCHAR) && !(c = errch)))
873       return NIL;		/* not in BMP, or NOCHAR and no err char */
874     switch (iso2022jp) {	/* depends upon ISO 2022 mode */
875     case 0:			/* ISO 2022 not in effect */
876       ret += (c > 0xff) ? 2 : 1;
877       break;
878     case 1:			/* ISO 2022 Roman */
879       if (c < 0x80) ret += 1;	/* <ch> */
880       else {			/* JIS character */
881 	ret += 5;		/* ESC $ B <hi> <lo> */
882 	iso2022jp = 2;		/* shift to ISO 2022 JIS */
883       }
884       break;
885     case 2:			/* ISO 2022 JIS */
886       if (c > 0x7f) ret += 2;	/* <hi> <lo> */
887       else {			/* ASCII character */
888 	ret += 4;		/* ESC ( J <ch> */
889 	iso2022jp = 1;		/* shift to ISO 2022 Roman */
890       }
891       break;
892     }
893   }
894   if (iso2022jp == 2) {		/* ISO-2022-JP string must end in Roman */
895     ret += 3;			/* ESC ( J */
896     iso2022jp = 1;		/* reset state to Roman */
897   }
898   return ret;
899 }
900 
901 /* Convert UCS-4 to charset using rmap
902  * Accepts: source UCS-4 character(s)
903  *	    number of UCS-4 characters
904  *	    conversion rmap
905  *	    pointer to returned sized text
906  *	    substitute character if not in rmap, else NIL to return failure
907  * Returns T if successful, NIL if failure
908  *
909  * Currently only supports BMP characters, and does not support ISO-2022
910  */
911 
ucs4_rmaptext(unsigned long * ucs4,unsigned long len,unsigned short * rmap,SIZEDTEXT * ret,unsigned long errch)912 long ucs4_rmaptext (unsigned long *ucs4,unsigned long len,unsigned short *rmap,
913 		    SIZEDTEXT *ret,unsigned long errch)
914 {
915   long size = ucs4_rmaplen (ucs4,len,rmap,errch);
916   return (size >= 0) ?		/* build in newly-created buffer */
917     ucs4_rmapbuf (ret->data = (unsigned char *) fs_get ((ret->size = size) +1),
918 		  ucs4,len,rmap,errch) : NIL;
919 }
920 
921 /* Return size of UCS-4 string converted to other CS via rmap
922  * Accepts: source UCS-4 character(s)
923  *	    number of UCS-4 characters
924  *	    conversion rmap
925  *	    substitute character if not in rmap, else NIL to return failure
926  * Returns: length if success, negative if failure (no-convert)
927  */
928 
ucs4_rmaplen(unsigned long * ucs4,unsigned long len,unsigned short * rmap,unsigned long errch)929 long ucs4_rmaplen (unsigned long *ucs4,unsigned long len,unsigned short *rmap,
930 		   unsigned long errch)
931 {
932   long ret;
933   unsigned long i,u,c;
934 				/* count non-BOM characters */
935   for (ret = 0,i = 0; i < len; ++i) if ((u = ucs4[i]) != UCS2_BOM) {
936     if ((u & U8GM_NONBMP) || (((c = rmap[u]) == NOCHAR) && !(c = errch)))
937       return -1;		/* not in BMP, or NOCHAR and no err char? */
938     ret += (c > 0xff) ? 2 : 1;
939   }
940   return ret;
941 }
942 
943 
944 /* Stuff buffer with UCS-4 string converted to other CS via rmap
945  * Accepts: destination buffer
946  *	    source UCS-4 character(s)
947  *	    number of UCS-4 characters
948  *	    conversion rmap
949  *	    substitute character if not in rmap, else NIL to return failure
950  * Returns: T, always
951  */
952 
ucs4_rmapbuf(unsigned char * t,unsigned long * ucs4,unsigned long len,unsigned short * rmap,unsigned long errch)953 long ucs4_rmapbuf (unsigned char *t,unsigned long *ucs4,unsigned long len,
954 		   unsigned short *rmap,unsigned long errch)
955 {
956   unsigned long i,u,c;
957 				/* convert non-BOM characters */
958   for (i = 0; i < len; ++i) if ((u = ucs4[i]) != UCS2_BOM) {
959 				/* substitute error character for NOCHAR */
960     if ((u & U8GM_NONBMP) || ((c = rmap[u]) == NOCHAR)) c = errch;
961 				/* two-byte character? */
962     if (c > 0xff) *t++ = (unsigned char) (c >> 8);
963 				/* single-byte or low-byte of two-byte */
964     *t++ = (unsigned char) (c & 0xff);
965   }
966   *t++ = NIL;			/* tie off returned data */
967   return LONGT;
968 }
969 
970 /* Return UCS-4 Unicode character from UTF-8 string
971  * Accepts: pointer to string
972  *	    remaining octets in string
973  * Returns: UCS-4 character with pointer and count updated
974  *	    or error code with pointer and count unchanged
975  */
976 
utf8_get(unsigned char ** s,unsigned long * i)977 unsigned long utf8_get (unsigned char **s,unsigned long *i)
978 {
979   unsigned char *t = *s;
980   unsigned long j = *i;
981 				/* decode raw UTF-8 string */
982   unsigned long ret = utf8_get_raw (&t,&j);
983   if (ret & U8G_ERROR);		/* invalid raw UTF-8 decoding? */
984 				/* no, is it surrogate? */
985   else if ((ret >= UTF16_SURR) && (ret <= UTF16_MAXSURR)) ret = U8G_SURROGA;
986 				/* or in non-Unicode ISO 10646 space? */
987   else if (ret > UCS4_MAXUNICODE) ret = U8G_NOTUNIC;
988   else {
989     *s = t;			/* all is well, update pointer */
990     *i = j;			/* and counter */
991   }
992   return ret;			/* return value */
993 }
994 
/* Return raw (including non-Unicode) UCS-4 character from UTF-8 string
 * Accepts: pointer to string
 *	    remaining octets in string
 * Returns: UCS-4 character with pointer and count updated
 *	    or error code with pointer and count unchanged
 *
 * Error returns visible in this routine:
 *  U8G_ENDSTRG  no octets available at the start of a character
 *  U8G_ENDSTRI  octets ran out after a lead octet was accepted
 *  U8G_BADCONT  continuation octet (0x80-0xbf) seen with no sequence open
 *  U8G_INCMPLT  non-continuation octet seen while a sequence was open
 *  U8G_NOTUTF8  lead octet rejected (0xc0, 0xc1, 0xfe, 0xff, or a lead whose
 *		 following octet fails the range test for that lead)
 *
 * With the "#if 0" alternative compiled out, lead octets up to 0xfd are
 * accepted, so values above U+10FFFF can be returned; surrogates (lead 0xed
 * with second octet above 0x9f) are rejected here.
 */

unsigned long utf8_get_raw (unsigned char **s,unsigned long *i)
{
  unsigned char c,c1;
  unsigned char *t = *s;	/* work on copies; commit only on success */
  unsigned long j = *i;
				/* default result if no lead octet case fits */
  unsigned long ret = U8G_NOTUTF8;
  int more = 0;			/* continuation octets still expected */
  do {				/* make sure have source octets available */
    if (!j--) return more ? U8G_ENDSTRI : U8G_ENDSTRG;
				/* UTF-8 continuation? */
    else if (((c = *t++) > 0x7f) && (c < 0xc0)) {
				/* continuation when not in progress */
      if (!more) return U8G_BADCONT;
      --more;			/* found a continuation octet */
      ret <<= 6;		/* shift current value by 6 bits */
      ret |= c & 0x3f;		/* merge continuation octet */
    }
				/* incomplete UTF-8 character */
    else if (more) return U8G_INCMPLT;
    else {			/* start of sequence */
				/* peek at next octet without consuming it */
      c1 = j ? *t : 0xbf;	/* assume valid continuation if incomplete */
      if (c < 0x80) ret = c;	/* U+0000 - U+007f */
      else if (c < 0xc2);	/* c0 and c1 never valid */
      else if (c < 0xe0) {	/* U+0080 - U+07ff */
	c &= 0x1f;		/* keep the 5 payload bits of the lead */
	if (c1 >= 0x80) more = 1;
      }
      else if (c == 0xe0) {	/* U+0800 - U+0fff */
	c &= 0x0f;
				/* second octet below a0 would be overlong */
	if (c1 >= 0xa0) more = 2;
      }
      else if (c < 0xed) {	/* U+1000 - U+cfff */
	c &= 0x0f;
	if (c1 >= 0x80) more = 2;
      }
      else if (c == 0xed) {	/* U+d000 - U+d7ff */
	c &= 0x0f;
				/* second octet above 9f would be a surrogate;
				 * j == 0 is tested because the assumed c1
				 * (0xbf) would otherwise fail the range test
				 */
	if (j == 0 || ((c1 >= 0x80) && (c1 <= 0x9f))) more = 2;
      }
      else if (c < 0xf0) {	/* U+e000 - U+ffff */
	c &= 0x0f;
	if (c1 >= 0x80) more = 2;
      }
      else if (c == 0xf0) {	/* U+10000 - U+3ffff */
	c &= 0x07;
				/* second octet below 90 would be overlong */
	if (c1 >= 0x90) more = 3;
      }
      else if (c < 0xf3) {	/* U+40000 - U+fffff */
	c &= 0x07;
	if (c1 >= 0x80) more = 3;
      }
#if 0
				/* strict alternative, currently disabled */
      else if (c == 0xf4) {	/* U+100000 - U+10ffff */
	c &= 0x07;
	if (((c1 >= 0x80) && (c1 <= 0x8f))) more = 3;
      }
#else
      else if (c < 0xf8) {	/* U+100000 - U+10ffff (and 110000 - 1fffff) */
	c &= 0x07;
	if ((c1 >= 0x80)) more = 3;
      }
      else if (c < 0xfc) {	/* ISO 10646 200000 - 3ffffff */
	c &= 0x03;
	if ((c1 >= 0x80)) more = 4;
      }
      else if (c < 0xfe) {	/* ISO 10646 4000000 - 7fffffff */
	c &= 0x01;
	if ((c1 >= 0x80)) more = 5;
      }
#endif
				/* fe and ff never valid */
      if (more) {		/* multi-octet, make sure more to come */
	if (!j) return U8G_ENDSTRI;
	ret = c;		/* continuation needed, save start bits */
      }
    }
  } while (more);		/* loop until all continuations are merged */
  if (!(ret & U8G_ERROR)) {	/* success return? */
    *s = t;			/* yes, update pointer */
    *i = j;			/* and counter */
  }
  return ret;			/* return value */
}
1085 
1086 /* Return UCS-4 character from named charset string
1087  * Accepts: charset
1088  *	    pointer to string
1089  *	    remaining octets in string
1090  * Returns: UCS-4 character with pointer and count updated, negative if error
1091  *
1092  * Error codes are the same as utf8_get().
1093  */
1094 
ucs4_cs_get(CHARSET * cs,unsigned char ** s,unsigned long * i)1095 unsigned long ucs4_cs_get (CHARSET *cs,unsigned char **s,unsigned long *i)
1096 {
1097   unsigned char c,c1,ku,ten;
1098   unsigned long ret,d;
1099   unsigned char *t = *s;
1100   unsigned long j = *i;
1101   struct utf8_eucparam *p1,*p2,*p3;
1102   if (j--) c = *t++;		/* get first octet */
1103   else return U8G_ENDSTRG;	/* empty string */
1104   switch (cs->type) {		/* convert if type known */
1105   case CT_UTF8:			/* variable UTF-8 encoded Unicode no table */
1106     return utf8_get (s,i);
1107   case CT_ASCII:		/* 7-bit ASCII no table */
1108     if (c >= 0x80) return U8G_NOTUTF8;
1109   case CT_1BYTE0:		/* 1 byte no table */
1110     ret = c;			/* identity */
1111     break;
1112   case CT_1BYTE:		/* 1 byte ASCII + table 0x80-0xff */
1113     ret = (c > 0x80) ? ((unsigned short *) cs->tab)[c & BITS7] : c;
1114     break;
1115   case CT_1BYTE8:		/* 1 byte table 0x00 - 0xff */
1116     ret = ((unsigned short *) cs->tab)[c];
1117     break;
1118 
1119   case CT_EUC:			/* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
1120     if (c & BIT8) {
1121       p1 = (struct utf8_eucparam *) cs->tab;
1122       p2 = p1 + 1;
1123       p3 = p1 + 2;
1124       if (j--) c1 = *t++;	/* get second octet */
1125       else return U8G_ENDSTRI;
1126       if (!(c1 & BIT8)) return U8G_NOTUTF8;
1127       switch (c) {		/* check 8bit code set */
1128       case EUC_CS2:		/* CS2 */
1129 	if (p2->base_ku) {	/* CS2 set up? */
1130 	  if (p2->base_ten) {	/* yes, multibyte? */
1131 	    if (j--) c = *t++;	/* get second octet */
1132 	    else return U8G_ENDSTRI;
1133 	    if ((c & BIT8) &&
1134 		((ku = (c1 & BITS7) - p2->base_ku) < p2->max_ku) &&
1135 		((ten = (c & BITS7) - p2->base_ten) < p2->max_ten)) {
1136 	      ret = ((unsigned short *) p2->tab)[(ku*p2->max_ten) + ten];
1137 	      break;
1138 	    }
1139 	  }
1140 	  else if ((c1 >= p2->base_ku) && (c1 < p2->max_ku)) {
1141 	    ret = c1 + ((unsigned long) p2->tab);
1142 	    break;
1143 	  }
1144 	}
1145 	return U8G_NOTUTF8;	/* CS2 not set up or bogus */
1146       case EUC_CS3:		/* CS3 */
1147 	if (p3->base_ku) {	/* CS3 set up? */
1148 	  if (p3->base_ten) {	/* yes, multibyte? */
1149 	    if (j--) c = *t++;	/* get second octet */
1150 	    else return U8G_ENDSTRI;
1151 	    if ((c & BIT8) &&
1152 		((ku = (c1 & BITS7) - p3->base_ku) < p3->max_ku) &&
1153 		((ten = (c & BITS7) - p3->base_ten) < p3->max_ten)) {
1154 	      ret = ((unsigned short *) p3->tab)[(ku*p3->max_ten) + ten];
1155 	      break;
1156 	    }
1157 	  }
1158 	  else if ((c1 >= p3->base_ku) && (c1 < p3->max_ku)) {
1159 	    ret = c1 + ((unsigned long) p3->tab);
1160 	    break;
1161 	  }
1162 	}
1163 	return U8G_NOTUTF8;	/* CS3 not set up or bogus */
1164       default:
1165 	if (((ku = (c & BITS7) - p1->base_ku) >= p1->max_ku) ||
1166 	    ((ten = (c1 & BITS7) - p1->base_ten) >= p1->max_ten))
1167 	  return U8G_NOTUTF8;
1168 	ret = ((unsigned short *) p1->tab)[(ku*p1->max_ten) + ten];
1169 		/* special hack for JIS X 0212: merge rows less than 10 */
1170 	if ((ret == UBOGON) && ku && (ku < 10) && p3->tab && p3->base_ten)
1171 	  ret = ((unsigned short *) p3->tab)
1172 	    [((ku - (p3->base_ku - p1->base_ku))*p3->max_ten) + ten];
1173 	break;
1174       }
1175     }
1176     else ret = c;		/* ASCII character */
1177     break;
1178 
1179   case CT_DBYTE:		/* 2 byte ASCII + utf8_eucparam */
1180     if (c & BIT8) {		/* double-byte character? */
1181       p1 = (struct utf8_eucparam *) cs->tab;
1182       if (j--) c1 = *t++;	/* get second octet */
1183       else return U8G_ENDSTRI;
1184       if (((ku = c - p1->base_ku) < p1->max_ku) &&
1185 	  ((ten = c1 - p1->base_ten) < p1->max_ten))
1186 	ret = ((unsigned short *) p1->tab)[(ku*p1->max_ten) + ten];
1187       else return U8G_NOTUTF8;
1188     }
1189     else ret = c;		/* ASCII character */
1190     break;
1191   case CT_DBYTE2:		/* 2 byte ASCII + utf8_eucparam plane1/2 */
1192     if (c & BIT8) {		/* double-byte character? */
1193       p1 = (struct utf8_eucparam *) cs->tab;
1194       p2 = p1 + 1;
1195       if (j--) c1 = *t++;	/* get second octet */
1196       else return U8G_ENDSTRI;
1197       if (c1 & BIT8) {		/* high vs. low plane */
1198 	if ((ku = c - p2->base_ku) < p2->max_ku &&
1199 	    ((ten = c1 - p2->base_ten) < p2->max_ten))
1200 	  ret = ((unsigned short *) p1->tab)
1201 	    [(ku*(p1->max_ten + p2->max_ten)) + p1->max_ten + ten];
1202 	else return U8G_NOTUTF8;
1203       }
1204       else if ((ku = c - p1->base_ku) < p1->max_ku &&
1205 	       ((ten = c1 - p1->base_ten) < p1->max_ten))
1206 	  ret = ((unsigned short *) p1->tab)
1207 	    [(ku*(p1->max_ten + p2->max_ten)) + ten];
1208       else return U8G_NOTUTF8;
1209     }
1210     else ret = c;		/* ASCII character */
1211     break;
1212   case CT_SJIS:			/* 2 byte Shift-JIS encoded JIS no table */
1213 				/* compromise - do yen sign but not overline */
1214     if (!(c & BIT8)) ret = (c == JISROMAN_YEN) ? UCS2_YEN : c;
1215 				/* half-width katakana? */
1216     else if ((c >= MIN_KANA_8) && (c < MAX_KANA_8)) ret = c + KANA_8;
1217     else {			/* Shift-JIS */
1218       if (j--) c1 = *t++;	/* get second octet */
1219       else return U8G_ENDSTRI;
1220       SJISTOJIS (c,c1);
1221       ret = JISTOUNICODE (c,c1,ku,ten);
1222     }
1223     break;
1224 
1225   case CT_UCS2:			/* 2 byte 16-bit Unicode no table */
1226     ret = c << 8;
1227     if (j--) c = *t++;		/* get second octet */
1228     else return U8G_ENDSTRI;	/* empty string */
1229     ret |= c;
1230     break;
1231   case CT_UCS4:			/* 4 byte 32-bit Unicode no table */
1232     if (c & 0x80) return U8G_NOTUTF8;
1233     if (j < 3) return U8G_ENDSTRI;
1234     j -= 3;			/* count three octets */
1235     ret = c << 24;
1236     ret |= (*t++) << 16;
1237     ret |= (*t++) << 8;
1238     ret |= (*t++);
1239     break;
1240   case CT_UTF16:		/* variable UTF-16 encoded Unicode no table */
1241     ret = c << 8;
1242     if (j--) c = *t++;		/* get second octet */
1243     else return U8G_ENDSTRI;	/* empty string */
1244     ret |= c;
1245 				/* surrogate? */
1246     if ((ret >= UTF16_SURR) && (ret <= UTF16_MAXSURR)) {
1247 				/* invalid first surrogate */
1248       if ((ret > UTF16_SURRHEND) || (j < 2)) return U8G_NOTUTF8;
1249       j -= 2;			/* count two octets */
1250       d = (*t++) << 8;		/* first octet of second surrogate */
1251       d |= *t++;		/* second octet of second surrogate */
1252       if ((d < UTF16_SURRL) || (d > UTF16_SURRLEND)) return U8G_NOTUTF8;
1253       ret = UTF16_BASE + ((ret & UTF16_MASK) << UTF16_SHIFT) +
1254 	(d & UTF16_MASK);
1255     }
1256     break;
1257   default:			/* unknown/unsupported character set type */
1258     return U8G_NOTUTF8;
1259   }
1260   *s = t;			/* update pointer and counter */
1261   *i = j;
1262   return ret;
1263 }
1264 
1265 /* Produce charset validity map for BMP
1266  * Accepts: list of charsets to map
1267  * Returns: validity map, indexed by BMP codepoint
1268  *
1269  * Bit 0x1 is the "not-CJK" character bit
1270  */
1271 
utf8_csvalidmap(char * charsets[])1272 unsigned long *utf8_csvalidmap (char *charsets[])
1273 {
1274   unsigned short u,*tab;
1275   unsigned int m,ku,ten;
1276   unsigned long i,csi,csb;
1277   struct utf8_eucparam *param,*p2;
1278   char *s;
1279   const CHARSET *cs;
1280   unsigned long *ret = (unsigned long *)
1281     fs_get (i = 0x10000 * sizeof (unsigned long));
1282   memset (ret,0,i);		/* zero the entire vector */
1283 				/* mark all the non-CJK codepoints */
1284 	/* U+0000 - U+2E7F non-CJK */
1285   for (i = 0; i < 0x2E7F; ++i) ret[i] = 0x1;
1286 	/* U+2E80 - U+2EFF CJK Radicals Supplement
1287 	 * U+2F00 - U+2FDF Kangxi Radicals
1288 	 * U+2FE0 - U+2FEF unassigned
1289 	 * U+2FF0 - U+2FFF Ideographic Description Characters
1290 	 * U+3000 - U+303F CJK Symbols and Punctuation
1291 	 * U+3040 - U+309F Hiragana
1292 	 * U+30A0 - U+30FF Katakana
1293 	 * U+3100 - U+312F BoPoMoFo
1294 	 * U+3130 - U+318F Hangul Compatibility Jamo
1295 	 * U+3190 - U+319F Kanbun
1296 	 * U+31A0 - U+31BF BoPoMoFo Extended
1297 	 * U+31C0 - U+31EF CJK Strokes
1298 	 * U+31F0 - U+31FF Katakana Phonetic Extensions
1299 	 * U+3200 - U+32FF Enclosed CJK Letters and Months
1300 	 * U+3300 - U+33FF CJK Compatibility
1301 	 * U+3400 - U+4DBF CJK Unified Ideographs Extension A
1302 	 * U+4DC0 - U+4DFF Yijing Hexagram Symbols
1303 	 * U+4E00 - U+9FFF CJK Unified Ideographs
1304 	 * U+A000 - U+A48F Yi Syllables
1305 	 * U+A490 - U+A4CF Yi Radicals
1306 	 * U+A700 - U+A71F Modifier Tone Letters
1307 	 */
1308   for (i = 0xa720; i < 0xabff; ++i) ret[i] = 0x1;
1309 	/* U+AC00 - U+D7FF Hangul Syllables */
1310   for (i = 0xd800; i < 0xf8ff; ++i) ret[i] = 0x1;
1311 	/* U+F900 - U+FAFF CJK Compatibility Ideographs */
1312   for (i = 0xfb00; i < 0xfe2f; ++i) ret[i] = 0x1;
1313 	/* U+FE30 - U+FE4F CJK Compatibility Forms
1314 	 * U+FE50 - U+FE6F Small Form Variants (for CNS 11643)
1315 	 */
1316   for (i = 0xfe70; i < 0xfeff; ++i) ret[i] = 0x1;
1317 	/* U+FF00 - U+FFEF CJK Compatibility Ideographs */
1318   for (i = 0xfff0; i < 0x10000; ++i) ret[i] = 0x1;
1319 
1320 				/* for each supplied charset */
1321   for (csi = 1; ret && charsets && (s = charsets[csi - 1]); ++csi) {
1322 				/* substitute EUC-JP for ISO-2022-JP */
1323     if (!compare_cstring (s,"ISO-2022-JP")) s = "EUC-JP";
1324 				/* look up charset */
1325     if ((cs = utf8_charset (s)) != NULL) {
1326       csb = 1 << csi;		/* charset bit */
1327       switch (cs->type) {
1328       case CT_ASCII:		/* 7-bit ASCII no table */
1329       case CT_1BYTE0:		/* 1 byte no table */
1330       case CT_1BYTE:		/* 1 byte ASCII + table 0x80-0xff */
1331       case CT_1BYTE8:		/* 1 byte table 0x00 - 0xff */
1332       case CT_EUC:		/* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
1333       case CT_DBYTE:		/* 2 byte ASCII + utf8_eucparam */
1334       case CT_DBYTE2:		/* 2 byte ASCII + utf8_eucparam plane1/2 */
1335       case CT_SJIS:		/* 2 byte Shift-JIS */
1336 				/* supported charset type, all ASCII is OK */
1337 	for (i = 0; i < 128; ++i) ret[i] |= csb;
1338 	break;
1339       default:			/* unsupported charset type */
1340 	fs_give ((void **) &ret);
1341 	break;
1342       }
1343 				/* now do additional operations */
1344       if (ret) switch (cs->type) {
1345       case CT_1BYTE0:		/* 1 byte no table */
1346 	for (i = 128; i < 256; i++) ret[i] |= csb;
1347 	break;
1348       case CT_1BYTE:		/* 1 byte ASCII + table 0x80-0xff */
1349 	for (tab = (unsigned short *) cs->tab,i = 128; i < 256; i++)
1350 	  if (tab[i & BITS7] != UBOGON) ret[tab[i & BITS7]] |= csb;
1351 	break;
1352       case CT_1BYTE8:		/* 1 byte table 0x00 - 0xff */
1353 	for (tab = (unsigned short *) cs->tab,i = 0; i < 256; i++)
1354 	  if (tab[i] != UBOGON) ret[tab[i]] |= csb;
1355       break;
1356       case CT_EUC:		/* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
1357 	for (param = (struct utf8_eucparam *) cs->tab,
1358 	       tab = (unsigned short *) param->tab, ku = 0;
1359 	     ku < param->max_ku; ku++)
1360 	  for (ten = 0; ten < param->max_ten; ten++)
1361 	    if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON)
1362 	      ret[u] |= csb;
1363 	break;
1364 
1365       case CT_DBYTE:		/* 2 byte ASCII + utf8_eucparam */
1366 	for (param = (struct utf8_eucparam *) cs->tab,
1367 	       tab = (unsigned short *) param->tab, ku = 0;
1368 	     ku < param->max_ku; ku++)
1369 	  for (ten = 0; ten < param->max_ten; ten++)
1370 	    if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON)
1371 	      ret[u] |= csb;
1372       break;
1373       case CT_DBYTE2:		/* 2 byte ASCII + utf8_eucparam plane1/2 */
1374 	param = (struct utf8_eucparam *) cs->tab;
1375 	p2 = param + 1;		/* plane 2 parameters */
1376 				/* only ten parameters should differ */
1377 	if ((param->base_ku != p2->base_ku) || (param->max_ku != p2->max_ku))
1378 	  fatal ("ku definition error for CT_DBYTE2 charset");
1379 				/* total codepoints in each ku */
1380 	m = param->max_ten + p2->max_ten;
1381 	tab = (unsigned short *) param->tab;
1382 	for (ku = 0; ku < param->max_ku; ku++) {
1383 	  for (ten = 0; ten < param->max_ten; ten++)
1384 	    if ((u = tab[(ku * m) + ten]) != UBOGON)
1385 	      ret[u] |= csb;
1386 	  for (ten = 0; ten < p2->max_ten; ten++)
1387 	    if ((u = tab[(ku * m) + param->max_ten + ten]) != UBOGON)
1388 	      ret[u] |= csb;
1389 	}
1390 	break;
1391       case CT_SJIS:		/* 2 byte Shift-JIS */
1392 	for (ku = 0; ku < MAX_JIS0208_KU; ku++)
1393 	  for (ten = 0; ten < MAX_JIS0208_TEN; ten++)
1394 	    if ((u = jis0208tab[ku][ten]) != UBOGON) ret[u] |= csb;
1395 				/* JIS hankaku katakana */
1396 	for (u = 0; u < (MAX_KANA_8 - MIN_KANA_8); u++)
1397 	  ret[UCS2_KATAKANA + u] |= csb;
1398 	break;
1399       }
1400     }
1401 				/* invalid charset, punt */
1402     else fs_give ((void **) &ret);
1403   }
1404   return ret;
1405 }
1406 
1407 /* Infer charset from unlabelled sized text
1408  * Accepts: sized text
1409  * Returns: charset if one inferred, or NIL if unknown
1410  */
1411 
utf8_infercharset(SIZEDTEXT * src)1412 const CHARSET *utf8_infercharset (SIZEDTEXT *src)
1413 {
1414   long iso2022jp = NIL;
1415   long eightbit = NIL;
1416   unsigned long i;
1417 				/* look for ISO 2022 */
1418   if (src) for (i = 0; i < src->size; i++) {
1419 				/* ESC sequence? */
1420     if ((src->data[i] == I2C_ESC) && (++i < src->size)) switch (src->data[i]) {
1421     case I2C_MULTI:		/* yes, multibyte? */
1422       if (++i < src->size) switch (src->data[i]) {
1423       case I2CS_94x94_JIS_OLD:	/* JIS X 0208-1978 */
1424       case I2CS_94x94_JIS_NEW:	/* JIS X 0208-1983 */
1425       case I2CS_94x94_JIS_EXT:	/* JIS X 0212-1990 (kludge...) */
1426 	iso2022jp = T;		/* found an ISO-2022-JP sequence */
1427 	break;
1428       default:			/* other multibyte */
1429 	return NIL;		/* definitely invalid */
1430       }
1431       break;
1432     case I2C_G0_94:		/* single byte */
1433       if (++i < src->size) switch (src->data[i]) {
1434       case I2CS_94_JIS_BUGROM:	/* in case old buggy software */
1435       case I2CS_94_JIS_ROMAN:	/* JIS X 0201-1976 left half */
1436       case I2CS_94_ASCII:	/* ASCII */
1437       case I2CS_94_BRITISH:	/* good enough for gov't work */
1438 	break;
1439       default:			/* other 94 single byte */
1440 	return NIL;		/* definitely invalid */
1441       }
1442     }
1443 				/* if possible UTF-8 and not ISO-2022-JP */
1444     else if (!iso2022jp && (eightbit >= 0) && (src->data[i] & BIT8) &&
1445 	     (eightbit = utf8_validate (src->data + i,src->size - i)) > 0)
1446       i += eightbit - 1;	/* skip past all but last of UTF-8 char */
1447   }
1448 				/* ISO-2022-JP overrides other guesses */
1449   if (iso2022jp) return utf8_charset ("ISO-2022-JP");
1450   if (eightbit > 0) return utf8_charset ("UTF-8");
1451   return eightbit ? NIL : utf8_charset ("US-ASCII");
1452 }
1453 
1454 
1455 /* Validate that character at this position is UTF-8
1456  * Accepts: string pointer
1457  *	    size of remaining string
1458  * Returns: size of UTF-8 character in octets or -1 if not UTF-8
1459  */
1460 
utf8_validate(unsigned char * s,unsigned long i)1461 long utf8_validate (unsigned char *s,unsigned long i)
1462 {
1463   unsigned long j = i;
1464   return (utf8_get (&s,&i) & U8G_ERROR) ? -1 : j - i;
1465 }
1466 
1467 /* Convert ISO 8859-1 to UTF-8
1468  * Accepts: source sized text
1469  *	    pointer to return sized text
1470  *	    canonicalization function
1471  */
1472 
utf8_text_1byte0(SIZEDTEXT * text,SIZEDTEXT * ret,ucs4cn_t cv,ucs4de_t de)1473 void utf8_text_1byte0 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
1474 {
1475   unsigned long i;
1476   unsigned char *s;
1477   unsigned int c;
1478   for (ret->size = i = 0; i < text->size;) {
1479     c = text->data[i++];
1480     UTF8_COUNT_BMP (ret->size,c,cv,de)
1481   }
1482   (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] =NIL;
1483   for (i = 0; i < text->size;) {
1484     c = text->data[i++];
1485     UTF8_WRITE_BMP (s,c,cv,de)	/* convert UCS-2 to UTF-8 */
1486   }
1487 }
1488 
1489 
1490 /* Convert single byte ASCII+8bit character set sized text to UTF-8
1491  * Accepts: source sized text
1492  *	    pointer to return sized text
1493  *	    conversion table
1494  *	    canonicalization function
1495  */
1496 
utf8_text_1byte(SIZEDTEXT * text,SIZEDTEXT * ret,void * tab,ucs4cn_t cv,ucs4de_t de)1497 void utf8_text_1byte (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
1498 		      ucs4de_t de)
1499 {
1500   unsigned long i;
1501   unsigned char *s;
1502   unsigned int c;
1503   unsigned short *tbl = (unsigned short *) tab;
1504   for (ret->size = i = 0; i < text->size;) {
1505     if ((c = text->data[i++]) & BIT8) c = tbl[c & BITS7];
1506     UTF8_COUNT_BMP (ret->size,c,cv,de)
1507   }
1508   (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] =NIL;
1509   for (i = 0; i < text->size;) {
1510     if ((c = text->data[i++]) & BIT8) c = tbl[c & BITS7];
1511     UTF8_WRITE_BMP (s,c,cv,de)	/* convert UCS-2 to UTF-8 */
1512   }
1513 }
1514 
1515 /* Convert single byte 8bit character set sized text to UTF-8
1516  * Accepts: source sized text
1517  *	    pointer to return sized text
1518  *	    conversion table
1519  *	    canonicalization function
1520  */
1521 
utf8_text_1byte8(SIZEDTEXT * text,SIZEDTEXT * ret,void * tab,ucs4cn_t cv,ucs4de_t de)1522 void utf8_text_1byte8 (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
1523 		       ucs4de_t de)
1524 {
1525   unsigned long i;
1526   unsigned char *s;
1527   unsigned int c;
1528   unsigned short *tbl = (unsigned short *) tab;
1529   for (ret->size = i = 0; i < text->size;) {
1530     c = tbl[text->data[i++]];
1531     UTF8_COUNT_BMP (ret->size,c,cv,de)
1532   }
1533   (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] =NIL;
1534   for (i = 0; i < text->size;) {
1535     c = tbl[text->data[i++]];
1536     UTF8_WRITE_BMP (s,c,cv,de)	/* convert UCS-2 to UTF-8 */
1537   }
1538 }
1539 
/* Convert EUC sized text to UTF-8
 * Accepts: source sized text
 *	    pointer to return sized text
 *	    EUC parameter table (three consecutive utf8_eucparam entries,
 *	     used here as code sets 1, 2 and 3)
 *	    canonicalization function
 *	    decomposition function
 *
 * The source is scanned twice with identical decoding logic: pass 0 counts
 * the UTF-8 octets needed, after which the return buffer is obtained from
 * fs_get() and NUL-terminated; pass 1 writes the UTF-8 octets.  Malformed or
 * out-of-range input is converted as UBOGON rather than reported.
 */

void utf8_text_euc (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
		    ucs4de_t de)
{
  unsigned long i;
  unsigned char *s;
  unsigned int pass,c,c1,ku,ten;
				/* parameters and tables for CS1, CS2, CS3 */
  struct utf8_eucparam *p1 = (struct utf8_eucparam *) tab;
  struct utf8_eucparam *p2 = p1 + 1;
  struct utf8_eucparam *p3 = p1 + 2;
  unsigned short *t1 = (unsigned short *) p1->tab;
  unsigned short *t2 = (unsigned short *) p2->tab;
  unsigned short *t3 = (unsigned short *) p3->tab;
				/* pass 0 counts, pass 1 writes */
  for (pass = 0,s = NIL,ret->size = 0; pass <= 1; pass++) {
    for (i = 0; i < text->size;) {
				/* not CS0? */
      if ((c = text->data[i++]) & BIT8) {
				/* yes, must have another high byte */
	if ((i >= text->size) || !((c1 = text->data[i++]) & BIT8))
	  c = UBOGON;		/* out of space or bogon */
	else switch (c) {	/* check 8bit code set */
	case EUC_CS2:		/* CS2 */
	  if (p2->base_ku) {	/* CS2 set up? */
				/* multibyte form needs a third octet, which
				 * supplies the ten while c1 supplies the ku
				 */
	    if (p2->base_ten)	/* yes, multibyte? */
	      c = ((i < text->size) && ((c = text->data[i++]) & BIT8) &&
		   ((ku = (c1 & BITS7) - p2->base_ku) < p2->max_ku) &&
		   ((ten = (c & BITS7) - p2->base_ten) < p2->max_ten)) ?
		     t2[(ku*p2->max_ten) + ten] : UBOGON;
				/* single byte form: the tab field is used as
				 * an integer offset added to the octet
				 */
	    else c = ((c1 >= p2->base_ku) && (c1 < p2->max_ku)) ?
	      c1 + ((unsigned long) p2->tab) : UBOGON;
	  }
	  else {		/* CS2 not set up */
	    c = UBOGON;		/* swallow byte, say bogon */
	    if (i < text->size) i++;
	  }
	  break;
	case EUC_CS3:		/* CS3 */
	  if (p3->base_ku) {	/* CS3 set up? */
				/* multibyte form needs a third octet, which
				 * supplies the ten while c1 supplies the ku
				 */
	    if (p3->base_ten)	/* yes, multibyte? */
	      c = ((i < text->size) && ((c = text->data[i++]) & BIT8) &&
		   ((ku = (c1 & BITS7) - p3->base_ku) < p3->max_ku) &&
		   ((ten = (c & BITS7) - p3->base_ten) < p3->max_ten)) ?
		     t3[(ku*p3->max_ten) + ten] : UBOGON;
				/* single byte form: the tab field is used as
				 * an integer offset added to the octet
				 */
	    else c = ((c1 >= p3->base_ku) && (c1 < p3->max_ku)) ?
	      c1 + ((unsigned long) p3->tab) : UBOGON;
	  }
	  else {		/* CS3 not set up */
	    c = UBOGON;		/* swallow byte, say bogon */
	    if (i < text->size) i++;
	  }
	  break;

	default:		/* CS1: first octet is ku, second is ten */
	  if (((ku = (c & BITS7) - p1->base_ku) >= p1->max_ku) ||
	      ((ten = (c1 & BITS7) - p1->base_ten) >= p1->max_ten)) c = UBOGON;
	  else if (((c = t1[(ku*p1->max_ten) + ten]) == UBOGON) &&
		   /* special hack for JIS X 0212: merge rows less than 10 */
		   ku && (ku < 10) && t3 && p3->base_ten)
	    c = t3[((ku - (p3->base_ku - p1->base_ku))*p3->max_ten) + ten];
	}
      }
				/* convert if second pass */
      if (pass) UTF8_WRITE_BMP (s,c,cv,de)
      else UTF8_COUNT_BMP (ret->size,c,cv,de);
    }
				/* after counting, get buffer and tie it off */
    if (!pass) (s = ret->data = (unsigned char *)
		fs_get (ret->size + 1))[ret->size] =NIL;
  }
}
1615 
1616 
/* Convert ASCII + double-byte sized text to UTF-8
 * Accepts: source sized text
 *	    pointer to return sized text
 *	    conversion table
 *	    canonicalization function
 *	    decomposition function
 */
1623 
void utf8_text_dbyte (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
		      ucs4de_t de)
{
  /* One decode loop run twice: pass 0 totals the UTF-8 size in ret->size,
   * pass 1 writes the octets into the buffer obtained between the passes.
   */
  struct utf8_eucparam *param = (struct utf8_eucparam *) tab;
  unsigned short *map = (unsigned short *) param->tab;
  unsigned char *dst = NIL;
  unsigned long pos;
  unsigned int pass,c,trail,ku,ten;
  ret->size = 0;
  for (pass = 0; pass < 2; ++pass) {
    pos = 0;
    while (pos < text->size) {
      c = text->data[pos++];
      if (c & BIT8) {		/* lead byte of a double-byte character */
				/* special hack for GBK: 0x80 is Euro */
	if ((c == 0x80) && (map == (unsigned short *) gb2312tab))
	  c = UCS2_EURO;
				/* no trail byte left */
	else if (pos >= text->size) c = UBOGON;
				/* trail byte is consumed; zero is a bogon */
	else if (!(trail = text->data[pos++])) c = UBOGON;
				/* row outside the table */
	else if ((ku = c - param->base_ku) >= param->max_ku) c = UBOGON;
				/* column outside the table */
	else if ((ten = trail - param->base_ten) >= param->max_ten)
	  c = UBOGON;
	else c = map[(ku*param->max_ten) + ten];
      }
      if (pass) {		/* second pass writes, first pass counts */
	UTF8_WRITE_BMP (dst,c,cv,de)
      }
      else {
	UTF8_COUNT_BMP (ret->size,c,cv,de)
      }
    }
    if (!pass) {		/* sizing done: allocate and NIL-terminate */
      dst = ret->data = (unsigned char *) fs_get (ret->size + 1);
      dst[ret->size] = NIL;
    }
  }
}
1656 
/* Convert ASCII + double byte 2 plane sized text to UTF-8
 * Accepts: source sized text
 *	    pointer to return sized text
 *	    conversion table
 *	    canonicalization function
 *	    decomposition function
 */
1663 
void utf8_text_dbyte2 (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
		       ucs4de_t de)
{
  /* One decode loop run twice: pass 0 totals the UTF-8 size in ret->size,
   * pass 1 writes the octets.  tab holds two struct utf8_eucparam entries
   * sharing one table: each table row holds the low-plane columns followed
   * by the high-plane columns.
   */
  struct utf8_eucparam *lo = (struct utf8_eucparam *) tab;
  struct utf8_eucparam *hi = lo + 1;
  struct utf8_eucparam *plane;
  unsigned short *map = (unsigned short *) lo->tab;
  unsigned char *dst = NIL;
  unsigned long pos;
  unsigned int pass,c,trail,ku,ten,skip;
  ret->size = 0;
  for (pass = 0; pass < 2; ++pass) {
    pos = 0;
    while (pos < text->size) {
      c = text->data[pos++];
      if (c & BIT8) {		/* lead byte of a double-byte character */
				/* out of space or bogon */
	if ((pos >= text->size) || !(trail = text->data[pos++])) c = UBOGON;
	else {
	  if (trail & BIT8) {	/* high plane follows the low columns */
	    plane = hi;
	    skip = lo->max_ten;
	  }
	  else {		/* low plane starts the row */
	    plane = lo;
	    skip = 0;
	  }
	  if (((ku = c - plane->base_ku) < plane->max_ku) &&
	      ((ten = trail - plane->base_ten) < plane->max_ten))
	    c = map[(ku*(lo->max_ten + hi->max_ten)) + skip + ten];
	  else c = UBOGON;
	}
      }
      if (pass) {		/* second pass writes, first pass counts */
	UTF8_WRITE_BMP (dst,c,cv,de)
      }
      else {
	UTF8_COUNT_BMP (ret->size,c,cv,de)
      }
    }
    if (!pass) {		/* sizing done: allocate and NIL-terminate */
      dst = ret->data = (unsigned char *) fs_get (ret->size + 1);
      dst[ret->size] = NIL;
    }
  }
}
1703 
1704 #ifdef JISTOUNICODE		/* Japanese */
/* Convert Shift JIS sized text to UTF-8
 * Accepts: source sized text
 *	    pointer to return sized text
 *	    canonicalization function
 *	    decomposition function
 */
1710 
void utf8_text_sjis (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,
		     ucs4de_t de)
{
  /* Two passes over one shared decode loop: pass 0 totals the UTF-8 size
   * in ret->size, pass 1 writes the octets.  Sharing the loop guarantees
   * that the writer applies the same end-of-input check as the counter; a
   * lead byte at the very end of the text becomes UBOGON in both passes
   * instead of reading one byte past text->data while writing.
   */
  unsigned long i;
  unsigned char *s;
  unsigned int pass,c,c1,ku,ten;
  for (pass = 0,s = NIL,ret->size = 0; pass <= 1; pass++) {
    for (i = 0; i < text->size;) {
      if ((c = text->data[i++]) & BIT8) {
				/* half-width katakana */
	if ((c >= MIN_KANA_8) && (c < MAX_KANA_8)) c += KANA_8;
				/* lead byte with no trail byte */
	else if (i >= text->size) c = UBOGON;
	else {			/* Shift-JIS */
	  c1 = text->data[i++];
	  SJISTOJIS (c,c1);
	  c = JISTOUNICODE (c,c1,ku,ten);
	}
      }
				/* compromise - do yen sign but not overline */
      else if (c == JISROMAN_YEN) c = UCS2_YEN;
				/* convert if second pass */
      if (pass) UTF8_WRITE_BMP (s,c,cv,de)
      else UTF8_COUNT_BMP (ret->size,c,cv,de);
    }
				/* sizing done: allocate and NIL-terminate */
    if (!pass) (s = ret->data = (unsigned char *)
		fs_get (ret->size + 1))[ret->size] = NIL;
  }
}
1748 #endif
1749 
/* Convert ISO-2022 sized text to UTF-8
 * Accepts: source sized text
 *	    pointer to returned sized text
 *	    canonicalization function
 *	    decomposition function
 */
1755 
void utf8_text_2022 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
{
  /* Two passes over the same state machine: pass 0 totals the output size
   * in ret->size, pass 1 writes the octets through s.  g[0..3] hold the
   * character set currently designated to G0..G3; gl and gr index the
   * element of g[] used for bytes without and with the high bit.
   * gi does double duty: while a designation is being parsed it is an
   * index into g[], and for ordinary characters it is the selected
   * character set value.
   */
  unsigned long i;
  unsigned char *s;
  unsigned int pass,state,c,co,gi,gl,gr,g[4],ku,ten;
  for (pass = 0,s = NIL,ret->size = 0; pass <= 1; pass++) {
				/* engine is reset per pass so that both
				 * passes decode identically */
    gi = 0;			/* quell compiler warnings */
    state = I2S_CHAR;		/* initialize engine */
    g[0]= g[2] = I2CS_ASCII;	/* G0 and G2 are ASCII */
    g[1]= g[3] = I2CS_ISO8859_1;/* G1 and G3 are ISO-8859-1 */
    gl = I2C_G0; gr = I2C_G1;	/* left is G0, right is G1 */
    for (i = 0; i < text->size;) {
      c = text->data[i++];
      switch (state) {		/* dispatch based upon engine state */
      case I2S_ESC:		/* ESC seen */
				/* NOTE(review): the shift cases below leave
				 * state at I2S_ESC, so the following byte is
				 * also parsed as an escape intermediate -
				 * confirm this is intended */
	switch (c) {		/* process intermediate character */
	case I2C_MULTI:		/* multibyte character? */
	  state = I2S_MUL;	/* mark multibyte flag seen */
	  break;
        case I2C_SS2:		/* single shift GL to G2 */
	case I2C_SS2_ALT:	/* Taiwan SeedNet */
	  gl |= I2C_SG2;
	  break;
        case I2C_SS3:		/* single shift GL to G3 */
	case I2C_SS3_ALT:	/* Taiwan SeedNet */
	  gl |= I2C_SG3;
	  break;
        case I2C_LS2:		/* shift GL to G2 */
	  gl = I2C_G2;
	  break;
        case I2C_LS3:		/* shift GL to G3 */
	  gl = I2C_G3;
	  break;
        case I2C_LS1R:		/* shift GR to G1 */
	  gr = I2C_G1;
	  break;
        case I2C_LS2R:		/* shift GR to G2 */
	  gr = I2C_G2;
	  break;
        case I2C_LS3R:		/* shift GR to G3 */
	  gr = I2C_G3;
	  break;
				/* state is I2S_ESC in this case, so the
				 * (state == I2S_MUL) tests below are never
				 * true and the single-byte value is stored */
	case I2C_G0_94: case I2C_G1_94: case I2C_G2_94:	case I2C_G3_94:
	  g[gi = c - I2C_G0_94] = (state == I2S_MUL) ? I2CS_94x94 : I2CS_94;
	  state = I2S_INT;	/* ready for character set */
	  break;
	case I2C_G0_96:	case I2C_G1_96: case I2C_G2_96:	case I2C_G3_96:
	  g[gi = c - I2C_G0_96] = (state == I2S_MUL) ? I2CS_96x96 : I2CS_96;
	  state = I2S_INT;	/* ready for character set */
	  break;
	default:		/* bogon */
				/* ESC and the unrecognized byte are passed
				 * through unchanged as two octets */
	  if (pass) *s++ = I2C_ESC,*s++ = c;
	  else ret->size += 2;
	  state = I2S_CHAR;	/* return to previous state */
	}
	break;

      case I2S_MUL:		/* ESC $ */
	switch (c) {		/* process multibyte intermediate character */
	case I2C_G0_94: case I2C_G1_94: case I2C_G2_94:	case I2C_G3_94:
	  g[gi = c - I2C_G0_94] = I2CS_94x94;
	  state = I2S_INT;	/* ready for character set */
	  break;
	case I2C_G0_96:	case I2C_G1_96: case I2C_G2_96:	case I2C_G3_96:
	  g[gi = c - I2C_G0_96] = I2CS_96x96;
	  state = I2S_INT;	/* ready for character set */
	  break;
				/* no designator byte: this byte is taken as
				 * the final byte of a G0 designation */
	default:		/* probably omitted I2CS_94x94 */
	  g[gi = I2C_G0] = I2CS_94x94 | c;
	  state = I2S_CHAR;	/* return to character state */
	}
	break;
      case I2S_INT:
				/* final byte is OR'ed into the value stored
				 * for the set being designated */
	state = I2S_CHAR;	/* return to character state */
	g[gi] |= c;		/* set character set */
	break;

      case I2S_CHAR:		/* character data */
	switch (c) {
	case I2C_ESC:		/* ESC character */
	  state = I2S_ESC;	/* see if ISO-2022 prefix */
	  break;
	case I2C_SI:		/* shift GL to G0 */
	  gl = I2C_G0;
	  break;
	case I2C_SO:		/* shift GL to G1 */
	  gl = I2C_G1;
	  break;
        case I2C_SS2_ALT:	/* single shift GL to G2 */
	case I2C_SS2_ALT_7:
	  gl |= I2C_SG2;
	  break;
        case I2C_SS3_ALT:	/* single shift GL to G3 */
	case I2C_SS3_ALT_7:
	  gl |= I2C_SG3;
	  break;

	default:		/* ordinary character */
	  co = c;		/* note original character */
				/* a pending single shift lives in the bits
				 * above the low two of gl; it selects the set
				 * for this one character and is then cleared */
	  if (gl & (3 << 2)) {	/* single shifted? */
	    gi = g[gl >> 2];	/* get shifted character set */
	    gl &= 0x3;		/* cancel shift */
	  }
				/* select left or right half */
	  else gi = (c & BIT8) ? g[gr] : g[gl];
	  c &= BITS7;		/* make 7-bit */
	  switch (gi) {		/* interpret in character set */
	  case I2CS_ASCII:	/* ASCII */
	    break;		/* easy! */
	  case I2CS_BRITISH:	/* British ASCII */
				/* Pound sterling sign */
	    if (c == BRITISH_POUNDSTERLING) c = UCS2_POUNDSTERLING;
	    break;
	  case I2CS_JIS_ROMAN:	/* JIS Roman */
	  case I2CS_JIS_BUGROM:	/* old bugs */
	    switch (c) {	/* two exceptions to ASCII */
	    case JISROMAN_YEN:	/* Yen sign */
	      c = UCS2_YEN;
	      break;
				/* overline */
	    case JISROMAN_OVERLINE:
	      c = UCS2_OVERLINE;
	      break;
	    }
	    break;
	  case I2CS_JIS_KANA:	/* JIS hankaku katakana */
	    if ((c >= MIN_KANA_7) && (c < MAX_KANA_7)) c += KANA_7;
	    break;

				/* the single-byte tables below are indexed
				 * with the 7-bit value of the character */
	  case I2CS_ISO8859_1:	/* Latin-1 (West European) */
	    c |= BIT8;		/* just turn on high bit */
	    break;
	  case I2CS_ISO8859_2:	/* Latin-2 (Czech, Slovak) */
	    c = iso8859_2tab[c];
	    break;
	  case I2CS_ISO8859_3:	/* Latin-3 (Dutch, Turkish) */
	    c = iso8859_3tab[c];
	    break;
	  case I2CS_ISO8859_4:	/* Latin-4 (Scandinavian) */
	    c = iso8859_4tab[c];
	    break;
	  case I2CS_ISO8859_5:	/* Cyrillic */
	    c = iso8859_5tab[c];
	    break;
	  case I2CS_ISO8859_6:	/* Arabic */
	    c = iso8859_6tab[c];
	    break;
	  case I2CS_ISO8859_7:	/* Greek */
	    c = iso8859_7tab[c];
	    break;
	  case I2CS_ISO8859_8:	/* Hebrew */
	    c = iso8859_8tab[c];
	    break;
	  case I2CS_ISO8859_9:	/* Latin-5 (Finnish, Portuguese) */
	    c = iso8859_9tab[c];
	    break;
	  case I2CS_TIS620:	/* Thai */
	    c = tis620tab[c];
	    break;
	  case I2CS_ISO8859_10:	/* Latin-6 (Northern Europe) */
	    c = iso8859_10tab[c];
	    break;
	  case I2CS_ISO8859_13:	/* Latin-7 (Baltic) */
	    c = iso8859_13tab[c];
	    break;
	  case I2CS_VSCII:	/* Vietnamese */
	    c = visciitab[c];
	    break;
	  case I2CS_ISO8859_14:	/* Latin-8 (Celtic) */
	    c = iso8859_14tab[c];
	    break;
	  case I2CS_ISO8859_15:	/* Latin-9 (Euro) */
	    c = iso8859_15tab[c];
	    break;
	  case I2CS_ISO8859_16:	/* Latin-10 (Baltic) */
	    c = iso8859_16tab[c];
	    break;

	  default:		/* all other character sets */
				/* multibyte character set */
	    if ((gi & I2CS_MUL) && !(c & BIT8) && isgraph (c)) {
				/* fetch the second byte, or 0 when the text
				 * ends after the first; co still holds the
				 * first byte as read */
	      c = (i < text->size) ? text->data[i++] : 0;
	      switch (gi) {
#ifdef GBTOUNICODE
	      case I2CS_GB:	/* GB 2312 */
		co |= BIT8;	/* make into EUC */
		c |= BIT8;
		c = GBTOUNICODE (co,c,ku,ten);
		break;
#endif
#ifdef JISTOUNICODE
	      case I2CS_JIS_OLD:/* JIS X 0208-1978 */
	      case I2CS_JIS_NEW:/* JIS X 0208-1983 */
		c = JISTOUNICODE (co,c,ku,ten);
		break;
#endif
#ifdef JIS0212TOUNICODE
	      case I2CS_JIS_EXT:/* JIS X 0212-1990 */
		c = JIS0212TOUNICODE (co,c,ku,ten);
		break;
#endif
#ifdef KSCTOUNICODE
	      case I2CS_KSC:	/* KSC 5601 */
		co |= BIT8;	/* make into EUC */
		c |= BIT8;
		c = KSCTOUNICODE (co,c,ku,ten);
		break;
#endif
#ifdef CNS1TOUNICODE
	      case I2CS_CNS1:	/* CNS 11643 plane 1 */
		c = CNS1TOUNICODE (co,c,ku,ten);
		break;
#endif
#ifdef CNS2TOUNICODE
	      case I2CS_CNS2:	/* CNS 11643 plane 2 */
		c = CNS2TOUNICODE (co,c,ku,ten);
		break;
#endif
#ifdef CNS3TOUNICODE
	      case I2CS_CNS3:	/* CNS 11643 plane 3 */
		c = CNS3TOUNICODE (co,c,ku,ten);
		break;
#endif
#ifdef CNS4TOUNICODE
	      case I2CS_CNS4:	/* CNS 11643 plane 4 */
		c = CNS4TOUNICODE (co,c,ku,ten);
		break;
#endif
#ifdef CNS5TOUNICODE
	      case I2CS_CNS5:	/* CNS 11643 plane 5 */
		c = CNS5TOUNICODE (co,c,ku,ten);
		break;
#endif
#ifdef CNS6TOUNICODE
	      case I2CS_CNS6:	/* CNS 11643 plane 6 */
		c = CNS6TOUNICODE (co,c,ku,ten);
		break;
#endif
#ifdef CNS7TOUNICODE
	      case I2CS_CNS7:	/* CNS 11643 plane 7 */
		c = CNS7TOUNICODE (co,c,ku,ten);
		break;
#endif
	      default:		/* unknown multibyte, treat as UCS-2 */
		c |= (co << 8);	/* wrong, but nothing else to do */
		break;
	      }
	    }
	    else c = co;	/* unknown single byte, treat as 8859-1 */
	  }
				/* convert if second pass */
	  if (pass) UTF8_WRITE_BMP (s,c,cv,de)
	  else UTF8_COUNT_BMP (ret->size,c,cv,de);
	}
      }
    }
				/* after sizing: allocate and NIL-terminate;
				 * after writing: the number of octets written
				 * must equal the number counted */
    if (!pass) (s = ret->data = (unsigned char *)
		fs_get (ret->size + 1))[ret->size] = NIL;
    else if (((unsigned long) (s - ret->data)) != ret->size)
      fatal ("ISO-2022 to UTF-8 botch");
  }
}
2018 
/* Convert UTF-7 sized text to UTF-8
 * Accepts: source sized text
 *	    pointer to returned sized text
 *	    canonicalization function
 *	    decomposition function
 */
2024 
void utf8_text_utf7 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
{
  /* Two passes over the same decoder: pass 0 totals the output size in
   * ret->size, pass 1 writes the octets through s.  The first switch
   * advances the state machine (and turns a BASE64 character into its
   * 6-bit value); the second switch stores a character when one is ready.
   * Nothing is emitted for surrogate code units: a high surrogate is only
   * remembered, and the value combined from a low surrogate is dropped by
   * the break that follows it.
   */
  unsigned long i;
  unsigned char *s;
  unsigned int c,c1,d,uc,pass,e,e1,state,surrh;
  for (pass = 0,s = NIL,ret->size = 0; pass <= 1; pass++) {
				/* reset decoder so both passes match; surrh is
				 * included so that a low surrogate with no
				 * preceding high one never reads it unset */
    c1 = d = uc = e = e1 = surrh = 0;
    for (i = 0,state = NIL; i < text->size;) {
      c = text->data[i++];	/* get next byte */
      switch (state) {
      case U7_PLUS:		/* previous character was + */
	if (c == '-') {		/* +- means textual + */
	  c = '+';
	  state = U7_ASCII;	/* revert to ASCII */
	  break;
	}
	state = U7_UNICODE;	/* enter Unicode state */
	e = e1 = 0;		/* initialize Unicode quantum position */
				/* fall through: this byte is already BASE64 */
      case U7_UNICODE:		/* Unicode state */
	if (c == '-') state = U7_MINUS;
	else {			/* decode Unicode */
	  /* don't use isupper/islower since this is ASCII only */
	  if ((c >= 'A') && (c <= 'Z')) c -= 'A';
	  else if ((c >= 'a') && (c <= 'z')) c -= 'a' - 26;
	  else if (isdigit (c)) c -= '0' - 52;
	  else if (c == '+') c = 62;
	  else if (c == '/') c = 63;
	  else state = U7_ASCII;/* end of modified BASE64 */
	}
	break;
      case U7_MINUS:		/* previous character was absorbed - */
	state = U7_ASCII;	/* revert to ASCII */
				/* fall through: treat this byte as ASCII */
      case U7_ASCII:		/* ASCII state */
	if (c == '+') state = U7_PLUS;
	break;
      }

				/* U7_PLUS and U7_MINUS have no case below, so
				 * the + and the absorbed - emit nothing */
      switch (state) {		/* store character if in character mode */
      case U7_UNICODE:		/* Unicode */
	switch (e++) {		/* install based on BASE64 state */
	case 0:
	  c1 = c << 2;		/* byte 1: high 6 bits */
	  break;
	case 1:
	  d = c1 | (c >> 4);	/* byte 1: low 2 bits */
	  c1 = c << 4;		/* byte 2: high 4 bits */
	  break;
	case 2:
	  d = c1 | (c >> 2);	/* byte 2: low 4 bits */
	  c1 = c << 6;		/* byte 3: high 2 bits */
	  break;
	case 3:
	  d = c | c1;		/* byte 3: low 6 bits */
	  e = 0;		/* reinitialize mechanism */
	  break;
	}
	if (e == 1) break;	/* done if first BASE64 state */
	if (!e1) {		/* first byte of UCS-2 character */
	  uc = (d & 0xff) << 8;	/* note first byte */
	  e1 = T;		/* enter second UCS-2 state */
	  break;		/* done */
	}
	c = uc | (d & 0xff);	/* build UCS-2 character */
	e1 = NIL;		/* back to first UCS-2 state, drop in */
				/* surrogate pair?  */
	if ((c >= UTF16_SURR) && (c <= UTF16_MAXSURR)) {
				/* save high surrogate for later */
	  if (c < UTF16_SURRL) surrh = c;
	  else c = UTF16_BASE + ((surrh & UTF16_MASK) << UTF16_SHIFT) +
		 (c & UTF16_MASK);
	  break;		/* either way with surrogates, we're done */
	}
				/* fall through: emit the UCS-2 character */
      case U7_ASCII:		/* just install if ASCII */
				/* convert if second pass */
	if (pass) UTF8_WRITE_BMP (s,c,cv,de)
	else UTF8_COUNT_BMP (ret->size,c,cv,de);
      }
    }
				/* after sizing: allocate and NIL-terminate;
				 * after writing: the number of octets written
				 * must equal the number counted */
    if (!pass) (s = ret->data = (unsigned char *)
		fs_get (ret->size + 1))[ret->size] = NIL;
    else if (((unsigned long) (s - ret->data)) != ret->size)
      fatal ("UTF-7 to UTF-8 botch");
  }
}
2109 
2110 
/* Convert UTF-8 sized text to UTF-8
 * Accepts: source sized text
 *	    pointer to returned sized text
 *	    canonicalization function
 *	    decomposition function
 */
2116 
void utf8_text_utf8 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
{
  unsigned long left,c;
  unsigned char *dst,*src;
				/* first walk: validate and total the size */
  ret->size = 0;
  src = text->data;
  left = text->size;
  while (left) {
    c = utf8_get (&src,&left);
    if (c & U8G_ERROR) {	/* conversion failed */
				/* hand back the source buffer itself */
      ret->data = text->data;
      ret->size = text->size;
      return;
    }
    UTF8_COUNT (ret->size,c,cv,de)
  }
				/* allocate result with terminating NIL */
  dst = ret->data = (unsigned char *) fs_get (ret->size + 1);
  dst[ret->size] = NIL;
				/* second walk: input is known to be valid */
  src = text->data;
  left = text->size;
  while (left) {
    c = utf8_get (&src,&left);
    UTF8_WRITE (dst,c,cv,de)	/* convert UCS-4 to UTF-8 */
  }
				/* written octets must match counted octets */
  if (((unsigned long) (dst - ret->data)) != ret->size)
    fatal ("UTF-8 to UTF-8 botch");
}
2137 
/* Convert UCS-2 sized text to UTF-8
 * Accepts: source sized text
 *	    pointer to returned sized text
 *	    canonicalization function
 *	    decomposition function
 */
2143 
void utf8_text_ucs2 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
{
  unsigned long n;
  unsigned char *dst,*src;
  unsigned int c;
				/* first walk: total the UTF-8 size; an odd
				 * trailing byte is ignored */
  ret->size = 0;
  src = text->data;
  for (n = text->size / 2; n; --n) {
    c = (src[0] << 8) | src[1];	/* high-order octet comes first */
    src += 2;
    UTF8_COUNT_BMP (ret->size,c,cv,de);
  }
				/* allocate result with terminating NIL */
  dst = ret->data = (unsigned char *) fs_get (ret->size + 1);
  dst[ret->size] = NIL;
				/* second walk: write the octets */
  src = text->data;
  for (n = text->size / 2; n; --n) {
    c = (src[0] << 8) | src[1];
    src += 2;
    UTF8_WRITE_BMP (dst,c,cv,de)	/* convert UCS-2 to UTF-8 */
  }
				/* written octets must match counted octets */
  if (((unsigned long) (dst - ret->data)) != ret->size)
    fatal ("UCS-2 to UTF-8 botch");
}
2163 
2164 
/* Convert UCS-4 sized text to UTF-8
 * Accepts: source sized text
 *	    pointer to returned sized text
 *	    canonicalization function
 *	    decomposition function
 */
2170 
void utf8_text_ucs4 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
{
  /* Two passes over the same loop: pass 0 totals the UTF-8 size in
   * ret->size, pass 1 writes the octets.  Both passes visit exactly
   * text->size / 4 characters; trailing bytes that do not make up a full
   * 4-octet unit are ignored.
   */
  unsigned long i;
  unsigned char *s,*t;
  unsigned long c;
  unsigned int pass;
  for (pass = 0,s = NIL,ret->size = 0; pass <= 1; pass++) {
    for (t = text->data, i = text->size / 4; i; --i, t += 4) {
				/* widen before shifting so the top octet is
				 * never shifted into the sign bit of an int */
      c = (((unsigned long) t[0]) << 24) | (((unsigned long) t[1]) << 16) |
	  (((unsigned long) t[2]) << 8) | ((unsigned long) t[3]);
				/* convert if second pass */
      if (pass) UTF8_WRITE (s,c,cv,de)
      else UTF8_COUNT (ret->size,c,cv,de);
    }
				/* sizing done: allocate and NIL-terminate */
    if (!pass) (s = ret->data = (unsigned char *)
		fs_get (ret->size + 1))[ret->size] = NIL;
  }
				/* written octets must match counted octets */
  if (((unsigned long) (s - ret->data)) != ret->size)
    fatal ("UCS-4 to UTF-8 botch");
}
2188 
/* Convert UTF-16 sized text to UTF-8
 * Accepts: source sized text
 *	    pointer to returned sized text
 *	    canonicalization function
 *	    decomposition function
 */
2194 
void utf8_text_utf16 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
{
  /* Two passes over the same loop: pass 0 totals the UTF-8 size in
   * ret->size, pass 1 writes the octets.  i counts the 16-bit units that
   * remain, including the one being decoded, so a second surrogate may be
   * fetched only while i is at least 2.
   */
  unsigned long i;
  unsigned char *s,*t;
  unsigned long c,d;
  unsigned int pass;
  for (pass = 0,s = NIL,ret->size = 0; pass <= 1; pass++) {
    for (t = text->data, i = text->size / 2; i; --i) {
      c = ((unsigned long) t[0] << 8) | t[1];
      t += 2;
				/* possible surrogate? */
      if ((c >= UTF16_SURR) && (c <= UTF16_MAXSURR)) {
				/* invalid first surrogate, or no unit left
				 * to pair it with */
	if ((c > UTF16_SURRHEND) || (i < 2)) c = UBOGON;
	else {			/* get second surrogate */
	  d = ((unsigned long) t[0] << 8) | t[1];
	  t += 2;
	  --i;			/* swallowed another 16-bits */
				/* invalid second surrogate */
	  if ((d < UTF16_SURRL) || (d > UTF16_SURRLEND)) c = UBOGON;
	  else c = UTF16_BASE + ((c & UTF16_MASK) << UTF16_SHIFT) +
		 (d & UTF16_MASK);
	}
      }
				/* convert if second pass */
      if (pass) UTF8_WRITE (s,c,cv,de)
      else UTF8_COUNT (ret->size,c,cv,de);
    }
				/* sizing done: allocate and NIL-terminate */
    if (!pass) (s = ret->data = (unsigned char *)
		fs_get (ret->size + 1))[ret->size] = NIL;
  }
				/* written octets must match counted octets */
  if (((unsigned long) (s - ret->data)) != ret->size)
    fatal ("UTF-16 to UTF-8 botch");
}
2242 
2243 /* Size of UCS-4 character, possibly not in BMP, as UTF-8 octets
2244  * Accepts: character
2245  * Returns: size (0 means bogon)
2246  *
2247  * Use UTF8_SIZE macro if known to be in the BMP
2248  */
2249 
unsigned long utf8_size (unsigned long c)
{
				/* limit[n] is the first value that no longer
				 * fits in n + 1 octets */
  static const unsigned long limit[6] = {
    0x80,0x800,0x10000,0x200000,0x4000000,0x80000000
  };
  unsigned long n;
  for (n = 0; n < 6; ++n) if (c < limit[n]) return n + 1;
  return 0;			/* too large to encode: bogon */
}
2260 
2261 
2262 /* Put UCS-4 character, possibly not in BMP, as UTF-8 octets
2263  * Accepts: destination string pointer
2264  *	    character
2265  * Returns: updated destination pointer
2266  *
2267  * Use UTF8_PUT_BMP macro if known to be in the BMP
2268  */
2269 
unsigned char *utf8_put (unsigned char *s,unsigned long c)
{
				/* lead-octet marker bits, by length - 1 */
  static const unsigned char lead[6] = {0x00,0xc0,0xe0,0xf0,0xf8,0xfc};
  unsigned long len = utf8_size (c);
  unsigned long k;
  if (len) {			/* nothing is stored for a bogon (size 0) */
				/* continuation octets, last one first, each
				 * carrying the next 6 low-order bits */
    for (k = len - 1; k; --k) {
      s[k] = 0x80 | (unsigned char) (c & 0x3f);
      c >>= 6;
    }
				/* remaining bits go under the length marker */
    s[0] = lead[len - 1] | (unsigned char) (c & 0x7f);
  }
  return s + len;
}
2296 
2297 /* Return title case of a fixed-width UCS-4 character
2298  * Accepts: character
2299  * Returns: title case of character
2300  */
2301 
ucs4_titlecase(unsigned long c)2302 unsigned long ucs4_titlecase (unsigned long c)
2303 {
2304   if (c <= UCS4_TMAPMAX) return ucs4_tmaptab[c];
2305   if (c < UCS4_TMAPHIMIN) return c;
2306   if (c <= UCS4_TMAPHIMAX) return c - UCS4_TMAPHIMAP;
2307   if (c < UCS4_TMAPDESERETMIN) return c;
2308   if (c <= UCS4_TMAPDESERETMAX) return c - UCS4_TMAPDESERETMAP;
2309   return c;
2310 }
2311 
2312 
2313 /* Return width of a fixed-width UCS-4 character in planes 0-2
2314  * Accepts: character
2315  * Returns: width (0, 1, 2) or negative error condition if not valid
2316  */
2317 
ucs4_width(unsigned long c)2318 long ucs4_width (unsigned long c)
2319 {
2320   long ret;
2321   ucs4width_t uw = (ucs4width_t) utf8_parameters(GET_UCS4WIDTH, NIL);
2322 				/* out of range, not-a-char, or surrogates */
2323   if ((c > UCS4_MAXUNICODE) || ((c & 0xfffe) == 0xfffe) ||
2324       ((c >= UTF16_SURR) && (c <= UTF16_MAXSURR))) ret = U4W_NOTUNCD;
2325 				/* private-use */
2326   else if (c >= UCS4_PVTBASE) ret = U4W_PRIVATE;
2327 				/* SSP are not printing characters */
2328   else if (c >= UCS4_SSPBASE) ret = U4W_SSPCHAR;
2329 				/* unassigned planes */
2330   else if (c >= UCS4_UNABASE) ret = U4W_UNASSGN;
2331 				/* SIP and reserved plane 3 are wide */
2332   else if (c >= UCS4_SIPBASE) ret = 2;
2333 #if (UCS4_WIDLEN != UCS4_SIPBASE)
2334 #error "UCS4_WIDLEN != UCS4_SIPBASE"
2335 #endif
2336 				/* C0/C1 controls */
2337   else if ((c <= UCS2_C0CONTROLEND) ||
2338 	   ((c >= UCS2_C1CONTROL) && (c <= UCS2_C1CONTROLEND)))
2339     ret = U4W_CONTROL;
2340 				/* BMP and SMP get value from table */
2341   else switch (ret = (ucs4_widthtab[(c >> 2)] >> ((3 - (c & 0x3)) << 1)) &0x3){
2342   case 0:			/* zero-width */
2343     if (c == 0x00ad) ret = 1;	/* force U+00ad (SOFT HYPHEN) to width 1 */
2344   case 1:			/* single-width */
2345   case 2:			/* double-width */
2346     break;
2347   case 3:			/* ambiguous width */
2348     ret = uw ? (*uw)(c)		/* this is better than the line below */
2349 	     : (c >= 0x2100) ? 2 : 1;/* need to do something better than this */
2350     break;
2351   }
2352   return ret;
2353 }
2354 
2355 /* Return screen width of UTF-8 string
2356  * Accepts: string
2357  * Returns: width or negative if not valid UTF-8
2358  */
2359 
utf8_strwidth(unsigned char * s)2360 long utf8_strwidth (unsigned char *s)
2361 {
2362   unsigned long c,i,ret;
2363 				/* go through string */
2364   for (ret = 0; *s; ret += ucs4_width (c)) {
2365     /* It's alright to give a fake value for the byte count to utf8_get()
2366      * since the null of a null-terminated string will stop processing anyway.
2367      */
2368     i = 6;			/* fake value */
2369     if ((c = utf8_get (&s,&i)) & U8G_ERROR) return -1;
2370   }
2371   return ret;
2372 }
2373 
2374 
2375 /* Return screen width of UTF-8 text
2376  * Accepts: SIZEDTEXT to string
2377  * Returns: width or negative if not valid UTF-8
2378  */
2379 
long utf8_textwidth (SIZEDTEXT *utf8)
{
  unsigned char *cur = utf8->data;
  unsigned long left = utf8->size;
  unsigned long total = 0;
  unsigned long ch;
				/* utf8_get() advances cur and reduces left;
				 * each character's width is added afterwards */
  for (; left; total += ucs4_width (ch)) {
    ch = utf8_get (&cur,&left);
    if (ch & U8G_ERROR) return -1;
  }
  return total;
}
2392 
2393 /* Decomposition (phew!) */
2394 
/* Values of the type member of struct decomposemore */
#define MORESINGLE 1		/* single UCS-4 tail value */
#define MOREMULTIPLE 2		/* multiple UCS-2 tail values */

/* Continuation state handed back by ucs4_decompose() through its "more"
 * argument when a character decomposes into more than one value.
 * ucs4_decompose() gets the block from fs_get() and releases it with
 * fs_give() once the last remaining value has been returned.
 */
struct decomposemore {
  short type;			/* type of more */
  union {
    unsigned long single;	/* single decomposed value */
    struct {			/* multiple BMP values */
      unsigned short *next;	/* next value to return */
      unsigned long count;	/* values still to be returned */
    } multiple;
  } data;
};
2408 
/* Linked-list node pairing a decomposemore block with a next pointer.
 * NOTE(review): the code that uses it lies outside this excerpt; presumably
 * it stacks pending decompositions during recursive decomposition - confirm.
 */
#define RECURSIVEMORE struct recursivemore

RECURSIVEMORE {
  struct decomposemore *more;	/* pending decomposition state */
  RECURSIVEMORE *next;		/* next node in the list */
};
2416 
2417 /* Return decomposition of a UCS-4 character
2418  * Accepts: character or U8G_ERROR to return next from "more"
2419  *	    pointer to returned more
2420  * Returns: [next] decomposed value, more set if still more decomposition
2421  */
2422 
ucs4_decompose(unsigned long c,void ** more)2423 unsigned long ucs4_decompose (unsigned long c,void **more)
2424 {
2425   unsigned long i,ix,ret = c;
2426   struct decomposemore *m;
2427   if (c & U8G_ERROR) {		/* want to chase more? */
2428 				/* do sanity check */
2429     if ((m = (struct decomposemore *) *more) != NULL) switch (m->type) {
2430     case MORESINGLE:		/* single value */
2431       ret = m->data.single;
2432       fs_give (more);		/* no more decomposition */
2433       break;
2434     case MOREMULTIPLE:		/* multiple value */
2435       ret = *m->data.multiple.next++;
2436       if (!--m->data.multiple.count) fs_give (more);
2437       break;
2438     default:			/* uh-oh */
2439       fatal ("invalid more block argument to ucs4_decompose!");
2440     }
2441     else fatal ("no more block provided to ucs4_decompose!");
2442   }
2443 
2444   else {			/* start decomposition */
2445     *more = NIL;		/* initially set no more */
2446 				/* BMP low decompositions */
2447     if (c < UCS4_BMPLOMIN) ret = c;
2448 				/* fix this someday */
2449     else if (c == UCS4_BMPLOMIN) ret = ucs4_dbmplotab[0];
2450     else if (c <= UCS4_BMPLOMAX) {
2451 				/* within range - have a decomposition? */
2452       if ((i = ucs4_dbmploixtab[c - UCS4_BMPLOMIN]) != 0L) {
2453 				/* get first value of decomposition */
2454 	ret = ucs4_dbmplotab[ix = i & UCS4_BMPLOIXMASK];
2455 				/* has continuation? */
2456 	if (i & UCS4_BMPLOSIZEMASK) {
2457 	  m = (struct decomposemore *)
2458 	    (*more = memset (fs_get (sizeof (struct decomposemore)),0,
2459 			    sizeof (struct decomposemore)));
2460 	  m->type = MOREMULTIPLE;
2461 	  m->data.multiple.next = &ucs4_dbmplotab[++ix];
2462 	  m->data.multiple.count = i >> UCS4_BMPLOSIZESHIFT;
2463 	}
2464       }
2465       else ret = c;		/* in range but doesn't decompose */
2466     }
2467 				/* BMP CJK compatibility */
2468     else if (c < UCS4_BMPCJKMIN) ret = c;
2469     else if (c <= UCS4_BMPCJKMAX) {
2470       if (!(ret = ucs4_bmpcjk1decomptab[c - UCS4_BMPCJKMIN])) ret = c;
2471     }
2472 				/* BMP CJK compatibility - some not in BMP */
2473 #if UCS4_BMPCJK2MIN - (UCS4_BMPCJKMAX + 1)
2474     else if (c < UCS4_BMPCJK2MIN) ret = c;
2475 #endif
2476     else if (c <= UCS4_BMPCJK2MAX)
2477       ret = ucs4_bmpcjk2decomptab[c - UCS4_BMPCJK2MIN];
2478 				/* BMP high decompositions */
2479     else if (c < UCS4_BMPHIMIN) ret = c;
2480     else if (c <= UCS4_BMPHIMAX) {
2481 				/* within range - have a decomposition? */
2482       if ((i = ucs4_dbmphiixtab[c - UCS4_BMPHIMIN]) != 0L) {
2483 				/* get first value of decomposition */
2484 	ret = ucs4_dbmphitab[ix = i & UCS4_BMPHIIXMASK];
2485 				/* has continuation? */
2486 	if (i & UCS4_BMPHISIZEMASK) {
2487 	  m = (struct decomposemore *)
2488 	    (*more = memset (fs_get (sizeof (struct decomposemore)),0,
2489 			    sizeof (struct decomposemore)));
2490 	  m->type = MOREMULTIPLE;
2491 	  m->data.multiple.next = &ucs4_dbmphitab[++ix];
2492 	  m->data.multiple.count = i >> UCS4_BMPHISIZESHIFT;
2493 	}
2494       }
2495       else ret = c;		/* in range but doesn't decompose */
2496     }
2497 
2498 				/* BMP half and full width forms */
2499     else if (c < UCS4_BMPHALFFULLMIN) ret = c;
2500     else if (c <= UCS4_BMPHALFFULLMAX) {
2501       if (!(ret = ucs4_bmphalffulldecomptab[c - UCS4_BMPHALFFULLMIN])) ret = c;
2502     }
2503 				/* SMP music */
2504     else if (c < UCS4_SMPMUSIC1MIN) ret = c;
2505     else if (c <= UCS4_SMPMUSIC1MAX) {
2506       ret = ucs4_smpmusic1decomptab[c -= UCS4_SMPMUSIC1MIN][0];
2507       m = (struct decomposemore *)
2508 	(*more = memset (fs_get (sizeof (struct decomposemore)),0,
2509 			 sizeof (struct decomposemore)));
2510       m->type = MORESINGLE;
2511       m->data.single = ucs4_smpmusic1decomptab[c][1];
2512     }
2513     else if (c < UCS4_SMPMUSIC2MIN) ret = c;
2514     else if (c <= UCS4_SMPMUSIC2MAX) {
2515       ret = ucs4_smpmusic2decomptab[c -= UCS4_SMPMUSIC2MIN][0];
2516       m = (struct decomposemore *)
2517 	(*more = memset (fs_get (sizeof (struct decomposemore)),0,
2518 			 sizeof (struct decomposemore)));
2519       m->type = MORESINGLE;
2520       m->data.single = ucs4_smpmusic2decomptab[c][1];
2521     }
2522 				/* SMP mathematical forms */
2523     else if (c < UCS4_SMPMATHMIN) ret = c;
2524     else if (c <= UCS4_SMPMATHMAX) {
2525       if (!(ret = ucs4_smpmathdecomptab[c - UCS4_SMPMATHMIN])) ret = c;
2526     }
2527 				/* CJK compatibility ideographs in SIP */
2528     else if (!(ret = ((c >= UCS4_SIPMIN) && (c <= UCS4_SIPMAX)) ?
2529 	       ucs4_sipdecomptab[c - UCS4_SIPMIN] : c)) ret = c;
2530   }
2531   return ret;
2532 }
2533 
2534 /* Return recursive decomposition of a UCS-4 character
2535  * Accepts: character or U8G_ERROR to return next from "more"
2536  *	    pointer to returned more
2537  * Returns: [next] decomposed value, more set if still more decomposition
2538  */
2539 
ucs4_decompose_recursive(unsigned long c,void ** more)2540 unsigned long ucs4_decompose_recursive (unsigned long c,void **more)
2541 {
2542   unsigned long c1;
2543   void *m,*mn;
2544   RECURSIVEMORE *mr;
2545   if (c & U8G_ERROR) {		/* want to chase more? */
2546     mn = NIL;
2547     if ((mr = (RECURSIVEMORE *) *more) != NULL) switch (mr->more->type) {
2548     case MORESINGLE:		/* decompose single value */
2549       c = ucs4_decompose_recursive (mr->more->data.single,&mn);
2550       *more = mr->next;		/* done with this more, remove it */
2551       fs_give ((void **) &mr->more);
2552       fs_give ((void **) &mr);
2553       break;
2554     case MOREMULTIPLE:		/* decompose current value in multiple */
2555       c = ucs4_decompose_recursive (*mr->more->data.multiple.next++,&mn);
2556 				/* if done with this multiple decomposition */
2557       if (!--mr->more->data.multiple.count) {
2558 	*more = mr->next;	/* done with this more, remove it */
2559 	fs_give ((void **) &mr->more);
2560 	fs_give ((void **) &mr);
2561       }
2562       break;
2563     default:			/* uh-oh */
2564       fatal ("invalid more block argument to ucs4_decompose_recursive!");
2565     }
2566     else fatal ("no more block provided to ucs4_decompose_recursive!");
2567     if ((mr = mn) != NULL) {	/* did this value recurse on us? */
2568       mr->next = *more;		/* yes, insert new more at head */
2569       *more = mr;
2570     }
2571   }
2572   else {			/* start decomposition */
2573     *more = NIL;		/* initially set no more */
2574     mr = NIL;
2575     do {			/* repeatedly decompose this codepoint */
2576       c = ucs4_decompose (c1 = c,&m);
2577       if (m) {			/* multi-byte decomposition */
2578 	if (c1 == c) fatal ("endless multiple decomposition!");
2579 				/* create a block to stash this more */
2580 	mr = memset (fs_get (sizeof (RECURSIVEMORE)),0,sizeof (RECURSIVEMORE));
2581 	mr->more = m;		/* note the expansion */
2582 	mr->next = *more;	/* old list is the tail */
2583 	*more = mr;		/* and this is the new head */
2584       }
2585     } while (c1 != c);		/* until nothing more to decompose */
2586   }
2587   return c;
2588 }
2589 
utf8_parameters(long function,void * value)2590 void *utf8_parameters (long function,void *value)
2591 {
2592   void *ret;
2593 
2594   switch(function){
2595      case SET_UCS4WIDTH:
2596 	ucs4width = (ucs4width_t) value;
2597      case GET_UCS4WIDTH:
2598 	ret = (void *) ucs4width;
2599 	break;
2600      default: ret = NULL;
2601 	break;
2602   }
2603   return ret;
2604 }
2605