1 /* ========================================================================
2  * Copyright 1988-2008 University of Washington
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  *
11  * ========================================================================
12  */
13 
14 /*
15  * Program:	UTF-8 routines
16  *
17  * Author:	Mark Crispin
18  *		Networks and Distributed Computing
19  *		Computing & Communications
20  *		University of Washington
21  *		Administration Building, AG-44
22  *		Seattle, WA  98195
23  *		Internet: MRC@CAC.Washington.EDU
24  *
25  * Date:	11 June 1997
26  * Last Edited:	17 January 2008
27  */
28 
29 /* UTF-8 size and conversion routines from UCS-2 values (thus in the BMP).
30  * Don't use these if UTF-16 data (surrogate pairs) are an issue.
31  * For UCS-4 values, use the utf8_size() and utf8_put() functions.
32  */
33 
/* Number of octets (1, 2 or 3) in the UTF-8 encoding of BMP codepoint c.
 * The argument is parenthesized so that expressions such as (a | b) are
 * tested as a whole.  c is evaluated more than once; do not pass an
 * argument with side effects.
 */
#define UTF8_SIZE_BMP(c) (((c) & 0xff80) ? (((c) & 0xf800) ? 3 : 2) : 1)
35 
/* Store the UTF-8 encoding of BMP codepoint c at b and advance b past the
 * 1, 2 or 3 octets written.  b must be a modifiable pointer lvalue.  Both
 * arguments are parenthesized so that expression arguments such as (a | b)
 * are handled as a whole; c is still evaluated more than once, so it must
 * not have side effects.  The expansion deliberately remains a bare { }
 * block so the statement form seen by existing callers does not change.
 */
#define UTF8_PUT_BMP(b,c) {					\
  if ((c) & 0xff80) {		/* non-ASCII? */		\
    if ((c) & 0xf800) {		/* three byte code */		\
      *(b)++ = 0xe0 | ((c) >> 12);				\
      *(b)++ = 0x80 | (((c) >> 6) & 0x3f);			\
    }								\
				/* two byte code lead octet */	\
    else *(b)++ = 0xc0 | (((c) >> 6) & 0x3f);			\
				/* final continuation octet */	\
    *(b)++ = 0x80 | ((c) & 0x3f);				\
  }								\
  else *(b)++ = (c);		/* ASCII stored as-is */	\
}
47 
48 /* utf8_text() flag values */
49 
#define U8T_CASECANON (1 << 1)	/* case canonicalization requested */
#define U8T_DECOMPOSE (1 << 2)	/* decomposition requested */
				/* both bits together: full canonicalization */
#define U8T_CANONICAL (U8T_DECOMPOSE | U8T_CASECANON)
54 
55 
56 /* utf8_get() return values */
57 
58 				/* 0x0000 - 0xffff BMP plane */
#define U8GM_NONBMP 0xffff0000	/* mask for non-BMP values */
				/* 0x10000 - 0x10ffff extended planes */
				/* 0x110000 - 0x7fffffff non-Unicode */
#define U8G_ERROR 0x80000000	/* error flag */
				/* Error codes are the error flag plus a small
				 * ordinal.  Each expansion is parenthesized so
				 * that it stays one operand when written next
				 * to tighter-binding operators (~ * / %).
				 */
#define U8G_BADCONT (U8G_ERROR+1)	/* continuation when not in progress */
#define U8G_INCMPLT (U8G_ERROR+2)	/* incomplete UTF-8 character */
#define U8G_NOTUTF8 (U8G_ERROR+3)	/* not a valid UTF-8 octet */
#define U8G_ENDSTRG (U8G_ERROR+4)	/* end of string */
#define U8G_ENDSTRI (U8G_ERROR+5)	/* end of string w/ incomplete UTF-8 char */
#define U8G_SURROGA (U8G_ERROR+6)	/* surrogate codepoint */
#define U8G_NOTUNIC (U8G_ERROR+7)	/* non-Unicode codepoint */
70 
71 
72 /* ucs4_width() return values */
73 
#define U4W_ERROR 0x80000000	/* error flags */
				/* Codes below are the error flag plus a small
				 * ordinal, parenthesized so each expansion is
				 * a single operand in any expression context.
				 */
#define U4W_NOTUNCD (U4W_ERROR+1)	/* not a Unicode char */
#define U4W_PRIVATE (U4W_ERROR+2)	/* private-space plane */
#define U4W_SSPCHAR (U4W_ERROR+3)	/* Supplementary Special-purpose Plane */
#define U4W_UNASSGN (U4W_ERROR+4)	/* unassigned space plane */
#define U4W_CONTROL (U4W_ERROR+5)	/* C0/C1 control */
#define U4W_CTLSRGT U4W_CONTROL	/* in case legacy code references this */
81 
82 /* ISO-2022 engine states */
83 
#define I2S_CHAR 0		/* character */
#define I2S_ESC 1		/* previous character was ESC */
#define I2S_MUL 2		/* previous character was multi-byte code */
#define I2S_INT 3		/* previous character was intermediate */


/* ISO-2022 Gn selections */

				/* G0 - G3 fit in the low two bits */
#define I2C_G0 0		/* G0 */
#define I2C_G1 1		/* G1 */
#define I2C_G2 2		/* G2 */
#define I2C_G3 3		/* G3 */
				/* single shifts use the next two bits up */
#define I2C_SG2 (2 << 2)	/* single shift G2 */
#define I2C_SG3 (3 << 2)	/* single shift G3 */
98 
99 
100 /* ISO-2022 octet definitions */
101 
#define I2C_ESC 0x1b		/* ESCape */

	/* Intermediate character (values lie in 0x20 - 0x2f) */
#define I2C_STRUCTURE 0x20	/* announce code structure */
#define I2C_C0 0x21		/* C0 */
#define I2C_C1 0x22		/* C1 */
#define I2C_CONTROL 0x23	/* single control function */
#define I2C_MULTI 0x24		/* multi-byte character set */
#define I2C_OTHER 0x25		/* other coding system */
#define I2C_REVISED 0x26	/* revised registration */
#define I2C_G0_94 0x28		/* G0 94-character set */
#define I2C_G1_94 0x29		/* G1 94-character set */
#define I2C_G2_94 0x2A		/* G2 94-character set */
#define I2C_G3_94 0x2B		/* G3 94-character set */
#define I2C_G0_96 0x2C		/* (not in ISO-2022) G0 96-character set */
#define I2C_G1_96 0x2D		/* G1 96-character set */
#define I2C_G2_96 0x2E		/* G2 96-character set */
#define I2C_G3_96 0x2F		/* G3 96-character set */

	/* Locking shifts */
#define I2C_SI 0x0f		/* lock shift to G0 (Shift In) */
#define I2C_SO 0x0e		/* lock shift to G1 (Shift Out) */
	/* prefixed by ESC */
#define I2C_LS2 0x6e		/* lock shift to G2 */
#define I2C_LS3 0x6f		/* lock shift to G3 */
#define I2C_LS1R 0x7e		/* lock shift GR to G1 */
#define I2C_LS2R 0x7d		/* lock shift GR to G2 */
#define I2C_LS3R 0x7c		/* lock shift GR to G3 */

	/* Single shifts */
	/* the _ALT values have bit 8 set; the _ALT_7 values are 7-bit */
#define I2C_SS2_ALT 0x8e	/* single shift to G2 (SS2) */
#define I2C_SS3_ALT 0x8f	/* single shift to G3 (SS3) */
#define I2C_SS2_ALT_7 0x19	/* single shift to G2 (SS2) */
#define I2C_SS3_ALT_7 0x1d	/* single shift to G3 (SS3) */
	/* prefixed by ESC */
#define I2C_SS2 0x4e		/* single shift to G2 (SS2) */
#define I2C_SS3 0x4f		/* single shift to G3 (SS3) */
139 
140 /* 94 character sets */
141 
142 				/* 4/0 ISO 646 IRV */
				/* In these tables "c/r" is the octet written
				 * as two hex digits, e.g. 4/1 is 0x41.  Only
				 * sets with a #define have a name here.
				 */
#define I2CS_94_BRITISH 0x41	/* 4/1 ISO 646 British */
#define I2CS_94_ASCII 0x42	/* 4/2 ISO 646 USA (ASCII) */
				/* 4/3 NATS Finland/Sweden (primary) */
				/* 4/4 NATS Finland/Sweden (secondary) */
				/* 4/5 NATS Denmark/Norway (primary) */
				/* 4/6 NATS Denmark/Norway (secondary) */
				/* 4/7 ISO 646 Swedish SEN 850200 */
				/* 4/8 ISO 646 Swedish names */
				/* 4/8 is listed twice on purpose: the name
				 * below reuses the octet of the set above
				 */
#define I2CS_94_JIS_BUGROM 0x48	/* 4/8 some buggy software does this */
#define I2CS_94_JIS_KANA 0x49	/* 4/9 JIS X 0201-1976 right half */
#define I2CS_94_JIS_ROMAN 0x4a	/* 4/a JIS X 0201-1976 left half */
154 				/* 4/b ISO 646 German */
155 				/* 4/c ISO 646 Portuguese (Olivetti) */
156 				/* 4/d ISO 6438 African */
157 				/* 4/e ISO 5427 Cyrillic (Honeywell-Bull) */
158 				/* 4/f DIN 31624 extended bibliography  */
159 				/* 5/0 ISO 5426-1980 Bibliography */
160 				/* 5/1 ISO 5427-1981 Cyrillic*/
161 				/* 5/2 ISO 646 French (withdrawn) */
162 				/* 5/3 ISO 5428-1980 Greek bibliography */
163 				/* 5/4 GB 1988-80 Chinese */
164 				/* 5/5 Latin-Greek (Honeywell-Bull) */
165 				/* 5/6 UK Viewdata/Teletext */
166 				/* 5/7 INIS (IRV subset) */
167 				/* 5/8 ISO 5428 Greek Bibliography */
168 				/* 5/9 ISO 646 Italian (Olivetti) */
169 				/* 5/a ISO 646 Spanish (Olivetti) */
170 				/* 5/b Greek (Olivetti) */
171 				/* 5/c Latin-Greek (Olivetti) */
172 				/* 5/d INIS non-standard extension */
173 				/* 5/e INIS Cyrillic extension */
174 				/* 5/f Arabic CODAR-U IERA */
175 				/* 6/0 ISO 646 Norwegian */
176 				/* 6/1 Norwegian version 2 (withdrawn) */
177 				/* 6/2 Videotex supplementary */
178 				/* 6/3 Videotex supplementary #2 */
179 				/* 6/4 Videotex supplementary #3 */
180 				/* 6/5 APL */
181 				/* 6/6 ISO 646 French */
182 				/* 6/7 ISO 646 Portuguese (IBM) */
183 				/* 6/8 ISO 646 Spanish (IBM) */
184 				/* 6/9 ISO 646 Hungarian */
185 				/* 6/a Greek ELOT (withdrawn) */
186 				/* 6/b ISO 9036 Arabic 7-bit */
187 				/* 6/c ISO 646 IRV supplementary set */
188 				/* 6/d JIS C6229-1984 OCR-A */
189 				/* 6/e JIS C6229-1984 OCR-B */
190 				/* 6/f JIS C6229-1984 OCR-B additional */
191 				/* 7/0 JIS C6229-1984 hand-printed */
192 				/* 7/1 JIS C6229-1984 additional hand-printd */
193 				/* 7/2 JIS C6229-1984 katakana hand-printed */
194 				/* 7/3 E13B Japanese graphic */
195 				/* 7/4 Supplementary Videotex (withdrawn) */
196 				/* 7/5 Teletex primary CCITT T.61 */
197 				/* 7/6 Teletex secondary CCITT T.61 */
198 				/* 7/7 CSA Z 243.4-1985 Alternate primary #1 */
199 				/* 7/8 CSA Z 243.4-1985 Alternate primary #2 */
200 				/* 7/9 Mosaic CCITT T.101 */
201 				/* 7/a Serbocroatian/Slovenian Latin */
202 				/* 7/b Serbocroatian Cyrillic */
203 				/* 7/c Supplementary CCITT T.101 */
204 				/* 7/d Macedonian Cyrillic */
205 
206 /* 94 character sets - second intermediate byte */
207 
208 				/* 4/0 Greek primary CCITT */
209 				/* 4/1 Cuba */
210 				/* 4/2 ISO/IEC 646 invariant */
211 				/* 4/3 Irish Gaelic 7-bit */
212 				/* 4/4 Turkmen */
213 
214 
215 /* 94x94 character sets */
216 
				/* Octet values overlap those of the 94 and 96
				 * set tables (e.g. 0x41, 0x42); the I2CS_94x94
				 * type bits keep the stored identifiers apart.
				 */
#define I2CS_94x94_JIS_OLD 0x40	/* 4/0 JIS X 0208-1978 */
#define I2CS_94x94_GB 0x41	/* 4/1 GB 2312 */
#define I2CS_94x94_JIS_NEW 0x42	/* 4/2 JIS X 0208-1983 */
#define I2CS_94x94_KSC 0x43	/* 4/3 KSC 5601 */
#define I2CS_94x94_JIS_EXT 0x44	/* 4/4 JIS X 0212-1990 */
				/* 4/5 CCITT Chinese */
				/* 4/6 Blisssymbol Graphic */
#define I2CS_94x94_CNS1 0x47	/* 4/7 CNS 11643 plane 1 */
#define I2CS_94x94_CNS2 0x48	/* 4/8 CNS 11643 plane 2 */
#define I2CS_94x94_CNS3 0x49	/* 4/9 CNS 11643 plane 3 */
#define I2CS_94x94_CNS4 0x4a	/* 4/a CNS 11643 plane 4 */
#define I2CS_94x94_CNS5 0x4b	/* 4/b CNS 11643 plane 5 */
#define I2CS_94x94_CNS6 0x4c	/* 4/c CNS 11643 plane 6 */
#define I2CS_94x94_CNS7 0x4d	/* 4/d CNS 11643 plane 7 */
231 				/* 4/e DPRK (North Korea) KGCII */
232 				/* 4/f JGCII plane 1 */
233 				/* 5/0 JGCII plane 2 */
234 
235 /* 96 character sets */
236 
				/* Entries are in octet order, so the ISO 8859
				 * part numbers are not sequential (7, 6, 8
				 * precede 5 and 9).
				 */
#define I2CS_96_ISO8859_1 0x41	/* 4/1 Latin-1 (Western Europe) */
#define I2CS_96_ISO8859_2 0x42	/* 4/2 Latin-2 (Czech, Slovak) */
#define I2CS_96_ISO8859_3 0x43	/* 4/3 Latin-3 (Dutch, Turkish) */
#define I2CS_96_ISO8859_4 0x44	/* 4/4 Latin-4 (Scandinavian) */
				/* 4/5 CSA Z 243.4-1985 */
#define I2CS_96_ISO8859_7 0x46	/* 4/6 Greek */
#define I2CS_96_ISO8859_6 0x47	/* 4/7 Arabic */
#define I2CS_96_ISO8859_8 0x48	/* 4/8 Hebrew */
				/* 4/9 Czechoslovak CSN 369103 */
				/* 4/a Supplementary Latin and non-alpha */
				/* 4/b Technical */
#define I2CS_96_ISO8859_5 0x4c	/* 4/c Cyrillic */
#define I2CS_96_ISO8859_9 0x4d	/* 4/d Latin-5 (Turkish) */
				/* 4/e ISO 6937-2 residual */
				/* 4/f Basic Cyrillic */
				/* 5/0 Supplementary Latin 1, 2 and 5 */
				/* 5/1 Basic Box */
				/* 5/2 Supplementary ISO/IEC 6937 : 1992 */
				/* 5/3 CCITT Hebrew supplementary */
#define I2CS_96_TIS620 0x54	/* 5/4 TIS 620 */
				/* 5/5 Arabic/French/German */
#define I2CS_96_ISO8859_10 0x56	/* 5/6 Latin-6 (Northern Europe) */
				/* 5/7 ??? */
				/* 5/8 Sami (Lappish) supplementary */
#define I2CS_96_ISO8859_13 0x59	/* 5/9 Latin-7 (Baltic) */
#define I2CS_96_VSCII 0x5a	/* 5/a Vietnamese */
				/* 5/b Technical #1 IEC 1289 */
#define I2CS_96_ISO8859_14 0x5c	/* 5/c Latin-8 (Celtic) */
				/* 5/d Sami supplementary Latin */
				/* 5/e Latin/Hebrew */
				/* 5/f Celtic supplementary Latin */
				/* 6/0 Uralic supplementary Cyrillic */
				/* 6/1 Volgaic supplementary Cyrillic */
#define I2CS_96_ISO8859_15 0x62	/* 6/2 Latin-9 (Euro) */
				/* 6/3 Latin-1 with Euro */
				/* 6/4 Latin-4 with Euro */
				/* 6/5 Latin-7 with Euro */
#define I2CS_96_ISO8859_16 0x66	/* 6/6 Latin-10 (Balkan) */
275 				/* 6/7 Ogham */
276 				/* 6/8 Sami supplementary Latin #2 */
277 				/* 7/d Supplementary Mosaic for CCITT 101 */
278 
279 /* 96x96 character sets */
280 
281 /* Types of character sets */
282 
				/* Type bits sit above the low 8 bits, leaving
				 * room for a set's octet value to be OR'ed in.
				 */
#define I2CS_94 0x000		/* 94 character set */
#define I2CS_96 0x100		/* 96 character set */
#define I2CS_MUL 0x200		/* multi-byte */
				/* multi-byte combined with the set size */
#define I2CS_94x94 (I2CS_MUL | I2CS_94)
#define I2CS_96x96 (I2CS_MUL | I2CS_96)
288 
289 
290 /* Character set identifiers stored in Gn */
291 
/* Each identifier is a charset type OR'ed with the set's octet value, so
 * sets that share an octet remain distinct: I2CS_BRITISH is 0x041,
 * I2CS_ISO8859_1 is 0x141 and I2CS_GB is 0x241 although all use 0x41.
 */
				/* 94 character sets */
#define I2CS_BRITISH (I2CS_94 | I2CS_94_BRITISH)
#define I2CS_ASCII (I2CS_94 | I2CS_94_ASCII)
#define I2CS_JIS_BUGROM (I2CS_94 | I2CS_94_JIS_BUGROM)
#define I2CS_JIS_KANA (I2CS_94 | I2CS_94_JIS_KANA)
#define I2CS_JIS_ROMAN (I2CS_94 | I2CS_94_JIS_ROMAN)
				/* 94x94 character sets */
#define I2CS_JIS_OLD (I2CS_94x94 | I2CS_94x94_JIS_OLD)
#define I2CS_GB (I2CS_94x94 | I2CS_94x94_GB)
#define I2CS_JIS_NEW (I2CS_94x94 | I2CS_94x94_JIS_NEW)
#define I2CS_KSC (I2CS_94x94 | I2CS_94x94_KSC)
#define I2CS_JIS_EXT (I2CS_94x94 | I2CS_94x94_JIS_EXT)
#define I2CS_CNS1 (I2CS_94x94 | I2CS_94x94_CNS1)
#define I2CS_CNS2 (I2CS_94x94 | I2CS_94x94_CNS2)
#define I2CS_CNS3 (I2CS_94x94 | I2CS_94x94_CNS3)
#define I2CS_CNS4 (I2CS_94x94 | I2CS_94x94_CNS4)
#define I2CS_CNS5 (I2CS_94x94 | I2CS_94x94_CNS5)
#define I2CS_CNS6 (I2CS_94x94 | I2CS_94x94_CNS6)
#define I2CS_CNS7 (I2CS_94x94 | I2CS_94x94_CNS7)
				/* 96 character sets */
#define I2CS_ISO8859_1 (I2CS_96 | I2CS_96_ISO8859_1)
#define I2CS_ISO8859_2 (I2CS_96 | I2CS_96_ISO8859_2)
#define I2CS_ISO8859_3 (I2CS_96 | I2CS_96_ISO8859_3)
#define I2CS_ISO8859_4 (I2CS_96 | I2CS_96_ISO8859_4)
#define I2CS_ISO8859_7 (I2CS_96 | I2CS_96_ISO8859_7)
#define I2CS_ISO8859_6 (I2CS_96 | I2CS_96_ISO8859_6)
#define I2CS_ISO8859_8 (I2CS_96 | I2CS_96_ISO8859_8)
#define I2CS_ISO8859_5 (I2CS_96 | I2CS_96_ISO8859_5)
#define I2CS_ISO8859_9 (I2CS_96 | I2CS_96_ISO8859_9)
#define I2CS_TIS620 (I2CS_96 | I2CS_96_TIS620)
#define I2CS_ISO8859_10 (I2CS_96 | I2CS_96_ISO8859_10)
#define I2CS_ISO8859_13 (I2CS_96 | I2CS_96_ISO8859_13)
#define I2CS_VSCII (I2CS_96 | I2CS_96_VSCII)
#define I2CS_ISO8859_14 (I2CS_96 | I2CS_96_ISO8859_14)
#define I2CS_ISO8859_15 (I2CS_96 | I2CS_96_ISO8859_15)
#define I2CS_ISO8859_16 (I2CS_96 | I2CS_96_ISO8859_16)
325 
326 
327 /* Miscellaneous ISO 2022 definitions */
328 
				/* same octets as I2C_SS2_ALT / I2C_SS3_ALT */
#define EUC_CS2 0x8e		/* single shift CS2 */
#define EUC_CS3 0x8f		/* single shift CS3 */

				/* BITS7 and BIT8 together cover one octet */
#define BITS7 0x7f		/* 7-bit value mask */
#define BIT8 0x80		/* 8th bit mask */
334 
335 /* The following saves us from having to have yet more charset tables */
336 
337 /* Unicode codepoints */
338 
				/* control ranges, both ends inclusive */
#define UCS2_C0CONTROL 0x00	/* first C0 control */
#define UCS2_C0CONTROLEND 0x1F	/* last C0 control */
#define UCS2_C1CONTROL 0x80	/* first C1 control */
#define UCS2_C1CONTROLEND 0x9F	/* last C1 control */

				/* ISO 646 substituted Unicode codepoints */
#define UCS2_POUNDSTERLING 0x00a3
#define UCS2_YEN 0x00a5
#define UCS2_OVERLINE 0x203e
#define UCS2_EURO 0x20ac
#define UCS2_KATAKANA 0xff61	/* first katakana codepoint */
#define UCS2_BOM 0xfeff		/* byte order mark */
#define UCS2_BOGON 0xfffd	/* replacement character */
				/* next two codepoints are not Unicode chars */
				/* 0xfffe is UCS2_BOM with its octets swapped */
#define UCS2_BOMCHECK 0xfffe	/* used to check byte order with UCS2_BOM */
#define UCS2_NOTCHAR 0xffff	/* not a character */
355 
				/* first codepoint of each plane or region */
#define UCS4_BMPBASE 0x0000	/* Basic Multilingual Plane */
#define UCS4_SMPBASE 0x10000	/* Supplementary Multilingual Plane */
#define UCS4_SIPBASE 0x20000	/* Supplementary Ideographic Plane */
				/* EastAsianWidth says plane 3 is wide */
#define UCS4_UNABASE 0x40000	/* unassigned space */
#define UCS4_SSPBASE 0xe0000	/* Supplementary Special-purpose Plane */
#define UCS4_PVTBASE 0xf0000	/* private-space (two planes) */
#define UCS4_MAXUNICODE 0x10ffff/* highest Unicode codepoint */
364 
#define UTF16_BASE 0x10000	/* base of codepoints needing surrogates */
#define UTF16_SHIFT 10		/* surrogate shift */
#define UTF16_MASK 0x3ff	/* surrogate mask */
				/* UTF16_MASK is the low UTF16_SHIFT bits */
#define UTF16_SURR 0xd800	/* UTF-16 surrogate area */
				/* area start equals the first high surrogate */
#define UTF16_SURRH 0xd800	/* UTF-16 first high surrogate */
#define UTF16_SURRHEND 0xdbff	/* UTF-16 last high surrogate */
#define UTF16_SURRL 0xdc00	/* UTF-16 first low surrogate */
#define UTF16_SURRLEND 0xdfff	/* UTF-16 last low surrogate */
				/* area end equals the last low surrogate */
#define UTF16_MAXSURR 0xdfff	/* end of UTF-16 surrogates */
374 
375 
376 /* UBOGON is used to represent a codepoint in a character set which does not
377  * map to Unicode.  It is also used for mapping failures, e.g. incomplete
378  * shift sequences.  This name has the same text width as 0x????, for
379  * convenience in the mapping tables.
380  *
381  * NOCHAR is used to represent a codepoint in Unicode which does not map to
382  * the target character set in a reverse mapping table.  This name has the
383  * same text width as 0x???? in case we ever add static reverse mapping tables.
384  */
385 
#define UBOGON 0xfff8		/* unmapped from UCS2_BOGON */
				/* same value as UCS2_NOTCHAR (0xffff) */
#define NOCHAR UCS2_NOTCHAR
388 
389 /* Codepoints in non-Unicode character sets */
390 
391 /* Codepoints in ISO 646 character sets */
392 
393 /* British ASCII codepoints */
394 
				/* presumably the octet that stands for
				 * UCS2_POUNDSTERLING in this set - confirm
				 * against the conversion code
				 */
#define BRITISH_POUNDSTERLING 0x23

/* JIS Roman codepoints */

				/* presumably the octets that stand for
				 * UCS2_YEN and UCS2_OVERLINE - confirm
				 */
#define JISROMAN_YEN 0x5c
#define JISROMAN_OVERLINE 0x7e
401 
402 
403 /* Hankaku katakana codepoints & parameters
404  *
 * In earlier versions, MAX_KANA_7 and MAX_KANA_8 were the maximum codepoint
 * values.  Although this made sense, it was confusing with the "max ku" and
 * "max ten" values used in the double-byte tables; those are 1-origin, but
 * the calculated values used for "ku" and "ten" are 0-origin (derived by
 * subtracting the "base").  What this all meant is that for double byte
 * characters the limit test is of the form (value < max_ku), but for single
 * byte characters (which used the same cell to hold the max ku) the limit
 * test was (value <= max_ku).
413  *
414  * By making MAX_KANA_[78] be maximum+1, the same (value < max_ku) limit test
415  * is used throughout.  - 6/15/2006
416  */
417 
#define MIN_KANA_7 0x21		/* lowest 7-bit kana value */
#define MAX_KANA_7 0x60		/* maximum value + 1 */
				/* offset: MIN_KANA_7 + KANA_7 is UCS2_KATAKANA */
#define KANA_7 (UCS2_KATAKANA - MIN_KANA_7)
				/* 8-bit forms are the 7-bit ones with BIT8 set */
#define MIN_KANA_8 (MIN_KANA_7 | BIT8)
#define MAX_KANA_8 (MAX_KANA_7 | BIT8)
				/* offset: MIN_KANA_8 + KANA_8 is UCS2_KATAKANA */
#define KANA_8 (UCS2_KATAKANA - MIN_KANA_8)
424 
425 /* Charset scripts */
426 
427 /*  The term "script" is used here in a very loose sense, enough to make
428  * purists cringe.  Basically, the idea is to give the main program some
429  * idea of how it should treat the characters of text in a charset with
430  * respect to font, drawing routines, etc.
431  *
432  *  In some cases, "script" is associated with a charset; in other cases,
433  * it's more closely tied to a language.
434  */
435 
				/* Every value is a single distinct bit, so
				 * scripts can be OR'ed into one bitmask.  Some
				 * bit positions (0x2 - 0x8, 0x200000 -
				 * 0x800000) have no name assigned here.
				 */
#define SC_UNICODE 0x1		/* Unicode */
#define SC_LATIN_1 0x10		/* Western Europe */
#define SC_LATIN_2 0x20		/* Eastern Europe */
#define SC_LATIN_3 0x40		/* Southern Europe */
#define SC_LATIN_4 0x80		/* Northern Europe */
#define SC_LATIN_5 0x100	/* Turkish */
#define SC_LATIN_6 0x200	/* Nordic */
#define SC_LATIN_7 0x400	/* Baltic */
#define SC_LATIN_8 0x800	/* Celtic */
#define SC_LATIN_9 0x1000	/* Euro */
#define SC_LATIN_0 SC_LATIN_9	/* colloquial name for Latin-9 */
#define SC_ARABIC 0x2000
#define SC_CYRILLIC 0x4000
#define SC_GREEK 0x8000
#define SC_HEBREW 0x10000
#define SC_THAI 0x20000
				/* spelling (sic) is part of the identifier */
#define SC_UKRANIAN 0x40000
#define SC_LATIN_10 0x80000	/* Balkan */
#define SC_VIETNAMESE 0x100000
#define SC_CHINESE_SIMPLIFIED 0x1000000
#define SC_CHINESE_TRADITIONAL 0x2000000
#define SC_JAPANESE 0x4000000
#define SC_KOREAN 0x8000000
459 
460 
461 /* Script table */
462 
/* One script table entry: a name and description paired with a bitmask.
 * NOTE(review): the bitmask presumably holds SC_* values - confirm against
 * the table definition.
 */
typedef struct utf8_scent {
  char *name;			/* script name */
  char *description;		/* script description */
  unsigned long script;		/* script bitmask */
} SCRIPT;
468 
469 /* Character set table support */
470 
/* One character set table entry.
 * NOTE(review): type, flags and script presumably hold the CT_*, CF_* and
 * SC_* values defined in this header - confirm against the table.
 */
typedef struct utf8_csent {
  char *name;			/* charset name */
  unsigned short type;		/* type of charset */
  unsigned short flags;		/* charset flags */
				/* untyped; meaning presumably depends on type */
  void *tab;			/* additional data */
  unsigned long script;		/* script(s) implemented by this charset */
  char *preferred;		/* preferred charset over this one */
} CHARSET;
479 
480 
/* Row/column ("ku"/"ten") bounds plus a conversion table pointer.  The four
 * bounds are 8-bit bit-fields, so each holds a value in 0 - 255.  The CT_EUC,
 * CT_DBYTE and CT_DBYTE2 type comments name this structure as their data.
 */
struct utf8_eucparam {
  unsigned int base_ku : 8;	/* base row */
  unsigned int base_ten : 8;	/* base column */
  unsigned int max_ku : 8;	/* maximum row */
  unsigned int max_ten : 8;	/* maximum column */
  void *tab;			/* conversion table */
};
488 
489 
490 /* Charset types */
491 
				/* Values are grouped by decade: 0+ and 10+
				 * single-unit types, 100+ table-driven 2 byte
				 * types, 1000+ Unicode transformation formats,
				 * 10000+ other encodings.  All fit in an
				 * unsigned short.
				 */
#define CT_UNKNOWN 0		/* unknown 8-bit */
#define CT_ASCII 1		/* 7-bit ASCII no table */
#define CT_UCS2 2		/* 2 byte 16-bit Unicode no table */
#define CT_UCS4 3		/* 4 byte 32-bit Unicode no table */
#define CT_1BYTE0 10		/* 1 byte ISO 8859-1 no table */
#define CT_1BYTE 11		/* 1 byte ASCII + table 0x80-0xff */
#define CT_1BYTE8 12		/* 1 byte table 0x00 - 0xff */
#define CT_EUC 100		/* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
#define CT_DBYTE 101		/* 2 byte ASCII + utf8_eucparam */
#define CT_DBYTE2 102		/* 2 byte ASCII + utf8_eucparam plane1/2 */
#define CT_UTF16 1000		/* variable UTF-16 encoded Unicode no table */
#define CT_UTF8 1001		/* variable UTF-8 encoded Unicode no table */
#define CT_UTF7 1002		/* variable UTF-7 encoded Unicode no table */
#define CT_2022 10000		/* variable ISO-2022 encoded no table */
#define CT_SJIS 10001		/* 2 byte Shift-JIS encoded JIS no table */
507 
508 
509 /* Character set flags */
510 
				/* single-bit values that may be OR'ed */
#define CF_PRIMARY 0x1		/* primary name for this charset */
#define CF_DISPLAY 0x2		/* charset used in displays */
#define CF_POSTING 0x4		/* charset used in email posting */
#define CF_UNSUPRT 0x8		/* charset unsupported (can't convert to it) */
#define CF_NOEMAIL 0x10		/* charset not used in email */
516 
517 
518 /* UTF-7 engine states */
519 
				/* consecutive small integers, not bit flags */
#define U7_ASCII 0		/* ASCII character */
#define U7_PLUS 1		/* plus seen */
#define U7_UNICODE 2		/* Unicode characters */
#define U7_MINUS 3		/* absorbed minus seen */
524 
525 /* Function prototypes */
526 
/* Callback types used as the "cv" and "de" parameters of the utf8_text_*()
 * prototypes in this header.  ucs4_titlecase() has the ucs4cn_t signature;
 * ucs4_decompose() and ucs4_decompose_recursive() have the ucs4de_t one.
 */
				/* maps one value to another */
typedef unsigned long (*ucs4cn_t) (unsigned long c);
				/* as above, plus an opaque in/out pointer */
typedef unsigned long (*ucs4de_t) (unsigned long c,void **more);
529 
/* NOTE(review): SIZEDTEXT is used throughout but is not declared in the
 * visible part of this header; presumably an earlier include supplies it.
 */

				/* return SCRIPT / CHARSET entries by name */
SCRIPT *utf8_script (char *script);
const CHARSET *utf8_charset (char *charset);
char *utf8_badcharset (char *charset);
				/* flags: see the utf8_text() flag values */
long utf8_text (SIZEDTEXT *text,char *charset,SIZEDTEXT *ret,long flags);
				/* cv/de: optional-looking callbacks of the
				 * ucs4cn_t / ucs4de_t types - confirm whether
				 * NULL is permitted
				 */
long utf8_text_cs (SIZEDTEXT *text,const CHARSET *cs,SIZEDTEXT *ret,
		   ucs4cn_t cv,ucs4de_t de);
long utf8_cstext (SIZEDTEXT *text,char *charset,SIZEDTEXT *ret,
		  unsigned long errch);
long utf8_cstocstext (SIZEDTEXT *text,char *sc,SIZEDTEXT *ret,char *dc,
		      unsigned long errch);
				/* "rmap" routines; NOTE(review): presumably
				 * the reverse mapping tables described in the
				 * NOCHAR comment - confirm
				 */
unsigned short *utf8_rmap (char *charset);
unsigned short *utf8_rmap_cs (const CHARSET *cs);
unsigned short *utf8_rmap_gen (const CHARSET *cs,unsigned short *oldmap);
long utf8_rmaptext (SIZEDTEXT *text,unsigned short *rmap,SIZEDTEXT *ret,
		    unsigned long errch,long iso2022jp);
unsigned long utf8_rmapsize (SIZEDTEXT *text,unsigned short *rmap,
			     unsigned long errch,long iso2022jp);
long ucs4_rmaptext (unsigned long *ucs4,unsigned long len,unsigned short *rmap,
		    SIZEDTEXT *ret,unsigned long errch);
long ucs4_rmaplen (unsigned long *ucs4,unsigned long len,unsigned short *rmap,
		   unsigned long errch);
long ucs4_rmapbuf (unsigned char *t,unsigned long *ucs4,unsigned long len,
		   unsigned short *rmap,unsigned long errch);
				/* result: see the utf8_get() return values
				 * (U8G_* codes carry the U8G_ERROR flag)
				 */
unsigned long utf8_get (unsigned char **s,unsigned long *i);
unsigned long utf8_get_raw (unsigned char **s,unsigned long *i);
unsigned long ucs4_cs_get (CHARSET *cs,unsigned char **s,unsigned long *i);
unsigned long *utf8_csvalidmap (char *charsets[]);
const CHARSET *utf8_infercharset (SIZEDTEXT *src);
long utf8_validate (unsigned char *s,unsigned long i);
				/* NOTE(review): the suffixes below parallel
				 * the CT_* charset types (1byte0, 1byte,
				 * 1byte8, euc, dbyte, dbyte2, sjis, 2022,
				 * utf7, utf8, ucs2, ucs4, utf16); presumably
				 * one converter per type - confirm
				 */
void utf8_text_1byte0 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de);
void utf8_text_1byte (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
		      ucs4de_t de);
void utf8_text_1byte8 (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
		       ucs4de_t de);
void utf8_text_euc (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
		    ucs4de_t de);
void utf8_text_dbyte (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
		      ucs4de_t de);
void utf8_text_dbyte2 (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
		       ucs4de_t de);
void utf8_text_sjis (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de);
void utf8_text_2022 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de);
void utf8_text_utf7 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de);
void utf8_text_utf8 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de);
void utf8_text_ucs2 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de);
void utf8_text_ucs4 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de);
void utf8_text_utf16 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de);
				/* UCS-4 size and conversion routines; use
				 * these rather than UTF8_SIZE_BMP and
				 * UTF8_PUT_BMP for values outside the BMP
				 */
unsigned long utf8_size (unsigned long c);
unsigned char *utf8_put (unsigned char *s,unsigned long c);
				/* has the ucs4cn_t signature */
unsigned long ucs4_titlecase (unsigned long c);
				/* result: see the ucs4_width() return values
				 * (U4W_* codes carry the U4W_ERROR flag)
				 */
long ucs4_width (unsigned long c);
long utf8_strwidth (unsigned char *s);
long utf8_textwidth (SIZEDTEXT *utf8);
				/* both have the ucs4de_t signature */
unsigned long ucs4_decompose (unsigned long c,void **more);
unsigned long ucs4_decompose_recursive (unsigned long c,void **more);
void *utf8_parameters (long function, void *value);
586