1 /*-
2 * Copyright (c) 1998, 2002-2008 Kiyoshi Matsui <kmatsui@t3.rim.or.jp>
3 * All rights reserved.
4 *
5 * Some parts of this code are derived from the public domain software
6 * DECUS cpp (1984,1985) written by Martin Minow.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 /*
31 * M B C H A R . C
32 * C h a r a c t e r h a n d l i n g R o u t i n e s
33 *
34 * Character handling and multi-byte character handling routines are
35 * placed here.
36 */
37
38 #if PREPROCESSED
39 #include "mcpp.H"
40 #else
41 #include "system.H"
42 #include "internal.H"
43 #endif
44
45 /*
46 * Tables of character types and multi-byte character types.
47 *
48 * Some of these character attributes will be overwritten by
49 * execution time option '-@post' or '-@old'.
50 * Warning on erroneous sequence will be issued from the caller routines:
51 * scan_quote(), scan_id() or scan_number().
52 */
53
/*
 * Flag bits stored in the type_*[] tables below.
 * Non-ASCII characters are always checked by mb_read().
 */
#define NA      0x4000      /* Non-ASCII characters                 */

/* Horizontal spaces (' ', '\t' and TOK_SEP)    */
#define HSPA    (SPA | HSP)

short *     char_type;      /* Pointer to one of the following type_*[].    */

/* Single-role flags used by type_euc[]. */
#define EJ1     0x100       /* 1st byte of EUC_JP                   */
#define EJ2     0x200       /* 2nd byte of EUC_JP                   */
#define GB1     0x400       /* 1st byte of GB2312                   */
#define GB2     0x800       /* 2nd byte of GB2312                   */
#define KS1     0x1000      /* 1st byte of KSC5601                  */
#define KS2     0x2000      /* 2nd byte of KSC5601                  */

/* Combinations of the flags above. */
#define EJ12    (EJ1 | EJ2)     /* 1st byte or 2nd byte of EUC_JP   */
#define GB12    (GB1 | GB2)     /* 1st byte or 2nd byte of GB2312   */
#define KS12    (KS1 | KS2)     /* 1st byte or 2nd byte of KSC5601  */
#define EJ1N    (NA | EJ1)      /* Non-ASCII, 1st byte of EUC_JP    */
#define EU12N   (NA | EJ12 | GB12 | KS12)
    /* 1st or 2nd byte of EUC_JP, GB2312 or KSC5601, or any other non-ASCII */
74 /* 1st or 2nd byte of EUC_JP, GB2312 or KSC5601, or any other non-ASCII */
75
76 static short type_euc[ UCHARMAX + 1] = {
77 /*
78 * For EUC_JP, GB2312, KSC5601 or other similar multi-byte char encodings.
79 */
80
81 /* Character type codes */
82 /* 0, 1, 2, 3, 4, 5, 6, 7, */
83 /* 8, 9, A, B, C, D, E, F, Hex */
84
85 000, 000, 000, 000, 000, 000, 000, 000, /* 00 */
86 000, HSPA, SPA, SPA, SPA, SPA, 000, 000, /* 08 */
87 000, 000, 000, 000, 000, 000, 000, 000, /* 10 */
88 /* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts() */
89 000, LET, LET, 000, 000, 000, 000, HSPA, /* 18 */
90 HSPA, PUNC, QUO, PUNC, 000, PUNC, PUNC, QUO, /* 20 !"#$%&' */
91 PUNC, PUNC, PUNC, PUNC, PUNC, PUNC, DOT, PUNC, /* 28 ()*+,-./ */
92 DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, /* 30 01234567 */
93 DIG, DIG, PUNC, PUNC, PUNC, PUNC, PUNC, PUNC, /* 38 89:;<=>? */
94
95 000, LET, LET, LET, LET, LET, LET, LET, /* 40 @ABCDEFG */
96 LET, LET, LET, LET, LET, LET, LET, LET, /* 48 HIJKLMNO */
97 LET, LET, LET, LET, LET, LET, LET, LET, /* 50 PQRSTUVW */
98 LET, LET, LET, PUNC, 000, PUNC, PUNC, LET, /* 58 XYZ[\]^_ */
99 000, LET, LET, LET, LET, LET, LET, LET, /* 60 `abcdefg */
100 LET, LET, LET, LET, LET, LET, LET, LET, /* 68 hijklmno */
101 LET, LET, LET, LET, LET, LET, LET, LET, /* 70 pqrstuvw */
102 LET, LET, LET, PUNC, PUNC, PUNC, PUNC, 000, /* 78 xyz{|}~ */
103
104 NA, NA, NA, NA, NA, NA, NA, NA, /* 80 .. 87 */
105 NA, NA, NA, NA, NA, NA, EJ1N, NA, /* 88 .. 8F */
106 NA, NA, NA, NA, NA, NA, NA, NA, /* 90 .. 97 */
107 NA, NA, NA, NA, NA, NA, NA, NA, /* 98 .. 9F */
108 NA, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* A0 .. A7 */
109 EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* A8 .. AF */
110 EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* B0 .. B7 */
111 EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* B8 .. BF */
112 EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* C0 .. C7 */
113 EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* C8 .. CF */
114 EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* D0 .. D7 */
115 EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* D8 .. DF */
116 EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* E0 .. E7 */
117 EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* E8 .. EF */
118 EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* F0 .. F7 */
119 EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, NA, /* F8 .. FF */
120 };
121
122 static short type_bsl[ UCHARMAX + 1] = {
123 /*
124 * For SJIS, BIGFIVE or other similar encodings which may have '\\' value as
125 * the second byte of multi-byte character.
126 */
127
128 #define SJ1 0x100 /* 1st byte of SJIS */
129 #define SJ2 0x200 /* 2nd byte of SJIS */
130 #define BF1 0x400 /* 1st byte of BIGFIVE */
131 #define BF2 0x800 /* 2nd byte of BIGFIVE */
132
133 #define SB2 (SJ2 | BF2)
134 #define SJ2N (NA | SJ2)
135 #define SB2N (NA | SJ2 | BF2)
136 #define SJ12N (NA | SJ1 | SJ2)
137 #define BF12N (NA | BF1 | BF2)
138 #define SB12N (NA | SJ1 | SJ2 | BF1 | BF2)
139 #define S2B12N (NA | SJ2 | BF1 | BF2)
140
141 #define LSB2 (LET | SB2)
142 #define PSB2 (PUNC| SB2)
143
144 /* Character type codes */
145 /* 0, 1, 2, 3, 4, 5, 6, 7, */
146 /* 8, 9, A, B, C, D, E, F, Hex */
147
148 000, 000, 000, 000, 000, 000, 000, 000, /* 00 */
149 000, HSPA, SPA, SPA, SPA, SPA, 000, 000, /* 08 */
150 000, 000, 000, 000, 000, 000, 000, 000, /* 10 */
151 /* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts() */
152 000, LET, LET, 000, 000, 000, 000, HSPA, /* 18 */
153 HSPA, PUNC, QUO, PUNC, 000, PUNC, PUNC, QUO, /* 20 !"#$%&' */
154 PUNC, PUNC, PUNC, PUNC, PUNC, PUNC, DOT, PUNC, /* 28 ()*+,-./ */
155 DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, /* 30 01234567 */
156 DIG, DIG, PUNC, PUNC, PUNC, PUNC, PUNC, PUNC, /* 38 89:;<=>? */
157
158 SB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, /* 40 @ABCDEFG */
159 LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, /* 48 HIJKLMNO */
160 LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, /* 50 PQRSTUVW */
161 LSB2, LSB2, LSB2, PSB2, SB2, PSB2, PSB2, LSB2, /* 58 XYZ[\]^_ */
162 SB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, /* 60 `abcdefg */
163 LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, /* 68 hijklmno */
164 LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, /* 70 pqrstuvw */
165 LSB2, LSB2, LSB2, PSB2, PSB2, PSB2, PSB2, 000, /* 78 xyz{|}~ */
166
167 SB2N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, /* 80 .. 87 */
168 SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, /* 88 .. 8F */
169 SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, /* 90 .. 97 */
170 SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, /* 98 .. 9F */
171 SJ2N, S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* A0 .. A7 */
172 S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* A8 .. AF */
173 S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* B0 .. B7 */
174 S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* B8 .. BF */
175 S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* C0 .. C7 */
176 S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* C8 .. CF */
177 S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* D0 .. D7 */
178 S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* D8 .. DF */
179 SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, /* E0 .. E7 */
180 SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, /* E8 .. EF */
181 SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, /* F0 .. F7 */
182 SB12N, SB12N, SB12N, SB12N, SB12N, BF12N, BF12N, NA, /* F8 .. FF */
183 };
184
185 /*
186 * For ISO2022_JP multi-byte character encoding.
187 */
188
189 #define IS1 0x100 /* 1st byte of shift-sequence */
190 #define IS2 0x200 /* 2nd byte of shift-sequence */
191 #define IS3 0x400 /* 3rd byte of shift-sequence */
192 #define IS4 0x800 /* 4th byte of shift-sequence */
193 #define IJP 0x1000 /* 1st or 2nd byte of ISO-2022-JP (ISO-2022-JP1) */
194
195 #define PIJP (PUNC | IJP)
196 #define QIJP (QUO | IJP)
197 #define DTJP (DOT | IJP)
198 #define DGJP (DIG | IJP)
199 #define LIJP (LET | IJP)
200
201 #define JPS2 (IJP | IS2)
202 #define PJPS23 (PIJP | IS2 | IS3)
203 #define LJPS3 (LIJP | IS3)
204 #define LJPS4 (LIJP | IS4)
205
206 static short type_iso2022_jp[ UCHARMAX + 1] = {
207
208 /* Character type codes */
209 /* 0, 1, 2, 3, 4, 5, 6, 7, */
210 /* 8, 9, A, B, C, D, E, F, Hex */
211
212 000, 000, 000, 000, 000, 000, 000, 000, /* 00 */
213 000, HSPA, SPA, SPA, SPA, SPA, 000, 000, /* 08 */
214 000, 000, 000, 000, 000, 000, 000, 000, /* 10 */
215 /* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts() */
216 000, LET, LET, IS1, 000, 000, 000, HSPA, /* 18 */
217 HSPA, PIJP, QIJP, PIJP, JPS2, PIJP, PIJP, QIJP, /* 20 !"#$%&' */
218 PJPS23,PIJP, PIJP, PIJP, PIJP, PIJP, DTJP, PIJP, /* 28 ()*+,-./ */
219 DGJP, DGJP, DGJP, DGJP, DGJP, DGJP, DGJP, DGJP, /* 30 01234567 */
220 DGJP, DGJP, PIJP, PIJP, PIJP, PIJP, PIJP, PIJP, /* 38 89:;<=>? */
221
222 IJP, LIJP, LJPS3, LIJP, LJPS4, LIJP, LIJP, LIJP, /* 40 @ABCDEFG */
223 LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, /* 48 HIJKLMNO */
224 LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, /* 50 PQRSTUVW */
225 LIJP, LIJP, LIJP, PIJP, IJP, PIJP, PIJP, LIJP, /* 58 XYZ[\]^_ */
226 IJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, /* 60 `abcdefg */
227 LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, /* 68 hijklmno */
228 LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, /* 70 pqrstuvw */
229 LIJP, LIJP, LIJP, PIJP, PIJP, PIJP, PIJP, 000, /* 78 xyz{|}~ */
230
231 NA, NA, NA, NA, NA, NA, NA, NA, /* 80 .. 87 */
232 NA, NA, NA, NA, NA, NA, NA, NA, /* 88 .. 8F */
233 NA, NA, NA, NA, NA, NA, NA, NA, /* 90 .. 97 */
234 NA, NA, NA, NA, NA, NA, NA, NA, /* 98 .. 9F */
235 NA, NA, NA, NA, NA, NA, NA, NA, /* A0 .. A7 */
236 NA, NA, NA, NA, NA, NA, NA, NA, /* A8 .. AF */
237 NA, NA, NA, NA, NA, NA, NA, NA, /* B0 .. B7 */
238 NA, NA, NA, NA, NA, NA, NA, NA, /* B8 .. BF */
239 NA, NA, NA, NA, NA, NA, NA, NA, /* C0 .. C7 */
240 NA, NA, NA, NA, NA, NA, NA, NA, /* C8 .. CF */
241 NA, NA, NA, NA, NA, NA, NA, NA, /* D0 .. D7 */
242 NA, NA, NA, NA, NA, NA, NA, NA, /* D8 .. DF */
243 NA, NA, NA, NA, NA, NA, NA, NA, /* E0 .. E7 */
244 NA, NA, NA, NA, NA, NA, NA, NA, /* E8 .. EF */
245 NA, NA, NA, NA, NA, NA, NA, NA, /* F0 .. F7 */
246 NA, NA, NA, NA, NA, NA, NA, NA, /* F8 .. FF */
247 };
248
249 /*
250 * For UTF8 multi-byte character encoding.
251 */
252
253 #define U2_1 0x100 /* 1st byte of 2-byte encoding of UTF8 */
254 #define U3_1 0x200 /* 1st byte of 3-byte encoding of UTF8 */
255 #define U4_1 0x400 /* 1st byte of 4-byte encoding of UTF8 */
256 #define UCONT 0x800 /* Continuation of a 2, 3, or 4 byte UTF8 sequence */
257 #define U2_1N (NA | U2_1)
258 #define U3_1N (NA | U3_1)
259 #define U4_1N (NA | U4_1)
260 #define UCONTN (NA | UCONT)
261
262 static short type_utf8[ UCHARMAX + 1] = {
263
264 /* Character type codes */
265 /* 0, 1, 2, 3, 4, 5, 6, 7, */
266 /* 8, 9, A, B, C, D, E, F, Hex */
267
268 000, 000, 000, 000, 000, 000, 000, 000, /* 00 */
269 000, HSPA, SPA, SPA, SPA, SPA, 000, 000, /* 08 */
270 000, 000, 000, 000, 000, 000, 000, 000, /* 10 */
271 /* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts() */
272 000, LET, LET, 000, 000, 000, 000, HSPA, /* 18 */
273 HSPA, PUNC, QUO, PUNC, 000, PUNC, PUNC, QUO, /* 20 !"#$%&' */
274 PUNC, PUNC, PUNC, PUNC, PUNC, PUNC, DOT, PUNC, /* 28 ()*+,-./ */
275 DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, /* 30 01234567 */
276 DIG, DIG, PUNC, PUNC, PUNC, PUNC, PUNC, PUNC, /* 38 89:;<=>? */
277
278 000, LET, LET, LET, LET, LET, LET, LET, /* 40 @ABCDEFG */
279 LET, LET, LET, LET, LET, LET, LET, LET, /* 48 HIJKLMNO */
280 LET, LET, LET, LET, LET, LET, LET, LET, /* 50 PQRSTUVW */
281 LET, LET, LET, PUNC, 000, PUNC, PUNC, LET, /* 58 XYZ[\]^_ */
282 000, LET, LET, LET, LET, LET, LET, LET, /* 60 `abcdefg */
283 LET, LET, LET, LET, LET, LET, LET, LET, /* 68 hijklmno */
284 LET, LET, LET, LET, LET, LET, LET, LET, /* 70 pqrstuvw */
285 LET, LET, LET, PUNC, PUNC, PUNC, PUNC, 000, /* 78 xyz{|}~ */
286
287 UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* 80 .. 87 */
288 UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* 88 .. 8F */
289 UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* 90 .. 97 */
290 UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* 98 .. 9F */
291 UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* A0 .. A7 */
292 UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* A8 .. AF */
293 UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* B0 .. B7 */
294 UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* B8 .. BF */
295 NA, NA, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, /* C0 .. C7 */
296 U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, /* C8 .. CF */
297 U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, /* D0 .. D7 */
298 U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, /* D8 .. DF */
299 U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, /* E0 .. E7 */
300 U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, /* E8 .. EF */
301 U4_1N, U4_1N, U4_1N, U4_1N, U4_1N, NA, NA, NA, /* F0 .. F7 */
302 NA, NA, NA, NA, NA, NA, NA, NA, /* F8 .. FF */
303 };
304
#define SETLOCALE       2   /* #pragma setlocale (not __setlocale)  */

#define NUM_ENCODING    8   /* Rows of encoding_name[][]            */
#define NUM_ALIAS       6   /* Columns of encoding_name[][]         */

/*
 * Names of encoding recognized.  Table for search_encoding().
 * One row per encoding, in the order search_encoding() maps to 'mbchar'.
 * Columns 0 and 1 are the Visual C full and short names, columns 2 to 5
 * are miscellaneous aliases.  "" fills an unused slot.
 */
static const char * const   encoding_name[ NUM_ENCODING][ NUM_ALIAS] = {
    /* Visual C full,       Visual C short, 4 miscellaneous aliases     */
    { "english",            "c",
            "c",            "en",           "latin",        "iso8859"},
    { "",                   "",
            "eucjp",        "euc",          "ujis",         ""},
    { "chinesesimplified",  "chs",
            "gb2312",       "cngb",         "euccn",        ""},
    { "korean",             "kor",
            "ksc5601",      "ksx1001",      "wansung",      "euckr"},
    { "japanese",           "jpn",
            "sjis",         "shiftjis",     "mskanji",      ""},
    { "chinesetraditional", "cht",
            "bigfive",      "big5",         "cnbig5",       "euctw"},
    { "",                   "",
            "iso2022jp",    "iso2022jp1",   "jis",          ""},
    { "",                   "",
            "utf8",         "utf",          "",             ""},
};
331
332 static int mbstart;
333 static int mb2;
334
335 static size_t mb_read_2byte( int c1, char ** in_pp, char ** out_pp);
336 /* For 2-byte encodings of mbchar */
337 static const char * search_encoding( char * norm, int alias);
338 /* Search encoding_name[][] table */
339 static void strip_bar( char * string);
340 /* Remove '_', '-' or '.' in the string */
341 static void conv_case( char * name, char * lim, int upper);
342 /* Convert to upper/lower case */
343 static size_t mb_read_iso2022_jp( int c1, char ** in_pp, char ** out_pp);
344 /* For ISO2022_JP encoding */
345 static size_t mb_read_utf8( int c1, char ** in_pp, char ** out_pp);
346 /* For UTF8 mbchar encoding */
347
348 #define NAMLEN 20
349 #define UPPER 1 /* To upper */
350 #define LOWER 0 /* To lower */
351
352
set_encoding(char * name,char * env,int pragma)353 const char * set_encoding(
354 char * name, /* Name of encoding specified */
355 char * env, /* Name of environment variable */
356 int pragma
357 /* 2: #pragma setlocale, 1: #pragma __setlocale, 0: not #pragma */
358 )
359 /*
360 * Search the encoding specified and re-initialize mbchar settings.
361 */
362 {
363 const char * unknown_encoding
364 = "Unknown encoding: %s%.0ld%.0s"; /* _W1_ */
365 const char * too_long
366 = "Too long encoding name: %s%.0ld%.0s"; /* _E_ */
367 const char * loc = "";
368 int alias;
369 char norm[ NAMLEN];
370 /*
371 * Normalized name (removed 'xxxxx.', stripped '_', '-', '.'
372 * and lowered.
373 */
374
375 if (strlen( name) >= NAMLEN) {
376 if ((env || pragma) && (warn_level & 1)) {
377 cwarn( too_long, name, 0L, NULL);
378 } else {
379 mcpp_fprintf( ERR, too_long, name);
380 mcpp_fputc( '\n', ERR);
381 }
382 }
383 strcpy( norm, name);
384 if (norm[ 5] == '.')
385 memmove( norm, norm + 5, strlen( norm + 5) + 1);
386 /* Remove initial 'xxxxx.' as 'ja_JP.', 'en_US.' or any other */
387 conv_case( norm, norm + strlen( norm), LOWER);
388 strip_bar( norm);
389
390 if (strlen( name) == 0) { /* "" */
391 mbchar = MBCHAR; /* Restore to the default encoding */
392 } else if (memcmp( norm, "iso8859", 7) == 0 /* iso8859* */
393 || memcmp( norm, "latin", 5) == 0 /* latin* */
394 || memcmp( norm, "en", 2) == 0) { /* en* */
395 mbchar = 0; /* No multi-byte character */
396 } else {
397 alias = 2;
398 #if COMPILER == MSC
399 if (pragma == SETLOCALE) /* #pragma setlocale */
400 alias = 0;
401 #endif
402 loc = search_encoding( norm, alias); /* Search the name */
403 }
404 if (loc == NULL) {
405 if ((env || pragma) && (warn_level & 1)) {
406 cwarn( unknown_encoding, name, 0L, NULL);
407 } else { /* -m option */
408 mcpp_fprintf( ERR, unknown_encoding, name);
409 mcpp_fputc( '\n', ERR);
410 }
411 } else {
412 mb_init(); /* Re-initialize */
413 }
414 return loc;
415 }
416
search_encoding(char * norm,int alias)417 static const char * search_encoding(
418 char * norm, /* The name of encoding specified */
419 int alias /* The number of alias to start searching */
420 )
421 {
422 const char * loc;
423 int lo, al;
424
425 for (lo = 0; lo < NUM_ENCODING; lo++) {
426 for (al = alias ; al < NUM_ALIAS; al++) {
427 loc = encoding_name[ lo][ al];
428 if (str_eq( loc, norm)) {
429 switch (lo) {
430 case 0 : mbchar = 0; break;
431 case 1 : mbchar = EUC_JP; break;
432 case 2 : mbchar = GB2312; break;
433 case 3 : mbchar = KSC5601; break;
434 case 4 : mbchar = SJIS; break;
435 case 5 : mbchar = BIGFIVE; break;
436 case 6 : mbchar = ISO2022_JP; break;
437 case 7 : mbchar = UTF8; break;
438 }
439 return loc;
440 }
441 }
442 }
443 return NULL;
444 }
445
static void     strip_bar(
    char *  string
)
/*
 * Strip '_', '-' or '.' in the string, in place.
 * The kept characters are compacted in a single pass.
 */
{
    const char *    src = string;       /* Next character to examine    */
    char *          dst = string;       /* Where to store the next kept one */

    for ( ; *src != '\0'; src++) {
        if (*src != '_' && *src != '-' && *src != '.')
            *dst++ = *src;
    }
    *dst = '\0';
}
462
conv_case(char * name,char * lim,int upper)463 static void conv_case(
464 char * name, /* (diretory) Name */
465 char * lim, /* End of (directory) name */
466 int upper /* TRUE if to upper */
467 )
468 /* Convert a string to upper-case letters or lower-case letters in-place */
469 {
470 int c;
471 char * sp;
472
473 for (sp = name; sp < lim; sp++) {
474 c = *sp & UCHARMAX;
475 #if MBCHAR
476 if ((char_type[ c] & mbstart)) {
477 char tmp[ PATHMAX+1];
478 char * tp = tmp;
479 *tp++ = *sp++;
480 mb_read( c, &sp, &tp);
481 } else
482 #endif
483 {
484 if (upper)
485 *sp = toupper( c);
486 else
487 *sp = tolower( c);
488 }
489 }
490 }
491
mb_init(void)492 void mb_init( void)
493 /*
494 * Initialize multi-byte character settings.
495 * First called prior to setting the 'mcpp_mode'.
496 * Will be called again each time the multibyte character encoding is changed.
497 */
498 {
499 /*
500 * Select the character classification table, select the multi-byte
501 * character reading routine and decide whether multi-byte character
502 * may contain the byte of value 0x5c.
503 */
504 switch (mbchar) {
505 case 0 :
506 case EUC_JP :
507 case GB2312 :
508 case KSC5601 :
509 char_type = type_euc;
510 bsl_in_mbchar = FALSE;
511 mb_read = mb_read_2byte;
512 break;
513 case SJIS :
514 case BIGFIVE :
515 char_type = type_bsl;
516 bsl_in_mbchar = TRUE;
517 mb_read = mb_read_2byte;
518 break;
519 case ISO2022_JP :
520 char_type = type_iso2022_jp;
521 bsl_in_mbchar = TRUE;
522 mb_read = mb_read_iso2022_jp;
523 break;
524 case UTF8 :
525 char_type = type_utf8;
526 bsl_in_mbchar = FALSE;
527 mb_read = mb_read_utf8;
528 break;
529 }
530
531 /* Set the bit patterns for character classification. */
532 switch (mbchar) {
533 case 0 :
534 mbstart = 0;
535 break;
536 case EUC_JP :
537 mbstart = EJ1;
538 mb2 = EJ2;
539 break;
540 case GB2312 :
541 mbstart = GB1;
542 mb2 = GB2;
543 break;
544 case KSC5601:
545 mbstart = KS1;
546 mb2 = KS2;
547 break;
548 case SJIS :
549 mbstart = SJ1;
550 mb2 = SJ2;
551 break;
552 case BIGFIVE:
553 mbstart = BF1;
554 mb2 = BF2;
555 break;
556 case ISO2022_JP :
557 mbstart = IS1;
558 break;
559 case UTF8 :
560 mbstart = (U2_1 | U3_1 | U4_1);
561 break;
562 }
563 switch (mbchar) {
564 case 0 :
565 mbchk = 0;
566 break;
567 case EUC_JP :
568 case GB2312 :
569 case KSC5601:
570 case SJIS :
571 case BIGFIVE:
572 case UTF8 :
573 mbchk = NA;
574 break;
575 case ISO2022_JP :
576 mbchk = (IS1 | NA);
577 break;
578 }
579
580 /*
581 * Set special handling for some encodings to supplement some compiler's
582 * deficiency.
583 */
584 switch (mbchar) {
585 case SJIS :
586 #if ! SJIS_IS_ESCAPE_FREE
587 bsl_need_escape = TRUE;
588 #endif
589 break;
590 case BIGFIVE:
591 #if ! BIGFIVE_IS_ESCAPE_FREE
592 bsl_need_escape = TRUE;
593 #endif
594 break;
595 case ISO2022_JP :
596 #if ! ISO2022_JP_IS_ESCAPE_FREE
597 bsl_need_escape = TRUE;
598 #endif
599 break;
600 default :
601 bsl_need_escape = FALSE;
602 break;
603 }
604
605 /*
606 * Modify magic characters in character type table.
607 * char_type[] table should be rewritten in accordance with the 'mcpp_mode'
608 * whenever the encoding is changed.
609 */
610 if (mcpp_mode) { /* If mcpp_mode is already set */
611 char_type[ DEF_MAGIC] = standard ? LET : 0;
612 char_type[ IN_SRC] = (mcpp_mode == STD) ? LET : 0;
613 char_type[ TOK_SEP] = (mcpp_mode == STD || mcpp_mode == OLD_PREP)
614 ? HSPA: 0; /* TOK_SEP equals to COM_SEP */
615 }
616 }
617
mb_read_2byte(int c1,char ** in_pp,char ** out_pp)618 static size_t mb_read_2byte(
619 int c1, /* The 1st byte of mbchar sequence (already read) */
620 char ** in_pp, /* Pointer to input */
621 char ** out_pp /* Pointer to output */
622 )
623 /*
624 * Multi-byte character reading routine for 2-byte encodings.
625 */
626 {
627 int error = FALSE;
628 size_t len = 0; /* Number of multi-byte characters read. */
629 char * in_p = *in_pp;
630 char * out_p = *out_pp;
631
632 if (! (char_type[ c1 & UCHARMAX] & mbstart))
633 return MB_ERROR; /* Not a multi-byte character */
634
635 do {
636 if (! (char_type[ (*out_p++ = *in_p++) & UCHARMAX] & mb2)) {
637 error = TRUE;
638 break;
639 }
640 len++;
641 } while (char_type[ (*out_p++ = *in_p++) & UCHARMAX] & mbstart);
642 *in_pp = --in_p;
643 *(--out_p) = EOS;
644 *out_pp = out_p;
645 return error ? (len | MB_ERROR) : len;
646 }
647
mb_read_iso2022_jp(int c1,char ** in_pp,char ** out_pp)648 static size_t mb_read_iso2022_jp(
649 int c1, /* The 1st byte of the sequence already read (always 0x1b). */
650 char ** in_pp,
651 char ** out_pp
652 )
653 /*
654 * Multi-byte character reading routine for ISO2022_JP.
655 */
656 {
657 int error = FALSE;
658 size_t len = 0;
659 char * in_p = *in_pp;
660 char * out_p = *out_pp;
661 int c2, c3, c4;
662
663 if (! (char_type[ c1 & UCHARMAX] & mbstart))
664 return MB_ERROR;
665
666 do {
667
668 *out_p++ = c2 = *in_p++;
669 if (! (char_type[ c2 & UCHARMAX] & IS2)) {
670 error = TRUE;
671 break;
672 }
673 *out_p++ = c3 = *in_p++;
674 if (! (char_type[ c3 & UCHARMAX] & IS3)) {
675 error = TRUE;
676 break;
677 }
678
679 switch (c2) {
680 case 0x24 :
681 switch (c3) {
682 case 0x42 : /* 0x1b 0x24 0x42: JIS X 0208-1983 */
683 break;
684 case 0x28 :
685 *out_p++ = c4 = *in_p++;
686 if (! (char_type[ c4 & UCHARMAX] & IS4))
687 error = TRUE;
688 /* else: 0x1b 0x24 0x28 0x44: JIS X 0212 */
689 break;
690 default :
691 error = TRUE;
692 }
693 break;
694 case 0x28 :
695 switch (c3) {
696 case 0x42 : /* 0x1b 0x28 0x42: ASCII */
697 c1 = *out_p++ = *in_p++ & UCHARMAX;
698 continue;
699 default :
700 error = TRUE;
701 }
702 break;
703 }
704 if (error)
705 break;
706
707 while (char_type[ c1 = *out_p++ = (*in_p++ & UCHARMAX)] & IJP) {
708 if (! (char_type[ *out_p++ = (*in_p++ & UCHARMAX)] & IJP)) {
709 error = TRUE;
710 break;
711 }
712 len++; /* String of multi-byte characters */
713 }
714 if (error)
715 break;
716
717 } while (char_type[ c1] & IS1); /* 0x1b: start of shift-sequence */
718
719 *in_pp = --in_p;
720 *(--out_p) = EOS;
721 *out_pp = out_p;
722 return error ? (len | MB_ERROR) : len;
723 }
724
mb_read_utf8(int c1,char ** in_pp,char ** out_pp)725 static size_t mb_read_utf8(
726 int c1,
727 char ** in_pp,
728 char ** out_pp
729 )
730 /*
731 * Multi-byte character reading routine for UTF8.
732 */
733 {
734 int error = FALSE;
735 size_t len = 0;
736 char * in_p = *in_pp;
737 char * out_p = *out_pp;
738
739 if (! (char_type[ c1 & UCHARMAX] & mbstart))
740 return MB_ERROR;
741
742 do {
743 unsigned int codepoint;
744 int i, bytes;
745
746 if ((char_type[ c1 & UCHARMAX] & U4_1) == U4_1)
747 bytes = 4; /* 4-byte character */
748 else if ((char_type[ c1 & UCHARMAX] & U3_1) == U3_1)
749 bytes = 3; /* 3-byte character */
750 else if ((char_type[ c1 & UCHARMAX] & U2_1) == U2_1)
751 bytes = 2; /* 2-byte character */
752
753 /* Must ensure that the sequence is not reserved as a surrogate */
754 codepoint = ((2 << (6-bytes)) - 1) & c1; /* mask off top bits */
755
756 /* All bytes left in the sequence must be in 0x80 - 0xBF */
757 for (i = bytes - 1; i && !error; i--) {
758 codepoint = (codepoint << 6) + ((*in_p) & 0x3fU);
759 if (! (char_type[ (*out_p++ = *in_p++) & UCHARMAX] & UCONT))
760 error = TRUE;
761 }
762
763 /* Check for overlong/underlong sequences */
764 if ((bytes == 2 && (codepoint < 0x80 || codepoint > 0x7FF))
765 || (bytes == 3 && (codepoint < 0x800 || codepoint > 0xFFFF))
766 || (bytes == 4 && (codepoint < 0x10000 || codepoint > 0x10FFFF)))
767 error = TRUE;
768 if ((codepoint >= 0xD800 && codepoint <= 0xDFFF)
769 /* Check for reserved surrogate codepoints */
770 || (codepoint >= 0xFFFE && codepoint <= 0xFFFF))
771 /* Illegal */
772 error = TRUE;
773 #if 0
774 printf( "codepoint:0x%x\n", codepoint);
775 #endif
776 if (error)
777 break;
778 len++;
779 } while (char_type[ (*out_p++ = c1 = *in_p++) & UCHARMAX] & mbstart);
780 /* Start of the next multi-byte character */
781 *in_pp = --in_p;
782 *(--out_p) = EOS;
783 *out_pp = out_p;
784 return error ? (len | MB_ERROR) : len;
785 }
786
mb_eval(char ** seq_pp)787 uexpr_t mb_eval(
788 char ** seq_pp
789 )
790 /*
791 * Evaluate the value of a multi-byte character.
792 * This routine does not check the legality of the sequence.
793 * This routine is called from eval_char().
794 * This routine is never called in POST_STD mode.
795 */
796 {
797 char * seq = *seq_pp;
798 uexpr_t val = 0;
799 int c, c1;
800
801 if (! (char_type[ c = *seq++ & UCHARMAX] & mbstart)) {
802 *seq_pp = seq;
803 return c; /* Not a multi-byte character */
804 }
805
806 switch (mbchar) {
807 case EUC_JP :
808 case GB2312 :
809 case KSC5601:
810 case SJIS :
811 case BIGFIVE:
812 val = (c << 8) + (*seq++ & UCHARMAX);
813 /* Evaluate the 2-byte sequence */
814 break;
815 case ISO2022_JP :
816 if (char_type[ c & UCHARMAX] & IS1) { /* Skip shift-sequence */
817 if (char_type[ c = *seq++ & UCHARMAX] & IS2) {
818 if (char_type[ c1 = *seq++ & UCHARMAX] & IS3) {
819 if (c1 == 0x28)
820 seq++;
821 if (c == 0x28 && c1 == 0x42) { /* Shift-out sequence */
822 val = 0;
823 break;
824 }
825 c = *seq++ & UCHARMAX;
826 }
827 }
828 }
829 val = (c << 8) + (*seq++ & UCHARMAX); /* Evaluate the 2-bytes */
830 break;
831 case UTF8 : /* Evaluate the sequence of 2, 3 or 4 bytes as it is */
832 val = (c << 8) + (*seq++ & UCHARMAX);
833 if (char_type[ c & UCHARMAX] & U3_1) {
834 val = (val << 8) + (*seq++ & UCHARMAX);
835 } else if (char_type[ c & UCHARMAX] & U4_1) {
836 val = (val << 8) + (*seq++ & UCHARMAX);
837 val = (val << 8) + (*seq++ & UCHARMAX);
838 }
839 break;
840 }
841
842 *seq_pp = seq;
843 return val;
844 }
845
last_is_mbchar(const char * in,int len)846 int last_is_mbchar(
847 const char * in, /* Input physical line */
848 int len /* Length of the line minus 2 */
849 )
850 /*
851 * Return 2, if the last char of the line is second byte of SJIS or BIGFIVE,
852 * else return 0.
853 */
854 {
855 const char * cp = in + len;
856 const char * const endp = in + len; /* -> the char befor '\n' */
857
858 if ((mbchar & (SJIS | BIGFIVE)) == 0)
859 return 0;
860 while (in <= --cp) { /* Search backwardly */
861 if ((char_type[ *cp & UCHARMAX] & mbstart) == 0)
862 break; /* Not the first byte of MBCHAR */
863 }
864 if ((endp - cp) & 1)
865 return 0;
866 else
867 return 2;
868 }
869
870