1 /*-
2  * Copyright (c) 1998, 2002-2008 Kiyoshi Matsui <kmatsui@t3.rim.or.jp>
3  * All rights reserved.
4  *
5  * Some parts of this code are derived from the public domain software
6  * DECUS cpp (1984,1985) written by Martin Minow.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 /*
31  *                          M B C H A R . C
32  *      C h a r a c t e r    h a n d l i n g    R o u t i n e s
33  *
34  * Character handling and multi-byte character handling routines are
35  * placed here.
36  */
37 
38 #if PREPROCESSED
39 #include    "mcpp.H"
40 #else
41 #include    "system.H"
42 #include    "internal.H"
43 #endif
44 
45 /*
46  * Tables of character types and multi-byte character types.
47  *
48  * Some of these character attributes will be overwritten by
49  *      execution time option '-@post' or '-@old'.
50  * Warning on erroneous sequence will be issued from the caller routines:
51  * scan_quote(), scan_id() or scan_number().
52  */
53 
54 /* Non-ASCII characters are always checked by mb_read().    */
55 #define NA      0x4000  /* Non-ASCII characters */
56 
57 /* Horizontal spaces (' ', '\t' and TOK_SEP)    */
58 #define HSPA    (SPA | HSP)
59 
60 short *     char_type;  /* Pointer to one of the following type_*[].    */
61 
62 #define EJ1     0x100   /* 1st byte of EUC_JP   */
63 #define EJ2     0x200   /* 2nd byte of EUC_JP   */
64 #define GB1     0x400   /* 1st byte of GB2312   */
65 #define GB2     0x800   /* 2nd byte of GB2312   */
66 #define KS1     0x1000  /* 1st byte of KSC5601  */
67 #define KS2     0x2000  /* 2nd byte of KSC5601  */
68 
69 #define EJ12    (EJ1 | EJ2)     /* 1st byte or 2nd byte of EUC_JP   */
70 #define GB12    (GB1 | GB2)
71 #define KS12    (KS1 | KS2)
72 #define EJ1N    (NA | EJ1)
73 #define EU12N   (NA | EJ12 | GB12 | KS12)
74     /* 1st or 2nd byte of EUC_JP, GB2312 or KSC5601, or any other non-ASCII */
75 
/*
 * type_euc[]: character type flags indexed by byte value (0x00 - UCHARMAX).
 * mb_init() selects this table when 'mbchar' is 0, EUC_JP, GB2312 or KSC5601.
 * The ASCII half holds the ordinary token classes (LET, DIG, PUNC, ...).
 * Every byte of 0x80 and above carries NA; 0x8E additionally carries EJ1 and
 * 0xA1 - 0xFE carry the 1st-byte and 2nd-byte flags of all three encodings.
 * The table is not const because mb_init() rewrites the DEF_MAGIC, IN_SRC
 * and TOK_SEP entries once 'mcpp_mode' has been set.
 */
static short    type_euc[ UCHARMAX + 1] = {
/*
 * For EUC_JP, GB2312, KSC5601 or other similar multi-byte char encodings.
 */

/* Character type codes */
/*   0,     1,     2,     3,     4,     5,     6,     7,                    */
/*   8,     9,     A,     B,     C,     D,     E,     F,       Hex          */

   000,   000,   000,   000,   000,   000,   000,   000,    /* 00           */
   000,   HSPA,  SPA,   SPA,   SPA,   SPA,   000,   000,    /* 08           */
   000,   000,   000,   000,   000,   000,   000,   000,    /* 10           */
    /* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts()       */
   000,   LET,   LET,   000,   000,   000,   000,   HSPA,   /* 18           */
   HSPA,  PUNC,  QUO,   PUNC,  000,   PUNC,  PUNC,  QUO,    /* 20  !"#$%&'  */
   PUNC,  PUNC,  PUNC,  PUNC,  PUNC,  PUNC,  DOT,   PUNC,   /* 28 ()*+,-./  */
   DIG,   DIG,   DIG,   DIG,   DIG,   DIG,   DIG,   DIG,    /* 30 01234567  */
   DIG,   DIG,   PUNC,  PUNC,  PUNC,  PUNC,  PUNC,  PUNC,   /* 38 89:;<=>?  */

   000,   LET,   LET,   LET,   LET,   LET,   LET,   LET,    /* 40 @ABCDEFG  */
   LET,   LET,   LET,   LET,   LET,   LET,   LET,   LET,    /* 48 HIJKLMNO  */
   LET,   LET,   LET,   LET,   LET,   LET,   LET,   LET,    /* 50 PQRSTUVW  */
   LET,   LET,   LET,   PUNC,  000,   PUNC,  PUNC,  LET,    /* 58 XYZ[\]^_  */
   000,   LET,   LET,   LET,   LET,   LET,   LET,   LET,    /* 60 `abcdefg  */
   LET,   LET,   LET,   LET,   LET,   LET,   LET,   LET,    /* 68 hijklmno  */
   LET,   LET,   LET,   LET,   LET,   LET,   LET,   LET,    /* 70 pqrstuvw  */
   LET,   LET,   LET,   PUNC,  PUNC,  PUNC,  PUNC,  000,    /* 78 xyz{|}~   */

   NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   80 .. 87   */
   NA,    NA,    NA,    NA,    NA,    NA,    EJ1N,  NA,     /*   88 .. 8F   */
   NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   90 .. 97   */
   NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   98 .. 9F   */
   NA,    EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N,  /*   A0 .. A7   */
   EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N,  /*   A8 .. AF   */
   EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N,  /*   B0 .. B7   */
   EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N,  /*   B8 .. BF   */
   EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N,  /*   C0 .. C7   */
   EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N,  /*   C8 .. CF   */
   EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N,  /*   D0 .. D7   */
   EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N,  /*   D8 .. DF   */
   EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N,  /*   E0 .. E7   */
   EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N,  /*   E8 .. EF   */
   EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N,  /*   F0 .. F7   */
   EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, NA,     /*   F8 .. FF   */
};
121 
/*
 * type_bsl[]: character type flags indexed by byte value (0x00 - UCHARMAX).
 * mb_init() selects this table when 'mbchar' is SJIS or BIGFIVE.
 * Unlike type_euc[], the bytes 0x40 - 0x7E keep their ASCII class and in
 * addition carry the 2nd-byte flags (SB2), so that an ASCII-valued byte,
 * including 0x5C '\\', can be accepted as the trailing byte of a multi-byte
 * character.  Every byte of 0x80 and above carries NA.
 * The table is not const because mb_init() rewrites the DEF_MAGIC, IN_SRC
 * and TOK_SEP entries once 'mcpp_mode' has been set.
 */
static short    type_bsl[ UCHARMAX + 1] = {
/*
 * For SJIS, BIGFIVE or other similar encodings which may have '\\' value as
 * the second byte of multi-byte character.
 */

/* Flag bits; the values are shared with the EJ?/GB? bits of type_euc[].    */
#define SJ1     0x100   /* 1st byte of SJIS     */
#define SJ2     0x200   /* 2nd byte of SJIS     */
#define BF1     0x400   /* 1st byte of BIGFIVE  */
#define BF2     0x800   /* 2nd byte of BIGFIVE  */

/* Combinations used in the table below; the 'N' suffix means "with NA".    */
#define SB2     (SJ2 | BF2)
#define SJ2N    (NA | SJ2)
#define SB2N    (NA | SJ2 | BF2)
#define SJ12N   (NA | SJ1 | SJ2)
#define BF12N   (NA | BF1 | BF2)
#define SB12N   (NA | SJ1 | SJ2 | BF1 | BF2)
#define S2B12N  (NA | SJ2 | BF1 | BF2)

/* ASCII letter or punctuator which may also be a 2nd byte  */
#define LSB2    (LET | SB2)
#define PSB2    (PUNC| SB2)

/* Character type codes */
/*   0,     1,     2,     3,     4,     5,     6,     7,                    */
/*   8,     9,     A,     B,     C,     D,     E,     F,       Hex          */

   000,   000,   000,   000,   000,   000,   000,   000,    /* 00           */
   000,   HSPA,  SPA,   SPA,   SPA,   SPA,   000,   000,    /* 08           */
   000,   000,   000,   000,   000,   000,   000,   000,    /* 10           */
    /* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts()       */
   000,   LET,   LET,   000,   000,   000,   000,   HSPA,   /* 18           */
   HSPA,  PUNC,  QUO,   PUNC,  000,   PUNC,  PUNC,  QUO,    /* 20  !"#$%&'  */
   PUNC,  PUNC,  PUNC,  PUNC,  PUNC,  PUNC,  DOT,   PUNC,   /* 28 ()*+,-./  */
   DIG,   DIG,   DIG,   DIG,   DIG,   DIG,   DIG,   DIG,    /* 30 01234567  */
   DIG,   DIG,   PUNC,  PUNC,  PUNC,  PUNC,  PUNC,  PUNC,   /* 38 89:;<=>?  */

   SB2,   LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,   /* 40 @ABCDEFG  */
   LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,   /* 48 HIJKLMNO  */
   LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,   /* 50 PQRSTUVW  */
   LSB2,  LSB2,  LSB2,  PSB2,  SB2,   PSB2,  PSB2,  LSB2,   /* 58 XYZ[\]^_  */
   SB2,   LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,   /* 60 `abcdefg  */
   LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,   /* 68 hijklmno  */
   LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,   /* 70 pqrstuvw  */
   LSB2,  LSB2,  LSB2,  PSB2,  PSB2,  PSB2,  PSB2,  000,    /* 78 xyz{|}~   */

   SB2N,  SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N,  /*   80 .. 87   */
   SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N,  /*   88 .. 8F   */
   SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N,  /*   90 .. 97   */
   SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N,  /*   98 .. 9F   */
   SJ2N,  S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /*   A0 .. A7   */
   S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /*   A8 .. AF   */
   S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /*   B0 .. B7   */
   S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /*   B8 .. BF   */
   S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /*   C0 .. C7   */
   S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /*   C8 .. CF   */
   S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /*   D0 .. D7   */
   S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /*   D8 .. DF   */
   SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N,  /*   E0 .. E7   */
   SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N,  /*   E8 .. EF   */
   SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N,  /*   F0 .. F7   */
   SB12N, SB12N, SB12N, SB12N, SB12N, BF12N, BF12N, NA,     /*   F8 .. FF   */
};
184 
185 /*
186  * For ISO2022_JP multi-byte character encoding.
187  */
188 
189 #define IS1     0x100   /* 1st byte of shift-sequence   */
190 #define IS2     0x200   /* 2nd byte of shift-sequence   */
191 #define IS3     0x400   /* 3rd byte of shift-sequence   */
192 #define IS4     0x800   /* 4th byte of shift-sequence   */
193 #define IJP     0x1000  /* 1st or 2nd byte of ISO-2022-JP (ISO-2022-JP1)    */
194 
195 #define PIJP    (PUNC | IJP)
196 #define QIJP    (QUO | IJP)
197 #define DTJP    (DOT | IJP)
198 #define DGJP    (DIG | IJP)
199 #define LIJP    (LET | IJP)
200 
201 #define JPS2    (IJP | IS2)
202 #define PJPS23  (PIJP | IS2 | IS3)
203 #define LJPS3   (LIJP | IS3)
204 #define LJPS4   (LIJP | IS4)
205 
/*
 * type_iso2022_jp[]: character type flags indexed by byte value
 * (0x00 - UCHARMAX).  mb_init() selects this table when 'mbchar' is
 * ISO2022_JP.
 * 0x1B carries IS1 (start of a shift-sequence); 0x24 and 0x28 carry IS2,
 * 0x28 and 0x42 carry IS3 and 0x44 carries IS4, which are the bytes
 * mb_read_iso2022_jp() accepts at the 2nd, 3rd and 4th position of a
 * shift-sequence.  0x21 - 0x7E keep their ASCII class and also carry IJP,
 * since they may be a byte of a shifted-in character.  Every byte of 0x80
 * and above carries only NA.
 * The table is not const because mb_init() rewrites the DEF_MAGIC, IN_SRC
 * and TOK_SEP entries once 'mcpp_mode' has been set.
 */
static short    type_iso2022_jp[ UCHARMAX + 1] = {

/* Character type codes */
/*   0,     1,     2,     3,     4,     5,     6,     7,                    */
/*   8,     9,     A,     B,     C,     D,     E,     F,       Hex          */

   000,   000,   000,   000,   000,   000,   000,   000,    /* 00           */
   000,   HSPA,  SPA,   SPA,   SPA,   SPA,   000,   000,    /* 08           */
   000,   000,   000,   000,   000,   000,   000,   000,    /* 10           */
    /* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts()       */
   000,   LET,   LET,   IS1,   000,   000,   000,   HSPA,   /* 18           */
   HSPA,  PIJP,  QIJP,  PIJP,  JPS2,  PIJP,  PIJP,  QIJP,   /* 20  !"#$%&'  */
   PJPS23,PIJP,  PIJP,  PIJP,  PIJP,  PIJP,  DTJP,  PIJP,   /* 28 ()*+,-./  */
   DGJP,  DGJP,  DGJP,  DGJP,  DGJP,  DGJP,  DGJP,  DGJP,   /* 30 01234567  */
   DGJP,  DGJP,  PIJP,  PIJP,  PIJP,  PIJP,  PIJP,  PIJP,   /* 38 89:;<=>?  */

   IJP,   LIJP,  LJPS3, LIJP,  LJPS4, LIJP,  LIJP,  LIJP,   /* 40 @ABCDEFG  */
   LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,   /* 48 HIJKLMNO  */
   LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,   /* 50 PQRSTUVW  */
   LIJP,  LIJP,  LIJP,  PIJP,  IJP,   PIJP,  PIJP,  LIJP,   /* 58 XYZ[\]^_  */
   IJP,   LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,   /* 60 `abcdefg  */
   LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,   /* 68 hijklmno  */
   LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,   /* 70 pqrstuvw  */
   LIJP,  LIJP,  LIJP,  PIJP,  PIJP,  PIJP,  PIJP,  000,    /* 78 xyz{|}~   */

   NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   80 .. 87   */
   NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   88 .. 8F   */
   NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   90 .. 97   */
   NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   98 .. 9F   */
   NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   A0 .. A7   */
   NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   A8 .. AF   */
   NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   B0 .. B7   */
   NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   B8 .. BF   */
   NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   C0 .. C7   */
   NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   C8 .. CF   */
   NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   D0 .. D7   */
   NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   D8 .. DF   */
   NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   E0 .. E7   */
   NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   E8 .. EF   */
   NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   F0 .. F7   */
   NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   F8 .. FF   */
};
248 
249 /*
250  * For UTF8 multi-byte character encoding.
251  */
252 
253 #define U2_1    0x100       /* 1st byte of 2-byte encoding of UTF8  */
254 #define U3_1    0x200       /* 1st byte of 3-byte encoding of UTF8  */
255 #define U4_1    0x400       /* 1st byte of 4-byte encoding of UTF8  */
256 #define UCONT   0x800   /* Continuation of a 2, 3, or 4 byte UTF8 sequence  */
257 #define U2_1N   (NA | U2_1)
258 #define U3_1N   (NA | U3_1)
259 #define U4_1N   (NA | U4_1)
260 #define UCONTN  (NA | UCONT)
261 
/*
 * type_utf8[]: character type flags indexed by byte value (0x00 - UCHARMAX).
 * mb_init() selects this table when 'mbchar' is UTF8.
 * 0x80 - 0xBF are continuation bytes (UCONT), 0xC2 - 0xDF start a 2-byte
 * sequence (U2_1), 0xE0 - 0xEF a 3-byte sequence (U3_1) and 0xF0 - 0xF4 a
 * 4-byte sequence (U4_1).  0xC0, 0xC1 and 0xF5 - 0xFF carry only NA, so they
 * never start a sequence.
 * The table is not const because mb_init() rewrites the DEF_MAGIC, IN_SRC
 * and TOK_SEP entries once 'mcpp_mode' has been set.
 */
static short    type_utf8[ UCHARMAX + 1] = {

/* Character type codes */
/*   0,     1,     2,     3,     4,     5,     6,     7,                    */
/*   8,     9,     A,     B,     C,     D,     E,     F,       Hex          */

   000,   000,   000,   000,   000,   000,   000,   000,    /* 00           */
   000,   HSPA,  SPA,   SPA,   SPA,   SPA,   000,   000,    /* 08           */
   000,   000,   000,   000,   000,   000,   000,   000,    /* 10           */
    /* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts()       */
   000,   LET,   LET,   000,   000,   000,   000,   HSPA,   /* 18           */
   HSPA,  PUNC,  QUO,   PUNC,  000,   PUNC,  PUNC,  QUO,    /* 20  !"#$%&'  */
   PUNC,  PUNC,  PUNC,  PUNC,  PUNC,  PUNC,  DOT,   PUNC,   /* 28 ()*+,-./  */
   DIG,   DIG,   DIG,   DIG,   DIG,   DIG,   DIG,   DIG,    /* 30 01234567  */
   DIG,   DIG,   PUNC,  PUNC,  PUNC,  PUNC,  PUNC,  PUNC,   /* 38 89:;<=>?  */

   000,   LET,   LET,   LET,   LET,   LET,   LET,   LET,    /* 40 @ABCDEFG  */
   LET,   LET,   LET,   LET,   LET,   LET,   LET,   LET,    /* 48 HIJKLMNO  */
   LET,   LET,   LET,   LET,   LET,   LET,   LET,   LET,    /* 50 PQRSTUVW  */
   LET,   LET,   LET,   PUNC,  000,   PUNC,  PUNC,  LET,    /* 58 XYZ[\]^_  */
   000,   LET,   LET,   LET,   LET,   LET,   LET,   LET,    /* 60 `abcdefg  */
   LET,   LET,   LET,   LET,   LET,   LET,   LET,   LET,    /* 68 hijklmno  */
   LET,   LET,   LET,   LET,   LET,   LET,   LET,   LET,    /* 70 pqrstuvw  */
   LET,   LET,   LET,   PUNC,  PUNC,  PUNC,  PUNC,  000,    /* 78 xyz{|}~   */

   UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /*   80 .. 87   */
   UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /*   88 .. 8F   */
   UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /*   90 .. 97   */
   UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /*   98 .. 9F   */
   UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /*   A0 .. A7   */
   UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /*   A8 .. AF   */
   UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /*   B0 .. B7   */
   UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /*   B8 .. BF   */
   NA,    NA,    U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N,  /*   C0 .. C7   */
   U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N,  /*   C8 .. CF   */
   U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N,  /*   D0 .. D7   */
   U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N,  /*   D8 .. DF   */
   U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N,  /*   E0 .. E7   */
   U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N,  /*   E8 .. EF   */
   U4_1N, U4_1N, U4_1N, U4_1N, U4_1N, NA,    NA,    NA,     /*   F0 .. F7   */
   NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   F8 .. FF   */
};
304 
305 #define SETLOCALE       2       /* #pragma setlocale (not __setlocale)  */
306 
307 #define NUM_ENCODING    8
308 #define NUM_ALIAS       6
309 
310 /* Names of encoding recognized.  Table for search_encoding().  */
/*
 * encoding_name[ encoding][ alias]: one row per encoding, in the order that
 * search_encoding() maps to 'mbchar' (row 0: no multi-byte character, then
 * EUC_JP, GB2312, KSC5601, SJIS, BIGFIVE, ISO2022_JP, UTF8).
 * Columns 0 and 1 are the Visual C names, columns 2 to 5 are other aliases;
 * set_encoding() starts the search at column 2 unless it handles
 * '#pragma setlocale' on MSC.  Names are stored in the normalized form
 * (lower case, without '_', '-' or '.').  "" is a place-holder for an unused
 * slot.
 */
static const char * const   encoding_name[ NUM_ENCODING][ NUM_ALIAS] = {
    /* Visual C full, Visual C short
        , 4 miscellaneous  */
    { "english",    "c"
        , "c",      "en",   "latin",    "iso8859"},
    { "",     ""
        , "eucjp",  "euc",  "ujis",     ""},
    { "chinesesimplified",  "chs"
        , "gb2312", "cngb",     "euccn",    ""},
    { "korean",   "kor"
        , "ksc5601",    "ksx1001",  "wansung",  "euckr"},
    { "japanese", "jpn"
        , "sjis",   "shiftjis", "mskanji",  ""},
    { "chinesetraditional", "cht"
        , "bigfive",    "big5", "cnbig5",   "euctw"},
    { "",     ""
        , "iso2022jp",  "iso2022jp1",   "jis",  ""},
    { "",     ""
        , "utf8",   "utf",      "",     ""},
};
331 
332 static int      mbstart;
333 static int      mb2;
334 
335 static size_t   mb_read_2byte( int c1, char ** in_pp, char ** out_pp);
336                 /* For 2-byte encodings of mbchar   */
337 static const char *     search_encoding( char * norm, int alias);
338                 /* Search encoding_name[][] table   */
339 static void     strip_bar( char * string);
340                 /* Remove '_', '-' or '.' in the string */
341 static void     conv_case( char * name, char * lim, int upper);
342                 /* Convert to upper/lower case      */
343 static size_t   mb_read_iso2022_jp( int c1, char ** in_pp, char ** out_pp);
344                 /* For ISO2022_JP encoding          */
345 static size_t   mb_read_utf8( int c1, char ** in_pp, char ** out_pp);
346                 /* For UTF8 mbchar encoding         */
347 
348 #define NAMLEN          20
349 #define UPPER           1               /* To upper */
350 #define LOWER           0               /* To lower */
351 
352 
set_encoding(char * name,char * env,int pragma)353 const char *    set_encoding(
354     char *  name,       /* Name of encoding specified   */
355     char *  env,        /* Name of environment variable */
356     int     pragma
357         /* 2: #pragma setlocale, 1: #pragma __setlocale, 0: not #pragma */
358 )
359 /*
360  * Search the encoding specified and re-initialize mbchar settings.
361  */
362 {
363     const char *    unknown_encoding
364             = "Unknown encoding: %s%.0ld%.0s";          /* _W1_ */
365     const char *    too_long
366             = "Too long encoding name: %s%.0ld%.0s";    /* _E_  */
367     const char *    loc = "";
368     int     alias;
369     char    norm[ NAMLEN];
370             /*
371              * Normalized name (removed 'xxxxx.', stripped '_', '-', '.'
372              * and lowered.
373              */
374 
375     if (strlen( name) >= NAMLEN) {
376         if ((env || pragma) && (warn_level & 1)) {
377             cwarn( too_long, name, 0L, NULL);
378         } else {
379             mcpp_fprintf( ERR, too_long, name);
380             mcpp_fputc( '\n', ERR);
381         }
382     }
383     strcpy( norm, name);
384     if (norm[ 5] == '.')
385         memmove( norm, norm + 5, strlen( norm + 5) + 1);
386         /* Remove initial 'xxxxx.' as 'ja_JP.', 'en_US.' or any other   */
387     conv_case( norm, norm + strlen( norm), LOWER);
388     strip_bar( norm);
389 
390     if (strlen( name) == 0) {                       /* ""       */
391         mbchar = MBCHAR;    /* Restore to the default encoding  */
392     } else if (memcmp( norm, "iso8859", 7) == 0     /* iso8859* */
393             || memcmp( norm, "latin", 5) == 0       /* latin*   */
394             || memcmp( norm, "en", 2) == 0) {       /* en*      */
395         mbchar = 0;                 /* No multi-byte character  */
396     } else {
397         alias = 2;
398 #if COMPILER == MSC
399         if (pragma == SETLOCALE)        /* #pragma setlocale    */
400             alias = 0;
401 #endif
402         loc = search_encoding( norm, alias);        /* Search the name  */
403     }
404     if (loc == NULL) {
405         if ((env || pragma) && (warn_level & 1)) {
406             cwarn( unknown_encoding, name, 0L, NULL);
407         } else {                        /* -m option            */
408             mcpp_fprintf( ERR, unknown_encoding, name);
409             mcpp_fputc( '\n', ERR);
410         }
411     } else {
412         mb_init();                      /* Re-initialize        */
413     }
414     return  loc;
415 }
416 
search_encoding(char * norm,int alias)417 static const char * search_encoding(
418     char *  norm,           /* The name of encoding specified   */
419     int     alias           /* The number of alias to start searching   */
420 )
421 {
422     const char *    loc;
423     int             lo, al;
424 
425     for (lo = 0; lo < NUM_ENCODING; lo++) {
426         for (al = alias ; al < NUM_ALIAS; al++) {
427             loc = encoding_name[ lo][ al];
428             if (str_eq( loc, norm)) {
429                 switch (lo) {
430                 case 0  :   mbchar = 0;             break;
431                 case 1  :   mbchar = EUC_JP;        break;
432                 case 2  :   mbchar = GB2312;        break;
433                 case 3  :   mbchar = KSC5601;       break;
434                 case 4  :   mbchar = SJIS;          break;
435                 case 5  :   mbchar = BIGFIVE;       break;
436                 case 6  :   mbchar = ISO2022_JP;    break;
437                 case 7  :   mbchar = UTF8;          break;
438                 }
439                 return  loc;
440             }
441         }
442     }
443     return  NULL;
444 }
445 
static void strip_bar(
    char *  string
)
/*
 * Delete every '_', '-' and '.' from the string, in place.
 * The remaining characters keep their order and the result is terminated
 * right after the last character kept.
 */
{
    const char *    from = string;      /* Next character to examine    */
    char *          to = string;        /* Next position to fill        */

    for ( ; *from; from++) {
        switch (*from) {
        case '_' :
        case '-' :
        case '.' :
            break;                      /* Drop this one                */
        default :
            *to++ = *from;              /* Keep it, closing the gap     */
            break;
        }
    }
    *to = *from;                        /* Copy the terminator, too     */
}
462 
conv_case(char * name,char * lim,int upper)463 static void     conv_case(
464     char *  name,                       /* (diretory) Name          */
465     char *  lim,                        /* End of (directory) name  */
466     int     upper                       /* TRUE if to upper         */
467 )
468 /* Convert a string to upper-case letters or lower-case letters in-place    */
469 {
470     int     c;
471     char *  sp;
472 
473     for (sp = name; sp < lim; sp++) {
474         c = *sp & UCHARMAX;
475 #if MBCHAR
476         if ((char_type[ c] & mbstart)) {
477             char    tmp[ PATHMAX+1];
478             char *  tp = tmp;
479             *tp++ = *sp++;
480             mb_read( c, &sp, &tp);
481         } else
482 #endif
483         {
484             if (upper)
485                 *sp = toupper( c);
486             else
487                 *sp = tolower( c);
488         }
489     }
490 }
491 
mb_init(void)492 void    mb_init( void)
493 /*
494  * Initialize multi-byte character settings.
495  * First called prior to setting the 'mcpp_mode'.
496  * Will be called again each time the multibyte character encoding is changed.
497  */
498 {
499     /*
500      * Select the character classification table, select the multi-byte
501      * character reading routine and decide whether multi-byte character
502      * may contain the byte of value 0x5c.
503      */
504     switch (mbchar) {
505     case 0      :
506     case EUC_JP     :
507     case GB2312     :
508     case KSC5601    :
509         char_type = type_euc;
510         bsl_in_mbchar = FALSE;
511         mb_read = mb_read_2byte;
512         break;
513     case SJIS   :
514     case BIGFIVE    :
515         char_type = type_bsl;
516         bsl_in_mbchar = TRUE;
517         mb_read = mb_read_2byte;
518         break;
519     case ISO2022_JP :
520         char_type = type_iso2022_jp;
521         bsl_in_mbchar = TRUE;
522         mb_read = mb_read_iso2022_jp;
523         break;
524     case UTF8   :
525         char_type = type_utf8;
526         bsl_in_mbchar = FALSE;
527         mb_read = mb_read_utf8;
528         break;
529     }
530 
531     /* Set the bit patterns for character classification.   */
532     switch (mbchar) {
533     case 0      :
534         mbstart = 0;
535         break;
536     case EUC_JP :
537         mbstart = EJ1;
538         mb2 = EJ2;
539         break;
540     case GB2312 :
541         mbstart = GB1;
542         mb2 = GB2;
543         break;
544     case KSC5601:
545         mbstart = KS1;
546         mb2 = KS2;
547         break;
548     case SJIS   :
549         mbstart = SJ1;
550         mb2 = SJ2;
551         break;
552     case BIGFIVE:
553         mbstart = BF1;
554         mb2 = BF2;
555         break;
556     case ISO2022_JP :
557         mbstart = IS1;
558         break;
559     case UTF8   :
560         mbstart = (U2_1 | U3_1 | U4_1);
561         break;
562     }
563     switch (mbchar) {
564     case 0      :
565         mbchk = 0;
566         break;
567     case EUC_JP :
568     case GB2312 :
569     case KSC5601:
570     case SJIS   :
571     case BIGFIVE:
572     case UTF8   :
573         mbchk = NA;
574         break;
575     case ISO2022_JP :
576         mbchk = (IS1 | NA);
577         break;
578     }
579 
580     /*
581      * Set special handling for some encodings to supplement some compiler's
582      * deficiency.
583      */
584     switch (mbchar) {
585     case SJIS   :
586 #if ! SJIS_IS_ESCAPE_FREE
587         bsl_need_escape = TRUE;
588 #endif
589         break;
590     case BIGFIVE:
591 #if ! BIGFIVE_IS_ESCAPE_FREE
592         bsl_need_escape = TRUE;
593 #endif
594         break;
595     case ISO2022_JP :
596 #if ! ISO2022_JP_IS_ESCAPE_FREE
597         bsl_need_escape = TRUE;
598 #endif
599         break;
600     default :
601         bsl_need_escape = FALSE;
602         break;
603     }
604 
605     /*
606      * Modify magic characters in character type table.
607      * char_type[] table should be rewritten in accordance with the 'mcpp_mode'
608      * whenever the encoding is changed.
609      */
610     if (mcpp_mode) {                /* If mcpp_mode is already set  */
611         char_type[ DEF_MAGIC] = standard ? LET : 0;
612         char_type[ IN_SRC] = (mcpp_mode == STD) ? LET : 0;
613         char_type[ TOK_SEP] = (mcpp_mode == STD || mcpp_mode == OLD_PREP)
614                 ? HSPA: 0;          /* TOK_SEP equals to COM_SEP    */
615     }
616 }
617 
mb_read_2byte(int c1,char ** in_pp,char ** out_pp)618 static size_t   mb_read_2byte(
619     int     c1,         /* The 1st byte of mbchar sequence (already read)   */
620     char ** in_pp,              /* Pointer to input     */
621     char ** out_pp              /* Pointer to output    */
622 )
623 /*
624  * Multi-byte character reading routine for 2-byte encodings.
625  */
626 {
627     int     error = FALSE;
628     size_t  len = 0;    /* Number of multi-byte characters read.    */
629     char *  in_p = *in_pp;
630     char *  out_p = *out_pp;
631 
632     if (! (char_type[ c1 & UCHARMAX] & mbstart))
633         return  MB_ERROR;           /* Not a multi-byte character   */
634 
635     do {
636         if (! (char_type[ (*out_p++ = *in_p++) & UCHARMAX] & mb2)) {
637             error = TRUE;
638             break;
639         }
640         len++;
641     } while (char_type[ (*out_p++ = *in_p++) & UCHARMAX] & mbstart);
642     *in_pp = --in_p;
643     *(--out_p) = EOS;
644     *out_pp = out_p;
645     return  error ? (len | MB_ERROR) : len;
646 }
647 
mb_read_iso2022_jp(int c1,char ** in_pp,char ** out_pp)648 static size_t   mb_read_iso2022_jp(
649     int     c1, /* The 1st byte of the sequence already read (always 0x1b). */
650     char ** in_pp,
651     char ** out_pp
652 )
653 /*
654  * Multi-byte character reading routine for ISO2022_JP.
655  */
656 {
657     int     error = FALSE;
658     size_t  len = 0;
659     char *  in_p = *in_pp;
660     char *  out_p = *out_pp;
661     int     c2, c3, c4;
662 
663     if (! (char_type[ c1 & UCHARMAX] & mbstart))
664         return  MB_ERROR;
665 
666     do {
667 
668         *out_p++ = c2 = *in_p++;
669         if (! (char_type[ c2 & UCHARMAX] & IS2)) {
670             error = TRUE;
671             break;
672         }
673         *out_p++ = c3 = *in_p++;
674         if (! (char_type[ c3 & UCHARMAX] & IS3)) {
675             error = TRUE;
676             break;
677         }
678 
679         switch (c2) {
680         case 0x24   :
681             switch (c3) {
682             case 0x42   :   /* 0x1b 0x24 0x42:  JIS X 0208-1983 */
683                 break;
684             case 0x28   :
685                 *out_p++ = c4 = *in_p++;
686                 if (! (char_type[ c4 & UCHARMAX] & IS4))
687                     error = TRUE;
688                 /* else:    0x1b 0x24 0x28 0x44:    JIS X 0212  */
689                 break;
690             default :
691                 error = TRUE;
692             }
693             break;
694         case 0x28   :
695             switch (c3) {
696             case 0x42   :   /* 0x1b 0x28 0x42:  ASCII   */
697                 c1 = *out_p++ = *in_p++ & UCHARMAX;
698                 continue;
699             default :
700                 error = TRUE;
701             }
702             break;
703         }
704         if (error)
705             break;
706 
707         while (char_type[ c1 = *out_p++ = (*in_p++ & UCHARMAX)] & IJP) {
708             if (! (char_type[ *out_p++ = (*in_p++ & UCHARMAX)] & IJP)) {
709                 error = TRUE;
710                 break;
711             }
712             len++;          /* String of multi-byte characters  */
713         }
714         if (error)
715             break;
716 
717     } while (char_type[ c1] & IS1);     /* 0x1b:    start of shift-sequence */
718 
719     *in_pp = --in_p;
720     *(--out_p) = EOS;
721     *out_pp = out_p;
722     return  error ? (len | MB_ERROR) : len;
723 }
724 
mb_read_utf8(int c1,char ** in_pp,char ** out_pp)725 static size_t   mb_read_utf8(
726     int     c1,
727     char ** in_pp,
728     char ** out_pp
729 )
730 /*
731  * Multi-byte character reading routine for UTF8.
732  */
733 {
734     int     error = FALSE;
735     size_t  len = 0;
736     char *  in_p = *in_pp;
737     char *  out_p = *out_pp;
738 
739     if (! (char_type[ c1 & UCHARMAX] & mbstart))
740         return  MB_ERROR;
741 
742     do {
743         unsigned int    codepoint;
744         int             i, bytes;
745 
746         if ((char_type[ c1 & UCHARMAX] & U4_1) == U4_1)
747             bytes = 4;                          /* 4-byte character */
748         else if ((char_type[ c1 & UCHARMAX] & U3_1) == U3_1)
749             bytes = 3;                          /* 3-byte character */
750         else if ((char_type[ c1 & UCHARMAX] & U2_1) == U2_1)
751             bytes = 2;                          /* 2-byte character */
752 
753         /* Must ensure that the sequence is not reserved as a surrogate */
754         codepoint = ((2 << (6-bytes)) - 1) & c1;    /* mask off top bits    */
755 
756         /* All bytes left in the sequence must be in 0x80 - 0xBF    */
757         for (i = bytes - 1; i && !error; i--) {
758             codepoint = (codepoint << 6) + ((*in_p) & 0x3fU);
759             if (! (char_type[ (*out_p++ = *in_p++) & UCHARMAX] & UCONT))
760                 error = TRUE;
761         }
762 
763         /* Check for overlong/underlong sequences */
764         if ((bytes == 2 && (codepoint < 0x80 || codepoint > 0x7FF))
765             || (bytes == 3 && (codepoint < 0x800 || codepoint > 0xFFFF))
766             || (bytes == 4 && (codepoint < 0x10000 || codepoint > 0x10FFFF)))
767             error = TRUE;
768         if ((codepoint >= 0xD800 && codepoint <= 0xDFFF)
769             /* Check for reserved surrogate codepoints */
770                 || (codepoint >= 0xFFFE && codepoint <= 0xFFFF))
771                 /* Illegal  */
772             error = TRUE;
773 #if 0
774         printf( "codepoint:0x%x\n", codepoint);
775 #endif
776         if (error)
777             break;
778         len++;
779     } while (char_type[ (*out_p++ = c1 = *in_p++) & UCHARMAX] & mbstart);
780                         /* Start of the next multi-byte character   */
781     *in_pp = --in_p;
782     *(--out_p) = EOS;
783     *out_pp = out_p;
784     return  error ? (len | MB_ERROR) : len;
785 }
786 
mb_eval(char ** seq_pp)787 uexpr_t     mb_eval(
788     char ** seq_pp
789 )
790 /*
791  * Evaluate the value of a multi-byte character.
792  * This routine does not check the legality of the sequence.
793  * This routine is called from eval_char().
794  * This routine is never called in POST_STD mode.
795  */
796 {
797     char *      seq = *seq_pp;
798     uexpr_t     val = 0;
799     int         c, c1;
800 
801     if (! (char_type[ c = *seq++ & UCHARMAX] & mbstart)) {
802         *seq_pp = seq;
803         return  c;                  /* Not a multi-byte character   */
804     }
805 
806     switch (mbchar) {
807     case EUC_JP :
808     case GB2312 :
809     case KSC5601:
810     case SJIS   :
811     case BIGFIVE:
812         val = (c << 8) + (*seq++ & UCHARMAX);
813         /* Evaluate the 2-byte sequence */
814         break;
815     case ISO2022_JP :
816         if (char_type[ c & UCHARMAX] & IS1) {   /* Skip shift-sequence  */
817             if (char_type[ c = *seq++ & UCHARMAX] & IS2) {
818                 if (char_type[ c1 = *seq++ & UCHARMAX] & IS3) {
819                     if (c1 == 0x28)
820                         seq++;
821                     if (c == 0x28 && c1 == 0x42) {  /* Shift-out sequence   */
822                         val = 0;
823                         break;
824                     }
825                     c = *seq++ & UCHARMAX;
826                 }
827             }
828         }
829         val = (c << 8) + (*seq++ & UCHARMAX);       /* Evaluate the 2-bytes */
830         break;
831     case UTF8   :   /* Evaluate the sequence of 2, 3 or 4 bytes as it is    */
832         val = (c << 8) + (*seq++ & UCHARMAX);
833         if (char_type[ c & UCHARMAX] & U3_1) {
834             val = (val << 8) + (*seq++ & UCHARMAX);
835         } else if (char_type[ c & UCHARMAX] & U4_1) {
836             val = (val << 8) + (*seq++ & UCHARMAX);
837             val = (val << 8) + (*seq++ & UCHARMAX);
838         }
839         break;
840     }
841 
842     *seq_pp = seq;
843     return  val;
844 }
845 
last_is_mbchar(const char * in,int len)846 int  last_is_mbchar(
847     const char *  in,               /* Input physical line          */
848     int     len                     /* Length of the line minus 2   */
849 )
850 /*
851  * Return 2, if the last char of the line is second byte of SJIS or BIGFIVE,
852  * else return 0.
853  */
854 {
855     const char *    cp = in + len;
856     const char * const      endp = in + len;    /* -> the char befor '\n'   */
857 
858     if ((mbchar & (SJIS | BIGFIVE)) == 0)
859         return  0;
860     while (in <= --cp) {                    /* Search backwardly    */
861         if ((char_type[ *cp & UCHARMAX] & mbstart) == 0)
862             break;                  /* Not the first byte of MBCHAR */
863     }
864     if ((endp - cp) & 1)
865         return  0;
866     else
867         return  2;
868 }
869 
870