1*c87b03e5Sespie /* Multibyte Character Functions.
2*c87b03e5Sespie Copyright (C) 1998 Free Software Foundation, Inc.
3*c87b03e5Sespie
4*c87b03e5Sespie This file is part of GCC.
5*c87b03e5Sespie
6*c87b03e5Sespie GCC is free software; you can redistribute it and/or modify it under
7*c87b03e5Sespie the terms of the GNU General Public License as published by the Free
8*c87b03e5Sespie Software Foundation; either version 2, or (at your option) any later
9*c87b03e5Sespie version.
10*c87b03e5Sespie
11*c87b03e5Sespie GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12*c87b03e5Sespie WARRANTY; without even the implied warranty of MERCHANTABILITY or
13*c87b03e5Sespie FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14*c87b03e5Sespie for more details.
15*c87b03e5Sespie
16*c87b03e5Sespie You should have received a copy of the GNU General Public License
17*c87b03e5Sespie along with GCC; see the file COPYING. If not, write to the Free
18*c87b03e5Sespie Software Foundation, 59 Temple Place - Suite 330, Boston, MA
19*c87b03e5Sespie 02111-1307, USA. */
20*c87b03e5Sespie
21*c87b03e5Sespie /* Note regarding cross compilation:
22*c87b03e5Sespie
23*c87b03e5Sespie In general, translation of multibyte characters to wide characters can
24*c87b03e5Sespie only work in a native compiler since the translation function (mbtowc)
25*c87b03e5Sespie needs to know about both the source and target character encoding. However,
26*c87b03e5Sespie this particular implementation for JIS, SJIS and EUCJP source characters
27*c87b03e5Sespie will work for any compiler with a newlib target. Other targets may also
28*c87b03e5Sespie work provided that their wchar_t implementation is 2 bytes and the encoding
29*c87b03e5Sespie leaves the source character values unchanged (except for removing the
30*c87b03e5Sespie state shifting markers). */
31*c87b03e5Sespie
32*c87b03e5Sespie #include "config.h"
33*c87b03e5Sespie #ifdef MULTIBYTE_CHARS
34*c87b03e5Sespie #include "system.h"
35*c87b03e5Sespie #include "mbchar.h"
36*c87b03e5Sespie #include <locale.h>
37*c87b03e5Sespie
/* Classification of one input byte while decoding C-JIS text.  The
   values are used as column indices into the JIS state and action
   tables, so their order must match the table columns.  */
typedef enum
{
  ESCAPE,       /* JIS_ESC_CHAR */
  DOLLAR,       /* '$' */
  BRACKET,      /* '(' */
  AT,           /* '@' */
  B,            /* 'B' */
  J,            /* 'J' */
  NUL,          /* '\0' */
  JIS_CHAR,     /* any other byte accepted by ISJIS */
  OTHER,        /* any other byte rejected by ISJIS */
  JIS_C_NUM     /* number of character classes (table width) */
} JIS_CHAR_TYPE;
40*c87b03e5Sespie
/* Decoder states for C-JIS text.  The values are used as row indices
   into the JIS state and action tables, so their order must match the
   table rows.  */
typedef enum
{
  ASCII,        /* single-byte mode; the initial state */
  A_ESC,        /* in ASCII, just read ESC */
  A_ESC_DL,     /* in ASCII, just read ESC '$' */
  JIS,          /* two-byte mode, at the start of a character */
  JIS_1,        /* two-byte mode, one byte of a character read */
  JIS_2,        /* two-byte mode, both bytes of a character read */
  J_ESC,        /* in JIS, just read ESC */
  J_ESC_BR,     /* in JIS, just read ESC '(' */
  J2_ESC,       /* in JIS_2, just read ESC */
  J2_ESC_BR,    /* in JIS_2, just read ESC '(' */
  INV,          /* invalid sequence */
  JIS_S_NUM     /* number of states (table height) */
} JIS_STATE;
43*c87b03e5Sespie
/* Actions the C-JIS decoder performs after classifying a byte.  */
typedef enum
{
  COPYA,        /* store one byte as the result; consume through current byte */
  COPYJ,        /* store a two-byte result; consume through current byte */
  COPYJ2,       /* store a two-byte result; consume only those two bytes */
  MAKE_A,       /* restart the character just after the current byte */
  MAKE_J,       /* restart the character just after the current byte */
  NOOP,         /* nothing to do; keep scanning */
  EMPTY,        /* store a zero wide character and stop */
  ERROR         /* invalid input; fail with -1 */
} JIS_ACTION;
46*c87b03e5Sespie
/* State/action tables for processing JIS encoding:

   Where possible, switches to JIS are grouped with the following JIS
   characters and switches to ASCII are grouped with the preceding JIS
   characters.  Thus, the maximum returned length is:
   3 (switch to JIS) + 2 (JIS characters) + 3 (switch back to ASCII) = 8.  */
53*c87b03e5Sespie
/* Next-state table for the C-JIS decoder: the row is the current
   JIS_STATE and the column is the JIS_CHAR_TYPE of the byte just read.
   Every transition into INV is paired with ERROR in JIS_action_table.
   Only the first ten rows are listed; the INV row is left to implicit
   zero initialization.  */
static const JIS_STATE JIS_state_table[JIS_S_NUM][JIS_C_NUM] = {
/* ESCAPE DOLLAR BRACKET AT B J NUL JIS_CHAR OTH*/
/*ASCII*/ { A_ESC, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII,ASCII,ASCII},
/*A_ESC*/ { ASCII, A_ESC_DL,ASCII, ASCII, ASCII, ASCII, ASCII,ASCII,ASCII},
/*A_ESC_DL*/{ ASCII, ASCII, ASCII, JIS, JIS, ASCII, ASCII,ASCII,ASCII},
/*JIS*/ { J_ESC, JIS_1, JIS_1, JIS_1, JIS_1, JIS_1, INV, JIS_1,INV },
/*JIS_1*/ { INV, JIS_2, JIS_2, JIS_2, JIS_2, JIS_2, INV, JIS_2,INV },
/*JIS_2*/ { J2_ESC,JIS, JIS, JIS, JIS, JIS, INV, JIS, JIS },
/*J_ESC*/ { INV, INV, J_ESC_BR, INV, INV, INV, INV, INV, INV },
/*J_ESC_BR*/{ INV, INV, INV, INV, ASCII, ASCII, INV, INV, INV },
/*J2_ESC*/ { INV, INV, J2_ESC_BR,INV, INV, INV, INV, INV, INV },
/*J2_ESC_BR*/{INV, INV, INV, INV, ASCII, ASCII, INV, INV, INV },
};
67*c87b03e5Sespie
/* Action table for the C-JIS decoder: the row is the current JIS_STATE
   (before the transition) and the column is the JIS_CHAR_TYPE of the
   byte just read.  The entry selects what local_mbtowc does with that
   byte.  Only the first ten rows are listed; the INV row is left to
   implicit zero initialization.  */
static const JIS_ACTION JIS_action_table[JIS_S_NUM][JIS_C_NUM] = {
/* ESCAPE DOLLAR BRACKET AT B J NUL JIS_CHAR OTH */
/*ASCII */ {NOOP, COPYA, COPYA, COPYA, COPYA, COPYA, EMPTY, COPYA, COPYA},
/*A_ESC */ {COPYA, NOOP, COPYA, COPYA, COPYA, COPYA, COPYA, COPYA, COPYA},
/*A_ESC_DL */{COPYA, COPYA, COPYA, MAKE_J, MAKE_J, COPYA, COPYA, COPYA, COPYA},
/*JIS */ {NOOP, NOOP, NOOP, NOOP, NOOP, NOOP, ERROR, NOOP, ERROR},
/*JIS_1 */ {ERROR, NOOP, NOOP, NOOP, NOOP, NOOP, ERROR, NOOP, ERROR},
/*JIS_2 */ {NOOP, COPYJ2,COPYJ2,COPYJ2, COPYJ2, COPYJ2,ERROR, COPYJ2,COPYJ2},
/*J_ESC */ {ERROR, ERROR, NOOP, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR},
/*J_ESC_BR */{ERROR, ERROR, ERROR, ERROR, NOOP, NOOP, ERROR, ERROR, ERROR},
/*J2_ESC */ {ERROR, ERROR, NOOP, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR},
/*J2_ESC_BR*/{ERROR, ERROR, ERROR, ERROR, COPYJ, COPYJ, ERROR, ERROR, ERROR},
};
81*c87b03e5Sespie
82*c87b03e5Sespie
/* Name of the codeset used to decode multibyte characters.  The
   functions in this file give special handling to "C-SJIS", "C-EUCJP"
   and "C-JIS"; a null pointer or a name of at most one character is
   treated as the "C" or an unknown locale.  */
const char *literal_codeset = NULL;
84*c87b03e5Sespie
85*c87b03e5Sespie /* Store into *PWC (if PWC is not null) the wide character
86*c87b03e5Sespie corresponding to the multibyte character at the start of the
87*c87b03e5Sespie buffer S of size N. Return the number of bytes in the multibyte
88*c87b03e5Sespie character. Return -1 if the bytes do not form a valid character,
89*c87b03e5Sespie or 0 if S is null or points to a null byte.
90*c87b03e5Sespie
91*c87b03e5Sespie This function behaves like the Standard C function mbtowc, except
92*c87b03e5Sespie it treats locale names of the form "C-..." specially. */
93*c87b03e5Sespie
94*c87b03e5Sespie int
local_mbtowc(pwc,s,n)95*c87b03e5Sespie local_mbtowc (pwc, s, n)
96*c87b03e5Sespie wchar_t *pwc;
97*c87b03e5Sespie const char *s;
98*c87b03e5Sespie size_t n;
99*c87b03e5Sespie {
100*c87b03e5Sespie static JIS_STATE save_state = ASCII;
101*c87b03e5Sespie JIS_STATE curr_state = save_state;
102*c87b03e5Sespie const unsigned char *t = (const unsigned char *) s;
103*c87b03e5Sespie
104*c87b03e5Sespie if (s != NULL && n == 0)
105*c87b03e5Sespie return -1;
106*c87b03e5Sespie
107*c87b03e5Sespie if (literal_codeset == NULL || strlen (literal_codeset) <= 1)
108*c87b03e5Sespie /* This must be the "C" locale or unknown locale -- fall thru */
109*c87b03e5Sespie ;
110*c87b03e5Sespie else if (! strcmp (literal_codeset, "C-SJIS"))
111*c87b03e5Sespie {
112*c87b03e5Sespie int char1;
113*c87b03e5Sespie if (s == NULL)
114*c87b03e5Sespie /* Not state-dependent. */
115*c87b03e5Sespie return 0;
116*c87b03e5Sespie
117*c87b03e5Sespie char1 = *t;
118*c87b03e5Sespie if (ISSJIS1 (char1))
119*c87b03e5Sespie {
120*c87b03e5Sespie int char2 = t[1];
121*c87b03e5Sespie
122*c87b03e5Sespie if (n <= 1)
123*c87b03e5Sespie return -1;
124*c87b03e5Sespie
125*c87b03e5Sespie if (ISSJIS2 (char2))
126*c87b03e5Sespie {
127*c87b03e5Sespie if (pwc != NULL)
128*c87b03e5Sespie *pwc = (((wchar_t) *t) << 8) + (wchar_t) (*(t + 1));
129*c87b03e5Sespie return 2;
130*c87b03e5Sespie }
131*c87b03e5Sespie
132*c87b03e5Sespie return -1;
133*c87b03e5Sespie }
134*c87b03e5Sespie
135*c87b03e5Sespie if (pwc != NULL)
136*c87b03e5Sespie *pwc = (wchar_t) *t;
137*c87b03e5Sespie
138*c87b03e5Sespie if (*t == '\0')
139*c87b03e5Sespie return 0;
140*c87b03e5Sespie
141*c87b03e5Sespie return 1;
142*c87b03e5Sespie }
143*c87b03e5Sespie else if (! strcmp (literal_codeset, "C-EUCJP"))
144*c87b03e5Sespie {
145*c87b03e5Sespie int char1;
146*c87b03e5Sespie
147*c87b03e5Sespie if (s == NULL)
148*c87b03e5Sespie /* Not state-dependent. */
149*c87b03e5Sespie return 0;
150*c87b03e5Sespie
151*c87b03e5Sespie char1 = *t;
152*c87b03e5Sespie if (ISEUCJP (char1))
153*c87b03e5Sespie {
154*c87b03e5Sespie int char2 = t[1];
155*c87b03e5Sespie
156*c87b03e5Sespie if (n <= 1)
157*c87b03e5Sespie return -1;
158*c87b03e5Sespie
159*c87b03e5Sespie if (ISEUCJP (char2))
160*c87b03e5Sespie {
161*c87b03e5Sespie if (pwc != NULL)
162*c87b03e5Sespie *pwc = (((wchar_t) *t) << 8) + (wchar_t) (*(t + 1));
163*c87b03e5Sespie return 2;
164*c87b03e5Sespie }
165*c87b03e5Sespie
166*c87b03e5Sespie return -1;
167*c87b03e5Sespie }
168*c87b03e5Sespie
169*c87b03e5Sespie if (pwc != NULL)
170*c87b03e5Sespie *pwc = (wchar_t) *t;
171*c87b03e5Sespie
172*c87b03e5Sespie if (*t == '\0')
173*c87b03e5Sespie return 0;
174*c87b03e5Sespie
175*c87b03e5Sespie return 1;
176*c87b03e5Sespie }
177*c87b03e5Sespie else if (! strcmp (literal_codeset, "C-JIS"))
178*c87b03e5Sespie {
179*c87b03e5Sespie JIS_ACTION action;
180*c87b03e5Sespie JIS_CHAR_TYPE ch;
181*c87b03e5Sespie const unsigned char *ptr;
182*c87b03e5Sespie size_t i, curr_ch;
183*c87b03e5Sespie
184*c87b03e5Sespie if (s == NULL)
185*c87b03e5Sespie {
186*c87b03e5Sespie save_state = ASCII;
187*c87b03e5Sespie /* State-dependent. */
188*c87b03e5Sespie return 1;
189*c87b03e5Sespie }
190*c87b03e5Sespie
191*c87b03e5Sespie ptr = t;
192*c87b03e5Sespie
193*c87b03e5Sespie for (i = 0; i < n; i++)
194*c87b03e5Sespie {
195*c87b03e5Sespie curr_ch = t[i];
196*c87b03e5Sespie switch (curr_ch)
197*c87b03e5Sespie {
198*c87b03e5Sespie case JIS_ESC_CHAR:
199*c87b03e5Sespie ch = ESCAPE;
200*c87b03e5Sespie break;
201*c87b03e5Sespie case '$':
202*c87b03e5Sespie ch = DOLLAR;
203*c87b03e5Sespie break;
204*c87b03e5Sespie case '@':
205*c87b03e5Sespie ch = AT;
206*c87b03e5Sespie break;
207*c87b03e5Sespie case '(':
208*c87b03e5Sespie ch = BRACKET;
209*c87b03e5Sespie break;
210*c87b03e5Sespie case 'B':
211*c87b03e5Sespie ch = B;
212*c87b03e5Sespie break;
213*c87b03e5Sespie case 'J':
214*c87b03e5Sespie ch = J;
215*c87b03e5Sespie break;
216*c87b03e5Sespie case '\0':
217*c87b03e5Sespie ch = NUL;
218*c87b03e5Sespie break;
219*c87b03e5Sespie default:
220*c87b03e5Sespie if (ISJIS (curr_ch))
221*c87b03e5Sespie ch = JIS_CHAR;
222*c87b03e5Sespie else
223*c87b03e5Sespie ch = OTHER;
224*c87b03e5Sespie }
225*c87b03e5Sespie
226*c87b03e5Sespie action = JIS_action_table[curr_state][ch];
227*c87b03e5Sespie curr_state = JIS_state_table[curr_state][ch];
228*c87b03e5Sespie
229*c87b03e5Sespie switch (action)
230*c87b03e5Sespie {
231*c87b03e5Sespie case NOOP:
232*c87b03e5Sespie break;
233*c87b03e5Sespie
234*c87b03e5Sespie case EMPTY:
235*c87b03e5Sespie if (pwc != NULL)
236*c87b03e5Sespie *pwc = (wchar_t) 0;
237*c87b03e5Sespie
238*c87b03e5Sespie save_state = curr_state;
239*c87b03e5Sespie return i;
240*c87b03e5Sespie
241*c87b03e5Sespie case COPYA:
242*c87b03e5Sespie if (pwc != NULL)
243*c87b03e5Sespie *pwc = (wchar_t) *ptr;
244*c87b03e5Sespie save_state = curr_state;
245*c87b03e5Sespie return i + 1;
246*c87b03e5Sespie
247*c87b03e5Sespie case COPYJ:
248*c87b03e5Sespie if (pwc != NULL)
249*c87b03e5Sespie *pwc = (((wchar_t) *ptr) << 8) + (wchar_t) (*(ptr + 1));
250*c87b03e5Sespie
251*c87b03e5Sespie save_state = curr_state;
252*c87b03e5Sespie return i + 1;
253*c87b03e5Sespie
254*c87b03e5Sespie case COPYJ2:
255*c87b03e5Sespie if (pwc != NULL)
256*c87b03e5Sespie *pwc = (((wchar_t) *ptr) << 8) + (wchar_t) (*(ptr + 1));
257*c87b03e5Sespie
258*c87b03e5Sespie save_state = curr_state;
259*c87b03e5Sespie return ptr - t + 2;
260*c87b03e5Sespie
261*c87b03e5Sespie case MAKE_A:
262*c87b03e5Sespie case MAKE_J:
263*c87b03e5Sespie ptr = (const unsigned char *) (t + i + 1);
264*c87b03e5Sespie break;
265*c87b03e5Sespie
266*c87b03e5Sespie case ERROR:
267*c87b03e5Sespie default:
268*c87b03e5Sespie return -1;
269*c87b03e5Sespie }
270*c87b03e5Sespie }
271*c87b03e5Sespie
272*c87b03e5Sespie /* More than n bytes needed. */
273*c87b03e5Sespie return -1;
274*c87b03e5Sespie }
275*c87b03e5Sespie
276*c87b03e5Sespie #ifdef CROSS_COMPILE
277*c87b03e5Sespie if (s == NULL)
278*c87b03e5Sespie /* Not state-dependent. */
279*c87b03e5Sespie return 0;
280*c87b03e5Sespie
281*c87b03e5Sespie if (pwc != NULL)
282*c87b03e5Sespie *pwc = *s;
283*c87b03e5Sespie return 1;
284*c87b03e5Sespie #else
285*c87b03e5Sespie
286*c87b03e5Sespie /* This must be the "C" locale or unknown locale. */
287*c87b03e5Sespie return mbtowc (pwc, s, n);
288*c87b03e5Sespie #endif
289*c87b03e5Sespie }
290*c87b03e5Sespie
/* Measure the multibyte character at the start of the buffer S of
   size N without converting it.  The result is whatever local_mbtowc
   returns for the same S and N when given no place to store the wide
   character: the byte count, -1 for an invalid sequence, or 0 for a
   null byte.

   This function behaves like the Standard C function mblen, except
   it treats locale names of the form "C-..." specially.  */

int
local_mblen (s, n)
     const char *s;
     size_t n;
{
  /* A null destination asks local_mbtowc to skip the store.  */
  wchar_t *const no_dest = NULL;

  return local_mbtowc (no_dest, s, n);
}
305*c87b03e5Sespie
306*c87b03e5Sespie /* Return the maximum mumber of bytes in a multibyte character.
307*c87b03e5Sespie
308*c87b03e5Sespie This function returns the same value as the Standard C macro MB_CUR_MAX,
309*c87b03e5Sespie except it treats locale names of the form "C-..." specially. */
310*c87b03e5Sespie
311*c87b03e5Sespie int
local_mb_cur_max()312*c87b03e5Sespie local_mb_cur_max ()
313*c87b03e5Sespie {
314*c87b03e5Sespie if (literal_codeset == NULL || strlen (literal_codeset) <= 1)
315*c87b03e5Sespie ;
316*c87b03e5Sespie else if (! strcmp (literal_codeset, "C-SJIS"))
317*c87b03e5Sespie return 2;
318*c87b03e5Sespie else if (! strcmp (literal_codeset, "C-EUCJP"))
319*c87b03e5Sespie return 2;
320*c87b03e5Sespie else if (! strcmp (literal_codeset, "C-JIS"))
321*c87b03e5Sespie return 8; /* 3 + 2 + 3 */
322*c87b03e5Sespie
323*c87b03e5Sespie #ifdef CROSS_COMPILE
324*c87b03e5Sespie return 1;
325*c87b03e5Sespie #else
326*c87b03e5Sespie if (MB_CUR_MAX > 0)
327*c87b03e5Sespie return MB_CUR_MAX;
328*c87b03e5Sespie
329*c87b03e5Sespie return 1; /* default */
330*c87b03e5Sespie #endif
331*c87b03e5Sespie }
332*c87b03e5Sespie #else /* MULTIBYTE_CHARS */
333*c87b03e5Sespie extern int dummy; /* silence 'ANSI C forbids an empty source file' warning */
334*c87b03e5Sespie #endif /* MULTIBYTE_CHARS */
335