xref: /openbsd/gnu/usr.bin/gcc/gcc/mbchar.c (revision c87b03e5)
/* Multibyte Character Functions.
   Copyright (C) 1998 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING.  If not, write to the Free
Software Foundation, 59 Temple Place - Suite 330, Boston, MA
02111-1307, USA.  */
20*c87b03e5Sespie 
/* Note regarding cross compilation:

   In general, translation of multibyte characters to wide characters can
   only work in a native compiler since the translation function (mbtowc)
   needs to know about both the source and target character encoding.  However,
   this particular implementation for JIS, SJIS and EUCJP source characters
   will work for any compiler with a newlib target.  Other targets may also
   work provided that their wchar_t implementation is 2 bytes and the encoding
   leaves the source character values unchanged (except for removing the
   state shifting markers).  */
31*c87b03e5Sespie 
32*c87b03e5Sespie #include "config.h"
33*c87b03e5Sespie #ifdef MULTIBYTE_CHARS
34*c87b03e5Sespie #include "system.h"
35*c87b03e5Sespie #include "mbchar.h"
36*c87b03e5Sespie #include <locale.h>
37*c87b03e5Sespie 
/* Classification of one input byte while decoding the "C-JIS" codeset.
   The enumerator values are used as the column index of JIS_state_table
   and JIS_action_table, so their order must match the table layout.  */
typedef enum
{
  ESCAPE,       /* JIS_ESC_CHAR */
  DOLLAR,       /* '$' */
  BRACKET,      /* '(' */
  AT,           /* '@' */
  B,            /* 'B' */
  J,            /* 'J' */
  NUL,          /* '\0' */
  JIS_CHAR,     /* any other byte for which ISJIS is true */
  OTHER,        /* every remaining byte */
  JIS_C_NUM     /* number of classes above; table column count */
} JIS_CHAR_TYPE;
40*c87b03e5Sespie 
/* Decoder states for the "C-JIS" codeset.  The enumerator values are
   used as the row index of JIS_state_table and JIS_action_table, so
   their order must match the table layout.  */
typedef enum
{
  ASCII,        /* single-byte mode; the initial state */
  A_ESC,        /* ASCII mode, ESC seen */
  A_ESC_DL,     /* ASCII mode, ESC '$' seen */
  JIS,          /* two-byte mode, at a character boundary */
  JIS_1,        /* two-byte mode, first byte of a character seen */
  JIS_2,        /* two-byte mode, both bytes of a character seen */
  J_ESC,        /* ESC seen while in state JIS */
  J_ESC_BR,     /* ESC '(' seen, starting from state JIS */
  J2_ESC,       /* ESC seen while in state JIS_2 */
  J2_ESC_BR,    /* ESC '(' seen, starting from state JIS_2 */
  INV,          /* invalid input */
  JIS_S_NUM     /* number of states above; table row count */
} JIS_STATE;
43*c87b03e5Sespie 
/* What local_mbtowc does with the current byte while decoding the
   "C-JIS" codeset; looked up in JIS_action_table.  */
typedef enum
{
  COPYA,        /* store the single byte at the data pointer and return */
  COPYJ,        /* store the two-byte character at the data pointer and
                   return, counting every byte scanned so far */
  COPYJ2,       /* store the two-byte character at the data pointer and
                   return, counting only up to its second byte */
  MAKE_A,       /* the character data starts at the next byte; keep
                   scanning */
  MAKE_J,       /* handled exactly like MAKE_A */
  NOOP,         /* nothing to do; keep scanning */
  EMPTY,        /* store a zero wide character and return */
  ERROR         /* report an invalid sequence (-1) */
} JIS_ACTION;
46*c87b03e5Sespie 
/* State/action tables for processing JIS encoding:

   Where possible, switches to JIS are grouped with the following JIS
   characters and switches to ASCII are grouped with the preceding JIS
   characters.  Thus, maximum returned length is:
     3 (switch to JIS) + 2 (JIS character) + 3 (switch back to ASCII) = 8.  */
53*c87b03e5Sespie 
54*c87b03e5Sespie static const JIS_STATE JIS_state_table[JIS_S_NUM][JIS_C_NUM] = {
55*c87b03e5Sespie /*            ESCAPE DOLLAR   BRACKET   AT     B      J     NUL JIS_CHAR OTH*/
56*c87b03e5Sespie /*ASCII*/   { A_ESC, ASCII,   ASCII,    ASCII, ASCII, ASCII, ASCII,ASCII,ASCII},
57*c87b03e5Sespie /*A_ESC*/   { ASCII, A_ESC_DL,ASCII,    ASCII, ASCII, ASCII, ASCII,ASCII,ASCII},
58*c87b03e5Sespie /*A_ESC_DL*/{ ASCII, ASCII,   ASCII,    JIS,   JIS,   ASCII, ASCII,ASCII,ASCII},
59*c87b03e5Sespie /*JIS*/     { J_ESC, JIS_1,   JIS_1,    JIS_1, JIS_1, JIS_1, INV,  JIS_1,INV },
60*c87b03e5Sespie /*JIS_1*/   { INV,   JIS_2,   JIS_2,    JIS_2, JIS_2, JIS_2, INV,  JIS_2,INV },
61*c87b03e5Sespie /*JIS_2*/   { J2_ESC,JIS,     JIS,      JIS,   JIS,   JIS,   INV,  JIS,  JIS },
62*c87b03e5Sespie /*J_ESC*/   { INV,   INV,     J_ESC_BR, INV,   INV,   INV,   INV,  INV,  INV },
63*c87b03e5Sespie /*J_ESC_BR*/{ INV,   INV,     INV,      INV,   ASCII, ASCII, INV,  INV,  INV },
64*c87b03e5Sespie /*J2_ESC*/  { INV,   INV,     J2_ESC_BR,INV,   INV,   INV,   INV,  INV,  INV },
65*c87b03e5Sespie /*J2_ESC_BR*/{INV,   INV,     INV,      INV,   ASCII, ASCII, INV,  INV,  INV },
66*c87b03e5Sespie };
67*c87b03e5Sespie 
68*c87b03e5Sespie static const JIS_ACTION JIS_action_table[JIS_S_NUM][JIS_C_NUM] = {
69*c87b03e5Sespie /*            ESCAPE DOLLAR BRACKET AT     B       J      NUL  JIS_CHAR OTH */
70*c87b03e5Sespie /*ASCII */   {NOOP,  COPYA, COPYA, COPYA,  COPYA,  COPYA, EMPTY, COPYA, COPYA},
71*c87b03e5Sespie /*A_ESC */   {COPYA, NOOP,  COPYA, COPYA,  COPYA,  COPYA, COPYA, COPYA, COPYA},
72*c87b03e5Sespie /*A_ESC_DL */{COPYA, COPYA, COPYA, MAKE_J, MAKE_J, COPYA, COPYA, COPYA, COPYA},
73*c87b03e5Sespie /*JIS */     {NOOP,  NOOP,  NOOP,  NOOP,   NOOP,   NOOP,  ERROR, NOOP,  ERROR},
74*c87b03e5Sespie /*JIS_1 */   {ERROR, NOOP,  NOOP,  NOOP,   NOOP,   NOOP,  ERROR, NOOP,  ERROR},
75*c87b03e5Sespie /*JIS_2 */   {NOOP,  COPYJ2,COPYJ2,COPYJ2, COPYJ2, COPYJ2,ERROR, COPYJ2,COPYJ2},
76*c87b03e5Sespie /*J_ESC */   {ERROR, ERROR, NOOP,  ERROR,  ERROR,  ERROR, ERROR, ERROR, ERROR},
77*c87b03e5Sespie /*J_ESC_BR */{ERROR, ERROR, ERROR, ERROR,  NOOP,   NOOP,  ERROR, ERROR, ERROR},
78*c87b03e5Sespie /*J2_ESC */  {ERROR, ERROR, NOOP,  ERROR,  ERROR,  ERROR, ERROR, ERROR, ERROR},
79*c87b03e5Sespie /*J2_ESC_BR*/{ERROR, ERROR, ERROR, ERROR,  COPYJ,  COPYJ, ERROR, ERROR, ERROR},
80*c87b03e5Sespie };
81*c87b03e5Sespie 
82*c87b03e5Sespie 
/* Name of the codeset used for multibyte characters in literals.  The
   functions in this file give special treatment to "C-SJIS", "C-EUCJP"
   and "C-JIS"; a null pointer or a name of at most one character selects
   the default handling.  Nothing in this file assigns it.  */
const char *literal_codeset = NULL;
84*c87b03e5Sespie 
85*c87b03e5Sespie /* Store into *PWC (if PWC is not null) the wide character
86*c87b03e5Sespie    corresponding to the multibyte character at the start of the
87*c87b03e5Sespie    buffer S of size N.  Return the number of bytes in the multibyte
88*c87b03e5Sespie    character.  Return -1 if the bytes do not form a valid character,
89*c87b03e5Sespie    or 0 if S is null or points to a null byte.
90*c87b03e5Sespie 
91*c87b03e5Sespie    This function behaves like the Standard C function mbtowc, except
92*c87b03e5Sespie    it treats locale names of the form "C-..." specially.  */
93*c87b03e5Sespie 
94*c87b03e5Sespie int
local_mbtowc(pwc,s,n)95*c87b03e5Sespie local_mbtowc (pwc, s, n)
96*c87b03e5Sespie      wchar_t *pwc;
97*c87b03e5Sespie      const char *s;
98*c87b03e5Sespie      size_t n;
99*c87b03e5Sespie {
100*c87b03e5Sespie   static JIS_STATE save_state = ASCII;
101*c87b03e5Sespie   JIS_STATE curr_state = save_state;
102*c87b03e5Sespie   const unsigned char *t = (const unsigned char *) s;
103*c87b03e5Sespie 
104*c87b03e5Sespie   if (s != NULL && n == 0)
105*c87b03e5Sespie     return -1;
106*c87b03e5Sespie 
107*c87b03e5Sespie   if (literal_codeset == NULL || strlen (literal_codeset) <= 1)
108*c87b03e5Sespie     /* This must be the "C" locale or unknown locale -- fall thru */
109*c87b03e5Sespie     ;
110*c87b03e5Sespie   else if (! strcmp (literal_codeset, "C-SJIS"))
111*c87b03e5Sespie     {
112*c87b03e5Sespie       int char1;
113*c87b03e5Sespie       if (s == NULL)
114*c87b03e5Sespie 	/* Not state-dependent.  */
115*c87b03e5Sespie         return 0;
116*c87b03e5Sespie 
117*c87b03e5Sespie       char1 = *t;
118*c87b03e5Sespie       if (ISSJIS1 (char1))
119*c87b03e5Sespie         {
120*c87b03e5Sespie           int char2 = t[1];
121*c87b03e5Sespie 
122*c87b03e5Sespie           if (n <= 1)
123*c87b03e5Sespie             return -1;
124*c87b03e5Sespie 
125*c87b03e5Sespie           if (ISSJIS2 (char2))
126*c87b03e5Sespie             {
127*c87b03e5Sespie 	      if (pwc != NULL)
128*c87b03e5Sespie 		*pwc = (((wchar_t) *t) << 8) + (wchar_t) (*(t + 1));
129*c87b03e5Sespie               return 2;
130*c87b03e5Sespie             }
131*c87b03e5Sespie 
132*c87b03e5Sespie 	  return -1;
133*c87b03e5Sespie         }
134*c87b03e5Sespie 
135*c87b03e5Sespie       if (pwc != NULL)
136*c87b03e5Sespie 	*pwc = (wchar_t) *t;
137*c87b03e5Sespie 
138*c87b03e5Sespie       if (*t == '\0')
139*c87b03e5Sespie 	return 0;
140*c87b03e5Sespie 
141*c87b03e5Sespie       return 1;
142*c87b03e5Sespie     }
143*c87b03e5Sespie   else if (! strcmp (literal_codeset, "C-EUCJP"))
144*c87b03e5Sespie     {
145*c87b03e5Sespie       int char1;
146*c87b03e5Sespie 
147*c87b03e5Sespie       if (s == NULL)
148*c87b03e5Sespie 	/* Not state-dependent.  */
149*c87b03e5Sespie         return 0;
150*c87b03e5Sespie 
151*c87b03e5Sespie       char1 = *t;
152*c87b03e5Sespie       if (ISEUCJP (char1))
153*c87b03e5Sespie         {
154*c87b03e5Sespie           int char2 = t[1];
155*c87b03e5Sespie 
156*c87b03e5Sespie           if (n <= 1)
157*c87b03e5Sespie             return -1;
158*c87b03e5Sespie 
159*c87b03e5Sespie           if (ISEUCJP (char2))
160*c87b03e5Sespie             {
161*c87b03e5Sespie 	      if (pwc != NULL)
162*c87b03e5Sespie 		*pwc = (((wchar_t) *t) << 8) + (wchar_t) (*(t + 1));
163*c87b03e5Sespie               return 2;
164*c87b03e5Sespie             }
165*c87b03e5Sespie 
166*c87b03e5Sespie 	  return -1;
167*c87b03e5Sespie         }
168*c87b03e5Sespie 
169*c87b03e5Sespie       if (pwc != NULL)
170*c87b03e5Sespie 	*pwc = (wchar_t) *t;
171*c87b03e5Sespie 
172*c87b03e5Sespie       if (*t == '\0')
173*c87b03e5Sespie 	return 0;
174*c87b03e5Sespie 
175*c87b03e5Sespie       return 1;
176*c87b03e5Sespie     }
177*c87b03e5Sespie   else if (! strcmp (literal_codeset, "C-JIS"))
178*c87b03e5Sespie     {
179*c87b03e5Sespie       JIS_ACTION action;
180*c87b03e5Sespie       JIS_CHAR_TYPE ch;
181*c87b03e5Sespie       const unsigned char *ptr;
182*c87b03e5Sespie       size_t i, curr_ch;
183*c87b03e5Sespie 
184*c87b03e5Sespie       if (s == NULL)
185*c87b03e5Sespie 	{
186*c87b03e5Sespie 	  save_state = ASCII;
187*c87b03e5Sespie 	  /* State-dependent.  */
188*c87b03e5Sespie 	  return 1;
189*c87b03e5Sespie 	}
190*c87b03e5Sespie 
191*c87b03e5Sespie       ptr = t;
192*c87b03e5Sespie 
193*c87b03e5Sespie       for (i = 0; i < n; i++)
194*c87b03e5Sespie         {
195*c87b03e5Sespie           curr_ch = t[i];
196*c87b03e5Sespie           switch (curr_ch)
197*c87b03e5Sespie             {
198*c87b03e5Sespie 	    case JIS_ESC_CHAR:
199*c87b03e5Sespie               ch = ESCAPE;
200*c87b03e5Sespie               break;
201*c87b03e5Sespie 	    case '$':
202*c87b03e5Sespie               ch = DOLLAR;
203*c87b03e5Sespie               break;
204*c87b03e5Sespie             case '@':
205*c87b03e5Sespie               ch = AT;
206*c87b03e5Sespie               break;
207*c87b03e5Sespie             case '(':
208*c87b03e5Sespie 	      ch = BRACKET;
209*c87b03e5Sespie               break;
210*c87b03e5Sespie             case 'B':
211*c87b03e5Sespie               ch = B;
212*c87b03e5Sespie               break;
213*c87b03e5Sespie             case 'J':
214*c87b03e5Sespie               ch = J;
215*c87b03e5Sespie               break;
216*c87b03e5Sespie             case '\0':
217*c87b03e5Sespie               ch = NUL;
218*c87b03e5Sespie               break;
219*c87b03e5Sespie             default:
220*c87b03e5Sespie               if (ISJIS (curr_ch))
221*c87b03e5Sespie                 ch = JIS_CHAR;
222*c87b03e5Sespie               else
223*c87b03e5Sespie                 ch = OTHER;
224*c87b03e5Sespie 	    }
225*c87b03e5Sespie 
226*c87b03e5Sespie           action = JIS_action_table[curr_state][ch];
227*c87b03e5Sespie           curr_state = JIS_state_table[curr_state][ch];
228*c87b03e5Sespie 
229*c87b03e5Sespie           switch (action)
230*c87b03e5Sespie             {
231*c87b03e5Sespie             case NOOP:
232*c87b03e5Sespie               break;
233*c87b03e5Sespie 
234*c87b03e5Sespie             case EMPTY:
235*c87b03e5Sespie 	      if (pwc != NULL)
236*c87b03e5Sespie 		*pwc = (wchar_t) 0;
237*c87b03e5Sespie 
238*c87b03e5Sespie 	      save_state = curr_state;
239*c87b03e5Sespie               return i;
240*c87b03e5Sespie 
241*c87b03e5Sespie             case COPYA:
242*c87b03e5Sespie 	      if (pwc != NULL)
243*c87b03e5Sespie 		*pwc = (wchar_t) *ptr;
244*c87b03e5Sespie 	      save_state = curr_state;
245*c87b03e5Sespie               return i + 1;
246*c87b03e5Sespie 
247*c87b03e5Sespie             case COPYJ:
248*c87b03e5Sespie 	      if (pwc != NULL)
249*c87b03e5Sespie 		*pwc = (((wchar_t) *ptr) << 8) + (wchar_t) (*(ptr + 1));
250*c87b03e5Sespie 
251*c87b03e5Sespie 	      save_state = curr_state;
252*c87b03e5Sespie               return i + 1;
253*c87b03e5Sespie 
254*c87b03e5Sespie             case COPYJ2:
255*c87b03e5Sespie 	      if (pwc != NULL)
256*c87b03e5Sespie 		*pwc = (((wchar_t) *ptr) << 8) + (wchar_t) (*(ptr + 1));
257*c87b03e5Sespie 
258*c87b03e5Sespie 	      save_state = curr_state;
259*c87b03e5Sespie               return ptr - t + 2;
260*c87b03e5Sespie 
261*c87b03e5Sespie             case MAKE_A:
262*c87b03e5Sespie             case MAKE_J:
263*c87b03e5Sespie               ptr = (const unsigned char *) (t + i + 1);
264*c87b03e5Sespie               break;
265*c87b03e5Sespie 
266*c87b03e5Sespie             case ERROR:
267*c87b03e5Sespie             default:
268*c87b03e5Sespie               return -1;
269*c87b03e5Sespie             }
270*c87b03e5Sespie         }
271*c87b03e5Sespie 
272*c87b03e5Sespie       /* More than n bytes needed.  */
273*c87b03e5Sespie       return -1;
274*c87b03e5Sespie     }
275*c87b03e5Sespie 
276*c87b03e5Sespie #ifdef CROSS_COMPILE
277*c87b03e5Sespie   if (s == NULL)
278*c87b03e5Sespie     /* Not state-dependent.  */
279*c87b03e5Sespie     return 0;
280*c87b03e5Sespie 
281*c87b03e5Sespie   if (pwc != NULL)
282*c87b03e5Sespie     *pwc = *s;
283*c87b03e5Sespie   return 1;
284*c87b03e5Sespie #else
285*c87b03e5Sespie 
286*c87b03e5Sespie   /* This must be the "C" locale or unknown locale.  */
287*c87b03e5Sespie   return mbtowc (pwc, s, n);
288*c87b03e5Sespie #endif
289*c87b03e5Sespie }
290*c87b03e5Sespie 
/* Return the number of bytes in the multibyte character at the start
   of the buffer S of size N, or -1 if the bytes do not form a valid
   character.  The result is exactly what local_mbtowc returns for the
   same S and N when no wide character is requested, including its
   handling of a null S and of a null byte.

   This function behaves like the Standard C function mblen, except
   it treats locale names of the form "C-..." specially.  */

int
local_mblen (s, n)
     const char *s;
     size_t n;
{
  /* A null PWC asks local_mbtowc for the length only.  */
  return local_mbtowc (NULL, s, n);
}
305*c87b03e5Sespie 
306*c87b03e5Sespie /* Return the maximum mumber of bytes in a multibyte character.
307*c87b03e5Sespie 
308*c87b03e5Sespie    This function returns the same value as the Standard C macro MB_CUR_MAX,
309*c87b03e5Sespie    except it treats locale names of the form "C-..." specially.  */
310*c87b03e5Sespie 
311*c87b03e5Sespie int
local_mb_cur_max()312*c87b03e5Sespie local_mb_cur_max ()
313*c87b03e5Sespie {
314*c87b03e5Sespie   if (literal_codeset == NULL || strlen (literal_codeset) <= 1)
315*c87b03e5Sespie     ;
316*c87b03e5Sespie   else if (! strcmp (literal_codeset, "C-SJIS"))
317*c87b03e5Sespie     return 2;
318*c87b03e5Sespie   else if (! strcmp (literal_codeset, "C-EUCJP"))
319*c87b03e5Sespie     return 2;
320*c87b03e5Sespie   else if (! strcmp (literal_codeset, "C-JIS"))
321*c87b03e5Sespie     return 8; /* 3 + 2 + 3 */
322*c87b03e5Sespie 
323*c87b03e5Sespie #ifdef CROSS_COMPILE
324*c87b03e5Sespie   return 1;
325*c87b03e5Sespie #else
326*c87b03e5Sespie   if (MB_CUR_MAX > 0)
327*c87b03e5Sespie     return MB_CUR_MAX;
328*c87b03e5Sespie 
329*c87b03e5Sespie   return 1; /* default */
330*c87b03e5Sespie #endif
331*c87b03e5Sespie }
332*c87b03e5Sespie #else  /* MULTIBYTE_CHARS */
/* Without MULTIBYTE_CHARS nothing else is compiled from this file;
   this declaration keeps the translation unit from being empty.  */
extern int dummy;  /* silence 'ANSI C forbids an empty source file' warning */
334*c87b03e5Sespie #endif /* MULTIBYTE_CHARS */
335