xref: /openbsd/gnu/usr.bin/gcc/gcc/mbchar.c (revision c87b03e5)
1 /* Multibyte Character Functions.
2    Copyright (C) 1998 Free Software Foundation, Inc.
3 
4 This file is part of GCC.
5 
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2, or (at your option) any later
9 version.
10 
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING.  If not, write to the Free
18 Software Foundation, 59 Temple Place - Suite 330, Boston, MA
19 02111-1307, USA.  */
20 
21 /* Note regarding cross compilation:
22 
23    In general, translation of multibyte characters to wide characters can
24    only work in a native compiler since the translation function (mbtowc)
25    needs to know about both the source and target character encoding.  However,
26    this particular implementation for JIS, SJIS and EUCJP source characters
27    will work for any compiler with a newlib target.  Other targets may also
28    work provided that their wchar_t implementation is 2 bytes and the encoding
29    leaves the source character values unchanged (except for removing the
30    state shifting markers).  */
31 
32 #include "config.h"
33 #ifdef MULTIBYTE_CHARS
34 #include "system.h"
35 #include "mbchar.h"
36 #include <locale.h>
37 
/* Class of a single input byte.  The class selects the column used in
   JIS_state_table and JIS_action_table.  */
typedef enum
{
  ESCAPE,     /* JIS_ESC_CHAR */
  DOLLAR,     /* '$' */
  BRACKET,    /* '(' */
  AT,         /* '@' */
  B,          /* 'B' */
  J,          /* 'J' */
  NUL,        /* '\0' */
  JIS_CHAR,   /* any other byte for which ISJIS is true */
  OTHER,      /* every remaining byte */
  JIS_C_NUM   /* number of classes; sizes the table columns */
} JIS_CHAR_TYPE;
40 
/* States of the JIS decoder.  The state selects the row used in
   JIS_state_table and JIS_action_table.  */
typedef enum
{
  ASCII,      /* single-byte mode; also the initial state */
  A_ESC,      /* single-byte mode, escape byte seen */
  A_ESC_DL,   /* single-byte mode, escape and '$' seen */
  JIS,        /* two-byte mode, at the start of a character */
  JIS_1,      /* two-byte mode, first byte of a character read */
  JIS_2,      /* two-byte mode, both bytes of a character read */
  J_ESC,      /* escape byte seen while in state JIS */
  J_ESC_BR,   /* '(' seen while in state J_ESC */
  J2_ESC,     /* escape byte seen while in state JIS_2 */
  J2_ESC_BR,  /* '(' seen while in state J2_ESC */
  INV,        /* invalid input; always paired with action ERROR */
  JIS_S_NUM   /* number of states; sizes the table rows */
} JIS_STATE;
43 
/* What the JIS decoder does after classifying a byte.  */
typedef enum
{
  COPYA,      /* store one byte as the wide character and return */
  COPYJ,      /* store a two-byte character; the byte just read is
                 counted in the returned length */
  COPYJ2,     /* store a two-byte character; the byte just read is
                 not counted in the returned length */
  MAKE_A,     /* handled exactly like MAKE_J; not used by the table */
  MAKE_J,     /* remember that the character starts after this byte */
  NOOP,       /* nothing to do yet; read the next byte */
  EMPTY,      /* store a zero wide character and return */
  ERROR       /* the bytes are invalid; fail */
} JIS_ACTION;
46 
/* State/action tables for processing JIS encoding:

   Where possible, switches to JIS are grouped with the JIS characters
   that follow them, and switches to ASCII are grouped with the JIS
   characters that precede them.  Each switch is a three-byte escape
   sequence, so the maximum returned length is:
     3 (switch to JIS) + 2 (JIS character) + 3 (switch back to ASCII) = 8.  */
53 
54 static const JIS_STATE JIS_state_table[JIS_S_NUM][JIS_C_NUM] = {
55 /*            ESCAPE DOLLAR   BRACKET   AT     B      J     NUL JIS_CHAR OTH*/
56 /*ASCII*/   { A_ESC, ASCII,   ASCII,    ASCII, ASCII, ASCII, ASCII,ASCII,ASCII},
57 /*A_ESC*/   { ASCII, A_ESC_DL,ASCII,    ASCII, ASCII, ASCII, ASCII,ASCII,ASCII},
58 /*A_ESC_DL*/{ ASCII, ASCII,   ASCII,    JIS,   JIS,   ASCII, ASCII,ASCII,ASCII},
59 /*JIS*/     { J_ESC, JIS_1,   JIS_1,    JIS_1, JIS_1, JIS_1, INV,  JIS_1,INV },
60 /*JIS_1*/   { INV,   JIS_2,   JIS_2,    JIS_2, JIS_2, JIS_2, INV,  JIS_2,INV },
61 /*JIS_2*/   { J2_ESC,JIS,     JIS,      JIS,   JIS,   JIS,   INV,  JIS,  JIS },
62 /*J_ESC*/   { INV,   INV,     J_ESC_BR, INV,   INV,   INV,   INV,  INV,  INV },
63 /*J_ESC_BR*/{ INV,   INV,     INV,      INV,   ASCII, ASCII, INV,  INV,  INV },
64 /*J2_ESC*/  { INV,   INV,     J2_ESC_BR,INV,   INV,   INV,   INV,  INV,  INV },
65 /*J2_ESC_BR*/{INV,   INV,     INV,      INV,   ASCII, ASCII, INV,  INV,  INV },
66 };
67 
68 static const JIS_ACTION JIS_action_table[JIS_S_NUM][JIS_C_NUM] = {
69 /*            ESCAPE DOLLAR BRACKET AT     B       J      NUL  JIS_CHAR OTH */
70 /*ASCII */   {NOOP,  COPYA, COPYA, COPYA,  COPYA,  COPYA, EMPTY, COPYA, COPYA},
71 /*A_ESC */   {COPYA, NOOP,  COPYA, COPYA,  COPYA,  COPYA, COPYA, COPYA, COPYA},
72 /*A_ESC_DL */{COPYA, COPYA, COPYA, MAKE_J, MAKE_J, COPYA, COPYA, COPYA, COPYA},
73 /*JIS */     {NOOP,  NOOP,  NOOP,  NOOP,   NOOP,   NOOP,  ERROR, NOOP,  ERROR},
74 /*JIS_1 */   {ERROR, NOOP,  NOOP,  NOOP,   NOOP,   NOOP,  ERROR, NOOP,  ERROR},
75 /*JIS_2 */   {NOOP,  COPYJ2,COPYJ2,COPYJ2, COPYJ2, COPYJ2,ERROR, COPYJ2,COPYJ2},
76 /*J_ESC */   {ERROR, ERROR, NOOP,  ERROR,  ERROR,  ERROR, ERROR, ERROR, ERROR},
77 /*J_ESC_BR */{ERROR, ERROR, ERROR, ERROR,  NOOP,   NOOP,  ERROR, ERROR, ERROR},
78 /*J2_ESC */  {ERROR, ERROR, NOOP,  ERROR,  ERROR,  ERROR, ERROR, ERROR, ERROR},
79 /*J2_ESC_BR*/{ERROR, ERROR, ERROR, ERROR,  COPYJ,  COPYJ, ERROR, ERROR, ERROR},
80 };
81 
82 
/* Name of the codeset used by the functions in this file.  The names
   "C-SJIS", "C-EUCJP" and "C-JIS" select the built-in decoders; a null
   pointer, a name of at most one character, or any other name selects
   the fallback behavior.  Nothing in this file assigns it after this
   definition -- presumably a caller sets it; confirm against users.  */
const char *literal_codeset = NULL;
84 
85 /* Store into *PWC (if PWC is not null) the wide character
86    corresponding to the multibyte character at the start of the
87    buffer S of size N.  Return the number of bytes in the multibyte
88    character.  Return -1 if the bytes do not form a valid character,
89    or 0 if S is null or points to a null byte.
90 
91    This function behaves like the Standard C function mbtowc, except
92    it treats locale names of the form "C-..." specially.  */
93 
94 int
local_mbtowc(pwc,s,n)95 local_mbtowc (pwc, s, n)
96      wchar_t *pwc;
97      const char *s;
98      size_t n;
99 {
100   static JIS_STATE save_state = ASCII;
101   JIS_STATE curr_state = save_state;
102   const unsigned char *t = (const unsigned char *) s;
103 
104   if (s != NULL && n == 0)
105     return -1;
106 
107   if (literal_codeset == NULL || strlen (literal_codeset) <= 1)
108     /* This must be the "C" locale or unknown locale -- fall thru */
109     ;
110   else if (! strcmp (literal_codeset, "C-SJIS"))
111     {
112       int char1;
113       if (s == NULL)
114 	/* Not state-dependent.  */
115         return 0;
116 
117       char1 = *t;
118       if (ISSJIS1 (char1))
119         {
120           int char2 = t[1];
121 
122           if (n <= 1)
123             return -1;
124 
125           if (ISSJIS2 (char2))
126             {
127 	      if (pwc != NULL)
128 		*pwc = (((wchar_t) *t) << 8) + (wchar_t) (*(t + 1));
129               return 2;
130             }
131 
132 	  return -1;
133         }
134 
135       if (pwc != NULL)
136 	*pwc = (wchar_t) *t;
137 
138       if (*t == '\0')
139 	return 0;
140 
141       return 1;
142     }
143   else if (! strcmp (literal_codeset, "C-EUCJP"))
144     {
145       int char1;
146 
147       if (s == NULL)
148 	/* Not state-dependent.  */
149         return 0;
150 
151       char1 = *t;
152       if (ISEUCJP (char1))
153         {
154           int char2 = t[1];
155 
156           if (n <= 1)
157             return -1;
158 
159           if (ISEUCJP (char2))
160             {
161 	      if (pwc != NULL)
162 		*pwc = (((wchar_t) *t) << 8) + (wchar_t) (*(t + 1));
163               return 2;
164             }
165 
166 	  return -1;
167         }
168 
169       if (pwc != NULL)
170 	*pwc = (wchar_t) *t;
171 
172       if (*t == '\0')
173 	return 0;
174 
175       return 1;
176     }
177   else if (! strcmp (literal_codeset, "C-JIS"))
178     {
179       JIS_ACTION action;
180       JIS_CHAR_TYPE ch;
181       const unsigned char *ptr;
182       size_t i, curr_ch;
183 
184       if (s == NULL)
185 	{
186 	  save_state = ASCII;
187 	  /* State-dependent.  */
188 	  return 1;
189 	}
190 
191       ptr = t;
192 
193       for (i = 0; i < n; i++)
194         {
195           curr_ch = t[i];
196           switch (curr_ch)
197             {
198 	    case JIS_ESC_CHAR:
199               ch = ESCAPE;
200               break;
201 	    case '$':
202               ch = DOLLAR;
203               break;
204             case '@':
205               ch = AT;
206               break;
207             case '(':
208 	      ch = BRACKET;
209               break;
210             case 'B':
211               ch = B;
212               break;
213             case 'J':
214               ch = J;
215               break;
216             case '\0':
217               ch = NUL;
218               break;
219             default:
220               if (ISJIS (curr_ch))
221                 ch = JIS_CHAR;
222               else
223                 ch = OTHER;
224 	    }
225 
226           action = JIS_action_table[curr_state][ch];
227           curr_state = JIS_state_table[curr_state][ch];
228 
229           switch (action)
230             {
231             case NOOP:
232               break;
233 
234             case EMPTY:
235 	      if (pwc != NULL)
236 		*pwc = (wchar_t) 0;
237 
238 	      save_state = curr_state;
239               return i;
240 
241             case COPYA:
242 	      if (pwc != NULL)
243 		*pwc = (wchar_t) *ptr;
244 	      save_state = curr_state;
245               return i + 1;
246 
247             case COPYJ:
248 	      if (pwc != NULL)
249 		*pwc = (((wchar_t) *ptr) << 8) + (wchar_t) (*(ptr + 1));
250 
251 	      save_state = curr_state;
252               return i + 1;
253 
254             case COPYJ2:
255 	      if (pwc != NULL)
256 		*pwc = (((wchar_t) *ptr) << 8) + (wchar_t) (*(ptr + 1));
257 
258 	      save_state = curr_state;
259               return ptr - t + 2;
260 
261             case MAKE_A:
262             case MAKE_J:
263               ptr = (const unsigned char *) (t + i + 1);
264               break;
265 
266             case ERROR:
267             default:
268               return -1;
269             }
270         }
271 
272       /* More than n bytes needed.  */
273       return -1;
274     }
275 
276 #ifdef CROSS_COMPILE
277   if (s == NULL)
278     /* Not state-dependent.  */
279     return 0;
280 
281   if (pwc != NULL)
282     *pwc = *s;
283   return 1;
284 #else
285 
286   /* This must be the "C" locale or unknown locale.  */
287   return mbtowc (pwc, s, n);
288 #endif
289 }
290 
/* Measure the multibyte character at the start of the buffer S of
   size N without storing its wide-character value.  The result is
   exactly what local_mbtowc returns when given a null destination:
   the number of bytes in the character, or -1 if the bytes do not
   form a valid character.

   This function behaves like the Standard C function mblen, except
   it treats locale names of the form "C-..." specially.  */

int
local_mblen (s, n)
     const char *s;
     size_t n;
{
  /* A null destination makes local_mbtowc skip the store.  */
  int length = local_mbtowc (NULL, s, n);

  return length;
}
305 
306 /* Return the maximum mumber of bytes in a multibyte character.
307 
308    This function returns the same value as the Standard C macro MB_CUR_MAX,
309    except it treats locale names of the form "C-..." specially.  */
310 
311 int
local_mb_cur_max()312 local_mb_cur_max ()
313 {
314   if (literal_codeset == NULL || strlen (literal_codeset) <= 1)
315     ;
316   else if (! strcmp (literal_codeset, "C-SJIS"))
317     return 2;
318   else if (! strcmp (literal_codeset, "C-EUCJP"))
319     return 2;
320   else if (! strcmp (literal_codeset, "C-JIS"))
321     return 8; /* 3 + 2 + 3 */
322 
323 #ifdef CROSS_COMPILE
324   return 1;
325 #else
326   if (MB_CUR_MAX > 0)
327     return MB_CUR_MAX;
328 
329   return 1; /* default */
330 #endif
331 }
332 #else  /* MULTIBYTE_CHARS */
333 extern int dummy;  /* silence 'ANSI C forbids an empty source file' warning */
334 #endif /* MULTIBYTE_CHARS */
335