xref: /reactos/sdk/tools/unicode/mbtowc.c (revision 1734f297)
1 /*
2  * MultiByteToWideChar implementation
3  *
4  * Copyright 2000 Alexandre Julliard
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
19  */
20 
21 #include <string.h>
22 
23 #include "wine/unicode.h"
24 
25 extern unsigned int wine_decompose( int flags, WCHAR ch, WCHAR *dst, unsigned int dstlen ) DECLSPEC_HIDDEN;
26 
27 /* check the code whether it is in Unicode Private Use Area (PUA). */
28 /* MB_ERR_INVALID_CHARS raises an error converting from 1-byte character to PUA. */
29 static inline int is_private_use_area_char(WCHAR code)
30 {
31     return (code >= 0xe000 && code <= 0xf8ff);
32 }
33 
34 /* check src string for invalid chars; return non-zero if invalid char found */
35 static inline int check_invalid_chars_sbcs( const struct sbcs_table *table, int flags,
36                                             const unsigned char *src, unsigned int srclen )
37 {
38     const WCHAR * const cp2uni = (flags & MB_USEGLYPHCHARS) ? table->cp2uni_glyphs : table->cp2uni;
39     const WCHAR def_unicode_char = table->info.def_unicode_char;
40     const unsigned char def_char = table->uni2cp_low[table->uni2cp_high[def_unicode_char >> 8]
41                                                      + (def_unicode_char & 0xff)];
42     while (srclen)
43     {
44         if ((cp2uni[*src] == def_unicode_char && *src != def_char) ||
45             is_private_use_area_char(cp2uni[*src])) break;
46         src++;
47         srclen--;
48     }
49     return srclen;
50 }
51 
52 /* mbstowcs for single-byte code page */
53 /* all lengths are in characters, not bytes */
54 static inline int mbstowcs_sbcs( const struct sbcs_table *table, int flags,
55                                  const unsigned char *src, unsigned int srclen,
56                                  WCHAR *dst, unsigned int dstlen )
57 {
58     const WCHAR * const cp2uni = (flags & MB_USEGLYPHCHARS) ? table->cp2uni_glyphs : table->cp2uni;
59     int ret = srclen;
60 
61     if (dstlen < srclen)
62     {
63         /* buffer too small: fill it up to dstlen and return error */
64         srclen = dstlen;
65         ret = -1;
66     }
67 
68     while (srclen >= 16)
69     {
70         dst[0]  = cp2uni[src[0]];
71         dst[1]  = cp2uni[src[1]];
72         dst[2]  = cp2uni[src[2]];
73         dst[3]  = cp2uni[src[3]];
74         dst[4]  = cp2uni[src[4]];
75         dst[5]  = cp2uni[src[5]];
76         dst[6]  = cp2uni[src[6]];
77         dst[7]  = cp2uni[src[7]];
78         dst[8]  = cp2uni[src[8]];
79         dst[9]  = cp2uni[src[9]];
80         dst[10] = cp2uni[src[10]];
81         dst[11] = cp2uni[src[11]];
82         dst[12] = cp2uni[src[12]];
83         dst[13] = cp2uni[src[13]];
84         dst[14] = cp2uni[src[14]];
85         dst[15] = cp2uni[src[15]];
86         src += 16;
87         dst += 16;
88         srclen -= 16;
89     }
90 
91     /* now handle the remaining characters */
92     src += srclen;
93     dst += srclen;
94     switch (srclen)
95     {
96     case 15: dst[-15] = cp2uni[src[-15]];
97     case 14: dst[-14] = cp2uni[src[-14]];
98     case 13: dst[-13] = cp2uni[src[-13]];
99     case 12: dst[-12] = cp2uni[src[-12]];
100     case 11: dst[-11] = cp2uni[src[-11]];
101     case 10: dst[-10] = cp2uni[src[-10]];
102     case 9:  dst[-9]  = cp2uni[src[-9]];
103     case 8:  dst[-8]  = cp2uni[src[-8]];
104     case 7:  dst[-7]  = cp2uni[src[-7]];
105     case 6:  dst[-6]  = cp2uni[src[-6]];
106     case 5:  dst[-5]  = cp2uni[src[-5]];
107     case 4:  dst[-4]  = cp2uni[src[-4]];
108     case 3:  dst[-3]  = cp2uni[src[-3]];
109     case 2:  dst[-2]  = cp2uni[src[-2]];
110     case 1:  dst[-1]  = cp2uni[src[-1]];
111     case 0: break;
112     }
113     return ret;
114 }
115 
116 /* mbstowcs for single-byte code page with char decomposition */
117 static int mbstowcs_sbcs_decompose( const struct sbcs_table *table, int flags,
118                                     const unsigned char *src, unsigned int srclen,
119                                     WCHAR *dst, unsigned int dstlen )
120 {
121     const WCHAR * const cp2uni = (flags & MB_USEGLYPHCHARS) ? table->cp2uni_glyphs : table->cp2uni;
122     unsigned int len;
123 
124     if (!dstlen)  /* compute length */
125     {
126         WCHAR dummy[4]; /* no decomposition is larger than 4 chars */
127         for (len = 0; srclen; srclen--, src++)
128             len += wine_decompose( 0, cp2uni[*src], dummy, 4 );
129         return len;
130     }
131 
132     for (len = dstlen; srclen && len; srclen--, src++)
133     {
134         unsigned int res = wine_decompose( 0, cp2uni[*src], dst, len );
135         if (!res) break;
136         len -= res;
137         dst += res;
138     }
139     if (srclen) return -1;  /* overflow */
140     return dstlen - len;
141 }
142 
143 /* query necessary dst length for src string */
144 static inline int get_length_dbcs( const struct dbcs_table *table,
145                                    const unsigned char *src, unsigned int srclen )
146 {
147     const unsigned char * const cp2uni_lb = table->cp2uni_leadbytes;
148     int len;
149 
150     for (len = 0; srclen; srclen--, src++, len++)
151     {
152         if (cp2uni_lb[*src] && srclen > 1 && src[1])
153         {
154             src++;
155             srclen--;
156         }
157     }
158     return len;
159 }
160 
161 /* check src string for invalid chars; return non-zero if invalid char found */
162 static inline int check_invalid_chars_dbcs( const struct dbcs_table *table,
163                                             const unsigned char *src, unsigned int srclen )
164 {
165     const WCHAR * const cp2uni = table->cp2uni;
166     const unsigned char * const cp2uni_lb = table->cp2uni_leadbytes;
167     const WCHAR def_unicode_char = table->info.def_unicode_char;
168     const unsigned short def_char = table->uni2cp_low[table->uni2cp_high[def_unicode_char >> 8]
169                                                       + (def_unicode_char & 0xff)];
170     while (srclen)
171     {
172         unsigned char off = cp2uni_lb[*src];
173         if (off)  /* multi-byte char */
174         {
175             if (srclen == 1) break;  /* partial char, error */
176             if (cp2uni[(off << 8) + src[1]] == def_unicode_char &&
177                 ((src[0] << 8) | src[1]) != def_char) break;
178             src++;
179             srclen--;
180         }
181         else if ((cp2uni[*src] == def_unicode_char && *src != def_char) ||
182                  is_private_use_area_char(cp2uni[*src])) break;
183         src++;
184         srclen--;
185     }
186     return srclen;
187 }
188 
189 /* mbstowcs for double-byte code page */
190 /* all lengths are in characters, not bytes */
191 static inline int mbstowcs_dbcs( const struct dbcs_table *table,
192                                  const unsigned char *src, unsigned int srclen,
193                                  WCHAR *dst, unsigned int dstlen )
194 {
195     const WCHAR * const cp2uni = table->cp2uni;
196     const unsigned char * const cp2uni_lb = table->cp2uni_leadbytes;
197     unsigned int len;
198 
199     if (!dstlen) return get_length_dbcs( table, src, srclen );
200 
201     for (len = dstlen; srclen && len; len--, srclen--, src++, dst++)
202     {
203         unsigned char off = cp2uni_lb[*src];
204         if (off && srclen > 1 && src[1])
205         {
206             src++;
207             srclen--;
208             *dst = cp2uni[(off << 8) + *src];
209         }
210         else *dst = cp2uni[*src];
211     }
212     if (srclen) return -1;  /* overflow */
213     return dstlen - len;
214 }
215 
216 
217 /* mbstowcs for double-byte code page with character decomposition */
218 static int mbstowcs_dbcs_decompose( const struct dbcs_table *table,
219                                     const unsigned char *src, unsigned int srclen,
220                                     WCHAR *dst, unsigned int dstlen )
221 {
222     const WCHAR * const cp2uni = table->cp2uni;
223     const unsigned char * const cp2uni_lb = table->cp2uni_leadbytes;
224     unsigned int len, res;
225     WCHAR ch;
226 
227     if (!dstlen)  /* compute length */
228     {
229         WCHAR dummy[4]; /* no decomposition is larger than 4 chars */
230         for (len = 0; srclen; srclen--, src++)
231         {
232             unsigned char off = cp2uni_lb[*src];
233             if (off && srclen > 1 && src[1])
234             {
235                 src++;
236                 srclen--;
237                 ch = cp2uni[(off << 8) + *src];
238             }
239             else ch = cp2uni[*src];
240             len += wine_decompose( 0, ch, dummy, 4 );
241         }
242         return len;
243     }
244 
245     for (len = dstlen; srclen && len; srclen--, src++)
246     {
247         unsigned char off = cp2uni_lb[*src];
248         if (off && srclen > 1 && src[1])
249         {
250             src++;
251             srclen--;
252             ch = cp2uni[(off << 8) + *src];
253         }
254         else ch = cp2uni[*src];
255         if (!(res = wine_decompose( 0, ch, dst, len ))) break;
256         dst += res;
257         len -= res;
258     }
259     if (srclen) return -1;  /* overflow */
260     return dstlen - len;
261 }
262 
263 
264 /* return -1 on dst buffer overflow, -2 on invalid input char */
265 int wine_cp_mbstowcs( const union cptable *table, int flags,
266                       const char *s, int srclen,
267                       WCHAR *dst, int dstlen )
268 {
269     const unsigned char *src = (const unsigned char*) s;
270 
271     if (table->info.char_size == 1)
272     {
273         if (flags & MB_ERR_INVALID_CHARS)
274         {
275             if (check_invalid_chars_sbcs( &table->sbcs, flags, src, srclen )) return -2;
276         }
277         if (!(flags & MB_COMPOSITE))
278         {
279             if (!dstlen) return srclen;
280             return mbstowcs_sbcs( &table->sbcs, flags, src, srclen, dst, dstlen );
281         }
282         return mbstowcs_sbcs_decompose( &table->sbcs, flags, src, srclen, dst, dstlen );
283     }
284     else /* mbcs */
285     {
286         if (flags & MB_ERR_INVALID_CHARS)
287         {
288             if (check_invalid_chars_dbcs( &table->dbcs, src, srclen )) return -2;
289         }
290         if (!(flags & MB_COMPOSITE))
291             return mbstowcs_dbcs( &table->dbcs, src, srclen, dst, dstlen );
292         else
293             return mbstowcs_dbcs_decompose( &table->dbcs, src, srclen, dst, dstlen );
294     }
295 }
296