xref: /reactos/sdk/tools/unicode/mbtowc.c (revision cc439606)
1 /*
2  * MultiByteToWideChar implementation
3  *
4  * Copyright 2000 Alexandre Julliard
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
19  */
20 
21 #include <string.h>
22 
23 #include "wine/unicode.h"
24 
25 extern unsigned int wine_decompose( WCHAR ch, WCHAR *dst, unsigned int dstlen ) DECLSPEC_HIDDEN;
26 
27 /* check the code whether it is in Unicode Private Use Area (PUA). */
28 /* MB_ERR_INVALID_CHARS raises an error converting from 1-byte character to PUA. */
29 static inline int is_private_use_area_char(WCHAR code)
30 {
31     return (code >= 0xe000 && code <= 0xf8ff);
32 }
33 
34 /* check src string for invalid chars; return non-zero if invalid char found */
35 static inline int check_invalid_chars_sbcs( const struct sbcs_table *table, int flags,
36                                             const unsigned char *src, unsigned int srclen )
37 {
38     const WCHAR * const cp2uni = (flags & MB_USEGLYPHCHARS) ? table->cp2uni_glyphs : table->cp2uni;
39     const WCHAR def_unicode_char = table->info.def_unicode_char;
40     const unsigned char def_char = table->uni2cp_low[table->uni2cp_high[def_unicode_char >> 8]
41                                                      + (def_unicode_char & 0xff)];
42     while (srclen)
43     {
44         if ((cp2uni[*src] == def_unicode_char && *src != def_char) ||
45             is_private_use_area_char(cp2uni[*src])) break;
46         src++;
47         srclen--;
48     }
49     return srclen;
50 }
51 
52 /* mbstowcs for single-byte code page */
53 /* all lengths are in characters, not bytes */
54 static inline int mbstowcs_sbcs( const struct sbcs_table *table, int flags,
55                                  const unsigned char *src, unsigned int srclen,
56                                  WCHAR *dst, unsigned int dstlen )
57 {
58     const WCHAR * const cp2uni = (flags & MB_USEGLYPHCHARS) ? table->cp2uni_glyphs : table->cp2uni;
59     int ret = srclen;
60 
61     if (dstlen < srclen)
62     {
63         /* buffer too small: fill it up to dstlen and return error */
64         srclen = dstlen;
65         ret = -1;
66     }
67 
68     for (;;)
69     {
70         switch(srclen)
71         {
72         default:
73         case 16: dst[15] = cp2uni[src[15]];
74         case 15: dst[14] = cp2uni[src[14]];
75         case 14: dst[13] = cp2uni[src[13]];
76         case 13: dst[12] = cp2uni[src[12]];
77         case 12: dst[11] = cp2uni[src[11]];
78         case 11: dst[10] = cp2uni[src[10]];
79         case 10: dst[9]  = cp2uni[src[9]];
80         case 9:  dst[8]  = cp2uni[src[8]];
81         case 8:  dst[7]  = cp2uni[src[7]];
82         case 7:  dst[6]  = cp2uni[src[6]];
83         case 6:  dst[5]  = cp2uni[src[5]];
84         case 5:  dst[4]  = cp2uni[src[4]];
85         case 4:  dst[3]  = cp2uni[src[3]];
86         case 3:  dst[2]  = cp2uni[src[2]];
87         case 2:  dst[1]  = cp2uni[src[1]];
88         case 1:  dst[0]  = cp2uni[src[0]];
89         case 0: break;
90         }
91         if (srclen < 16) return ret;
92         dst += 16;
93         src += 16;
94         srclen -= 16;
95     }
96 }
97 
98 /* mbstowcs for single-byte code page with char decomposition */
99 static int mbstowcs_sbcs_decompose( const struct sbcs_table *table, int flags,
100                                     const unsigned char *src, unsigned int srclen,
101                                     WCHAR *dst, unsigned int dstlen )
102 {
103     const WCHAR * const cp2uni = (flags & MB_USEGLYPHCHARS) ? table->cp2uni_glyphs : table->cp2uni;
104     unsigned int len;
105 
106     if (!dstlen)  /* compute length */
107     {
108         WCHAR dummy[4]; /* no decomposition is larger than 4 chars */
109         for (len = 0; srclen; srclen--, src++)
110             len += wine_decompose( cp2uni[*src], dummy, 4 );
111         return len;
112     }
113 
114     for (len = dstlen; srclen && len; srclen--, src++)
115     {
116         unsigned int res = wine_decompose( cp2uni[*src], dst, len );
117         if (!res) break;
118         len -= res;
119         dst += res;
120     }
121     if (srclen) return -1;  /* overflow */
122     return dstlen - len;
123 }
124 
125 /* query necessary dst length for src string */
126 static inline int get_length_dbcs( const struct dbcs_table *table,
127                                    const unsigned char *src, unsigned int srclen )
128 {
129     const unsigned char * const cp2uni_lb = table->cp2uni_leadbytes;
130     int len;
131 
132     for (len = 0; srclen; srclen--, src++, len++)
133     {
134         if (cp2uni_lb[*src] && srclen > 1 && src[1])
135         {
136             src++;
137             srclen--;
138         }
139     }
140     return len;
141 }
142 
143 /* check src string for invalid chars; return non-zero if invalid char found */
144 static inline int check_invalid_chars_dbcs( const struct dbcs_table *table,
145                                             const unsigned char *src, unsigned int srclen )
146 {
147     const WCHAR * const cp2uni = table->cp2uni;
148     const unsigned char * const cp2uni_lb = table->cp2uni_leadbytes;
149     const WCHAR def_unicode_char = table->info.def_unicode_char;
150     const unsigned short def_char = table->uni2cp_low[table->uni2cp_high[def_unicode_char >> 8]
151                                                       + (def_unicode_char & 0xff)];
152     while (srclen)
153     {
154         unsigned char off = cp2uni_lb[*src];
155         if (off)  /* multi-byte char */
156         {
157             if (srclen == 1) break;  /* partial char, error */
158             if (cp2uni[(off << 8) + src[1]] == def_unicode_char &&
159                 ((src[0] << 8) | src[1]) != def_char) break;
160             src++;
161             srclen--;
162         }
163         else if ((cp2uni[*src] == def_unicode_char && *src != def_char) ||
164                  is_private_use_area_char(cp2uni[*src])) break;
165         src++;
166         srclen--;
167     }
168     return srclen;
169 }
170 
171 /* mbstowcs for double-byte code page */
172 /* all lengths are in characters, not bytes */
173 static inline int mbstowcs_dbcs( const struct dbcs_table *table,
174                                  const unsigned char *src, unsigned int srclen,
175                                  WCHAR *dst, unsigned int dstlen )
176 {
177     const WCHAR * const cp2uni = table->cp2uni;
178     const unsigned char * const cp2uni_lb = table->cp2uni_leadbytes;
179     unsigned int len;
180 
181     if (!dstlen) return get_length_dbcs( table, src, srclen );
182 
183     for (len = dstlen; srclen && len; len--, srclen--, src++, dst++)
184     {
185         unsigned char off = cp2uni_lb[*src];
186         if (off && srclen > 1 && src[1])
187         {
188             src++;
189             srclen--;
190             *dst = cp2uni[(off << 8) + *src];
191         }
192         else *dst = cp2uni[*src];
193     }
194     if (srclen) return -1;  /* overflow */
195     return dstlen - len;
196 }
197 
198 
199 /* mbstowcs for double-byte code page with character decomposition */
200 static int mbstowcs_dbcs_decompose( const struct dbcs_table *table,
201                                     const unsigned char *src, unsigned int srclen,
202                                     WCHAR *dst, unsigned int dstlen )
203 {
204     const WCHAR * const cp2uni = table->cp2uni;
205     const unsigned char * const cp2uni_lb = table->cp2uni_leadbytes;
206     unsigned int len, res;
207     WCHAR ch;
208 
209     if (!dstlen)  /* compute length */
210     {
211         WCHAR dummy[4]; /* no decomposition is larger than 4 chars */
212         for (len = 0; srclen; srclen--, src++)
213         {
214             unsigned char off = cp2uni_lb[*src];
215             if (off && srclen > 1 && src[1])
216             {
217                 src++;
218                 srclen--;
219                 ch = cp2uni[(off << 8) + *src];
220             }
221             else ch = cp2uni[*src];
222             len += wine_decompose( ch, dummy, 4 );
223         }
224         return len;
225     }
226 
227     for (len = dstlen; srclen && len; srclen--, src++)
228     {
229         unsigned char off = cp2uni_lb[*src];
230         if (off && srclen > 1 && src[1])
231         {
232             src++;
233             srclen--;
234             ch = cp2uni[(off << 8) + *src];
235         }
236         else ch = cp2uni[*src];
237         if (!(res = wine_decompose( ch, dst, len ))) break;
238         dst += res;
239         len -= res;
240     }
241     if (srclen) return -1;  /* overflow */
242     return dstlen - len;
243 }
244 
245 
246 /* return -1 on dst buffer overflow, -2 on invalid input char */
247 int wine_cp_mbstowcs( const union cptable *table, int flags,
248                       const char *s, int srclen,
249                       WCHAR *dst, int dstlen )
250 {
251     const unsigned char *src = (const unsigned char*) s;
252 
253     if (table->info.char_size == 1)
254     {
255         if (flags & MB_ERR_INVALID_CHARS)
256         {
257             if (check_invalid_chars_sbcs( &table->sbcs, flags, src, srclen )) return -2;
258         }
259         if (!(flags & MB_COMPOSITE))
260         {
261             if (!dstlen) return srclen;
262             return mbstowcs_sbcs( &table->sbcs, flags, src, srclen, dst, dstlen );
263         }
264         return mbstowcs_sbcs_decompose( &table->sbcs, flags, src, srclen, dst, dstlen );
265     }
266     else /* mbcs */
267     {
268         if (flags & MB_ERR_INVALID_CHARS)
269         {
270             if (check_invalid_chars_dbcs( &table->dbcs, src, srclen )) return -2;
271         }
272         if (!(flags & MB_COMPOSITE))
273             return mbstowcs_dbcs( &table->dbcs, src, srclen, dst, dstlen );
274         else
275             return mbstowcs_dbcs_decompose( &table->dbcs, src, srclen, dst, dstlen );
276     }
277 }
278