xref: /reactos/sdk/tools/unicode/wctomb.c (revision f2df3bf0)
1 /*
2  * WideCharToMultiByte implementation
3  *
4  * Copyright 2000 Alexandre Julliard
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
19  */
20 
21 #include <string.h>
22 
23 #include "wine/unicode.h"
24 
25 extern WCHAR wine_compose( const WCHAR *str ) DECLSPEC_HIDDEN;
26 
27 /****************************************************************/
28 /* sbcs support */
29 
30 /* check if 'ch' is an acceptable sbcs mapping for 'wch' */
31 static inline int is_valid_sbcs_mapping( const struct sbcs_table *table, int flags,
32                                          WCHAR wch, unsigned char ch )
33 {
34     if ((flags & WC_NO_BEST_FIT_CHARS) || ch == (unsigned char)table->info.def_char)
35         return (table->cp2uni[ch] == wch);
36     return 1;
37 }
38 
39 /* query necessary dst length for src string */
40 static int get_length_sbcs( const struct sbcs_table *table, int flags,
41                             const WCHAR *src, unsigned int srclen, int *used )
42 {
43     const unsigned char  * const uni2cp_low = table->uni2cp_low;
44     const unsigned short * const uni2cp_high = table->uni2cp_high;
45     int ret, tmp;
46     WCHAR composed;
47 
48     if (!used) used = &tmp;  /* avoid checking on every char */
49     *used = 0;
50 
51     for (ret = 0; srclen; ret++, src++, srclen--)
52     {
53         WCHAR wch = *src;
54         unsigned char ch;
55 
56         if ((flags & WC_COMPOSITECHECK) && (srclen > 1) && (composed = wine_compose(src)))
57         {
58             /* now check if we can use the composed char */
59             ch = uni2cp_low[uni2cp_high[composed >> 8] + (composed & 0xff)];
60             if (is_valid_sbcs_mapping( table, flags, composed, ch ))
61             {
62                 /* we have a good mapping, use it */
63                 src++;
64                 srclen--;
65                 continue;
66             }
67             /* no mapping for the composed char, check the other flags */
68             if (flags & WC_DEFAULTCHAR) /* use the default char instead */
69             {
70                 *used = 1;
71                 src++;  /* skip the non-spacing char */
72                 srclen--;
73                 continue;
74             }
75             if (flags & WC_DISCARDNS) /* skip the second char of the composition */
76             {
77                 src++;
78                 srclen--;
79             }
80             /* WC_SEPCHARS is the default */
81         }
82         if (!*used)
83         {
84             ch = uni2cp_low[uni2cp_high[wch >> 8] + (wch & 0xff)];
85             *used = !is_valid_sbcs_mapping( table, flags, wch, ch );
86         }
87     }
88     return ret;
89 }
90 
91 /* wcstombs for single-byte code page */
92 static inline int wcstombs_sbcs( const struct sbcs_table *table,
93                                  const WCHAR *src, unsigned int srclen,
94                                  char *dst, unsigned int dstlen )
95 {
96     const unsigned char  * const uni2cp_low = table->uni2cp_low;
97     const unsigned short * const uni2cp_high = table->uni2cp_high;
98     int ret = srclen;
99 
100     if (dstlen < srclen)
101     {
102         /* buffer too small: fill it up to dstlen and return error */
103         srclen = dstlen;
104         ret = -1;
105     }
106 
107     while (srclen >= 16)
108     {
109         dst[0]  = uni2cp_low[uni2cp_high[src[0]  >> 8] + (src[0]  & 0xff)];
110         dst[1]  = uni2cp_low[uni2cp_high[src[1]  >> 8] + (src[1]  & 0xff)];
111         dst[2]  = uni2cp_low[uni2cp_high[src[2]  >> 8] + (src[2]  & 0xff)];
112         dst[3]  = uni2cp_low[uni2cp_high[src[3]  >> 8] + (src[3]  & 0xff)];
113         dst[4]  = uni2cp_low[uni2cp_high[src[4]  >> 8] + (src[4]  & 0xff)];
114         dst[5]  = uni2cp_low[uni2cp_high[src[5]  >> 8] + (src[5]  & 0xff)];
115         dst[6]  = uni2cp_low[uni2cp_high[src[6]  >> 8] + (src[6]  & 0xff)];
116         dst[7]  = uni2cp_low[uni2cp_high[src[7]  >> 8] + (src[7]  & 0xff)];
117         dst[8]  = uni2cp_low[uni2cp_high[src[8]  >> 8] + (src[8]  & 0xff)];
118         dst[9]  = uni2cp_low[uni2cp_high[src[9]  >> 8] + (src[9]  & 0xff)];
119         dst[10] = uni2cp_low[uni2cp_high[src[10] >> 8] + (src[10] & 0xff)];
120         dst[11] = uni2cp_low[uni2cp_high[src[11] >> 8] + (src[11] & 0xff)];
121         dst[12] = uni2cp_low[uni2cp_high[src[12] >> 8] + (src[12] & 0xff)];
122         dst[13] = uni2cp_low[uni2cp_high[src[13] >> 8] + (src[13] & 0xff)];
123         dst[14] = uni2cp_low[uni2cp_high[src[14] >> 8] + (src[14] & 0xff)];
124         dst[15] = uni2cp_low[uni2cp_high[src[15] >> 8] + (src[15] & 0xff)];
125         src += 16;
126         dst += 16;
127         srclen -= 16;
128     }
129 
130     /* now handle remaining characters */
131     src += srclen;
132     dst += srclen;
133     switch(srclen)
134     {
135     case 15: dst[-15] = uni2cp_low[uni2cp_high[src[-15] >> 8] + (src[-15] & 0xff)];
136     case 14: dst[-14] = uni2cp_low[uni2cp_high[src[-14] >> 8] + (src[-14] & 0xff)];
137     case 13: dst[-13] = uni2cp_low[uni2cp_high[src[-13] >> 8] + (src[-13] & 0xff)];
138     case 12: dst[-12] = uni2cp_low[uni2cp_high[src[-12] >> 8] + (src[-12] & 0xff)];
139     case 11: dst[-11] = uni2cp_low[uni2cp_high[src[-11] >> 8] + (src[-11] & 0xff)];
140     case 10: dst[-10] = uni2cp_low[uni2cp_high[src[-10] >> 8] + (src[-10] & 0xff)];
141     case 9:  dst[-9]  = uni2cp_low[uni2cp_high[src[-9]  >> 8] + (src[-9]  & 0xff)];
142     case 8:  dst[-8]  = uni2cp_low[uni2cp_high[src[-8]  >> 8] + (src[-8]  & 0xff)];
143     case 7:  dst[-7]  = uni2cp_low[uni2cp_high[src[-7]  >> 8] + (src[-7]  & 0xff)];
144     case 6:  dst[-6]  = uni2cp_low[uni2cp_high[src[-6]  >> 8] + (src[-6]  & 0xff)];
145     case 5:  dst[-5]  = uni2cp_low[uni2cp_high[src[-5]  >> 8] + (src[-5]  & 0xff)];
146     case 4:  dst[-4]  = uni2cp_low[uni2cp_high[src[-4]  >> 8] + (src[-4]  & 0xff)];
147     case 3:  dst[-3]  = uni2cp_low[uni2cp_high[src[-3]  >> 8] + (src[-3]  & 0xff)];
148     case 2:  dst[-2]  = uni2cp_low[uni2cp_high[src[-2]  >> 8] + (src[-2]  & 0xff)];
149     case 1:  dst[-1]  = uni2cp_low[uni2cp_high[src[-1]  >> 8] + (src[-1]  & 0xff)];
150     case 0: break;
151     }
152     return ret;
153 }
154 
155 /* slow version of wcstombs_sbcs that handles the various flags */
156 static int wcstombs_sbcs_slow( const struct sbcs_table *table, int flags,
157                                const WCHAR *src, unsigned int srclen,
158                                char *dst, unsigned int dstlen,
159                                const char *defchar, int *used )
160 {
161     const unsigned char  * const uni2cp_low = table->uni2cp_low;
162     const unsigned short * const uni2cp_high = table->uni2cp_high;
163     unsigned char def;
164     unsigned int len;
165     int tmp;
166     WCHAR composed;
167 
168     if (!defchar)
169         def = table->info.def_char & 0xff;
170     else
171         def = *defchar;
172 
173     if (!used) used = &tmp;  /* avoid checking on every char */
174     *used = 0;
175 
176     for (len = dstlen; srclen && len; dst++, len--, src++, srclen--)
177     {
178         WCHAR wch = *src;
179 
180         if ((flags & WC_COMPOSITECHECK) && (srclen > 1) && (composed = wine_compose(src)))
181         {
182             /* now check if we can use the composed char */
183             *dst = uni2cp_low[uni2cp_high[composed >> 8] + (composed & 0xff)];
184             if (is_valid_sbcs_mapping( table, flags, composed, *dst ))
185             {
186                 /* we have a good mapping, use it */
187                 src++;
188                 srclen--;
189                 continue;
190             }
191             /* no mapping for the composed char, check the other flags */
192             if (flags & WC_DEFAULTCHAR) /* use the default char instead */
193             {
194                 *dst = def;
195                 *used = 1;
196                 src++;  /* skip the non-spacing char */
197                 srclen--;
198                 continue;
199             }
200             if (flags & WC_DISCARDNS) /* skip the second char of the composition */
201             {
202                 src++;
203                 srclen--;
204             }
205             /* WC_SEPCHARS is the default */
206         }
207 
208         *dst = uni2cp_low[uni2cp_high[wch >> 8] + (wch & 0xff)];
209         if (!is_valid_sbcs_mapping( table, flags, wch, *dst ))
210         {
211             *dst = def;
212             *used = 1;
213         }
214     }
215     if (srclen) return -1;  /* overflow */
216     return dstlen - len;
217 }
218 
219 
220 /****************************************************************/
221 /* dbcs support */
222 
223 /* check if 'ch' is an acceptable dbcs mapping for 'wch' */
224 static inline int is_valid_dbcs_mapping( const struct dbcs_table *table, int flags,
225                                          WCHAR wch, unsigned short ch )
226 {
227     if ((flags & WC_NO_BEST_FIT_CHARS) || ch == table->info.def_char)
228     {
229         /* check if char maps back to the same Unicode value */
230         if (ch & 0xff00)
231         {
232             unsigned char off = table->cp2uni_leadbytes[ch >> 8];
233             return (table->cp2uni[(off << 8) + (ch & 0xff)] == wch);
234         }
235         return (table->cp2uni[ch & 0xff] == wch);
236     }
237     return 1;
238 }
239 
240 /* compute the default char for the dbcs case */
241 static inline WCHAR get_defchar_dbcs( const struct dbcs_table *table, const char *defchar )
242 {
243     if (!defchar) return table->info.def_char;
244     if (!defchar[1]) return (unsigned char)defchar[0];
245     return ((unsigned char)defchar[0] << 8) | (unsigned char)defchar[1];
246 }
247 
248 /* query necessary dst length for src string */
249 static int get_length_dbcs( const struct dbcs_table *table, int flags,
250                             const WCHAR *src, unsigned int srclen,
251                             const char *defchar, int *used )
252 {
253     const unsigned short * const uni2cp_low = table->uni2cp_low;
254     const unsigned short * const uni2cp_high = table->uni2cp_high;
255     WCHAR defchar_value, composed;
256     int len, tmp;
257 
258     if (!defchar && !used && !(flags & WC_COMPOSITECHECK))
259     {
260         for (len = 0; srclen; srclen--, src++, len++)
261         {
262             if (uni2cp_low[uni2cp_high[*src >> 8] + (*src & 0xff)] & 0xff00) len++;
263         }
264         return len;
265     }
266 
267     defchar_value = get_defchar_dbcs( table, defchar );
268     if (!used) used = &tmp;  /* avoid checking on every char */
269     *used = 0;
270     for (len = 0; srclen; len++, srclen--, src++)
271     {
272         unsigned short res;
273         WCHAR wch = *src;
274 
275         if ((flags & WC_COMPOSITECHECK) && (srclen > 1) && (composed = wine_compose(src)))
276         {
277             /* now check if we can use the composed char */
278             res = uni2cp_low[uni2cp_high[composed >> 8] + (composed & 0xff)];
279 
280             if (is_valid_dbcs_mapping( table, flags, composed, res ))
281             {
282                 /* we have a good mapping for the composed char, use it */
283                 if (res & 0xff00) len++;
284                 src++;
285                 srclen--;
286                 continue;
287             }
288             /* no mapping for the composed char, check the other flags */
289             if (flags & WC_DEFAULTCHAR) /* use the default char instead */
290             {
291                 if (defchar_value & 0xff00) len++;
292                 *used = 1;
293                 src++;  /* skip the non-spacing char */
294                 srclen--;
295                 continue;
296             }
297             if (flags & WC_DISCARDNS) /* skip the second char of the composition */
298             {
299                 src++;
300                 srclen--;
301             }
302             /* WC_SEPCHARS is the default */
303         }
304 
305         res = uni2cp_low[uni2cp_high[wch >> 8] + (wch & 0xff)];
306         if (!is_valid_dbcs_mapping( table, flags, wch, res ))
307         {
308             res = defchar_value;
309             *used = 1;
310         }
311         if (res & 0xff00) len++;
312     }
313     return len;
314 }
315 
316 /* wcstombs for double-byte code page */
317 static inline int wcstombs_dbcs( const struct dbcs_table *table,
318                                  const WCHAR *src, unsigned int srclen,
319                                  char *dst, unsigned int dstlen )
320 {
321     const unsigned short * const uni2cp_low = table->uni2cp_low;
322     const unsigned short * const uni2cp_high = table->uni2cp_high;
323     int len;
324 
325     for (len = dstlen; srclen && len; len--, srclen--, src++)
326     {
327         unsigned short res = uni2cp_low[uni2cp_high[*src >> 8] + (*src & 0xff)];
328         if (res & 0xff00)
329         {
330             if (len == 1) break;  /* do not output a partial char */
331             len--;
332             *dst++ = res >> 8;
333         }
334         *dst++ = (char)res;
335     }
336     if (srclen) return -1;  /* overflow */
337     return dstlen - len;
338 }
339 
340 /* slow version of wcstombs_dbcs that handles the various flags */
341 static int wcstombs_dbcs_slow( const struct dbcs_table *table, int flags,
342                                const WCHAR *src, unsigned int srclen,
343                                char *dst, unsigned int dstlen,
344                                const char *defchar, int *used )
345 {
346     const unsigned short * const uni2cp_low = table->uni2cp_low;
347     const unsigned short * const uni2cp_high = table->uni2cp_high;
348     WCHAR defchar_value = get_defchar_dbcs( table, defchar );
349     WCHAR composed;
350     int len, tmp;
351 
352     if (!used) used = &tmp;  /* avoid checking on every char */
353     *used = 0;
354 
355     for (len = dstlen; srclen && len; len--, srclen--, src++)
356     {
357         unsigned short res;
358         WCHAR wch = *src;
359 
360         if ((flags & WC_COMPOSITECHECK) && (srclen > 1) && (composed = wine_compose(src)))
361         {
362             /* now check if we can use the composed char */
363             res = uni2cp_low[uni2cp_high[composed >> 8] + (composed & 0xff)];
364 
365             if (is_valid_dbcs_mapping( table, flags, composed, res ))
366             {
367                 /* we have a good mapping for the composed char, use it */
368                 src++;
369                 srclen--;
370                 goto output_char;
371             }
372             /* no mapping for the composed char, check the other flags */
373             if (flags & WC_DEFAULTCHAR) /* use the default char instead */
374             {
375                 res = defchar_value;
376                 *used = 1;
377                 src++;  /* skip the non-spacing char */
378                 srclen--;
379                 goto output_char;
380             }
381             if (flags & WC_DISCARDNS) /* skip the second char of the composition */
382             {
383                 src++;
384                 srclen--;
385             }
386             /* WC_SEPCHARS is the default */
387         }
388 
389         res = uni2cp_low[uni2cp_high[wch >> 8] + (wch & 0xff)];
390         if (!is_valid_dbcs_mapping( table, flags, wch, res ))
391         {
392             res = defchar_value;
393             *used = 1;
394         }
395 
396     output_char:
397         if (res & 0xff00)
398         {
399             if (len == 1) break;  /* do not output a partial char */
400             len--;
401             *dst++ = res >> 8;
402         }
403         *dst++ = (char)res;
404     }
405     if (srclen) return -1;  /* overflow */
406     return dstlen - len;
407 }
408 
409 /* wide char to multi byte string conversion */
410 /* return -1 on dst buffer overflow */
411 int wine_cp_wcstombs( const union cptable *table, int flags,
412                       const WCHAR *src, int srclen,
413                       char *dst, int dstlen, const char *defchar, int *used )
414 {
415     if (table->info.char_size == 1)
416     {
417         if (flags || defchar || used)
418         {
419             if (!dstlen) return get_length_sbcs( &table->sbcs, flags, src, srclen, used );
420             return wcstombs_sbcs_slow( &table->sbcs, flags, src, srclen,
421                                        dst, dstlen, defchar, used );
422         }
423         if (!dstlen) return srclen;
424         return wcstombs_sbcs( &table->sbcs, src, srclen, dst, dstlen );
425     }
426     else /* mbcs */
427     {
428         if (!dstlen) return get_length_dbcs( &table->dbcs, flags, src, srclen, defchar, used );
429         if (flags || defchar || used)
430             return wcstombs_dbcs_slow( &table->dbcs, flags, src, srclen,
431                                        dst, dstlen, defchar, used );
432         return wcstombs_dbcs( &table->dbcs, src, srclen, dst, dstlen );
433     }
434 }
435