1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * Unicode conversions (yet more)
29 */
30
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <errno.h>
35 #include <iconv.h>
36 #include <libintl.h>
37
38 #include <sys/u8_textprep.h>
39
40 #include <netsmb/smb_lib.h>
41 #include "charsets.h"
42
43
/*
 * Count the 16-bit code units in a zero-terminated Unicode string.
 *
 * The terminating 16-bit zero is not included in the count, so the
 * storage occupied by the characters is twice the returned value
 * (plus two more bytes for the terminator itself).
 */
size_t
unicode_strlen(const uint16_t *us)
{
	const uint16_t *end;

	/* Walk forward to the terminator; the distance is the length. */
	for (end = us; *end != 0; end++)
		;

	return ((size_t)(end - us));
}
57
/* Shared worker for the two UCS-2 to UTF-8 entry points; defined below. */
static char *convert_ucs2xx_to_utf8(iconv_t cd, const uint16_t *us);

/*
 * Convert a zero-terminated Unicode string (iconv code set "UCS-2")
 * to UTF-8.
 *
 * The conversion descriptor is opened on the first call and kept in a
 * function-local static for later calls; no locking is done here.
 * If iconv_open() fails the descriptor stays (iconv_t)-1, the open is
 * retried on the next call, and the worker reports the error.
 *
 * Returns memory obtained from malloc() (the caller frees it),
 * or NULL on failure.
 */
char *
convert_unicode_to_utf8(uint16_t *us)
{
	static iconv_t ucs2_cd = (iconv_t)-1;
	char *utf8;

	if (ucs2_cd == (iconv_t)-1) {
		/* iconv_open() takes (tocode, fromcode). */
		ucs2_cd = iconv_open("UTF-8", "UCS-2");
	}

	utf8 = convert_ucs2xx_to_utf8(ucs2_cd, us);
	return (utf8);
}
75
/*
 * Convert a zero-terminated little-endian Unicode string (iconv code
 * set "UCS-2LE") to UTF-8.
 *
 * The conversion descriptor is opened on the first call and kept in a
 * function-local static for later calls; no locking is done here.
 * If iconv_open() fails the descriptor stays (iconv_t)-1, the open is
 * retried on the next call, and the worker reports the error.
 *
 * Returns memory obtained from malloc() (the caller frees it),
 * or NULL on failure.
 */
char *
convert_leunicode_to_utf8(unsigned short *us)
{
	static iconv_t ucs2le_cd = (iconv_t)-1;
	char *utf8;

	if (ucs2le_cd == (iconv_t)-1) {
		/* iconv_open() takes (tocode, fromcode). */
		ucs2le_cd = iconv_open("UTF-8", "UCS-2LE");
	}

	utf8 = convert_ucs2xx_to_utf8(ucs2le_cd, us);
	return (utf8);
}
91
92 static char *
convert_ucs2xx_to_utf8(iconv_t cd,const uint16_t * us)93 convert_ucs2xx_to_utf8(iconv_t cd, const uint16_t *us)
94 {
95 char *obuf, *optr;
96 const char *iptr;
97 size_t ileft, obsize, oleft, ret;
98
99 if (cd == (iconv_t)-1) {
100 smb_error(dgettext(TEXT_DOMAIN,
101 "iconv_open(UTF-8/UCS-2)"), -1);
102 return (NULL);
103 }
104
105 iptr = (const char *)us;
106 ileft = unicode_strlen(us);
107 ileft *= 2; /* now bytes */
108
109 /* Worst-case output size is 2x input size. */
110 oleft = ileft * 2;
111 obsize = oleft + 2; /* room for null */
112 obuf = malloc(obsize);
113 if (!obuf)
114 return (NULL);
115 optr = obuf;
116
117 ret = iconv(cd, &iptr, &ileft, &optr, &oleft);
118 *optr = '\0';
119 if (ret == (size_t)-1) {
120 smb_error(dgettext(TEXT_DOMAIN,
121 "iconv(%s) failed"), errno, obuf);
122 }
123 if (ileft) {
124 smb_error(dgettext(TEXT_DOMAIN,
125 "iconv(%s) failed"), -1, obuf);
126 /*
127 * XXX: What's better? return NULL?
128 * The truncated string? << for now
129 */
130 }
131
132 return (obuf);
133 }
134
/* Shared worker for the two UTF-8 to UCS-2 entry points; defined below. */
static uint16_t *convert_utf8_to_ucs2xx(iconv_t cd, const char *utf8_string);

/*
 * Convert a NUL-terminated UTF-8 string to Unicode (iconv code set
 * "UCS-2").
 *
 * The conversion descriptor is opened on the first call and kept in a
 * function-local static for later calls; no locking is done here.
 * If iconv_open() fails the descriptor stays (iconv_t)-1, the open is
 * retried on the next call, and the worker reports the error.
 *
 * Returns memory obtained from malloc() (the caller frees it),
 * or NULL on failure.
 */
uint16_t *
convert_utf8_to_unicode(const char *utf8_string)
{
	static iconv_t to_ucs2_cd = (iconv_t)-1;
	uint16_t *ucs2;

	if (to_ucs2_cd == (iconv_t)-1) {
		/* iconv_open() takes (tocode, fromcode). */
		to_ucs2_cd = iconv_open("UCS-2", "UTF-8");
	}

	ucs2 = convert_utf8_to_ucs2xx(to_ucs2_cd, utf8_string);
	return (ucs2);
}
151
/*
 * Convert a NUL-terminated UTF-8 string to little-endian Unicode
 * (iconv code set "UCS-2LE").
 *
 * The conversion descriptor is opened on the first call and kept in a
 * function-local static for later calls; no locking is done here.
 * If iconv_open() fails the descriptor stays (iconv_t)-1, the open is
 * retried on the next call, and the worker reports the error.
 *
 * Returns memory obtained from malloc() (the caller frees it),
 * or NULL on failure.
 */
uint16_t *
convert_utf8_to_leunicode(const char *utf8_string)
{
	static iconv_t to_ucs2le_cd = (iconv_t)-1;
	uint16_t *ucs2;

	if (to_ucs2le_cd == (iconv_t)-1) {
		/* iconv_open() takes (tocode, fromcode). */
		to_ucs2le_cd = iconv_open("UCS-2LE", "UTF-8");
	}

	ucs2 = convert_utf8_to_ucs2xx(to_ucs2le_cd, utf8_string);
	return (ucs2);
}
166
167 static uint16_t *
convert_utf8_to_ucs2xx(iconv_t cd,const char * utf8_string)168 convert_utf8_to_ucs2xx(iconv_t cd, const char *utf8_string)
169 {
170 uint16_t *obuf, *optr;
171 const char *iptr;
172 size_t ileft, obsize, oleft, ret;
173
174 if (cd == (iconv_t)-1) {
175 smb_error(dgettext(TEXT_DOMAIN,
176 "iconv_open(UCS-2/UTF-8)"), -1);
177 return (NULL);
178 }
179
180 iptr = utf8_string;
181 ileft = strlen(iptr);
182
183 /* Worst-case output size is 2x input size. */
184 oleft = ileft * 2;
185 obsize = oleft + 2; /* room for null */
186 obuf = malloc(obsize);
187 if (!obuf)
188 return (NULL);
189 optr = obuf;
190
191 ret = iconv(cd, &iptr, &ileft, (char **)&optr, &oleft);
192 *optr = '\0';
193 if (ret == (size_t)-1) {
194 smb_error(dgettext(TEXT_DOMAIN,
195 "iconv(%s) failed"), errno, utf8_string);
196 }
197 if (ileft) {
198 smb_error(dgettext(TEXT_DOMAIN,
199 "iconv(%s) failed"), -1, utf8_string);
200 /*
201 * XXX: What's better? return NULL?
202 * The truncated string? << for now
203 */
204 }
205
206 return (obuf);
207 }
208
209
210 /*
211 * A simple wrapper around u8_textprep_str() that returns the Unicode
212 * upper-case version of some string. Returns memory from malloc.
213 * Borrowed from idmapd.
214 */
215 static char *
utf8_str_to_upper_or_lower(const char * s,int upper_lower)216 utf8_str_to_upper_or_lower(const char *s, int upper_lower)
217 {
218 char *res = NULL;
219 char *outs;
220 size_t inlen, outlen, inbleft, outbleft;
221 int rc, err;
222
223 /*
224 * u8_textprep_str() does not allocate memory. The input and
225 * output buffers may differ in size (though that would be more
226 * likely when normalization is done). We have to loop over it...
227 *
228 * To improve the chances that we can avoid looping we add 10
229 * bytes of output buffer room the first go around.
230 */
231 inlen = inbleft = strlen(s);
232 outlen = outbleft = inlen + 10;
233 if ((res = malloc(outlen)) == NULL)
234 return (NULL);
235 outs = res;
236
237 while ((rc = u8_textprep_str((char *)s, &inbleft, outs,
238 &outbleft, upper_lower, U8_UNICODE_LATEST, &err)) < 0 &&
239 err == E2BIG) {
240 if ((res = realloc(res, outlen + inbleft)) == NULL)
241 return (NULL);
242 /* adjust input/output buffer pointers */
243 s += (inlen - inbleft);
244 outs = res + outlen - outbleft;
245 /* adjust outbleft and outlen */
246 outlen += inbleft;
247 outbleft += inbleft;
248 }
249
250 if (rc < 0) {
251 free(res);
252 res = NULL;
253 return (NULL);
254 }
255
256 res[outlen - outbleft] = '\0';
257
258 return (res);
259 }
260
261 char *
utf8_str_toupper(const char * s)262 utf8_str_toupper(const char *s)
263 {
264 return (utf8_str_to_upper_or_lower(s, U8_TEXTPREP_TOUPPER));
265 }
266
267 char *
utf8_str_tolower(const char * s)268 utf8_str_tolower(const char *s)
269 {
270 return (utf8_str_to_upper_or_lower(s, U8_TEXTPREP_TOLOWER));
271 }
272