1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * Unicode conversions (yet more) 29 */ 30 31 #include <stdio.h> 32 #include <stdlib.h> 33 #include <string.h> 34 #include <errno.h> 35 #include <iconv.h> 36 #include <libintl.h> 37 38 #include <sys/u8_textprep.h> 39 40 #include <netsmb/smb_lib.h> 41 #include "charsets.h" 42 43 44 /* 45 * Number of unicode symbols in the string, 46 * not including the 2-byte null terminator. 47 * (multiply by two for storage size) 48 */ 49 size_t 50 unicode_strlen(const uint16_t *us) 51 { 52 size_t len = 0; 53 while (*us++) 54 len++; 55 return (len); 56 } 57 58 static char *convert_ucs2xx_to_utf8(iconv_t, const uint16_t *); 59 60 /* 61 * Convert (native) Unicode string to UTF-8. 62 * Returns allocated memory. 63 */ 64 char * 65 convert_unicode_to_utf8(uint16_t *us) 66 { 67 static iconv_t cd1 = (iconv_t)-1; 68 69 /* Get conversion descriptor (to, from) */ 70 if (cd1 == (iconv_t)-1) 71 cd1 = iconv_open("UTF-8", "UCS-2"); 72 73 return (convert_ucs2xx_to_utf8(cd1, us)); 74 } 75 76 /* 77 * Convert little-endian Unicode string to UTF-8. 78 * Returns allocated memory. 79 */ 80 char * 81 convert_leunicode_to_utf8(unsigned short *us) 82 { 83 static iconv_t cd2 = (iconv_t)-1; 84 85 /* Get conversion descriptor (to, from) */ 86 if (cd2 == (iconv_t)-1) 87 cd2 = iconv_open("UTF-8", "UCS-2LE"); 88 89 return (convert_ucs2xx_to_utf8(cd2, us)); 90 } 91 92 static char * 93 convert_ucs2xx_to_utf8(iconv_t cd, const uint16_t *us) 94 { 95 char *obuf, *optr; 96 const char *iptr; 97 size_t ileft, obsize, oleft, ret; 98 99 if (cd == (iconv_t)-1) { 100 smb_error(dgettext(TEXT_DOMAIN, 101 "iconv_open(UTF-8/UCS-2)"), -1); 102 return (NULL); 103 } 104 105 iptr = (const char *)us; 106 ileft = unicode_strlen(us); 107 ileft *= 2; /* now bytes */ 108 109 /* Worst-case output size is 2x input size. */ 110 oleft = ileft * 2; 111 obsize = oleft + 2; /* room for null */ 112 obuf = malloc(obsize); 113 if (!obuf) 114 return (NULL); 115 optr = obuf; 116 117 ret = iconv(cd, &iptr, &ileft, &optr, &oleft); 118 *optr = '\0'; 119 if (ret == (size_t)-1) { 120 smb_error(dgettext(TEXT_DOMAIN, 121 "iconv(%s) failed"), errno, obuf); 122 } 123 if (ileft) { 124 smb_error(dgettext(TEXT_DOMAIN, 125 "iconv(%s) failed"), -1, obuf); 126 /* 127 * XXX: What's better? return NULL? 128 * The truncated string? << for now 129 */ 130 } 131 132 return (obuf); 133 } 134 135 static uint16_t *convert_utf8_to_ucs2xx(iconv_t, const char *); 136 137 /* 138 * Convert UTF-8 string to Unicode. 139 * Returns allocated memory. 140 */ 141 uint16_t * 142 convert_utf8_to_unicode(const char *utf8_string) 143 { 144 static iconv_t cd3 = (iconv_t)-1; 145 146 /* Get conversion descriptor (to, from) */ 147 if (cd3 == (iconv_t)-1) 148 cd3 = iconv_open("UCS-2", "UTF-8"); 149 return (convert_utf8_to_ucs2xx(cd3, utf8_string)); 150 } 151 152 /* 153 * Convert UTF-8 string to little-endian Unicode. 154 * Returns allocated memory. 155 */ 156 uint16_t * 157 convert_utf8_to_leunicode(const char *utf8_string) 158 { 159 static iconv_t cd4 = (iconv_t)-1; 160 161 /* Get conversion descriptor (to, from) */ 162 if (cd4 == (iconv_t)-1) 163 cd4 = iconv_open("UCS-2LE", "UTF-8"); 164 return (convert_utf8_to_ucs2xx(cd4, utf8_string)); 165 } 166 167 static uint16_t * 168 convert_utf8_to_ucs2xx(iconv_t cd, const char *utf8_string) 169 { 170 uint16_t *obuf, *optr; 171 const char *iptr; 172 size_t ileft, obsize, oleft, ret; 173 174 if (cd == (iconv_t)-1) { 175 smb_error(dgettext(TEXT_DOMAIN, 176 "iconv_open(UCS-2/UTF-8)"), -1); 177 return (NULL); 178 } 179 180 iptr = utf8_string; 181 ileft = strlen(iptr); 182 183 /* Worst-case output size is 2x input size. */ 184 oleft = ileft * 2; 185 obsize = oleft + 2; /* room for null */ 186 obuf = malloc(obsize); 187 if (!obuf) 188 return (NULL); 189 optr = obuf; 190 191 ret = iconv(cd, &iptr, &ileft, (char **)&optr, &oleft); 192 *optr = '\0'; 193 if (ret == (size_t)-1) { 194 smb_error(dgettext(TEXT_DOMAIN, 195 "iconv(%s) failed"), errno, utf8_string); 196 } 197 if (ileft) { 198 smb_error(dgettext(TEXT_DOMAIN, 199 "iconv(%s) failed"), -1, utf8_string); 200 /* 201 * XXX: What's better? return NULL? 202 * The truncated string? << for now 203 */ 204 } 205 206 return (obuf); 207 } 208 209 210 /* 211 * A simple wrapper around u8_textprep_str() that returns the Unicode 212 * upper-case version of some string. Returns memory from malloc. 213 * Borrowed from idmapd. 214 */ 215 static char * 216 utf8_str_to_upper_or_lower(const char *s, int upper_lower) 217 { 218 char *res = NULL; 219 char *outs; 220 size_t inlen, outlen, inbleft, outbleft; 221 int rc, err; 222 223 /* 224 * u8_textprep_str() does not allocate memory. The input and 225 * output buffers may differ in size (though that would be more 226 * likely when normalization is done). We have to loop over it... 227 * 228 * To improve the chances that we can avoid looping we add 10 229 * bytes of output buffer room the first go around. 230 */ 231 inlen = inbleft = strlen(s); 232 outlen = outbleft = inlen + 10; 233 if ((res = malloc(outlen)) == NULL) 234 return (NULL); 235 outs = res; 236 237 while ((rc = u8_textprep_str((char *)s, &inbleft, outs, 238 &outbleft, upper_lower, U8_UNICODE_LATEST, &err)) < 0 && 239 err == E2BIG) { 240 if ((res = realloc(res, outlen + inbleft)) == NULL) 241 return (NULL); 242 /* adjust input/output buffer pointers */ 243 s += (inlen - inbleft); 244 outs = res + outlen - outbleft; 245 /* adjust outbleft and outlen */ 246 outlen += inbleft; 247 outbleft += inbleft; 248 } 249 250 if (rc < 0) { 251 free(res); 252 res = NULL; 253 return (NULL); 254 } 255 256 res[outlen - outbleft] = '\0'; 257 258 return (res); 259 } 260 261 char * 262 utf8_str_toupper(const char *s) 263 { 264 return (utf8_str_to_upper_or_lower(s, U8_TEXTPREP_TOUPPER)); 265 } 266 267 char * 268 utf8_str_tolower(const char *s) 269 { 270 return (utf8_str_to_upper_or_lower(s, U8_TEXTPREP_TOLOWER)); 271 } 272