1 /* $NetBSD: unicode.c,v 1.2 2007/12/11 12:04:24 lukem Exp $ */ 2 3 /*- 4 * Copyright (c) 2007 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Dieter Baron. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 __KERNEL_RCSID(0, "$NetBSD: unicode.c,v 1.2 2007/12/11 12:04:24 lukem Exp $"); 34 35 #include <sys/null.h> 36 37 #include "unicode.h" 38 39 size_t 40 utf8_to_utf16(uint16_t *dst, size_t dst_len, 41 const char *src, size_t src_len, 42 int flags, int *errp) 43 { 44 const unsigned char *s; 45 size_t spos, dpos; 46 int error; 47 uint16_t c; 48 49 #define IS_CONT(c) (((c)&0xc0) == 0x80) 50 51 error = 0; 52 s = (const unsigned char *)src; 53 spos = dpos = 0; 54 while (spos<src_len) { 55 if (s[spos] < 0x80) 56 c = s[spos++]; 57 else if ((flags & UNICODE_UTF8_LATIN1_FALLBACK) 58 && (spos >= src_len || !IS_CONT(s[spos+1])) 59 && s[spos]>=0xa0) { 60 /* not valid UTF-8, assume ISO 8859-1 */ 61 c = s[spos++]; 62 } 63 else if (s[spos] < 0xc0 || s[spos] >= 0xf5) { 64 /* continuation byte without lead byte 65 or lead byte for codepoint above 0x10ffff */ 66 error++; 67 spos++; 68 continue; 69 } 70 else if (s[spos] < 0xe0) { 71 if (spos >= src_len || !IS_CONT(s[spos+1])) { 72 spos++; 73 error++; 74 continue; 75 } 76 c = ((s[spos] & 0x3f) << 6) | (s[spos+1] & 0x3f); 77 spos += 2; 78 if (c < 0x80) { 79 /* overlong encoding */ 80 error++; 81 continue; 82 } 83 } 84 else if (s[spos] < 0xf0) { 85 if (spos >= src_len-2 86 || !IS_CONT(s[spos+1]) || !IS_CONT(s[spos+2])) { 87 spos++; 88 error++; 89 continue; 90 } 91 c = ((s[spos] & 0x0f) << 12) | ((s[spos+1] & 0x3f) << 6) 92 | (s[spos+2] & 0x3f); 93 spos += 3; 94 if (c < 0x800 || (c & 0xdf00) == 0xd800 ) { 95 /* overlong encoding or encoded surrogate */ 96 error++; 97 continue; 98 } 99 } 100 else { 101 uint32_t cc; 102 /* UTF-16 surrogate pair */ 103 104 if (spos >= src_len-3 || !IS_CONT(s[spos+1]) 105 || !IS_CONT(s[spos+2]) || !IS_CONT(s[spos+3])) { 106 spos++; 107 error++; 108 109 continue; 110 } 111 cc = ((s[spos] & 0x03) << 18) | ((s[spos+1] & 0x3f) << 12) 112 | ((s[spos+2] & 0x3f) << 6) | (s[spos+3] & 0x3f); 113 spos += 4; 114 if (cc < 0x10000) { 115 /* overlong encoding */ 116 error++; 117 continue; 118 } 119 if (dst && dpos < dst_len) 120 dst[dpos] = (0xd800 | ((cc-0x10000)>>10)); 121 dpos++; 122 c = 0xdc00 | ((cc-0x10000) & 0x3ffff); 123 } 124 125 if (dst && dpos < dst_len) 126 dst[dpos] = c; 127 dpos++; 128 } 129 130 if (errp) 131 *errp = error; 132 133 return dpos; 134 135 #undef IS_CONT 136 } 137 138 139 size_t 140 utf16_to_utf8(char *dst, size_t dst_len, 141 const uint16_t *src, size_t src_len, 142 int flags, int *errp) 143 { 144 uint8_t spos, dpos; 145 int error; 146 147 #define CHECK_LENGTH(l) (dpos > dst_len-(l) ? dst=NULL : NULL) 148 #define ADD_BYTE(b) (dst ? dst[dpos] = (b) : 0, dpos++) 149 150 error = 0; 151 dpos = 0; 152 for (spos=0; spos<src_len; spos++) { 153 if (src[spos] < 0x80) { 154 CHECK_LENGTH(1); 155 ADD_BYTE(src[spos]); 156 } 157 else if (src[spos] < 0x800) { 158 CHECK_LENGTH(2); 159 ADD_BYTE(0xc0 | (src[spos]>>6)); 160 ADD_BYTE(0x80 | (src[spos] & 0x3f)); 161 } 162 else if ((src[spos] & 0xdc00) == 0xd800) { 163 uint32_t c; 164 /* first surrogate */ 165 if (spos == src_len - 1 || (src[spos] & 0xdc00) != 0xdc00) { 166 /* no second surrogate present */ 167 error++; 168 continue; 169 } 170 spos++; 171 CHECK_LENGTH(4); 172 c = (((src[spos]&0x3ff) << 10) | (src[spos+1]&0x3ff)) + 0x10000; 173 ADD_BYTE(0xf0 | (c>>18)); 174 ADD_BYTE(0x80 | ((c>>12) & 0x3f)); 175 ADD_BYTE(0x80 | ((c>>6) & 0x3f)); 176 ADD_BYTE(0x80 | (c & 0x3f)); 177 } 178 else if ((src[spos] & 0xdc00) == 0xdc00) { 179 /* second surrogate without preceding first surrogate */ 180 error++; 181 } 182 else { 183 CHECK_LENGTH(3); 184 ADD_BYTE(0xe0 | src[spos]>>12); 185 ADD_BYTE(0x80 | ((src[spos]>>6) & 0x3f)); 186 ADD_BYTE(0x80 | (src[spos] & 0x3f)); 187 } 188 } 189 190 if (errp) 191 *errp = error; 192 193 return dpos; 194 195 #undef ADD_BYTE 196 #undef CHECK_LENGTH 197 } 198