1 /* $NetBSD: utf8.c,v 1.4 2014/12/10 04:37:55 christos Exp $ */ 2 3 #ifndef lint 4 static char *rcsid = "Id: utf8.c,v 1.1 2003/06/04 00:26:44 marka Exp "; 5 #endif 6 7 /* 8 * Copyright (c) 2000 Japan Network Information Center. All rights reserved. 9 * 10 * By using this file, you agree to the terms and conditions set forth bellow. 11 * 12 * LICENSE TERMS AND CONDITIONS 13 * 14 * The following License Terms and Conditions apply, unless a different 15 * license is obtained from Japan Network Information Center ("JPNIC"), 16 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda, 17 * Chiyoda-ku, Tokyo 101-0047, Japan. 18 * 19 * 1. Use, Modification and Redistribution (including distribution of any 20 * modified or derived work) in source and/or binary forms is permitted 21 * under this License Terms and Conditions. 22 * 23 * 2. Redistribution of source code must retain the copyright notices as they 24 * appear in each source code file, this License Terms and Conditions. 25 * 26 * 3. Redistribution in binary form must reproduce the Copyright Notice, 27 * this License Terms and Conditions, in the documentation and/or other 28 * materials provided with the distribution. For the purposes of binary 29 * distribution the "Copyright Notice" refers to the following language: 30 * "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved." 31 * 32 * 4. The name of JPNIC may not be used to endorse or promote products 33 * derived from this Software without specific prior written approval of 34 * JPNIC. 35 * 36 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC 37 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 38 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 39 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE 40 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 41 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 42 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 43 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 44 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 45 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 46 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 47 */ 48 49 #include <config.h> 50 51 #include <stddef.h> 52 53 #include <idn/assert.h> 54 #include <idn/logmacro.h> 55 #include <idn/utf8.h> 56 #include <idn/debug.h> 57 58 #define UTF8_WIDTH(c) \ 59 (((c) < 0x80) ? 1 : \ 60 ((c) < 0xc0) ? 0 : \ 61 ((c) < 0xe0) ? 2 : \ 62 ((c) < 0xf0) ? 3 : \ 63 ((c) < 0xf8) ? 4 : \ 64 ((c) < 0xfc) ? 5 : \ 65 ((c) < 0xfe) ? 6 : 0) 66 67 #define VALID_CONT_BYTE(c) (0x80 <= (c) && (c) < 0xc0) 68 69 int 70 idn_utf8_mblen(const char *s) { 71 int c = *(unsigned char *)s; 72 73 assert(s != NULL); 74 75 #if 0 76 TRACE(("idn_utf8_mblen(s=<%s>)\n", idn__debug_hexstring(s, 6))); 77 #endif 78 79 return UTF8_WIDTH(c); 80 } 81 82 int 83 idn_utf8_getmb(const char *s, size_t len, char *buf) { 84 /* buf must be at least 7-bytes long */ 85 const unsigned char *p = (const unsigned char *)s; 86 unsigned char *q = (unsigned char *)buf; 87 int width = UTF8_WIDTH(*p); 88 int w; 89 90 assert(s != NULL); 91 92 #if 0 93 TRACE(("idn_utf8_getmb(s=<%s>,len=%d)\n", 94 idn__debug_hexstring(s, 6), len)); 95 #endif 96 97 if (width == 0 || len < width) 98 return (0); 99 100 /* Copy the first byte. */ 101 *q++ = *p++; 102 103 /* .. and the rest. */ 104 w = width; 105 while (--w > 0) { 106 if (!VALID_CONT_BYTE(*p)) 107 return (0); 108 *q++ = *p++; 109 } 110 return (width); 111 } 112 113 int 114 idn_utf8_getwc(const char *s, size_t len, unsigned long *vp) { 115 unsigned long v; 116 unsigned long min; 117 const unsigned char *p = (const unsigned char *)s; 118 int c; 119 int width; 120 int rest; 121 122 assert(s != NULL); 123 124 #if 0 125 TRACE(("idn_utf8_getwc(s=<%s>,len=%d)\n", 126 idn__debug_hexstring(s, 10), len)); 127 #endif 128 129 c = *p++; 130 width = UTF8_WIDTH(c); 131 132 switch (width) { 133 case 0: 134 return (0); 135 case 1: 136 v = c; 137 min = 0; 138 break; 139 case 2: 140 v = c & 0x1f; 141 min = 0x80; 142 break; 143 case 3: 144 v = c & 0xf; 145 min = 0x800; 146 break; 147 case 4: 148 v = c & 0x7; 149 min = 0x10000; 150 break; 151 case 5: 152 v = c & 3; 153 min = 0x200000; 154 break; 155 case 6: 156 v = c & 1; 157 min = 0x4000000; 158 break; 159 default: 160 FATAL(("idn_utf8_getint: internal error\n")); 161 return (0); 162 } 163 164 if (len < width) 165 return (0); 166 167 rest = width - 1; 168 while (rest-- > 0) { 169 if (!VALID_CONT_BYTE(*p)) 170 return (0); 171 v = (v << 6) | (*p & 0x3f); 172 p++; 173 } 174 175 if (v < min) 176 return (0); 177 178 *vp = v; 179 return (width); 180 } 181 182 int 183 idn_utf8_putwc(char *s, size_t len, unsigned long v) { 184 unsigned char *p = (unsigned char *)s; 185 int mask; 186 int off; 187 int l; 188 189 assert(s != NULL); 190 191 #if 0 192 TRACE(("idn_utf8_putwc(v=%lx)\n", v)); 193 #endif 194 195 if (v < 0x80) { 196 mask = 0; 197 l = 1; 198 } else if (v < 0x800) { 199 mask = 0xc0; 200 l = 2; 201 } else if (v < 0x10000) { 202 mask = 0xe0; 203 l = 3; 204 } else if (v < 0x200000) { 205 mask = 0xf0; 206 l = 4; 207 } else if (v < 0x4000000) { 208 mask = 0xf8; 209 l = 5; 210 } else if (v < 0x80000000) { 211 mask = 0xfc; 212 l = 6; 213 } else { 214 return (0); 215 } 216 217 if (len < l) 218 return (0); 219 220 off = 6 * (l - 1); 221 *p++ = (v >> off) | mask; 222 mask = 0x80; 223 while (off > 0) { 224 off -= 6; 225 *p++ = ((v >> off) & 0x3f) | mask; 226 } 227 return l; 228 } 229 230 int 231 idn_utf8_isvalidchar(const char *s) { 232 unsigned long dummy; 233 234 TRACE(("idn_utf8_isvalidchar(s=<%s>)\n", 235 idn__debug_hexstring(s, 6))); 236 237 return (idn_utf8_getwc(s, 6, &dummy) > 0); 238 } 239 240 int 241 idn_utf8_isvalidstring(const char *s) { 242 unsigned long dummy; 243 int width; 244 245 assert(s != NULL); 246 247 TRACE(("idn_utf8_isvalidstring(s=<%s>)\n", 248 idn__debug_hexstring(s, 20))); 249 250 while (*s != '\0') { 251 width = idn_utf8_getwc(s, 6, &dummy); 252 if (width == 0) 253 return (0); 254 s += width; 255 } 256 return (1); 257 } 258 259 char * 260 idn_utf8_findfirstbyte(const char *s, const char *known_top) { 261 const unsigned char *p = (const unsigned char *)s; 262 const unsigned char *t = (const unsigned char *)known_top; 263 264 assert(s != NULL && known_top != NULL && known_top <= s); 265 266 TRACE(("idn_utf8_findfirstbyte(s=<%s>)\n", 267 idn__debug_hexstring(s, 8))); 268 269 while (p >= t) { 270 if (!VALID_CONT_BYTE(*p)) 271 break; 272 p--; 273 } 274 if (p < t || UTF8_WIDTH(*p) == 0) 275 return (NULL); 276 277 return ((char *)p); 278 } 279