1 /* $NetBSD: unormalize.c,v 1.4 2014/12/10 04:37:55 christos Exp $ */ 2 3 #ifndef lint 4 static char *rcsid = "Id: unormalize.c,v 1.1 2003/06/04 00:26:43 marka Exp "; 5 #endif 6 7 /* 8 * Copyright (c) 2000,2001,2002 Japan Network Information Center. 9 * All rights reserved. 10 * 11 * By using this file, you agree to the terms and conditions set forth bellow. 12 * 13 * LICENSE TERMS AND CONDITIONS 14 * 15 * The following License Terms and Conditions apply, unless a different 16 * license is obtained from Japan Network Information Center ("JPNIC"), 17 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda, 18 * Chiyoda-ku, Tokyo 101-0047, Japan. 19 * 20 * 1. Use, Modification and Redistribution (including distribution of any 21 * modified or derived work) in source and/or binary forms is permitted 22 * under this License Terms and Conditions. 23 * 24 * 2. Redistribution of source code must retain the copyright notices as they 25 * appear in each source code file, this License Terms and Conditions. 26 * 27 * 3. Redistribution in binary form must reproduce the Copyright Notice, 28 * this License Terms and Conditions, in the documentation and/or other 29 * materials provided with the distribution. For the purposes of binary 30 * distribution the "Copyright Notice" refers to the following language: 31 * "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved." 32 * 33 * 4. The name of JPNIC may not be used to endorse or promote products 34 * derived from this Software without specific prior written approval of 35 * JPNIC. 36 * 37 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC 38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 40 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE 41 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 44 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 45 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 46 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 47 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 48 */ 49 50 #include <config.h> 51 52 #include <stddef.h> 53 #include <stdlib.h> 54 #include <string.h> 55 56 #include <idn/result.h> 57 #include <idn/assert.h> 58 #include <idn/logmacro.h> 59 #include <idn/ucs4.h> 60 #include <idn/unicode.h> 61 #include <idn/unormalize.h> 62 #include <idn/debug.h> 63 64 #if !defined(HAVE_MEMMOVE) && defined(HAVE_BCOPY) 65 #define memmove(a,b,c) bcopy((char *)(b),(char *)(a),(int)(c)) 66 #endif 67 68 #define WORKBUF_SIZE 128 69 #define WORKBUF_SIZE_MAX 10000 70 71 typedef struct { 72 idn__unicode_version_t version; /* Unicode version */ 73 int cur; /* pointing now processing character */ 74 int last; /* pointing just after the last character */ 75 int size; /* size of UCS and CLASS array */ 76 unsigned long *ucs4; /* UCS-4 characters */ 77 int *class; /* and their canonical classes */ 78 unsigned long ucs4_buf[WORKBUF_SIZE]; /* local buffer */ 79 int class_buf[WORKBUF_SIZE]; /* ditto */ 80 } workbuf_t; 81 82 static idn_result_t normalize(idn__unicode_version_t version, 83 int do_composition, int compat, 84 const unsigned long *from, 85 unsigned long *to, size_t tolen); 86 static idn_result_t decompose(workbuf_t *wb, unsigned long c, int compat); 87 static void get_class(workbuf_t *wb); 88 static void reorder(workbuf_t *wb); 89 static void compose(workbuf_t *wb); 90 static idn_result_t flush_before_cur(workbuf_t *wb, 91 unsigned long **top, size_t *tolenp); 92 static void workbuf_init(workbuf_t *wb); 93 static void workbuf_free(workbuf_t *wb); 94 static idn_result_t workbuf_extend(workbuf_t *wb); 95 static idn_result_t workbuf_append(workbuf_t *wb, unsigned long c); 96 static void workbuf_shift(workbuf_t *wb, int shift); 97 static void workbuf_removevoid(workbuf_t *wb); 98 99 idn_result_t 100 idn__unormalize_formkc(idn__unicode_version_t version, 101 const unsigned long *from, unsigned long *to, 102 size_t tolen) { 103 assert(version != NULL && from != NULL && to != NULL && tolen >= 0); 104 TRACE(("idn__unormalize_formkc(from=\"%s\", tolen=%d)\n", 105 idn__debug_ucs4xstring(from, 50), tolen)); 106 return (normalize(version, 1, 1, from, to, tolen)); 107 } 108 109 static idn_result_t 110 normalize(idn__unicode_version_t version, int do_composition, int compat, 111 const unsigned long *from, unsigned long *to, size_t tolen) { 112 workbuf_t wb; 113 idn_result_t r = idn_success; 114 115 /* 116 * Initialize working buffer. 117 */ 118 workbuf_init(&wb); 119 wb.version = version; 120 121 while (*from != '\0') { 122 unsigned long c; 123 124 assert(wb.cur == wb.last); 125 126 /* 127 * Get one character from 'from'. 128 */ 129 c = *from++; 130 131 /* 132 * Decompose it. 133 */ 134 if ((r = decompose(&wb, c, compat)) != idn_success) 135 goto ret; 136 137 /* 138 * Get canonical class. 139 */ 140 get_class(&wb); 141 142 /* 143 * Reorder & compose. 144 */ 145 for (; wb.cur < wb.last; wb.cur++) { 146 if (wb.cur == 0) { 147 continue; 148 } else if (wb.class[wb.cur] > 0) { 149 /* 150 * This is not a starter. Try reordering. 151 * Note that characters up to it are 152 * already in canonical order. 153 */ 154 reorder(&wb); 155 continue; 156 } 157 158 /* 159 * This is a starter character, and there are 160 * some characters before it. Those characters 161 * have been reordered properly, and 162 * ready for composition. 163 */ 164 if (do_composition && wb.class[0] == 0) 165 compose(&wb); 166 167 /* 168 * If CUR points to a starter character, 169 * then process of characters before CUR are 170 * already finished, because any further 171 * reordering/composition for them are blocked 172 * by the starter CUR points. 173 */ 174 if (wb.cur > 0 && wb.class[wb.cur] == 0) { 175 /* Flush everything before CUR. */ 176 r = flush_before_cur(&wb, &to, &tolen); 177 if (r != idn_success) 178 goto ret; 179 } 180 } 181 } 182 183 if (r == idn_success) { 184 if (do_composition && wb.cur > 0 && wb.class[0] == 0) { 185 /* 186 * There is some characters left in WB. 187 * They are ordered, but not composed yet. 188 * Now CUR points just after the last character in WB, 189 * and since compose() tries to compose characters 190 * between top and CUR inclusive, we must make CUR 191 * one character back during compose(). 192 */ 193 wb.cur--; 194 compose(&wb); 195 wb.cur++; 196 } 197 /* 198 * Call this even when WB.CUR == 0, to make TO 199 * NUL-terminated. 200 */ 201 r = flush_before_cur(&wb, &to, &tolen); 202 if (r != idn_success) 203 goto ret; 204 } 205 206 if (tolen <= 0) { 207 r = idn_buffer_overflow; 208 goto ret; 209 } 210 *to = '\0'; 211 212 ret: 213 workbuf_free(&wb); 214 return (r); 215 } 216 217 static idn_result_t 218 decompose(workbuf_t *wb, unsigned long c, int compat) { 219 idn_result_t r; 220 int dec_len; 221 222 again: 223 r = idn__unicode_decompose(wb->version, compat, wb->ucs4 + wb->last, 224 wb->size - wb->last, c, &dec_len); 225 switch (r) { 226 case idn_success: 227 wb->last += dec_len; 228 return (idn_success); 229 case idn_notfound: 230 return (workbuf_append(wb, c)); 231 case idn_buffer_overflow: 232 if ((r = workbuf_extend(wb)) != idn_success) 233 return (r); 234 if (wb->size > WORKBUF_SIZE_MAX) { 235 WARNING(("idn__unormalize_form*: " 236 "working buffer too large\n")); 237 return (idn_nomemory); 238 } 239 goto again; 240 default: 241 return (r); 242 } 243 /* NOTREACHED */ 244 } 245 246 static void 247 get_class(workbuf_t *wb) { 248 int i; 249 250 for (i = wb->cur; i < wb->last; i++) 251 wb->class[i] = idn__unicode_canonicalclass(wb->version, 252 wb->ucs4[i]); 253 } 254 255 static void 256 reorder(workbuf_t *wb) { 257 unsigned long c; 258 int i; 259 int class; 260 261 assert(wb != NULL); 262 263 i = wb->cur; 264 c = wb->ucs4[i]; 265 class = wb->class[i]; 266 267 while (i > 0 && wb->class[i - 1] > class) { 268 wb->ucs4[i] = wb->ucs4[i - 1]; 269 wb->class[i] =wb->class[i - 1]; 270 i--; 271 wb->ucs4[i] = c; 272 wb->class[i] = class; 273 } 274 } 275 276 static void 277 compose(workbuf_t *wb) { 278 int cur; 279 unsigned long *ucs4; 280 int *class; 281 int last_class; 282 int nvoids; 283 int i; 284 idn__unicode_version_t ver; 285 286 assert(wb != NULL && wb->class[0] == 0); 287 288 cur = wb->cur; 289 ucs4 = wb->ucs4; 290 class = wb->class; 291 ver = wb->version; 292 293 /* 294 * If there are no decomposition sequence that begins with 295 * the top character, composition is impossible. 296 */ 297 if (!idn__unicode_iscompositecandidate(ver, ucs4[0])) 298 return; 299 300 last_class = 0; 301 nvoids = 0; 302 for (i = 1; i <= cur; i++) { 303 unsigned long c; 304 int cl = class[i]; 305 306 if ((last_class < cl || cl == 0) && 307 idn__unicode_compose(ver, ucs4[0], ucs4[i], 308 &c) == idn_success) { 309 /* 310 * Replace the top character with the composed one. 311 */ 312 ucs4[0] = c; 313 class[0] = idn__unicode_canonicalclass(ver, c); 314 315 class[i] = -1; /* void this character */ 316 nvoids++; 317 } else { 318 last_class = cl; 319 } 320 } 321 322 /* Purge void characters, if any. */ 323 if (nvoids > 0) 324 workbuf_removevoid(wb); 325 } 326 327 static idn_result_t 328 flush_before_cur(workbuf_t *wb, unsigned long **top, size_t *tolenp) { 329 if (*tolenp < wb->cur) 330 return (idn_buffer_overflow); 331 332 memcpy(*top, wb->ucs4, sizeof(**top) * wb->cur); 333 *top += wb->cur; 334 *tolenp -= wb->cur; 335 workbuf_shift(wb, wb->cur); 336 337 return (idn_success); 338 } 339 340 static void 341 workbuf_init(workbuf_t *wb) { 342 wb->cur = 0; 343 wb->last = 0; 344 wb->size = WORKBUF_SIZE; 345 wb->ucs4 = wb->ucs4_buf; 346 wb->class = wb->class_buf; 347 } 348 349 static void 350 workbuf_free(workbuf_t *wb) { 351 if (wb->ucs4 != wb->ucs4_buf) { 352 free(wb->ucs4); 353 free(wb->class); 354 } 355 } 356 357 static idn_result_t 358 workbuf_extend(workbuf_t *wb) { 359 int newsize = wb->size * 3; 360 361 if (wb->ucs4 == wb->ucs4_buf) { 362 wb->ucs4 = malloc(sizeof(wb->ucs4[0]) * newsize); 363 wb->class = malloc(sizeof(wb->class[0]) * newsize); 364 } else { 365 wb->ucs4 = realloc(wb->ucs4, sizeof(wb->ucs4[0]) * newsize); 366 wb->class = realloc(wb->class, sizeof(wb->class[0]) * newsize); 367 } 368 if (wb->ucs4 == NULL || wb->class == NULL) 369 return (idn_nomemory); 370 else 371 return (idn_success); 372 } 373 374 static idn_result_t 375 workbuf_append(workbuf_t *wb, unsigned long c) { 376 idn_result_t r; 377 378 if (wb->last >= wb->size && (r = workbuf_extend(wb)) != idn_success) 379 return (r); 380 wb->ucs4[wb->last++] = c; 381 return (idn_success); 382 } 383 384 static void 385 workbuf_shift(workbuf_t *wb, int shift) { 386 int nmove; 387 388 assert(wb != NULL && wb->cur >= shift); 389 390 nmove = wb->last - shift; 391 (void)memmove(&wb->ucs4[0], &wb->ucs4[shift], 392 nmove * sizeof(wb->ucs4[0])); 393 (void)memmove(&wb->class[0], &wb->class[shift], 394 nmove * sizeof(wb->class[0])); 395 wb->cur -= shift; 396 wb->last -= shift; 397 } 398 399 static void 400 workbuf_removevoid(workbuf_t *wb) { 401 int i, j; 402 int last = wb->last; 403 404 for (i = j = 0; i < last; i++) { 405 if (wb->class[i] >= 0) { 406 if (j < i) { 407 wb->ucs4[j] = wb->ucs4[i]; 408 wb->class[j] = wb->class[i]; 409 } 410 j++; 411 } 412 } 413 wb->cur -= last - j; 414 wb->last = j; 415 } 416