1 /* $NetBSD: utbm.c,v 1.1.1.3 2010/12/12 15:22:07 adam Exp $ */ 2 3 /* OpenLDAP: pkg/ldap/libraries/liblunicode/utbm/utbm.c,v 1.7.2.5 2010/04/13 20:23:04 kurt Exp */ 4 /* This work is part of OpenLDAP Software <http://www.openldap.org/>. 5 * 6 * Copyright 1998-2010 The OpenLDAP Foundation. 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted only as authorized by the OpenLDAP 11 * Public License. 12 * 13 * A copy of this license is available in file LICENSE in the 14 * top-level directory of the distribution or, alternatively, at 15 * <http://www.OpenLDAP.org/license.html>. 16 */ 17 /* Copyright 1997, 1998, 1999 Computing Research Labs, 18 * New Mexico State University 19 * 20 * Permission is hereby granted, free of charge, to any person obtaining a 21 * copy of this software and associated documentation files (the "Software"), 22 * to deal in the Software without restriction, including without limitation 23 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 24 * and/or sell copies of the Software, and to permit persons to whom the 25 * Software is furnished to do so, subject to the following conditions: 26 * 27 * The above copyright notice and this permission notice shall be included in 28 * all copies or substantial portions of the Software. 29 * 30 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 31 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 32 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 33 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY 34 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 35 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR 36 * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 37 */ 38 /* Id: utbm.c,v 1.1 1999/09/21 15:45:17 mleisher Exp */ 39 40 /* 41 * Assumptions: 42 * 1. Case conversions of UTF-16 characters must also be UTF-16 characters. 43 * 2. Case conversions are all one-to-one. 44 * 3. Text and pattern have already been normalized in some fashion. 45 */ 46 47 #include <stdlib.h> 48 #include <unistd.h> 49 #include <string.h> 50 #include "utbm.h" 51 52 /* 53 * Single pattern character. 54 */ 55 typedef struct { 56 ucs4_t lc; 57 ucs4_t uc; 58 ucs4_t tc; 59 } _utbm_char_t; 60 61 typedef struct { 62 _utbm_char_t *ch; 63 unsigned long skip; 64 } _utbm_skip_t; 65 66 typedef struct _utbm_pattern_t { 67 unsigned long flags; 68 69 _utbm_char_t *pat; 70 unsigned long pat_used; 71 unsigned long pat_size; 72 unsigned long patlen; 73 74 _utbm_skip_t *skip; 75 unsigned long skip_used; 76 unsigned long skip_size; 77 78 unsigned long md4; 79 } _utbm_pattern_t; 80 81 /************************************************************************* 82 * 83 * Support functions. 84 * 85 *************************************************************************/ 86 87 /* 88 * Routine to look up the skip value for a character. 89 */ 90 static unsigned long 91 _utbm_skip(utbm_pattern_t p, ucs2_t *start, ucs2_t *end) 92 { 93 unsigned long i; 94 ucs4_t c1, c2; 95 _utbm_skip_t *sp; 96 97 if (start >= end) 98 return 0; 99 100 c1 = *start; 101 c2 = (start + 1 < end) ? *(start + 1) : ~0; 102 if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff) 103 c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff)); 104 105 for (i = 0, sp = p->skip; i < p->skip_used; i++, sp++) { 106 if (!((c1 ^ sp->ch->uc) & (c1 ^ sp->ch->lc) & (c1 ^ sp->ch->tc))) { 107 return ((unsigned long) (end - start) < sp->skip) ? 108 end - start : sp->skip; 109 } 110 } 111 return p->patlen; 112 } 113 114 static int 115 _utbm_match(utbm_pattern_t pat, ucs2_t *text, ucs2_t *start, ucs2_t *end, 116 unsigned long *match_start, unsigned long *match_end) 117 { 118 int check_space; 119 ucs4_t c1, c2; 120 unsigned long count; 121 _utbm_char_t *cp; 122 123 /* 124 * Set the potential match endpoint first. 125 */ 126 *match_end = (start - text) + 1; 127 128 c1 = *start; 129 c2 = (start + 1 < end) ? *(start + 1) : ~0; 130 if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff) { 131 c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff)); 132 /* 133 * Adjust the match end point to occur after the UTF-16 character. 134 */ 135 *match_end = *match_end + 1; 136 } 137 138 if (pat->pat_used == 1) { 139 *match_start = start - text; 140 return 1; 141 } 142 143 /* 144 * Compare backward. 145 */ 146 cp = pat->pat + (pat->pat_used - 1); 147 148 for (count = pat->patlen; start > text && count > 0;) { 149 /* 150 * Ignore non-spacing characters if indicated. 151 */ 152 if (pat->flags & UTBM_IGNORE_NONSPACING) { 153 while (start > text && _utbm_nonspacing(c1)) { 154 c2 = *--start; 155 c1 = (start - 1 > text) ? *(start - 1) : ~0; 156 if (0xdc00 <= c2 && c2 <= 0xdfff && 157 0xd800 <= c1 && c1 <= 0xdbff) { 158 c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff)); 159 start--; 160 } else 161 c1 = c2; 162 } 163 } 164 165 /* 166 * Handle space compression if indicated. 167 */ 168 if (pat->flags & UTBM_SPACE_COMPRESS) { 169 check_space = 0; 170 while (start > text && 171 (_utbm_isspace(c1, 1) || _utbm_iscntrl(c1))) { 172 check_space = _utbm_isspace(c1, 1); 173 c2 = *--start; 174 c1 = (start - 1 > text) ? *(start - 1) : ~0; 175 if (0xdc00 <= c2 && c2 <= 0xdfff && 176 0xd800 <= c1 && c1 <= 0xdbff) { 177 c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff)); 178 start--; 179 } else 180 c1 = c2; 181 } 182 /* 183 * Handle things if space compression was indicated and one or 184 * more member characters were found. 185 */ 186 if (check_space) { 187 if (cp->uc != ' ') 188 return 0; 189 cp--; 190 count--; 191 } 192 } 193 194 /* 195 * Handle the normal comparison cases. 196 */ 197 if (count > 0 && ((c1 ^ cp->uc) & (c1 ^ cp->lc) & (c1 ^ cp->tc))) 198 return 0; 199 200 count -= (c1 >= 0x10000) ? 2 : 1; 201 if (count > 0) { 202 cp--; 203 204 /* 205 * Get the next preceding character. 206 */ 207 if (start > text) { 208 c2 = *--start; 209 c1 = (start - 1 > text) ? *(start - 1) : ~0; 210 if (0xdc00 <= c2 && c2 <= 0xdfff && 211 0xd800 <= c1 && c1 <= 0xdbff) { 212 c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff)); 213 start--; 214 } else 215 c1 = c2; 216 } 217 } 218 } 219 220 /* 221 * Set the match start position. 222 */ 223 *match_start = start - text; 224 return 1; 225 } 226 227 /************************************************************************* 228 * 229 * API. 230 * 231 *************************************************************************/ 232 233 utbm_pattern_t 234 utbm_create_pattern(void) 235 { 236 utbm_pattern_t p; 237 238 p = (utbm_pattern_t) malloc(sizeof(_utbm_pattern_t)); 239 (void) memset((char *) p, '\0', sizeof(_utbm_pattern_t)); 240 return p; 241 } 242 243 void 244 utbm_free_pattern(utbm_pattern_t pattern) 245 { 246 if (pattern == 0) 247 return; 248 249 if (pattern->pat_size > 0) 250 free((char *) pattern->pat); 251 252 if (pattern->skip_size > 0) 253 free((char *) pattern->skip); 254 255 free((char *) pattern); 256 } 257 258 void 259 utbm_compile(ucs2_t *pat, unsigned long patlen, unsigned long flags, 260 utbm_pattern_t p) 261 { 262 int have_space; 263 unsigned long i, j, k, slen; 264 _utbm_char_t *cp; 265 _utbm_skip_t *sp; 266 ucs4_t c1, c2, sentinel; 267 268 if (p == 0 || pat == 0 || *pat == 0 || patlen == 0) 269 return; 270 271 /* 272 * Reset the pattern buffer. 273 */ 274 p->patlen = p->pat_used = p->skip_used = 0; 275 276 /* 277 * Set the flags. 278 */ 279 p->flags = flags; 280 281 /* 282 * Initialize the extra skip flag. 283 */ 284 p->md4 = 1; 285 286 /* 287 * Allocate more storage if necessary. 288 */ 289 if (patlen > p->pat_size) { 290 if (p->pat_size == 0) { 291 p->pat = (_utbm_char_t *) malloc(sizeof(_utbm_char_t) * patlen); 292 p->skip = (_utbm_skip_t *) malloc(sizeof(_utbm_skip_t) * patlen); 293 } else { 294 p->pat = (_utbm_char_t *) 295 realloc((char *) p->pat, sizeof(_utbm_char_t) * patlen); 296 p->skip = (_utbm_skip_t *) 297 realloc((char *) p->skip, sizeof(_utbm_skip_t) * patlen); 298 } 299 p->pat_size = p->skip_size = patlen; 300 } 301 302 /* 303 * Preprocess the pattern to remove controls (if specified) and determine 304 * case. 305 */ 306 for (have_space = 0, cp = p->pat, i = 0; i < patlen; i++) { 307 c1 = pat[i]; 308 c2 = (i + 1 < patlen) ? pat[i + 1] : ~0; 309 if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff) 310 c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff)); 311 312 /* 313 * Make sure the `have_space' flag is turned off if the character 314 * is not an appropriate one. 315 */ 316 if (!_utbm_isspace(c1, flags & UTBM_SPACE_COMPRESS)) 317 have_space = 0; 318 319 /* 320 * If non-spacing characters should be ignored, do it here. 321 */ 322 if ((flags & UTBM_IGNORE_NONSPACING) && _utbm_nonspacing(c1)) 323 continue; 324 325 /* 326 * Check if spaces and controls need to be compressed. 327 */ 328 if (flags & UTBM_SPACE_COMPRESS) { 329 if (_utbm_isspace(c1, 1)) { 330 if (!have_space) { 331 /* 332 * Add a space and set the flag. 333 */ 334 cp->uc = cp->lc = cp->tc = ' '; 335 cp++; 336 337 /* 338 * Increase the real pattern length. 339 */ 340 p->patlen++; 341 sentinel = ' '; 342 have_space = 1; 343 } 344 continue; 345 } 346 347 /* 348 * Ignore all control characters. 349 */ 350 if (_utbm_iscntrl(c1)) 351 continue; 352 } 353 354 /* 355 * Add the character. 356 */ 357 if (flags & UTBM_CASEFOLD) { 358 cp->uc = _utbm_toupper(c1); 359 cp->lc = _utbm_tolower(c1); 360 cp->tc = _utbm_totitle(c1); 361 } else 362 cp->uc = cp->lc = cp->tc = c1; 363 364 /* 365 * Set the sentinel character. 366 */ 367 sentinel = cp->uc; 368 369 /* 370 * Move to the next character. 371 */ 372 cp++; 373 374 /* 375 * Increase the real pattern length appropriately. 376 */ 377 p->patlen += (c1 >= 0x10000) ? 2 : 1; 378 379 /* 380 * Increment the loop index for UTF-16 characters. 381 */ 382 i += (c1 >= 0x10000) ? 1 : 0; 383 384 } 385 386 /* 387 * Set the number of characters actually used. 388 */ 389 p->pat_used = cp - p->pat; 390 391 /* 392 * Go through and construct the skip array and determine the actual length 393 * of the pattern in UCS2 terms. 394 */ 395 slen = p->patlen - 1; 396 cp = p->pat; 397 for (i = k = 0; i < p->pat_used; i++, cp++) { 398 /* 399 * Locate the character in the skip array. 400 */ 401 for (sp = p->skip, j = 0; 402 j < p->skip_used && sp->ch->uc != cp->uc; j++, sp++) ; 403 404 /* 405 * If the character is not found, set the new skip element and 406 * increase the number of skip elements. 407 */ 408 if (j == p->skip_used) { 409 sp->ch = cp; 410 p->skip_used++; 411 } 412 413 /* 414 * Set the updated skip value. If the character is UTF-16 and is 415 * not the last one in the pattern, add one to its skip value. 416 */ 417 sp->skip = slen - k; 418 if (cp->uc >= 0x10000 && k + 2 < slen) 419 sp->skip++; 420 421 /* 422 * Set the new extra skip for the sentinel character. 423 */ 424 if (((cp->uc >= 0x10000 && k + 2 <= slen) || k + 1 <= slen) && 425 cp->uc == sentinel) 426 p->md4 = slen - k; 427 428 /* 429 * Increase the actual index. 430 */ 431 k += (cp->uc >= 0x10000) ? 2 : 1; 432 } 433 } 434 435 int 436 utbm_exec(utbm_pattern_t pat, ucs2_t *text, unsigned long textlen, 437 unsigned long *match_start, unsigned long *match_end) 438 { 439 unsigned long k; 440 ucs2_t *start, *end; 441 442 if (pat == 0 || pat->pat_used == 0 || text == 0 || textlen == 0 || 443 textlen < pat->patlen) 444 return 0; 445 446 start = text + pat->patlen; 447 end = text + textlen; 448 449 /* 450 * Adjust the start point if it points to a low surrogate. 451 */ 452 if (0xdc00 <= *start && *start <= 0xdfff && 453 0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff) 454 start--; 455 456 while (start < end) { 457 while ((k = _utbm_skip(pat, start, end))) { 458 start += k; 459 if (start < end && 0xdc00 <= *start && *start <= 0xdfff && 460 0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff) 461 start--; 462 } 463 464 if (start < end && 465 _utbm_match(pat, text, start, end, match_start, match_end)) 466 return 1; 467 468 start += pat->md4; 469 if (start < end && 0xdc00 <= *start && *start <= 0xdfff && 470 0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff) 471 start--; 472 } 473 return 0; 474 } 475