1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2008-2011, International Business Machines 7 * Corporation, Google and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 */ 11 // Author : eldawy@google.com (Mohamed Eldawy) 12 // ucnvsel.cpp 13 // 14 // Purpose: To generate a list of encodings capable of handling 15 // a given Unicode text 16 // 17 // Started 09-April-2008 18 19 /** 20 * \file 21 * 22 * This is an implementation of an encoding selector. 23 * The goal is, given a unicode string, find the encodings 24 * this string can be mapped to. To make processing faster 25 * a trie is built when you call ucnvsel_open() that 26 * stores all encodings a codepoint can map to 27 */ 28 29 #include "unicode/ucnvsel.h" 30 31 #if !UCONFIG_NO_CONVERSION 32 33 #include <string.h> 34 35 #include "unicode/uchar.h" 36 #include "unicode/uniset.h" 37 #include "unicode/ucnv.h" 38 #include "unicode/ustring.h" 39 #include "unicode/uchriter.h" 40 #include "utrie2.h" 41 #include "propsvec.h" 42 #include "uassert.h" 43 #include "ucmndata.h" 44 #include "uenumimp.h" 45 #include "cmemory.h" 46 #include "cstring.h" 47 48 U_NAMESPACE_USE 49 50 struct UConverterSelector { 51 UTrie2 *trie; // 16 bit trie containing offsets into pv 52 uint32_t* pv; // table of bits! 53 int32_t pvCount; 54 char** encodings; // which encodings did user ask to use? 55 int32_t encodingsCount; 56 int32_t encodingStrLength; 57 uint8_t* swapped; 58 UBool ownPv, ownEncodingStrings; 59 }; 60 61 static void generateSelectorData(UConverterSelector* result, 62 UPropsVectors *upvec, 63 const USet* excludedCodePoints, 64 const UConverterUnicodeSet whichSet, 65 UErrorCode* status) { 66 if (U_FAILURE(*status)) { 67 return; 68 } 69 70 int32_t columns = (result->encodingsCount+31)/32; 71 72 // set errorValue to all-ones 73 for (int32_t col = 0; col < columns; col++) { 74 upvec_setValue(upvec, UPVEC_ERROR_VALUE_CP, UPVEC_ERROR_VALUE_CP, 75 col, ~0, ~0, status); 76 } 77 78 for (int32_t i = 0; i < result->encodingsCount; ++i) { 79 uint32_t mask; 80 uint32_t column; 81 int32_t item_count; 82 int32_t j; 83 UConverter* test_converter = ucnv_open(result->encodings[i], status); 84 if (U_FAILURE(*status)) { 85 return; 86 } 87 USet* unicode_point_set; 88 unicode_point_set = uset_open(1, 0); // empty set 89 90 ucnv_getUnicodeSet(test_converter, unicode_point_set, 91 whichSet, status); 92 if (U_FAILURE(*status)) { 93 ucnv_close(test_converter); 94 return; 95 } 96 97 column = i / 32; 98 mask = 1 << (i%32); 99 // now iterate over intervals on set i! 100 item_count = uset_getItemCount(unicode_point_set); 101 102 for (j = 0; j < item_count; ++j) { 103 UChar32 start_char; 104 UChar32 end_char; 105 UErrorCode smallStatus = U_ZERO_ERROR; 106 uset_getItem(unicode_point_set, j, &start_char, &end_char, NULL, 0, 107 &smallStatus); 108 if (U_FAILURE(smallStatus)) { 109 // this will be reached for the converters that fill the set with 110 // strings. Those should be ignored by our system 111 } else { 112 upvec_setValue(upvec, start_char, end_char, column, ~0, mask, 113 status); 114 } 115 } 116 ucnv_close(test_converter); 117 uset_close(unicode_point_set); 118 if (U_FAILURE(*status)) { 119 return; 120 } 121 } 122 123 // handle excluded encodings! Simply set their values to all 1's in the upvec 124 if (excludedCodePoints) { 125 int32_t item_count = uset_getItemCount(excludedCodePoints); 126 for (int32_t j = 0; j < item_count; ++j) { 127 UChar32 start_char; 128 UChar32 end_char; 129 130 uset_getItem(excludedCodePoints, j, &start_char, &end_char, NULL, 0, 131 status); 132 for (int32_t col = 0; col < columns; col++) { 133 upvec_setValue(upvec, start_char, end_char, col, ~0, ~0, 134 status); 135 } 136 } 137 } 138 139 // alright. Now, let's put things in the same exact form you'd get when you 140 // unserialize things. 141 result->trie = upvec_compactToUTrie2WithRowIndexes(upvec, status); 142 result->pv = upvec_cloneArray(upvec, &result->pvCount, NULL, status); 143 result->pvCount *= columns; // number of uint32_t = rows * columns 144 result->ownPv = TRUE; 145 } 146 147 /* open a selector. If converterListSize is 0, build for all converters. 148 If excludedCodePoints is NULL, don't exclude any codepoints */ 149 U_CAPI UConverterSelector* U_EXPORT2 150 ucnvsel_open(const char* const* converterList, int32_t converterListSize, 151 const USet* excludedCodePoints, 152 const UConverterUnicodeSet whichSet, UErrorCode* status) { 153 // check if already failed 154 if (U_FAILURE(*status)) { 155 return NULL; 156 } 157 // ensure args make sense! 158 if (converterListSize < 0 || (converterList == NULL && converterListSize != 0)) { 159 *status = U_ILLEGAL_ARGUMENT_ERROR; 160 return NULL; 161 } 162 163 // allocate a new converter 164 LocalUConverterSelectorPointer newSelector( 165 (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector))); 166 if (newSelector.isNull()) { 167 *status = U_MEMORY_ALLOCATION_ERROR; 168 return NULL; 169 } 170 uprv_memset(newSelector.getAlias(), 0, sizeof(UConverterSelector)); 171 172 if (converterListSize == 0) { 173 converterList = NULL; 174 converterListSize = ucnv_countAvailable(); 175 } 176 newSelector->encodings = 177 (char**)uprv_malloc(converterListSize * sizeof(char*)); 178 if (!newSelector->encodings) { 179 *status = U_MEMORY_ALLOCATION_ERROR; 180 return NULL; 181 } 182 newSelector->encodings[0] = NULL; // now we can call ucnvsel_close() 183 184 // make a backup copy of the list of converters 185 int32_t totalSize = 0; 186 int32_t i; 187 for (i = 0; i < converterListSize; i++) { 188 totalSize += 189 (int32_t)uprv_strlen(converterList != NULL ? converterList[i] : ucnv_getAvailableName(i)) + 1; 190 } 191 // 4-align the totalSize to 4-align the size of the serialized form 192 int32_t encodingStrPadding = totalSize & 3; 193 if (encodingStrPadding != 0) { 194 encodingStrPadding = 4 - encodingStrPadding; 195 } 196 newSelector->encodingStrLength = totalSize += encodingStrPadding; 197 char* allStrings = (char*) uprv_malloc(totalSize); 198 if (!allStrings) { 199 *status = U_MEMORY_ALLOCATION_ERROR; 200 return NULL; 201 } 202 203 for (i = 0; i < converterListSize; i++) { 204 newSelector->encodings[i] = allStrings; 205 uprv_strcpy(newSelector->encodings[i], 206 converterList != NULL ? converterList[i] : ucnv_getAvailableName(i)); 207 allStrings += uprv_strlen(newSelector->encodings[i]) + 1; 208 } 209 while (encodingStrPadding > 0) { 210 *allStrings++ = 0; 211 --encodingStrPadding; 212 } 213 214 newSelector->ownEncodingStrings = TRUE; 215 newSelector->encodingsCount = converterListSize; 216 UPropsVectors *upvec = upvec_open((converterListSize+31)/32, status); 217 generateSelectorData(newSelector.getAlias(), upvec, excludedCodePoints, whichSet, status); 218 upvec_close(upvec); 219 220 if (U_FAILURE(*status)) { 221 return NULL; 222 } 223 224 return newSelector.orphan(); 225 } 226 227 /* close opened selector */ 228 U_CAPI void U_EXPORT2 229 ucnvsel_close(UConverterSelector *sel) { 230 if (!sel) { 231 return; 232 } 233 if (sel->ownEncodingStrings) { 234 uprv_free(sel->encodings[0]); 235 } 236 uprv_free(sel->encodings); 237 if (sel->ownPv) { 238 uprv_free(sel->pv); 239 } 240 utrie2_close(sel->trie); 241 uprv_free(sel->swapped); 242 uprv_free(sel); 243 } 244 245 static const UDataInfo dataInfo = { 246 sizeof(UDataInfo), 247 0, 248 249 U_IS_BIG_ENDIAN, 250 U_CHARSET_FAMILY, 251 U_SIZEOF_UCHAR, 252 0, 253 254 { 0x43, 0x53, 0x65, 0x6c }, /* dataFormat="CSel" */ 255 { 1, 0, 0, 0 }, /* formatVersion */ 256 { 0, 0, 0, 0 } /* dataVersion */ 257 }; 258 259 enum { 260 UCNVSEL_INDEX_TRIE_SIZE, // trie size in bytes 261 UCNVSEL_INDEX_PV_COUNT, // number of uint32_t in the bit vectors 262 UCNVSEL_INDEX_NAMES_COUNT, // number of encoding names 263 UCNVSEL_INDEX_NAMES_LENGTH, // number of encoding name bytes including padding 264 UCNVSEL_INDEX_SIZE = 15, // bytes following the DataHeader 265 UCNVSEL_INDEX_COUNT = 16 266 }; 267 268 /* 269 * Serialized form of a UConverterSelector, formatVersion 1: 270 * 271 * The serialized form begins with a standard ICU DataHeader with a UDataInfo 272 * as the template above. 273 * This is followed by: 274 * int32_t indexes[UCNVSEL_INDEX_COUNT]; // see index entry constants above 275 * serialized UTrie2; // indexes[UCNVSEL_INDEX_TRIE_SIZE] bytes 276 * uint32_t pv[indexes[UCNVSEL_INDEX_PV_COUNT]]; // bit vectors 277 * char* encodingNames[indexes[UCNVSEL_INDEX_NAMES_LENGTH]]; // NUL-terminated strings + padding 278 */ 279 280 /* serialize a selector */ 281 U_CAPI int32_t U_EXPORT2 282 ucnvsel_serialize(const UConverterSelector* sel, 283 void* buffer, int32_t bufferCapacity, UErrorCode* status) { 284 // check if already failed 285 if (U_FAILURE(*status)) { 286 return 0; 287 } 288 // ensure args make sense! 289 uint8_t *p = (uint8_t *)buffer; 290 if (bufferCapacity < 0 || 291 (bufferCapacity > 0 && (p == NULL || (U_POINTER_MASK_LSB(p, 3) != 0))) 292 ) { 293 *status = U_ILLEGAL_ARGUMENT_ERROR; 294 return 0; 295 } 296 // add up the size of the serialized form 297 int32_t serializedTrieSize = utrie2_serialize(sel->trie, NULL, 0, status); 298 if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) { 299 return 0; 300 } 301 *status = U_ZERO_ERROR; 302 303 DataHeader header; 304 uprv_memset(&header, 0, sizeof(header)); 305 header.dataHeader.headerSize = (uint16_t)((sizeof(header) + 15) & ~15); 306 header.dataHeader.magic1 = 0xda; 307 header.dataHeader.magic2 = 0x27; 308 uprv_memcpy(&header.info, &dataInfo, sizeof(dataInfo)); 309 310 int32_t indexes[UCNVSEL_INDEX_COUNT] = { 311 serializedTrieSize, 312 sel->pvCount, 313 sel->encodingsCount, 314 sel->encodingStrLength 315 }; 316 317 int32_t totalSize = 318 header.dataHeader.headerSize + 319 (int32_t)sizeof(indexes) + 320 serializedTrieSize + 321 sel->pvCount * 4 + 322 sel->encodingStrLength; 323 indexes[UCNVSEL_INDEX_SIZE] = totalSize - header.dataHeader.headerSize; 324 if (totalSize > bufferCapacity) { 325 *status = U_BUFFER_OVERFLOW_ERROR; 326 return totalSize; 327 } 328 // ok, save! 329 int32_t length = header.dataHeader.headerSize; 330 uprv_memcpy(p, &header, sizeof(header)); 331 uprv_memset(p + sizeof(header), 0, length - sizeof(header)); 332 p += length; 333 334 length = (int32_t)sizeof(indexes); 335 uprv_memcpy(p, indexes, length); 336 p += length; 337 338 utrie2_serialize(sel->trie, p, serializedTrieSize, status); 339 p += serializedTrieSize; 340 341 length = sel->pvCount * 4; 342 uprv_memcpy(p, sel->pv, length); 343 p += length; 344 345 uprv_memcpy(p, sel->encodings[0], sel->encodingStrLength); 346 p += sel->encodingStrLength; 347 348 return totalSize; 349 } 350 351 /** 352 * swap a selector into the desired Endianness and Asciiness of 353 * the system. Just as FYI, selectors are always saved in the format 354 * of the system that created them. They are only converted if used 355 * on another system. In other words, selectors created on different 356 * system can be different even if the params are identical (endianness 357 * and Asciiness differences only) 358 * 359 * @param ds pointer to data swapper containing swapping info 360 * @param inData pointer to incoming data 361 * @param length length of inData in bytes 362 * @param outData pointer to output data. Capacity should 363 * be at least equal to capacity of inData 364 * @param status an in/out ICU UErrorCode 365 * @return 0 on failure, number of bytes swapped on success 366 * number of bytes swapped can be smaller than length 367 */ 368 static int32_t 369 ucnvsel_swap(const UDataSwapper *ds, 370 const void *inData, int32_t length, 371 void *outData, UErrorCode *status) { 372 /* udata_swapDataHeader checks the arguments */ 373 int32_t headerSize = udata_swapDataHeader(ds, inData, length, outData, status); 374 if(U_FAILURE(*status)) { 375 return 0; 376 } 377 378 /* check data format and format version */ 379 const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData + 4); 380 if(!( 381 pInfo->dataFormat[0] == 0x43 && /* dataFormat="CSel" */ 382 pInfo->dataFormat[1] == 0x53 && 383 pInfo->dataFormat[2] == 0x65 && 384 pInfo->dataFormat[3] == 0x6c 385 )) { 386 udata_printError(ds, "ucnvsel_swap(): data format %02x.%02x.%02x.%02x is not recognized as UConverterSelector data\n", 387 pInfo->dataFormat[0], pInfo->dataFormat[1], 388 pInfo->dataFormat[2], pInfo->dataFormat[3]); 389 *status = U_INVALID_FORMAT_ERROR; 390 return 0; 391 } 392 if(pInfo->formatVersion[0] != 1) { 393 udata_printError(ds, "ucnvsel_swap(): format version %02x is not supported\n", 394 pInfo->formatVersion[0]); 395 *status = U_UNSUPPORTED_ERROR; 396 return 0; 397 } 398 399 if(length >= 0) { 400 length -= headerSize; 401 if(length < 16*4) { 402 udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for UConverterSelector data\n", 403 length); 404 *status = U_INDEX_OUTOFBOUNDS_ERROR; 405 return 0; 406 } 407 } 408 409 const uint8_t *inBytes = (const uint8_t *)inData + headerSize; 410 uint8_t *outBytes = (uint8_t *)outData + headerSize; 411 412 /* read the indexes */ 413 const int32_t *inIndexes = (const int32_t *)inBytes; 414 int32_t indexes[16]; 415 int32_t i; 416 for(i = 0; i < 16; ++i) { 417 indexes[i] = udata_readInt32(ds, inIndexes[i]); 418 } 419 420 /* get the total length of the data */ 421 int32_t size = indexes[UCNVSEL_INDEX_SIZE]; 422 if(length >= 0) { 423 if(length < size) { 424 udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for all of UConverterSelector data\n", 425 length); 426 *status = U_INDEX_OUTOFBOUNDS_ERROR; 427 return 0; 428 } 429 430 /* copy the data for inaccessible bytes */ 431 if(inBytes != outBytes) { 432 uprv_memcpy(outBytes, inBytes, size); 433 } 434 435 int32_t offset = 0, count; 436 437 /* swap the int32_t indexes[] */ 438 count = UCNVSEL_INDEX_COUNT*4; 439 ds->swapArray32(ds, inBytes, count, outBytes, status); 440 offset += count; 441 442 /* swap the UTrie2 */ 443 count = indexes[UCNVSEL_INDEX_TRIE_SIZE]; 444 utrie2_swap(ds, inBytes + offset, count, outBytes + offset, status); 445 offset += count; 446 447 /* swap the uint32_t pv[] */ 448 count = indexes[UCNVSEL_INDEX_PV_COUNT]*4; 449 ds->swapArray32(ds, inBytes + offset, count, outBytes + offset, status); 450 offset += count; 451 452 /* swap the encoding names */ 453 count = indexes[UCNVSEL_INDEX_NAMES_LENGTH]; 454 ds->swapInvChars(ds, inBytes + offset, count, outBytes + offset, status); 455 offset += count; 456 457 U_ASSERT(offset == size); 458 } 459 460 return headerSize + size; 461 } 462 463 /* unserialize a selector */ 464 U_CAPI UConverterSelector* U_EXPORT2 465 ucnvsel_openFromSerialized(const void* buffer, int32_t length, UErrorCode* status) { 466 // check if already failed 467 if (U_FAILURE(*status)) { 468 return NULL; 469 } 470 // ensure args make sense! 471 const uint8_t *p = (const uint8_t *)buffer; 472 if (length <= 0 || 473 (length > 0 && (p == NULL || (U_POINTER_MASK_LSB(p, 3) != 0))) 474 ) { 475 *status = U_ILLEGAL_ARGUMENT_ERROR; 476 return NULL; 477 } 478 // header 479 if (length < 32) { 480 // not even enough space for a minimal header 481 *status = U_INDEX_OUTOFBOUNDS_ERROR; 482 return NULL; 483 } 484 const DataHeader *pHeader = (const DataHeader *)p; 485 if (!( 486 pHeader->dataHeader.magic1==0xda && 487 pHeader->dataHeader.magic2==0x27 && 488 pHeader->info.dataFormat[0] == 0x43 && 489 pHeader->info.dataFormat[1] == 0x53 && 490 pHeader->info.dataFormat[2] == 0x65 && 491 pHeader->info.dataFormat[3] == 0x6c 492 )) { 493 /* header not valid or dataFormat not recognized */ 494 *status = U_INVALID_FORMAT_ERROR; 495 return NULL; 496 } 497 if (pHeader->info.formatVersion[0] != 1) { 498 *status = U_UNSUPPORTED_ERROR; 499 return NULL; 500 } 501 uint8_t* swapped = NULL; 502 if (pHeader->info.isBigEndian != U_IS_BIG_ENDIAN || 503 pHeader->info.charsetFamily != U_CHARSET_FAMILY 504 ) { 505 // swap the data 506 UDataSwapper *ds = 507 udata_openSwapperForInputData(p, length, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, status); 508 int32_t totalSize = ucnvsel_swap(ds, p, -1, NULL, status); 509 if (U_FAILURE(*status)) { 510 udata_closeSwapper(ds); 511 return NULL; 512 } 513 if (length < totalSize) { 514 udata_closeSwapper(ds); 515 *status = U_INDEX_OUTOFBOUNDS_ERROR; 516 return NULL; 517 } 518 swapped = (uint8_t*)uprv_malloc(totalSize); 519 if (swapped == NULL) { 520 udata_closeSwapper(ds); 521 *status = U_MEMORY_ALLOCATION_ERROR; 522 return NULL; 523 } 524 ucnvsel_swap(ds, p, length, swapped, status); 525 udata_closeSwapper(ds); 526 if (U_FAILURE(*status)) { 527 uprv_free(swapped); 528 return NULL; 529 } 530 p = swapped; 531 pHeader = (const DataHeader *)p; 532 } 533 if (length < (pHeader->dataHeader.headerSize + 16 * 4)) { 534 // not even enough space for the header and the indexes 535 uprv_free(swapped); 536 *status = U_INDEX_OUTOFBOUNDS_ERROR; 537 return NULL; 538 } 539 p += pHeader->dataHeader.headerSize; 540 length -= pHeader->dataHeader.headerSize; 541 // indexes 542 const int32_t *indexes = (const int32_t *)p; 543 if (length < indexes[UCNVSEL_INDEX_SIZE]) { 544 uprv_free(swapped); 545 *status = U_INDEX_OUTOFBOUNDS_ERROR; 546 return NULL; 547 } 548 p += UCNVSEL_INDEX_COUNT * 4; 549 // create and populate the selector object 550 UConverterSelector* sel = (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector)); 551 char **encodings = 552 (char **)uprv_malloc( 553 indexes[UCNVSEL_INDEX_NAMES_COUNT] * sizeof(char *)); 554 if (sel == NULL || encodings == NULL) { 555 uprv_free(swapped); 556 uprv_free(sel); 557 uprv_free(encodings); 558 *status = U_MEMORY_ALLOCATION_ERROR; 559 return NULL; 560 } 561 uprv_memset(sel, 0, sizeof(UConverterSelector)); 562 sel->pvCount = indexes[UCNVSEL_INDEX_PV_COUNT]; 563 sel->encodings = encodings; 564 sel->encodingsCount = indexes[UCNVSEL_INDEX_NAMES_COUNT]; 565 sel->encodingStrLength = indexes[UCNVSEL_INDEX_NAMES_LENGTH]; 566 sel->swapped = swapped; 567 // trie 568 sel->trie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, 569 p, indexes[UCNVSEL_INDEX_TRIE_SIZE], NULL, 570 status); 571 p += indexes[UCNVSEL_INDEX_TRIE_SIZE]; 572 if (U_FAILURE(*status)) { 573 ucnvsel_close(sel); 574 return NULL; 575 } 576 // bit vectors 577 sel->pv = (uint32_t *)p; 578 p += sel->pvCount * 4; 579 // encoding names 580 char* s = (char*)p; 581 for (int32_t i = 0; i < sel->encodingsCount; ++i) { 582 sel->encodings[i] = s; 583 s += uprv_strlen(s) + 1; 584 } 585 p += sel->encodingStrLength; 586 587 return sel; 588 } 589 590 // a bunch of functions for the enumeration thingie! Nothing fancy here. Just 591 // iterate over the selected encodings 592 struct Enumerator { 593 int16_t* index; 594 int16_t length; 595 int16_t cur; 596 const UConverterSelector* sel; 597 }; 598 599 U_CDECL_BEGIN 600 601 static void U_CALLCONV 602 ucnvsel_close_selector_iterator(UEnumeration *enumerator) { 603 uprv_free(((Enumerator*)(enumerator->context))->index); 604 uprv_free(enumerator->context); 605 uprv_free(enumerator); 606 } 607 608 609 static int32_t U_CALLCONV 610 ucnvsel_count_encodings(UEnumeration *enumerator, UErrorCode *status) { 611 // check if already failed 612 if (U_FAILURE(*status)) { 613 return 0; 614 } 615 return ((Enumerator*)(enumerator->context))->length; 616 } 617 618 619 static const char* U_CALLCONV ucnvsel_next_encoding(UEnumeration* enumerator, 620 int32_t* resultLength, 621 UErrorCode* status) { 622 // check if already failed 623 if (U_FAILURE(*status)) { 624 return NULL; 625 } 626 627 int16_t cur = ((Enumerator*)(enumerator->context))->cur; 628 const UConverterSelector* sel; 629 const char* result; 630 if (cur >= ((Enumerator*)(enumerator->context))->length) { 631 return NULL; 632 } 633 sel = ((Enumerator*)(enumerator->context))->sel; 634 result = sel->encodings[((Enumerator*)(enumerator->context))->index[cur] ]; 635 ((Enumerator*)(enumerator->context))->cur++; 636 if (resultLength) { 637 *resultLength = (int32_t)uprv_strlen(result); 638 } 639 return result; 640 } 641 642 static void U_CALLCONV ucnvsel_reset_iterator(UEnumeration* enumerator, 643 UErrorCode* status) { 644 // check if already failed 645 if (U_FAILURE(*status)) { 646 return ; 647 } 648 ((Enumerator*)(enumerator->context))->cur = 0; 649 } 650 651 U_CDECL_END 652 653 654 static const UEnumeration defaultEncodings = { 655 NULL, 656 NULL, 657 ucnvsel_close_selector_iterator, 658 ucnvsel_count_encodings, 659 uenum_unextDefault, 660 ucnvsel_next_encoding, 661 ucnvsel_reset_iterator 662 }; 663 664 665 // internal fn to intersect two sets of masks 666 // returns whether the mask has reduced to all zeros 667 static UBool intersectMasks(uint32_t* dest, const uint32_t* source1, int32_t len) { 668 int32_t i; 669 uint32_t oredDest = 0; 670 for (i = 0 ; i < len ; ++i) { 671 oredDest |= (dest[i] &= source1[i]); 672 } 673 return oredDest == 0; 674 } 675 676 // internal fn to count how many 1's are there in a mask 677 // algorithm taken from http://graphics.stanford.edu/~seander/bithacks.html 678 static int16_t countOnes(uint32_t* mask, int32_t len) { 679 int32_t i, totalOnes = 0; 680 for (i = 0 ; i < len ; ++i) { 681 uint32_t ent = mask[i]; 682 for (; ent; totalOnes++) 683 { 684 ent &= ent - 1; // clear the least significant bit set 685 } 686 } 687 return totalOnes; 688 } 689 690 691 /* internal function! */ 692 static UEnumeration *selectForMask(const UConverterSelector* sel, 693 uint32_t *mask, UErrorCode *status) { 694 // this is the context we will use. Store a table of indices to which 695 // encodings are legit. 696 struct Enumerator* result = (Enumerator*)uprv_malloc(sizeof(Enumerator)); 697 if (result == NULL) { 698 uprv_free(mask); 699 *status = U_MEMORY_ALLOCATION_ERROR; 700 return NULL; 701 } 702 result->index = NULL; // this will be allocated later! 703 result->length = result->cur = 0; 704 result->sel = sel; 705 706 UEnumeration *en = (UEnumeration *)uprv_malloc(sizeof(UEnumeration)); 707 if (en == NULL) { 708 // TODO(markus): Combine Enumerator and UEnumeration into one struct. 709 uprv_free(mask); 710 uprv_free(result); 711 *status = U_MEMORY_ALLOCATION_ERROR; 712 return NULL; 713 } 714 memcpy(en, &defaultEncodings, sizeof(UEnumeration)); 715 en->context = result; 716 717 int32_t columns = (sel->encodingsCount+31)/32; 718 int16_t numOnes = countOnes(mask, columns); 719 // now, we know the exact space we need for index 720 if (numOnes > 0) { 721 result->index = (int16_t*) uprv_malloc(numOnes * sizeof(int16_t)); 722 723 int32_t i, j; 724 int16_t k = 0; 725 for (j = 0 ; j < columns; j++) { 726 uint32_t v = mask[j]; 727 for (i = 0 ; i < 32 && k < sel->encodingsCount; i++, k++) { 728 if ((v & 1) != 0) { 729 result->index[result->length++] = k; 730 } 731 v >>= 1; 732 } 733 } 734 } //otherwise, index will remain NULL (and will never be touched by 735 //the enumerator code anyway) 736 uprv_free(mask); 737 return en; 738 } 739 740 /* check a string against the selector - UTF16 version */ 741 U_CAPI UEnumeration * U_EXPORT2 742 ucnvsel_selectForString(const UConverterSelector* sel, 743 const UChar *s, int32_t length, UErrorCode *status) { 744 // check if already failed 745 if (U_FAILURE(*status)) { 746 return NULL; 747 } 748 // ensure args make sense! 749 if (sel == NULL || (s == NULL && length != 0)) { 750 *status = U_ILLEGAL_ARGUMENT_ERROR; 751 return NULL; 752 } 753 754 int32_t columns = (sel->encodingsCount+31)/32; 755 uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4); 756 if (mask == NULL) { 757 *status = U_MEMORY_ALLOCATION_ERROR; 758 return NULL; 759 } 760 uprv_memset(mask, ~0, columns *4); 761 762 if(s!=NULL) { 763 const UChar *limit; 764 if (length >= 0) { 765 limit = s + length; 766 } else { 767 limit = NULL; 768 } 769 770 while (limit == NULL ? *s != 0 : s != limit) { 771 UChar32 c; 772 uint16_t pvIndex; 773 UTRIE2_U16_NEXT16(sel->trie, s, limit, c, pvIndex); 774 if (intersectMasks(mask, sel->pv+pvIndex, columns)) { 775 break; 776 } 777 } 778 } 779 return selectForMask(sel, mask, status); 780 } 781 782 /* check a string against the selector - UTF8 version */ 783 U_CAPI UEnumeration * U_EXPORT2 784 ucnvsel_selectForUTF8(const UConverterSelector* sel, 785 const char *s, int32_t length, UErrorCode *status) { 786 // check if already failed 787 if (U_FAILURE(*status)) { 788 return NULL; 789 } 790 // ensure args make sense! 791 if (sel == NULL || (s == NULL && length != 0)) { 792 *status = U_ILLEGAL_ARGUMENT_ERROR; 793 return NULL; 794 } 795 796 int32_t columns = (sel->encodingsCount+31)/32; 797 uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4); 798 if (mask == NULL) { 799 *status = U_MEMORY_ALLOCATION_ERROR; 800 return NULL; 801 } 802 uprv_memset(mask, ~0, columns *4); 803 804 if (length < 0) { 805 length = (int32_t)uprv_strlen(s); 806 } 807 808 if(s!=NULL) { 809 const char *limit = s + length; 810 811 while (s != limit) { 812 uint16_t pvIndex; 813 UTRIE2_U8_NEXT16(sel->trie, s, limit, pvIndex); 814 if (intersectMasks(mask, sel->pv+pvIndex, columns)) { 815 break; 816 } 817 } 818 } 819 return selectForMask(sel, mask, status); 820 } 821 822 #endif // !UCONFIG_NO_CONVERSION 823