1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2002-2014, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: uset.h 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2002mar07 16 * created by: Markus W. Scherer 17 * 18 * C version of UnicodeSet. 19 */ 20 21 22 /** 23 * \file 24 * \brief C API: Unicode Set 25 * 26 * <p>This is a C wrapper around the C++ UnicodeSet class.</p> 27 */ 28 29 #ifndef __USET_H__ 30 #define __USET_H__ 31 32 #include "unicode/utypes.h" 33 #include "unicode/uchar.h" 34 #include "unicode/localpointer.h" 35 36 #ifndef USET_DEFINED 37 38 #ifndef U_IN_DOXYGEN 39 #define USET_DEFINED 40 #endif 41 /** 42 * USet is the C API type corresponding to C++ class UnicodeSet. 43 * Use the uset_* API to manipulate. Create with 44 * uset_open*, and destroy with uset_close. 45 * @stable ICU 2.4 46 */ 47 typedef struct USet USet; 48 #endif 49 50 /** 51 * Bitmask values to be passed to uset_openPatternOptions() or 52 * uset_applyPattern() taking an option parameter. 53 * @stable ICU 2.4 54 */ 55 enum { 56 /** 57 * Ignore white space within patterns unless quoted or escaped. 58 * @stable ICU 2.4 59 */ 60 USET_IGNORE_SPACE = 1, 61 62 /** 63 * Enable case insensitive matching. E.g., "[ab]" with this flag 64 * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will 65 * match all except 'a', 'A', 'b', and 'B'. This performs a full 66 * closure over case mappings, e.g. U+017F for s. 67 * 68 * The resulting set is a superset of the input for the code points but 69 * not for the strings. 
70 * It performs a case mapping closure of the code points and adds 71 * full case folding strings for the code points, and reduces strings of 72 * the original set to their full case folding equivalents. 73 * 74 * This is designed for case-insensitive matches, for example 75 * in regular expressions. The full code point case closure allows checking of 76 * an input character directly against the closure set. 77 * Strings are matched by comparing the case-folded form from the closure 78 * set with an incremental case folding of the string in question. 79 * 80 * The closure set will also contain single code points if the original 81 * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.). 82 * This is not necessary (that is, redundant) for the above matching method 83 * but results in the same closure sets regardless of whether the original 84 * set contained the code point or a string. 85 * 86 * @stable ICU 2.4 87 */ 88 USET_CASE_INSENSITIVE = 2, 89 90 /** 91 * Enable case insensitive matching. E.g., "[ab]" with this flag 92 * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will 93 * match all except 'a', 'A', 'b', and 'B'. This adds the lower-, 94 * title-, and uppercase mappings as well as the case folding 95 * of each existing element in the set. 96 * @stable ICU 3.2 97 */ 98 USET_ADD_CASE_MAPPINGS = 4 99 }; 100 101 /** 102 * Argument values for whether span() and similar functions continue while 103 * the current character is contained vs. not contained in the set. 104 * 105 * The functionality is straightforward for sets with only single code points, 106 * without strings (which is the common case): 107 * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE work the same. 108 * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE are inverses of USET_SPAN_NOT_CONTAINED. 109 * - span() and spanBack() partition any string the same way when 110 * alternating between span(USET_SPAN_NOT_CONTAINED) and 111 * span(either "contained" condition). 
112 * - Using a complemented (inverted) set and the opposite span conditions 113 * yields the same results. 114 * 115 * When a set contains multi-code point strings, then these statements may not 116 * be true, depending on the strings in the set (for example, whether they 117 * overlap with each other) and the string that is processed. 118 * For a set with strings: 119 * - The complement of the set contains the opposite set of code points, 120 * but the same set of strings. 121 * Therefore, complementing both the set and the span conditions 122 * may yield different results. 123 * - When starting spans at different positions in a string 124 * (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different 125 * because a set string may start before the later position. 126 * - span(USET_SPAN_SIMPLE) may be shorter than 127 * span(USET_SPAN_CONTAINED) because it will not recursively try 128 * all possible paths. 129 * For example, with a set which contains the three strings "xy", "xya" and "ax", 130 * span("xyax", USET_SPAN_CONTAINED) will return 4 but 131 * span("xyax", USET_SPAN_SIMPLE) will return 3. 132 * span(USET_SPAN_SIMPLE) will never be longer than 133 * span(USET_SPAN_CONTAINED). 134 * - With either "contained" condition, span() and spanBack() may partition 135 * a string in different ways. 136 * For example, with a set which contains the two strings "ab" and "ba", 137 * and when processing the string "aba", 138 * span() will yield contained/not-contained boundaries of { 0, 2, 3 } 139 * while spanBack() will yield boundaries of { 0, 1, 3 }. 140 * 141 * Note: If it is important to get the same boundaries whether iterating forward 142 * or backward through a string, then either only span() should be used and 143 * the boundaries cached for backward operation, or an ICU BreakIterator 144 * could be used. 145 * 146 * Note: Unpaired surrogates are treated like surrogate code points. 
147 * Similarly, set strings match only on code point boundaries, 148 * never in the middle of a surrogate pair. 149 * Illegal UTF-8 sequences are treated like U+FFFD. 150 * When processing UTF-8 strings, malformed set strings 151 * (strings with unpaired surrogates which cannot be converted to UTF-8) 152 * are ignored. 153 * 154 * @stable ICU 3.8 155 */ 156 typedef enum USetSpanCondition { 157 /** 158 * Continues a span() while there is no set element at the current position. 159 * Increments by one code point at a time. 160 * Stops before the first set element (character or string). 161 * (For code points only, this is like while contains(current)==FALSE). 162 * 163 * When span() returns, the substring between where it started and the position 164 * it returned consists only of characters that are not in the set, 165 * and none of its strings overlap with the span. 166 * 167 * @stable ICU 3.8 168 */ 169 USET_SPAN_NOT_CONTAINED = 0, 170 /** 171 * Spans the longest substring that is a concatenation of set elements (characters or strings). 172 * (For characters only, this is like while contains(current)==TRUE). 173 * 174 * When span() returns, the substring between where it started and the position 175 * it returned consists only of set elements (characters or strings) that are in the set. 176 * 177 * If a set contains strings, then the span will be the longest substring for which there 178 * exists at least one non-overlapping concatenation of set elements (characters or strings). 179 * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>. 180 * (Java/ICU/Perl regex stops at the first match of an OR.) 181 * 182 * @stable ICU 3.8 183 */ 184 USET_SPAN_CONTAINED = 1, 185 /** 186 * Continues a span() while there is a set element at the current position. 187 * Increments by the longest matching element at each position. 188 * (For characters only, this is like while contains(current)==TRUE). 
189 * 190 * When span() returns, the substring between where it started and the position 191 * it returned consists only of set elements (characters or strings) that are in the set. 192 * 193 * If a set only contains single characters, then this is the same 194 * as USET_SPAN_CONTAINED. 195 * 196 * If a set contains strings, then the span will be the longest substring 197 * with a match at each position with the longest single set element (character or string). 198 * 199 * Use this span condition together with other longest-match algorithms, 200 * such as ICU converters (ucnv_getUnicodeSet()). 201 * 202 * @stable ICU 3.8 203 */ 204 USET_SPAN_SIMPLE = 2, 205 #ifndef U_HIDE_DEPRECATED_API 206 /** 207 * One more than the last span condition. 208 * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420. 209 */ 210 USET_SPAN_CONDITION_COUNT 211 #endif // U_HIDE_DEPRECATED_API 212 } USetSpanCondition; 213 214 enum { 215 /** 216 * Capacity of USerializedSet::staticArray. 217 * Enough for any single-code point set. 218 * Also provides padding for nice sizeof(USerializedSet). 219 * @stable ICU 2.4 220 */ 221 USET_SERIALIZED_STATIC_ARRAY_CAPACITY=8 222 }; 223 224 /** 225 * A serialized form of a Unicode set. Limited manipulations are 226 * possible directly on a serialized set. See below. 227 * @stable ICU 2.4 228 */ 229 typedef struct USerializedSet { 230 /** 231 * The serialized Unicode Set. 232 * @stable ICU 2.4 233 */ 234 const uint16_t *array; 235 /** 236 * The length of the array that contains BMP characters. 237 * @stable ICU 2.4 238 */ 239 int32_t bmpLength; 240 /** 241 * The total length of the array. 242 * @stable ICU 2.4 243 */ 244 int32_t length; 245 /** 246 * A small buffer for the array to reduce memory allocations. 
247 * @stable ICU 2.4 248 */ 249 uint16_t staticArray[USET_SERIALIZED_STATIC_ARRAY_CAPACITY]; 250 } USerializedSet; 251 252 /********************************************************************* 253 * USet API 254 *********************************************************************/ 255 256 /** 257 * Create an empty USet object. 258 * Equivalent to uset_open(1, 0). 259 * @return a newly created USet. The caller must call uset_close() on 260 * it when done. 261 * @stable ICU 4.2 262 */ 263 U_STABLE USet* U_EXPORT2 264 uset_openEmpty(void); 265 266 /** 267 * Creates a USet object that contains the range of characters 268 * start..end, inclusive. If <code>start > end</code> 269 * then an empty set is created (same as using uset_openEmpty()). 270 * @param start first character of the range, inclusive 271 * @param end last character of the range, inclusive 272 * @return a newly created USet. The caller must call uset_close() on 273 * it when done. 274 * @stable ICU 2.4 275 */ 276 U_STABLE USet* U_EXPORT2 277 uset_open(UChar32 start, UChar32 end); 278 279 /** 280 * Creates a set from the given pattern. See the UnicodeSet class 281 * description for the syntax of the pattern language. 282 * @param pattern a string specifying what characters are in the set 283 * @param patternLength the length of the pattern, or -1 if null 284 * terminated 285 * @param ec the error code 286 * @stable ICU 2.4 287 */ 288 U_STABLE USet* U_EXPORT2 289 uset_openPattern(const UChar* pattern, int32_t patternLength, 290 UErrorCode* ec); 291 292 /** 293 * Creates a set from the given pattern. See the UnicodeSet class 294 * description for the syntax of the pattern language. 295 * @param pattern a string specifying what characters are in the set 296 * @param patternLength the length of the pattern, or -1 if null 297 * terminated 298 * @param options bitmask for options to apply to the pattern. 299 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. 
300 * @param ec the error code 301 * @stable ICU 2.4 302 */ 303 U_STABLE USet* U_EXPORT2 304 uset_openPatternOptions(const UChar* pattern, int32_t patternLength, 305 uint32_t options, 306 UErrorCode* ec); 307 308 /** 309 * Disposes of the storage used by a USet object. This function should 310 * be called exactly once for objects returned by uset_open(). 311 * @param set the object to dispose of 312 * @stable ICU 2.4 313 */ 314 U_STABLE void U_EXPORT2 315 uset_close(USet* set); 316 317 #if U_SHOW_CPLUSPLUS_API 318 319 U_NAMESPACE_BEGIN 320 321 /** 322 * \class LocalUSetPointer 323 * "Smart pointer" class, closes a USet via uset_close(). 324 * For most methods see the LocalPointerBase base class. 325 * 326 * @see LocalPointerBase 327 * @see LocalPointer 328 * @stable ICU 4.4 329 */ 330 U_DEFINE_LOCAL_OPEN_POINTER(LocalUSetPointer, USet, uset_close); 331 332 U_NAMESPACE_END 333 334 #endif 335 336 /** 337 * Returns a copy of this object. 338 * If this set is frozen, then the clone will be frozen as well. 339 * Use uset_cloneAsThawed() for a mutable clone of a frozen set. 340 * @param set the original set 341 * @return the newly allocated copy of the set 342 * @see uset_cloneAsThawed 343 * @stable ICU 3.8 344 */ 345 U_STABLE USet * U_EXPORT2 346 uset_clone(const USet *set); 347 348 /** 349 * Determines whether the set has been frozen (made immutable) or not. 350 * See the ICU4J Freezable interface for details. 351 * @param set the set 352 * @return TRUE/FALSE for whether the set has been frozen 353 * @see uset_freeze 354 * @see uset_cloneAsThawed 355 * @stable ICU 3.8 356 */ 357 U_STABLE UBool U_EXPORT2 358 uset_isFrozen(const USet *set); 359 360 /** 361 * Freeze the set (make it immutable). 362 * Once frozen, it cannot be unfrozen and is therefore thread-safe 363 * until it is deleted. 364 * See the ICU4J Freezable interface for details. 365 * Freezing the set may also make some operations faster, for example 366 * uset_contains() and uset_span(). 
367 * A frozen set will not be modified. (It remains frozen.) 368 * @param set the set 369 * to freeze in place; no value is returned (the function is declared void) 370 * @see uset_isFrozen 371 * @see uset_cloneAsThawed 372 * @stable ICU 3.8 373 */ 374 U_STABLE void U_EXPORT2 375 uset_freeze(USet *set); 376 377 /** 378 * Clone the set and make the clone mutable. 379 * See the ICU4J Freezable interface for details. 380 * @param set the set 381 * @return the mutable clone 382 * @see uset_freeze 383 * @see uset_isFrozen 384 * @see uset_clone 385 * @stable ICU 3.8 386 */ 387 U_STABLE USet * U_EXPORT2 388 uset_cloneAsThawed(const USet *set); 389 390 /** 391 * Causes the USet object to represent the range <code>start - end</code>. 392 * If <code>start > end</code> then this USet is set to an empty range. 393 * A frozen set will not be modified. 394 * @param set the object to set to the given range 395 * @param start first character in the set, inclusive 396 * @param end last character in the set, inclusive 397 * @stable ICU 3.2 398 */ 399 U_STABLE void U_EXPORT2 400 uset_set(USet* set, 401 UChar32 start, UChar32 end); 402 403 /** 404 * Modifies the set to represent the set specified by the given 405 * pattern. See the UnicodeSet class description for the syntax of 406 * the pattern language. See also the User Guide chapter about UnicodeSet. 407 * <em>Empties the set passed before applying the pattern.</em> 408 * A frozen set will not be modified. 409 * @param set The set to which the pattern is to be applied. 410 * @param pattern A pointer to UChar string specifying what characters are in the set. 411 * The character at pattern[0] must be a '['. 412 * @param patternLength The length of the UChar string. -1 if NUL terminated. 413 * @param options A bitmask for options to apply to the pattern. 414 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. 415 * @param status Returns an error if the pattern cannot be parsed. 
416 * @return Upon successful parse, the value is either 417 * the index of the character after the closing ']' 418 * of the parsed pattern. 419 * If the status code indicates failure, then the return value 420 * is the index of the error in the source. 421 * 422 * @stable ICU 2.8 423 */ 424 U_STABLE int32_t U_EXPORT2 425 uset_applyPattern(USet *set, 426 const UChar *pattern, int32_t patternLength, 427 uint32_t options, 428 UErrorCode *status); 429 430 /** 431 * Modifies the set to contain those code points which have the given value 432 * for the given binary or enumerated property, as returned by 433 * u_getIntPropertyValue. Prior contents of this set are lost. 434 * A frozen set will not be modified. 435 * 436 * @param set the object to contain the code points defined by the property 437 * 438 * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1 439 * or UCHAR_INT_START..UCHAR_INT_LIMIT-1 440 * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1. 441 * 442 * @param value a value in the range u_getIntPropertyMinValue(prop).. 443 * u_getIntPropertyMaxValue(prop), with one exception. If prop is 444 * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but 445 * rather a mask value produced by U_GET_GC_MASK(). This allows grouped 446 * categories such as [:L:] to be represented. 447 * 448 * @param ec error code input/output parameter 449 * 450 * @stable ICU 3.2 451 */ 452 U_STABLE void U_EXPORT2 453 uset_applyIntPropertyValue(USet* set, 454 UProperty prop, int32_t value, UErrorCode* ec); 455 456 /** 457 * Modifies the set to contain those code points which have the 458 * given value for the given property. Prior contents of this 459 * set are lost. 460 * A frozen set will not be modified. 461 * 462 * @param set the object to contain the code points defined by the given 463 * property and value alias 464 * 465 * @param prop a string specifying a property alias, either short or long. 466 * The name is matched loosely. 
See PropertyAliases.txt for names and a 467 * description of loose matching. If the value string is empty, then this 468 * string is interpreted as either a General_Category value alias, a Script 469 * value alias, a binary property alias, or a special ID. Special IDs are 470 * matched loosely and correspond to the following sets: 471 * 472 * "ANY" = [\\u0000-\\U0010FFFF], 473 * "ASCII" = [\\u0000-\\u007F], 474 * "Assigned" = [:^Cn:]. 475 * 476 * @param propLength the length of the prop, or -1 if NUL-terminated 477 * 478 * @param value a string specifying a value alias, either short or long. 479 * The name is matched loosely. See PropertyValueAliases.txt for names 480 * and a description of loose matching. In addition to aliases listed, 481 * numeric values and canonical combining classes may be expressed 482 * numerically, e.g., ("nv", "0.5") or ("ccc", "220"). The value string 483 * may also be empty. 484 * 485 * @param valueLength the length of the value, or -1 if NUL-terminated 486 * 487 * @param ec error code input/output parameter 488 * 489 * @stable ICU 3.2 490 */ 491 U_STABLE void U_EXPORT2 492 uset_applyPropertyAlias(USet* set, 493 const UChar *prop, int32_t propLength, 494 const UChar *value, int32_t valueLength, 495 UErrorCode* ec); 496 497 /** 498 * Return true if the given position, in the given pattern, appears 499 * to be the start of a UnicodeSet pattern. 500 * 501 * @param pattern a string specifying the pattern 502 * @param patternLength the length of the pattern, or -1 if NUL-terminated 503 * @param pos the given position 504 * @stable ICU 3.2 505 */ 506 U_STABLE UBool U_EXPORT2 507 uset_resemblesPattern(const UChar *pattern, int32_t patternLength, 508 int32_t pos); 509 510 /** 511 * Returns a string representation of this set. If the result of 512 * calling this function is passed to a uset_openPattern(), it 513 * will produce another set that is equal to this one. 
514 * @param set the set 515 * @param result the string to receive the rules, may be NULL 516 * @param resultCapacity the capacity of result, may be 0 if result is NULL 517 * @param escapeUnprintable if TRUE then convert unprintable 518 * character to their hex escape representations, \\uxxxx or 519 * \\Uxxxxxxxx. Unprintable characters are those other than 520 * U+000A, U+0020..U+007E. 521 * @param ec error code. 522 * @return length of string, possibly larger than resultCapacity 523 * @stable ICU 2.4 524 */ 525 U_STABLE int32_t U_EXPORT2 526 uset_toPattern(const USet* set, 527 UChar* result, int32_t resultCapacity, 528 UBool escapeUnprintable, 529 UErrorCode* ec); 530 531 /** 532 * Adds the given character to the given USet. After this call, 533 * uset_contains(set, c) will return TRUE. 534 * A frozen set will not be modified. 535 * @param set the object to which to add the character 536 * @param c the character to add 537 * @stable ICU 2.4 538 */ 539 U_STABLE void U_EXPORT2 540 uset_add(USet* set, UChar32 c); 541 542 /** 543 * Adds all of the elements in the specified set to this set if 544 * they're not already present. This operation effectively 545 * modifies this set so that its value is the <i>union</i> of the two 546 * sets. The behavior of this operation is unspecified if the specified 547 * collection is modified while the operation is in progress. 548 * A frozen set will not be modified. 549 * 550 * @param set the object to which to add the set 551 * @param additionalSet the source set whose elements are to be added to this set. 552 * @stable ICU 2.6 553 */ 554 U_STABLE void U_EXPORT2 555 uset_addAll(USet* set, const USet *additionalSet); 556 557 /** 558 * Adds the given range of characters to the given USet. After this call, 559 * uset_containsRange(set, start, end) will return TRUE. 560 * A frozen set will not be modified. 
561 * @param set the object to which to add the character 562 * @param start the first character of the range to add, inclusive 563 * @param end the last character of the range to add, inclusive 564 * @stable ICU 2.2 565 */ 566 U_STABLE void U_EXPORT2 567 uset_addRange(USet* set, UChar32 start, UChar32 end); 568 569 /** 570 * Adds the given string to the given USet. After this call, 571 * uset_containsString(set, str, strLen) will return TRUE. 572 * A frozen set will not be modified. 573 * @param set the object to which to add the string 574 * @param str the string to add 575 * @param strLen the length of the string or -1 if null terminated. 576 * @stable ICU 2.4 577 */ 578 U_STABLE void U_EXPORT2 579 uset_addString(USet* set, const UChar* str, int32_t strLen); 580 581 /** 582 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"} 583 * If this set already contains any particular character, it has no effect on that character. 584 * A frozen set will not be modified. 585 * @param set the object to which to add the characters 586 * @param str the source string 587 * @param strLen the length of the string or -1 if null terminated. 588 * @stable ICU 3.4 589 */ 590 U_STABLE void U_EXPORT2 591 uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen); 592 593 /** 594 * Removes the given character from the given USet. After this call, 595 * uset_contains(set, c) will return FALSE. 596 * A frozen set will not be modified. 597 * @param set the object from which to remove the character 598 * @param c the character to remove 599 * @stable ICU 2.4 600 */ 601 U_STABLE void U_EXPORT2 602 uset_remove(USet* set, UChar32 c); 603 604 /** 605 * Removes the given range of characters from the given USet. After this call, 606 * uset_contains(set, c) will return FALSE for every c in the range start..end. 607 * A frozen set will not be modified. 
608 * @param set the object from which to remove the range 609 * @param start the first character of the range to remove, inclusive 610 * @param end the last character of the range to remove, inclusive 611 * @stable ICU 2.2 612 */ 613 U_STABLE void U_EXPORT2 614 uset_removeRange(USet* set, UChar32 start, UChar32 end); 615 616 /** 617 * Removes the given string from the given USet. After this call, 618 * uset_containsString(set, str, strLen) will return FALSE. 619 * A frozen set will not be modified. 620 * @param set the object from which to remove the string 621 * @param str the string to remove 622 * @param strLen the length of the string or -1 if null terminated. 623 * @stable ICU 2.4 624 */ 625 U_STABLE void U_EXPORT2 626 uset_removeString(USet* set, const UChar* str, int32_t strLen); 627 628 /** 629 * Removes from this set all of its elements that are contained in the 630 * specified set. This operation effectively modifies this 631 * set so that its value is the <i>asymmetric set difference</i> of 632 * the two sets. 633 * A frozen set will not be modified. 634 * @param set the object from which the elements are to be removed 635 * @param removeSet the object that defines which elements will be 636 * removed from this set 637 * @stable ICU 3.2 638 */ 639 U_STABLE void U_EXPORT2 640 uset_removeAll(USet* set, const USet* removeSet); 641 642 /** 643 * Retain only the elements in this set that are contained in the 644 * specified range. If <code>start > end</code> then an empty range is 645 * retained, leaving the set empty. This is equivalent to 646 * a boolean logic AND, or a set INTERSECTION. 647 * A frozen set will not be modified. 648 * 649 * @param set the object for which to retain only the specified range 650 * @param start first character, inclusive, of range to be retained 651 * in this set. 652 * @param end last character, inclusive, of range to be retained 653 * in this set. 
654 * @stable ICU 3.2 655 */ 656 U_STABLE void U_EXPORT2 657 uset_retain(USet* set, UChar32 start, UChar32 end); 658 659 /** 660 * Retains only the elements in this set that are contained in the 661 * specified set. In other words, removes from this set all of 662 * its elements that are not contained in the specified set. This 663 * operation effectively modifies this set so that its value is 664 * the <i>intersection</i> of the two sets. 665 * A frozen set will not be modified. 666 * 667 * @param set the object on which to perform the retain 668 * @param retain set that defines which elements this set will retain 669 * @stable ICU 3.2 670 */ 671 U_STABLE void U_EXPORT2 672 uset_retainAll(USet* set, const USet* retain); 673 674 /** 675 * Reallocate this object's internal structures to take up the least 676 * possible space, without changing this object's value. 677 * A frozen set will not be modified. 678 * 679 * @param set the object on which to perform the compact 680 * @stable ICU 3.2 681 */ 682 U_STABLE void U_EXPORT2 683 uset_compact(USet* set); 684 685 /** 686 * Inverts this set. This operation modifies this set so that 687 * its value is its complement. This operation does not affect 688 * the multicharacter strings, if any. 689 * A frozen set will not be modified. 690 * @param set the set 691 * @stable ICU 2.4 692 */ 693 U_STABLE void U_EXPORT2 694 uset_complement(USet* set); 695 696 /** 697 * Complements in this set all elements contained in the specified 698 * set. Any character in the other set will be removed if it is 699 * in this set, or will be added if it is not in this set. 700 * A frozen set will not be modified. 701 * 702 * @param set the set with which to complement 703 * @param complement set that defines which elements will be xor'ed 704 * with this set. 705 * @stable ICU 3.2 706 */ 707 U_STABLE void U_EXPORT2 708 uset_complementAll(USet* set, const USet* complement); 709 710 /** 711 * Removes all of the elements from this set. 
This set will be 712 * empty after this call returns. 713 * A frozen set will not be modified. 714 * @param set the set 715 * @stable ICU 2.4 716 */ 717 U_STABLE void U_EXPORT2 718 uset_clear(USet* set); 719 720 /** 721 * Close this set over the given attribute. For the attribute 722 * USET_CASE, the result is to modify this set so that: 723 * 724 * 1. For each character or string 'a' in this set, all strings or 725 * characters 'b' such that foldCase(a) == foldCase(b) are added 726 * to this set. 727 * 728 * 2. For each string 'e' in the resulting set, if e != 729 * foldCase(e), 'e' will be removed. 730 * 731 * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}] 732 * 733 * (Here foldCase(x) refers to the operation u_strFoldCase, and a 734 * == b denotes that the contents are the same, not pointer 735 * comparison.) 736 * 737 * A frozen set will not be modified. 738 * 739 * @param set the set 740 * 741 * @param attributes bitmask for attributes to close over. 742 * Currently only the USET_CASE bit is supported. Any undefined bits 743 * are ignored. 744 * @stable ICU 4.2 745 */ 746 U_STABLE void U_EXPORT2 747 uset_closeOver(USet* set, int32_t attributes); 748 749 /** 750 * Remove all strings from this set. 751 * 752 * @param set the set 753 * @stable ICU 4.2 754 */ 755 U_STABLE void U_EXPORT2 756 uset_removeAllStrings(USet* set); 757 758 /** 759 * Returns TRUE if the given USet contains no characters and no 760 * strings. 761 * @param set the set 762 * @return true if set is empty 763 * @stable ICU 2.4 764 */ 765 U_STABLE UBool U_EXPORT2 766 uset_isEmpty(const USet* set); 767 768 /** 769 * Returns TRUE if the given USet contains the given character. 770 * This function works faster with a frozen set. 
771 * @param set the set 772 * @param c The codepoint to check for within the set 773 * @return true if set contains c 774 * @stable ICU 2.4 775 */ 776 U_STABLE UBool U_EXPORT2 777 uset_contains(const USet* set, UChar32 c); 778 779 /** 780 * Returns TRUE if the given USet contains all characters c 781 * where start <= c && c <= end. 782 * @param set the set 783 * @param start the first character of the range to test, inclusive 784 * @param end the last character of the range to test, inclusive 785 * @return TRUE if set contains the range 786 * @stable ICU 2.2 787 */ 788 U_STABLE UBool U_EXPORT2 789 uset_containsRange(const USet* set, UChar32 start, UChar32 end); 790 791 /** 792 * Returns TRUE if the given USet contains the given string. 793 * @param set the set 794 * @param str the string 795 * @param strLen the length of the string, or -1 if NUL-terminated. 796 * @return true if set contains str 797 * @stable ICU 2.4 798 */ 799 U_STABLE UBool U_EXPORT2 800 uset_containsString(const USet* set, const UChar* str, int32_t strLen); 801 802 /** 803 * Returns the index of the given character within this set, where 804 * the set is ordered by ascending code point. If the character 805 * is not in this set, return -1. The inverse of this function is 806 * <code>uset_charAt()</code>. 807 * @param set the set 808 * @param c the character to obtain the index for 809 * @return an index from 0..size()-1, or -1 810 * @stable ICU 3.2 811 */ 812 U_STABLE int32_t U_EXPORT2 813 uset_indexOf(const USet* set, UChar32 c); 814 815 /** 816 * Returns the character at the given index within this set, where 817 * the set is ordered by ascending code point. If the index is 818 * out of range, return (UChar32)-1. The inverse of this function is 819 * <code>uset_indexOf()</code>. 820 * @param set the set 821 * @param charIndex an index from 0..size()-1 to obtain the char for 822 * @return the character at the given index, or (UChar32)-1.
823 * @stable ICU 3.2 824 */ 825 U_STABLE UChar32 U_EXPORT2 826 uset_charAt(const USet* set, int32_t charIndex); 827 828 /** 829 * Returns the number of characters and strings contained in the given 830 * USet. 831 * @param set the set 832 * @return a non-negative integer counting the characters and strings 833 * contained in set 834 * @stable ICU 2.4 835 */ 836 U_STABLE int32_t U_EXPORT2 837 uset_size(const USet* set); 838 839 /** 840 * Returns the number of items in this set. An item is either a range 841 * of characters or a single multicharacter string. 842 * @param set the set 843 * @return a non-negative integer counting the character ranges 844 * and/or strings contained in set * @see uset_getItem 845 * @stable ICU 2.4 846 */ 847 U_STABLE int32_t U_EXPORT2 848 uset_getItemCount(const USet* set); 849 850 /** 851 * Returns an item of this set. An item is either a range of 852 * characters or a single multicharacter string. 853 * @param set the set 854 * @param itemIndex a non-negative integer in the range 0.. 855 * uset_getItemCount(set)-1 856 * @param start pointer to variable to receive first character 857 * in range, inclusive 858 * @param end pointer to variable to receive last character in range, 859 * inclusive 860 * @param str buffer to receive the string, may be NULL 861 * @param strCapacity capacity of str, or 0 if str is NULL 862 * @param ec error code 863 * @return the length of the string (>= 2), or 0 if the item is a 864 * range, in which case it is the range *start..*end, or -1 if 865 * itemIndex is out of range * @see uset_getItemCount 866 * @stable ICU 2.4 867 */ 868 U_STABLE int32_t U_EXPORT2 869 uset_getItem(const USet* set, int32_t itemIndex, 870 UChar32* start, UChar32* end, 871 UChar* str, int32_t strCapacity, 872 UErrorCode* ec); 873 874 /** 875 * Returns true if set1 contains all the characters and strings 876 * of set2. It answers the question, 'Is set1 a superset of set2?'
877 * @param set1 the candidate superset 878 * @param set2 the set whose characters and strings must all be in set1 879 * @return true if the test condition is met 880 * @stable ICU 3.2 881 */ 882 U_STABLE UBool U_EXPORT2 883 uset_containsAll(const USet* set1, const USet* set2); 884 885 /** 886 * Returns true if this set contains all the characters 887 * of the given string. This does not check containment of grapheme 888 * clusters, unlike uset_containsString(). 889 * @param set set of characters to be checked for containment 890 * @param str string containing codepoints to be checked for containment 891 * @param strLen the length of the string, or -1 if NUL-terminated. 892 * @return true if the test condition is met 893 * @stable ICU 3.4 894 */ 895 U_STABLE UBool U_EXPORT2 896 uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen); 897 898 /** 899 * Returns true if set1 contains none of the characters and strings 900 * of set2. It answers the question, 'Are set1 and set2 disjoint?' 901 * @param set1 the set to be searched 902 * @param set2 the set whose characters and strings must all be absent from set1 903 * @return true if the test condition is met 904 * @stable ICU 3.2 905 */ 906 U_STABLE UBool U_EXPORT2 907 uset_containsNone(const USet* set1, const USet* set2); 908 909 /** 910 * Returns true if set1 contains some of the characters and strings 911 * of set2. It answers the question, 'Do set1 and set2 have an intersection?'
912 * @param set1 the set to be searched 913 * @param set2 the set of which at least one character or string must be in set1 914 * @return true if the test condition is met 915 * @stable ICU 3.2 916 */ 917 U_STABLE UBool U_EXPORT2 918 uset_containsSome(const USet* set1, const USet* set2); 919 920 /** 921 * Returns the length of the initial substring of the input string which 922 * consists only of characters and strings that are contained in this set 923 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 924 * or only of characters and strings that are not contained 925 * in this set (USET_SPAN_NOT_CONTAINED). 926 * See USetSpanCondition for details. 927 * Similar to the strspn() C library function. 928 * Unpaired surrogates are treated according to contains() of their surrogate code points. 929 * This function works faster with a frozen set and with a non-negative string length argument. 930 * @param set the set 931 * @param s start of the string 932 * @param length the length of the string; can be -1 for NUL-terminated 933 * @param spanCondition specifies the containment condition 934 * @return the length of the initial substring according to the spanCondition; 935 * 0 if the start of the string does not fit the spanCondition 936 * @stable ICU 3.8 937 * @see USetSpanCondition 938 */ 939 U_STABLE int32_t U_EXPORT2 940 uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition); 941 942 /** 943 * Returns the start of the trailing substring of the input string which 944 * consists only of characters and strings that are contained in this set 945 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 946 * or only of characters and strings that are not contained 947 * in this set (USET_SPAN_NOT_CONTAINED). 948 * See USetSpanCondition for details. 949 * Unpaired surrogates are treated according to contains() of their surrogate code points. 950 * This function works faster with a frozen set and with a non-negative string length argument.
951 * @param set the set 952 * @param s start of the string 953 * @param length the length of the string; can be -1 for NUL-terminated 954 * @param spanCondition specifies the containment condition 955 * @return the start of the trailing substring according to the spanCondition; 956 * the string length if the end of the string does not fit the spanCondition 957 * @stable ICU 3.8 958 * @see USetSpanCondition 959 */ 960 U_STABLE int32_t U_EXPORT2 961 uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition); 962 963 /** 964 * Returns the length of the initial substring of the input string which 965 * consists only of characters and strings that are contained in this set 966 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 967 * or only of characters and strings that are not contained 968 * in this set (USET_SPAN_NOT_CONTAINED). 969 * See USetSpanCondition for details. 970 * Similar to the strspn() C library function. 971 * Malformed byte sequences are treated according to contains(0xfffd). 972 * This function works faster with a frozen set and with a non-negative string length argument. 973 * @param set the set 974 * @param s start of the string (UTF-8) 975 * @param length the length of the string; can be -1 for NUL-terminated 976 * @param spanCondition specifies the containment condition 977 * @return the length of the initial substring according to the spanCondition; 978 * 0 if the start of the string does not fit the spanCondition 979 * @stable ICU 3.8 980 * @see USetSpanCondition 981 */ 982 U_STABLE int32_t U_EXPORT2 983 uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition); 984 985 /** 986 * Returns the start of the trailing substring of the input string which 987 * consists only of characters and strings that are contained in this set 988 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 989 * or only of characters and strings that are not contained 990 * in this set (USET_SPAN_NOT_CONTAINED).
991 * See USetSpanCondition for details. 992 * Malformed byte sequences are treated according to contains(0xfffd). 993 * This function works faster with a frozen set and with a non-negative string length argument. 994 * @param set the set 995 * @param s start of the string (UTF-8) 996 * @param length the length of the string; can be -1 for NUL-terminated 997 * @param spanCondition specifies the containment condition 998 * @return the start of the trailing substring according to the spanCondition; 999 * the string length if the end of the string does not fit the spanCondition 1000 * @stable ICU 3.8 1001 * @see USetSpanCondition 1002 */ 1003 U_STABLE int32_t U_EXPORT2 1004 uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition); 1005 1006 /** 1007 * Returns true if set1 contains all of the characters and strings 1008 * of set2, and vice versa. It answers the question, 'Is set1 equal to set2?' 1009 * @param set1 the first set to compare 1010 * @param set2 the second set to compare 1011 * @return true if the test condition is met 1012 * @stable ICU 3.2 1013 */ 1014 U_STABLE UBool U_EXPORT2 1015 uset_equals(const USet* set1, const USet* set2); 1016 1017 /********************************************************************* 1018 * Serialized set API 1019 *********************************************************************/ 1020 1021 /** 1022 * Serializes this set into an array of 16-bit integers. Serialization 1023 * (currently) only records the characters in the set; multicharacter 1024 * strings are ignored. 1025 * 1026 * The array 1027 * has the following format (each line is one 16-bit integer): 1028 * 1029 * length = (n+2*m) | (m!=0?0x8000:0) 1030 * bmpLength = n; present if m!=0 1031 * bmp[0] 1032 * bmp[1] 1033 * ... 1034 * bmp[n-1] 1035 * supp-high[0] 1036 * supp-low[0] 1037 * supp-high[1] 1038 * supp-low[1] 1039 * ... 1040 * supp-high[m-1] 1041 * supp-low[m-1] 1042 * 1043 * The array starts with a header.
After the header are n bmp 1044 * code points, then m supplementary code points. Either n or m 1045 * or both may be zero. n+2*m is always <= 0x7FFF. 1046 * 1047 * If there are no supplementary characters (if m==0) then the 1048 * header is one 16-bit integer, 'length', with value n. 1049 * 1050 * If there are supplementary characters (if m!=0) then the header 1051 * is two 16-bit integers. The first, 'length', has value 1052 * (n+2*m)|0x8000. The second, 'bmpLength', has value n. 1053 * 1054 * After the header the code points are stored in ascending order. 1055 * Supplementary code points are stored as most significant 16 1056 * bits followed by least significant 16 bits. 1057 * 1058 * @param set the set 1059 * @param dest pointer to buffer of destCapacity 16-bit integers. 1060 * May be NULL only if destCapacity is zero. 1061 * @param destCapacity size of dest, or zero. Must not be negative. 1062 * @param pErrorCode pointer to the error code. Will be set to 1063 * U_INDEX_OUTOFBOUNDS_ERROR if n+2*m > 0x7FFF. Will be set to 1064 * U_BUFFER_OVERFLOW_ERROR if n+2*m+(m!=0?2:1) > destCapacity. 1065 * @return the total length of the serialized format, including 1066 * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other 1067 * than U_BUFFER_OVERFLOW_ERROR. 1068 * @stable ICU 2.4 1069 */ 1070 U_STABLE int32_t U_EXPORT2 1071 uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* pErrorCode); 1072 1073 /** 1074 * Given a serialized array, fill in the given serialized set object. 1075 * @param fillSet pointer to result 1076 * @param src pointer to start of array 1077 * @param srcLength length of array 1078 * @return true if the given array is valid, otherwise false * @see uset_serialize 1079 * @stable ICU 2.4 1080 */ 1081 U_STABLE UBool U_EXPORT2 1082 uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength); 1083 1084 /** 1085 * Set the USerializedSet to contain the given character (and nothing 1086 * else).
1087 * @param fillSet pointer to result 1088 * @param c The codepoint to set 1089 * @stable ICU 2.4 1090 */ 1091 U_STABLE void U_EXPORT2 1092 uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c); 1093 1094 /** 1095 * Returns TRUE if the given USerializedSet contains the given 1096 * character. 1097 * @param set the serialized set 1098 * @param c The codepoint to check for within the set 1099 * @return true if set contains c 1100 * @stable ICU 2.4 1101 */ 1102 U_STABLE UBool U_EXPORT2 1103 uset_serializedContains(const USerializedSet* set, UChar32 c); 1104 1105 /** 1106 * Returns the number of disjoint ranges of characters contained in 1107 * the given serialized set. Ignores any strings contained in the 1108 * set. 1109 * @param set the serialized set 1110 * @return a non-negative integer counting the character ranges 1111 * contained in set * @see uset_getSerializedRange 1112 * @stable ICU 2.4 1113 */ 1114 U_STABLE int32_t U_EXPORT2 1115 uset_getSerializedRangeCount(const USerializedSet* set); 1116 1117 /** 1118 * Returns a range of characters contained in the given serialized 1119 * set. 1120 * @param set the serialized set 1121 * @param rangeIndex a non-negative integer in the range 0.. 1122 * uset_getSerializedRangeCount(set)-1 1123 * @param pStart pointer to variable to receive first character 1124 * in range, inclusive 1125 * @param pEnd pointer to variable to receive last character in range, 1126 * inclusive 1127 * @return true if rangeIndex is valid, otherwise false * @see uset_getSerializedRangeCount 1128 * @stable ICU 2.4 1129 */ 1130 U_STABLE UBool U_EXPORT2 1131 uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex, 1132 UChar32* pStart, UChar32* pEnd); 1133 1134 #endif 1135