1 /*=========================================================================== 2 * 3 * PUBLIC DOMAIN NOTICE 4 * National Center for Biotechnology Information 5 * 6 * This software/database is a "United States Government Work" under the 7 * terms of the United States Copyright Act. It was written as part of 8 * the author's official duties as a United States Government employee and 9 * thus cannot be copyrighted. This software/database is freely available 10 * to the public for use. The National Library of Medicine and the U.S. 11 * Government have not placed any restriction on its use or reproduction. 12 * 13 * Although all reasonable efforts have been taken to ensure the accuracy 14 * and reliability of the software and data, the NLM and the U.S. 15 * Government do not and cannot warrant the performance or results that 16 * may be obtained by using this software or data. The NLM and the U.S. 17 * Government disclaim all warranties, express or implied, including 18 * warranties of performance, merchantability or fitness for any particular 19 * purpose. 20 * 21 * Please cite the author in any work or product based on this material. 22 * 23 * =========================================================================== 24 * 25 */ 26 27 #ifndef _h_klib_text_ 28 #define _h_klib_text_ 29 30 #ifndef _h_klib_extern_ 31 #include <klib/extern.h> 32 #endif 33 34 #ifndef _h_klib_defs_ 35 #include <klib/defs.h> 36 #endif 37 38 #include <stdarg.h> 39 #include <string.h> 40 41 #ifdef __cplusplus 42 extern "C" { 43 #endif 44 45 /*-------------------------------------------------------------------------- 46 * UTF-8 47 * UNICODE TRANFORMATION FORMAT into sequences of 8-bit bytes 48 * 49 * An ASCII character encoded with UTF-8 formatting will have a bit 50 * pattern of 0b0xxxxxxx. 51 * 52 * A non-ASCII UNICODE character encoded with UTF-8 formatting will 53 * use 2..6 bytes, where the character length will be encoded into 54 * the number of contiguous bits in the leading byte: 55 * 56 * 0b110xxxxx => 2 byte character 57 * 0b1110xxxx => 3 byte character 58 * 0b11110xxx => 4 byte character 59 * 0b111110xx => 5 byte character 60 * 0b1111110x => 6 byte character 61 * 62 * all UTF-8 characters uniformly represent their non-leading bytes 63 * by having the MSB set and the next bit 0: 64 * 65 * 0b10xxxxxx => non-leading byte 66 * 67 * this allows a pointer to an arbitrary byte within UTF-8 to 68 * be used to synchronize on the start of a character, since 69 * non-starting bytes cannot be confused with start bytes. The test 70 * 71 * if ( ( * utf8 & 0xC0 ) == 0x80 ) 72 * start_byte = false; 73 * 74 * is sufficient for detecting this. 75 * 76 * an understanding that string LENGTH ( the count of characters ) 77 * and string SIZE ( the count of bytes ) are different is critical. 78 */ 79 80 81 /*-------------------------------------------------------------------------- 82 * String 83 * pseudo-intrinsic string 84 * 85 * "addr" gives a pointer to constant UTF-8 data 86 * 87 * "size" gives the number of BYTES in the UTF-8 string, 88 * NOT the number of characters. 89 * 90 * "len" gives the number of CHARACTERS in the UTF-8 string, 91 * NOT the number of bytes. 92 * 93 * with UTF-8 encoding, only ASCII-7 characters will be represented 94 * using a single byte per character. In the case that a string is 95 * composed of 100% ASCII-7 characters, String.size == String.len, 96 * while if there is even a single UTF-8 character, 97 * String.size > String.len by definition. 98 * 99 * indexing into a string by BYTES is ridiculous because it may 100 * land in the middle of a character. String operations index by 101 * CHARACTER so as to always designate an entire character. 102 */ 103 typedef struct String String; 104 struct String 105 { 106 const char *addr; 107 size_t size; 108 uint32_t len; 109 }; 110 111 /* StringInit 112 * initializes a String object 113 */ 114 #define StringInit( s, val, sz, length ) \ 115 ( void ) \ 116 ( ( s ) -> addr = ( val ), \ 117 ( s ) -> size = ( sz ), \ 118 ( s ) -> len = ( length ) ) 119 120 #define StringInitCString( s, cstr ) \ 121 ( void ) \ 122 ( ( s ) -> len = string_measure \ 123 ( ( s ) -> addr = ( cstr ), & ( s ) -> size ) ) 124 125 /* CONST_STRING 126 * initialize a string from a manifest constant 127 */ 128 #define CONST_STRING( s, val ) \ 129 StringInit ( s, val, sizeof val - 1, sizeof val - 1 ) 130 131 /* StringSize 132 * size of string in BYTES 133 */ 134 #define StringSize( s ) \ 135 ( s ) -> size 136 137 /* StringLength 138 * length of string in CHARACTERS 139 */ 140 #define StringLength( s ) \ 141 ( s ) -> len 142 143 /* StringCopy 144 * allocates a copy of a string 145 */ 146 KLIB_EXTERN rc_t CC StringCopy ( const String **cpy, const String *str ); 147 148 /* StringConcat 149 * concatenate one string onto another 150 */ 151 KLIB_EXTERN rc_t CC StringConcat ( const String **cat, 152 const String *a, const String *b ); 153 154 /* StringSubstr 155 * creates a substring of an existing one 156 * note that the substring is always a non-allocated copy 157 * and is dependent upon the lifetime of its source 158 * 159 * returns "sub" if "idx" was valid 160 * or NULL otherwise 161 * 162 * "len" may be 0 to indicate infinite length 163 * or may extend beyond end of source string. 164 */ 165 KLIB_EXTERN String* CC StringSubstr ( const String *str, 166 String *sub, uint32_t idx, uint32_t len ); 167 168 /* StringTrim 169 * trims ascii white-space from both ends 170 * returns trimmed string in "trimmed" 171 */ 172 KLIB_EXTERN String * CC StringTrim ( const String * str, String * trimmed ); 173 174 /* StringHead 175 * access the first character 176 * 177 * this is an efficient enough function to be included. 178 * the generic functions of accessing characters by index 179 * are apt to be extremely inefficient with UTF-8, and 180 * as such are not included. 181 * 182 * returns EINVAL if the character is bad, 183 * or ENODATA if the string is empty 184 */ 185 KLIB_EXTERN rc_t CC StringHead ( const String *str, uint32_t *ch ); 186 187 /* StringPopHead 188 * remove and return the first character 189 * 190 * returns EINVAL if the character is bad, 191 * or ENODATA if the string is empty 192 */ 193 KLIB_EXTERN rc_t CC StringPopHead ( String *str, uint32_t *ch ); 194 195 /* StringEqual 196 * compare strings for equality 197 * not lexical for all characters 198 */ 199 #define StringEqual( a, b ) \ 200 ( ( a ) -> size == ( b ) -> size && \ 201 memcmp ( ( a ) -> addr, ( b ) -> addr, ( a ) -> size ) == 0 ) 202 203 /* StringCompare 204 * compare strings for relative ordering 205 * not strictly lexical - generally orders by character code 206 */ 207 KLIB_EXTERN int CC StringCompare ( const String *a, const String *b ); 208 209 /* StringCaseEqual 210 * compare strings for case-insensitive equality 211 */ 212 KLIB_EXTERN bool CC StringCaseEqual ( const String *a, const String *b ); 213 214 /* StringCaseCompare 215 * compare strings for relative case-insensitive ordering 216 */ 217 KLIB_EXTERN int CC StringCaseCompare ( const String *a, const String *b ); 218 219 /* StringOrder 220 * StringOrderNoNullCheck 221 * compares strings as quickly as possible for 222 * deterministic ordering: first by length, then 223 * by binary ( byte-wise ) content. 224 * 225 * performs more quickly than StringCompare for cases 226 * where only deterministic ordering is needed ( e.g. symbol table ). 227 * 228 * the "NoNullCheck" version will crash if either a or b are NULL 229 * but avoids the overhead of checking when both are known to be good 230 */ 231 KLIB_EXTERN int64_t CC StringOrder ( const String *a, const String *b ); 232 KLIB_EXTERN int64_t CC StringOrderNoNullCheck ( const String *a, const String *b ); 233 234 /* StringMatch 235 * creates a substring of "a" in "match" 236 * for all of the sequential matching characters between "a" and "b" 237 * starting from character [ 0 ]. 238 * 239 * returns the number of characters that match. 240 */ 241 KLIB_EXTERN uint32_t CC StringMatch ( String *match, 242 const String *a, const String *b ); 243 244 /* StringMatchExtend 245 * extends a substring of "a" in "match" 246 * for all of the sequential matching characters between "a" and "b" 247 * starting from character [ match -> len ]. 248 * 249 * returns the number of matching characters that were extended. 250 */ 251 KLIB_EXTERN uint32_t CC StringMatchExtend ( String *match, 252 const String *a, const String *b ); 253 254 /* StringHash 255 * hash value for string 256 */ 257 #define StringHash( s ) \ 258 string_hash ( ( s ) -> addr, ( s ) -> size ) 259 260 /* StringCopyUTF... 261 * creates a String from UTF16 or UTF32 UNICODE input 262 * wchar_t is one or the other, depending upon OS and compiler. 263 */ 264 KLIB_EXTERN rc_t CC StringCopyUTF16 ( const String **cpy, 265 const uint16_t *text, size_t bytes ); 266 KLIB_EXTERN rc_t CC StringCopyUTF32 ( const String **cpy, 267 const uint32_t *text, size_t bytes ); 268 269 /* StringCopyWChar_t 270 * wchar_t is alternately defined as 16 or 32 bits 271 */ 272 #define StringCopyWChar_t( cpy, text, bytes ) \ 273 ( ( sizeof ( wchar_t ) == sizeof ( uint16_t ) ) ? \ 274 StringCopyUTF16 ( cpy, ( const uint16_t* ) ( text ), bytes ) : \ 275 StringCopyUTF32 ( cpy, ( const uint32_t* ) ( text ), bytes ) ) 276 277 /* StringWhack 278 * deallocates a string 279 * ignores strings not allocated by this library 280 */ 281 KLIB_EXTERN void CC StringWhack ( const String* self ); 282 283 284 /* StringToInt 285 * simple string conversion functions 286 * 287 * these functions are defined to consume the entire string. 288 * leading spaces are tolerated, repeated signs are accepted for signed conversion, 289 * decimal and hex encodings are accepted for unsigned conversion, 290 * decimal only for signed conversion. 291 * 292 * "optional_rc" [ OUT, NULL OKAY ] - if non-null, user is interested 293 * in error conditions. if the parameter is present, the string must be 294 * completely consumed without overflow. 295 * 296 * optional return values ( with { GetRCObject ( rc ), GetRCState ( rc ) }: 297 * 0 : no error 298 * { rcRange, rcExcessive } : integer overflow 299 * { rcTransfer, rcIncomplete } : extra characters remain in string 300 * { rcData, rcInsufficient } : no numeric text was found 301 * 302 * return values - regardless of "optional_rc": 303 * val : when no error 304 * val : on incomplete transfer 305 * +/- max int64_t : when signed overflow occurs ( StringToI64 only ) 306 * max uint64_t : when unsigned overflow occurs ( StringToU64 only ) 307 * 0 : when no input text is found 308 */ 309 KLIB_EXTERN int64_t StringToI64 ( const String * self, rc_t * optional_rc ); 310 KLIB_EXTERN uint64_t StringToU64 ( const String * self, rc_t * optional_rc ); 311 312 313 /*-------------------------------------------------------------------------- 314 * raw text strings 315 * the internal representation of text strings is implementation 316 * dependent. it is assumed to be ASCII-7 or UTF-8, although 317 * this is determined by the implementation library of these functions. 318 * 319 * NB - ASCII implementations are no longer being provided 320 * all text handling is UTF-8 unless explictly stated otherwise 321 */ 322 323 /* string_size 324 * length of string in bytes 325 */ 326 KLIB_EXTERN size_t CC string_size ( const char *str ); 327 328 /* string_len 329 * length of string in characters, when the size is known 330 */ 331 KLIB_EXTERN uint32_t CC string_len ( const char *str, size_t size ); 332 333 /* string_measure 334 * measures length of string in both characters and bytes 335 */ 336 KLIB_EXTERN uint32_t CC string_measure ( const char *str, size_t *size ); 337 338 /* string_copy 339 * copies whole character text into a buffer 340 * terminates with NUL byte if possible 341 * returns the number of bytes copied 342 */ 343 KLIB_EXTERN size_t CC string_copy ( char *dst, size_t dst_size, 344 const char *src, size_t src_size ); 345 346 /* string_copy_measure 347 * copies whole character text into a buffer 348 * terminates with NUL byte if possible 349 * returns the number of bytes copied 350 */ 351 KLIB_EXTERN size_t CC string_copy_measure ( char *dst, size_t dst_size, const char *src ); 352 353 /* string_dup 354 * replaces the broken C library strndup 355 * creates a NUL-terminated malloc'd string 356 */ 357 KLIB_EXTERN char* CC string_dup ( const char *str, size_t size ); 358 359 /* string_dup_measure 360 * replaces the broken C library strdup 361 * creates a NUL-terminated malloc'd string 362 * returns size of string unless "size" is NULL 363 */ 364 KLIB_EXTERN char* CC string_dup_measure ( const char *str, size_t *size ); 365 366 /* tolower_copy 367 * copies whole character text in lower-case 368 * terminates with NUL byte if possible 369 * returns the number of bytes copied 370 */ 371 KLIB_EXTERN size_t CC tolower_copy ( char *dst, size_t dst_size, 372 const char *src, size_t src_size ); 373 374 /* toupper_copy 375 * copies whole character text in upper-case 376 * terminates with NUL byte if possible 377 * returns the number of bytes copied 378 */ 379 KLIB_EXTERN size_t CC toupper_copy ( char *dst, size_t dst_size, 380 const char *src, size_t src_size ); 381 382 /* string_cmp 383 * performs a safe strncmp 384 * 385 * "max_chars" limits the extent of the comparison 386 * to not exceed supplied value, i.e. the number of 387 * characters actually compared will be the minimum 388 * of asize, bsize and max_chars. 389 * 390 * if either string size ( or both ) < max_chars and 391 * all compared characters match, then the result will 392 * be a comparison of asize against bsize. 393 */ 394 KLIB_EXTERN int CC string_cmp ( const char *a, size_t asize, 395 const char *b, size_t bsize, uint32_t max_chars ); 396 397 /* strcase_cmp 398 * like string_cmp except case insensitive 399 */ 400 KLIB_EXTERN int CC strcase_cmp ( const char *a, size_t asize, 401 const char *b, size_t bsize, uint32_t max_chars ); 402 403 /* string_match 404 * returns the number of matching characters 405 * 406 * "max_chars" limits the extent of the comparison 407 * to not exceed supplied value, i.e. the number of 408 * characters actually compared will be the minimum 409 * of asize, bsize and max_chars. 410 * 411 * "msize" will be set to the size of the matched string 412 * if not NULL 413 */ 414 KLIB_EXTERN uint32_t CC string_match ( const char *a, size_t asize, 415 const char *b, size_t bsize, uint32_t max_chars, size_t *msize ); 416 417 /* strcase_match 418 * like string_match except case insensitive 419 */ 420 KLIB_EXTERN uint32_t CC strcase_match ( const char *a, size_t asize, 421 const char *b, size_t bsize, uint32_t max_chars, size_t *msize ); 422 423 /* string_chr 424 * performs a safe strchr 425 * "ch" is in UTF32 426 */ 427 KLIB_EXTERN char* CC string_chr ( const char *str, size_t size, uint32_t ch ); 428 429 /* string_rchr 430 * performs a safe strrchr 431 */ 432 KLIB_EXTERN char* CC string_rchr ( const char *str, size_t size, uint32_t ch ); 433 434 /* string_brk 435 * performs a safe strpbrk 436 */ 437 #if 0 438 KLIB_EXTERN char* CC string_brk ( const char *str, size_t size, 439 const char *accept, size_t asize ); 440 #endif 441 442 /* string_rbrk 443 */ 444 #if 0 445 KLIB_EXTERN char* CC string_rbrk ( const char *str, size_t size, 446 const char *accept, size_t asize ); 447 #endif 448 449 /* string_hash 450 * hashes a string 451 */ 452 KLIB_EXTERN uint32_t CC string_hash ( const char *str, size_t size ); 453 454 /* string_idx 455 * seek an indexed character 456 * 457 * the efficiency is based upon chosen internal 458 * string representation, which, when using single byte chars, 459 * is simple and efficient. 460 * 461 * on the other hand, UTF-8 has a variable character width, 462 * requiring scanning of the entire string until the indexed 463 * character is found. 464 */ 465 KLIB_EXTERN char* CC string_idx ( const char *str, size_t size, uint32_t idx ); 466 467 468 /* string_to_int 469 * simple string conversion functions 470 * 471 * these functions are defined to consume the entire string. 472 * leading spaces are tolerated, repeated signs are accepted for signed conversion, 473 * decimal and hex encodings are accepted for unsigned conversion, 474 * decimal only for signed conversion. 475 * 476 * "optional_rc" [ OUT, NULL OKAY ] - if non-null, user is interested 477 * in error conditions. if the parameter is present, the string must be 478 * completely consumed without overflow. 479 * 480 * optional return values ( with { GetRCObject ( rc ), GetRCState ( rc ) }: 481 * 0 : no error 482 * { rcRange, rcExcessive } : integer overflow 483 * { rcTransfer, rcIncomplete } : extra characters remain in string 484 * { rcData, rcInsufficient } : no numeric text was found 485 * 486 * return values - regardless of "optional_rc": 487 * val : when no error 488 * val : on incomplete transfer 489 * +/- max int64_t : when signed overflow occurs ( StringToI64 only ) 490 * max uint64_t : when unsigned overflow occurs ( StringToU64 only ) 491 * 0 : when no input text is found 492 */ 493 KLIB_EXTERN int64_t string_to_I64 ( const char * str, size_t size, rc_t * optional_rc ); 494 KLIB_EXTERN uint64_t string_to_U64 ( const char * str, size_t size, rc_t * optional_rc ); 495 496 497 /*-------------------------------------------------------------------------- 498 * conversion between UTF-32 and UTF-8 UNICODE 499 */ 500 501 /* utf8_utf32 502 * converts UTF-8 text to a single UTF-32 character 503 * returns the number of UTF8 bytes consumed, such that: 504 * return > 0 means success 505 * return == 0 means insufficient input 506 * return < 0 means bad input 507 */ 508 KLIB_EXTERN int CC utf8_utf32 ( uint32_t *ch, const char *begin, const char *end ); 509 510 /* utf32_utf8 511 * converts a single UTF-32 character to UTF-8 text 512 * returns the number of UTF8 bytes generated, such that: 513 * return > 0 means success 514 * return == 0 means insufficient output 515 * return < 0 means bad character 516 */ 517 KLIB_EXTERN int CC utf32_utf8 ( char *begin, char *end, uint32_t ch ); 518 519 520 /*-------------------------------------------------------------------------- 521 * support for 16 and 32-bit UTF formats 522 */ 523 524 /* utf16_string_size/len/measure 525 * measures UTF-16 strings 526 */ 527 KLIB_EXTERN size_t CC utf16_string_size ( const uint16_t *str ); 528 KLIB_EXTERN uint32_t CC utf16_string_len ( const uint16_t *str, size_t size ); 529 KLIB_EXTERN uint32_t CC utf16_string_measure ( const uint16_t *str, size_t *size ); 530 531 /* utf32_string_size/len/measure 532 */ 533 KLIB_EXTERN size_t CC utf32_string_size ( const uint32_t *str ); 534 KLIB_EXTERN uint32_t CC utf32_string_len ( const uint32_t *str, size_t size ); 535 KLIB_EXTERN uint32_t CC utf32_string_measure ( const uint32_t *str, size_t *size ); 536 537 /* wchar_string_size/len/measure 538 * measures wchar_t strings 539 */ 540 KLIB_EXTERN size_t CC wchar_string_size ( const wchar_t *str ); 541 KLIB_EXTERN uint32_t CC wchar_string_len ( const wchar_t *str, size_t size ); 542 KLIB_EXTERN uint32_t CC wchar_string_measure ( const wchar_t *str, size_t *size ); 543 544 /* conversion from UTF-16 to internal standard */ 545 KLIB_EXTERN uint32_t CC utf16_cvt_string_len ( const uint16_t *src, 546 size_t src_size, size_t *dst_size ); 547 KLIB_EXTERN uint32_t CC utf16_cvt_string_measure ( const uint16_t *src, 548 size_t *src_size, size_t *dst_size ); 549 KLIB_EXTERN size_t CC utf16_cvt_string_copy ( char *dst, size_t dst_size, 550 const uint16_t *src, size_t src_size ); 551 552 /* conversion from UTF-32 to internal standard */ 553 KLIB_EXTERN uint32_t CC utf32_cvt_string_len ( const uint32_t *src, 554 size_t src_size, size_t *dst_size ); 555 KLIB_EXTERN uint32_t CC utf32_cvt_string_measure ( const uint32_t *src, 556 size_t *src_size, size_t *dst_size ); 557 KLIB_EXTERN size_t CC utf32_cvt_string_copy ( char *dst, size_t dst_size, 558 const uint32_t *src, size_t src_size ); 559 560 /* conversion from wchar_t to internal standard */ 561 KLIB_EXTERN uint32_t CC wchar_cvt_string_len ( const wchar_t *src, 562 size_t src_size, size_t *dst_size ); 563 KLIB_EXTERN uint32_t CC wchar_cvt_string_measure ( const wchar_t *src, 564 size_t *src_size, size_t *dst_size ); 565 KLIB_EXTERN size_t CC wchar_cvt_string_copy ( char *dst, size_t dst_size, 566 const wchar_t *src, size_t src_size ); 567 568 /* conversion to wchar_t from internal standard */ 569 KLIB_EXTERN size_t CC string_cvt_wchar_copy ( wchar_t *dst, size_t dst_size, 570 const char *src, size_t src_size ); 571 572 /*-------------------------------------------------------------------------- 573 * support for ISO-8859-x 8-bit character sets 574 */ 575 576 /* iso8859_utf32 577 * converts 8-bit text to a single UTF-32 character 578 * returns the number of 8-bit bytes consumed, such that: 579 * return > 0 means success 580 * return == 0 means insufficient input 581 * return < 0 means bad input 582 */ 583 KLIB_EXTERN int CC iso8859_utf32 ( const uint32_t map [ 128 ], 584 uint32_t *ch, const char *begin, const char *end ); 585 586 /* iso8859_string_size/len/measure 587 * measures UTF-16 strings 588 */ 589 KLIB_EXTERN size_t CC iso8859_string_size ( const uint32_t map [ 128 ], 590 const char *str ); 591 KLIB_EXTERN uint32_t CC iso8859_string_len ( const uint32_t map [ 128 ], 592 const char *str, size_t size ); 593 KLIB_EXTERN uint32_t CC iso8859_string_measure ( const uint32_t map [ 128 ], 594 const char *str, size_t *size ); 595 596 /* conversion from ISO-8859-x to internal standard */ 597 KLIB_EXTERN uint32_t CC iso8859_cvt_string_len ( const uint32_t map [ 128 ], 598 const char *src, size_t src_size, size_t *dst_size ); 599 KLIB_EXTERN uint32_t CC iso8859_cvt_string_measure ( const uint32_t map [ 128 ], 600 const char *src, size_t *src_size, size_t *dst_size ); 601 KLIB_EXTERN size_t CC iso8859_cvt_string_copy ( const uint32_t map [ 128 ], 602 char *dst, size_t dst_size, const char *src, size_t src_size ); 603 604 /* some externally defined character maps */ 605 KLIB_EXTERN_DATA const uint32_t iso8859_1 [ 128 ]; 606 KLIB_EXTERN_DATA const uint32_t cp1252 [ 128 ]; 607 608 609 #ifdef __cplusplus 610 } 611 #endif 612 613 #endif /* _h_klib_text_ */ 614