/*
 * Copyright 2017 Patrick O. Perry.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef UTF8LITE_H
#define UTF8LITE_H

/**
 * \file utf8lite.h
 *
 * Lightweight UTF-8 processing.
 */

#include <limits.h>
#include <stddef.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

/**
 * \defgroup error Error handling
 * @{
 */

/** Maximum error message length, in bytes, not including the trailing NUL */
#define UTF8LITE_MESSAGE_MAX 255

/**
 * Annotates a printf-style variadic function so that the compiler can
 * type-check the format arguments. Expands to nothing on Windows targets
 * and on compilers that do not understand the GNU `format` attribute.
 *
 * \param fmt 1-based index of the format string parameter
 * \param args 1-based index of the first variadic argument
 */
#if !defined(_WIN32) && !defined(_WIN64) \
    && (defined(__GNUC__) || defined(__clang__))
# define UTF8LITE_PRINTF_FORMAT(fmt, args) \
    __attribute__ ((format (printf, fmt, args)))
#else
# define UTF8LITE_PRINTF_FORMAT(fmt, args)
#endif

/**
 * Error code.
 */
enum utf8lite_error_type {
    UTF8LITE_ERROR_NONE = 0, /**< no error */
    UTF8LITE_ERROR_INVAL,    /**< invalid input */
    UTF8LITE_ERROR_NOMEM,    /**< out of memory */
    UTF8LITE_ERROR_OS,       /**< operating system error */
    UTF8LITE_ERROR_OVERFLOW, /**< size exceeds maximum */
    UTF8LITE_ERROR_DOMAIN,   /**< input is out of domain */
    UTF8LITE_ERROR_RANGE,    /**< output is out of range */
    UTF8LITE_ERROR_INTERNAL  /**< internal error */
};

/**
 * Message buffer.
 */
struct utf8lite_message {
    char string[UTF8LITE_MESSAGE_MAX + 1]; /**< NUL-terminated message */
};

/**
 * Set a message to the empty string.
 *
 * \param msg message, or NULL
 */
void utf8lite_message_clear(struct utf8lite_message *msg);

/**
 * Set a message to a formatted string.
 *
 * \param msg message, or NULL
 * \param fmt format string
 * \param ... format arguments
 */
void utf8lite_message_set(struct utf8lite_message *msg, const char *fmt, ...)
    UTF8LITE_PRINTF_FORMAT(2, 3);

/**
 * Append to a message.
 *
 * \param msg message, or NULL
 * \param fmt format string
 * \param ... format arguments
 */
void utf8lite_message_append(struct utf8lite_message *msg, const char *fmt, ...)
    UTF8LITE_PRINTF_FORMAT(2, 3);

/**@}*/

/**
 * \defgroup char Unicode characters
 * @{
 */

/** Missing Unicode value */
#define UTF8LITE_CODE_NONE (-1)

/** Unicode replacement character */
#define UTF8LITE_CODE_REPLACEMENT 0xFFFD

/** Last valid unicode codepoint */
#define UTF8LITE_CODE_MAX 0x10FFFF

/** Number of bits required to encode a codepoint */
#define UTF8LITE_CODE_BITS 21

/** Indicates whether a given unsigned integer is a valid ASCII codepoint */
#define UTF8LITE_IS_ASCII(x) \
    ((x) <= 0x7F)

/**
 * Indicates whether a given unsigned integer is a valid unicode codepoint:
 * at most #UTF8LITE_CODE_MAX and not a surrogate (U+D800 to U+DFFF).
 *
 * The surrogate test is a full range check. The 16-bit code-unit macros
 * #UTF8LITE_IS_UTF16_HIGH and #UTF8LITE_IS_UTF16_LOW only inspect the low
 * 16 bits, so they would wrongly reject supplementary-plane code points
 * such as U+1D800. The argument is evaluated more than once.
 */
#define UTF8LITE_IS_UNICODE(x) \
    (((x) <= UTF8LITE_CODE_MAX) \
     && !((x) >= 0xD800 && (x) <= 0xDFFF))

/**
 * Unicode character width type.
 */
enum utf8lite_charwidth_type {
    UTF8LITE_CHARWIDTH_NONE = 0,  /**< Control and other */
    UTF8LITE_CHARWIDTH_IGNORABLE, /**< Default ignorable */
    UTF8LITE_CHARWIDTH_MARK,      /**< Zero-width mark or format */
    UTF8LITE_CHARWIDTH_NARROW,    /**< Most western alphabets */
    UTF8LITE_CHARWIDTH_AMBIGUOUS, /**< Width depends on context */
    UTF8LITE_CHARWIDTH_WIDE,      /**< Most ideographs */
    UTF8LITE_CHARWIDTH_EMOJI      /**< Emoji presentation */
};

/**
 * Get the width of a Unicode character, using the East Asian Width table and
 * the Emoji data.
 *
 * \param code the codepoint
 *
 * \returns a #utf8lite_charwidth_type value giving the width
 */
int utf8lite_charwidth(int32_t code);

/**
 * Get whether a Unicode character is white space.
 *
 * \param code the codepoint
 *
 * \returns 1 if space, 0 otherwise.
 */
int utf8lite_isspace(int32_t code);

/**
 * Get whether a Unicode character is a default ignorable character.
 *
 * \param code the codepoint
 *
 * \returns 1 if default ignorable, 0 otherwise.
 */
int utf8lite_isignorable(int32_t code);

/**@}*/

/**
 * \defgroup encode Encoding
 * @{
 */

/** Number of bytes in the UTF-8 encoding of a valid unicode codepoint. */
#define UTF8LITE_UTF8_ENCODE_LEN(u) \
    ((u) <= 0x7F     ? 1 : \
     (u) <= 0x07FF   ? 2 : \
     (u) <= 0xFFFF   ? 3 : 4)

/** Number of 16-bit code units in the UTF-16 encoding of a valid unicode
 *  codepoint */
#define UTF8LITE_UTF16_ENCODE_LEN(u) \
    ((u) <= 0xFFFF ? 1 : 2)

/** High (leading) UTF-16 surrogate for a code point in the supplementary
 *  plane (U+10000 to U+10FFFF). The expansion is fully parenthesized so
 *  that it can be combined safely with other operators. */
#define UTF8LITE_UTF16_HIGH(u) \
    (0xD800 | (((unsigned)(u) - 0x010000) >> 10))

/** Low (trailing) UTF-16 surrogate for a code point in the supplementary
 *  plane (U+10000 to U+10FFFF). The expansion is fully parenthesized so
 *  that it can be combined safely with other operators. */
#define UTF8LITE_UTF16_LOW(u) \
    (0xDC00 | (((unsigned)(u) - 0x010000) & 0x03FF))

/** Indicates whether a 16-bit code unit is a UTF-16 high surrogate.
 *  High surrogates are in the range 0xD800 `(1101 1000 0000 0000)`
 *  to 0xDBFF `(1101 1011 1111 1111)`. Only valid for 16-bit inputs; bits
 *  above the low 16 are not examined. */
#define UTF8LITE_IS_UTF16_HIGH(x) (((x) & 0xFC00) == 0xD800)

/** Indicates whether a 16-bit code unit is a UTF-16 low surrogate.
 *  Low surrogates are in the range 0xDC00 `(1101 1100 0000 0000)`
 *  to 0xDFFF `(1101 1111 1111 1111)`. Only valid for 16-bit inputs; bits
 *  above the low 16 are not examined. */
#define UTF8LITE_IS_UTF16_LOW(x) (((x) & 0xFC00) == 0xDC00)

/** Given the high and low UTF-16 surrogates, compute the unicode codepoint. */
#define UTF8LITE_DECODE_UTF16_PAIR(h, l) \
    (((((h) & 0x3FF) << 10) | ((l) & 0x3FF)) + 0x10000)

/** Given the first byte in a valid UTF-8 byte sequence, determine the number of
 *  continuation bytes */
#define UTF8LITE_UTF8_TAIL_LEN(x) \
    (  ((x) & 0x80) == 0x00 ? 0 \
     : ((x) & 0xE0) == 0xC0 ? 1 \
     : ((x) & 0xF0) == 0xE0 ? 2 : 3)

/** Maximum number of UTF-8 continuation bytes in a valid encoded character */
#define UTF8LITE_UTF8_TAIL_MAX 3

/**
 * Validate the first character in a UTF-8 character buffer.
 *
 * \param bufptr a pointer to the input buffer; on exit, a pointer to
 *     the end of the first valid UTF-8 character, or the first invalid
 *     byte in the encoding
 * \param end the end of the input buffer
 * \param msg an error message buffer
 *
 * \returns 0 on success
 */
int utf8lite_scan_utf8(const uint8_t **bufptr, const uint8_t *end,
                       struct utf8lite_message *msg);

/**
 * Decode the first codepoint from a UTF-8 character buffer.
 *
 * \param bufptr on input, a pointer to the start of the character buffer;
 *     on exit, a pointer to the end of the first UTF-8 character in
 *     the buffer
 * \param codeptr on exit, the first codepoint in the buffer
 */
void utf8lite_decode_utf8(const uint8_t **bufptr, int32_t *codeptr);

/**
 * Encode a codepoint into a UTF-8 character buffer. The codepoint must
 * be a valid unicode character (according to #UTF8LITE_IS_UNICODE) and the
 * buffer must have space for at least #UTF8LITE_UTF8_ENCODE_LEN bytes.
 *
 * \param code the codepoint
 * \param bufptr on input, a pointer to the start of the buffer;
 *     on exit, a pointer to the end of the encoded codepoint
 */
void utf8lite_encode_utf8(int32_t code, uint8_t **bufptr);

/**
 * Encode a codepoint in reverse, at the end of a UTF-8 character buffer.
 * The codepoint must be a valid unicode character (according to
 * #UTF8LITE_IS_UNICODE) and the buffer must have space for at least
 * #UTF8LITE_UTF8_ENCODE_LEN bytes.
 *
 * \param code the codepoint
 * \param endptr on input, a pointer to the end of the buffer;
 *     on exit, a pointer to the start of the encoded codepoint
 */
void utf8lite_rencode_utf8(int32_t code, uint8_t **endptr);

/**@}*/

/**
 * \defgroup escape Escape code handling
 * @{
 */

/**
 * Scan a JSON-style backslash (\\) escape.
 *
 * \param bufptr on input, a pointer to the byte after the backslash;
 *     on output, a pointer to the byte after the escape
 * \param end pointer to the end of the buffer
 * \param msg error message buffer
 *
 * \returns 0 on success
 */
int utf8lite_scan_escape(const uint8_t **bufptr, const uint8_t *end,
                         struct utf8lite_message *msg);

/**
 * Scan a JSON-style backslash-u (\\u) escape.
 *
 * \param bufptr on input, a pointer to the byte after the 'u';
 *     on output, a pointer to the byte after the escape
 * \param end pointer to the end of the buffer
 * \param msg error message buffer
 *
 * \returns 0 on success
 */
int utf8lite_scan_uescape(const uint8_t **bufptr, const uint8_t *end,
                          struct utf8lite_message *msg);

/**
 * Decode a JSON-style backslash (\\) escape.
 *
 * \param bufptr on input, a pointer to the byte after the backslash;
 *     on output, a pointer to the byte after the escape
 * \param codeptr on output, a pointer to the decoded UTF-32 character
 */
void utf8lite_decode_escape(const uint8_t **bufptr, int32_t *codeptr);

/**
 * Decode a JSON-style backslash-u (\\u) escape.
 *
 * \param bufptr on input, a pointer to the byte after the 'u';
 *     on output, a pointer to the byte after the escape
 * \param codeptr on output, a pointer to the decoded UTF-32 character
 */
void utf8lite_decode_uescape(const uint8_t **bufptr, int32_t *codeptr);

/**@}*/

/**
 * \defgroup normalize Normalization
 * @{
 */

/**
 * Unicode character decomposition mappings. The compatibility mappings are
 * defined in [UAX #44 Sec. 5.7.3 Character Decomposition Maps]
 * (http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings).
 */
enum utf8lite_decomp_type {
    UTF8LITE_DECOMP_NORMAL = 0,          /**< normalization (required for
                                              NFD) */
    UTF8LITE_DECOMP_FONT = (1 << 0),     /**< font variant */
    UTF8LITE_DECOMP_NOBREAK = (1 << 1),  /**< no-break version of a space
                                              or hyphen */
    UTF8LITE_DECOMP_INITIAL = (1 << 2),  /**< initial presentation form
                                              (Arabic) */
    UTF8LITE_DECOMP_MEDIAL = (1 << 3),   /**< medial presentation form
                                              (Arabic) */
    UTF8LITE_DECOMP_FINAL = (1 << 4),    /**< final presentation form
                                              (Arabic) */
    UTF8LITE_DECOMP_ISOLATED = (1 << 5), /**< isolated presentation form
                                              (Arabic) */
    UTF8LITE_DECOMP_CIRCLE = (1 << 6),   /**< encircled form */
    UTF8LITE_DECOMP_SUPER = (1 << 7),    /**< superscript form */
    UTF8LITE_DECOMP_SUB = (1 << 8),      /**< subscript form */
    UTF8LITE_DECOMP_VERTICAL = (1 << 9), /**< vertical layout presentation
                                              form */
    UTF8LITE_DECOMP_WIDE = (1 << 10),    /**< wide (or zenkaku)
                                              compatibility */
    UTF8LITE_DECOMP_NARROW = (1 << 11),  /**< narrow (or hankaku)
                                              compatibility */
    UTF8LITE_DECOMP_SMALL = (1 << 12),   /**< small variant form
                                              (CNS compatibility) */
    UTF8LITE_DECOMP_SQUARE = (1 << 13),  /**< CJK squared font variant */
    UTF8LITE_DECOMP_FRACTION = (1 << 14),/**< vulgar fraction form */
    UTF8LITE_DECOMP_COMPAT = (1 << 15),  /**< unspecified compatibility */

    UTF8LITE_DECOMP_ALL = ((1 << 16) - 1)/**< all decompositions
                                              (required for NFKD) */
};

/**
 * Unicode case folding. These are defined in *TR44* Sec. 5.6.
 * The single flag occupies the bit just above #UTF8LITE_DECOMP_ALL, so the
 * two enumerations can be combined in one bitmask.
 */
enum utf8lite_casefold_type {
    UTF8LITE_CASEFOLD_NONE = 0,        /**< no case folding */
    UTF8LITE_CASEFOLD_ALL = (1 << 16)  /**< perform case folding */
};

/**
 * Maximum size (in codepoints) of a single code point's decomposition.
 *
 * From *TR44* Sec. 5.7.3: "Compatibility mappings are guaranteed to be no
 * longer than 18 characters, although most consist of just a few characters."
 */
#define UTF8LITE_UNICODE_DECOMP_MAX 18

/**
 * Apply decomposition and/or casefold mapping to a Unicode character,
 * outputting the result to the specified buffer. The output will be at
 * most #UTF8LITE_UNICODE_DECOMP_MAX codepoints.
 *
 * \param type a bitmask composed from #utf8lite_decomp_type and
 *     #utf8lite_casefold_type values specifying the mapping type
 * \param code the input codepoint
 * \param bufptr on entry, a pointer to the output buffer; on exit,
 *     a pointer past the last output codepoint
 */
void utf8lite_map(int type, int32_t code, int32_t **bufptr);

/**
 * Apply the canonical ordering algorithm to put an array of Unicode
 * codepoints in normal order. See *Unicode* Sec 3.11 and *TR44* Sec. 5.7.4.
 *
 * \param ptr a pointer to the first codepoint
 * \param len the number of codepoints
 */
void utf8lite_order(int32_t *ptr, size_t len);

/**
 * Apply the canonical composition algorithm to put an array of
 * canonically-ordered Unicode codepoints into composed form.
 *
 * \param ptr a pointer to the first codepoint
 * \param lenptr on entry, a pointer to the number of input codepoints;
 *     on exit, a pointer to the number of composed codepoints
 */
void utf8lite_compose(int32_t *ptr, size_t *lenptr);

/**@}*/

/**
 * \defgroup text UTF-8 encoded text
 * @{
 */

/** Whether the text might contain a backslash (`\`) that should be
 *  interpreted as an escape; stored in the most significant bit of the
 *  attribute word */
#define UTF8LITE_TEXT_ESC_BIT ((size_t)1 << (CHAR_BIT * sizeof(size_t) - 1))

/** Size of the encoded text, in bytes; (decoded size) <= (encoded size) */
#define UTF8LITE_TEXT_SIZE_MASK ((size_t)SIZE_MAX >> 1)

/** Maximum size of encoded text, in bytes. */
#define UTF8LITE_TEXT_SIZE_MAX UTF8LITE_TEXT_SIZE_MASK

/** The encoded size of the text, in bytes */
#define UTF8LITE_TEXT_SIZE(text) ((text)->attr & UTF8LITE_TEXT_SIZE_MASK)

/** The text attribute bits */
#define UTF8LITE_TEXT_BITS(text) ((text)->attr & ~UTF8LITE_TEXT_SIZE_MASK)

/** Indicates whether the text might contain a backslash (`\`) that should
 *  be interpreted as an escape code */
#define UTF8LITE_TEXT_HAS_ESC(text) \
    (((text)->attr & UTF8LITE_TEXT_ESC_BIT) ? 1 : 0)

/**
 * Flags for utf8lite_text_assign().
 */
enum utf8lite_text_flag {
    /** validate the input */
    UTF8LITE_TEXT_UNKNOWN = 0,

    /** do not perform any validation on the input */
    UTF8LITE_TEXT_VALID = (1 << 0),

    /** interpret backslash (`\`) as an escape */
    UTF8LITE_TEXT_UNESCAPE = (1 << 1)
};

/**
 * UTF-8 encoded text, possibly containing JSON-compatible backslash (`\`)
 * escape codes which should be interpreted as such. The client assumes
 * all responsibility for managing the memory for the underlying UTF-8 data.
 */
struct utf8lite_text {
    uint8_t *ptr; /**< pointer to valid UTF-8 data */
    size_t attr;  /**< text attributes: the size in the low bits (see
                       #UTF8LITE_TEXT_SIZE) and flag bits above (see
                       #UTF8LITE_TEXT_BITS) */
};

/**
 * Assign a text value to point to data in the specified memory location
 * after validating the input data.
 *
 * \param text the text value
 * \param ptr a pointer to the underlying memory buffer
 * \param size the number of bytes in the underlying memory buffer
 * \param flags #utf8lite_text_flag bitmask specifying input type
 * \param msg an error message buffer, or NULL
 *
 * \returns 0 on success
 */
int utf8lite_text_assign(struct utf8lite_text *text,
                         const uint8_t *ptr, size_t size, int flags,
                         struct utf8lite_message *msg);

/**
 * Initialize a new text object by allocating space for and copying
 * the encoded characters from another text object.
 *
 * \param text the object to initialize
 * \param other the object to copy
 *
 * \returns 0 on success, or non-zero on memory allocation failure
 */
int utf8lite_text_init_copy(struct utf8lite_text *text,
                            const struct utf8lite_text *other);

/**
 * Indicates whether the text definitely decodes to ASCII. For this to be
 * true, the text must be encoded in ASCII and not have any escapes that
 * decode to non-ASCII codepoints.
 *
 * \param text the text
 */
int utf8lite_text_isascii(const struct utf8lite_text *text);

/**
 * Free the resources associated with a text object.
 *
 * \param text the text object
 */
void utf8lite_text_destroy(struct utf8lite_text *text);

/**
 * Compute a hash code from a text.
 *
 * \param text the text
 *
 * \returns the hash code.
 */
size_t utf8lite_text_hash(const struct utf8lite_text *text);

/**
 * Test whether two texts are equal (bitwise). Bitwise equality is more
 * stringent than decoding to the same value.
 *
 * \param text1 the first text
 * \param text2 the second text
 *
 * \returns non-zero if the texts are equal, zero otherwise
 */
int utf8lite_text_equals(const struct utf8lite_text *text1,
                         const struct utf8lite_text *text2);

/**
 * Compare two texts.
 *
 * \param text1 the first text
 * \param text2 the second text
 *
 * \returns zero if the two encoded texts are identical; a negative value
 *     if the first value is less than the second; a positive value
 *     if the first value is greater than the second
 */
int utf8lite_text_compare(const struct utf8lite_text *text1,
                          const struct utf8lite_text *text2);

/**@}*/

/**
 * \defgroup textiter Text iteration
 * @{
 */

/**
 * An iterator over the decoded UTF-32 characters in a text.
 */
struct utf8lite_text_iter {
    const uint8_t *ptr; /**< current position in the text buffer */
    const uint8_t *end; /**< end of the text buffer */
    size_t text_attr;   /**< text attributes */
    int32_t current;    /**< current character (UTF-32) */
};

/**
 * Initialize a text iterator to start at the beginning of a text.
 *
 * \param it the iterator
 * \param text the text
 */
void utf8lite_text_iter_make(struct utf8lite_text_iter *it,
                             const struct utf8lite_text *text);

/**
 * Advance to the next character in a text.
 *
 * \param it the text iterator
 *
 * \returns non-zero if the iterator successfully advanced; zero if
 *     the iterator has passed the end of the text
 */
int utf8lite_text_iter_advance(struct utf8lite_text_iter *it);

/**
 * Retreat to the previous character in a text.
 *
 * \param it the text iterator
 *
 * \returns non-zero if the iterator successfully backed up; zero if
 *     the iterator has passed the start of the text.
 */
int utf8lite_text_iter_retreat(struct utf8lite_text_iter *it);

/**
 * Reset an iterator to the start of the text.
 *
 * \param it the text iterator
 */
void utf8lite_text_iter_reset(struct utf8lite_text_iter *it);

/**
 * Skip an iterator to the end of the text.
 *
 * \param it the text iterator
 */
void utf8lite_text_iter_skip(struct utf8lite_text_iter *it);

/**@}*/

/**
 * \defgroup textmap Text normalization map
 * @{
 */

/**
 * Map descriptor. At a minimum, convert the text to
 * composed normal form (NFC). Optionally, apply compatibility maps for
 * NFKC normal form and/or apply other transformations:
 *
 *  + #UTF8LITE_TEXTMAP_CASE: perform case folding, in most languages
 *    (including English) mapping uppercase characters to their lowercase
 *    equivalents, but also performing other normalizations like mapping
 *    the German Eszett (ß) to "ss"; see
 *    _The Unicode Standard_ Sec. 5.18 "Case Mappings"
 *    and the
 *    [Case Mapping FAQ](http://unicode.org/faq/casemap_charprop.html)
 *    for more information
 *
 *  + #UTF8LITE_TEXTMAP_COMPAT: apply all compatibility maps required for
 *    [NFKC normal form](http://unicode.org/reports/tr15/#Norm_Forms)
 *
 *  + #UTF8LITE_TEXTMAP_QUOTE: quote fold, replace single quotes and
 *    Unicode apostrophe with ASCII apostrophe (U+0027)
 *
 *  + #UTF8LITE_TEXTMAP_RMDI: remove default ignorables (DI) like soft
 *    hyphens and zero-width spaces, anything with the
 *    [Default_Ignorable_Code_Point=Yes]
 *    (http://www.unicode.org/reports/tr44/#Default_Ignorable_Code_Point)
 *    property
 */
enum utf8lite_textmap_type {
    UTF8LITE_TEXTMAP_NORMAL = 0,        /**< transform to composed normal
                                             form */
    UTF8LITE_TEXTMAP_CASE = (1 << 0),   /**< perform case folding */
    UTF8LITE_TEXTMAP_COMPAT = (1 << 1), /**< apply compatibility mappings */
    UTF8LITE_TEXTMAP_QUOTE = (1 << 2),  /**< replace apostrophe with `'` */
    UTF8LITE_TEXTMAP_RMDI = (1 << 3)    /**< remove default ignorables */
};

/**
 * Text normalization map.
 */
struct utf8lite_textmap {
    struct utf8lite_text text; /**< result of the most recent call to
                                    utf8lite_textmap_set() */
    int8_t ascii_map[128];     /**< a lookup table for the mappings of ASCII
                                    characters; -1 indicates deletion */
    int32_t *codes;            /**< buffer for intermediate UTF-32 decoding */
    size_t size_max;           /**< text size maximum; normalizing a larger
                                    text will force a reallocation */
    int type;                  /**< the map type descriptor, a bit mask
                                    of #utf8lite_textmap_type values */
    int charmap_type;          /**< the unicode map type, a bit mask of
                                    #utf8lite_decomp_type and
                                    #utf8lite_casefold_type values */
};

/**
 * Initialize a new text map of the specified kind.
 *
 * \param map the text map
 * \param type a bitmask of #utf8lite_textmap_type values, specifying
 *     the map type
 *
 * \returns 0 on success
 */
int utf8lite_textmap_init(struct utf8lite_textmap *map, int type);

/**
 * Release the resources associated with a text map.
 *
 * \param map the text map
 */
void utf8lite_textmap_destroy(struct utf8lite_textmap *map);

/**
 * Given input text, set a map to the corresponding output text.
 *
 * \param map the text map
 * \param text the text
 *
 * \returns 0 on success
 */
int utf8lite_textmap_set(struct utf8lite_textmap *map,
                         const struct utf8lite_text *text);

/**@}*/

/**
 * \defgroup graphscan Character graphemes
 * @{
 */

/**
 * Grapheme cluster.
 */
struct utf8lite_graph {
    struct utf8lite_text text; /**< grapheme code sequence */
};

/**
 * Grapheme scanner, for iterating over the graphemes in a text. Grapheme
 * boundaries are determined according to
 * [UAX #29, Unicode Text Segmentation][uax29],
 * using the extended grapheme cluster rules.
 *
 * [uax29]: http://unicode.org/reports/tr29/
 */
struct utf8lite_graphscan {
    struct utf8lite_text_iter iter; /**< iterator pointed at next code */
    const uint8_t *ptr;             /**< next code's start */
    int prop;                       /**< next code's break property */
    struct utf8lite_graph current;  /**< current grapheme */
};

/**
 * Create a grapheme scanner over a text object.
 *
 * \param scan the scanner to initialize
 * \param text the text
 */
void utf8lite_graphscan_make(struct utf8lite_graphscan *scan,
                             const struct utf8lite_text *text);

/**
 * Advance a scanner to the next grapheme.
 *
 * \param scan the scanner
 *
 * \returns non-zero on success, zero if at the end of the text
 */
int utf8lite_graphscan_advance(struct utf8lite_graphscan *scan);

/**
 * Retreat a scanner to the previous grapheme.
 *
 * \param scan the scanner
 *
 * \returns non-zero on success, zero if at the start of the text
 */
int utf8lite_graphscan_retreat(struct utf8lite_graphscan *scan);

/**
 * Reset a scanner to the beginning of the text.
 *
 * \param scan the scanner
 */
void utf8lite_graphscan_reset(struct utf8lite_graphscan *scan);

/**
 * Skip a scanner to the end of the text.
 *
 * \param scan the scanner
 */
void utf8lite_graphscan_skip(struct utf8lite_graphscan *scan);

/**@}*/

/**
 * \defgroup wordscan Word boundaries
 * @{
 */

/**
 * A word scanner, for iterating over the words in a text. Word boundaries
 * are determined according to [UAX #29, Unicode Text Segmentation][uax29].
 * You can test the word boundary rules in an interactive
 * [online demo][demo].
 *
 * [demo]: http://unicode.org/cldr/utility/breaks.jsp
 * [uax29]: http://unicode.org/reports/tr29/
 */
struct utf8lite_wordscan {
    int32_t code;        /**< next code point */
    size_t attr;         /**< next code's attributes */
    int prop;            /**< next code's word break property */
    const uint8_t *ptr;  /**< next code's start */

    struct utf8lite_text_iter iter; /**< an iterator over the input,
                                         positioned past next code */
    int iter_prop;           /**< iterator code's word break property */
    const uint8_t *iter_ptr; /**< iterator code's start */

    struct utf8lite_text current; /**< the current word */
};

/**
 * Create a word scanner over a text object.
 *
 * \param scan the scanner to initialize
 * \param text the text
 */
void utf8lite_wordscan_make(struct utf8lite_wordscan *scan,
                            const struct utf8lite_text *text);

/**
 * Advance a scanner to the next word.
 *
 * \param scan the scanner
 *
 * \returns non-zero on success, zero if at the end of the text
 */
int utf8lite_wordscan_advance(struct utf8lite_wordscan *scan);

/**
 * Reset a scanner to the beginning of the text.
 *
 * \param scan the scanner
 */
void utf8lite_wordscan_reset(struct utf8lite_wordscan *scan);

/**@}*/

/**
 * \defgroup render Text rendering
 * @{
 */

/**
 * Render escaping type. Specifies that certain code-points require
 * special handling.
 */
enum utf8lite_escape_type {
    UTF8LITE_ESCAPE_NONE = 0,            /**< no special escaping */
    UTF8LITE_ESCAPE_CONTROL = (1 << 0),  /**< control and other codes */
    UTF8LITE_ESCAPE_DQUOTE = (1 << 1),   /**< ASCII double quote */
    UTF8LITE_ESCAPE_SQUOTE = (1 << 2),   /**< ASCII single quote */
    UTF8LITE_ESCAPE_EXTENDED = (1 << 3), /**< extended-plane UTF-8 */
    UTF8LITE_ESCAPE_UTF8 = (1 << 4)      /**< non-ASCII UTF-8 */
};

/**
 * Render encoding type. The values occupy bits above those used by
 * #utf8lite_escape_type so that both can be combined in one bitmask.
 */
enum utf8lite_encode_type {
    UTF8LITE_ENCODE_C = 0,                /**< C-compatible escapes */
    UTF8LITE_ENCODE_JSON = (1 << 5),      /**< JSON-compatible escapes */
    UTF8LITE_ENCODE_EMOJIZWSP = (1 << 6), /**< put ZWSP after emoji */
    UTF8LITE_ENCODE_RMDI = (1 << 7),      /**< remove default ignorables */
    UTF8LITE_ENCODE_AMBIGWIDE = (1 << 8)  /**< assume that ambiguous-width
                                               characters are wide */
};

/**
 * Get the width of a grapheme under the specified render settings. If
 * the grapheme contains a non-escaped control character, report the width
 * as -1.
 *
 * NOTE(review): the description of \p widthptr below states a width of 0
 * for a non-escaped control, which conflicts with the -1 stated above;
 * confirm against the implementation.
 *
 * \param g the grapheme
 * \param flags a bitmask of #utf8lite_escape_type and #utf8lite_encode_type
 *     values specifying the encoding settings
 * \param widthptr if non-NULL, a pointer to store the width on exit
 *     (0 if the grapheme is empty or a non-escaped control)
 *
 * \returns 0 on success
 */
int utf8lite_graph_measure(const struct utf8lite_graph *g, int flags,
                           int *widthptr);

/**
 * Renderer, for printing objects as strings.
 */
struct utf8lite_render {
    char *string;   /**< the rendered string (null terminated) */
    int length;     /**< the length of the rendered string, not
                         including the null terminator */
    int length_max; /**< the maximum capacity of the rendered
                         string before requiring reallocation, not
                         including the null terminator */
    int flags;      /**< the flags, a bitmask of
                         #utf8lite_escape_type and
                         #utf8lite_encode_type values,
                         specifying escaping behavior */

    const char *tab; /**< the tab string, for indenting */
    int tab_length;  /**< the length in bytes of the tab string,
                          not including the null terminator */

    const char *newline; /**< the newline string, for advancing
                              to the next line */
    int newline_length;  /**< the length in bytes of the newline string,
                              not including the null terminator */

    const char *style_open;  /**< the escape style graphic parameters,
                                  for styling backslash escapes */
    const char *style_close; /**< the escape style graphic parameters,
                                  for restoring state after styling a
                                  backslash escape */
    int style_open_length;   /**< length in bytes of the style_open string,
                                  not including the null terminator */
    int style_close_length;  /**< length in bytes of the style_close string,
                                  not including the null terminator */

    int indent;       /**< the current indent level */
    int needs_indent; /**< whether to indent before the next
                           character */
    int error;        /**< the code for the last error that
                           occurred, or zero if none */
};

/**
 * Initialize a new render object.
 *
 * \param r the render object
 * \param flags a bitmask of #utf8lite_escape_type and #utf8lite_encode_type
 *     values specifying escaping behavior
 *
 * \returns 0 on success
 */
int utf8lite_render_init(struct utf8lite_render *r, int flags);

/**
 * Release a render object's resources.
 *
 * \param r the render object
 */
void utf8lite_render_destroy(struct utf8lite_render *r);

/**
 * Reset the render object to the empty string and set the indent level to 0.
 * Leave the escape flags, the tab, and the newline string at their current
 * values.
 *
 * \param r the render object
 */
void utf8lite_render_clear(struct utf8lite_render *r);

/**
 * Set the escaping behavior.
 *
 * \param r the render object
 * \param flags a bit mask of #utf8lite_escape_type values
 *
 * \returns 0 on success
 */
int utf8lite_render_set_flags(struct utf8lite_render *r, int flags);

/**
 * Set the tab string. The client must not free the passed-in tab
 * string until either the render object is destroyed or a new tab
 * string gets set.
 *
 * \param r the render object
 * \param tab the tab string (null terminated)
 *
 * \returns 0 on success
 */
int utf8lite_render_set_tab(struct utf8lite_render *r, const char *tab);

/**
 * Set the new line string. The client must not free the passed-in newline
 * string until either the render object is destroyed or a new newline
 * string gets set.
 *
 * \param r the render object
 * \param newline the newline string (null terminated)
 *
 * \returns 0 on success
 */
int utf8lite_render_set_newline(struct utf8lite_render *r, const char *newline);

/**
 * Set the escape style strings. The client must not free the passed-in
 * strings until the render object is destroyed or new style
 * strings get set.
 *
 * \param r the render object
 * \param open the string to render before a backslash escape.
 * \param close the string to render after a backslash escape.
 *
 * \returns 0 on success
 */
int utf8lite_render_set_style(struct utf8lite_render *r,
                              const char *open, const char *close);

/**
 * Increase or decrease the indent level.
 *
 * \param r the render object
 * \param nlevel the number of levels to add or subtract to the indent
 *
 * \returns 0 on success
 */
int utf8lite_render_indent(struct utf8lite_render *r, int nlevel);

/**
 * Add new lines.
 *
 * \param r the render object
 * \param nline the number of new lines to add
 *
 * \returns 0 on success
 */
int utf8lite_render_newlines(struct utf8lite_render *r, int nline);

/**
 * Render a character grapheme. If any render escape flags are set, filter
 * the grapheme through the appropriate escaping and encoding.
 *
 * \param r the render object
 * \param g the grapheme
 *
 * \returns 0 on success
 */
int utf8lite_render_graph(struct utf8lite_render *r,
                          const struct utf8lite_graph *g);

/**
 * Render a single character, treating it as a grapheme cluster. If any
 * render escape flags are set, filter the character through the
 * appropriate escaping and encoding.
 *
 * \param r the render object
 * \param ch the character
 *
 * \returns 0 on success
 */
int utf8lite_render_char(struct utf8lite_render *r, int32_t ch);

/**
 * Render multiple copies of a character, treating each as a grapheme
 * cluster.
 *
 * \param r the render object
 * \param ch the character
 * \param nchar the number of copies to render
 *
 * \returns 0 on success
 */
int utf8lite_render_chars(struct utf8lite_render *r, int32_t ch, int nchar);

/**
 * Render a string. If any render escape flags are set, filter
 * all character graphemes through the appropriate escaping.
 *
 * \param r the render object
 * \param str the string, valid UTF-8
 *
 * \returns 0 on success
 */
int utf8lite_render_string(struct utf8lite_render *r, const char *str);

/**
 * Render formatted text. If any render escape flags are set, filter
 * all character graphemes through the appropriate escaping.
 *
 * \param r the render object
 * \param format the format string
 * \param ... format arguments
 *
 * \returns presumably 0 on success, like the other render functions
 *     (NOTE(review): not stated in the original; confirm)
 */
int utf8lite_render_printf(struct utf8lite_render *r, const char *format, ...)
    UTF8LITE_PRINTF_FORMAT(2, 3);

/**
 * Render a text object. If any render escape flags are set, filter
 * all character graphemes through the appropriate escaping.
 *
 * \param r the render object
 * \param text the text object
 *
 * \returns 0 on success
 */
int utf8lite_render_text(struct utf8lite_render *r,
                         const struct utf8lite_text *text);

/**
 * Append a sequence of raw bytes to the render buffer. Ignore any special
 * handling specified by the render flags.
 *
 * \param r the render object
 * \param bytes the byte array
 * \param size the number of bytes
 *
 * \returns 0 on success.
 */
int utf8lite_render_raw(struct utf8lite_render *r, const char *bytes,
                        size_t size);

/**@}*/

#ifdef __cplusplus
}
#endif

#endif /* UTF8LITE_H */