1 /*
2  * Copyright 2017 Patrick O. Perry.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef UTF8LITE_H
18 #define UTF8LITE_H
19 
20 /**
21  * \file utf8lite.h
22  *
23  * Lightweight UTF-8 processing.
24  */
25 
26 #include <limits.h>
27 #include <stddef.h>
28 #include <stdint.h>
29 
30 /**
31  * \defgroup error Error handling
32  * @{
33  */
34 
/**
 * Upper bound on the length of an error message, measured in bytes and
 * excluding the terminating NUL.
 */
#define UTF8LITE_MESSAGE_MAX 255
37 
/**
 * Error codes reported by the library.
 */
enum utf8lite_error_type {
	/** no error */
	UTF8LITE_ERROR_NONE = 0,
	/** invalid input */
	UTF8LITE_ERROR_INVAL = 1,
	/** out of memory */
	UTF8LITE_ERROR_NOMEM = 2,
	/** operating system error */
	UTF8LITE_ERROR_OS = 3,
	/** size exceeds maximum */
	UTF8LITE_ERROR_OVERFLOW = 4,
	/** input is out of domain */
	UTF8LITE_ERROR_DOMAIN = 5,
	/** output is out of range */
	UTF8LITE_ERROR_RANGE = 6,
	/** internal error */
	UTF8LITE_ERROR_INTERNAL = 7
};
51 
52 /**
53  * Message buffer.
54  */
55 struct utf8lite_message {
56 	char string[UTF8LITE_MESSAGE_MAX + 1]; /**< NUL-terminated message */
57 };
58 
/**
 * Reset a message so that it holds the empty string.
 *
 * \param msg the message to clear, or NULL
 */
void utf8lite_message_clear(struct utf8lite_message *msg);
65 
/**
 * Replace the contents of a message with a printf-style formatted string.
 *
 * \param msg message, or NULL
 * \param fmt format string
 * \param ... format arguments
 */
void utf8lite_message_set(struct utf8lite_message *msg, const char *fmt, ...)
#if !defined(_WIN32) && !defined(_WIN64)
	/* ask the compiler to check the arguments against the format */
	__attribute__ ((format (printf, 2, 3)));
#else
	;
#endif
79 
/**
 * Add a printf-style formatted string to the end of a message.
 *
 * \param msg message, or NULL
 * \param fmt format string
 * \param ... format arguments
 */
void utf8lite_message_append(struct utf8lite_message *msg, const char *fmt, ...)
#if !defined(_WIN32) && !defined(_WIN64)
	/* ask the compiler to check the arguments against the format */
	__attribute__ ((format (printf, 2, 3)));
#else
	;
#endif
93 
94 /**@}*/
95 
96 /**
97  * \defgroup char Unicode characters
98  * @{
99  */
100 
/**
 * Missing Unicode value. The negative literal is parenthesized so that it
 * always expands as a single operand.
 */
#define UTF8LITE_CODE_NONE (-1)
103 
/** The Unicode replacement character, U+FFFD */
#define UTF8LITE_CODE_REPLACEMENT 0xFFFD

/** Largest valid Unicode codepoint, U+10FFFF */
#define UTF8LITE_CODE_MAX 0x10FFFF

/** Bit width needed to represent any codepoint */
#define UTF8LITE_CODE_BITS 21
112 
/** Test whether an unsigned integer is a valid ASCII codepoint
 *  (at most 0x7F). */
#define UTF8LITE_IS_ASCII(x) ((x) <= 0x7F)
116 
/**
 * Indicates whether a given unsigned integer is a valid unicode codepoint:
 * no greater than #UTF8LITE_CODE_MAX and not a UTF-16 surrogate
 * (U+D800 to U+DFFF).
 *
 * The surrogate test compares against the full range instead of masking
 * the low 16 bits; masking would wrongly reject supplementary-plane
 * codepoints such as U+1D800 whose low bits resemble a surrogate.
 * The argument is evaluated more than once.
 */
#define UTF8LITE_IS_UNICODE(x) \
	(((x) <= UTF8LITE_CODE_MAX) \
	 && ((x) < 0xD800 || (x) > 0xDFFF))
122 
/**
 * Width classes for Unicode characters.
 */
enum utf8lite_charwidth_type {
	/** Control and other */
	UTF8LITE_CHARWIDTH_NONE = 0,
	/** Default ignorable */
	UTF8LITE_CHARWIDTH_IGNORABLE = 1,
	/** Zero-width mark or format */
	UTF8LITE_CHARWIDTH_MARK = 2,
	/** Most western alphabets */
	UTF8LITE_CHARWIDTH_NARROW = 3,
	/** Width depends on context */
	UTF8LITE_CHARWIDTH_AMBIGUOUS = 4,
	/** Most ideographs */
	UTF8LITE_CHARWIDTH_WIDE = 5,
	/** Emoji presentation */
	UTF8LITE_CHARWIDTH_EMOJI = 6
};
135 
/**
 * Look up the width class of a Unicode character, based on the East Asian
 * Width table and the Emoji data.
 *
 * \param code the codepoint
 *
 * \returns the width, as a #utf8lite_charwidth_type value
 */
int utf8lite_charwidth(int32_t code);
145 
/**
 * Test whether a Unicode character is white space.
 *
 * \param code the codepoint
 *
 * \returns 1 if the character is a space, 0 otherwise.
 */
int utf8lite_isspace(int32_t code);
154 
/**
 * Test whether a Unicode character is a default ignorable character.
 *
 * \param code the codepoint
 *
 * \returns 1 if the character is default ignorable, 0 otherwise.
 */
int utf8lite_isignorable(int32_t code);
163 
164 /**@}*/
165 
166 /**
167  * \defgroup encode Encoding
168  * @{
169  */
170 
/** Length, in bytes, of the UTF-8 encoding of a valid unicode codepoint.
 *  The argument is evaluated more than once. */
#define UTF8LITE_UTF8_ENCODE_LEN(u) \
	(  (u) <= 0x7F   ? 1 \
	 : (u) <= 0x07FF ? 2 \
	 : (u) <= 0xFFFF ? 3 : 4)
176 
/** Length, in 16-bit code units, of the UTF-16 encoding of a valid unicode
 *  codepoint */
#define UTF8LITE_UTF16_ENCODE_LEN(u) (((u) <= 0xFFFF) ? 1 : 2)
181 
/** High (leading) UTF-16 surrogate for a code point in the supplementary
 *  planes (U+10000 to U+10FFFF). The whole expansion is parenthesized so
 *  that surrounding operators (such as `&` or `==`) apply to the complete
 *  surrogate value rather than to part of the expression. */
#define UTF8LITE_UTF16_HIGH(u) \
	(0xD800 | (((unsigned)(u) - 0x010000) >> 10))
186 
/** Low (trailing) UTF-16 surrogate for a code point in the supplementary
 *  planes (U+10000 to U+10FFFF). The whole expansion is parenthesized so
 *  that surrounding operators (such as `&` or `==`) apply to the complete
 *  surrogate value rather than to part of the expression. */
#define UTF8LITE_UTF16_LOW(u) \
	(0xDC00 | (((unsigned)(u) - 0x010000) & 0x03FF))
191 
/** Test whether a 16-bit code unit is a UTF-16 high surrogate, that is,
 *  whether it lies between 0xD800 `(1101 1000 0000 0000)` and
 *  0xDBFF `(1101 1011 1111 1111)`. */
#define UTF8LITE_IS_UTF16_HIGH(x) \
	(0xD800 == ((x) & 0xFC00))
196 
/** Test whether a 16-bit code unit is a UTF-16 low surrogate, that is,
 *  whether it lies between 0xDC00 `(1101 1100 0000 0000)` and
 *  0xDFFF `(1101 1111 1111 1111)`. */
#define UTF8LITE_IS_UTF16_LOW(x) \
	(0xDC00 == ((x) & 0xFC00))
201 
/** Combine a high and a low UTF-16 surrogate into the unicode codepoint
 *  they encode. */
#define UTF8LITE_DECODE_UTF16_PAIR(h, l) \
	(0x10000 + ((((h) & 0x3FF) << 10) | ((l) & 0x3FF)))
205 
/** Number of continuation bytes that follow the given first byte of a
 *  valid UTF-8 byte sequence. The argument is evaluated more than once. */
#define UTF8LITE_UTF8_TAIL_LEN(x) \
	(((x) & 0x80) == 0x00 ? 0 : \
	 ((x) & 0xE0) == 0xC0 ? 1 : \
	 ((x) & 0xF0) == 0xE0 ? 2 : 3)
212 
/** Largest number of UTF-8 continuation bytes that a valid encoded
 *  character can have */
#define UTF8LITE_UTF8_TAIL_MAX 3
215 
/**
 * Check that a buffer starts with a valid UTF-8 character.
 *
 * \param bufptr on entry, a pointer to the input buffer; on exit, a
 * 	pointer to the end of the first valid UTF-8 character, or to the
 * 	first invalid byte in the encoding
 * \param end the end of the input buffer
 * \param msg an error message buffer
 *
 * \returns 0 on success
 */
int utf8lite_scan_utf8(const uint8_t **bufptr,
		       const uint8_t *end,
		       struct utf8lite_message *msg);
229 
/**
 * Read the first codepoint out of a UTF-8 character buffer.
 *
 * \param bufptr on entry, a pointer to the start of the character buffer;
 * 	on exit, a pointer to the end of the first UTF-8 character in
 * 	the buffer
 * \param codeptr on exit, receives the first codepoint in the buffer
 */
void utf8lite_decode_utf8(const uint8_t **bufptr,
			  int32_t *codeptr);
239 
/**
 * Write the UTF-8 encoding of a codepoint into a character buffer.
 * The caller must pass a valid unicode character (as judged by
 * #UTF8LITE_IS_UNICODE) and a buffer with room for at least
 * #UTF8LITE_UTF8_ENCODE_LEN bytes.
 *
 * \param code the codepoint
 * \param bufptr on entry, a pointer to the start of the buffer;
 * 	on exit, a pointer to the end of the encoded codepoint
 */
void utf8lite_encode_utf8(int32_t code,
			  uint8_t **bufptr);
250 
/**
 * Write the UTF-8 encoding of a codepoint in reverse, ending at the end
 * of a character buffer. The caller must pass a valid unicode character
 * (as judged by #UTF8LITE_IS_UNICODE) and a buffer with room for at least
 * #UTF8LITE_UTF8_ENCODE_LEN bytes.
 *
 * \param code the codepoint
 * \param endptr on entry, a pointer to the end of the buffer;
 * 	on exit, a pointer to the start of the encoded codepoint
 */
void utf8lite_rencode_utf8(int32_t code,
			   uint8_t **endptr);
262 
263 /**@}*/
264 
265 /**
266  * \defgroup escape Escape code handling
267  * @{
268  */
269 
/**
 * Scan past a JSON-style backslash (\\) escape.
 *
 * \param bufptr on entry, a pointer to the byte after the backslash;
 * 	on exit, a pointer to the byte after the escape
 * \param end pointer to the end of the buffer
 * \param msg error message buffer
 *
 * \returns 0 on success
 */
int utf8lite_scan_escape(const uint8_t **bufptr,
			 const uint8_t *end,
			 struct utf8lite_message *msg);
282 
/**
 * Scan past a JSON-style backslash-u (\\u) escape.
 *
 * \param bufptr on entry, a pointer to the byte after the 'u';
 * 	on exit, a pointer to the byte after the escape
 * \param end pointer to the end of the buffer
 * \param msg error message buffer
 *
 * \returns 0 on success
 */
int utf8lite_scan_uescape(const uint8_t **bufptr,
			  const uint8_t *end,
			  struct utf8lite_message *msg);
295 
/**
 * Convert a JSON-style backslash (\\) escape into the character it
 * denotes.
 *
 * \param bufptr on entry, a pointer to the byte after the backslash;
 * 	on exit, a pointer to the byte after the escape
 * \param codeptr on exit, receives the decoded UTF-32 character
 */
void utf8lite_decode_escape(const uint8_t **bufptr,
			    int32_t *codeptr);
304 
/**
 * Decode a JSON-style backslash-u (\\u) escape.
 *
 * \param bufptr on entry, a pointer to the byte after the 'u';
 * 	on exit, a pointer to the byte after the escape
 * \param codeptr on exit, receives the decoded UTF-32 character
 */
void utf8lite_decode_uescape(const uint8_t **bufptr,
			     int32_t *codeptr);
313 
314 /**@}*/
315 
316 /**
317  * \defgroup normalize Normalization
318  * @{
319  */
320 
/**
 * Unicode character decomposition mappings, one bit per mapping kind.
 * The compatibility mappings are defined in
 * [UAX #44 Sec. 5.7.3 Character Decomposition Maps]
 * (http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings).
 */
enum utf8lite_decomp_type {
	/** normalization (required for NFD) */
	UTF8LITE_DECOMP_NORMAL   = 0x0000,
	/** font variant */
	UTF8LITE_DECOMP_FONT     = 0x0001,
	/** no-break version of a space or hyphen */
	UTF8LITE_DECOMP_NOBREAK  = 0x0002,
	/** initial presentation form (Arabic) */
	UTF8LITE_DECOMP_INITIAL  = 0x0004,
	/** medial presentation form (Arabic) */
	UTF8LITE_DECOMP_MEDIAL   = 0x0008,
	/** final presentation form (Arabic) */
	UTF8LITE_DECOMP_FINAL    = 0x0010,
	/** isolated presentation form (Arabic) */
	UTF8LITE_DECOMP_ISOLATED = 0x0020,
	/** encircled form */
	UTF8LITE_DECOMP_CIRCLE   = 0x0040,
	/** superscript form */
	UTF8LITE_DECOMP_SUPER    = 0x0080,
	/** subscript form */
	UTF8LITE_DECOMP_SUB      = 0x0100,
	/** vertical layout presentation form */
	UTF8LITE_DECOMP_VERTICAL = 0x0200,
	/** wide (or zenkaku) compatibility */
	UTF8LITE_DECOMP_WIDE     = 0x0400,
	/** narrow (or hankaku) compatibility */
	UTF8LITE_DECOMP_NARROW   = 0x0800,
	/** small variant form (CNS compatibility) */
	UTF8LITE_DECOMP_SMALL    = 0x1000,
	/** CJK squared font variant */
	UTF8LITE_DECOMP_SQUARE   = 0x2000,
	/** vulgar fraction form */
	UTF8LITE_DECOMP_FRACTION = 0x4000,
	/** unspecified compatibility */
	UTF8LITE_DECOMP_COMPAT   = 0x8000,

	/** all decompositions (required for NFKD) */
	UTF8LITE_DECOMP_ALL      = 0xFFFF
};
357 
/**
 * Unicode case folding options. Case folding is defined in *TR44* Sec. 5.6.
 */
enum utf8lite_casefold_type {
	/** no case folding */
	UTF8LITE_CASEFOLD_NONE = 0x00000,
	/** perform case folding */
	UTF8LITE_CASEFOLD_ALL  = 0x10000
};
365 
/**
 * Largest number of codepoints that a single code point can decompose to.
 *
 * Quoting *TR44* Sec. 5.7.3: "Compatibility mappings are guaranteed to be
 * no longer than 18 characters, although most consist of just a few
 * characters."
 */
#define UTF8LITE_UNICODE_DECOMP_MAX 18
373 
/**
 * Map a Unicode character through decomposition and/or case folding and
 * write the result to the given buffer. No more than
 * #UTF8LITE_UNICODE_DECOMP_MAX codepoints are written.
 *
 * \param type a bitmask composed from #utf8lite_decomp_type and
 * 	#utf8lite_casefold_type values specifying the mapping type
 * \param code the input codepoint
 * \param bufptr on entry, a pointer to the output buffer; on exit,
 * 	a pointer past the last output codepoint
 */
void utf8lite_map(int type,
		  int32_t code,
		  int32_t **bufptr);
386 
/**
 * Put an array of Unicode codepoints into normal order by applying the
 * canonical ordering algorithm; see *Unicode* Sec 3.11 and
 * *TR44* Sec. 5.7.4.
 *
 * \param ptr a pointer to the first codepoint
 * \param len the number of codepoints
 */
void utf8lite_order(int32_t *ptr,
		    size_t len);
395 
/**
 * Put an array of canonically-ordered Unicode codepoints into composed
 * form by applying the canonical composition algorithm.
 *
 * \param ptr a pointer to the first codepoint
 * \param lenptr on entry, a pointer to the number of input codepoints;
 * 	on exit, a pointer to the number of composed codepoints
 */
void utf8lite_compose(int32_t *ptr,
		      size_t *lenptr);
405 
406 /**@}*/
407 
408 /**
409  * \defgroup text UTF-8 encoded text
410  * @{
411  */
412 
/** Attribute bit (the most significant bit of a `size_t`) flagging that
 *  the text might contain a backslash (`\`) that should be interpreted as
 *  an escape */
#define UTF8LITE_TEXT_ESC_BIT	((size_t)1 << (CHAR_BIT * sizeof(size_t) - 1))

/** Mask selecting the encoded size of the text, in bytes, from the
 *  attributes; (decoded size) <= (encoded size) */
#define UTF8LITE_TEXT_SIZE_MASK	(UTF8LITE_TEXT_ESC_BIT - 1)

/** Maximum size of encoded text, in bytes. */
#define UTF8LITE_TEXT_SIZE_MAX	UTF8LITE_TEXT_SIZE_MASK

/** Extract the encoded size of the text, in bytes */
#define UTF8LITE_TEXT_SIZE(text) (UTF8LITE_TEXT_SIZE_MASK & (text)->attr)

/** Extract the text attribute bits (everything outside the size mask) */
#define UTF8LITE_TEXT_BITS(text) (~UTF8LITE_TEXT_SIZE_MASK & (text)->attr)

/** Evaluates to 1 if the text might contain a backslash (`\`) that should
 *  be interpreted as an escape code, and to 0 otherwise */
#define UTF8LITE_TEXT_HAS_ESC(text) \
	(((text)->attr & UTF8LITE_TEXT_ESC_BIT) != 0)
433 
/**
 * Flag bits accepted by utf8lite_text_assign().
 */
enum utf8lite_text_flag {
	/** validate the input */
	UTF8LITE_TEXT_UNKNOWN  = 0x0,

	/** do not perform any validation on the input */
	UTF8LITE_TEXT_VALID    = 0x1,

	/** interpret backslash (`\`) as an escape */
	UTF8LITE_TEXT_UNESCAPE = 0x2
};
447 
/**
 * UTF-8 encoded text. The data may contain JSON-compatible backslash (`\`)
 * escape codes that should be interpreted as such. The client assumes all
 * responsibility for managing the memory of the underlying UTF-8 data.
 */
struct utf8lite_text {
	/** pointer to valid UTF-8 data */
	uint8_t *ptr;
	/** text attributes */
	size_t attr;
};
457 
/**
 * Validate the data at the given memory location, then make a text value
 * point to it.
 *
 * \param text the text value
 * \param ptr a pointer to the underlying memory buffer
 * \param size the number of bytes in the underlying memory buffer
 * \param flags #utf8lite_text_flag bitmask specifying input type
 * \param msg an error message buffer, or NULL
 *
 * \returns 0 on success
 */
int utf8lite_text_assign(struct utf8lite_text *text,
			 const uint8_t *ptr,
			 size_t size,
			 int flags,
			 struct utf8lite_message *msg);
473 
/**
 * Initialize a new text object as a copy of another one: allocate space
 * for the encoded characters and copy them over.
 *
 * \param text the object to initialize
 * \param other the object to copy
 *
 * \returns 0 on success, or non-zero on memory allocation failure
 */
int utf8lite_text_init_copy(
	struct utf8lite_text *text,
	const struct utf8lite_text *other);
485 
/**
 * Indicates whether the text definitely decodes to ASCII. This holds only
 * when the text is encoded in ASCII and has no escapes that decode to
 * non-ASCII codepoints.
 *
 * \param text the text
 */
int utf8lite_text_isascii(const struct utf8lite_text *text);
491 
/**
 * Release the resources held by a text object.
 *
 * \param text the text object
 */
void utf8lite_text_destroy(struct utf8lite_text *text);
498 
/**
 * Calculate the hash code of a text.
 *
 * \param text the text
 *
 * \returns the hash code.
 */
size_t utf8lite_text_hash(const struct utf8lite_text *text);
507 
/**
 * Test two texts for bitwise equality, which is a stricter condition
 * than decoding to the same value.
 *
 * \param text1 the first text
 * \param text2 the second text
 *
 * \returns non-zero if the texts are equal, zero otherwise
 */
int utf8lite_text_equals(
	const struct utf8lite_text *text1,
	const struct utf8lite_text *text2);
519 
/**
 * Order two texts relative to each other.
 *
 * \param text1 the first text
 * \param text2 the second text
 *
 * \returns zero if the two encoded texts are identical; a negative value
 * 	if the first value is less than the second; a positive value
 * 	if the first value is greater than the second
 */
int utf8lite_text_compare(
	const struct utf8lite_text *text1,
	const struct utf8lite_text *text2);
532 /**@}*/
533 
534 
535 /**
536  * \defgroup textiter Text iteration
537  * @{
538  */
539 
/**
 * Iterator that walks over the decoded UTF-32 characters of a text.
 */
struct utf8lite_text_iter {
	/** current position in the text buffer */
	const uint8_t *ptr;
	/** end of the text buffer */
	const uint8_t *end;
	/** text attributes */
	size_t text_attr;
	/** current character (UTF-32) */
	int32_t current;
};
549 
/**
 * Set up a text iterator so that it starts at the beginning of a text.
 *
 * \param it the iterator
 * \param text the text
 */
void utf8lite_text_iter_make(
	struct utf8lite_text_iter *it,
	const struct utf8lite_text *text);
558 
/**
 * Move forward to the next character in a text.
 *
 * \param it the text iterator
 *
 * \returns non-zero if the iterator successfully advanced; zero if
 * 	the iterator has passed the end of the text
 */
int utf8lite_text_iter_advance(struct utf8lite_text_iter *it);
568 
/**
 * Move backward to the previous character in a text.
 *
 * \param it the text iterator
 *
 * \returns non-zero if the iterator successfully backed up; zero if
 * 	the iterator has passed the start of the text.
 */
int utf8lite_text_iter_retreat(struct utf8lite_text_iter *it);
578 
/**
 * Move an iterator back to the start of the text.
 *
 * \param it the text iterator
 */
void utf8lite_text_iter_reset(struct utf8lite_text_iter *it);
585 
/**
 * Move an iterator forward to the end of the text.
 *
 * \param it the text iterator
 */
void utf8lite_text_iter_skip(struct utf8lite_text_iter *it);
592 
593 /**@}*/
594 
595 /**
596  * \defgroup textmap Text normalization map
597  * @{
598  */
599 
/**
 * Map descriptor. Every map at least converts the text to composed normal
 * form (NFC). The following bits optionally add the compatibility maps
 * for NFKC normal form and/or further transformations:
 *
 *  + #UTF8LITE_TEXTMAP_CASE: perform case folding; in most languages
 *  	(including English) this maps uppercase characters to their
 *  	lowercase equivalents, but it also performs other normalizations
 *  	such as mapping the German Eszett (&szlig;) to "ss"; see
 *  	_The Unicode Standard_ Sec. 5.18 "Case Mappings" and the
 *  	[Case Mapping FAQ](http://unicode.org/faq/casemap_charprop.html)
 *  	for more information
 *
 *  + #UTF8LITE_TEXTMAP_COMPAT: apply all compatibility maps required for
 *  	[NFKC normal form](http://unicode.org/reports/tr15/#Norm_Forms)
 *
 *  + #UTF8LITE_TEXTMAP_QUOTE: quote fold; replace single quotes and
 *  	Unicode apostrophe with ASCII apostrophe (U+0027)
 *
 *  + #UTF8LITE_TEXTMAP_RMDI: remove default ignorables (DI) such as soft
 *  	hyphens and zero-width spaces, that is, anything with the
 *  	[Default_Ignorable_Code_Point=Yes]
 *  	(http://www.unicode.org/reports/tr44/#Default_Ignorable_Code_Point)
 *  	property
 */
enum utf8lite_textmap_type {
	/** transform to composed normal form */
	UTF8LITE_TEXTMAP_NORMAL = 0x0,
	/** perform case folding */
	UTF8LITE_TEXTMAP_CASE   = 0x1,
	/** apply compatibility mappings */
	UTF8LITE_TEXTMAP_COMPAT = 0x2,
	/** replace apostrophe with `'` */
	UTF8LITE_TEXTMAP_QUOTE  = 0x4,
	/** remove default ignorables */
	UTF8LITE_TEXTMAP_RMDI   = 0x8
};
633 
634 /**
635  * Text normalization map.
636  */
637 struct utf8lite_textmap {
638 	struct utf8lite_text text;/**< result of the most recent call to
639 				    utf8lite_textmap_set() */
640 	int8_t ascii_map[128];	/**< a lookup table for the mappings of ASCII
641 				  characters; -1 indicates deletion */
642 	int32_t *codes;		/**< buffer for intermediate UTF-32 decoding */
643 	size_t size_max;	/**< text size maximum; normalizing a larger
644 				 	text will force a reallocation */
645 	int type;		/**< the map type descriptor, a bit mask
646 				  of #utf8lite_textmap_type values */
647 	int charmap_type;	/**< the unicode map type, a bit mask of
648 				  #utf8lite_decomp_type and
649 				  #utf8lite_casefold_type values */
650 };
651 
/**
 * Set up a new text map of the requested kind.
 *
 * \param map the text map
 * \param type a bitmask of #utf8lite_textmap_type values, specifying
 * 	the map type
 *
 * \returns 0 on success
 */
int utf8lite_textmap_init(struct utf8lite_textmap *map,
			  int type);
662 
/**
 * Free the resources held by a text map.
 *
 * \param map the text map
 */
void utf8lite_textmap_destroy(struct utf8lite_textmap *map);
669 
/**
 * Run input text through a map, setting the map to the corresponding
 * output text.
 *
 * \param map the text map
 * \param text the text
 *
 * \returns 0 on success
 */
int utf8lite_textmap_set(
	struct utf8lite_textmap *map,
	const struct utf8lite_text *text);
680 
681 /**@}*/
682 
683 /**
684  * \defgroup graphscan Character graphemes
685  * @{
686  */
687 
688 /**
689  * Grapheme cluster.
690  */
691 struct utf8lite_graph {
692 	struct utf8lite_text text;	/**< grapheme code sequence */
693 };
694 
695 /**
696  * Grapheme scanner, for iterating over the graphemes in a text. Grapheme
697  * boundaries are determined according to
698  * [UAX #29, Unicode Text Segmentation][uax29],
699  * using the extended grapheme cluster rules.
700  *
701  * [uax29]: http://unicode.org/reports/tr29/
702  */
703 struct utf8lite_graphscan {
704 	struct utf8lite_text_iter iter;	/**< iterator pointed at next code */
705 	const uint8_t *ptr;		/**< next code's start */
706 	int prop;			/**< next code's break property */
707 	struct utf8lite_graph current;	/**< current grapheme */
708 };
709 
/**
 * Set up a grapheme scanner over a text object.
 *
 * \param scan the scanner to initialize
 * \param text the text
 */
void utf8lite_graphscan_make(
	struct utf8lite_graphscan *scan,
	const struct utf8lite_text *text);
718 
/**
 * Move a scanner forward to the next grapheme.
 *
 * \param scan the scanner
 *
 * \returns non-zero on success, zero if at the end of the text
 */
int utf8lite_graphscan_advance(struct utf8lite_graphscan *scan);
727 
/**
 * Move a scanner backward to the previous grapheme.
 *
 * \param scan the scanner
 *
 * \returns non-zero on success, zero if at the start of the text
 */
int utf8lite_graphscan_retreat(struct utf8lite_graphscan *scan);
736 
/**
 * Move a scanner back to the beginning of the text.
 *
 * \param scan the scanner
 */
void utf8lite_graphscan_reset(struct utf8lite_graphscan *scan);
743 
/**
 * Skip a scanner to the end of the text.
 *
 * \param scan the scanner
 */
void utf8lite_graphscan_skip(struct utf8lite_graphscan *scan);
750 
751 /**@}*/
752 
753 /**
754  * \defgroup wordscan Word boundaries
755  * @{
756  */
757 
758 /**
759  * A word scanner, for iterating over the words in a text. Word boundaries
760  * are determined according to [UAX #29, Unicode Text Segmentation][uax29].
761  * You can test the word boundary rules in an interactive
762  * [online demo][demo].
763  *
764  * [demo]: http://unicode.org/cldr/utility/breaks.jsp
765  * [uax29]: http://unicode.org/reports/tr29/
766  */
767 struct utf8lite_wordscan {
768 	int32_t code;		/**< next code point */
769 	size_t attr;		/**< next code's attributes */
770 	int prop;		/**< next code's word break property */
771 	const uint8_t *ptr;	/**< next code's start */
772 
773 	struct utf8lite_text_iter iter;	/**< an iterator over the input,
774 				  positioned past next code */
775 	int iter_prop;		/**< iterator code's word break property */
776 	const uint8_t *iter_ptr;/**< iterator code's start */
777 
778 	struct utf8lite_text current;	/**< the current word */
779 };
780 
/**
 * Set up a word scanner over a text object.
 *
 * \param scan the scanner to initialize
 * \param text the text
 */
void utf8lite_wordscan_make(
	struct utf8lite_wordscan *scan,
	const struct utf8lite_text *text);
789 
/**
 * Move a scanner forward to the next word.
 *
 * \param scan the scanner
 *
 * \returns non-zero on success, zero if at the end of the text
 */
int utf8lite_wordscan_advance(struct utf8lite_wordscan *scan);
798 
/**
 * Move a scanner back to the beginning of the text.
 *
 * \param scan the scanner
 */
void utf8lite_wordscan_reset(struct utf8lite_wordscan *scan);
805 
806 /**@}*/
807 
808 /**
809  * \defgroup render Text rendering
810  * @{
811  */
812 
/**
 * Render escaping type: bits naming the classes of code points that
 * require special handling.
 */
enum utf8lite_escape_type {
	/** no special escaping */
	UTF8LITE_ESCAPE_NONE     = 0x00,
	/** control and other codes */
	UTF8LITE_ESCAPE_CONTROL  = 0x01,
	/** ASCII double quote */
	UTF8LITE_ESCAPE_DQUOTE   = 0x02,
	/** ASCII single quote */
	UTF8LITE_ESCAPE_SQUOTE   = 0x04,
	/** extended-plane UTF-8 */
	UTF8LITE_ESCAPE_EXTENDED = 0x08,
	/** non-ASCII UTF-8 */
	UTF8LITE_ESCAPE_UTF8     = 0x10
};
825 
/**
 * Render encoding type. The bit values start above those used by
 * #utf8lite_escape_type.
 */
enum utf8lite_encode_type {
	/** C-compatible escapes */
	UTF8LITE_ENCODE_C         = 0x000,
	/** JSON-compatible escapes */
	UTF8LITE_ENCODE_JSON      = 0x020,
	/** put ZWSP after emoji */
	UTF8LITE_ENCODE_EMOJIZWSP = 0x040,
	/** remove default ignorables */
	UTF8LITE_ENCODE_RMDI      = 0x080,
	/** assume that ambiguous-width characters are wide */
	UTF8LITE_ENCODE_AMBIGWIDE = 0x100
};
837 
/**
 * Measure the width of a grapheme under the given render settings. When
 * the grapheme contains a non-escaped control character, the width is
 * reported as -1.
 *
 * NOTE(review): the description of \p widthptr inherited from the earlier
 * documentation says 0 (not -1) is stored for a non-escaped control;
 * confirm which value the implementation actually uses.
 *
 * \param g the grapheme
 * \param flags a bitmask of #utf8lite_escape_type and #utf8lite_encode_type
 * 	values specifying the encoding settings
 * \param widthptr if non-NULL, a pointer to store the width on exit
 * 	(0 if the grapheme is empty or a non-escaped control)
 *
 * \returns 0 on success
 */
int utf8lite_graph_measure(const struct utf8lite_graph *g,
			   int flags,
			   int *widthptr);
853 
/**
 * Renderer state, used for printing objects as strings.
 */
struct utf8lite_render {
	/** the rendered string (null terminated) */
	char *string;
	/** the length of the rendered string, not including the null
	 *  terminator */
	int length;
	/** the maximum capacity of the rendered string before requiring
	 *  reallocation, not including the null terminator */
	int length_max;
	/** the flags, a bitmask of #utf8lite_escape_type and
	 *  #utf8lite_encode_type values, specifying escaping behavior */
	int flags;

	/** the tab string, for indenting */
	const char *tab;
	/** the length in bytes of the tab string, not including the null
	 *  terminator */
	int tab_length;

	/** the newline string, for advancing to the next line */
	const char *newline;
	/** the length in bytes of the newline string, not including the
	 *  null terminator */
	int newline_length;

	/** the escape style graphic parameters, for styling backslash
	 *  escapes */
	const char *style_open;
	/** the escape style graphic parameters, for restoring state after
	 *  styling a backslash escape */
	const char *style_close;
	/** length in bytes of the style_open string, not including the
	 *  null terminator */
	int style_open_length;
	/** length in bytes of the style_close string, not including the
	 *  null terminator */
	int style_close_length;

	/** the current indent level */
	int indent;
	/** whether to indent before the next character */
	int needs_indent;
	/** the code for the last error that occurred, or zero if none */
	int error;
};
894 
/**
 * Set up a new render object.
 *
 * \param r the render object
 * \param flags a bitmask of #utf8lite_escape_type and #utf8lite_encode_type
 * 	values specifying escaping behavior
 *
 * \returns 0 on success
 */
int utf8lite_render_init(struct utf8lite_render *r,
			 int flags);
905 
/**
 * Free the resources held by a render object.
 *
 * \param r the render object
 */
void utf8lite_render_destroy(struct utf8lite_render *r);
912 
/**
 * Empty the rendered string and return the indent level to 0. The escape
 * flags, the tab string, and the newline string keep their current
 * values.
 *
 * \param r the render object
 */
void utf8lite_render_clear(struct utf8lite_render *r);
921 
/**
 * Change the escaping behavior.
 *
 * \param r the render object
 * \param flags a bit mask of #utf8lite_escape_type values
 *
 * \returns 0 on success
 */
int utf8lite_render_set_flags(struct utf8lite_render *r,
			      int flags);
931 
/**
 * Change the tab string. The client must keep the passed-in tab string
 * alive (not free it) until the render object is destroyed or a new tab
 * string gets set.
 *
 * \param r the render object
 * \param tab the tab string (null terminated)
 *
 * \returns 0 on success
 */
int utf8lite_render_set_tab(struct utf8lite_render *r,
			    const char *tab);
943 
/**
 * Change the newline string. The client must keep the passed-in newline
 * string alive (not free it) until the render object is destroyed or a
 * new newline string gets set.
 *
 * \param r the render object
 * \param newline the newline string (null terminated)
 *
 * \returns 0 on success
 */
int utf8lite_render_set_newline(struct utf8lite_render *r,
				const char *newline);
955 
956 /**
957  * Set the escape style strings. The client must not free the passed-in
958  * strings until either the render object is destroyed or new style
959  * strings get set.
960  *
961  * \param r the render object
962  * \param open the string to render before a backslash escape (null terminated)
963  * \param close the string to render after a backslash escape (null terminated)
964  *
965  * \returns 0 on success
966  */
967 int utf8lite_render_set_style(struct utf8lite_render *r,
968 			      const char *open, const char *close);
969 
970 /**
971  * Increase or decrease the indent level.
972  *
973  * \param r the render object
974  * \param nlevel the number of levels to add to or subtract from the indent
975  *
976  * \returns 0 on success
977  */
978 int utf8lite_render_indent(struct utf8lite_render *r, int nlevel);
979 
980 /**
981  * Add newlines to the rendered output.
982  *
983  * \param r the render object
984  * \param nline the number of newlines to add
985  *
986  * \returns 0 on success
987  */
988 int utf8lite_render_newlines(struct utf8lite_render *r, int nline);
989 
990 /**
991  * Render a character grapheme. If any render escape flags are set, the
992  * grapheme is filtered through the appropriate escaping and encoding.
993  *
994  * \param r the render object
995  * \param g the grapheme to render
996  *
997  * \returns 0 on success
998  */
999 int utf8lite_render_graph(struct utf8lite_render *r,
1000 			  const struct utf8lite_graph *g);
1001 
1002 /**
1003  * Render a single character, treating it as a grapheme cluster. If any
1004  * render escape flags are set, the character is filtered through the
1005  * appropriate escaping and encoding.
1006  *
1007  * \param r the render object
1008  * \param ch the character to render
1009  *
1010  * \returns 0 on success
1011  */
1012 int utf8lite_render_char(struct utf8lite_render *r, int32_t ch);
1013 
1014 /**
1015  * Render multiple copies of a character, treating each as a grapheme
1016  * cluster.
1017  *
1018  * \param r the render object
1019  * \param ch the character
1020  * \param nchar the number of copies to render
1021  *
1022  * \returns 0 on success
1023  */
1024 int utf8lite_render_chars(struct utf8lite_render *r, int32_t ch, int nchar);
1025 
1026 /**
1027  * Render a string. If any render escape flags are set, all character
1028  * graphemes are filtered through the appropriate escaping.
1029  *
1030  * \param r the render object
1031  * \param str the string, valid UTF-8 (presumably null terminated, since
 * 	no length is passed; confirm against the implementation)
1032  *
1033  * \returns 0 on success
1034  */
1035 int utf8lite_render_string(struct utf8lite_render *r, const char *str);
1036 
1037 /**
1038  * Render formatted text. If any render escape flags are set, filter
1039  * all character graphemes through the appropriate escaping.
1040  *
 * On non-Windows builds the declaration carries the printf format
 * attribute, naming argument 2 as the format string and argument 3 as
 * the first format argument; on Windows the attribute is omitted.
 *
1041  * \param r the render object
1042  * \param format the printf-style format string
 * \param ... the format arguments
 *
 * \returns presumably 0 on success, like the other render functions
 * 	(NOTE(review): the return value was not documented; confirm
 * 	against the implementation)
1043  */
1044 int utf8lite_render_printf(struct utf8lite_render *r, const char *format, ...)
1045 #if defined(_WIN32) || defined(_WIN64)
1046 	;
1047 #else
1048 	__attribute__ ((format (printf, 2, 3)));
1049 #endif
1050 
1051 /**
1052  * Render a text object. If any render escape flags are set, all character
1053  * graphemes are filtered through the appropriate escaping.
1054  *
1055  * \param r the render object
1056  * \param text the text object to render
1057  *
1058  * \returns 0 on success
1059  */
1060 int utf8lite_render_text(struct utf8lite_render *r,
1061 			 const struct utf8lite_text *text);
1062 
1063 /**
1064  * Append a sequence of raw bytes to the render buffer, ignoring any
1065  * special handling specified by the render flags.
1066  *
1067  * \param r the render object
1068  * \param bytes the byte array
1069  * \param size the number of bytes in the array
1070  *
1071  * \returns 0 on success
1072  */
1073 int utf8lite_render_raw(struct utf8lite_render *r, const char *bytes,
1074 			size_t size);
1075 
1076 /**@}*/
1077 
1078 #endif /* UTF8LITE_H */
1079