1 #ifndef UNICHAR_H
2 #define UNICHAR_H
3 
/* Character used to replace invalid input. */
#define UNICODE_REPLACEMENT_CHAR 0xfffd
/* The replacement character encoded as a UTF-8 string literal. */
#define UNICODE_REPLACEMENT_CHAR_UTF8 "\xEF\xBF\xBD"
/* Length of UNICODE_REPLACEMENT_CHAR_UTF8 in bytes, not counting the
   terminating NUL. The expansion deliberately has no trailing semicolon so
   that the macro can be used inside expressions and argument lists. */
#define UNICODE_REPLACEMENT_CHAR_UTF8_LEN \
	(sizeof(UNICODE_REPLACEMENT_CHAR_UTF8) - 1)
/* Horizontal ellipsis character ('...') */
#define UNICODE_HORIZONTAL_ELLIPSIS_CHAR 0x2026
/* The horizontal ellipsis encoded as a UTF-8 string literal. */
#define UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8 "\xE2\x80\xA6"
/* Length of UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8 in bytes, not counting the
   terminating NUL. The expansion deliberately has no trailing semicolon so
   that the macro can be used inside expressions and argument lists. */
#define UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8_LEN \
	(sizeof(UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8) - 1)
14 
15 /* Characters >= base require surrogates */
16 #define UTF16_SURROGATE_BASE 0x10000
17 
18 #define UTF16_SURROGATE_SHIFT 10
19 #define UTF16_SURROGATE_MASK 0x03ff
20 #define UTF16_SURROGATE_HIGH_FIRST 0xd800
21 #define UTF16_SURROGATE_HIGH_LAST 0xdbff
22 #define UTF16_SURROGATE_HIGH_MAX 0xdfff
23 #define UTF16_SURROGATE_LOW_FIRST 0xdc00
24 #define UTF16_SURROGATE_LOW_LAST 0xdfff
25 
/* High half of the surrogate pair for chr: the offset of chr from
   UTF16_SURROGATE_BASE, shifted right by UTF16_SURROGATE_SHIFT bits and
   added to UTF16_SURROGATE_HIGH_FIRST. chr is evaluated once. */
#define UTF16_SURROGATE_HIGH(chr) \
	((((chr) - UTF16_SURROGATE_BASE) >> UTF16_SURROGATE_SHIFT) + \
	 UTF16_SURROGATE_HIGH_FIRST)
/* Low half of the surrogate pair for chr: the offset of chr from
   UTF16_SURROGATE_BASE, masked with UTF16_SURROGATE_MASK and added to
   UTF16_SURROGATE_LOW_FIRST. chr is evaluated once. */
#define UTF16_SURROGATE_LOW(chr) \
	((((chr) - UTF16_SURROGATE_BASE) & UTF16_SURROGATE_MASK) + \
	 UTF16_SURROGATE_LOW_FIRST)
32 
/* Returns TRUE if given byte is ASCII character or the beginning of a
   multibyte UTF-8 sequence, i.e. anything except a continuation byte
   (bit pattern 10xxxxxx). The argument is evaluated exactly once, so
   passing an expression with side effects (e.g. *p++) is safe. */
#define UTF8_IS_START_SEQ(b) \
	(((b) & 0xC0) != 0x80)
37 
38 #define UTF8_REPLACEMENT_CHAR_LEN 3
39 
40 #define UNICHAR_T_MAX 0x10ffff
41 
/* TRUE if chr is a UTF-16 high surrogate, i.e. it shares all bits above the
   low UTF16_SURROGATE_SHIFT ones with UTF16_SURROGATE_HIGH_FIRST. The whole
   value is compared, so out-of-range input such as 0x0100d800 is not
   mistaken for a surrogate. chr is evaluated once and is expected to be an
   unsigned value such as unichar_t. */
#define UTF16_VALID_HIGH_SURROGATE(chr) \
	(((chr) >> UTF16_SURROGATE_SHIFT) == \
	 (UTF16_SURROGATE_HIGH_FIRST >> UTF16_SURROGATE_SHIFT))
/* TRUE if chr is a UTF-16 low surrogate; same comparison as above against
   UTF16_SURROGATE_LOW_FIRST. */
#define UTF16_VALID_LOW_SURROGATE(chr) \
	(((chr) >> UTF16_SURROGATE_SHIFT) == \
	 (UTF16_SURROGATE_LOW_FIRST >> UTF16_SURROGATE_SHIFT))
44 
45 typedef uint32_t unichar_t;
46 ARRAY_DEFINE_TYPE(unichars, unichar_t);
47 
48 /* Normalize UTF8 input and append it to output buffer.
49    Returns 0 if ok, -1 if input was invalid. Even if input was invalid,
50    as much as possible should be added to output. */
51 typedef int normalizer_func_t(const void *input, size_t size,
52 			      buffer_t *output);
53 
54 extern const unsigned char utf8_replacement_char[UTF8_REPLACEMENT_CHAR_LEN];
55 extern const uint8_t *const uni_utf8_non1_bytes;
56 
uni_is_valid_ucs4(unichar_t chr)57 static inline bool ATTR_PURE uni_is_valid_ucs4(unichar_t chr)
58 {
59 	return (!UTF16_VALID_HIGH_SURROGATE(chr) &&
60 		!UTF16_VALID_LOW_SURROGATE(chr) &&
61 		chr <= UNICHAR_T_MAX);
62 };
63 
64 /* Returns number of characters in a NUL-terminated unicode string */
65 unsigned int uni_strlen(const unichar_t *str) ATTR_PURE;
66 /* Translates UTF-8 input to UCS-4 output. Returns 0 if ok, -1 if input was
67    invalid */
68 int uni_utf8_to_ucs4(const char *input, ARRAY_TYPE(unichars) *output);
69 int uni_utf8_to_ucs4_n(const unsigned char *input, size_t size,
70 		       ARRAY_TYPE(unichars) *output);
71 /* Translates UCS-4 input to UTF-8 output. */
72 void uni_ucs4_to_utf8(const unichar_t *input, size_t len, buffer_t *output);
73 void uni_ucs4_to_utf8_c(unichar_t chr, buffer_t *output);
74 
75 /* Returns char_bytes (>0) if *chr_r is set, 0 for incomplete trailing character,
76    -1 for invalid input. */
77 int uni_utf8_get_char(const char *input, unichar_t *chr_r);
78 int uni_utf8_get_char_n(const void *input, size_t max_len, unichar_t *chr_r);
79 /* Returns number of characters in UTF-8 string. */
80 unsigned int uni_utf8_strlen(const char *input) ATTR_PURE;
81 /* Returns number of characters in UTF-8 input of specified size. */
82 unsigned int uni_utf8_strlen_n(const void *input, size_t size) ATTR_PURE;
83 /* Same as uni_utf8_strlen_n(), but if input ends with a partial UTF-8
84    character, don't include it in the return value and set partial_pos_r to
85    where the character begins. Otherwise partial_pos_r is set to the end
86    of the input. */
87 unsigned int uni_utf8_partial_strlen_n(const void *input, size_t size,
88 				       size_t *partial_pos_r);
89 
90 /* Returns the number of bytes belonging to this UTF-8 character. The given
91    parameter is the first byte of the UTF-8 sequence. Invalid input is
92    returned with length 1. */
93 static inline unsigned int ATTR_CONST
uni_utf8_char_bytes(unsigned char chr)94 uni_utf8_char_bytes(unsigned char chr)
95 {
96 	/* 0x00 .. 0x7f are ASCII. 0x80 .. 0xC1 are invalid. */
97 	if (chr < (192 + 2))
98 		return 1;
99 	return uni_utf8_non1_bytes[chr - (192 + 2)];
100 }
101 
102 /* Return given character in titlecase. */
103 unichar_t uni_ucs4_to_titlecase(unichar_t chr) ATTR_CONST;
104 
105 /* Convert UTF-8 input to titlecase and decompose the titlecase characters to
106    output buffer. Returns 0 if ok, -1 if input was invalid. This generates
107    output that's compatible with i;unicode-casemap comparator. Invalid input
108    is replaced with unicode replacement character (0xfffd). */
109 int uni_utf8_to_decomposed_titlecase(const void *input, size_t size,
110 				     buffer_t *output);
111 
112 /* If input contains only valid UTF-8 characters, return TRUE without updating
113    buf. If input contains invalid UTF-8 characters, replace them with unicode
114    replacement character (0xfffd), write the output to buf and return FALSE. */
115 bool uni_utf8_get_valid_data(const unsigned char *input, size_t size,
116 			     buffer_t *buf) ATTR_WARN_UNUSED_RESULT;
117 /* Returns TRUE if string is valid UTF-8 input. */
118 bool uni_utf8_str_is_valid(const char *str);
119 /* Returns TRUE if data contains only valid UTF-8 input. */
120 bool uni_utf8_data_is_valid(const unsigned char *data, size_t size);
121 /* Returns the size of the data when truncated to be less than or equal to
122    max_new_size, making sure UTF-8 character boundaries are respected. This only
123    looks at the last character at the new boundary. */
124 size_t uni_utf8_data_truncate(const unsigned char *data, size_t old_size,
125 			      size_t max_new_size);
126 
127 /* surrogate handling */
/* Combine a UTF-16 surrogate pair into the character it encodes. Both
   halves are asserted to be valid surrogates of their kind. */
static inline unichar_t uni_join_surrogate(unichar_t high, unichar_t low)
{
	unichar_t chr;

	i_assert(UTF16_VALID_HIGH_SURROGATE(high) &&
		 UTF16_VALID_LOW_SURROGATE(low));

	/* the high surrogate's offset supplies the bits above the low 10 */
	chr = (high - UTF16_SURROGATE_HIGH_FIRST) << 10;
	/* the low surrogate's offset supplies the low bits */
	chr += low - UTF16_SURROGATE_LOW_FIRST;
	/* surrogate pairs encode characters starting from the base */
	chr += UTF16_SURROGATE_BASE;
	return chr;
}
137 
/* Split chr into a UTF-16 surrogate pair, stored in *high_r and *low_r.
   chr is asserted to be within UTF16_SURROGATE_BASE .. UNICHAR_T_MAX and
   both output pointers are asserted to be non-NULL. */
static inline void uni_split_surrogate(unichar_t chr, unichar_t *high_r, unichar_t *low_r)
{
	unichar_t lead, trail;

	i_assert(chr >= UTF16_SURROGATE_BASE && chr <= UNICHAR_T_MAX);
	i_assert(high_r != NULL && low_r != NULL);

	lead = UTF16_SURROGATE_HIGH(chr);
	trail = UTF16_SURROGATE_LOW(chr);

	*high_r = lead;
	*low_r = trail;
}
145 #endif
146