1 #ifndef utf8_hh_INCLUDED
2 #define utf8_hh_INCLUDED
3
4 #include "assert.hh"
5 #include "unicode.hh"
6 #include "units.hh"
7 #include "optional.hh"
8
9 #include <cstddef>
10
11 namespace Kakoune
12 {
13
14 namespace utf8
15 {
16
// returns the element pointed by it, converted to char, and moves it
// to the next element. No bound check is done here, the caller has to
// make sure it can be dereferenced.
template<typename Iterator>
[[gnu::always_inline]]
inline char read(Iterator& it) noexcept
{
    char c = *it;
    ++it;
    return c;
}
20
// returns true if c is the first byte of a (either single or
// multibyte) character, that is if c is not a continuation byte
// (a byte whose two top bits are 10)
[[gnu::always_inline]]
inline bool is_character_start(char c) noexcept
{
    return (c & 0xC0) != 0x80;
}
28
29 namespace InvalidPolicy
30 {
31
32 struct Assert
33 {
operator ()Kakoune::utf8::InvalidPolicy::Assert34 Codepoint operator()(Codepoint cp) const { kak_assert(false); return cp; }
35 };
36
37 struct Pass
38 {
operator ()Kakoune::utf8::InvalidPolicy::Pass39 Codepoint operator()(Codepoint cp) const noexcept { return cp; }
40 };
41
42 }
43
44 // returns the codepoint of the character whose first byte
45 // is pointed by it
46 template<typename InvalidPolicy = utf8::InvalidPolicy::Pass,
47 typename Iterator, typename Sentinel>
read_codepoint(Iterator & it,const Sentinel & end)48 Codepoint read_codepoint(Iterator& it, const Sentinel& end)
49 noexcept(noexcept(InvalidPolicy{}(0)))
50 {
51 if (it == end)
52 return InvalidPolicy{}(-1);
53 // According to rfc3629, UTF-8 allows only up to 4 bytes.
54 // (21 bits codepoint)
55 unsigned char byte = read(it);
56 if ((byte & 0x80) == 0) // 0xxxxxxx
57 return byte;
58
59 if (it == end)
60 return InvalidPolicy{}(byte);
61
62 if ((byte & 0xE0) == 0xC0) // 110xxxxx
63 return ((byte & 0x1F) << 6) | (read(it) & 0x3F);
64
65 if ((byte & 0xF0) == 0xE0) // 1110xxxx
66 {
67 Codepoint cp = ((byte & 0x0F) << 12) | ((read(it) & 0x3F) << 6);
68 if (it == end)
69 return InvalidPolicy{}(cp);
70 return cp | (read(it) & 0x3F);
71 }
72
73 if ((byte & 0xF8) == 0xF0) // 11110xxx
74 {
75 Codepoint cp = ((byte & 0x0F) << 18) | ((read(it) & 0x3F) << 12);
76 if (it == end)
77 return InvalidPolicy{}(cp);
78 cp |= (read(it) & 0x3F) << 6;
79 if (it == end)
80 return InvalidPolicy{}(cp);
81 return cp | (read(it) & 0x3F);
82 }
83 return InvalidPolicy{}(byte);
84 }
85
86 template<typename InvalidPolicy = utf8::InvalidPolicy::Pass,
87 typename Iterator, typename Sentinel>
codepoint(Iterator it,const Sentinel & end)88 Codepoint codepoint(Iterator it, const Sentinel& end)
89 noexcept(noexcept(read_codepoint<InvalidPolicy>(it, end)))
90 {
91 return read_codepoint<InvalidPolicy>(it, end);
92 }
93
94 template<typename InvalidPolicy = utf8::InvalidPolicy::Pass>
codepoint_size(char byte)95 ByteCount codepoint_size(char byte)
96 noexcept(noexcept(InvalidPolicy{}(0)))
97 {
98 if ((byte & 0x80) == 0) // 0xxxxxxx
99 return 1;
100 else if ((byte & 0xE0) == 0xC0) // 110xxxxx
101 return 2;
102 else if ((byte & 0xF0) == 0xE0) // 1110xxxx
103 return 3;
104 else if ((byte & 0xF8) == 0xF0) // 11110xxx
105 return 4;
106 else
107 {
108 InvalidPolicy{}(byte);
109 return 1;
110 }
111 }
112
113 template<typename InvalidPolicy = utf8::InvalidPolicy::Pass>
codepoint_size(Codepoint cp)114 ByteCount codepoint_size(Codepoint cp)
115 noexcept(noexcept(InvalidPolicy{}(0)))
116 {
117 if (cp <= 0x7F)
118 return 1;
119 else if (cp <= 0x7FF)
120 return 2;
121 else if (cp <= 0xFFFF)
122 return 3;
123 else if (cp <= 0x10FFFF)
124 return 4;
125 else
126 {
127 InvalidPolicy{}(cp);
128 return 0;
129 }
130 }
131
// moves it to the first byte of the next character, or to end if
// there is none. Does nothing if it is already at end.
template<typename Iterator, typename Sentinel>
void to_next(Iterator& it, const Sentinel& end) noexcept
{
    // leave the current byte, whatever it is
    if (it != end)
        ++it;
    // then skip the continuation bytes that follow
    while (it != end and not is_character_start(*it))
        ++it;
}
140
// returns an iterator to next character first byte, or an iterator
// equal to end if there is no next character
template<typename Iterator, typename Sentinel>
Iterator next(Iterator it, const Sentinel& end) noexcept
{
    to_next(it, end);
    return it;
}
148
// returns its parameter if it points to a character first byte,
// or else returns next character first byte (or an iterator equal to
// end if only continuation bytes remain)
template<typename Iterator, typename Sentinel>
Iterator finish(Iterator it, const Sentinel& end) noexcept
{
    // 10xxxxxx is a continuation byte
    while (it != end and (*(it) & 0xC0) == 0x80)
        ++it;
    return it;
}
158
// moves it to the first byte of the previous character, never going
// before begin. Does nothing if it is already at begin.
template<typename Iterator, typename Sentinel>
void to_previous(Iterator& it, const Sentinel& begin) noexcept
{
    if (it != begin)
        --it;
    // walk back over continuation bytes until a first byte is found
    while (it != begin and not is_character_start(*it))
        --it;
}
// returns an iterator to the previous character first byte, or an
// iterator equal to begin if it cannot go further back
template<typename Iterator, typename Sentinel>
Iterator previous(Iterator it, const Sentinel& begin) noexcept
{
    to_previous(it, begin);
    return it;
}
174
175 // returns an iterator pointing to the first byte of the
176 // dth character after (or before if d < 0) the character
177 // pointed by it
178 template<typename Iterator, typename Sentinel>
179 Iterator advance(Iterator it, const Sentinel& end, CharCount d) noexcept
180 {
181 if (it == end)
182 return it;
183
184 if (d < 0)
185 {
186 while (it != end and d++ != 0)
187 to_previous(it, end);
188 }
189 else if (d > 0)
190 {
191 while (it != end and d-- != 0)
192 to_next(it, end);
193 }
194 return it;
195 }
196
197 // returns an iterator pointing to the first byte of the
198 // character at the dth column after (or before if d < 0)
199 // the character pointed by it
200 template<typename Iterator, typename Sentinel>
201 Iterator advance(Iterator it, const Sentinel& end, ColumnCount d) noexcept
202 {
203 if (it == end)
204 return it;
205
206 if (d < 0)
207 {
208 while (it != end and d < 0)
209 {
210 auto cur = it;
211 to_previous(it, end);
212 d += codepoint_width(codepoint(it, cur));
213 }
214 }
215 else if (d > 0)
216 {
217 auto begin = it;
218 while (it != end and d > 0)
219 {
220 d -= codepoint_width(read_codepoint(it, end));
221 if (it != end and d < 0)
222 to_previous(it, begin);
223 }
224 }
225 return it;
226 }
227
228 // returns the character count between begin and end
229 template<typename Iterator, typename Sentinel>
distance(Iterator begin,const Sentinel & end)230 CharCount distance(Iterator begin, const Sentinel& end) noexcept
231 {
232 CharCount dist = 0;
233
234 while (begin != end)
235 {
236 if (is_character_start(read(begin)))
237 ++dist;
238 }
239 return dist;
240 }
241
242 // returns the column count between begin and end
243 template<typename Iterator, typename Sentinel>
column_distance(Iterator begin,const Sentinel & end)244 ColumnCount column_distance(Iterator begin, const Sentinel& end) noexcept
245 {
246 ColumnCount dist = 0;
247
248 while (begin != end)
249 dist += codepoint_width(read_codepoint(begin, end));
250 return dist;
251 }
252
// returns an iterator to the first byte of the character it is into.
// The search goes backward and stops at begin at the latest; it is
// returned unchanged if it already points to a character first byte.
template<typename Iterator, typename Sentinel>
Iterator character_start(Iterator it, const Sentinel& begin) noexcept
{
    while (it != begin and not is_character_start(*it))
        --it;
    return it;
}
261
262 // returns an optional iterator to the first byte of the previous character
263 // or no value if it is at begin
264 template<typename Iterator, typename Sentinel>
prev_codepoint(Iterator it,const Sentinel & begin)265 static Optional<Codepoint> prev_codepoint(Iterator it, const Sentinel& begin) noexcept
266 {
267 if (it <= begin)
268 return {};
269 return codepoint(character_start(it -1, begin), it);
270 }
271
272
273 template<typename OutputIterator, typename InvalidPolicy = utf8::InvalidPolicy::Pass>
dump(OutputIterator && it,Codepoint cp)274 void dump(OutputIterator&& it, Codepoint cp)
275 {
276 if (cp <= 0x7F)
277 *it++ = cp;
278 else if (cp <= 0x7FF)
279 {
280 *it++ = 0xC0 | (cp >> 6);
281 *it++ = 0x80 | (cp & 0x3F);
282 }
283 else if (cp <= 0xFFFF)
284 {
285 *it++ = 0xE0 | (cp >> 12);
286 *it++ = 0x80 | ((cp >> 6) & 0x3F);
287 *it++ = 0x80 | (cp & 0x3F);
288 }
289 else if (cp <= 0x10FFFF)
290 {
291 *it++ = 0xF0 | (cp >> 18);
292 *it++ = 0x80 | ((cp >> 12) & 0x3F);
293 *it++ = 0x80 | ((cp >> 6) & 0x3F);
294 *it++ = 0x80 | (cp & 0x3F);
295 }
296 else
297 InvalidPolicy{}(cp);
298 }
299
300 }
301
302 }
303
304 #endif // utf8_hh_INCLUDED
305