1 #ifndef utf8_hh_INCLUDED
2 #define utf8_hh_INCLUDED
3 
4 #include "assert.hh"
5 #include "unicode.hh"
6 #include "units.hh"
7 #include "optional.hh"
8 
9 #include <cstddef>
10 
11 namespace Kakoune
12 {
13 
14 namespace utf8
15 {
16 
// Returns the byte currently pointed to by it, then moves it one
// position forward.
template<typename Iterator>
[[gnu::always_inline]]
inline char read(Iterator& it) noexcept
{
    const char byte = *it;
    ++it;
    return byte;
}
20 
// Tells whether c can begin a character: every byte qualifies except
// a UTF-8 continuation byte (bit pattern 10xxxxxx), so single byte
// characters and multibyte lead bytes both yield true.
[[gnu::always_inline]]
inline bool is_character_start(char c) noexcept
{
    // keep only the two most significant bits of the byte
    const unsigned char top_bits = static_cast<unsigned char>(c) >> 6;
    return top_bits != 0x2;
}
28 
29 namespace InvalidPolicy
30 {
31 
32 struct Assert
33 {
operator ()Kakoune::utf8::InvalidPolicy::Assert34     Codepoint operator()(Codepoint cp) const { kak_assert(false); return cp; }
35 };
36 
37 struct Pass
38 {
operator ()Kakoune::utf8::InvalidPolicy::Pass39     Codepoint operator()(Codepoint cp) const noexcept { return cp; }
40 };
41 
42 }
43 
44 // returns the codepoint of the character whose first byte
45 // is pointed by it
46 template<typename InvalidPolicy = utf8::InvalidPolicy::Pass,
47          typename Iterator, typename Sentinel>
read_codepoint(Iterator & it,const Sentinel & end)48 Codepoint read_codepoint(Iterator& it, const Sentinel& end)
49     noexcept(noexcept(InvalidPolicy{}(0)))
50 {
51     if (it == end)
52         return InvalidPolicy{}(-1);
53     // According to rfc3629, UTF-8 allows only up to 4 bytes.
54     // (21 bits codepoint)
55     unsigned char byte = read(it);
56     if ((byte & 0x80) == 0) // 0xxxxxxx
57         return byte;
58 
59     if (it == end)
60         return InvalidPolicy{}(byte);
61 
62     if ((byte & 0xE0) == 0xC0) // 110xxxxx
63         return ((byte & 0x1F) << 6) | (read(it) & 0x3F);
64 
65     if ((byte & 0xF0) == 0xE0) // 1110xxxx
66     {
67         Codepoint cp = ((byte & 0x0F) << 12) | ((read(it) & 0x3F) << 6);
68         if (it == end)
69             return InvalidPolicy{}(cp);
70         return cp | (read(it) & 0x3F);
71     }
72 
73     if ((byte & 0xF8) == 0xF0) // 11110xxx
74     {
75         Codepoint cp = ((byte & 0x0F) << 18) | ((read(it) & 0x3F) << 12);
76         if (it == end)
77             return InvalidPolicy{}(cp);
78         cp |= (read(it) & 0x3F) << 6;
79         if (it == end)
80             return InvalidPolicy{}(cp);
81         return cp | (read(it) & 0x3F);
82     }
83     return InvalidPolicy{}(byte);
84 }
85 
86 template<typename InvalidPolicy = utf8::InvalidPolicy::Pass,
87          typename Iterator, typename Sentinel>
codepoint(Iterator it,const Sentinel & end)88 Codepoint codepoint(Iterator it, const Sentinel& end)
89     noexcept(noexcept(read_codepoint<InvalidPolicy>(it, end)))
90 {
91     return read_codepoint<InvalidPolicy>(it, end);
92 }
93 
94 template<typename InvalidPolicy = utf8::InvalidPolicy::Pass>
codepoint_size(char byte)95 ByteCount codepoint_size(char byte)
96     noexcept(noexcept(InvalidPolicy{}(0)))
97 {
98     if ((byte & 0x80) == 0) // 0xxxxxxx
99         return 1;
100     else if ((byte & 0xE0) == 0xC0) // 110xxxxx
101         return 2;
102     else if ((byte & 0xF0) == 0xE0) // 1110xxxx
103         return 3;
104     else if ((byte & 0xF8) == 0xF0) // 11110xxx
105         return 4;
106     else
107     {
108         InvalidPolicy{}(byte);
109         return 1;
110     }
111 }
112 
113 template<typename InvalidPolicy = utf8::InvalidPolicy::Pass>
codepoint_size(Codepoint cp)114 ByteCount codepoint_size(Codepoint cp)
115     noexcept(noexcept(InvalidPolicy{}(0)))
116 {
117     if (cp <= 0x7F)
118         return 1;
119     else if (cp <= 0x7FF)
120         return 2;
121     else if (cp <= 0xFFFF)
122         return 3;
123     else if (cp <= 0x10FFFF)
124         return 4;
125     else
126     {
127         InvalidPolicy{}(cp);
128         return 0;
129     }
130 }
131 
// Moves it forward to the first byte of the next character, stopping
// at end if it is reached first. Does nothing when it == end.
template<typename Iterator, typename Sentinel>
void to_next(Iterator& it, const Sentinel& end) noexcept
{
    if (it == end)
        return;

    // step over the current byte, then over any continuation bytes
    do
    {
        ++it;
    }
    while (it != end and not is_character_start(*it));
}
140 
// Returns an iterator to the first byte of the next character, or to
// end when end is reached first; the argument itself is not modified.
template<typename Iterator, typename Sentinel>
Iterator next(Iterator it, const Sentinel& end) noexcept
{
    Iterator following = it;
    to_next(following, end);
    return following;
}
148 
// Returns its parameter if it points to a character first byte (or is
// equal to end); otherwise skips continuation bytes and returns the
// first position that is not one.
template<typename Iterator, typename Sentinel>
Iterator finish(Iterator it, const Sentinel& end) noexcept
{
    for (; it != end; ++it)
    {
        const bool continuation = (*it & 0xC0) == 0x80; // 10xxxxxx
        if (not continuation)
            break;
    }
    return it;
}
158 
// Moves it backward to the first byte of the previous character,
// stopping at begin if it is reached first. Does nothing when
// it == begin.
template<typename Iterator, typename Sentinel>
void to_previous(Iterator& it, const Sentinel& begin) noexcept
{
    if (it == begin)
        return;

    // step back one byte, then keep going while on continuation bytes
    do
    {
        --it;
    }
    while (it != begin and not is_character_start(*it));
}
// Returns an iterator to the first byte of the previous character, or
// to begin when begin is reached first; the argument itself is not
// modified.
template<typename Iterator, typename Sentinel>
Iterator previous(Iterator it, const Sentinel& begin) noexcept
{
    Iterator preceding = it;
    to_previous(preceding, begin);
    return preceding;
}
174 
175 // returns an iterator pointing to the first byte of the
176 // dth character after (or before if d < 0) the character
177 // pointed by it
178 template<typename Iterator, typename Sentinel>
179 Iterator advance(Iterator it, const Sentinel& end, CharCount d) noexcept
180 {
181     if (it == end)
182         return it;
183 
184     if (d < 0)
185     {
186         while (it != end and d++ != 0)
187             to_previous(it, end);
188     }
189     else if (d > 0)
190     {
191         while (it != end and d-- != 0)
192             to_next(it, end);
193     }
194     return it;
195 }
196 
197 // returns an iterator pointing to the first byte of the
198 // character at the dth column after (or before if d < 0)
199 // the character pointed by it
200 template<typename Iterator, typename Sentinel>
201 Iterator advance(Iterator it, const Sentinel& end, ColumnCount d) noexcept
202 {
203     if (it == end)
204         return it;
205 
206     if (d < 0)
207     {
208         while (it != end and d < 0)
209         {
210             auto cur = it;
211             to_previous(it, end);
212             d += codepoint_width(codepoint(it, cur));
213         }
214     }
215     else if (d > 0)
216     {
217         auto begin = it;
218         while (it != end and d > 0)
219         {
220             d -= codepoint_width(read_codepoint(it, end));
221             if (it != end and d < 0)
222                 to_previous(it, begin);
223         }
224     }
225     return it;
226 }
227 
228 // returns the character count between begin and end
229 template<typename Iterator, typename Sentinel>
distance(Iterator begin,const Sentinel & end)230 CharCount distance(Iterator begin, const Sentinel& end) noexcept
231 {
232     CharCount dist = 0;
233 
234     while (begin != end)
235     {
236         if (is_character_start(read(begin)))
237             ++dist;
238     }
239     return dist;
240 }
241 
242 // returns the column count between begin and end
243 template<typename Iterator, typename Sentinel>
column_distance(Iterator begin,const Sentinel & end)244 ColumnCount column_distance(Iterator begin, const Sentinel& end) noexcept
245 {
246     ColumnCount dist = 0;
247 
248     while (begin != end)
249         dist += codepoint_width(read_codepoint(begin, end));
250     return dist;
251 }
252 
// Returns an iterator to the first byte of the character it points
// into, walking backward over continuation bytes and never going
// past begin.
template<typename Iterator, typename Sentinel>
Iterator character_start(Iterator it, const Sentinel& begin) noexcept
{
    for (; it != begin; --it)
    {
        if (is_character_start(*it))
            break;
    }
    return it;
}
261 
262 // returns an optional iterator to the first byte of the previous character
263 // or no value if it is at begin
264 template<typename Iterator, typename Sentinel>
prev_codepoint(Iterator it,const Sentinel & begin)265 static Optional<Codepoint> prev_codepoint(Iterator it, const Sentinel& begin) noexcept
266 {
267     if (it <= begin)
268         return {};
269     return codepoint(character_start(it -1, begin), it);
270 }
271 
272 
273 template<typename OutputIterator, typename InvalidPolicy = utf8::InvalidPolicy::Pass>
dump(OutputIterator && it,Codepoint cp)274 void dump(OutputIterator&& it, Codepoint cp)
275 {
276     if (cp <= 0x7F)
277         *it++ = cp;
278     else if (cp <= 0x7FF)
279     {
280         *it++ = 0xC0 | (cp >> 6);
281         *it++ = 0x80 | (cp & 0x3F);
282     }
283     else if (cp <= 0xFFFF)
284     {
285         *it++ = 0xE0 | (cp >> 12);
286         *it++ = 0x80 | ((cp >> 6) & 0x3F);
287         *it++ = 0x80 | (cp & 0x3F);
288     }
289     else if (cp <= 0x10FFFF)
290     {
291         *it++ = 0xF0 | (cp >> 18);
292         *it++ = 0x80 | ((cp >> 12) & 0x3F);
293         *it++ = 0x80 | ((cp >> 6)  & 0x3F);
294         *it++ = 0x80 | (cp & 0x3F);
295     }
296     else
297         InvalidPolicy{}(cp);
298 }
299 
300 }
301 
302 }
303 
304 #endif // utf8_hh_INCLUDED
305