1
2 /* Much of this is adapted from the great utf8lite library:
3 * https://github.com/patperry/utf8lite
4 */
5
6 #include "charwidth.h"
7 #include "graphbreak.h"
8 #include "cli.h"
9 #include "errors.h"
10
utf8lite_decode_utf8(const uint8_t ** bufptr,int32_t * codeptr)11 void utf8lite_decode_utf8(const uint8_t **bufptr, int32_t *codeptr) {
12 const uint8_t *ptr = *bufptr;
13 int32_t code;
14 uint_fast8_t ch;
15 unsigned nc;
16
17 ch = *ptr++;
18 if (!(ch & 0x80)) {
19 code = ch;
20 nc = 0;
21 } else if (!(ch & 0x20)) {
22 code = ch & 0x1F;
23 nc = 1;
24 } else if (!(ch & 0x10)) {
25 code = ch & 0x0F;
26 nc = 2;
27 } else {
28 code = ch & 0x07;
29 nc = 3;
30 }
31
32 while (nc-- > 0) {
33 ch = *ptr++;
34 if (ch == 0) R_THROW_ERROR("Incomplete UTF-8 character");
35 code = (code << 6) + (ch & 0x3F);
36 }
37
38 *bufptr = ptr;
39 *codeptr = code;
40 }
41
utf8lite_encode_utf8(int32_t code,uint8_t ** bufptr)42 void utf8lite_encode_utf8(int32_t code, uint8_t **bufptr) {
43 uint8_t *ptr = *bufptr;
44 int32_t x = code;
45
46 if (x <= 0x7F) {
47 *ptr++ = (uint8_t)x;
48 } else if (x <= 0x07FF) {
49 *ptr++ = (uint8_t)(0xC0 | (x >> 6));
50 *ptr++ = (uint8_t)(0x80 | (x & 0x3F));
51 } else if (x <= 0xFFFF) {
52 *ptr++ = (uint8_t)(0xE0 | (x >> 12));
53 *ptr++ = (uint8_t)(0x80 | ((x >> 6) & 0x3F));
54 *ptr++ = (uint8_t)(0x80 | (x & 0x3F));
55 } else {
56 *ptr++ = (uint8_t)(0xF0 | (x >> 18));
57 *ptr++ = (uint8_t)(0x80 | ((x >> 12) & 0x3F));
58 *ptr++ = (uint8_t)(0x80 | ((x >> 6) & 0x3F));
59 *ptr++ = (uint8_t)(0x80 | (x & 0x3F));
60 }
61
62 *bufptr = ptr;
63 }
64
65 static int display_width_map[7] = {
66 /* CHARWIDTH_NONE = */ 0,
67 /* CHARWIDTH_IGNORABLE = */ 0,
68 /* CHARWIDTH_MARK = */ 0,
69 /* CHARWIDTH_NARROW = */ 1,
70 /* CHARWIDTH_AMBIGUOUS = */ 1,
71 /* CHARWIDTH_WIDE = */ 2,
72 /* CHARWIDTH_EMOJI = */ 2
73 };
74
75 #define NEXT() do { \
76 iter->cnd = iter->nxt_ptr; \
77 if (*(iter->nxt_ptr) == '\0') { \
78 iter->nxt_prop = -1; \
79 } else { \
80 utf8lite_decode_utf8(&iter->nxt_ptr, &iter->nxt_code); \
81 iter->nxt_prop = graph_break(iter->nxt_code); \
82 } \
83 if (iter->cnd_width_done >= 0) { \
84 if (iter->nxt_cw >= 0) { \
85 if (!iter->cnd_width_done) { \
86 iter->cnd_width += display_width_map[iter->nxt_cw]; \
87 if (iter->nxt_cw == CHARWIDTH_EMOJI) { \
88 iter->cnd_width_done = 1; \
89 } \
90 } \
91 } \
92 if (iter->nxt_prop != -1) { \
93 iter->nxt_cw = charwidth(iter->nxt_code); \
94 } \
95 } \
96 } while (0)
97
clic_utf8_graphscan_make(struct grapheme_iterator * iter,const uint8_t * txt,int width)98 void clic_utf8_graphscan_make(struct grapheme_iterator *iter,
99 const uint8_t *txt,
100 int width) {
101 iter->nxt_ptr = txt;
102 iter->nxt_cw = -1;
103 iter->cnd_width = 0;
104 iter->cnd_width_done = -1 * (width == 0);
105 NEXT();
106 }
107
clic_utf8_graphscan_next(struct grapheme_iterator * iter,uint8_t ** ptr,int * width)108 void clic_utf8_graphscan_next(struct grapheme_iterator *iter,
109 uint8_t **ptr,
110 int *width) {
111 if (ptr) *ptr = (uint8_t*) iter->cnd;
112
113 Start:
114 // GB2: Break at the end of text
115 if (iter->nxt_prop < 0) {
116 goto Break;
117 }
118
119 switch (iter->nxt_prop) {
120 case GRAPH_BREAK_CR:
121 NEXT();
122 goto CR;
123
124 case GRAPH_BREAK_CONTROL:
125 case GRAPH_BREAK_LF:
126 // Break after controls
127 // GB4: (Newline | LF) +
128 NEXT();
129 goto Break;
130
131 case GRAPH_BREAK_L:
132 NEXT();
133 goto L;
134
135 case GRAPH_BREAK_LV:
136 case GRAPH_BREAK_V:
137 NEXT();
138 goto V;
139
140 case GRAPH_BREAK_LVT:
141 case GRAPH_BREAK_T:
142 NEXT();
143 goto T;
144
145 case GRAPH_BREAK_PREPEND:
146 NEXT();
147 goto Prepend;
148
149 case GRAPH_BREAK_EXTENDED_PICTOGRAPHIC:
150 NEXT();
151 goto Extended_Pictographic;
152
153 case GRAPH_BREAK_REGIONAL_INDICATOR:
154 NEXT();
155 goto Regional_Indicator;
156
157 case GRAPH_BREAK_EXTEND:
158 case GRAPH_BREAK_SPACINGMARK:
159 case GRAPH_BREAK_ZWJ:
160 case GRAPH_BREAK_OTHER:
161 NEXT();
162 goto MaybeBreak;
163 }
164
165 R_THROW_ERROR("internal error, unhandled grapheme break property");
166
167 CR:
168 // GB3: Do not break within CRLF
169 // GB4: Otherwise break after controls
170 if (iter->nxt_prop == GRAPH_BREAK_LF) {
171 NEXT();
172 }
173 goto Break;
174
175 L:
176 // GB6: Do not break Hangul syllable sequences.
177 switch (iter->nxt_prop) {
178 case GRAPH_BREAK_L:
179 NEXT();
180 goto L;
181
182 case GRAPH_BREAK_V:
183 case GRAPH_BREAK_LV:
184 NEXT();
185 goto V;
186
187 case GRAPH_BREAK_LVT:
188 NEXT();
189 goto T;
190
191 default:
192 goto MaybeBreak;
193 }
194
195
196 V:
197 // GB7: Do not break Hangul syllable sequences.
198 switch (iter->nxt_prop) {
199 case GRAPH_BREAK_V:
200 NEXT();
201 goto V;
202
203 case GRAPH_BREAK_T:
204 NEXT();
205 goto T;
206
207 default:
208 goto MaybeBreak;
209 }
210
211 T:
212 // GB8: Do not break Hangul syllable sequences.
213 switch (iter->nxt_prop) {
214 case GRAPH_BREAK_T:
215 NEXT();
216 goto T;
217
218 default:
219 goto MaybeBreak;
220 }
221
222 Prepend:
223 switch (iter->nxt_prop) {
224 case GRAPH_BREAK_CONTROL:
225 case GRAPH_BREAK_CR:
226 case GRAPH_BREAK_LF:
227 // GB5: break before controls
228 goto Break;
229
230 default:
231 // GB9b: do not break after Prepend characters.
232 goto Start;
233 }
234
235 Extended_Pictographic:
236 // GB9: Do not break before extending characters
237 while (iter->nxt_prop == GRAPH_BREAK_EXTEND) {
238 NEXT();
239 }
240 // GB9: Do not break before ZWJ
241 if (iter->nxt_prop == GRAPH_BREAK_ZWJ) {
242 NEXT();
243 // GB11: Do not break within emoji modifier sequences
244 // or emoji zwj sequences.
245 if (iter->nxt_prop == GRAPH_BREAK_EXTENDED_PICTOGRAPHIC) {
246 NEXT();
247 goto Extended_Pictographic;
248 }
249 }
250 goto MaybeBreak;
251
252 Regional_Indicator:
253 // Do not break within emoji flag sequences. That is, do not break
254 // between regional indicator (RI) symbols if there is an odd number
255 // of RI characters before the break point
256 if (iter->nxt_prop == GRAPH_BREAK_REGIONAL_INDICATOR) {
257 // GB12/13: [^RI] RI * RI
258 NEXT();
259 }
260 goto MaybeBreak;
261
262 MaybeBreak:
263 // GB9: Do not break before extending characters or ZWJ.
264 // GB9a: Do not break before SpacingMark [extended grapheme clusters]
265 // GB999: Otherwise, break everywhere
266 switch (iter->nxt_prop) {
267 case GRAPH_BREAK_EXTEND:
268 case GRAPH_BREAK_SPACINGMARK:
269 case GRAPH_BREAK_ZWJ:
270 NEXT();
271 goto MaybeBreak;
272
273 default:
274 goto Break;
275 }
276
277 Break:
278 if (width) *width = iter->cnd_width;
279 iter->cnd_width = 0;
280 if (iter->cnd_width_done > 0) iter->cnd_width_done = 0;
281 return;
282 }
283
clic_utf8_display_width(SEXP x)284 SEXP clic_utf8_display_width(SEXP x) {
285 R_xlen_t i, len = XLENGTH(x);
286 SEXP res = PROTECT(allocVector(INTSXP, len));
287 int *pres = INTEGER(res);
288
289 for (i = 0; i < len; i++) {
290 SEXP x1 = STRING_ELT(x, i);
291 if (x1 == NA_STRING) {
292 pres[i] = NA_INTEGER;
293 } else {
294 struct grapheme_iterator iter;
295 const uint8_t *chr = (const uint8_t*) CHAR(x1);
296 clic_utf8_graphscan_make(&iter, chr, /* width= */ 1);
297 int len = 0;
298 int width;
299 while (iter.nxt_prop != -1) {
300 clic_utf8_graphscan_next(&iter, NULL, &width);
301 len += width;
302 }
303 pres[i] = len;
304 }
305 }
306
307 UNPROTECT(1);
308 return res;
309 }
310
clic_utf8_nchar_graphemes(SEXP x)311 SEXP clic_utf8_nchar_graphemes(SEXP x) {
312 R_xlen_t i, len = XLENGTH(x);
313 SEXP res = PROTECT(allocVector(INTSXP, len));
314 int *pres = INTEGER(res);
315
316 for (i = 0; i < len; i++) {
317 SEXP x1 = STRING_ELT(x, i);
318 if (x1 == NA_STRING) {
319 pres[i] = NA_INTEGER;
320 } else {
321 struct grapheme_iterator iter;
322 const uint8_t *chr = (const uint8_t*) CHAR(x1);
323 int len = 0;
324 clic_utf8_graphscan_make(&iter, chr, /* width= */ 0);
325 while (iter.nxt_prop != -1) {
326 clic_utf8_graphscan_next(&iter, NULL, NULL);
327 len ++;
328 }
329
330 pres[i] = len;
331 }
332 }
333
334 UNPROTECT(1);
335 return res;
336 }
337
clic_utf8_substr(SEXP x,SEXP sstart,SEXP sstop)338 SEXP clic_utf8_substr(SEXP x, SEXP sstart, SEXP sstop) {
339 R_xlen_t i, len = XLENGTH(x);
340 SEXP res = PROTECT(allocVector(STRSXP, len));
341
342 for (i = 0; i < len; i++) {
343 SEXP x1 = STRING_ELT(x, i);
344 if (x1 == NA_STRING) {
345 SET_STRING_ELT(res, i, x1);
346 } else {
347 const uint8_t *str = (const uint8_t*) CHAR(x1);
348 int pos = 1;
349 int start = INTEGER(sstart)[i];
350 int stop = INTEGER(sstop)[i];
351 struct grapheme_iterator iter;
352
353 /* Skip initial part */
354 clic_utf8_graphscan_make(&iter, (const uint8_t*) str, /* width = */ 0);
355 while (pos < start && iter.nxt_prop != -1) {
356 clic_utf8_graphscan_next(&iter, NULL, NULL);
357 pos++;
358 }
359 const char* from = (const char*) iter.cnd;
360
361 /* Iterate to the end */
362 while (pos <= stop && iter.nxt_prop != -1) {
363 clic_utf8_graphscan_next(&iter, NULL, NULL);
364 pos++;
365 }
366 const char *to = (const char*) iter.cnd;
367
368 if (from < to) {
369 SET_STRING_ELT(res, i, Rf_mkCharLenCE(from, to - from, CE_UTF8));
370 }
371 }
372 }
373
374 UNPROTECT(1);
375 return res;
376 }
377
clic_utf8_graphemes(SEXP x)378 SEXP clic_utf8_graphemes(SEXP x) {
379 R_xlen_t i, len = XLENGTH(x);
380 SEXP res = PROTECT(allocVector(VECSXP, len));
381
382 for (i = 0; i < len; i++) {
383 SEXP x1 = STRING_ELT(x, i);
384 if (x1 == NA_STRING) {
385 SET_VECTOR_ELT(res, i, Rf_ScalarString(x1));
386 } else {
387 const uint8_t *str = (const uint8_t*) CHAR(x1);
388 struct grapheme_iterator iter;
389 SEXP pieces = PROTECT(allocVector(STRSXP, strlen((const char*) str)));
390 R_xlen_t idx = 0;
391 uint8_t *start = 0;
392
393 clic_utf8_graphscan_make(&iter, str, /* width = */ 0);
394 while (iter.nxt_prop != -1) {
395 clic_utf8_graphscan_next(&iter, &start, NULL);
396 SET_STRING_ELT(
397 pieces,
398 idx++,
399 Rf_mkCharLenCE((const char*) start, iter.cnd - start, CE_UTF8)
400 );
401 }
402
403 SET_VECTOR_ELT(res, i, PROTECT(Rf_xlengthgets(pieces, idx)));
404 UNPROTECT(2);
405 }
406 }
407
408 UNPROTECT(1);
409 return res;
410 }
411