1 
2 /* Much of this is adapted from the great utf8lite library:
3  * https://github.com/patperry/utf8lite
4  */
5 
6 #include "charwidth.h"
7 #include "graphbreak.h"
8 #include "cli.h"
9 #include "errors.h"
10 
utf8lite_decode_utf8(const uint8_t ** bufptr,int32_t * codeptr)11 void utf8lite_decode_utf8(const uint8_t **bufptr, int32_t *codeptr) {
12   const uint8_t *ptr = *bufptr;
13   int32_t code;
14   uint_fast8_t ch;
15   unsigned nc;
16 
17   ch = *ptr++;
18   if (!(ch & 0x80)) {
19     code = ch;
20     nc = 0;
21   } else if (!(ch & 0x20)) {
22     code = ch & 0x1F;
23     nc = 1;
24   } else if (!(ch & 0x10)) {
25     code = ch & 0x0F;
26     nc = 2;
27   } else {
28     code = ch & 0x07;
29     nc = 3;
30   }
31 
32   while (nc-- > 0) {
33     ch = *ptr++;
34     if (ch == 0) R_THROW_ERROR("Incomplete UTF-8 character");
35     code = (code << 6) + (ch & 0x3F);
36   }
37 
38   *bufptr = ptr;
39   *codeptr = code;
40 }
41 
utf8lite_encode_utf8(int32_t code,uint8_t ** bufptr)42 void utf8lite_encode_utf8(int32_t code, uint8_t **bufptr) {
43   uint8_t *ptr = *bufptr;
44   int32_t x = code;
45 
46   if (x <= 0x7F) {
47     *ptr++ = (uint8_t)x;
48   } else if (x <= 0x07FF) {
49     *ptr++ = (uint8_t)(0xC0 | (x >> 6));
50     *ptr++ = (uint8_t)(0x80 | (x & 0x3F));
51   } else if (x <= 0xFFFF) {
52     *ptr++ = (uint8_t)(0xE0 | (x >> 12));
53     *ptr++ = (uint8_t)(0x80 | ((x >> 6) & 0x3F));
54     *ptr++ = (uint8_t)(0x80 | (x & 0x3F));
55   } else {
56     *ptr++ = (uint8_t)(0xF0 | (x >> 18));
57     *ptr++ = (uint8_t)(0x80 | ((x >> 12) & 0x3F));
58     *ptr++ = (uint8_t)(0x80 | ((x >> 6) & 0x3F));
59     *ptr++ = (uint8_t)(0x80 | (x & 0x3F));
60   }
61 
62   *bufptr = ptr;
63 }
64 
65 static int display_width_map[7] = {
66   /* CHARWIDTH_NONE =      */ 0,
67   /* CHARWIDTH_IGNORABLE = */ 0,
68   /* CHARWIDTH_MARK =      */ 0,
69   /* CHARWIDTH_NARROW =    */ 1,
70   /* CHARWIDTH_AMBIGUOUS = */ 1,
71   /* CHARWIDTH_WIDE =      */ 2,
72   /* CHARWIDTH_EMOJI =     */ 2
73 };
74 
75 #define NEXT() do {                                             \
76     iter->cnd = iter->nxt_ptr;                                  \
77     if (*(iter->nxt_ptr) == '\0') {                             \
78       iter->nxt_prop = -1;                                      \
79     } else {                                                    \
80       utf8lite_decode_utf8(&iter->nxt_ptr, &iter->nxt_code);    \
81       iter->nxt_prop = graph_break(iter->nxt_code);             \
82     }                                                           \
83     if (iter->cnd_width_done >= 0) {                            \
84       if (iter->nxt_cw >= 0) {                                  \
85         if (!iter->cnd_width_done) {                            \
86           iter->cnd_width += display_width_map[iter->nxt_cw];   \
87           if (iter->nxt_cw == CHARWIDTH_EMOJI) {                \
88             iter->cnd_width_done = 1;                           \
89           }                                                     \
90         }                                                       \
91       }                                                         \
92       if (iter->nxt_prop != -1) {                               \
93         iter->nxt_cw = charwidth(iter->nxt_code);               \
94       }                                                         \
95     }                                                           \
96   } while (0)
97 
clic_utf8_graphscan_make(struct grapheme_iterator * iter,const uint8_t * txt,int width)98 void clic_utf8_graphscan_make(struct grapheme_iterator *iter,
99                               const uint8_t *txt,
100                               int width) {
101   iter->nxt_ptr = txt;
102   iter->nxt_cw = -1;
103   iter->cnd_width = 0;
104   iter->cnd_width_done = -1 * (width == 0);
105   NEXT();
106 }
107 
clic_utf8_graphscan_next(struct grapheme_iterator * iter,uint8_t ** ptr,int * width)108 void clic_utf8_graphscan_next(struct grapheme_iterator *iter,
109                               uint8_t **ptr,
110                               int *width) {
111   if (ptr) *ptr = (uint8_t*) iter->cnd;
112 
113  Start:
114   // GB2: Break at the end of text
115   if (iter->nxt_prop < 0) {
116     goto Break;
117   }
118 
119   switch (iter->nxt_prop) {
120   case GRAPH_BREAK_CR:
121     NEXT();
122     goto CR;
123 
124   case GRAPH_BREAK_CONTROL:
125   case GRAPH_BREAK_LF:
126     // Break after controls
127     // GB4: (Newline | LF) +
128     NEXT();
129     goto Break;
130 
131   case GRAPH_BREAK_L:
132     NEXT();
133     goto L;
134 
135   case GRAPH_BREAK_LV:
136   case GRAPH_BREAK_V:
137     NEXT();
138     goto V;
139 
140   case GRAPH_BREAK_LVT:
141   case GRAPH_BREAK_T:
142     NEXT();
143     goto T;
144 
145   case GRAPH_BREAK_PREPEND:
146     NEXT();
147     goto Prepend;
148 
149   case GRAPH_BREAK_EXTENDED_PICTOGRAPHIC:
150     NEXT();
151     goto Extended_Pictographic;
152 
153   case GRAPH_BREAK_REGIONAL_INDICATOR:
154     NEXT();
155     goto Regional_Indicator;
156 
157   case GRAPH_BREAK_EXTEND:
158   case GRAPH_BREAK_SPACINGMARK:
159   case GRAPH_BREAK_ZWJ:
160   case GRAPH_BREAK_OTHER:
161     NEXT();
162     goto MaybeBreak;
163   }
164 
165   R_THROW_ERROR("internal error, unhandled grapheme break property");
166 
167  CR:
168   // GB3: Do not break within CRLF
169   // GB4: Otherwise break after controls
170   if (iter->nxt_prop == GRAPH_BREAK_LF) {
171     NEXT();
172   }
173   goto Break;
174 
175  L:
176   // GB6: Do not break Hangul syllable sequences.
177   switch (iter->nxt_prop) {
178   case GRAPH_BREAK_L:
179     NEXT();
180     goto L;
181 
182   case GRAPH_BREAK_V:
183   case GRAPH_BREAK_LV:
184     NEXT();
185     goto V;
186 
187   case GRAPH_BREAK_LVT:
188     NEXT();
189     goto T;
190 
191   default:
192     goto MaybeBreak;
193   }
194 
195 
196  V:
197   // GB7: Do not break Hangul syllable sequences.
198   switch (iter->nxt_prop) {
199   case GRAPH_BREAK_V:
200     NEXT();
201     goto V;
202 
203   case GRAPH_BREAK_T:
204     NEXT();
205     goto T;
206 
207   default:
208     goto MaybeBreak;
209   }
210 
211  T:
212   // GB8: Do not break Hangul syllable sequences.
213   switch (iter->nxt_prop) {
214   case GRAPH_BREAK_T:
215     NEXT();
216     goto T;
217 
218   default:
219     goto MaybeBreak;
220   }
221 
222  Prepend:
223   switch (iter->nxt_prop) {
224   case GRAPH_BREAK_CONTROL:
225   case GRAPH_BREAK_CR:
226   case GRAPH_BREAK_LF:
227     // GB5: break before controls
228     goto Break;
229 
230   default:
231     // GB9b: do not break after Prepend characters.
232     goto Start;
233   }
234 
235  Extended_Pictographic:
236   // GB9:  Do not break before extending characters
237   while (iter->nxt_prop == GRAPH_BREAK_EXTEND) {
238     NEXT();
239   }
240   // GB9: Do not break before ZWJ
241   if (iter->nxt_prop == GRAPH_BREAK_ZWJ) {
242     NEXT();
243     // GB11: Do not break within emoji modifier sequences
244     // or emoji zwj sequences.
245     if (iter->nxt_prop == GRAPH_BREAK_EXTENDED_PICTOGRAPHIC) {
246       NEXT();
247       goto Extended_Pictographic;
248     }
249   }
250   goto MaybeBreak;
251 
252  Regional_Indicator:
253   // Do not break within emoji flag sequences. That is, do not break
254   // between regional indicator (RI) symbols if there is an odd number
255   // of RI characters before the break point
256   if (iter->nxt_prop == GRAPH_BREAK_REGIONAL_INDICATOR) {
257     // GB12/13: [^RI] RI * RI
258     NEXT();
259   }
260   goto MaybeBreak;
261 
262  MaybeBreak:
263   // GB9: Do not break before extending characters or ZWJ.
264   // GB9a: Do not break before SpacingMark [extended grapheme clusters]
265   // GB999: Otherwise, break everywhere
266   switch (iter->nxt_prop) {
267   case GRAPH_BREAK_EXTEND:
268   case GRAPH_BREAK_SPACINGMARK:
269   case GRAPH_BREAK_ZWJ:
270     NEXT();
271     goto MaybeBreak;
272 
273   default:
274     goto Break;
275   }
276 
277  Break:
278   if (width) *width = iter->cnd_width;
279   iter->cnd_width = 0;
280   if (iter->cnd_width_done > 0) iter->cnd_width_done = 0;
281   return;
282 }
283 
clic_utf8_display_width(SEXP x)284 SEXP clic_utf8_display_width(SEXP x) {
285   R_xlen_t i, len = XLENGTH(x);
286   SEXP res = PROTECT(allocVector(INTSXP, len));
287   int *pres = INTEGER(res);
288 
289   for (i = 0; i < len; i++) {
290     SEXP x1 = STRING_ELT(x, i);
291     if (x1 == NA_STRING) {
292       pres[i] = NA_INTEGER;
293     } else {
294       struct grapheme_iterator iter;
295       const uint8_t *chr = (const uint8_t*) CHAR(x1);
296       clic_utf8_graphscan_make(&iter, chr, /* width= */ 1);
297       int len = 0;
298       int width;
299       while (iter.nxt_prop != -1) {
300         clic_utf8_graphscan_next(&iter, NULL, &width);
301         len += width;
302       }
303       pres[i] = len;
304     }
305   }
306 
307   UNPROTECT(1);
308   return res;
309 }
310 
clic_utf8_nchar_graphemes(SEXP x)311 SEXP clic_utf8_nchar_graphemes(SEXP x) {
312   R_xlen_t i, len = XLENGTH(x);
313   SEXP res = PROTECT(allocVector(INTSXP, len));
314   int *pres = INTEGER(res);
315 
316   for (i = 0; i < len; i++) {
317     SEXP x1 = STRING_ELT(x, i);
318     if (x1 == NA_STRING) {
319       pres[i] = NA_INTEGER;
320     } else {
321       struct grapheme_iterator iter;
322       const uint8_t *chr = (const uint8_t*) CHAR(x1);
323       int len = 0;
324       clic_utf8_graphscan_make(&iter, chr, /* width= */ 0);
325       while (iter.nxt_prop != -1) {
326         clic_utf8_graphscan_next(&iter, NULL, NULL);
327         len ++;
328       }
329 
330       pres[i] = len;
331     }
332   }
333 
334   UNPROTECT(1);
335   return res;
336 }
337 
clic_utf8_substr(SEXP x,SEXP sstart,SEXP sstop)338 SEXP clic_utf8_substr(SEXP x, SEXP sstart, SEXP sstop) {
339   R_xlen_t i, len = XLENGTH(x);
340   SEXP res = PROTECT(allocVector(STRSXP, len));
341 
342   for (i = 0; i < len; i++) {
343     SEXP x1 = STRING_ELT(x, i);
344     if (x1 == NA_STRING) {
345       SET_STRING_ELT(res, i, x1);
346     } else {
347       const uint8_t *str = (const uint8_t*) CHAR(x1);
348       int pos = 1;
349       int start = INTEGER(sstart)[i];
350       int stop = INTEGER(sstop)[i];
351       struct grapheme_iterator iter;
352 
353       /* Skip initial part */
354       clic_utf8_graphscan_make(&iter, (const uint8_t*) str, /* width = */ 0);
355       while (pos < start && iter.nxt_prop != -1) {
356         clic_utf8_graphscan_next(&iter, NULL, NULL);
357         pos++;
358       }
359       const char* from = (const char*) iter.cnd;
360 
361       /* Iterate to the end */
362       while (pos <= stop && iter.nxt_prop != -1) {
363         clic_utf8_graphscan_next(&iter, NULL, NULL);
364         pos++;
365       }
366       const char *to = (const char*) iter.cnd;
367 
368       if (from < to) {
369         SET_STRING_ELT(res, i, Rf_mkCharLenCE(from, to - from, CE_UTF8));
370       }
371     }
372   }
373 
374   UNPROTECT(1);
375   return res;
376 }
377 
clic_utf8_graphemes(SEXP x)378 SEXP clic_utf8_graphemes(SEXP x) {
379   R_xlen_t i, len = XLENGTH(x);
380   SEXP res = PROTECT(allocVector(VECSXP, len));
381 
382   for (i = 0; i < len; i++) {
383     SEXP x1 = STRING_ELT(x, i);
384     if (x1 == NA_STRING) {
385       SET_VECTOR_ELT(res, i, Rf_ScalarString(x1));
386     } else {
387       const uint8_t *str = (const uint8_t*) CHAR(x1);
388       struct grapheme_iterator iter;
389       SEXP pieces = PROTECT(allocVector(STRSXP, strlen((const char*) str)));
390       R_xlen_t idx = 0;
391       uint8_t *start = 0;
392 
393       clic_utf8_graphscan_make(&iter, str, /* width = */ 0);
394       while (iter.nxt_prop != -1) {
395         clic_utf8_graphscan_next(&iter, &start, NULL);
396         SET_STRING_ELT(
397           pieces,
398           idx++,
399           Rf_mkCharLenCE((const char*) start, iter.cnd - start, CE_UTF8)
400         );
401       }
402 
403       SET_VECTOR_ELT(res, i, PROTECT(Rf_xlengthgets(pieces, idx)));
404       UNPROTECT(2);
405     }
406   }
407 
408   UNPROTECT(1);
409   return res;
410 }
411