1 #include <stdio.h>
2
3 #include <Rinternals.h>
4
/*
 * Record a validation failure and leave the enclosing function with 1.
 *
 * Must be expanded inside a function that has `buf`, `i`, `line`, `maxcl`
 * and `max_cl` in scope: it formats the offending byte buf[i], its offset
 * (hex and decimal) and the current line number into `cause`, stores the
 * longest sequence length seen so far through `max_cl` (when non-NULL) and
 * returns 1.  Wrapped in do/while(0) so it behaves as a single statement.
 */
#define report(reason) do { \
    snprintf(cause, sizeof(cause), "INVALID byte 0x%02x at 0x%lx (%lu, line %lu): %s\n", \
             (unsigned int) buf[i], i, i, line, reason); \
    if (max_cl) *max_cl = maxcl; \
    return 1; \
} while (0)

/* Text describing the most recent validation failure; written by report(). */
static char cause[512];

/* Non-zero when c has the form 10xxxxxx, i.e. is a UTF-8 continuation byte. */
static int utf8_is_continuation(unsigned char c) {
    return (c & 0xC0) == 0x80;
}

/*
 * Validate `len` bytes starting at `buf` as UTF-8.
 *
 * buf      - bytes to check
 * len      - number of bytes in buf
 * max_cl   - optional (may be NULL); receives the length in bytes (1..4) of
 *            the longest sequence accepted before the function returned,
 *            on success as well as on failure
 * min_char - when > 0, any sequence whose first byte is below this value is
 *            rejected as a disallowed control character
 *
 * Returns 0 when the whole buffer is valid, 1 otherwise; on failure `cause`
 * holds a description of the problem.
 *
 * Besides malformed lead/continuation bytes, the forms excluded by RFC 3629
 * are rejected: overlong encodings, UTF-16 surrogates (U+D800..U+DFFF) and
 * code points above U+10FFFF.
 */
static int utf8_check_(const unsigned char *buf, unsigned long len, int *max_cl, int min_char) {
    unsigned long i = 0, bp = len, line = 1;
    int maxcl = 1;

    while (i < bp) {
        const unsigned char lead = buf[i];

        if (min_char > 0 && lead < min_char)
            report("disallowed control character");
        if (lead < 128) {
            if (lead == '\n') line++;
        } else if (lead < 192) {
            report("2+ byte of a sequence found in first position");
        } else if (lead < 194) {
            report("overlong encoding (<=127 encoded)");
        } else if (lead < 224) { /* 2-byte sequence */
            /* i < bp here, so bp - i cannot wrap; a short tail is reported after the loop */
            if (bp - i < 2) break;
            i++;
            if (!utf8_is_continuation(buf[i]))
                report("invalid second byte in 2-byte encoding");
            if (maxcl < 2) maxcl = 2;
        } else if (lead < 240) { /* 3-byte sequence */
            if (bp - i < 3) break;
            i++;
            if (!utf8_is_continuation(buf[i]))
                report("invalid second byte in 3-byte encoding");
            /* E0 80..9F would encode a value that fits in two bytes */
            if (lead == 0xE0 && buf[i] < 0xA0)
                report("overlong encoding (<=0x7FF encoded in 3 bytes)");
            /* ED A0..BF encodes U+D800..U+DFFF */
            if (lead == 0xED && buf[i] > 0x9F)
                report("UTF-16 surrogate (0xD800-0xDFFF) encoded");
            i++;
            if (!utf8_is_continuation(buf[i]))
                report("invalid third byte in 3-byte encoding");
            if (maxcl < 3) maxcl = 3;
        } else if (lead < 245) { /* 4-byte sequence */
            if (bp - i < 4) break;
            i++;
            if (!utf8_is_continuation(buf[i]))
                report("invalid second byte in 4-byte encoding");
            /* F0 80..8F would encode a value that fits in three bytes */
            if (lead == 0xF0 && buf[i] < 0x90)
                report("overlong encoding (<=0xFFFF encoded in 4 bytes)");
            /* F4 90..BF encodes a value beyond U+10FFFF */
            if (lead == 0xF4 && buf[i] > 0x8F)
                report("codepoint above 0x10FFFF");
            i++;
            if (!utf8_is_continuation(buf[i]))
                report("invalid third byte in 4-byte encoding");
            i++;
            if (!utf8_is_continuation(buf[i]))
                report("invalid fourth byte in 4-byte encoding");
            if (maxcl < 4) maxcl = 4;
        } else if (lead < 254) {
            report("invalid start of a codepoint above 0x10FFFF");
        } else {
            report("invalid start byte (FE/FF)");
        }
        i++;
    }

    /* The loop was left early: buf[i] starts a sequence that runs past the end. */
    if (i < bp)
        report("unterminated multi-byte sequence at the end of file");

    if (max_cl) *max_cl = maxcl;
    return 0;
}
70
/*
 * R entry point: check the bytes of a raw vector with utf8_check_().
 *
 * sWhat    - must be a raw vector (RAWSXP); anything else raises the R
 *            error "invalid input"
 * sQuiet   - coerced with asInteger(); when it is 0 and the check fails,
 *            an R error carrying the text in `cause` is raised
 * sXLen    - coerced with asInteger(); when non-zero an integer scalar is
 *            returned instead of a logical one
 * sMinChar - coerced with asInteger() and passed on as min_char
 *
 * Returns, when sXLen is non-zero, the character-length value obtained
 * through utf8_check_()'s max_cl argument (it starts at 0 and keeps that
 * value if utf8_check_() does not write it), negated when the check failed.
 * Otherwise returns TRUE for a passing check and FALSE for a failing one.
 */
SEXP utf8_check(SEXP sWhat, SEXP sQuiet, SEXP sXLen, SEXP sMinChar) {
    int char_len = 0;
    int failed;
    int quiet;

    if (TYPEOF(sWhat) != RAWSXP)
        Rf_error("invalid input");

    failed = utf8_check_((const unsigned char*) RAW(sWhat), XLENGTH(sWhat),
                         &char_len, asInteger(sMinChar));

    /* sQuiet is coerced unconditionally, before the result is inspected */
    quiet = asInteger(sQuiet);
    if (quiet == 0 && failed)
        Rf_error("%s", cause);

    if (asInteger(sXLen) != 0) {
        /* sign encodes validity: positive = valid, negative = invalid */
        if (failed == 0)
            return ScalarInteger(char_len);
        return ScalarInteger(-char_len);
    }

    if (failed == 0)
        return ScalarLogical(TRUE);
    return ScalarLogical(FALSE);
}
84