1 #include <stdio.h>
2 
3 #include <Rinternals.h>
4 
/* Diagnostic text for the most recent failed check.  This is a single shared
   static buffer: every failing utf8_check_() call overwrites it. */
static char cause[512];

/*
 * report(reason): record a diagnostic for the byte at the current scan
 * position in `cause`, publish the longest sequence length seen so far through
 * max_cl (when it is non-NULL) and make the enclosing function return 1.
 *
 * It reads the locals buf, i, line and maxcl and the parameter max_cl, so it
 * can only be used inside utf8_check_().  Wrapped in do { } while (0) so that
 * `if (x) report("..."); else ...` parses as intended.
 */
#define report(reason) do { \
        snprintf(cause, sizeof(cause), "INVALID byte 0x%02x at 0x%lx (%lu, line %lu): %s\n", \
                 (unsigned int) buf[i], i, i, line, (reason)); \
        if (max_cl) *max_cl = maxcl; \
        return 1; \
    } while (0)

/* Non-zero when c has the 10xxxxxx bit pattern of a UTF-8 continuation byte. */
static int utf8_is_cont_(unsigned char c) {
    return c >= 0x80 && c <= 0xbf;
}

/*
 * Validate buf[0..len-1] as UTF-8.
 *
 * buf      bytes to check
 * len      number of bytes in buf
 * max_cl   optional (may be NULL); receives the length in bytes of the longest
 *          encoded sequence seen (1..4).  It is written on success and on
 *          failure; on failure it covers the sequences accepted before the
 *          offending byte.
 * min_char when > 0, any sequence-start byte smaller than this value is
 *          rejected as a disallowed control character
 *
 * Returns 0 if the whole buffer is valid, 1 otherwise.  On failure a
 * description of the problem is left in `cause`.
 */
static int utf8_check_(const unsigned char *buf, unsigned long len, int *max_cl, int min_char) {
    unsigned long i = 0, line = 1; /* line counts '\n' bytes for the diagnostic */
    int maxcl = 1;

    while (i < len) {
        const unsigned char lead = buf[i];

        if (min_char > 0 && lead < min_char) {
            report("disallowed control character");
        }

        if (lead < 0x80) {
            if (lead == '\n') line++;
        } else if (lead < 0xc0) {
            report("2+ byte of a sequence found in first position");
        } else if (lead < 0xc2) {
            report("overlong encoding (<=127 encoded)");
        } else if (lead < 0xe0) { /* 2-byte seq */
            if (len - i < 2) break; /* truncated, reported after the loop */
            i++;
            if (!utf8_is_cont_(buf[i])) {
                report("invalid second byte in 2-byte encoding");
            }
            if (maxcl < 2) maxcl = 2;
        } else if (lead < 0xf0) { /* 3-byte seq */
            if (len - i < 3) break;
            i++;
            if (!utf8_is_cont_(buf[i])) {
                report("invalid second byte in 3-byte encoding");
            }
            /* E0 must be followed by A0..BF, otherwise the value fits in 2 bytes */
            if (lead == 0xe0 && buf[i] < 0xa0) {
                report("overlong encoding (<=0x7FF encoded in 3 bytes)");
            }
            /* ED A0..BF would encode U+D800..U+DFFF, which UTF-8 forbids */
            if (lead == 0xed && buf[i] > 0x9f) {
                report("UTF-16 surrogate (0xD800-0xDFFF) encoded");
            }
            i++;
            if (!utf8_is_cont_(buf[i])) {
                report("invalid third byte in 3-byte encoding");
            }
            if (maxcl < 3) maxcl = 3;
        } else if (lead < 0xf5) { /* 4-byte seq */
            if (len - i < 4) break;
            i++;
            if (!utf8_is_cont_(buf[i])) {
                report("invalid second byte in 4-byte encoding");
            }
            /* F0 must be followed by 90..BF, otherwise the value fits in 3 bytes */
            if (lead == 0xf0 && buf[i] < 0x90) {
                report("overlong encoding (<=0xFFFF encoded in 4 bytes)");
            }
            /* F4 90..BF would encode a value above U+10FFFF */
            if (lead == 0xf4 && buf[i] > 0x8f) {
                report("codepoint above 0x10FFFF");
            }
            i++;
            if (!utf8_is_cont_(buf[i])) {
                report("invalid third byte in 4-byte encoding");
            }
            i++;
            if (!utf8_is_cont_(buf[i])) {
                report("invalid fourth byte in 4-byte encoding");
            }
            if (maxcl < 4) maxcl = 4;
        } else if (lead < 0xfe) {
            report("invalid start of a codepoint above 0x10FFFF");
        } else {
            report("invalid start byte (FE/FF)");
        }
        i++;
    }

    /* The loop is only left early when a lead byte has too few bytes after it;
       i then still points at that lead byte. */
    if (i < len) {
        report("unterminated multi-byte sequence at the end of file");
    }

    if (max_cl) *max_cl = maxcl;
    return 0;
}
70 
/*
 * R-callable wrapper around utf8_check_().
 *
 * sWhat    raw vector holding the bytes to validate; any other type raises
 *          the R error "invalid input"
 * sQuiet   coerced with asInteger(); when it is 0 and the check fails, an R
 *          error carrying the text in `cause` is raised
 * sXLen    coerced with asInteger(); non-zero selects the integer result
 * sMinChar coerced with asInteger() and passed on as the min_char argument
 *
 * Returns, when sXLen is non-zero, an integer scalar: the value reported
 * through utf8_check_()'s max_cl argument (initialised to 0 here), negated if
 * the check failed.  Otherwise returns a logical scalar, TRUE when the check
 * passed and FALSE when it failed.
 */
SEXP utf8_check(SEXP sWhat, SEXP sQuiet, SEXP sXLen, SEXP sMinChar) {
    int longest = 0;
    int status;

    if (TYPEOF(sWhat) != RAWSXP) {
        Rf_error("invalid input");
    }

    status = utf8_check_((const unsigned char*) RAW(sWhat), XLENGTH(sWhat),
                         &longest, asInteger(sMinChar));

    /* sQuiet is coerced whether or not the check failed */
    if (asInteger(sQuiet) == 0 && status != 0) {
        Rf_error("%s", cause);
    }

    if (asInteger(sXLen) == 0) {
        return ScalarLogical(status != 0 ? FALSE : TRUE);
    }

    return ScalarInteger(status != 0 ? -longest : longest);
}
84