1## This takes the definition of UTF-8 as RFC3629 (2003),
2## but not all software does.
3## See also https://en.wikipedia.org/wiki/UTF-8
4
5x <- 1L:0x10FFFF
6y <- intToUtf8(x, multiple = TRUE)
7names(y) <- sprintf("U+%4x", x)
8## values in the surrogate range: not handled in R < 3.4.3
9sr <- 0xD800:0xDFFF
10stopifnot(is.na(y[sr]))
11stopifnot(!is.na(y[-sr]))
12## too large values: originally handled by UTF-8, but not in RFC3629
13## R >= 3.4.3 conforms to RFC3629
14stopifnot(is.na(intToUtf8(c(0x200000, 0x10FFFF + 1:10))))
15
16## next command is quite slow.
17xx <- sapply(y, function(x) tryCatch(utf8ToInt(x),
18                                     error = function(e) NA_character_))
19invalid <- sr # previously included 0xFFFE and 0xFFFF
20              # but not other 'noncharacters'.
21stopifnot(is.na(xx[invalid]), !is.na(xx[!invalid]))
22stopifnot(xx[!invalid] == x[!invalid])
23
24## The pre-2003 UTF-8 standard converted larger code-points to 4-6 bytes,
25## and was followed by intToUtf8 in earlier versions of R.
26## Earlier conversion of 0x111111, 0x200001, 0x10000001)
27x <- c("\xf4\x91\x84\x91", "\xf8\x80\x80\x80\x81", "\xfc\x90\x80\x80\x80\x81")
28xx <- sapply(x, function(x) tryCatch(utf8ToInt(x),
29                                     error = function(e) NA_character_))
30stopifnot(is.na(xx)) # first was not in R < 3.4.3
31
32### test surrogate pairs
33surrogate_pair <- function(x)
34{
35    if(any(x < 0x10000 | x > 0x10FFFF))
36        stop("Surrogate pairs apply only to supplementary planes")
37    x <- x - 0x10000
38    as.vector(rbind(0xD800 + x %/% 1024, 0xDC00 + x %% 1024))
39}
40## check the example:
41xx <- surrogate_pair(0x10437)
42sprintf("%X", xx)
43stopifnot(xx == c(0xD801, 0xDC37))
44
45## there are 2^20 surrogate pairs, but fast enough to check them all
46x <- 0x10000:0x10FFFF
47x1 <- intToUtf8(x)
48x2 <- utf8ToInt(x1)
49stopifnot(x2 == x)
50
51z <- surrogate_pair(x)
52x1 <- intToUtf8(z, allow_surrogate_pairs = TRUE)
53x2 <- utf8ToInt(x1)
54stopifnot(x2 == x)
55