1
2#' Whether cli is emitting UTF-8 characters
3#'
4#' UTF-8 cli characters can be turned on by setting the `cli.unicode`
#' option to `TRUE`. They can be turned off by setting it to `FALSE`.
6#' If this option is not set, then [base::l10n_info()] is used to detect
7#' UTF-8 support.
8#'
9#' @return Flag, whether cli uses UTF-8 characters.
10#'
11#' @export
12
is_utf8_output <- function() {
  unicode_opt <- getOption("cli.unicode", NULL)

  # Option not set: fall back to locale detection, but never report UTF-8
  # when is_latex_output() says we are producing LaTeX.
  if (is.null(unicode_opt)) {
    return(l10n_info()$`UTF-8` && !is_latex_output())
  }

  # An explicitly set option always wins; anything other than a single
  # `TRUE` counts as "off".
  isTRUE(unicode_opt)
}
21
22#' Count the number of characters in a character vector
23#'
24#' By default it counts Unicode grapheme clusters, instead of code points.
25#'
26#' @param x Character vector, it is converted to UTF-8.
27#' @param type Whether to count graphemes (characters), code points,
28#'   bytes, or calculate the display width of the string.
29#' @return Numeric vector, the length of the strings in the character
30#'   vector.
31#'
32#' @family UTF-8 string manipulation
33#' @export
34#' @examples
35#' # Grapheme example, emoji with combining characters. This is a single
36#' # grapheme, consisting of five Unicode code points:
37#' # * `\U0001f477` is the construction worker emoji
38#' # * `\U0001f3fb` is emoji modifier that changes the skin color
39#' # * `\u200d` is the zero width joiner
40#' # * `\u2640` is the female sign
41#' # * `\ufe0f` is variation selector 16, requesting an emoji style glyph
42#' emo <- "\U0001f477\U0001f3fb\u200d\u2640\ufe0f"
43#' cat(emo)
44#'
45#' utf8_nchar(emo, "chars") # = graphemes
46#' utf8_nchar(emo, "bytes")
47#' utf8_nchar(emo, "width")
48#' utf8_nchar(emo, "codepoints")
49#'
#' # For comparison, the output for width depends on the R version used:
51#' nchar(emo, "chars")
52#' nchar(emo, "bytes")
53#' nchar(emo, "width")
54
utf8_nchar <- function(x, type = c("chars", "bytes", "width", "graphemes",
                                   "codepoints")) {

  type <- match.arg(type)
  # "chars" is an alias: characters are counted as grapheme clusters.
  if (type == "chars") type <- "graphemes"

  # enc2utf8() only accepts character vectors, so coerce other inputs
  # (numbers, factors, ...) first, like utf8_substr() and utf8_graphemes().
  if (!is.character(x)) x <- as.character(x)
  x <- enc2utf8(x)

  switch(
    type,
    # Display width and grapheme counting are done in compiled code.
    width = .Call(clic_utf8_display_width, x),
    graphemes = .Call(clic_utf8_nchar_graphemes, x),
    # After the UTF-8 conversion, base R's "chars" count is the number of
    # Unicode code points. `keepNA = TRUE` returns NA for NA input.
    codepoints = base::nchar(
      x, allowNA = FALSE, keepNA = TRUE, type = "chars"
    ),
    bytes = base::nchar(
      x, allowNA = FALSE, keepNA = TRUE, type = "bytes"
    )
  )
}
76
#' Substring of a UTF-8 string
78#'
79#' This function uses grapheme clusters instead of Unicode code points in
80#' UTF-8 strings.
81#'
82#' @param x Character vector.
83#' @param start Starting index or indices, recycled to match the length
84#'   of `x`.
85#' @param stop Ending index or indices, recycled to match the length of
86#'   `x`.
87#' @return Character vector of the same length as `x`, containing
88#'   the requested substrings.
89#'
90#' @family UTF-8 string manipulation
91#' @export
92#' @examples
93#' # Five grapheme clusters, select the middle three
94#' str <- paste0(
95#'   "\U0001f477\U0001f3ff\u200d\u2640\ufe0f",
96#'   "\U0001f477\U0001f3ff",
97#'   "\U0001f477\u200d\u2640\ufe0f",
98#'   "\U0001f477\U0001f3fb",
99#'   "\U0001f477\U0001f3ff")
100#' cat(str)
101#' str24 <- utf8_substr(str, 2, 4)
102#' cat(str24)
103
utf8_substr <- function(x, start, stop) {
  # Normalize the argument types up front.
  if (!is.character(x)) {
    x <- as.character(x)
  }
  start <- as.integer(start)
  stop <- as.integer(stop)

  # Zero-length indices cannot be recycled along `x`.
  if (length(start) == 0L || length(stop) == 0L) {
    stop("invalid substring arguments")
  }
  # NA indices, including those produced by the integer coercion above,
  # are rejected.
  if (any(is.na(start)) || any(is.na(stop))) {
    stop("non-numeric substring arguments not supported")
  }

  x <- enc2utf8(x)

  # Recycle both index vectors to the length of `x` before calling into
  # the compiled routine.
  n <- length(x)
  .Call(clic_utf8_substr, x, rep_len(start, n), rep_len(stop, n))
}
119
#' Break a UTF-8 character vector into grapheme clusters
121#'
122#' @param x Character vector.
#' @return List of character vectors, the grapheme clusters of each
#'   input string.
125#'
126#' @family UTF-8 string manipulation
127#' @export
128#' @examples
129#' # Five grapheme clusters
130#' str <- paste0(
131#'   "\U0001f477\U0001f3ff\u200d\u2640\ufe0f",
132#'   "\U0001f477\U0001f3ff",
133#'   "\U0001f477\u200d\u2640\ufe0f",
134#'   "\U0001f477\U0001f3fb",
135#'   "\U0001f477\U0001f3ff")
136#' cat(str, "\n")
137#' chrs <- utf8_graphemes(str)
138
utf8_graphemes <- function(x) {
  # Non-character input is coerced first, because enc2utf8() only accepts
  # character vectors.
  chr <- if (is.character(x)) x else as.character(x)

  # The splitting itself is done by the compiled routine, on UTF-8 input.
  .Call(clic_utf8_graphemes, enc2utf8(chr))
}
144