
#' Whether cli is emitting UTF-8 characters
#'
#' UTF-8 cli characters can be turned on by setting the `cli.unicode`
#' option to `TRUE`. They can be turned off by setting it to `FALSE`.
#' If this option is not set, then [base::l10n_info()] is used to detect
#' UTF-8 support.
#'
#' @return Flag, whether cli uses UTF-8 characters.
#'
#' @export

is_utf8_output <- function() {
  opt <- getOption("cli.unicode", NULL)
  if (!is.null(opt)) {
    # An explicitly set option always wins; anything but `TRUE` means "off".
    isTRUE(opt)
  } else {
    # No explicit setting: use the locale information, but never emit
    # UTF-8 when `is_latex_output()` reports LaTeX output.
    l10n_info()$`UTF-8` && !is_latex_output()
  }
}

#' Count the number of characters in a character vector
#'
#' By default it counts Unicode grapheme clusters, instead of code points.
#'
#' @param x Character vector, it is converted to UTF-8. Other types are
#'   coerced with [base::as.character()] first.
#' @param type Whether to count graphemes (characters), code points,
#'   bytes, or calculate the display width of the string. `"chars"` is
#'   an alias of `"graphemes"`.
#' @return Numeric vector, the length of the strings in the character
#'   vector.
#'
#' @family UTF-8 string manipulation
#' @export
#' @examples
#' # Grapheme example, emoji with combining characters. This is a single
#' # grapheme, consisting of five Unicode code points:
#' # * `\U0001f477` is the construction worker emoji
#' # * `\U0001f3fb` is emoji modifier that changes the skin color
#' # * `\u200d` is the zero width joiner
#' # * `\u2640` is the female sign
#' # * `\ufe0f` is variation selector 16, requesting an emoji style glyph
#' emo <- "\U0001f477\U0001f3fb\u200d\u2640\ufe0f"
#' cat(emo)
#'
#' utf8_nchar(emo, "chars") # = graphemes
#' utf8_nchar(emo, "bytes")
#' utf8_nchar(emo, "width")
#' utf8_nchar(emo, "codepoints")
#'
#' # For comparison, the output for width depends on the R version used:
#' nchar(emo, "chars")
#' nchar(emo, "bytes")
#' nchar(emo, "width")

utf8_nchar <- function(x, type = c("chars", "bytes", "width", "graphemes",
                                   "codepoints")) {

  type <- match.arg(type)
  if (type == "chars") type <- "graphemes"

  # enc2utf8() rejects non-character input, so coerce first, the same way
  # utf8_substr() and utf8_graphemes() do.
  if (!is.character(x)) x <- as.character(x)
  x <- enc2utf8(x)

  if (type == "width") {
    .Call(clic_utf8_display_width, x)

  } else if (type == "graphemes") {
    .Call(clic_utf8_nchar_graphemes, x)

  } else if (type == "codepoints") {
    # On a UTF-8 string base R's "chars" are Unicode code points.
    base::nchar(x, allowNA = FALSE, keepNA = TRUE, type = "chars")

  } else { # bytes
    base::nchar(x, allowNA = FALSE, keepNA = TRUE, type = "bytes")
  }
}

#' Substring of a UTF-8 string
#'
#' This function uses grapheme clusters instead of Unicode code points in
#' UTF-8 strings.
#'
#' An error is thrown if `start` or `stop` has length zero, or if either
#' of them contains `NA` after conversion to integer.
#'
#' @param x Character vector.
#' @param start Starting index or indices, recycled to match the length
#'   of `x`.
#' @param stop Ending index or indices, recycled to match the length of
#'   `x`.
#' @return Character vector of the same length as `x`, containing
#'   the requested substrings.
#'
#' @family UTF-8 string manipulation
#' @export
#' @examples
#' # Five grapheme clusters, select the middle three
#' str <- paste0(
#'   "\U0001f477\U0001f3ff\u200d\u2640\ufe0f",
#'   "\U0001f477\U0001f3ff",
#'   "\U0001f477\u200d\u2640\ufe0f",
#'   "\U0001f477\U0001f3fb",
#'   "\U0001f477\U0001f3ff")
#' cat(str)
#' str24 <- utf8_substr(str, 2, 4)
#' cat(str24)

utf8_substr <- function(x, start, stop) {
  if (!is.character(x)) x <- as.character(x)
  start <- as.integer(start)
  stop <- as.integer(stop)
  # Note: the `stop` argument does not mask `stop()`, R skips non-function
  # objects when looking up the function of a call.
  if (!length(start) || !length(stop)) {
    stop("invalid substring arguments")
  }
  if (anyNA(start) || anyNA(stop)) {
    stop("non-numeric substring arguments not supported")
  }
  x <- enc2utf8(x)
  # Recycle the indices to one (start, stop) pair per input string.
  start <- rep_len(start, length(x))
  stop <- rep_len(stop, length(x))
  .Call(clic_utf8_substr, x, start, stop)
}

#' Break a UTF-8 character vector into grapheme clusters
#'
#' @param x Character vector.
#' @return List of character vectors, the grapheme clusters of the input
#'   string.
#'
#' @family UTF-8 string manipulation
#' @export
#' @examples
#' # Five grapheme clusters
#' str <- paste0(
#'   "\U0001f477\U0001f3ff\u200d\u2640\ufe0f",
#'   "\U0001f477\U0001f3ff",
#'   "\U0001f477\u200d\u2640\ufe0f",
#'   "\U0001f477\U0001f3fb",
#'   "\U0001f477\U0001f3ff")
#' cat(str, "\n")
#' chrs <- utf8_graphemes(str)

utf8_graphemes <- function(x) {
  if (!is.character(x)) x <- as.character(x)
  x <- enc2utf8(x)
  .Call(clic_utf8_graphemes, x)
}