1#' @useDynLib fastmap, .registration = TRUE 2NULL 3 4 5#' Create a fastmap object 6#' 7#' A fastmap object provides a key-value store where the keys are strings and 8#' the values are any R objects. 9#' 10#' In R, it is common to use environments as key-value stores, but they can leak 11#' memory: every time a new key is used, R registers it in its global symbol 12#' table, which only grows and is never garbage collected. If many different 13#' keys are used, this can cause a non-trivial amount of memory leakage. 14#' 15#' Fastmap objects do not use the symbol table and do not leak memory. 16#' 17#' Unlike with environments, the keys in a fastmap are always encoded as UTF-8, 18#' so if you call \code{$set()} with two different strings that have the same 19#' Unicode values but have different encodings, the second call will overwrite 20#' the first value. If you call \code{$keys()}, it will return UTF-8 encoded 21#' strings, and similarly, \code{$as_list()} will return a list with names that 22#' have UTF-8 encoding. 23#' 24#' Note that if you call \code{$mset()} with a named argument, where the name is 25#' non-ASCII, R will convert the name to the native encoding before fastmap has 26#' the chance to convert them to UTF-8, and the keys may get mangled in the 27#' process. However, if you use \code{$mset(.list = x)}, then R will not convert 28#' the keys to the native encoding, and the keys will be correctly converted to 29#' UTF-8. With \code{$mget()}, the keys will be converted to UTF-8 before they 30#' are fetched. 31#' 32#' 33#' `fastmap` objects have the following methods: 34#' 35#' \describe{ 36#' \item{\code{set(key, value)}}{ 37#' Set a key-value pair. \code{key} must be a string. Returns \code{value}. 38#' } 39#' \item{\code{mset(..., .list = NULL)}}{ 40#' Set multiple key-value pairs. The key-value pairs are named arguments, 41#' and/or a list passed in as \code{.list}. Returns a named list where the 42#' names are the keys, and the values are the values. 43#' } 44#' \item{\code{get(key, missing = missing_default)}}{ 45#' Get a value corresponding to \code{key}. If the key is not in the map, 46#' return \code{missing}. 47#' } 48#' \item{\code{mget(keys, missing = missing_default)}}{ 49#' Get values corresponding to \code{keys}, which is a character vector. The 50#' values will be returned in a named list where the names are the same as 51#' the \code{keys} passed in, in the same order. For keys not in the map, 52#' they will have \code{missing} for their value. 53#' } 54#' \item{\code{has(keys)}}{ 55#' Given a vector of keys, returns a logical vector reporting whether each 56#' key is contained in the map. 57#' } 58#' \item{\code{remove(keys)}}{ 59#' Given a vector of keys, remove the key-value pairs from the map. Returns 60#' a logical vector reporting whether each item existed in (and was removed 61#' from) the map. 62#' } 63#' \item{\code{keys(sort = FALSE)}}{ 64#' Returns a character vector of all the keys. By default, the keys will be 65#' in arbitrary order. Note that the order can vary across platforms and is 66#' not guaranteed to be consistent. With \code{sort=TRUE}, the keys will be 67#' sorted according to their Unicode code point values. 68#' } 69#' \item{\code{size()}}{ 70#' Returns the number of items in the map. 71#' } 72#' \item{\code{as_list(sort = FALSE)}}{ 73#' Return a named list where the names are the keys from the map, and the 74#' values are the values. By default, the keys will be in arbitrary order. 75#' Note that the order can vary across platforms and is not guaranteed to 76#' be consistent. With \code{sort=TRUE}, the keys will be sorted according 77#' to their Unicode code point values. 78#' } 79#' \item{\code{reset()}}{ 80#' Reset the fastmap object, clearing all items. 81#' } 82#' } 83#' 84#' @param missing_default The value to return when \code{get()} is called with a 85#' key that is not in the map. The default is \code{NULL}, but in some cases 86#' it can be useful to return a sentinel value, such as a 87#' \code{\link{key_missing}} object. 88#' 89#' @examples 90#' # Create the fastmap object 91#' m <- fastmap() 92#' 93#' # Set some key-value pairs 94#' m$set("x", 100) 95#' m$set("letters", c("a", "b", "c")) 96#' m$mset(numbers = c(10, 20, 30), nothing = NULL) 97#' 98#' # Get values using keys 99#' m$get("x") 100#' m$get("numbers") 101#' m$mget(c("letters", "numbers")) 102#' 103#' # Missing keys return NULL by default, but this can be customized 104#' m$get("xyz") 105#' 106#' # Check for existence of keys 107#' m$has("x") 108#' m$has("nothing") 109#' m$has("xyz") 110#' 111#' # Remove one or more items 112#' m$remove(c("letters", "x")) 113#' 114#' # Return number of items 115#' m$size() 116#' 117#' # Get all keys 118#' m$keys() 119#' 120#' # Return named list that represents all key-value pairs 121#' str(m$as_list()) 122#' 123#' # Clear the map 124#' m$reset() 125#' 126#' 127#' # Specify missing value when get() is called 128#' m <- fastmap() 129#' m$get("x", missing = key_missing()) 130#' #> <Key Missing> 131#' 132#' # Specify the default missing value 133#' m <- fastmap(missing_default = key_missing()) 134#' m$get("x") 135#' #> <Key Missing> 136#' 137#' @export 138fastmap <- function(missing_default = NULL) { 139 force(missing_default) 140 141 # =================================== 142 # Constants 143 # =================================== 144 INITIAL_SIZE <- 32L 145 GROWTH_FACTOR <- 1.2 146 SHRINK_FACTOR <- 2 147 148 # =================================== 149 # Internal state 150 # =================================== 151 152 # Number of items currently stored in the fastmap object. 153 n <- NULL 154 # External pointer to the C++ object that maps from key (a string) to index 155 # into the list that stores the values (which can be any R object). 156 key_idx_map <- NULL 157 # A vector containing keys, where the keys are in the corresponding position 158 # to the values in the values list. This is only used to repopulate the map 159 # after the fastmap has been serialized and deserialized. It contains the same 160 # information as key_idx_map, but, since it is a normal R object, it can be 161 # saved and restored without any extra effort. 162 keys_ <- NULL 163 # Backing store for the R objects. 164 values <- NULL 165 # Indices in the list which are less than n and not currently occupied. These 166 # occur when objects are removed from the map. When a hole is filled, the 167 # entry is replaced with NA, and n_holes is updated to reflect it; this is 168 # instead of simply shrinking the holes vector, because that involves copying 169 # the entire object. 170 holes <- NULL 171 n_holes <- NULL 172 173 # =================================== 174 # Methods 175 # =================================== 176 177 reset <- function() { 178 n <<- 0L 179 key_idx_map <<- .Call(C_map_create) 180 keys_ <<- rep(NA_character_, INITIAL_SIZE) 181 values <<- vector(mode = "list", INITIAL_SIZE) 182 holes <<- seq_len(INITIAL_SIZE) 183 n_holes <<- INITIAL_SIZE 184 invisible(NULL) 185 } 186 reset() 187 188 set <- function(key, value) { 189 # Force evaluation of value here, so that, if it throws an error, the error 190 # will not happen in the middle of this function and leave things in an 191 # inconsistent state. 192 force(value) 193 194 ensure_restore_map() 195 196 idx <- .Call(C_map_get, key_idx_map, key) 197 198 if (idx == -1L) { 199 # This is a new key. 200 n <<- n + 1L 201 202 # If we have any holes in our values list, store it there. Otherwise 203 # append to the end of the values list. 204 if (n_holes == 0L) { 205 idx <- n 206 # If we got here, we need to grow. This grows values, and holes is 207 # updated to track it. 208 grow() 209 } 210 211 idx <- holes[n_holes] 212 holes[n_holes] <<- NA_integer_ # Mark as NA, for safety 213 n_holes <<- n_holes - 1L 214 215 .Call(C_map_set, key_idx_map, key, idx) 216 } 217 218 if (is.null(value)) { 219 # Need to handle NULLs differently. Wrap them in a list so that this 220 # doesn't result in deletion. 221 values[idx] <<- list(NULL) 222 } else { 223 values[[idx]] <<- value 224 } 225 # Store the key, as UTF-8 226 keys_[idx] <<- .Call(C_char_vec_to_utf8, key) 227 228 invisible(value) 229 } 230 231 mset <- function(..., .list = NULL) { 232 objs <- c(list(...), .list) 233 keys <- names(objs) 234 if (is.null(keys) || any(is.na(keys)) || any(keys == "")) { 235 stop("mset: all values must be named.") 236 } 237 for (i in seq_along(objs)) { 238 set(keys[i], objs[[i]]) 239 } 240 241 invisible(objs) 242 } 243 244 get <- function(key, missing = missing_default) { 245 ensure_restore_map() 246 idx <- .Call(C_map_get, key_idx_map, key) 247 if (idx == -1L) { 248 return(missing) 249 } 250 251 values[[idx]] 252 } 253 254 mget <- function(keys, missing = missing_default) { 255 if (is.null(keys)) { 256 return(list(a=1)[0]) # Special case: return empty named list 257 } 258 if (!is.character(keys)) { 259 stop("mget: `keys` must be a character vector or NULL") 260 } 261 262 # Make sure keys are encoded in UTF-8. Need this C function because iconv 263 # doesn't work right for vectors with mixed encodings. 264 keys <- .Call(C_char_vec_to_utf8, keys) 265 res <- lapply(keys, get, missing) 266 names(res) <- keys 267 res 268 } 269 270 # Internal function 271 has_one <- function(key) { 272 ensure_restore_map() 273 idx <- .Call(C_map_get, key_idx_map, key) 274 return(idx != -1L) 275 } 276 277 has <- function(keys) { 278 if (!(is.character(keys) || is.null(keys))) { 279 stop("mget: `keys` must be a character vector or NULL") 280 } 281 if (length(keys) == 1) { 282 # In the common case of only one key, it's faster to avoid vapply. 283 has_one(keys) 284 } else { 285 vapply(keys, has_one, FUN.VALUE = TRUE, USE.NAMES = FALSE) 286 } 287 } 288 289 # Internal function 290 remove_one <- function(key) { 291 ensure_restore_map() 292 idx <- .Call(C_map_remove, key_idx_map, key) 293 if (idx == -1L) { 294 return(FALSE) 295 } 296 297 values[idx] <<- list(NULL) 298 keys_[idx] <<- NA_character_ 299 n <<- n - 1L 300 301 n_holes <<- n_holes + 1L 302 holes[n_holes] <<- idx 303 304 # Shrink the values list if its length is larger than 32 and it is half or 305 # more empty. 306 values_length <- length(values) 307 if (values_length > INITIAL_SIZE && values_length >= n * SHRINK_FACTOR) { 308 shrink() 309 } 310 311 TRUE 312 } 313 314 remove <- function(keys) { 315 if (!(is.character(keys) || is.null(keys))) { 316 stop("mget: `keys` must be a character vector or NULL") 317 } 318 if (any(keys == "") || any(is.na(keys))) { 319 stop('mget: `keys` must not be "" or NA') 320 } 321 if (length(keys) == 1) { 322 # In the common case of only one key, it's faster to avoid vapply. 323 invisible(remove_one(keys)) 324 } else { 325 invisible(vapply(keys, remove_one, FUN.VALUE = TRUE, USE.NAMES = FALSE)) 326 } 327 } 328 329 size <- function() { 330 n 331 } 332 333 keys <- function(sort = FALSE) { 334 ensure_restore_map() 335 .Call(C_map_keys, key_idx_map, sort) 336 } 337 338 as_list <- function(sort = FALSE) { 339 ensure_restore_map() 340 keys_idxs <- .Call(C_map_keys_idxs, key_idx_map, sort) 341 result <- values[keys_idxs] 342 names(result) <- names(keys_idxs) 343 result 344 } 345 346 347 # Internal function 348 grow <- function() { 349 old_values_length <- length(values) 350 new_values_length <- as.integer(ceiling(old_values_length * GROWTH_FACTOR)) 351 352 # Increase size of values list by assigning NULL past the end. On R 3.4 and 353 # up, this will grow it in place. 354 values[new_values_length] <<- list(NULL) 355 keys_[new_values_length] <<- NA_character_ 356 357 # When grow() is called, `holes` is all NAs, but it's not as long as values. 358 # Grow it (possibly in place, depending on R version) to new_values_length, 359 # and have it point to all the new empty spaces in `values`. Strictly 360 # speaking, it doesn't have to be as large as new_values_length -- it only 361 # needs to be of size (new_values_length - new_values_length / 362 # SHRINK_FACTOR), but it's possible that there will be a rounding error and 363 # I'm playing it safe here. 364 holes[new_values_length] <<- NA_integer_ 365 n_holes <<- new_values_length - old_values_length 366 holes[seq_len(n_holes)] <<- seq.int(from = old_values_length + 1, to = new_values_length) 367 } 368 369 # Internal function 370 shrink <- function() { 371 if (n_holes == 0L) 372 return(invisible()) 373 374 keys_idxs <- .Call(C_map_keys_idxs, key_idx_map, FALSE) 375 376 # Suppose values is a length-7 list, n==3, holes==c(4,1,3,NA), n_holes==3 377 # Drop any extra values stored in the holes vector. 378 holes <<- holes[seq_len(n_holes)] 379 380 remap_inv <- seq_len(length(values)) 381 remap_inv <- remap_inv[-holes] 382 # remap_inv now is c(2, 5, 6, 7). It will be sorted. 383 384 remap <- integer(length(values)) 385 remap[remap_inv] <- seq_along(remap_inv) 386 # remap is now c(0,1,0,0,2,3,4). The non-zero values will be sorted. 387 388 if (length(keys_idxs) != length(remap_inv)) { 389 stop("length mismatch of keys_idxs and remap_inv") 390 } 391 keys <- names(keys_idxs) 392 for (i in seq_along(keys)) { 393 idx <- keys_idxs[[i]] 394 .Call(C_map_set, key_idx_map, keys[i], remap[idx]) 395 } 396 397 values <<- values[-holes] 398 keys_ <<- keys_[-holes] 399 holes <<- integer() 400 n_holes <<- 0L 401 n <<- length(values) 402 } 403 404 # Internal function. This is useful after a fastmap is deserialized. When that 405 # happens, the key_idx_map xptr will be NULL, and it needs to be repopulated. 406 # Every external-facing method that makes use of key_idx_map should call this 407 # before doing any operations on it. 408 ensure_restore_map <- function() { 409 # If the key_idx_map pointer is not NULL, just return. 410 if (!.Call(C_xptr_is_null, key_idx_map)) { 411 return(invisible()) 412 } 413 414 # Repopulate key_idx_map. 415 key_idx_map <<- .Call(C_map_create) 416 holes <- holes[seq_len(n_holes)] 417 idxs <- seq_along(keys_)[-holes] 418 for (idx in idxs) { 419 .Call(C_map_set, key_idx_map, keys_[idx], idx) 420 } 421 } 422 423 list( 424 reset = reset, 425 set = set, 426 mset = mset, 427 get = get, 428 mget = mget, 429 has = has, 430 remove = remove, 431 keys = keys, 432 size = size, 433 as_list = as_list 434 ) 435} 436