1#' @useDynLib fastmap, .registration = TRUE
2NULL
3
4
5#' Create a fastmap object
6#'
7#' A fastmap object provides a key-value store where the keys are strings and
8#' the values are any R objects.
9#'
10#' In R, it is common to use environments as key-value stores, but they can leak
11#' memory: every time a new key is used, R registers it in its global symbol
12#' table, which only grows and is never garbage collected. If many different
13#' keys are used, this can cause a non-trivial amount of memory leakage.
14#'
15#' Fastmap objects do not use the symbol table and do not leak memory.
16#'
17#' Unlike with environments, the keys in a fastmap are always encoded as UTF-8,
18#' so if you call \code{$set()} with two different strings that have the same
19#' Unicode values but have different encodings, the second call will overwrite
20#' the first value. If you call \code{$keys()}, it will return UTF-8 encoded
21#' strings, and similarly, \code{$as_list()} will return a list with names that
22#' have UTF-8 encoding.
23#'
24#' Note that if you call \code{$mset()} with a named argument, where the name is
25#' non-ASCII, R will convert the name to the native encoding before fastmap has
26#' the chance to convert them to UTF-8, and the keys may get mangled in the
27#' process. However, if you use \code{$mset(.list = x)}, then R will not convert
28#' the keys to the native encoding, and the keys will be correctly converted to
29#' UTF-8. With \code{$mget()}, the keys will be converted to UTF-8 before they
30#' are fetched.
31#'
32#'
33#' `fastmap` objects have the following methods:
34#'
35#' \describe{
36#'   \item{\code{set(key, value)}}{
37#'     Set a key-value pair. \code{key} must be a string. Returns \code{value}.
38#'   }
39#'   \item{\code{mset(..., .list = NULL)}}{
40#'     Set multiple key-value pairs. The key-value pairs are named arguments,
41#'     and/or a list passed in as \code{.list}. Returns a named list where the
42#'     names are the keys, and the values are the values.
43#'   }
44#'   \item{\code{get(key, missing = missing_default)}}{
45#'     Get a value corresponding to \code{key}. If the key is not in the map,
46#'     return \code{missing}.
47#'   }
48#'   \item{\code{mget(keys, missing = missing_default)}}{
49#'     Get values corresponding to \code{keys}, which is a character vector. The
50#'     values will be returned in a named list where the names are the same as
51#'     the \code{keys} passed in, in the same order. For keys not in the map,
52#'     they will have \code{missing} for their value.
53#'   }
54#'   \item{\code{has(keys)}}{
55#'     Given a vector of keys, returns a logical vector reporting whether each
56#'     key is contained in the map.
57#'   }
58#'   \item{\code{remove(keys)}}{
59#'     Given a vector of keys, remove the key-value pairs from the map. Returns
60#'     a logical vector reporting whether each item existed in (and was removed
61#'     from) the map.
62#'   }
63#'   \item{\code{keys(sort = FALSE)}}{
64#'     Returns a character vector of all the keys. By default, the keys will be
65#'     in arbitrary order. Note that the order can vary across platforms and is
66#'     not guaranteed to be consistent. With \code{sort=TRUE}, the keys will be
67#'     sorted according to their Unicode code point values.
68#'   }
69#'   \item{\code{size()}}{
70#'     Returns the number of items in the map.
71#'   }
72#'   \item{\code{as_list(sort = FALSE)}}{
73#'     Return a named list where the names are the keys from the map, and the
74#'     values are the values. By default, the keys will be in arbitrary order.
75#'     Note that the order can vary across platforms and is not guaranteed to
76#'     be consistent. With \code{sort=TRUE}, the keys will be sorted according
77#'     to their Unicode code point values.
78#'   }
79#'   \item{\code{reset()}}{
80#'     Reset the fastmap object, clearing all items.
81#'   }
82#' }
83#'
84#' @param missing_default The value to return when \code{get()} is called with a
85#'   key that is not in the map. The default is \code{NULL}, but in some cases
86#'   it can be useful to return a sentinel value, such as a
87#'   \code{\link{key_missing}} object.
88#'
89#' @examples
90#' # Create the fastmap object
91#' m <- fastmap()
92#'
93#' # Set some key-value pairs
94#' m$set("x", 100)
95#' m$set("letters", c("a", "b", "c"))
96#' m$mset(numbers = c(10, 20, 30), nothing = NULL)
97#'
98#' # Get values using keys
99#' m$get("x")
100#' m$get("numbers")
101#' m$mget(c("letters", "numbers"))
102#'
103#' # Missing keys return NULL by default, but this can be customized
104#' m$get("xyz")
105#'
106#' # Check for existence of keys
107#' m$has("x")
108#' m$has("nothing")
109#' m$has("xyz")
110#'
111#' # Remove one or more items
112#' m$remove(c("letters", "x"))
113#'
114#' # Return number of items
115#' m$size()
116#'
117#' # Get all keys
118#' m$keys()
119#'
120#' # Return named list that represents all key-value pairs
121#' str(m$as_list())
122#'
123#' # Clear the map
124#' m$reset()
125#'
126#'
127#' # Specify missing value when get() is called
128#' m <- fastmap()
129#' m$get("x", missing = key_missing())
130#' #> <Key Missing>
131#'
132#' # Specify the default missing value
133#' m <- fastmap(missing_default = key_missing())
134#' m$get("x")
135#' #> <Key Missing>
136#'
137#' @export
138fastmap <- function(missing_default = NULL) {
139  force(missing_default)
140
141  # ===================================
142  # Constants
143  # ===================================
144  INITIAL_SIZE <- 32L
145  GROWTH_FACTOR <- 1.2
146  SHRINK_FACTOR <- 2
147
148  # ===================================
149  # Internal state
150  # ===================================
151
152  # Number of items currently stored in the fastmap object.
153  n <- NULL
154  # External pointer to the C++ object that maps from key (a string) to index
155  # into the list that stores the values (which can be any R object).
156  key_idx_map <- NULL
157  # A vector containing keys, where the keys are in the corresponding position
158  # to the values in the values list. This is only used to repopulate the map
159  # after the fastmap has been serialized and deserialized. It contains the same
160  # information as key_idx_map, but, since it is a normal R object, it can be
161  # saved and restored without any extra effort.
162  keys_ <- NULL
163  # Backing store for the R objects.
164  values <- NULL
165  # Indices in the list which are less than n and not currently occupied. These
166  # occur when objects are removed from the map. When a hole is filled, the
167  # entry is replaced with NA, and n_holes is updated to reflect it; this is
168  # instead of simply shrinking the holes vector, because that involves copying
169  # the entire object.
170  holes <- NULL
171  n_holes <- NULL
172
173  # ===================================
174  # Methods
175  # ===================================
176
177  reset <- function() {
178    n           <<- 0L
179    key_idx_map <<- .Call(C_map_create)
180    keys_       <<- rep(NA_character_, INITIAL_SIZE)
181    values      <<- vector(mode = "list", INITIAL_SIZE)
182    holes       <<- seq_len(INITIAL_SIZE)
183    n_holes     <<- INITIAL_SIZE
184    invisible(NULL)
185  }
186  reset()
187
188  set <- function(key, value) {
189    # Force evaluation of value here, so that, if it throws an error, the error
190    # will not happen in the middle of this function and leave things in an
191    # inconsistent state.
192    force(value)
193
194    ensure_restore_map()
195
196    idx <- .Call(C_map_get, key_idx_map, key)
197
198    if (idx == -1L) {
199      # This is a new key.
200      n <<- n + 1L
201
202      # If we have any holes in our values list, store it there. Otherwise
203      # append to the end of the values list.
204      if (n_holes == 0L) {
205        idx <- n
206        # If we got here, we need to grow. This grows values, and holes is
207        # updated to track it.
208        grow()
209      }
210
211      idx <- holes[n_holes]
212      holes[n_holes] <<- NA_integer_   # Mark as NA, for safety
213      n_holes <<- n_holes - 1L
214
215      .Call(C_map_set, key_idx_map, key, idx)
216    }
217
218    if (is.null(value)) {
219      # Need to handle NULLs differently. Wrap them in a list so that this
220      # doesn't result in deletion.
221      values[idx] <<- list(NULL)
222    } else {
223      values[[idx]] <<- value
224    }
225    # Store the key, as UTF-8
226    keys_[idx] <<- .Call(C_char_vec_to_utf8, key)
227
228    invisible(value)
229  }
230
231  mset <- function(..., .list = NULL) {
232    objs <- c(list(...), .list)
233    keys <- names(objs)
234    if (is.null(keys) || any(is.na(keys)) || any(keys == "")) {
235      stop("mset: all values must be named.")
236    }
237    for (i in seq_along(objs)) {
238      set(keys[i], objs[[i]])
239    }
240
241    invisible(objs)
242  }
243
244  get <- function(key, missing = missing_default) {
245    ensure_restore_map()
246    idx <- .Call(C_map_get, key_idx_map, key)
247    if (idx == -1L) {
248      return(missing)
249    }
250
251    values[[idx]]
252  }
253
254  mget <- function(keys, missing = missing_default) {
255    if (is.null(keys)) {
256      return(list(a=1)[0]) # Special case: return empty named list
257    }
258    if (!is.character(keys)) {
259      stop("mget: `keys` must be a character vector or NULL")
260    }
261
262    # Make sure keys are encoded in UTF-8. Need this C function because iconv
263    # doesn't work right for vectors with mixed encodings.
264    keys <- .Call(C_char_vec_to_utf8, keys)
265    res <- lapply(keys, get, missing)
266    names(res) <- keys
267    res
268  }
269
270  # Internal function
271  has_one <- function(key) {
272    ensure_restore_map()
273    idx <- .Call(C_map_get, key_idx_map, key)
274    return(idx != -1L)
275  }
276
277  has <- function(keys) {
278    if (!(is.character(keys) || is.null(keys))) {
279      stop("mget: `keys` must be a character vector or NULL")
280    }
281    if (length(keys) == 1) {
282      # In the common case of only one key, it's faster to avoid vapply.
283      has_one(keys)
284    } else {
285      vapply(keys, has_one, FUN.VALUE = TRUE, USE.NAMES = FALSE)
286    }
287  }
288
289  # Internal function
290  remove_one <- function(key) {
291    ensure_restore_map()
292    idx <- .Call(C_map_remove, key_idx_map, key)
293    if (idx == -1L) {
294      return(FALSE)
295    }
296
297    values[idx] <<- list(NULL)
298    keys_[idx] <<- NA_character_
299    n <<- n - 1L
300
301    n_holes <<- n_holes + 1L
302    holes[n_holes] <<- idx
303
304    # Shrink the values list if its length is larger than 32 and it is half or
305    # more empty.
306    values_length <- length(values)
307    if (values_length > INITIAL_SIZE  &&  values_length >= n * SHRINK_FACTOR) {
308      shrink()
309    }
310
311    TRUE
312  }
313
314  remove <- function(keys) {
315    if (!(is.character(keys) || is.null(keys))) {
316      stop("mget: `keys` must be a character vector or NULL")
317    }
318    if (any(keys == "") || any(is.na(keys))) {
319      stop('mget: `keys` must not be "" or NA')
320    }
321    if (length(keys) == 1) {
322      # In the common case of only one key, it's faster to avoid vapply.
323      invisible(remove_one(keys))
324    } else {
325      invisible(vapply(keys, remove_one, FUN.VALUE = TRUE, USE.NAMES = FALSE))
326    }
327  }
328
329  size <- function() {
330    n
331  }
332
333  keys <- function(sort = FALSE) {
334    ensure_restore_map()
335    .Call(C_map_keys, key_idx_map, sort)
336  }
337
338  as_list <- function(sort = FALSE) {
339    ensure_restore_map()
340    keys_idxs <- .Call(C_map_keys_idxs, key_idx_map, sort)
341    result <- values[keys_idxs]
342    names(result) <- names(keys_idxs)
343    result
344  }
345
346
347  # Internal function
348  grow <- function() {
349    old_values_length <- length(values)
350    new_values_length <- as.integer(ceiling(old_values_length * GROWTH_FACTOR))
351
352    # Increase size of values list by assigning NULL past the end. On R 3.4 and
353    # up, this will grow it in place.
354    values[new_values_length] <<- list(NULL)
355    keys_[new_values_length] <<- NA_character_
356
357    # When grow() is called, `holes` is all NAs, but it's not as long as values.
358    # Grow it (possibly in place, depending on R version) to new_values_length,
359    # and have it point to all the new empty spaces in `values`. Strictly
360    # speaking, it doesn't have to be as large as new_values_length -- it only
361    # needs to be of size (new_values_length - new_values_length /
362    # SHRINK_FACTOR), but it's possible that there will be a rounding error and
363    # I'm playing it safe here.
364    holes[new_values_length] <<- NA_integer_
365    n_holes <<- new_values_length - old_values_length
366    holes[seq_len(n_holes)] <<- seq.int(from = old_values_length + 1, to = new_values_length)
367  }
368
369  # Internal function
370  shrink <- function() {
371    if (n_holes == 0L)
372      return(invisible())
373
374    keys_idxs <- .Call(C_map_keys_idxs, key_idx_map, FALSE)
375
376    # Suppose values is a length-7 list, n==3, holes==c(4,1,3,NA), n_holes==3
377    # Drop any extra values stored in the holes vector.
378    holes <<- holes[seq_len(n_holes)]
379
380    remap_inv <- seq_len(length(values))
381    remap_inv <- remap_inv[-holes]
382    # remap_inv now is c(2, 5, 6, 7). It will be sorted.
383
384    remap <- integer(length(values))
385    remap[remap_inv] <- seq_along(remap_inv)
386    # remap is now c(0,1,0,0,2,3,4). The non-zero values will be sorted.
387
388    if (length(keys_idxs) != length(remap_inv)) {
389      stop("length mismatch of keys_idxs and remap_inv")
390    }
391    keys <- names(keys_idxs)
392    for (i in seq_along(keys)) {
393      idx <- keys_idxs[[i]]
394      .Call(C_map_set, key_idx_map, keys[i], remap[idx])
395    }
396
397    values <<- values[-holes]
398    keys_ <<- keys_[-holes]
399    holes <<- integer()
400    n_holes <<- 0L
401    n <<- length(values)
402  }
403
404  # Internal function. This is useful after a fastmap is deserialized. When that
405  # happens, the key_idx_map xptr will be NULL, and it needs to be repopulated.
406  # Every external-facing method that makes use of key_idx_map should call this
407  # before doing any operations on it.
408  ensure_restore_map <- function() {
409    # If the key_idx_map pointer is not NULL, just return.
410    if (!.Call(C_xptr_is_null, key_idx_map)) {
411      return(invisible())
412    }
413
414    # Repopulate key_idx_map.
415    key_idx_map <<- .Call(C_map_create)
416    holes <- holes[seq_len(n_holes)]
417    idxs <- seq_along(keys_)[-holes]
418    for (idx in idxs) {
419      .Call(C_map_set, key_idx_map, keys_[idx], idx)
420    }
421  }
422
423  list(
424    reset = reset,
425    set = set,
426    mset = mset,
427    get = get,
428    mget = mget,
429    has = has,
430    remove = remove,
431    keys = keys,
432    size = size,
433    as_list = as_list
434  )
435}
436