1setkey = function(x, ..., verbose=getOption("datatable.verbose"), physical=TRUE)
2{
3  if (is.character(x)) stop("x may no longer be the character name of the data.table. The possibility was undocumented and has been removed.")
4  cols = as.character(substitute(list(...))[-1L])
5  if (!length(cols)) { cols=colnames(x) }
6  else if (identical(cols,"NULL")) cols=NULL
7  setkeyv(x, cols, verbose=verbose, physical=physical)
8}
9
10# FR #1442
11setindex = function(...) setkey(..., physical=FALSE)
12setindexv = function(x, cols, verbose=getOption("datatable.verbose")) {
13  if (is.list(cols)) {
14    sapply(cols, setkeyv, x=x, verbose=verbose, physical=FALSE)
15    return(invisible(x))
16  } else {
17    setkeyv(x, cols, verbose=verbose, physical=FALSE)
18  }
19}
20
21# upgrade to error after Mar 2020. Has already been warning since 2012, and stronger warning in Mar 2019 (note in news for 1.12.2); #3399
22"key<-" = function(x,value) {
23  warning("key(x)<-value is deprecated and not supported. Please change to use setkey() with perhaps copy(). Has been warning since 2012 and will be an error in future.")
24  setkeyv(x,value)
25  # The returned value here from key<- is then copied by R before assigning to x, it seems. That's
26  # why we can't do anything about it without a change in R itself. If we return NULL (or invisible()) from this key<-
27  # method, the table gets set to NULL. So, although we call setkeyv(x,cols) here, and that doesn't copy, the
28  # returned value (x) then gets copied by R.
29  # So, solution is that caller has to call setkey or setkeyv directly themselves, to avoid <- dispatch and its copy.
30}
31
32setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRUE)
33{
34  if (is.null(cols)) {   # this is done on a data.frame when !cedta at top of [.data.table
35    if (physical) setattr(x,"sorted",NULL)
36    setattr(x,"index",NULL)  # setkey(DT,NULL) also clears secondary keys. setindex(DT,NULL) just clears secondary keys.
37    return(invisible(x))
38  }
39  if (!missing(verbose)) {
40    stopifnot(isTRUEorFALSE(verbose))
41    # set the global verbose option because that is fetched from C code without having to pass it through
42    oldverbose = options(datatable.verbose=verbose)
43    on.exit(options(oldverbose))
44  }
45  if (!is.data.table(x)) stop("x is not a data.table")
46  if (!is.character(cols)) stop("cols is not a character vector. Please see further information in ?setkey.")
47  if (physical && .Call(C_islocked, x)) stop("Setting a physical key on .SD is reserved for possible future use; to modify the original data's order by group. Try setindex() instead. Or, set*(copy(.SD)) as a (slow) last resort.")
48  if (!length(cols)) {
49    warning("cols is a character vector of zero length. Removed the key, but use NULL instead, or wrap with suppressWarnings() to avoid this warning.")
50    setattr(x,"sorted",NULL)
51    return(invisible(x))
52  }
53  if (identical(cols,"")) stop("cols is the empty string. Use NULL to remove the key.")
54  if (!all(nzchar(cols))) stop("cols contains some blanks.")
55  cols = gsub("`", "", cols, fixed = TRUE)
56  miss = !(cols %chin% colnames(x))
57  if (any(miss)) stop("some columns are not in the data.table: ", paste(cols[miss], collapse=","))
58
59  ## determine, whether key is already present:
60  if (identical(key(x),cols)) {
61    if (!physical) {
62      ## create index as integer() because already sorted by those columns
63      if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer())
64      setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), integer())
65    }
66    return(invisible(x))
67  } else if(identical(head(key(x), length(cols)), cols)){
68    if (!physical) {
69      ## create index as integer() because already sorted by those columns
70      if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer())
71      setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), integer())
72    } else {
73      ## key is present but x has a longer key. No sorting needed, only attribute is changed to shorter key.
74      setattr(x,"sorted",cols)
75    }
76    return(invisible(x))
77  }
78
79  if (".xi" %chin% names(x)) stop("x contains a column called '.xi'. Conflicts with internal use by data.table.")
80  for (i in cols) {
81    .xi = x[[i]]  # [[ is copy on write, otherwise checking type would be copying each column
82    if (!typeof(.xi) %chin% ORDERING_TYPES) stop("Column '",i,"' is type '",typeof(.xi),"' which is not supported as a key column type, currently.")
83  }
84  if (!is.character(cols) || length(cols)<1L) stop("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov
85
86  newkey = paste0(cols, collapse="__")
87  if (!any(indices(x) == newkey)) {
88    if (verbose) {
89      tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=FALSE)))  # system.time does a gc, so we don't want this always on, until refcnt is on by default in R
90      # suppress needed for tests 644 and 645 in verbose mode
91      cat("forder took", tt["user.self"]+tt["sys.self"], "sec\n")
92    } else {
93      o = forderv(x, cols, sort=TRUE, retGrp=FALSE)
94    }
95  } else {
96    if (verbose) cat("setkey on columns ", brackify(cols), " using existing index '", newkey, "'\n", sep="")
97    o = getindex(x, newkey)
98  }
99  if (!physical) {
100    if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer())
101    setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), o)
102    return(invisible(x))
103  }
104  setattr(x,"index",NULL)   # TO DO: reorder existing indexes likely faster than rebuilding again. Allow optionally. Simpler for now to clear.
105  if (length(o)) {
106    if (verbose) { last.started.at = proc.time() }
107    .Call(Creorder,x,o)
108    if (verbose) { cat("reorder took", timetaken(last.started.at), "\n"); flush.console() }
109  } else {
110    if (verbose) cat("x is already ordered by these columns, no need to call reorder\n")
111  } # else empty integer() from forderv means x is already ordered by those cols, nothing to do.
112  setattr(x,"sorted",cols)
113  invisible(x)
114}
115
116key = function(x) attr(x, "sorted", exact=TRUE)
117
118indices = function(x, vectors = FALSE) {
119  ans = names(attributes(attr(x, "index", exact=TRUE)))
120  if (is.null(ans)) return(ans) # otherwise character() gets returned by next line
121  ans = gsub("^__","",ans)     # the leading __ is internal only, so remove that in result
122  if (isTRUE(vectors))
123    ans = strsplit(ans, "__", fixed = TRUE)
124  ans
125}
126
127getindex = function(x, name) {
128  # name can be "col", or "col1__col2", or c("col1","col2")
129  ans = attr(attr(x, 'index', exact=TRUE), paste0("__",name,collapse=""), exact=TRUE)
130  if (!is.null(ans) && (!is.integer(ans) || (length(ans)!=nrow(x) && length(ans)!=0L))) {
131    stop("Internal error: index '",name,"' exists but is invalid")   # nocov
132  }
133  ans
134}
135
136haskey = function(x) !is.null(key(x))
137
138# reorder a vector based on 'order' (integer)
139# to be used in fastorder instead of x[o], but in general, it's better to replace vector subsetting with this..?
140# Basic checks that all items of order are in range 1:n with no NAs are now made inside Creorder.
141# FOR INTERNAL USE ONLY
142setreordervec = function(x, order) .Call(Creorder, x, order)
143
144# sort = sort.int = sort.list = order = is.unsorted = function(...)
145#    stop("Should never be called by data.table internals. Use is.sorted() on vectors, or forder() for lists and vectors.")
146# Nice idea, but users might use these in i or j e.g. blocking order caused tests 304 to fail.
147# Maybe just a grep through *.R for use of these function internally would be better (TO DO).
148
149# Don't use base::is.unsorted internally, because :
150#    1) it uses locale whereas in data.table we control locale sorting independently (C locale currently, but
151#       "sorted" attribute will need an extra attribute "locale" so we can check if key's locale is the current locale)
152#    2) wrapper needed, used to be :
153#       identical(FALSE,is.unsorted(x)) && !(length(x)==1 && is.na(x))
154#       where the && was needed to maintain backwards compatibility after r-devel's change of is.unsorted(NA) to FALSE (was NA) [May 2013].
155# The others (order, sort.int etc) are turned off to protect ourselves from using them internally, for speed and for
156# consistency; e.g., consistent twiddling of numeric/integer64, NA at the beginning of integer, locale ordering of character vectors.
157
158is.sorted = function(x, by=NULL) {
159  if (is.list(x)) {
160    if (missing(by)) by = seq_along(x)   # wouldn't make sense when x is a vector; hence by=seq_along(x) is not the argument default
161    if (is.character(by)) by = chmatch(by, names(x))
162  } else {
163    if (!missing(by)) stop("x is vector but 'by' is supplied")
164  }
165  .Call(Cissorted, x, as.integer(by))
166  # Return value of TRUE/FALSE is relied on in [.data.table quite a bit on vectors. Simple. Stick with that (rather than -1/0/+1)
167}
168
169ORDERING_TYPES = c('logical', 'integer', 'double', 'complex', 'character')
170forderv = function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.last=FALSE)
171{
172  if (is.atomic(x)) {  # including forderv(NULL) which returns error consistent with base::order(NULL),
173    if (!missing(by) && !is.null(by)) stop("x is a single vector, non-NULL 'by' doesn't make sense")
174    by = NULL
175  } else {
176    if (!length(x)) return(integer(0L)) # e.g. forderv(data.table(NULL)) and forderv(list()) return integer(0L))
177    by = colnamesInt(x, by, check_dups=FALSE)
178    if (length(order) == 1L) order = rep(order, length(by))
179  }
180  order = as.integer(order) # length and contents of order being +1/-1 is checked at C level
181  .Call(Cforder, x, by, retGrp, sort, order, na.last)  # returns integer() if already sorted, regardless of sort=TRUE|FALSE
182}
183
184forder = function(..., na.last=TRUE, decreasing=FALSE)
185{
186  sub = substitute(list(...))
187  tt = sapply(sub, function(x) is.null(x) || (is.symbol(x) && !nzchar(x)))
188  if (any(tt)) sub[tt] = NULL  # remove any NULL or empty arguments; e.g. test 1962.052: forder(DT, NULL) and forder(DT, )
189  if (length(sub)<2L) return(NULL)  # forder() with no arguments returns NULL consistent with base::order
190  asc = rep.int(1L, length(sub)-1L)  # ascending (1) or descending (-1) per column
191  # the idea here is to intercept - (and unusual --+ deriving from built expressions) before vectors in forder(DT, -colA, colB) so that :
192  # 1) - on character vector works; ordinarily in R that fails with type error
193  # 2) each column/expression can have its own +/- more easily that having to use a separate decreasing=TRUE/FALSE
194  # 3) we can pass the decreasing (-) flag to C and avoid what normally happens in R; i.e. allocate a new vector and apply - to every element first
195  # We intercept the unevaluated expressions and massage them before evaluating in with(DT) scope or not depending on the first item.
196  for (i in seq.int(2L, length(sub))) {
197    v = sub[[i]]
198    while (v %iscall% c('-', '+') && length(v)==2L) {
199      if (v[[1L]] == "-") asc[i-1L] = -asc[i-1L]
200      sub[[i]] = v = v[[2L]]  # remove the leading +/- which is the 2nd item since length(v)==2; i.e. monadic +/-
201    }
202  }
203  x = eval(sub[[2L]], parent.frame(), parent.frame())
204  if (is.list(x)) {
205    if (length(x)==0L && is.data.frame(x)) stop("Attempting to order a 0-column data.table or data.frame.")
206    sub[2L] = NULL  # change list(DT, ...) to list(...)
207    if (length(sub)==1L) {
208      data = x
209    } else {
210      if (!is.data.frame(x)) stop("The first item passed to [f]order is a plain list but there are more items. It should be a data.table or data.frame.")
211      asc = asc[-1L]
212      data = eval(sub, x, parent.frame())
213    }
214  } else {
215    data = eval(sub, parent.frame(), parent.frame())
216  }
217  stopifnot(isTRUEorFALSE(decreasing))
218  o = forderv(data, seq_along(data), sort=TRUE, retGrp=FALSE, order= if (decreasing) -asc else asc, na.last)
219  if (!length(o) && length(data)>=1L) o = seq_along(data[[1L]]) else o
220  o
221}
222
223fsort = function(x, decreasing=FALSE, na.last=FALSE, internal=FALSE, verbose=FALSE, ...)
224{
225  containsNAs = FALSE
226  if (typeof(x)=="double" && !decreasing && !(containsNAs <- anyNA(x))) {
227      if (internal) stop("Internal code should not be being called on type double")
228      return(.Call(Cfsort, x, verbose))
229  }
230  else {
231    # fsort is now exported for testing. Trying to head off complaints "it's slow on integer"
232    # The only places internally we use fsort internally (3 calls, all on integer) have had internal=TRUE added for now.
233    # TODO: implement integer and character in Cfsort and remove this branch and warning
234    if (!internal){
235      if (typeof(x)!="double") warning("Input is not a vector of type double. New parallel sort has only been done for double vectors so far. Using one thread.")
236      if (decreasing)  warning("New parallel sort has not been implemented for decreasing=TRUE so far. Using one thread.")
237      if (containsNAs) warning("New parallel sort has not been implemented for vectors containing NA values so far. Using one thread.")
238    }
239    orderArg = if (decreasing) -1 else 1
240    o = forderv(x, order=orderArg, na.last=na.last)
241    return( if (length(o)) x[o] else x )
242  }
243}
244
245setorder = function(x, ..., na.last=FALSE)
246# na.last=FALSE here, to be consistent with data.table's default
247# as opposed to DT[order(.)] where na.last=TRUE, to be consistent with base
248{
249  if (!is.data.frame(x)) stop("x must be a data.frame or data.table")
250  cols = substitute(list(...))[-1L]
251  if (identical(as.character(cols),"NULL")) return(x)
252  if (length(cols)) {
253    cols=as.list(cols)
254    order=rep(1L, length(cols))
255    for (i in seq_along(cols)) {
256      v=as.list(cols[[i]])
257      if (length(v) > 1L && v[[1L]] == "+") v=v[[-1L]]
258      else if (length(v) > 1L && v[[1L]] == "-") {
259        v=v[[-1L]]
260        order[i] = -1L
261      }
262      cols[[i]]=as.character(v)
263    }
264    cols=unlist(cols, use.names=FALSE)
265  } else {
266    cols=colnames(x)
267    order=rep(1L, length(cols))
268  }
269  setorderv(x, cols, order, na.last)
270}
271
272setorderv = function(x, cols = colnames(x), order=1L, na.last=FALSE)
273{
274  if (is.null(cols)) return(x)
275  if (!is.data.frame(x)) stop("x must be a data.frame or data.table")
276  na.last = as.logical(na.last)
277  if (is.na(na.last) || !length(na.last)) stop('na.last must be logical TRUE/FALSE')
278  if (!is.character(cols)) stop("cols is not a character vector. Please see further information in ?setorder.")
279  if (!length(cols)) {
280    warning("cols is a character vector of zero length. Use NULL instead, or wrap with suppressWarnings() to avoid this warning.")
281    return(x)
282  }
283  if (!all(nzchar(cols))) stop("cols contains some blanks.")     # TODO: probably I'm checking more than necessary here.. there are checks in 'forderv' as well
284  # remove backticks from cols
285  cols = gsub("`", "", cols, fixed = TRUE)
286  miss = !(cols %chin% colnames(x))
287  if (any(miss)) stop("some columns are not in the data.table: ", paste(cols[miss], collapse=","))
288  if (".xi" %chin% colnames(x)) stop("x contains a column called '.xi'. Conflicts with internal use by data.table.")
289  for (i in cols) {
290    .xi = x[[i]]  # [[ is copy on write, otherwise checking type would be copying each column
291    if (!typeof(.xi) %chin% ORDERING_TYPES) stop("Column '",i,"' is type '",typeof(.xi),"' which is not supported for ordering currently.")
292  }
293  if (!is.character(cols) || length(cols)<1L) stop("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov
294
295  o = forderv(x, cols, sort=TRUE, retGrp=FALSE, order=order, na.last=na.last)
296  if (length(o)) {
297    .Call(Creorder, x, o)
298    if (is.data.frame(x) & !is.data.table(x)) {
299      setattr(x, 'row.names', rownames(x)[o])
300    }
301    k = key(x)
302    if (!identical(head(cols, length(k)), k) || any(head(order, length(k)) < 0L))
303      setattr(x, 'sorted', NULL) # if 'forderv' is not 0-length & key is not a same-ordered subset of cols, it means order has changed. So, set key to NULL, else retain key.
304    setattr(x, 'index', NULL)  # remove secondary keys too. These could be reordered and retained, but simpler and faster to remove
305  }
306  invisible(x)
307}
308
309binary = function(x) .Call(Cbinary, x)
310
311setNumericRounding = function(x) {.Call(CsetNumericRounding, as.integer(x)); invisible()}
312getNumericRounding = function() .Call(CgetNumericRounding)
313
314SJ = function(...) {
315  JDT = as.data.table(list(...))
316  setkey(JDT)
317}
318# S for Sorted, usually used in i to sort the i table
319
320# TO DO?: Use the CJ list() replication method for SJ (inside as.data.table.list?, #2109) too to avoid setalloccol
321
322CJ = function(..., sorted = TRUE, unique = FALSE)
323{
324  # Pass in a list of unique values, e.g. ids and dates
325  # Cross Join will then produce a join table with the combination of all values (cross product).
326  # The last vector is varied the quickest in the table, so dates should be last for roll for example
327  l = list(...)
328  if (isFALSE(getOption("datatable.CJ.names", TRUE))) {  # default TRUE from v1.12.0, FALSE before. TODO: remove option in v1.13.0 as stated in news
329    if (is.null(vnames <- names(l))) vnames = paste0("V", seq_len(length(l)))
330    else if (any(tt <- vnames=="")) vnames[tt] = paste0("V", which(tt))
331  } else {
332    vnames = name_dots(...)$vnames
333    if (any(tt <- vnames=="")) vnames[tt] = paste0("V", which(tt))
334  }
335  dups = FALSE # fix for #1513
336  for (i in seq_along(l)) {
337    y = l[[i]]
338    if (!length(y)) next
339    if (sorted) {
340      if (!is.atomic(y)) stop("'sorted' is TRUE but element ", i, " is non-atomic, which can't be sorted; try setting sorted = FALSE")
341      o = forderv(y, retGrp=TRUE)
342      thisdups = attr(o, 'maxgrpn', exact=TRUE)>1L
343      if (thisdups) {
344        dups = TRUE
345        if (length(o)) l[[i]] = if (unique) y[o[attr(o, "starts", exact=TRUE)]] else y[o]
346        else if (unique) l[[i]] = y[attr(o, "starts", exact=TRUE)]  # test 1525.5
347      } else {
348        if (length(o)) l[[i]] = y[o]
349      }
350    } else {
351      if (unique) l[[i]] = unique(y)
352    }
353  }
354  nrow = prod( vapply_1i(l, length) )  # lengths(l) will work from R 3.2.0
355  if (nrow > .Machine$integer.max) stop(gettextf("Cross product of elements provided to CJ() would result in %.0f rows which exceeds .Machine$integer.max == %d", nrow, .Machine$integer.max, domain='R-data.table'))
356  l = .Call(Ccj, l)
357  setDT(l)
358  l = setalloccol(l)  # a tiny bit wasteful to over-allocate a fixed join table (column slots only), doing it anyway for consistency since
359                    # it's possible a user may wish to use SJ directly outside a join and would expect consistent over-allocation
360  setnames(l, vnames)
361  if (sorted) {
362    if (!dups) setattr(l, 'sorted', names(l))
363    else setkey(l) # fix #1513
364  }
365  l
366}
367
368