1setkey = function(x, ..., verbose=getOption("datatable.verbose"), physical=TRUE) 2{ 3 if (is.character(x)) stop("x may no longer be the character name of the data.table. The possibility was undocumented and has been removed.") 4 cols = as.character(substitute(list(...))[-1L]) 5 if (!length(cols)) { cols=colnames(x) } 6 else if (identical(cols,"NULL")) cols=NULL 7 setkeyv(x, cols, verbose=verbose, physical=physical) 8} 9 10# FR #1442 11setindex = function(...) setkey(..., physical=FALSE) 12setindexv = function(x, cols, verbose=getOption("datatable.verbose")) { 13 if (is.list(cols)) { 14 sapply(cols, setkeyv, x=x, verbose=verbose, physical=FALSE) 15 return(invisible(x)) 16 } else { 17 setkeyv(x, cols, verbose=verbose, physical=FALSE) 18 } 19} 20 21# upgrade to error after Mar 2020. Has already been warning since 2012, and stronger warning in Mar 2019 (note in news for 1.12.2); #3399 22"key<-" = function(x,value) { 23 warning("key(x)<-value is deprecated and not supported. Please change to use setkey() with perhaps copy(). Has been warning since 2012 and will be an error in future.") 24 setkeyv(x,value) 25 # The returned value here from key<- is then copied by R before assigning to x, it seems. That's 26 # why we can't do anything about it without a change in R itself. If we return NULL (or invisible()) from this key<- 27 # method, the table gets set to NULL. So, although we call setkeyv(x,cols) here, and that doesn't copy, the 28 # returned value (x) then gets copied by R. 29 # So, solution is that caller has to call setkey or setkeyv directly themselves, to avoid <- dispatch and its copy. 30} 31 32setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRUE) 33{ 34 if (is.null(cols)) { # this is done on a data.frame when !cedta at top of [.data.table 35 if (physical) setattr(x,"sorted",NULL) 36 setattr(x,"index",NULL) # setkey(DT,NULL) also clears secondary keys. setindex(DT,NULL) just clears secondary keys. 37 return(invisible(x)) 38 } 39 if (!missing(verbose)) { 40 stopifnot(isTRUEorFALSE(verbose)) 41 # set the global verbose option because that is fetched from C code without having to pass it through 42 oldverbose = options(datatable.verbose=verbose) 43 on.exit(options(oldverbose)) 44 } 45 if (!is.data.table(x)) stop("x is not a data.table") 46 if (!is.character(cols)) stop("cols is not a character vector. Please see further information in ?setkey.") 47 if (physical && .Call(C_islocked, x)) stop("Setting a physical key on .SD is reserved for possible future use; to modify the original data's order by group. Try setindex() instead. Or, set*(copy(.SD)) as a (slow) last resort.") 48 if (!length(cols)) { 49 warning("cols is a character vector of zero length. Removed the key, but use NULL instead, or wrap with suppressWarnings() to avoid this warning.") 50 setattr(x,"sorted",NULL) 51 return(invisible(x)) 52 } 53 if (identical(cols,"")) stop("cols is the empty string. Use NULL to remove the key.") 54 if (!all(nzchar(cols))) stop("cols contains some blanks.") 55 cols = gsub("`", "", cols, fixed = TRUE) 56 miss = !(cols %chin% colnames(x)) 57 if (any(miss)) stop("some columns are not in the data.table: ", paste(cols[miss], collapse=",")) 58 59 ## determine, whether key is already present: 60 if (identical(key(x),cols)) { 61 if (!physical) { 62 ## create index as integer() because already sorted by those columns 63 if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer()) 64 setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), integer()) 65 } 66 return(invisible(x)) 67 } else if(identical(head(key(x), length(cols)), cols)){ 68 if (!physical) { 69 ## create index as integer() because already sorted by those columns 70 if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer()) 71 setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), integer()) 72 } else { 73 ## key is present but x has a longer key. No sorting needed, only attribute is changed to shorter key. 74 setattr(x,"sorted",cols) 75 } 76 return(invisible(x)) 77 } 78 79 if (".xi" %chin% names(x)) stop("x contains a column called '.xi'. Conflicts with internal use by data.table.") 80 for (i in cols) { 81 .xi = x[[i]] # [[ is copy on write, otherwise checking type would be copying each column 82 if (!typeof(.xi) %chin% ORDERING_TYPES) stop("Column '",i,"' is type '",typeof(.xi),"' which is not supported as a key column type, currently.") 83 } 84 if (!is.character(cols) || length(cols)<1L) stop("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov 85 86 newkey = paste0(cols, collapse="__") 87 if (!any(indices(x) == newkey)) { 88 if (verbose) { 89 tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=FALSE))) # system.time does a gc, so we don't want this always on, until refcnt is on by default in R 90 # suppress needed for tests 644 and 645 in verbose mode 91 cat("forder took", tt["user.self"]+tt["sys.self"], "sec\n") 92 } else { 93 o = forderv(x, cols, sort=TRUE, retGrp=FALSE) 94 } 95 } else { 96 if (verbose) cat("setkey on columns ", brackify(cols), " using existing index '", newkey, "'\n", sep="") 97 o = getindex(x, newkey) 98 } 99 if (!physical) { 100 if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer()) 101 setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), o) 102 return(invisible(x)) 103 } 104 setattr(x,"index",NULL) # TO DO: reorder existing indexes likely faster than rebuilding again. Allow optionally. Simpler for now to clear. 105 if (length(o)) { 106 if (verbose) { last.started.at = proc.time() } 107 .Call(Creorder,x,o) 108 if (verbose) { cat("reorder took", timetaken(last.started.at), "\n"); flush.console() } 109 } else { 110 if (verbose) cat("x is already ordered by these columns, no need to call reorder\n") 111 } # else empty integer() from forderv means x is already ordered by those cols, nothing to do. 112 setattr(x,"sorted",cols) 113 invisible(x) 114} 115 116key = function(x) attr(x, "sorted", exact=TRUE) 117 118indices = function(x, vectors = FALSE) { 119 ans = names(attributes(attr(x, "index", exact=TRUE))) 120 if (is.null(ans)) return(ans) # otherwise character() gets returned by next line 121 ans = gsub("^__","",ans) # the leading __ is internal only, so remove that in result 122 if (isTRUE(vectors)) 123 ans = strsplit(ans, "__", fixed = TRUE) 124 ans 125} 126 127getindex = function(x, name) { 128 # name can be "col", or "col1__col2", or c("col1","col2") 129 ans = attr(attr(x, 'index', exact=TRUE), paste0("__",name,collapse=""), exact=TRUE) 130 if (!is.null(ans) && (!is.integer(ans) || (length(ans)!=nrow(x) && length(ans)!=0L))) { 131 stop("Internal error: index '",name,"' exists but is invalid") # nocov 132 } 133 ans 134} 135 136haskey = function(x) !is.null(key(x)) 137 138# reorder a vector based on 'order' (integer) 139# to be used in fastorder instead of x[o], but in general, it's better to replace vector subsetting with this..? 140# Basic checks that all items of order are in range 1:n with no NAs are now made inside Creorder. 141# FOR INTERNAL USE ONLY 142setreordervec = function(x, order) .Call(Creorder, x, order) 143 144# sort = sort.int = sort.list = order = is.unsorted = function(...) 145# stop("Should never be called by data.table internals. Use is.sorted() on vectors, or forder() for lists and vectors.") 146# Nice idea, but users might use these in i or j e.g. blocking order caused tests 304 to fail. 147# Maybe just a grep through *.R for use of these function internally would be better (TO DO). 148 149# Don't use base::is.unsorted internally, because : 150# 1) it uses locale whereas in data.table we control locale sorting independently (C locale currently, but 151# "sorted" attribute will need an extra attribute "locale" so we can check if key's locale is the current locale) 152# 2) wrapper needed, used to be : 153# identical(FALSE,is.unsorted(x)) && !(length(x)==1 && is.na(x)) 154# where the && was needed to maintain backwards compatibility after r-devel's change of is.unsorted(NA) to FALSE (was NA) [May 2013]. 155# The others (order, sort.int etc) are turned off to protect ourselves from using them internally, for speed and for 156# consistency; e.g., consistent twiddling of numeric/integer64, NA at the beginning of integer, locale ordering of character vectors. 157 158is.sorted = function(x, by=NULL) { 159 if (is.list(x)) { 160 if (missing(by)) by = seq_along(x) # wouldn't make sense when x is a vector; hence by=seq_along(x) is not the argument default 161 if (is.character(by)) by = chmatch(by, names(x)) 162 } else { 163 if (!missing(by)) stop("x is vector but 'by' is supplied") 164 } 165 .Call(Cissorted, x, as.integer(by)) 166 # Return value of TRUE/FALSE is relied on in [.data.table quite a bit on vectors. Simple. Stick with that (rather than -1/0/+1) 167} 168 169ORDERING_TYPES = c('logical', 'integer', 'double', 'complex', 'character') 170forderv = function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.last=FALSE) 171{ 172 if (is.atomic(x)) { # including forderv(NULL) which returns error consistent with base::order(NULL), 173 if (!missing(by) && !is.null(by)) stop("x is a single vector, non-NULL 'by' doesn't make sense") 174 by = NULL 175 } else { 176 if (!length(x)) return(integer(0L)) # e.g. forderv(data.table(NULL)) and forderv(list()) return integer(0L)) 177 by = colnamesInt(x, by, check_dups=FALSE) 178 if (length(order) == 1L) order = rep(order, length(by)) 179 } 180 order = as.integer(order) # length and contents of order being +1/-1 is checked at C level 181 .Call(Cforder, x, by, retGrp, sort, order, na.last) # returns integer() if already sorted, regardless of sort=TRUE|FALSE 182} 183 184forder = function(..., na.last=TRUE, decreasing=FALSE) 185{ 186 sub = substitute(list(...)) 187 tt = sapply(sub, function(x) is.null(x) || (is.symbol(x) && !nzchar(x))) 188 if (any(tt)) sub[tt] = NULL # remove any NULL or empty arguments; e.g. test 1962.052: forder(DT, NULL) and forder(DT, ) 189 if (length(sub)<2L) return(NULL) # forder() with no arguments returns NULL consistent with base::order 190 asc = rep.int(1L, length(sub)-1L) # ascending (1) or descending (-1) per column 191 # the idea here is to intercept - (and unusual --+ deriving from built expressions) before vectors in forder(DT, -colA, colB) so that : 192 # 1) - on character vector works; ordinarily in R that fails with type error 193 # 2) each column/expression can have its own +/- more easily that having to use a separate decreasing=TRUE/FALSE 194 # 3) we can pass the decreasing (-) flag to C and avoid what normally happens in R; i.e. allocate a new vector and apply - to every element first 195 # We intercept the unevaluated expressions and massage them before evaluating in with(DT) scope or not depending on the first item. 196 for (i in seq.int(2L, length(sub))) { 197 v = sub[[i]] 198 while (v %iscall% c('-', '+') && length(v)==2L) { 199 if (v[[1L]] == "-") asc[i-1L] = -asc[i-1L] 200 sub[[i]] = v = v[[2L]] # remove the leading +/- which is the 2nd item since length(v)==2; i.e. monadic +/- 201 } 202 } 203 x = eval(sub[[2L]], parent.frame(), parent.frame()) 204 if (is.list(x)) { 205 if (length(x)==0L && is.data.frame(x)) stop("Attempting to order a 0-column data.table or data.frame.") 206 sub[2L] = NULL # change list(DT, ...) to list(...) 207 if (length(sub)==1L) { 208 data = x 209 } else { 210 if (!is.data.frame(x)) stop("The first item passed to [f]order is a plain list but there are more items. It should be a data.table or data.frame.") 211 asc = asc[-1L] 212 data = eval(sub, x, parent.frame()) 213 } 214 } else { 215 data = eval(sub, parent.frame(), parent.frame()) 216 } 217 stopifnot(isTRUEorFALSE(decreasing)) 218 o = forderv(data, seq_along(data), sort=TRUE, retGrp=FALSE, order= if (decreasing) -asc else asc, na.last) 219 if (!length(o) && length(data)>=1L) o = seq_along(data[[1L]]) else o 220 o 221} 222 223fsort = function(x, decreasing=FALSE, na.last=FALSE, internal=FALSE, verbose=FALSE, ...) 224{ 225 containsNAs = FALSE 226 if (typeof(x)=="double" && !decreasing && !(containsNAs <- anyNA(x))) { 227 if (internal) stop("Internal code should not be being called on type double") 228 return(.Call(Cfsort, x, verbose)) 229 } 230 else { 231 # fsort is now exported for testing. Trying to head off complaints "it's slow on integer" 232 # The only places internally we use fsort internally (3 calls, all on integer) have had internal=TRUE added for now. 233 # TODO: implement integer and character in Cfsort and remove this branch and warning 234 if (!internal){ 235 if (typeof(x)!="double") warning("Input is not a vector of type double. New parallel sort has only been done for double vectors so far. Using one thread.") 236 if (decreasing) warning("New parallel sort has not been implemented for decreasing=TRUE so far. Using one thread.") 237 if (containsNAs) warning("New parallel sort has not been implemented for vectors containing NA values so far. Using one thread.") 238 } 239 orderArg = if (decreasing) -1 else 1 240 o = forderv(x, order=orderArg, na.last=na.last) 241 return( if (length(o)) x[o] else x ) 242 } 243} 244 245setorder = function(x, ..., na.last=FALSE) 246# na.last=FALSE here, to be consistent with data.table's default 247# as opposed to DT[order(.)] where na.last=TRUE, to be consistent with base 248{ 249 if (!is.data.frame(x)) stop("x must be a data.frame or data.table") 250 cols = substitute(list(...))[-1L] 251 if (identical(as.character(cols),"NULL")) return(x) 252 if (length(cols)) { 253 cols=as.list(cols) 254 order=rep(1L, length(cols)) 255 for (i in seq_along(cols)) { 256 v=as.list(cols[[i]]) 257 if (length(v) > 1L && v[[1L]] == "+") v=v[[-1L]] 258 else if (length(v) > 1L && v[[1L]] == "-") { 259 v=v[[-1L]] 260 order[i] = -1L 261 } 262 cols[[i]]=as.character(v) 263 } 264 cols=unlist(cols, use.names=FALSE) 265 } else { 266 cols=colnames(x) 267 order=rep(1L, length(cols)) 268 } 269 setorderv(x, cols, order, na.last) 270} 271 272setorderv = function(x, cols = colnames(x), order=1L, na.last=FALSE) 273{ 274 if (is.null(cols)) return(x) 275 if (!is.data.frame(x)) stop("x must be a data.frame or data.table") 276 na.last = as.logical(na.last) 277 if (is.na(na.last) || !length(na.last)) stop('na.last must be logical TRUE/FALSE') 278 if (!is.character(cols)) stop("cols is not a character vector. Please see further information in ?setorder.") 279 if (!length(cols)) { 280 warning("cols is a character vector of zero length. Use NULL instead, or wrap with suppressWarnings() to avoid this warning.") 281 return(x) 282 } 283 if (!all(nzchar(cols))) stop("cols contains some blanks.") # TODO: probably I'm checking more than necessary here.. there are checks in 'forderv' as well 284 # remove backticks from cols 285 cols = gsub("`", "", cols, fixed = TRUE) 286 miss = !(cols %chin% colnames(x)) 287 if (any(miss)) stop("some columns are not in the data.table: ", paste(cols[miss], collapse=",")) 288 if (".xi" %chin% colnames(x)) stop("x contains a column called '.xi'. Conflicts with internal use by data.table.") 289 for (i in cols) { 290 .xi = x[[i]] # [[ is copy on write, otherwise checking type would be copying each column 291 if (!typeof(.xi) %chin% ORDERING_TYPES) stop("Column '",i,"' is type '",typeof(.xi),"' which is not supported for ordering currently.") 292 } 293 if (!is.character(cols) || length(cols)<1L) stop("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov 294 295 o = forderv(x, cols, sort=TRUE, retGrp=FALSE, order=order, na.last=na.last) 296 if (length(o)) { 297 .Call(Creorder, x, o) 298 if (is.data.frame(x) & !is.data.table(x)) { 299 setattr(x, 'row.names', rownames(x)[o]) 300 } 301 k = key(x) 302 if (!identical(head(cols, length(k)), k) || any(head(order, length(k)) < 0L)) 303 setattr(x, 'sorted', NULL) # if 'forderv' is not 0-length & key is not a same-ordered subset of cols, it means order has changed. So, set key to NULL, else retain key. 304 setattr(x, 'index', NULL) # remove secondary keys too. These could be reordered and retained, but simpler and faster to remove 305 } 306 invisible(x) 307} 308 309binary = function(x) .Call(Cbinary, x) 310 311setNumericRounding = function(x) {.Call(CsetNumericRounding, as.integer(x)); invisible()} 312getNumericRounding = function() .Call(CgetNumericRounding) 313 314SJ = function(...) { 315 JDT = as.data.table(list(...)) 316 setkey(JDT) 317} 318# S for Sorted, usually used in i to sort the i table 319 320# TO DO?: Use the CJ list() replication method for SJ (inside as.data.table.list?, #2109) too to avoid setalloccol 321 322CJ = function(..., sorted = TRUE, unique = FALSE) 323{ 324 # Pass in a list of unique values, e.g. ids and dates 325 # Cross Join will then produce a join table with the combination of all values (cross product). 326 # The last vector is varied the quickest in the table, so dates should be last for roll for example 327 l = list(...) 328 if (isFALSE(getOption("datatable.CJ.names", TRUE))) { # default TRUE from v1.12.0, FALSE before. TODO: remove option in v1.13.0 as stated in news 329 if (is.null(vnames <- names(l))) vnames = paste0("V", seq_len(length(l))) 330 else if (any(tt <- vnames=="")) vnames[tt] = paste0("V", which(tt)) 331 } else { 332 vnames = name_dots(...)$vnames 333 if (any(tt <- vnames=="")) vnames[tt] = paste0("V", which(tt)) 334 } 335 dups = FALSE # fix for #1513 336 for (i in seq_along(l)) { 337 y = l[[i]] 338 if (!length(y)) next 339 if (sorted) { 340 if (!is.atomic(y)) stop("'sorted' is TRUE but element ", i, " is non-atomic, which can't be sorted; try setting sorted = FALSE") 341 o = forderv(y, retGrp=TRUE) 342 thisdups = attr(o, 'maxgrpn', exact=TRUE)>1L 343 if (thisdups) { 344 dups = TRUE 345 if (length(o)) l[[i]] = if (unique) y[o[attr(o, "starts", exact=TRUE)]] else y[o] 346 else if (unique) l[[i]] = y[attr(o, "starts", exact=TRUE)] # test 1525.5 347 } else { 348 if (length(o)) l[[i]] = y[o] 349 } 350 } else { 351 if (unique) l[[i]] = unique(y) 352 } 353 } 354 nrow = prod( vapply_1i(l, length) ) # lengths(l) will work from R 3.2.0 355 if (nrow > .Machine$integer.max) stop(gettextf("Cross product of elements provided to CJ() would result in %.0f rows which exceeds .Machine$integer.max == %d", nrow, .Machine$integer.max, domain='R-data.table')) 356 l = .Call(Ccj, l) 357 setDT(l) 358 l = setalloccol(l) # a tiny bit wasteful to over-allocate a fixed join table (column slots only), doing it anyway for consistency since 359 # it's possible a user may wish to use SJ directly outside a join and would expect consistent over-allocation 360 setnames(l, vnames) 361 if (sorted) { 362 if (!dups) setattr(l, 'sorted', names(l)) 363 else setkey(l) # fix #1513 364 } 365 l 366} 367 368