1if (!exists("trimws", "package:base")) { 2 # trimws was new in R 3.2.0. Backport it for internal data.table use in R 3.1.0 3 trimws = function(x) { 4 mysub = function(re, x) sub(re, "", x, perl = TRUE) 5 mysub("[ \t\r\n]+$", mysub("^[ \t\r\n]+", x)) 6 } 7} 8 9dim.data.table = function(x) 10{ 11 .Call(Cdim, x) 12} 13 14.global = new.env() # thanks to: http://stackoverflow.com/a/12605694/403310 15setPackageName("data.table",.global) 16.global$print = "" 17 18# NB: if adding to/editing this list, be sure to do the following: 19# (1) add to man/special-symbols.Rd 20# (2) export() in NAMESPACE 21# (3) add to vignettes/datatable-importing.Rmd#globals section 22.SD = .N = .I = .GRP = .NGRP = .BY = .EACHI = NULL 23# These are exported to prevent NOTEs from R CMD check, and checkUsage via compiler. 24# But also exporting them makes it clear (to users and other packages) that data.table uses these as symbols. 25# And NULL makes it clear (to the R's mask check on loading) that they're variables not functions. 26# utils::globalVariables(c(".SD",".N")) was tried as well, but exporting seems better. 27# So even though .BY doesn't appear in this file, it should still be NULL here and exported because it's 28# defined in SDenv and can be used by users. 29 30is.data.table = function(x) inherits(x, "data.table") 31is.ff = function(x) inherits(x, "ff") # define this in data.table so that we don't have to require(ff), but if user is using ff we'd like it to work 32 33#NCOL = function(x) { 34# # copied from base, but additionally covers data.table via is.list() 35# # because NCOL in base explicitly tests using is.data.frame() 36# if (is.list(x) && !is.ff(x)) return(length(x)) 37# if (is.array(x) && length(dim(x)) > 1L) ncol(x) else as.integer(1L) 38#} 39#NROW = function(x) { 40# if (is.data.frame(x) || is.data.table(x)) return(nrow(x)) 41# if (is.list(x) && !is.ff(x)) stop("List is not a data.frame or data.table. Convert first before using NROW") # list may have different length elements, which data.table and data.frame's resolve. 42# if (is.array(x)) nrow(x) else length(x) 43#} 44 45null.data.table = function() { 46 ans = list() 47 setattr(ans,"class",c("data.table","data.frame")) 48 setattr(ans,"row.names",.set_row_names(0L)) 49 setalloccol(ans) 50} 51 52data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFactors=FALSE) 53{ 54 # NOTE: It may be faster in some circumstances for users to create a data.table by creating a list l 55 # first, and then setattr(l,"class",c("data.table","data.frame")) and forgo checking. 56 x = list(...) # list() doesn't copy named inputs as from R >= 3.1.0 (a very welcome change) 57 nd = name_dots(...) 58 names(x) = nd$vnames 59 if (length(x)==0L) return( null.data.table() ) 60 if (length(x)==1L && (is.null(x[[1L]]) || (is.list(x[[1L]]) && length(x[[1L]])==0L))) return( null.data.table() ) #48 61 ans = as.data.table.list(x, keep.rownames=keep.rownames, check.names=check.names, .named=nd$.named) # see comments inside as.data.table.list re copies 62 if (!is.null(key)) { 63 if (!is.character(key)) stop("key argument of data.table() must be character") 64 if (length(key)==1L) { 65 key = strsplit(key,split=",")[[1L]] 66 # eg key="A,B"; a syntax only useful in key argument to data.table(), really. 67 } 68 setkeyv(ans,key) 69 } else { 70 # retain key of cbind(DT1, DT2, DT3) where DT2 is keyed but not DT1. cbind calls data.table(). 71 # If DT inputs with keys have been recycled then can't retain key 72 ckey = NULL 73 for (i in seq_along(x)) { 74 xi = x[[i]] 75 if (is.data.table(xi) && haskey(xi) && nrow(xi)==nrow(ans)) ckey=c(ckey, key(xi)) 76 } 77 if (length(ckey) && 78 !anyDuplicated(ckey) && 79 identical(is.na(chmatchdup(c(ckey,ckey), names(ans))), rep(c(FALSE,TRUE),each=length(ckey)))) { 80 setattr(ans, "sorted", ckey) 81 } 82 } 83 if (isTRUE(stringsAsFactors)) { 84 for (j in which(vapply_1b(ans, is.character))) set(ans, NULL, j, as_factor(.subset2(ans, j))) 85 # as_factor is internal function in fread.R currently 86 } 87 setalloccol(ans) # returns a NAMED==0 object, unlike data.frame() 88} 89 90replace_dot_alias = function(e) { 91 # we don't just simply alias .=list because i) list is a primitive (faster to iterate) and ii) we test for use 92 # of "list" in several places so it saves having to remember to write "." || "list" in those places 93 if (is.call(e) && !is.function(e[[1L]])) { 94 # . alias also used within bquote, #1912 95 if (e[[1L]] == 'bquote') return(e) 96 if (e[[1L]] == ".") e[[1L]] = quote(list) 97 for (i in seq_along(e)[-1L]) if (!is.null(e[[i]])) e[[i]] = replace_dot_alias(e[[i]]) 98 } 99 e 100} 101 102.massagei = function(x) { 103 # J alias for list as well in i, just if the first symbol 104 # if x = substitute(base::order) then as.character(x[[1L]]) == c("::", "base", "order") 105 if (x %iscall% c("J",".")) 106 x[[1L]] = quote(list) 107 x 108} 109 110.checkTypos = function(err, ref) { 111 if (grepl('object.*not found', err$message)) { 112 used = gsub(".*object '([^']+)'.*", "\\1", err$message) 113 found = agrep(used, ref, value=TRUE, ignore.case=TRUE, fixed=TRUE) 114 if (length(found)) { 115 stop("Object '", used, "' not found. Perhaps you intended ", 116 paste(head(found, 5L), collapse=", "), 117 if (length(found)<=5L) "" else paste(" or",length(found)-5L, "more")) 118 } else { 119 stop("Object '", used, "' not found amongst ", 120 paste(head(ref, 5L), collapse=', '), 121 if (length(ref)<=5L) "" else paste(" and", length(ref)-5L, "more")) 122 } 123 } else { 124 stop(err$message, call.=FALSE) 125 } 126} 127 128"[.data.table" = function (x, i, j, by, keyby, with=TRUE, nomatch=getOption("datatable.nomatch", NA), mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL) 129{ 130 # ..selfcount <<- ..selfcount+1 # in dev, we check no self calls, each of which doubles overhead, or could 131 # test explicitly if the caller is [.data.table (even stronger test. TO DO.) 132 # the drop=NULL is to sink drop argument when dispatching to [.data.frame; using '...' stops test 147 133 if (!cedta()) { 134 # Fix for #500 (to do) 135 Nargs = nargs() - (!missing(drop)) 136 ans = if (Nargs<3L) { `[.data.frame`(x,i) } # drop ignored anyway by DF[i] 137 else if (missing(drop)) `[.data.frame`(x,i,j) 138 else `[.data.frame`(x,i,j,drop) 139 # added is.data.table(ans) check to fix bug #81 140 if (!missing(i) & is.data.table(ans)) setkey(ans,NULL) # See test 304 141 return(ans) 142 } 143 if (!missing(verbose)) { 144 stopifnot(isTRUEorFALSE(verbose)) 145 # set the global verbose option because that is fetched from C code without having to pass it through 146 oldverbose = options(datatable.verbose=verbose) 147 on.exit(options(oldverbose)) 148 } 149 .global$print="" 150 missingby = missing(by) && missing(keyby) # for tests 359 & 590 where passing by=NULL results in data.table not vector 151 if (!missing(keyby)) { 152 if (!missing(by)) stop("Provide either by= or keyby= but not both") 153 if (missing(j)) { warning("Ignoring keyby= because j= is not supplied"); keyby=NULL; } 154 by=bysub=substitute(keyby) 155 keyby=TRUE 156 # Assign to 'by' so that by is no longer missing and we can proceed as if there were one by 157 } else { 158 if (!missing(by) && missing(j)) { warning("Ignoring by= because j= is not supplied"); by=NULL; } 159 by=bysub= if (missing(by)) NULL else substitute(by) 160 keyby=FALSE 161 } 162 bynull = !missingby && is.null(by) #3530 163 byjoin = !is.null(by) && is.symbol(bysub) && bysub==".EACHI" 164 naturaljoin = FALSE 165 names_x = names(x) 166 if (missing(i) && !missing(on)) { 167 tt = eval.parent(.massagei(substitute(on))) 168 if (!is.list(tt) || !length(names(tt))) { 169 warning("When on= is provided but not i=, on= must be a named list or data.table|frame, and a natural join (i.e. join on common names) is invoked. Ignoring on= which is '",class(tt)[1L],"'.") 170 on = NULL 171 } else { 172 i = tt 173 naturaljoin = TRUE 174 } 175 } 176 if (missing(i) && missing(j)) { 177 tt_isub = substitute(i) 178 tt_jsub = substitute(j) 179 if (!is.null(names(sys.call())) && # not relying on nargs() as it considers DT[,] to have 3 arguments, #3163 180 tryCatch(!is.symbol(tt_isub), error=function(e)TRUE) && # a symbol that inherits missingness from caller isn't missing for our purpose; test 1974 181 tryCatch(!is.symbol(tt_jsub), error=function(e)TRUE)) { 182 warning("i and j are both missing so ignoring the other arguments. This warning will be upgraded to error in future.") 183 } 184 return(x) 185 } 186 if (!mult %chin% c("first","last","all")) stop("mult argument can only be 'first', 'last' or 'all'") 187 missingroll = missing(roll) 188 if (length(roll)!=1L || is.na(roll)) stop("roll must be a single TRUE, FALSE, positive/negative integer/double including +Inf and -Inf or 'nearest'") 189 if (is.character(roll)) { 190 if (roll!="nearest") stop("roll is '",roll,"' (type character). Only valid character value is 'nearest'.") 191 } else { 192 roll = if (isTRUE(roll)) +Inf else as.double(roll) 193 } 194 force(rollends) 195 if (!is.logical(rollends)) stop("rollends must be a logical vector") 196 if (length(rollends)>2L) stop("rollends must be length 1 or 2") 197 if (length(rollends)==1L) rollends=rep.int(rollends,2L) 198 # TO DO (document/faq/example). Removed for now ... if ((roll || rolltolast) && missing(mult)) mult="last" # for when there is exact match to mult. This does not control cases where the roll is mult, that is always the last one. 199 .unsafe.opt() #3585 200 missingnomatch = missing(nomatch) 201 if (is.null(nomatch)) nomatch = 0L # allow nomatch=NULL API already now, part of: https://github.com/Rdatatable/data.table/issues/857 202 if (!is.na(nomatch) && nomatch!=0L) stop("nomatch= must be either NA or NULL (or 0 for backwards compatibility which is the same as NULL)") 203 nomatch = as.integer(nomatch) 204 if (!is.logical(which) || length(which)>1L) stop("which= must be a logical vector length 1. Either FALSE, TRUE or NA.") 205 if ((isTRUE(which)||is.na(which)) && !missing(j)) stop("which==",which," (meaning return row numbers) but j is also supplied. Either you need row numbers or the result of j, but only one type of result can be returned.") 206 if (!is.na(nomatch) && is.na(which)) stop("which=NA with nomatch=0 would always return an empty vector. Please change or remove either which or nomatch.") 207 if (!with && missing(j)) stop("j must be provided when with=FALSE") 208 irows = NULL # Meaning all rows. We avoid creating 1:nrow(x) for efficiency. 209 notjoin = FALSE 210 rightcols = leftcols = integer() 211 optimizedSubset = FALSE ## flag: tells whether a normal query was optimized into a join. 212 ..syms = NULL 213 av = NULL 214 jsub = NULL 215 if (!missing(j)) { 216 jsub = replace_dot_alias(substitute(j)) 217 root = if (is.call(jsub)) as.character(jsub[[1L]])[1L] else "" 218 if (root == ":" || 219 (root %chin% c("-","!") && jsub[[2L]] %iscall% '(' && jsub[[2L]][[2L]] %iscall% ':') || 220 ( (!length(av<-all.vars(jsub)) || all(substring(av,1L,2L)=="..")) && 221 root %chin% c("","c","paste","paste0","-","!") && 222 missingby )) { # test 763. TODO: likely that !missingby iff with==TRUE (so, with can be removed) 223 # When no variable names (i.e. symbols) occur in j, scope doesn't matter because there are no symbols to find. 224 # If variable names do occur, but they are all prefixed with .., then that means look up in calling scope. 225 # Automatically set with=FALSE in this case so that DT[,1], DT[,2:3], DT[,"someCol"] and DT[,c("colB","colD")] 226 # work as expected. As before, a vector will never be returned, but a single column data.table 227 # for type consistency with >1 cases. To return a single vector use DT[["someCol"]] or DT[[3]]. 228 # The root==":" is to allow DT[,colC:colH] even though that contains two variable names. 229 # root == "-" or "!" is for tests 1504.11 and 1504.13 (a : with a ! or - modifier root) 230 # We don't want to evaluate j at all in making this decision because i) evaluating could itself 231 # increment some variable and not intended to be evaluated a 2nd time later on and ii) we don't 232 # want decisions like this to depend on the data or vector lengths since that can introduce 233 # inconsistency reminiscent of drop=TRUE in [.data.frame that we seek to avoid. 234 with=FALSE 235 if (length(av)) { 236 for (..name in av) { 237 name = substring(..name, 3L) 238 if (name=="") stop("The symbol .. is invalid. The .. prefix must be followed by at least one character.") 239 if (!exists(name, where=parent.frame())) { 240 stop("Variable '",name,"' is not found in calling scope. Looking in calling scope because you used the .. prefix.", 241 if (exists(..name, where=parent.frame())) 242 paste0(" Variable '..",name,"' does exist in calling scope though, so please just removed the .. prefix from that variable name in calling scope.") 243 # We have recommended 'manual' .. prefix in the past, so try to be helpful 244 else 245 "" 246 ) 247 } else if (exists(..name, where=parent.frame())) { 248 warning("Both '",name,"' and '..", name, "' exist in calling scope. Please remove the '..", name,"' variable in calling scope for clarity.") 249 } 250 } 251 ..syms = av 252 } 253 } else if (is.name(jsub)) { 254 if (substring(jsub, 1L, 2L) == "..") stop("Internal error: DT[, ..var] should be dealt with by the branch above now.") # nocov 255 if (!with && !exists(as.character(jsub), where=parent.frame())) 256 stop("Variable '",jsub,"' is not found in calling scope. Looking in calling scope because you set with=FALSE. Also, please use .. symbol prefix and remove with=FALSE.") 257 } 258 if (root=="{") { 259 if (length(jsub) == 2L) { 260 jsub = jsub[[2L]] # to allow {} wrapping of := e.g. [,{`:=`(...)},] [#376] 261 root = if (is.call(jsub)) as.character(jsub[[1L]])[1L] else "" 262 } else if (length(jsub) > 2L && jsub[[2L]] %iscall% ":=") { 263 #2142 -- j can be {} and have length 1 264 stop("You have wrapped := with {} which is ok but then := must be the only thing inside {}. You have something else inside {} as well. Consider placing the {} on the RHS of := instead; e.g. DT[,someCol:={tmpVar1<-...;tmpVar2<-...;tmpVar1*tmpVar2}") 265 } 266 } 267 if (root=="eval" && !any(all.vars(jsub[[2L]]) %chin% names_x)) { 268 # TODO: this && !any depends on data. Can we remove it? 269 # Grab the dynamic expression from calling scope now to give the optimizer a chance to optimize it 270 # Only when top level is eval call. Not nested like x:=eval(...) or `:=`(x=eval(...), y=eval(...)) 271 jsub = eval(jsub[[2L]], parent.frame(), parent.frame()) # this evals the symbol to return the dynamic expression 272 if (is.expression(jsub)) jsub = jsub[[1L]] # if expression, convert it to call 273 # Note that the dynamic expression could now be := (new in v1.9.7) 274 root = if (is.call(jsub)) { 275 jsub = replace_dot_alias(jsub) 276 as.character(jsub[[1L]])[1L] 277 } else "" 278 } 279 if (root == ":=") { 280 allow.cartesian=TRUE # (see #800) 281 if (!missing(i) && keyby) 282 stop(":= with keyby is only possible when i is not supplied since you can't setkey on a subset of rows. Either change keyby to by or remove i") 283 if (!missingnomatch) { 284 warning("nomatch isn't relevant together with :=, ignoring nomatch") 285 nomatch=0L 286 } 287 } 288 } 289 290 # setdiff removes duplicate entries, which'll create issues with duplicated names. Use %chin% instead. 291 dupdiff = function(x, y) x[!x %chin% y] 292 293 if (!missing(i)) { 294 xo = NULL 295 isub = substitute(i) 296 if (identical(isub, NA)) { 297 # only possibility *isub* can be NA (logical) is the symbol NA itself; i.e. DT[NA] 298 # replace NA in this case with NA_integer_ as that's almost surely what user intended to 299 # return a single row with NA in all columns. (DT[0] returns an empty table, with correct types.) 300 # Any expression (including length 1 vectors) that evaluates to a single NA logical will 301 # however be left as NA logical since that's important for consistency to return empty in that 302 # case; e.g. DT[Col==3] where DT is 1 row and Col contains NA. 303 # Replacing the NA symbol makes DT[NA] and DT[c(1,NA)] consistent and provides 304 # an easy way to achieve a single row of NA as users expect rather than requiring them 305 # to know and change to DT[NA_integer_]. 306 isub=NA_integer_ 307 } 308 isnull_inames = FALSE 309 # Fixes 4994: a case where quoted expression with a "!", ex: expr = quote(!dt1); dt[eval(expr)] requires 310 # the "eval" to be checked before `as.name("!")`. Therefore interchanged. 311 restore.N = remove.N = FALSE 312 if (exists(".N", envir=parent.frame(), inherits=FALSE)) { 313 old.N = get(".N", envir=parent.frame(), inherits=FALSE) 314 locked.N = bindingIsLocked(".N", parent.frame()) 315 if (locked.N) eval(call("unlockBinding", ".N", parent.frame())) # eval call to pass R CMD check NOTE until we find cleaner way 316 assign(".N", nrow(x), envir=parent.frame(), inherits=FALSE) 317 restore.N = TRUE 318 # the comment below is invalid hereafter (due to fix for #1145) 319 # binding locked when .SD[.N] but that's ok as that's the .N we want anyway 320 321 # TO DO: change isub at C level s/.N/nrow(x); changing a symbol to a constant should be ok 322 } else { 323 assign(".N", nrow(x), envir=parent.frame(), inherits=FALSE) 324 remove.N = TRUE 325 } 326 if (isub %iscall% "eval") { # TO DO: or ..() 327 isub = eval(.massagei(isub[[2L]]), parent.frame(), parent.frame()) 328 if (is.expression(isub)) isub=isub[[1L]] 329 } 330 if (isub %iscall% "!") { 331 notjoin = TRUE 332 if (!missingnomatch) stop("not-join '!' prefix is present on i but nomatch is provided. Please remove nomatch."); 333 nomatch = 0L 334 isub = isub[[2L]] 335 # #932 related so that !(v1 == 1) becomes v1 == 1 instead of (v1 == 1) after removing "!" 336 if (isub %iscall% "(" && !is.name(isub[[2L]])) 337 isub = isub[[2L]] 338 } 339 340 if (is.null(isub)) return( null.data.table() ) 341 342 if (length(o <- .prepareFastSubset(isub = isub, x = x, 343 enclos = parent.frame(), 344 notjoin = notjoin, verbose = verbose))){ 345 ## redirect to the is.data.table(x) == TRUE branch. 346 ## Additional flag to adapt things after bmerge: 347 optimizedSubset = TRUE 348 notjoin = o$notjoin 349 i = o$i 350 on = o$on 351 ## the following two are ignored if i is not a data.table. 352 ## Since we are converting i to data.table, it is important to set them properly. 353 nomatch = 0L 354 mult = "all" 355 } 356 else if (!is.name(isub)) { 357 ienv = new.env(parent=parent.frame()) 358 if (getOption("datatable.optimize")>=1L) assign("order", forder, ienv) 359 i = tryCatch(eval(.massagei(isub), x, ienv), error=function(e) { 360 if (grepl(":=.*defined for use in j.*only", e$message)) 361 stop("Operator := detected in i, the first argument inside DT[...], but is only valid in the second argument, j. Most often, this happens when forgetting the first comma (e.g. DT[newvar := 5] instead of DT[ , new_var := 5]). Please double-check the syntax. Run traceback(), and debugger() to get a line number.") 362 else 363 .checkTypos(e, names_x) 364 }) 365 } else { 366 # isub is a single symbol name such as B in DT[B] 367 i = try(eval(isub, parent.frame(), parent.frame()), silent=TRUE) 368 if (inherits(i,"try-error")) { 369 # must be "not found" since isub is a mere symbol 370 col = try(eval(isub, x), silent=TRUE) # is it a column name? 371 msg = if (inherits(col,"try-error")) " and it is not a column name either." 372 else paste0(" but it is a column of type ", typeof(col),". If you wish to select rows where that column contains TRUE", 373 ", or perhaps that column contains row numbers of itself to select, try DT[(col)], DT[DT$col], or DT[col==TRUE] is particularly clear and is optimized.") 374 stop(as.character(isub), " is not found in calling scope", msg, 375 " When the first argument inside DT[...] is a single symbol (e.g. DT[var]), data.table looks for var in calling scope.") 376 } 377 } 378 if (restore.N) { 379 assign(".N", old.N, envir=parent.frame()) 380 if (locked.N) lockBinding(".N", parent.frame()) 381 } 382 if (remove.N) rm(list=".N", envir=parent.frame()) 383 if (is.matrix(i)) { 384 if (is.numeric(i) && ncol(i)==1L) { # #826 - subset DT on single integer vector stored as matrix 385 i = as.integer(i) 386 } else { 387 stop("i is invalid type (matrix). Perhaps in future a 2 column matrix could return a list of elements of DT (in the spirit of A[B] in FAQ 2.14). Please report to data.table issue tracker if you'd like this, or add your comments to FR #657.") 388 } 389 } 390 if (is.logical(i)) { 391 if (notjoin) { 392 notjoin = FALSE 393 i = !i 394 } 395 } 396 if (is.null(i)) return( null.data.table() ) 397 if (is.character(i)) { 398 isnull_inames = TRUE 399 i = data.table(V1=i) # for user convenience; e.g. DT["foo"] without needing DT[.("foo")] 400 } else if (identical(class(i),"list") && length(i)==1L && is.data.frame(i[[1L]])) { i = as.data.table(i[[1L]]) } 401 else if (identical(class(i),"data.frame")) { i = as.data.table(i) } # TO DO: avoid these as.data.table() and use a flag instead 402 else if (identical(class(i),"list")) { 403 isnull_inames = is.null(names(i)) 404 i = as.data.table(i) 405 } 406 407 if (is.data.table(i)) { 408 if (missing(on)) { 409 if (!haskey(x)) { 410 stop("When i is a data.table (or character vector), the columns to join by must be specified using 'on=' argument (see ?data.table), by keying x (i.e. sorted, and, marked as sorted, see ?setkey), or by sharing column names between x and i (i.e., a natural join). Keyed joins might have further speed benefits on very large data due to x being sorted in RAM.") 411 } 412 } else if (identical(substitute(on), as.name(".NATURAL"))) { 413 naturaljoin = TRUE 414 } 415 if (naturaljoin) { # natural join #629 416 common_names = intersect(names_x, names(i)) 417 len_common_names = length(common_names) 418 if (!len_common_names) stop("Attempting to do natural join but no common columns in provided tables") 419 if (verbose) { 420 which_cols_msg = if (len_common_names == length(x)) " all 'x' columns" 421 else paste(":", brackify(common_names)) 422 cat("Joining but 'x' has no key, natural join using", which_cols_msg, "\n", sep = "") 423 } 424 on = common_names 425 } 426 if (!missing(on)) { 427 # on = .() is now possible, #1257 428 on_ops = .parse_on(substitute(on), isnull_inames) 429 on = on_ops[[1L]] 430 ops = on_ops[[2L]] 431 if (any(ops > 1L)) { ## fix for #4489; ops = c("==", "<=", "<", ">=", ">", "!=") 432 allow.cartesian = TRUE 433 } 434 # TODO: collect all '==' ops first to speeden up Cnestedid 435 rightcols = colnamesInt(x, names(on), check_dups=FALSE) 436 leftcols = colnamesInt(i, unname(on), check_dups=FALSE) 437 } else { 438 ## missing on 439 rightcols = chmatch(key(x), names_x) # NAs here (i.e. invalid data.table) checked in bmerge() 440 leftcols = if (haskey(i)) 441 chmatch(head(key(i), length(rightcols)), names(i)) 442 else 443 seq_len(min(length(i),length(rightcols))) 444 rightcols = head(rightcols,length(leftcols)) 445 ops = rep(1L, length(leftcols)) 446 } 447 # Implementation for not-join along with by=.EACHI, #604 448 if (notjoin && (byjoin || mult != "all")) { # mult != "all" needed for #1571 449 notjoin = FALSE 450 if (verbose) {last.started.at=proc.time();cat("not-join called with 'by=.EACHI'; Replacing !i with i=setdiff_(x,i) ...");flush.console()} 451 orignames = copy(names(i)) 452 i = setdiff_(x, i, rightcols, leftcols) # part of #547 453 if (verbose) {cat("done in",timetaken(last.started.at),"\n"); flush.console()} 454 setnames(i, orignames[leftcols]) 455 setattr(i, 'sorted', names(i)) # since 'x' has key set, this'll always be sorted 456 } 457 i = .shallow(i, retain.key = TRUE) 458 ans = bmerge(i, x, leftcols, rightcols, roll, rollends, nomatch, mult, ops, verbose=verbose) 459 xo = ans$xo ## to make it available for further use. 460 # temp fix for issue spotted by Jan, test #1653.1. TODO: avoid this 461 # 'setorder', as there's another 'setorder' in generating 'irows' below... 462 if (length(ans$indices)) setorder(setDT(ans[1L:3L]), indices) 463 allLen1 = ans$allLen1 464 f__ = ans$starts 465 len__ = ans$lens 466 allGrp1 = all(ops==1L) # was previously 'ans$allGrp1'. Fixing #1991. TODO: Revisit about allGrp1 possibility for speedups in certain cases when I find some time. 467 indices__ = if (length(ans$indices)) ans$indices else seq_along(f__) # also for #1991 fix 468 # length of input nomatch (single 0 or NA) is 1 in both cases. 469 # When no match, len__ is 0 for nomatch=0 and 1 for nomatch=NA, so len__ isn't .N 470 # If using secondary key of x, f__ will refer to xo 471 if (is.na(which)) { 472 w = if (notjoin) f__!=0L else is.na(f__) 473 return( if (length(xo)) fsort(xo[w], internal=TRUE) else which(w) ) 474 } 475 if (mult=="all") { 476 # is by=.EACHI along with non-equi join? 477 nqbyjoin = byjoin && length(ans$indices) && !allGrp1 478 if (!byjoin || nqbyjoin) { 479 # Really, `anyDuplicated` in base is AWESOME! 480 # allow.cartesian shouldn't error if a) not-join, b) 'i' has no duplicates 481 if (verbose) {last.started.at=proc.time();cat("Constructing irows for '!byjoin || nqbyjoin' ... ");flush.console()} 482 irows = if (allLen1) f__ else vecseq(f__,len__, 483 if (allow.cartesian || 484 notjoin || # #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x). 485 !anyDuplicated(f__, incomparables = c(0L, NA_integer_))) { 486 NULL # #742. If 'i' has no duplicates, ignore 487 } else as.double(nrow(x)+nrow(i))) # rows in i might not match to x so old max(nrow(x),nrow(i)) wasn't enough. But this limit now only applies when there are duplicates present so the reason now for nrow(x)+nrow(i) is just to nail it down and be bigger than max(nrow(x),nrow(i)). 488 if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} 489 # Fix for #1092 and #1074 490 # TODO: implement better version of "any"/"all"/"which" to avoid 491 # unnecessary construction of logical vectors 492 if (identical(nomatch, 0L) && allLen1) irows = irows[irows != 0L] 493 } else { 494 if (length(xo) && missing(on)) 495 stop("Internal error. Cannot by=.EACHI when joining to a secondary key, yet") # nocov 496 # since f__ refers to xo later in grouping, so xo needs to be passed through to dogroups too. 497 if (length(irows)) 498 stop("Internal error. irows has length in by=.EACHI") # nocov 499 } 500 if (nqbyjoin) { 501 irows = if (length(xo)) xo[irows] else irows 502 xo = setorder(setDT(list(indices=rep.int(indices__, len__), irows=irows)))[["irows"]] 503 ans = .Call(CnqRecreateIndices, xo, len__, indices__, max(indices__), nomatch) # issue#4388 fix 504 f__ = ans[[1L]]; len__ = ans[[2L]] 505 allLen1 = FALSE # TODO; should this always be FALSE? 506 irows = NULL # important to reset 507 if (any_na(as_list(xo))) xo = xo[!is.na(xo)] 508 } 509 } else { 510 if (!byjoin) { #1287 and #1271 511 irows = f__ # len__ is set to 1 as well, no need for 'pmin' logic 512 if (identical(nomatch,0L)) irows = irows[len__>0L] # 0s are len 0, so this removes -1 irows 513 } 514 # TODO: when nomatch=NA, len__ need not be allocated / set at all for mult="first"/"last"? 515 # TODO: how about when nomatch=0L, can we avoid allocating then as well? 516 } 517 if (length(xo) && length(irows)) { 518 irows = xo[irows] # TO DO: fsort here? 519 if (mult=="all" && !allGrp1) { # following #1991 fix, !allGrp1 will always be TRUE. TODO: revisit. 520 if (verbose) {last.started.at=proc.time();cat("Reorder irows for 'mult==\"all\" && !allGrp1' ... ");flush.console()} 521 irows = setorder(setDT(list(indices=rep.int(indices__, len__), irows=irows)))[["irows"]] 522 if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} 523 } 524 } 525 if (optimizedSubset){ 526 ## special treatment for calls like DT[x == 3] that are transformed into DT[J(x=3), on = "x==x"] 527 528 if(!.Call(CisOrderedSubset, irows, nrow(x))){ 529 ## restore original order. This is a very expensive operation. 530 ## benchmarks have shown that starting with 1e6 irows, a tweak can significantly reduce time 531 ## (see #2366) 532 if (verbose) {last.started.at=proc.time()[3L];cat("Reordering", length(irows), "rows after bmerge done in ... ");flush.console()} 533 if(length(irows) < 1e6){ 534 irows = fsort(irows, internal=TRUE) ## internally, fsort on integer falls back to forderv 535 } else { 536 irows = as.integer(fsort(as.numeric(irows))) ## nocov; parallelized for numeric, but overhead of type conversion 537 } 538 if (verbose) {cat(round(proc.time()[3L]-last.started.at,3L),"secs\n");flush.console()} 539 } 540 ## make sure, all columns are taken from x and not from i. 541 ## This is done by simply telling data.table to continue as if there was a simple subset 542 leftcols = integer(0L) 543 rightcols = integer(0L) 544 i = irows ## important to make i not a data.table because otherwise Gforce doesn't kick in 545 } 546 } 547 else { 548 if (!missing(on)) { 549 stop("logical error. i is not a data.table, but 'on' argument is provided.") 550 } 551 # TO DO: TODO: Incorporate which_ here on DT[!i] where i is logical. Should avoid i = !i (above) - inefficient. 552 # i is not a data.table 553 if (!is.logical(i) && !is.numeric(i)) stop("i has evaluated to type ", typeof(i), ". Expecting logical, integer or double.") 554 if (is.logical(i)) { 555 if (length(i)==1L # to avoid unname copy when length(i)==nrow (normal case we don't want to slow down) 556 && isTRUE(unname(i))) { irows=i=NULL } # unname() for #2152 - length 1 named logical vector. 557 # NULL is efficient signal to avoid creating 1:nrow(x) but still return all rows, fixes #1249 558 559 else if (length(i)<=1L) { irows=i=integer(0L) } 560 # FALSE, NA and empty. All should return empty data.table. The NA here will be result of expression, 561 # where for consistency of edge case #1252 all NA to be removed. If NA is a single NA symbol, it 562 # was auto converted to NA_integer_ higher up for ease of use and convenience. We definitely 563 # don't want base R behaviour where DF[NA,] returns an entire copy filled with NA everywhere. 564 565 else if (length(i)==nrow(x)) { irows=i=which(i) } 566 # The which() here auto removes NA for convenience so user doesn't need to remember "!is.na() & ..." 567 # Also this which() is for consistency of DT[colA>3,which=TRUE] and which(DT[,colA>3]) 568 # Assigning to 'i' here as well to save memory, #926. 569 570 else stop("i evaluates to a logical vector length ", length(i), " but there are ", nrow(x), " rows. Recycling of logical i is no longer allowed as it hides more bugs than is worth the rare convenience. Explicitly use rep(...,length=.N) if you really need to recycle.") 571 } else { 572 irows = as.integer(i) # e.g. DT[c(1,3)] and DT[c(-1,-3)] ok but not DT[c(1,-3)] (caught as error) 573 irows = .Call(CconvertNegAndZeroIdx, irows, nrow(x), is.null(jsub) || root!=":=") # last argument is allowOverMax (NA when selecting, error when assigning) 574 # simplifies logic from here on: can assume positive subscripts (no zeros) 575 # maintains Arun's fix for #2697 (test 1042) 576 # efficient in C with more detailed helpful messages when user mixes positives and negatives 577 # falls through quickly (no R level allocs) if all items are within range [1,max] with no zeros or negatives 578 # minor TO DO: can we merge this with check_idx in fcast.c/subset ? 579 } 580 } 581 if (notjoin) { 582 if (byjoin || !is.integer(irows) || is.na(nomatch)) stop("Internal error: notjoin but byjoin or !integer or nomatch==NA") # nocov 583 irows = irows[irows!=0L] 584 if (verbose) {last.started.at=proc.time()[3L];cat("Inverting irows for notjoin done in ... ");flush.console()} 585 i = irows = if (length(irows)) seq_len(nrow(x))[-irows] else NULL # NULL meaning all rows i.e. seq_len(nrow(x)) 586 if (verbose) cat(round(proc.time()[3L]-last.started.at, 3L), "sec\n") 587 leftcols = integer() # proceed as if row subset from now on, length(leftcols) is switched on later 588 rightcols = integer() 589 # Doing this once here, helps speed later when repeatedly subsetting each column. R's [irows] would do this for each 590 # column when irows contains negatives. 591 } 592 if (which) return( if (is.null(irows)) seq_len(nrow(x)) else irows ) 593 } else { # missing(i) 594 i = NULL 595 } 596 names_i = names(i) # value is now stable 597 byval = NULL 598 xnrow = nrow(x) 599 xcols = xcolsAns = icols = icolsAns = integer() 600 xdotcols = FALSE 601 # track which columns appear in j through ansvars, and those 602 # that belong in .SD through sdvars. Mostly these will be the same, 603 # except in cases like DT[ , lapply(.SD, function(x) x/V1), .SDcols = !'V1'], 604 # see e.g. #484, #495, #1744, #1965. 605 non_sdvars = character(0L) 606 if (missing(j)) { 607 # missingby was already checked above before dealing with i 608 if (!length(x)) return(null.data.table()) 609 if (!length(leftcols)) { 610 # basic x[i] subset, #2951 611 if (is.null(irows)) return(shallow(x)) # e.g. DT[TRUE] (#3214); otherwise CsubsetDT would materialize a deep copy 612 else return(.Call(CsubsetDT, x, irows, seq_along(x)) ) 613 } else { 614 jisvars = names_i[-leftcols] 615 tt = jisvars %chin% names_x 616 if (length(tt)) jisvars[tt] = paste0("i.",jisvars[tt]) 617 if (length(duprightcols <- rightcols[duplicated(rightcols)])) { 618 nx = c(names_x, names_x[duprightcols]) 619 rightcols = chmatchdup(names_x[rightcols], nx) 620 nx = make.unique(nx) 621 } else nx = names_x 622 ansvars = make.unique(c(nx, jisvars)) 623 icols = c(leftcols, seq_along(i)[-leftcols]) 624 icolsAns = c(rightcols, seq.int(length(nx)+1L, length.out=ncol(i)-length(unique(leftcols)))) 625 xcols = xcolsAns = seq_along(x)[-rightcols] 626 } 627 ansvals = chmatch(ansvars, nx) 628 } 629 else { 630 if (is.data.table(i)) { 631 idotprefix = paste0("i.", names_i) 632 xdotprefix = paste0("x.", names_x) 633 } else idotprefix = xdotprefix = character(0L) 634 635 # j was substituted before dealing with i so that := can set allow.cartesian=FALSE (#800) (used above in i logic) 636 if (is.null(jsub)) return(NULL) 637 638 if (!with && jsub %iscall% ":=") { 639 # TODO: make these both errors (or single long error in both cases) in next release. 640 # i.e. using with=FALSE together with := at all will become an error. Eventually with will be removed. 641 if (is.null(names(jsub)) && is.name(jsub[[2L]])) { 642 warning("with=FALSE together with := was deprecated in v1.9.4 released Oct 2014. Please wrap the LHS of := with parentheses; e.g., DT[,(myVar):=sum(b),by=a] to assign to column name(s) held in variable myVar. See ?':=' for other examples. As warned in 2014, this is now a warning.") 643 jsub[[2L]] = eval(jsub[[2L]], parent.frame(), parent.frame()) 644 } else { 645 warning("with=FALSE ignored, it isn't needed when using :=. See ?':=' for examples.") 646 } 647 with = TRUE 648 } 649 650 if (!with) { 651 # missingby was already checked above before dealing with i 652 if (jsub %iscall% c("!", "-") && length(jsub)==2L) { # length 2 to only match unary, #2109 653 notj = TRUE 654 jsub = jsub[[2L]] 655 } else notj = FALSE 656 # fix for #1216, make sure the parentheses are peeled from expr of the form (((1:4))) 657 while (jsub %iscall% "(") jsub = as.list(jsub)[[-1L]] 658 if (jsub %iscall% ":" && length(jsub)==3L) { 659 j = eval(jsub, setattr(as.list(seq_along(x)), 'names', names_x), parent.frame()) # else j will be evaluated for the first time on next line 660 } else { 661 names(..syms) = ..syms 662 j = eval(jsub, lapply(substring(..syms,3L), get, pos=parent.frame()), parent.frame()) 663 } 664 if (is.logical(j)) j <- which(j) 665 if (!length(j) && !notj) return( null.data.table() ) 666 if (is.factor(j)) j = as.character(j) # fix for FR: #358 667 if (is.character(j)) { 668 if (notj) { 669 if (anyNA(idx <- chmatch(j, names_x))) warning("column(s) not removed because not found: ", brackify(j[is.na(idx)])) 670 # all duplicates of the name in names(x) must be removed; e.g. data.table(x=1, y=2, x=3)[, !"x"] should just output 'y'. 671 w = !names_x %chin% j 672 ansvars = names_x[w] 673 ansvals = which(w) 674 } else { 675 # if DT[, c("x","x")] and "x" is duplicated in names(DT), we still subset only the first. Because dups are unusual and 676 # it's more common to select the same column a few times. A syntax would be needed to distinguish these intents. 677 ansvars = j # x. and i. prefixes may be in here, they'll result in NA and will be dealt with further below if length(leftcols) 678 ansvals = chmatch(ansvars, names_x) # not chmatchdup() 679 } 680 if (!length(ansvals)) return(null.data.table()) 681 if (!length(leftcols)) { 682 if (!anyNA(ansvals)) return(.Call(CsubsetDT, x, irows, ansvals)) 683 else stop("column(s) not found: ", paste(ansvars[is.na(ansvals)],collapse=", ")) 684 } 685 # else the NA in ansvals are for join inherited scope (test 1973), and NA could be in irows from join and data in i should be returned (test 1977) 686 # in both cases leave to the R-level subsetting of i and x together further below 687 } else if (is.numeric(j)) { 688 j = as.integer(j) 689 if (any(w<-(j>ncol(x)))) stop("Item ",which.first(w)," of j is ",j[which.first(w)]," which is outside the column number range [1,ncol=", ncol(x),"]") 690 j = j[j!=0L] 691 if (any(j<0L)) { 692 if (any(j>0L)) stop("j mixes positives and negatives") 693 j = seq_along(x)[j] # all j are <0 here 694 } 695 # 3013 -- handle !FALSE in column subset in j via logical+with 696 if (notj) j = seq_along(x)[if (length(j)) -j else TRUE] 697 if (!length(j)) return(null.data.table()) 698 return(.Call(CsubsetDT, x, irows, j)) 699 } else { 700 stop("When with=FALSE, j-argument should be of type logical/character/integer indicating the columns to select.") # fix for #1440. 701 } 702 } else { # with=TRUE and byjoin could be TRUE 703 bynames = NULL 704 allbyvars = NULL 705 if (byjoin) { 706 bynames = names_x[rightcols] 707 } else if (!missingby) { 708 # deal with by before j because we need byvars when j contains .SD 709 # may evaluate to NULL | character() | "" | list(), likely a result of a user expression where no-grouping is one case being loop'd through 710 bysubl = as.list.default(bysub) 711 bysuborig = bysub 712 if (is.name(bysub) && !(bysub %chin% names_x)) { # TO DO: names(x),names(i),and i. and x. prefixes 713 bysub = eval(bysub, parent.frame(), parent.frame()) 714 # fix for # 5106 - http://stackoverflow.com/questions/19983423/why-by-on-a-vector-not-from-a-data-table-column-is-very-slow 715 # case where by=y where y is not a column name, and not a call/symbol/expression, but an atomic vector outside of DT. 716 # note that if y is a list, this'll return an error (not sure if it should). 717 if (is.atomic(bysub)) bysubl = list(bysuborig) else bysubl = as.list.default(bysub) 718 } 719 if (length(bysubl) && identical(bysubl[[1L]],quote(eval))) { # TO DO: or by=..() 720 bysub = eval(bysubl[[2L]], parent.frame(), parent.frame()) 721 bysub = replace_dot_alias(bysub) # fix for #1298 722 if (is.expression(bysub)) bysub=bysub[[1L]] 723 bysubl = as.list.default(bysub) 724 } else if (bysub %iscall% c("c","key","names", "intersect", "setdiff")) { 725 # catch common cases, so we don't have to copy x[irows] for all columns 726 # *** TO DO ***: try() this eval first (as long as not list() or .()) and see if it evaluates to column names 727 # to avoid the explicit c,key,names which already misses paste("V",1:10) for example 728 # tried before but since not wrapped in try() it failed on some tests 729 # or look for column names used in this by (since if none it wouldn't find column names anyway 730 # when evaled within full x[irows]). Trouble is that colA%%2L is a call and should be within frame. 731 tt = eval(bysub, parent.frame(), parent.frame()) 732 if (!is.character(tt)) stop("by=c(...), key(...) or names(...) must evaluate to 'character'") 733 bysub=tt 734 } else if (is.call(bysub) && !(bysub[[1L]] %chin% c("list", "as.list", "{", ".", ":"))) { 735 # potential use of function, ex: by=month(date). catch it and wrap with "(", because we need to set "bysameorder" to FALSE as we don't know if the function will return ordered results just because "date" is ordered. Fixes #2670. 736 bysub = as.call(c(as.name('('), list(bysub))) 737 bysubl = as.list.default(bysub) 738 } else if (bysub %iscall% ".") bysub[[1L]] = quote(list) 739 740 if (mode(bysub) == "character") { 741 if (length(grep(",", bysub, fixed = TRUE))) { 742 if (length(bysub)>1L) stop("'by' is a character vector length ",length(bysub)," but one or more items include a comma. Either pass a vector of column names (which can contain spaces, but no commas), or pass a vector length 1 containing comma separated column names. See ?data.table for other possibilities.") 743 bysub = strsplit(bysub,split=",")[[1L]] 744 } 745 backtick_idx = grep("^[^`]+$",bysub) 746 if (length(backtick_idx)) bysub[backtick_idx] = paste0("`",bysub[backtick_idx],"`") 747 backslash_idx = grep("\\", bysub, fixed = TRUE) 748 if (length(backslash_idx)) bysub[backslash_idx] = gsub('\\', '\\\\', bysub[backslash_idx], fixed = TRUE) 749 bysub = parse(text=paste0("list(",paste(bysub,collapse=","),")"))[[1L]] 750 bysubl = as.list.default(bysub) 751 } 752 allbyvars = intersect(all.vars(bysub), names_x) 753 orderedirows = .Call(CisOrderedSubset, irows, nrow(x)) # TRUE when irows is NULL (i.e. no i clause). Similar but better than is.sorted(f__) 754 bysameorder = byindex = FALSE 755 if (!bysub %iscall% ":" && ##Fix #4285 756 all(vapply_1b(bysubl, is.name))) { 757 bysameorder = orderedirows && haskey(x) && length(allbyvars) && identical(allbyvars,head(key(x),length(allbyvars))) 758 # either bysameorder or byindex can be true but not both. TODO: better name for bysameorder might be bykeyx 759 if (!bysameorder && keyby && !length(irows) && isTRUE(getOption("datatable.use.index"))) { 760 # TODO: could be allowed if length(irows)>1 but then the index would need to be squashed for use by uniqlist, #3062 761 # find if allbyvars is leading subset of any of the indices; add a trailing "__" to fix #3498 where a longer column name starts with a shorter column name 762 tt = paste0(c(allbyvars,""), collapse="__") 763 w = which.first(substring(paste0(indices(x),"__"),1L,nchar(tt)) == tt) 764 if (!is.na(w)) { 765 byindex = indices(x)[w] 766 if (!length(getindex(x, byindex))) { 767 if (verbose) cat("by index '", byindex, "' but that index has 0 length. Ignoring.\n", sep="") 768 byindex=FALSE 769 } 770 } 771 } 772 } 773 774 if (is.null(irows)) { 775 if (bysub %iscall% ':' && length(bysub)==3L && is.name(bysub[[2L]]) && is.name(bysub[[3L]])) { 776 byval = eval(bysub, setattr(as.list(seq_along(x)), 'names', names_x), parent.frame()) 777 byval = as.list(x)[byval] 778 } else byval = eval(bysub, x, parent.frame()) 779 } else { 780 # length 0 when i returns no rows 781 if (!is.integer(irows)) stop("Internal error: irows isn't integer") # nocov 782 # Passing irows as i to x[] below has been troublesome in a rare edge case. 783 # irows may contain NA, 0, negatives and >nrow(x) here. That's all ok. 784 # But we may need i join column values to be retained (where those rows have no match), hence we tried eval(isub) 785 # in 1.8.3, but this failed test 876. 786 # TO DO: Add a test like X[i,sum(v),by=i.x2], or where by includes a join column (both where some i don't match). 787 # TO DO: Make xss directly, rather than recursive call. 788 if (!is.na(nomatch)) irows = irows[irows!=0L] # TO DO: can be removed now we have CisSortedSubset 789 if (length(allbyvars)) { ############### TO DO TO DO TO DO ############### 790 if (verbose) cat("i clause present and columns used in by detected, only these subset:",paste(allbyvars,collapse=","),"\n") 791 xss = x[irows,allbyvars,with=FALSE,nomatch=nomatch,mult=mult,roll=roll,rollends=rollends] 792 } else { 793 if (verbose) cat("i clause present but columns used in by not detected. Having to subset all columns before evaluating 'by': '",deparse(by),"'\n",sep="") 794 xss = x[irows,nomatch=nomatch,mult=mult,roll=roll,rollends=rollends] 795 } 796 if (bysub %iscall% ':' && length(bysub)==3L) { 797 byval = eval(bysub, setattr(as.list(seq_along(xss)), 'names', names(xss)), parent.frame()) 798 byval = as.list(xss)[byval] 799 } else byval = eval(bysub, xss, parent.frame()) 800 xnrow = nrow(xss) 801 # TO DO: pass xss (x subset) through into dogroups. Still need irows there (for :=), but more condense 802 # and contiguous to use xss to form .SD in dogroups than going via irows 803 } 804 if (!length(byval) && xnrow>0L) { 805 # see missingby up above for comments 806 # by could be NULL or character(0L) for example (e.g. passed in as argument in a loop of different bys) 807 bysameorder = FALSE # 1st and only group is the entire table, so could be TRUE, but FALSE to avoid 808 # a key of empty character() 809 byval = list() 810 bynames = allbyvars = NULL 811 # the rest now fall through 812 } else bynames = names(byval) 813 if (is.atomic(byval)) { 814 if (is.character(byval) && length(byval)<=ncol(x) && !(is.name(bysub) && bysub %chin% names_x) ) { 815 stop("'by' appears to evaluate to column names but isn't c() or key(). Use by=list(...) if you can. Otherwise, by=eval",deparse(bysub)," should work. This is for efficiency so data.table can detect which columns are needed.") 816 } else { 817 # by may be a single unquoted column name but it must evaluate to list so this is a convenience to users. Could also be a single expression here such as DT[,sum(v),by=colA%%2] 818 byval = list(byval) 819 bysubl = c(as.name("list"),bysuborig) # for guessing the column name below 820 if (is.name(bysuborig)) 821 bynames = as.character(bysuborig) 822 else 823 bynames = names(byval) 824 } 825 } 826 if (!is.list(byval)) stop("'by' or 'keyby' must evaluate to a vector or a list of vectors (where 'list' includes data.table and data.frame which are lists, too)") 827 if (length(byval)==1L && is.null(byval[[1L]])) bynull=TRUE #3530 when by=(function()NULL)() 828 if (!bynull) for (jj in seq_len(length(byval))) { 829 if (!typeof(byval[[jj]]) %chin% ORDERING_TYPES) stop("column or expression ",jj," of 'by' or 'keyby' is type ",typeof(byval[[jj]]),". Do not quote column names. Usage: DT[,sum(colC),by=list(colA,month(colB))]") 830 } 831 tt = vapply_1i(byval,length) 832 if (any(tt!=xnrow)) stop(gettextf("The items in the 'by' or 'keyby' list are length(s) (%s). Each must be length %d; the same length as there are rows in x (after subsetting if i is provided).", paste(tt, collapse=","), xnrow, domain='R-data.table')) 833 if (is.null(bynames)) bynames = rep.int("",length(byval)) 834 if (length(idx <- which(!nzchar(bynames))) && !bynull) { 835 # TODO: improve this and unify auto-naming of jsub and bysub 836 if (is.name(bysubl[[1L]]) && bysubl[[1L]] == '{') bysubl = bysubl[[length(bysubl)]] # fix for #3156 837 for (jj in idx) { 838 # Best guess. Use "month" in the case of by=month(date), use "a" in the case of by=a%%2 839 byvars = all.vars(bysubl[[jj+1L]], functions = TRUE) 840 if (length(byvars) == 1L) tt = byvars 841 else { 842 # take the first variable that is (1) not eval (#3758) and (2) starts with a character that can't start a variable name 843 tt = grep("^eval$|^[^[:alpha:]. ]", byvars, invert=TRUE, value=TRUE) 844 # byvars but exclude functions or `0`+`1` becomes `+` 845 tt = if (length(tt)) tt[1L] else all.vars(bysubl[[jj+1L]])[1L] 846 } 847 # fix for #497 848 if (length(byvars) > 1L && tt %chin% all.vars(jsub, FALSE)) { 849 bynames[jj] = deparse(bysubl[[jj+1L]]) 850 if (verbose) 851 cat("by-expression '", bynames[jj], "' is not named, and the auto-generated name '", tt, 852 "' clashed with variable(s) in j. Therefore assigning the entire by-expression as name.\n", sep="") 853 } 854 else bynames[jj] = tt 855 # if user doesn't like this inferred name, user has to use by=list() to name the column 856 } 857 # Fix for #1334 858 if (any(duplicated(bynames))) { 859 bynames = make.unique(bynames) 860 } 861 } 862 setattr(byval, "names", bynames) # byval is just a list not a data.table hence setattr not setnames 863 } 864 865 jvnames = NULL 866 drop_dot = function(x) { 867 if (length(x)!=1L) stop("Internal error: drop_dot passed ",length(x)," items") # nocov 868 if (identical(substring(x<-as.character(x), 1L, 1L), ".") && x %chin% c(".N", ".I", ".GRP", ".NGRP", ".BY")) 869 substring(x, 2L) 870 else 871 x 872 } 873 # handle auto-naming of last item of j (e.g. within {} or if/else, #2478) 874 # e.g. DT[, .(a=sum(v), v, .N), by=] should create columns named a, v, N 875 do_j_names = function(q) { 876 if (!is.call(q) || !is.name(q[[1L]])) return(q) 877 if (q[[1L]] %chin% c('list', '.')) { 878 q[[1L]] = quote(list) 879 qlen = length(q) 880 if (qlen>1L) { 881 nm = names(q[-1L]) # check list(a=sum(v),v) 882 if (is.null(nm)) nm = rep.int("", qlen-1L) 883 # attempt to auto-name unnamed columns 884 for (jj in which(nm=="")) { 885 thisq = q[[jj + 1L]] 886 if (missing(thisq)) stop(gettextf("Item %d of the .() or list() passed to j is missing", jj, domain="R-data.table")) #3507 887 if (is.name(thisq)) nm[jj] = drop_dot(thisq) 888 # TO DO: if call to a[1] for example, then call it 'a' too 889 } 890 if (!is.null(jvnames) && any(idx <- nm != jvnames)) 891 warning("Different branches of j expression produced different auto-named columns: ", brackify(sprintf('%s!=%s', nm[idx], jvnames[idx])), '; using the most "last" names', call. = FALSE) 892 jvnames <<- nm # TODO: handle if() list(a, b) else list(b, a) better 893 setattr(q, "names", NULL) # drops the names from the list so it's faster to eval the j for each group; reinstated at the end on the result. 894 } 895 return(q) # else empty list is needed for test 468: adding an empty list column 896 } 897 if (q[[1L]] == '{') { 898 if (!is.null(q[[qlen<-length(q)]])) q[[qlen]] = do_j_names(q[[qlen]]) 899 return(q) 900 } 901 if (q[[1L]] == 'if') { 902 #explicit NULL would return NULL, assigning NULL would delete that from the expression 903 if (!is.null(q[[3L]])) q[[3L]] = do_j_names(q[[3L]]) 904 if (length(q) == 4L && !is.null(q[[4L]])) q[[4L]] = do_j_names(q[[4L]]) 905 return(q) 906 } 907 return(q) 908 } 909 if (is.name(jsub)) { 910 # j is a single unquoted column name 911 if (jsub!=".SD") jvnames = drop_dot(jsub) 912 # jsub is list()ed after it's eval'd inside dogroups. 913 } else jsub = do_j_names(jsub) # else maybe a call to transform or something which returns a list. 914 av = all.vars(jsub,TRUE) # TRUE fixes bug #1294 which didn't see b in j=fns[[b]](c) 915 use.I = ".I" %chin% av 916 if (any(c(".SD","eval","get","mget") %chin% av)) { 917 if (missing(.SDcols)) { 918 # here we need to use 'dupdiff' instead of 'setdiff'. Ex: setdiff(c("x", "x"), NULL) will give 'x'. 919 # slight memory efficiency boost if we only store sdvars if it differs from ansvars, but a bit tricky to examine all the uses here 920 ansvars = sdvars = dupdiff(names_x, union(bynames, allbyvars)) # TO DO: allbyvars here for vars used by 'by'. Document. 921 # just using .SD in j triggers all non-by columns in the subset even if some of 922 # those columns are not used. It would be tricky to detect whether the j expression 923 # really does use all of the .SD columns or not, hence .SDcols for grouping 924 # over a subset of columns 925 926 # all duplicate columns must be matched, because nothing is provided 927 ansvals = chmatchdup(ansvars, names_x) 928 } else { 929 # FR #355 - negative numeric and character indices for SDcols 930 colsub = substitute(.SDcols) 931 # fix for R-Forge #5190. colsub[[1L]] gave error when it's a symbol. 932 if (colsub %iscall% c("!", "-")) { 933 negate_sdcols = TRUE 934 colsub = colsub[[2L]] 935 } else negate_sdcols = FALSE 936 # fix for #1216, make sure the parentheses are peeled from expr of the form (((1:4))) 937 while(colsub %iscall% "(") colsub = as.list(colsub)[[-1L]] 938 if (colsub %iscall% ':' && length(colsub)==3L) { 939 # .SDcols is of the format a:b 940 .SDcols = eval(colsub, setattr(as.list(seq_along(x)), 'names', names_x), parent.frame()) 941 } else { 942 if (colsub %iscall% 'patterns') { 943 # each pattern gives a new filter condition, intersect the end result 944 .SDcols = Reduce(intersect, do_patterns(colsub, names_x)) 945 } else { 946 .SDcols = eval(colsub, parent.frame(), parent.frame()) 947 # allow filtering via function in .SDcols, #3950 948 if (is.function(.SDcols)) { 949 .SDcols = lapply(x, .SDcols) 950 if (any(idx <- vapply_1i(.SDcols, length) > 1L | vapply_1c(.SDcols, typeof) != 'logical' | vapply_1b(.SDcols, anyNA))) 951 stop("When .SDcols is a function, it is applied to each column; the output of this function must be a non-missing boolean scalar signalling inclusion/exclusion of the column. However, these conditions were not met for: ", brackify(names(x)[idx])) 952 .SDcols = unlist(.SDcols, use.names = FALSE) 953 } 954 } 955 } 956 if (anyNA(.SDcols)) 957 stop(".SDcols missing at the following indices: ", brackify(which(is.na(.SDcols)))) 958 if (is.logical(.SDcols)) { 959 ansvals = which_(rep(.SDcols, length.out=length(x)), !negate_sdcols) 960 ansvars = sdvars = names_x[ansvals] 961 } else if (is.numeric(.SDcols)) { 962 .SDcols = as.integer(.SDcols) 963 # if .SDcols is numeric, use 'dupdiff' instead of 'setdiff' 964 if (length(unique(sign(.SDcols))) > 1L) stop(".SDcols is numeric but has both +ve and -ve indices") 965 if (any(idx <- abs(.SDcols)>ncol(x) | abs(.SDcols)<1L)) 966 stop(".SDcols is numeric but out of bounds [1, ", ncol(x), "] at: ", brackify(which(idx))) 967 ansvars = sdvars = if (negate_sdcols) dupdiff(names_x[-.SDcols], bynames) else names_x[.SDcols] 968 ansvals = if (negate_sdcols) setdiff(seq_along(names(x)), c(.SDcols, which(names(x) %chin% bynames))) else .SDcols 969 } else { 970 if (!is.character(.SDcols)) stop(".SDcols should be column numbers or names") 971 if (!all(idx <- .SDcols %chin% names_x)) 972 stop("Some items of .SDcols are not column names: ", brackify(.SDcols[!idx])) 973 ansvars = sdvars = if (negate_sdcols) setdiff(names_x, c(.SDcols, bynames)) else .SDcols 974 # dups = FALSE here. DT[, .SD, .SDcols=c("x", "x")] again doesn't really help with which 'x' to keep (and if '-' which x to remove) 975 ansvals = chmatch(ansvars, names_x) 976 } 977 } 978 # fix for long standing FR/bug, #495 and #484 979 allcols = c(names_x, xdotprefix, names_i, idotprefix) 980 non_sdvars = setdiff(intersect(av, allcols), c(bynames, ansvars)) 981 982 # added 'mget' - fix for #994 983 if (any(c("get", "mget") %chin% av)){ 984 if (verbose) 985 cat(gettextf("'(m)get' found in j. ansvars being set to all columns. Use .SDcols or a single j=eval(macro) instead. Both will detect the columns used which is important for efficiency.\nOld ansvars: %s \n", brackify(ansvars), domain = "R-data.table")) 986 # get('varname') is too difficult to detect which columns are used in general 987 # eval(macro) column names are detected via the if jsub[[1]]==eval switch earlier above. 988 989 # Do not include z in .SD when dt[, z := {.SD; get("x")}, .SDcols = "y"] (#2326, #2338) 990 if (jsub %iscall% ":=" && is.symbol(jsub[[2L]])) { 991 jsub_lhs_symbol = as.character(jsub[[2L]]) 992 if (jsub_lhs_symbol %chin% non_sdvars) { 993 sdvars = setdiff(sdvars, jsub_lhs_symbol) 994 } 995 } 996 997 if (missing(.SDcols)) { 998 ansvars = setdiff(allcols, bynames) # fix for bug #34 999 } else { 1000 # fixes #4089 - if .SDcols was already evaluated, we do not want the order of the columns to change. 1001 ansvars = union(ansvars, setdiff(setdiff(allcols, ansvars), bynames)) 1002 } 1003 non_sdvars = setdiff(ansvars, sdvars) 1004 ansvals = chmatch(ansvars, names_x) 1005 if (verbose) cat(gettextf("New ansvars: %s \n", brackify(ansvars), domain = "R-data.table")) 1006 } else if (length(non_sdvars)) { 1007 # we've a situation like DT[, c(sum(V1), lapply(.SD, mean)), by=., .SDcols=...] or 1008 # DT[, lapply(.SD, function(x) x *v1), by=, .SDcols=...] etc., 1009 ansvars = union(ansvars, non_sdvars) 1010 ansvals = chmatch(ansvars, names_x) 1011 } 1012 # .SDcols might include grouping columns if users wants that, but normally we expect user not to include them in .SDcols 1013 } else { 1014 if (!missing(.SDcols)) warning("This j doesn't use .SD but .SDcols has been supplied. Ignoring .SDcols. See ?data.table.") 1015 allcols = c(names_x, xdotprefix, names_i, idotprefix) 1016 ansvars = sdvars = setdiff(intersect(av, allcols), bynames) 1017 if (verbose) cat("Detected that j uses these columns:",if (!length(ansvars)) "<none>" else paste(ansvars,collapse=","),"\n") 1018 # using a few named columns will be faster 1019 # Consider: DT[,max(diff(date)),by=list(month=month(date))] 1020 # and: DT[,lapply(.SD,sum),by=month(date)] 1021 # We don't want date in .SD in the latter, but we do in the former; hence the union() above. 1022 ansvals = chmatch(ansvars, names_x) 1023 } 1024 # if (!length(ansvars)) Leave ansvars empty. Important for test 607. 1025 1026 lhs = NULL 1027 newnames = NULL 1028 suppPrint = identity 1029 if (length(av) && av[1L] == ":=") { 1030 if (.Call(C_islocked, x)) stop(".SD is locked. Using := in .SD's j is reserved for possible future use; a tortuously flexible way to modify by group. Use := in j directly to modify by group by reference.") 1031 suppPrint = function(x) { .global$print=address(x); x } 1032 # Suppress print when returns ok not on error, bug #2376. Thanks to: http://stackoverflow.com/a/13606880/403310 1033 # All appropriate returns following this point are wrapped; i.e. return(suppPrint(x)). 1034 1035 if (is.null(names(jsub))) { 1036 # regular LHS:=RHS usage, or `:=`(...) with no named arguments (an error) 1037 # `:=`(LHS,RHS) is valid though, but more because can't see how to detect that, than desire 1038 if (length(jsub)!=3L) stop("In `:=`(col1=val1, col2=val2, ...) form, all arguments must be named.") 1039 lhs = jsub[[2L]] 1040 jsub = jsub[[3L]] 1041 if (is.name(lhs)) { 1042 lhs = as.character(lhs) 1043 } else { 1044 # e.g. (MyVar):= or get("MyVar"):= 1045 lhs = eval(lhs, parent.frame(), parent.frame()) 1046 } 1047 } else { 1048 # `:=`(c2=1L,c3=2L,...) 1049 lhs = names(jsub)[-1L] 1050 if (any(lhs=="")) stop("In `:=`(col1=val1, col2=val2, ...) form, all arguments must be named.") 1051 names(jsub)="" 1052 jsub[[1L]]=as.name("list") 1053 } 1054 av = all.vars(jsub,TRUE) 1055 if (!is.atomic(lhs)) stop("LHS of := must be a symbol, or an atomic vector (column names or positions).") 1056 if (is.character(lhs)) { 1057 m = chmatch(lhs, names_x) 1058 } else if (is.numeric(lhs)) { 1059 m = as.integer(lhs) 1060 if (any(m<1L | ncol(x)<m)) stop("LHS of := appears to be column positions but are outside [1,ncol] range. New columns can only be added by name.") 1061 lhs = names_x[m] 1062 } else 1063 stop("LHS of := isn't column names ('character') or positions ('integer' or 'numeric')") 1064 if (all(!is.na(m))) { 1065 # updates by reference to existing columns 1066 cols = as.integer(m) 1067 newnames=NULL 1068 if (identical(irows, integer())) { 1069 # Empty integer() means no rows e.g. logical i with only FALSE and NA 1070 # got converted to empty integer() by the which() above 1071 # Short circuit and do-nothing since columns already exist. If some don't 1072 # exist then for consistency with cases where irows is non-empty, we need to create 1073 # them of the right type and populate with NA. Which will happen via the regular 1074 # alternative branches below, to cover #759. 1075 # We need this short circuit at all just for convenience. Otherwise users may need to 1076 # fix errors in their RHS when called on empty edge cases, even when the result won't be 1077 # used anyway (so it would be annoying to have to fix it.) 1078 if (verbose) { 1079 cat("No rows match i. No new columns to add so not evaluating RHS of :=\n") 1080 cat("Assigning to 0 row subset of",nrow(x),"rows\n") 1081 } 1082 .Call(Cassign, x, irows, NULL, NULL, NULL) # only purpose is to write 0 to .Last.updated 1083 .global$print = address(x) 1084 return(invisible(x)) 1085 } 1086 } else { 1087 # Adding new column(s). TO DO: move after the first eval in case the jsub has an error. 1088 newnames=setdiff(lhs, names_x) 1089 m[is.na(m)] = ncol(x)+seq_len(length(newnames)) 1090 cols = as.integer(m) 1091 # don't pass verbose to selfrefok here -- only activated when 1092 # ok=-1 which will trigger setalloccol with verbose in the next 1093 # branch, which again calls _selfrefok and returns the message then 1094 if ((ok<-selfrefok(x, verbose=FALSE))==0L) # ok==0 so no warning when loaded from disk (-1) [-1 considered TRUE by R] 1095 warning("Invalid .internal.selfref detected and fixed by taking a (shallow) copy of the data.table so that := can add this new column by reference. At an earlier point, this data.table has been copied by R (or was created manually using structure() or similar). Avoid names<- and attr<- which in R currently (and oddly) may copy the whole data.table. Use set* syntax instead to avoid copying: ?set, ?setnames and ?setattr. If this message doesn't help, please report your use case to the data.table issue tracker so the root cause can be fixed or this message improved.") 1096 if ((ok<1L) || (truelength(x) < ncol(x)+length(newnames))) { 1097 DT = x # in case getOption contains "ncol(DT)" as it used to. TODO: warn and then remove 1098 n = length(newnames) + eval(getOption("datatable.alloccol")) # TODO: warn about expressions and then drop the eval() 1099 # i.e. reallocate at the size as if the new columns were added followed by setalloccol(). 1100 name = substitute(x) 1101 if (is.name(name) && ok && verbose) { # && NAMED(x)>0 (TO DO) # ok here includes -1 (loaded from disk) 1102 cat("Growing vector of column pointers from truelength ", truelength(x), " to ", n, ". A shallow copy has been taken, see ?setalloccol. Only a potential issue if two variables point to the same data (we can't yet detect that well) and if not you can safely ignore this. To avoid this message you could setalloccol() first, deep copy first using copy(), wrap with suppressWarnings() or increase the 'datatable.alloccol' option.\n") 1103 # #1729 -- copying to the wrong environment here can cause some confusion 1104 if (ok == -1L) cat("Note that the shallow copy will assign to the environment from which := was called. That means for example that if := was called within a function, the original table may be unaffected.\n") 1105 1106 # Verbosity should not issue warnings, so cat rather than warning. 1107 # TO DO: Add option 'datatable.pedantic' to turn on warnings like this. 1108 1109 # TO DO ... comments moved up from C ... 1110 # Note that the NAMED(dt)>1 doesn't work because .Call 1111 # always sets to 2 (see R-ints), it seems. Work around 1112 # may be possible but not yet working. When the NAMED test works, we can drop allocwarn argument too 1113 # because that's just passed in as FALSE from [<- where we know `*tmp*` isn't really NAMED=2. 1114 # Note also that this growing will happen for missing columns assigned NULL, too. But so rare, we 1115 # don't mind. 1116 } 1117 setalloccol(x, n, verbose=verbose) # always assigns to calling scope; i.e. this scope 1118 if (is.name(name)) { 1119 assign(as.character(name),x,parent.frame(),inherits=TRUE) 1120 } else if (name %iscall% c('$', '[[') && is.name(name[[2L]])) { 1121 k = eval(name[[2L]], parent.frame(), parent.frame()) 1122 if (is.list(k)) { 1123 origj = j = if (name[[1L]] == "$") as.character(name[[3L]]) else eval(name[[3L]], parent.frame(), parent.frame()) 1124 if (is.character(j)) { 1125 if (length(j)!=1L) stop("Cannot assign to an under-allocated recursively indexed list -- L[[i]][,:=] syntax is only valid when i is length 1, but it's length ", length(j)) 1126 j = match(j, names(k)) 1127 if (is.na(j)) stop("Internal error -- item '", origj, "' not found in names of list") # nocov 1128 } 1129 .Call(Csetlistelt,k,as.integer(j), x) 1130 } else if (is.environment(k) && exists(as.character(name[[3L]]), k)) { 1131 assign(as.character(name[[3L]]), x, k, inherits=FALSE) 1132 } 1133 } # TO DO: else if env$<- or list$<- 1134 } 1135 } 1136 } 1137 } 1138 1139 if (length(ansvars)) { 1140 w = ansvals 1141 if (length(rightcols) && missingby) { 1142 w[ w %in% rightcols ] = NA 1143 } 1144 # patch for #1615. Allow 'x.' syntax. Only useful during join op when x's join col needs to be used. 1145 # Note that I specifically have not implemented x[y, aa, on=c(aa="bb")] to refer to x's join column 1146 # as well because x[i, col] == x[i][, col] will not be TRUE anymore.. 1147 if ( any(xdotprefixvals <- ansvars %chin% xdotprefix)) { 1148 w[xdotprefixvals] = chmatch(ansvars[xdotprefixvals], xdotprefix) 1149 xdotcols = TRUE 1150 } 1151 if (!any(wna <- is.na(w))) { 1152 xcols = w 1153 xcolsAns = seq_along(ansvars) 1154 icols = icolsAns = integer() 1155 } else { 1156 if (!length(leftcols)) stop("Internal error -- column(s) not found: ", paste(ansvars[wna],collapse=", ")) # nocov 1157 xcols = w[!wna] 1158 xcolsAns = which(!wna) 1159 map = c(seq_along(i), leftcols) # this map is to handle dups in leftcols, #3635 1160 names(map) = c(names_i, names_x[rightcols]) 1161 w2 = map[ansvars[wna]] 1162 if (any(w2na <- is.na(w2))) { 1163 ivars = paste0("i.", names_i) # ivars is only used in this branch 1164 ivars[leftcols] = names_i[leftcols] 1165 w2[w2na] = chmatch(ansvars[wna][w2na], ivars) 1166 if (any(w2na <- is.na(w2))) { 1167 ivars[leftcols] = paste0("i.",ivars[leftcols]) 1168 w2[w2na] = chmatch(ansvars[wna][w2na], ivars) 1169 if (any(w2na <- is.na(w2))) stop("Internal error -- column(s) not found: ", paste(ansvars[wna][w2na],sep=", ")) # nocov 1170 } 1171 } 1172 icols = w2 1173 icolsAns = which(wna) 1174 } 1175 } 1176 } # end of if !missing(j) 1177 1178 SDenv = new.env(parent=parent.frame()) 1179 # taking care of warnings for posixlt type, #646 1180 SDenv$strptime = function(x, ...) { 1181 warning("strptime() usage detected and wrapped with as.POSIXct(). This is to minimize the chance of assigning POSIXlt columns, which use 40+ bytes to store one date (versus 8 for POSIXct). Use as.POSIXct() (which will call strptime() as needed internally) to avoid this warning.") 1182 as.POSIXct(base::strptime(x, ...)) 1183 } 1184 1185 syms = all.vars(jsub) 1186 syms = syms[ substring(syms,1L,2L)==".." ] 1187 syms = syms[ substring(syms,3L,3L)!="." ] # exclude ellipsis 1188 for (sym in syms) { 1189 if (sym %chin% names_x) { 1190 # if "..x" exists as column name, use column, for backwards compatibility; e.g. package socialmixr in rev dep checks #2779 1191 next 1192 # TODO in future, as warned in NEWS item for v1.11.0 : 1193 # warning(sym," in j is looking for ",getName," in calling scope, but a column '", sym, "' exists. Column names should not start with ..") 1194 } 1195 getName = substring(sym, 3L) 1196 if (!exists(getName, parent.frame())) { 1197 if (exists(sym, parent.frame())) next # user did 'manual' prefix; i.e. variable in calling scope has .. prefix 1198 stop("Variable '",getName,"' is not found in calling scope. Looking in calling scope because this symbol was prefixed with .. in the j= parameter.") 1199 } 1200 assign(sym, get(getName, parent.frame()), SDenv) 1201 } 1202 # hash=TRUE (the default) does seem better as expected using e.g. test 645. TO DO experiment with 'size' argument 1203 if (missingby || bynull || (!byjoin && !length(byval))) { 1204 # No grouping: 'by' = missing | NULL | character() | "" | list() 1205 # Considered passing a one-group to dogroups but it doesn't do the recycling of i within group, that's done here 1206 if (length(ansvars)) { 1207 if (!(length(i) && length(icols))) { 1208 # new in v1.12.0 to redirect to CsubsetDT in this case 1209 if (!identical(xcolsAns, seq_along(xcolsAns)) || length(xcols)!=length(xcolsAns) || length(ansvars)!=length(xcolsAns)) { 1210 stop("Internal error: xcolAns does not pass checks: ", length(xcolsAns), length(ansvars), length(xcols), paste(xcolsAns,collapse=",")) # nocov 1211 } 1212 # Retained from old R way below (test 1542.01 checks shallow at this point) 1213 # ' Temp fix for #921 - skip COPY until after evaluating 'jval' (scroll down). 1214 # ' Unless 'with=FALSE' - can not be expressions but just column names. 1215 ans = if (with && is.null(irows)) shallow(x, xcols) else .Call(CsubsetDT, x, irows, xcols) 1216 setattr(ans, "names", ansvars) 1217 } else { 1218 # length(i) && length(icols) 1219 if (is.null(irows)) { 1220 stop("Internal error: irows is NULL when making join result at R level. Should no longer happen now we use CsubsetDT earlier.") # nocov 1221 # TODO: Make subsetDT do a shallow copy when irows is NULL (it currently copies). Then copy only when user uses := or set* on the result 1222 # by using NAMED/REFCNT on columns, with warning if they copy. Since then, even foo = DT$b would cause the next set or := to copy that 1223 # column (so the warning is needed). To tackle that, we could have our own DT.NAMED attribute, perhaps. 1224 # Or keep the rule that [.data.table always returns new memory, and create view() or view= as well, maybe cleaner. 1225 } 1226 ans = vector("list", length(ansvars)) 1227 ii = rep.int(indices__, len__) # following #1991 fix 1228 # TODO: if (allLen1 && allGrp1 && (is.na(nomatch) || !any(f__==0L))) then ii will be 1:nrow(i) [nomatch=0 should drop rows in i that have no match] 1229 # But rather than that complex logic here at R level to catch that and do a shallow copy for efficiency, just do the check inside CsubsetDT 1230 # to see if it passed 1:nrow(x) and then CsubsetDT should do the shallow copy safely and centrally. 1231 # That R level branch was taken out in PR #3213 1232 1233 # TO DO: use CsubsetDT twice here and then remove this entire R level branch 1234 for (s in seq_along(icols)) { 1235 target = icolsAns[s] 1236 source = icols[s] 1237 ans[[target]] = .Call(CsubsetVector,i[[source]],ii) # i.e. i[[source]][ii] 1238 } 1239 for (s in seq_along(xcols)) { 1240 target = xcolsAns[s] 1241 source = xcols[s] 1242 ans[[target]] = .Call(CsubsetVector,x[[source]],irows) # i.e. x[[source]][irows], but guaranteed new memory even for singleton logicals from R 3.1.0 1243 } 1244 setattr(ans, "names", ansvars) 1245 if (haskey(x)) { 1246 keylen = which.first(!key(x) %chin% ansvars)-1L 1247 if (is.na(keylen)) keylen = length(key(x)) 1248 len = length(rightcols) 1249 # fix for #1268, #1704, #1766 and #1823 1250 chk = if (len && !missing(on)) !identical(head(key(x), len), names(on)) else FALSE 1251 if ( (keylen>len || chk) && !.Call(CisOrderedSubset, irows, nrow(x))) { 1252 keylen = if (!chk) len else 0L # fix for #1268 1253 } 1254 ## check key on i as well! 1255 ichk = is.data.table(i) && haskey(i) && 1256 identical(head(key(i), length(leftcols)), names_i[leftcols]) # i has the correct key, #3061 1257 if (keylen && (ichk || is.logical(i) || (.Call(CisOrderedSubset, irows, nrow(x)) && ((roll == FALSE) || length(irows) == 1L)))) # see #1010. don't set key when i has no key, but irows is ordered and roll != FALSE 1258 setattr(ans,"sorted",head(key(x),keylen)) 1259 } 1260 setattr(ans, "class", class(x)) # fix for #64 1261 setattr(ans, "row.names", .set_row_names(nrow(ans))) 1262 setalloccol(ans) 1263 } 1264 1265 if (!with || missing(j)) return(ans) 1266 1267 SDenv$.SDall = ans 1268 SDenv$.SD = if (length(non_sdvars)) shallow(SDenv$.SDall, sdvars) else SDenv$.SDall 1269 SDenv$.N = nrow(ans) 1270 1271 } else { 1272 SDenv$.SDall = SDenv$.SD = null.data.table() # no columns used by j so .SD can be empty. Only needs to exist so that we can rely on it being there when locking it below for example. If .SD were used by j, of course then xvars would be the columns and we wouldn't be in this leaf. 1273 SDenv$.N = if (is.null(irows)) nrow(x) else if(!length(irows) || identical(max(irows), 0L)) 0L else length(irows) 1274 # Fix for #963. 1275 # When irows is integer(0L), length(irows) = 0 will result in 0 (as expected). 1276 # Binary search can return all 0 irows when none of the input matches. Instead of doing all(irows==0L) (previous method), which has to allocate a logical vector the size of irows, we can make use of 'max'. If max is 0, we return 0. The condition where only some irows > 0 won't occur. 1277 } 1278 # Temp fix for #921. Allocate `.I` only if j-expression uses it. 1279 SDenv$.I = if (!missing(j) && use.I) seq_len(SDenv$.N) else 0L 1280 SDenv$.GRP = 1L 1281 SDenv$.NGRP = 1L 1282 .Call(C_lock, SDenv$.SD) # used to stop := modifying .SD via j=f(.SD), bug#1727. The more common case of j=.SD[,subcol:=1] was already caught when jsub is inspected for :=. 1283 .Call(C_lock, SDenv$.SDall) 1284 lockBinding(".SD",SDenv) 1285 lockBinding(".SDall",SDenv) 1286 lockBinding(".N",SDenv) 1287 lockBinding(".I",SDenv) 1288 lockBinding(".GRP",SDenv) 1289 lockBinding(".NGRP", SDenv) 1290 for (ii in ansvars) assign(ii, SDenv$.SDall[[ii]], SDenv) 1291 # Since .SD is inside SDenv, alongside its columns as variables, R finds .SD symbol more quickly, if used. 1292 # There isn't a copy of the columns here, the xvar symbols point to the SD columns (copy-on-write). 1293 1294 if (is.name(jsub) && is.null(lhs) && !exists(jsubChar<-as.character(jsub), SDenv, inherits=FALSE)) { 1295 stop("j (the 2nd argument inside [...]) is a single symbol but column name '",jsubChar,"' is not found. Perhaps you intended DT[, ..",jsubChar,"]. This difference to data.frame is deliberate and explained in FAQ 1.1.") 1296 } 1297 1298 jval = eval(jsub, SDenv, parent.frame()) 1299 .Call(C_unlock, jval) # in case jval inherits .SD's lock, #1341 #2245. .Call directly (not via an R function like setattr or unlock) to avoid bumping jval's MAYBE_SHARED. 1300 1301 # copy 'jval' when required 1302 # More speedup - only check + copy if irows is NULL 1303 # Temp fix for #921 - check address and copy *after* evaluating 'jval'. #75 also related. 1304 if (is.null(irows)) { 1305 if (!is.list(jval)) { # performance improvement when i-arg is S4, but not list, #1438, Thanks @DCEmilberg. 1306 jcpy = address(jval) %in% vapply_1c(SDenv$.SD, address) # %chin% errors when RHS is list() 1307 if (jcpy) jval = copy(jval) 1308 } else if (address(jval) == address(SDenv$.SD)) { 1309 jval = copy(jval) 1310 } else if ( length(jcpy <- which(vapply_1c(jval, address) %chin% vapply_1c(SDenv, address))) ) { 1311 for (jidx in jcpy) jval[[jidx]] = copy(jval[[jidx]]) 1312 } else if (jsub %iscall% 'get') { 1313 jval = copy(jval) # fix for #1212 1314 } 1315 } 1316 1317 if (!is.null(lhs)) { 1318 # TODO?: use set() here now that it can add new columns. Then remove newnames and alloc logic above. 1319 .Call(Cassign,x,irows,cols,newnames,jval) 1320 return(suppPrint(x)) 1321 } 1322 if ((is.call(jsub) && jsub[[1L]] != "get" && is.list(jval) && !is.object(jval)) || !missingby) { 1323 # is.call: selecting from a list column should return list 1324 # is.object: for test 168 and 168.1 (S4 object result from ggplot2::qplot). Just plain list results should result in data.table 1325 1326 # Fix for #813 and #758. Ex: DT[c(FALSE, FALSE), list(integer(0L), y)] 1327 # where DT = data.table(x=1:2, y=3:4) should return an empty data.table!! 1328 if (!is.null(irows) && `||`( 1329 identical(irows, integer(0L)) && !bynull, 1330 length(irows) && !anyNA(irows) && all(irows==0L) ## anyNA() because all() returns NA (not FALSE) when irows is all-NA. TODO: any way to not check all 'irows' values? 1331 )) 1332 jval = lapply(jval, `[`, 0L) 1333 if (is.atomic(jval)) { 1334 setattr(jval,"names",NULL) # discard names of named vectors otherwise each cell in the column would have a name 1335 jval = list(jval) 1336 } 1337 if (!is.null(jvnames) && !all(jvnames=="")) setattr(jval, 'names', jvnames) # e.g. jvnames=="N" for DT[,.N,] 1338 jval = as.data.table.list(jval, .named=NULL) 1339 } 1340 1341 if (is.data.table(jval)) { 1342 setattr(jval, 'class', class(x)) # fix for #64 1343 if (haskey(x) && all(key(x) %chin% names(jval)) && is.sorted(jval, by=key(x))) 1344 setattr(jval, 'sorted', key(x)) 1345 if (any(sapply(jval, is.null))) stop("Internal error: j has created a data.table result containing a NULL column") # nocov 1346 } 1347 return(jval) 1348 } 1349 1350 ########################################################################### 1351 # Grouping ... 1352 ########################################################################### 1353 1354 o__ = integer() 1355 if (".N" %chin% ansvars) stop("The column '.N' can't be grouped because it conflicts with the special .N variable. Try setnames(DT,'.N','N') first.") 1356 if (".I" %chin% ansvars) stop("The column '.I' can't be grouped because it conflicts with the special .I variable. Try setnames(DT,'.I','I') first.") 1357 SDenv$.iSD = NULL # null.data.table() 1358 SDenv$.xSD = NULL # null.data.table() - introducing for FR #2693 and Gabor's post on fixing for FAQ 2.8 1359 1360 SDenv$print = function(x, ...){ base::print(x,...); NULL} 1361 # Now ggplot2 returns data from print, we need a way to throw it away otherwise j accumulates the result 1362 1363 SDenv$.SDall = SDenv$.SD = null.data.table() # e.g. test 607. Grouping still proceeds even though no .SD e.g. grouping key only tables, or where j consists of .N only 1364 SDenv$.N = vector("integer", 1L) # explicit new vector (not 0L or as.integer() which might return R's internal small-integer global) 1365 SDenv$.GRP = vector("integer", 1L) # because written to by reference at C level (one write per group). TODO: move this alloc to C level 1366 1367 # #3694/#761 common gotcha -- doing t1-t0 by group, but -.POSIXt uses units='auto' 1368 # independently by group & attr mismatch among groups is ignored. The latter 1369 # is a more general issue but the former can be fixed by forcing units='secs' 1370 SDenv$`-.POSIXt` = function(e1, e2) { 1371 if (inherits(e2, 'POSIXt')) { 1372 if (verbose && !exists('done_units_report', parent.frame())) { 1373 cat('\nNote: forcing units="secs" on implicit difftime by group; call difftime explicitly to choose custom units') 1374 assign('done_units_report', TRUE, parent.frame()) 1375 } 1376 return(difftime(e1, e2, units='secs')) 1377 } else return(base::`-.POSIXt`(e1, e2)) 1378 } 1379 1380 if (byjoin) { 1381 # The groupings come instead from each row of the i data.table. 1382 # Much faster for a few known groups vs a 'by' for all followed by a subset 1383 if (!is.data.table(i)) stop("logical error. i is not data.table, but mult='all' and 'by'=.EACHI") 1384 byval = i 1385 bynames = if (missing(on)) head(key(x),length(leftcols)) else names(on) 1386 allbyvars = NULL 1387 bysameorder = haskey(i) || (is.sorted(f__) && ((roll == FALSE) || length(f__) == 1L)) # Fix for #1010 1388 ## 'av' correct here ?? *** TO DO *** 1389 xjisvars = intersect(av, names_x[rightcols]) # no "x." for xvars. 1390 # if 'get' is in 'av' use all cols in 'i', fix for bug #34 1391 # added 'mget' - fix for #994 1392 jisvars = if (any(c("get", "mget") %chin% av)) names_i else intersect(gsub("^i[.]","", setdiff(av, xjisvars)), names_i) 1393 # JIS (non join cols) but includes join columns too (as there are named in i) 1394 if (length(jisvars)) { 1395 tt = min(nrow(i),1L) # min() is here for when nrow(i)==0 1396 SDenv$.iSD = i[tt,jisvars,with=FALSE] 1397 for (ii in jisvars) { 1398 assign(ii, SDenv$.iSD[[ii]], SDenv) 1399 assign(paste0("i.",ii), SDenv$.iSD[[ii]], SDenv) 1400 } 1401 } 1402 1403 } else { 1404 # Find the groups, using 'byval' ... 1405 if (missingby) stop("Internal error: by= is missing") # nocov 1406 1407 if (length(byval) && length(byval[[1L]])) { 1408 if (!bysameorder && isFALSE(byindex)) { 1409 if (verbose) {last.started.at=proc.time();cat("Finding groups using forderv ... ");flush.console()} 1410 o__ = forderv(byval, sort=keyby, retGrp=TRUE) 1411 # The sort= argument is called sortGroups at C level. It's primarily for saving the sort of unique strings at 1412 # C level for efficiency when by= not keyby=. Other types also retain appearance order, but at byte level to 1413 # minimize data movement and benefit from skipping subgroups which happen to be grouped but not sorted. This byte 1414 # appearance order is not the same as the order of group values within by= columns, so the 2nd forder below is 1415 # still needed to get the group appearance order. Always passing sort=TRUE above won't change any result at all 1416 # (tested and confirmed), it'll just make by= slower. It must be TRUE when keyby= though since the key is just 1417 # marked afterwards. 1418 # forderv() returns empty integer() if already ordered to save allocating 1:xnrow 1419 bysameorder = orderedirows && !length(o__) 1420 if (verbose) { 1421 cat(timetaken(last.started.at),"\n") 1422 last.started.at=proc.time() 1423 cat("Finding group sizes from the positions (can be avoided to save RAM) ... ") 1424 flush.console() # for windows 1425 } 1426 f__ = attr(o__, "starts", exact=TRUE) 1427 len__ = uniqlengths(f__, xnrow) 1428 if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} 1429 if (!bysameorder && !keyby) { 1430 # TO DO: lower this into forder.c 1431 if (verbose) {last.started.at=proc.time();cat("Getting back original order ... ");flush.console()} 1432 firstofeachgroup = o__[f__] 1433 if (length(origorder <- forderv(firstofeachgroup))) { 1434 f__ = f__[origorder] 1435 len__ = len__[origorder] 1436 } 1437 if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} 1438 } 1439 if (!orderedirows && !length(o__)) o__ = seq_len(xnrow) # temp fix. TODO: revist orderedirows 1440 } else { 1441 if (verbose) last.started.at=proc.time(); 1442 if (bysameorder) { 1443 if (verbose) {cat("Finding groups using uniqlist on key ... ");flush.console()} 1444 f__ = uniqlist(byval) 1445 } else { 1446 if (!is.character(byindex) || length(byindex)!=1L) stop("Internal error: byindex not the index name") # nocov 1447 if (verbose) {cat("Finding groups using uniqlist on index '", byindex, "' ... ", sep="");flush.console()} 1448 o__ = getindex(x, byindex) 1449 if (is.null(o__)) stop("Internal error: byindex not found") # nocov 1450 f__ = uniqlist(byval, order=o__) 1451 } 1452 if (verbose) { 1453 cat(timetaken(last.started.at),"\n") 1454 last.started.at=proc.time() 1455 cat("Finding group sizes from the positions (can be avoided to save RAM) ... ") 1456 flush.console() # for windows 1457 } 1458 len__ = uniqlengths(f__, xnrow) 1459 # TO DO: combine uniqlist and uniquelengths into one call. Or, just set len__ to NULL when dogroups infers that. 1460 if (verbose) { cat(timetaken(last.started.at),"\n"); flush.console() } 1461 } 1462 } else { 1463 f__=NULL 1464 len__=0L 1465 bysameorder=TRUE # for test 724 1466 } 1467 # TO DO: allow secondary keys to be stored, then we see if our by matches one, if so use it, and no need to sort again. TO DO: document multiple keys. 1468 } 1469 if (length(xcols)) { 1470 # TODO add: if (max(len__)==nrow) stop("There is no need to deep copy x in this case") 1471 # TODO move down to dogroup.c, too. 1472 SDenv$.SDall = .Call(CsubsetDT, x, if (length(len__)) seq_len(max(len__)) else 0L, xcols) # must be deep copy when largest group is a subset 1473 if (xdotcols) setattr(SDenv$.SDall, 'names', ansvars[xcolsAns]) # now that we allow 'x.' prefix in 'j', #2313 bug fix - [xcolsAns] 1474 SDenv$.SD = if (length(non_sdvars)) shallow(SDenv$.SDall, sdvars) else SDenv$.SDall 1475 } 1476 if (nrow(SDenv$.SDall)==0L) { 1477 setattr(SDenv$.SDall,"row.names",c(NA_integer_,0L)) 1478 setattr(SDenv$.SD,"row.names",c(NA_integer_,0L)) 1479 } 1480 # .set_row_names() basically other than not integer() for 0 length, otherwise dogroups has no [1] to modify to -.N 1481 .Call(C_lock, SDenv$.SD) # stops := modifying .SD via j=f(.SD), bug#1727. The more common case of j=.SD[,subcol:=1] was already caught when jsub is inspected for :=. 1482 .Call(C_lock, SDenv$.SDall) 1483 lockBinding(".SD",SDenv) 1484 lockBinding(".SDall",SDenv) 1485 lockBinding(".N",SDenv) 1486 lockBinding(".GRP",SDenv) 1487 lockBinding(".iSD",SDenv) 1488 1489 SDenv$.NGRP = length(f__) 1490 lockBinding(".NGRP", SDenv) 1491 1492 GForce = FALSE 1493 if ( getOption("datatable.optimize")>=1L && (is.call(jsub) || (is.name(jsub) && jsub %chin% c(".SD", ".N"))) ) { # Ability to turn off if problems or to benchmark the benefit 1494 # Optimization to reduce overhead of calling lapply over and over for each group 1495 oldjsub = jsub 1496 funi = 1L # Fix for #985 1497 # converted the lapply(.SD, ...) to a function and used below, easier to implement FR #2722 then. 1498 .massageSD = function(jsub) { 1499 txt = as.list(jsub)[-1L] 1500 if (length(names(txt))>1L) .Call(Csetcharvec, names(txt), 2L, "") # fixes bug #110 1501 fun = txt[[2L]] 1502 if (fun %iscall% "function") { 1503 # Fix for #2381: added SDenv$.SD to 'eval' to take care of cases like: lapply(.SD, function(x) weighted.mean(x, bla)) where "bla" is a column in DT 1504 # http://stackoverflow.com/questions/13441868/data-table-and-stratified-means 1505 # adding this does not compromise in speed (that is, not any lesser than without SDenv$.SD) 1506 # replaced SDenv$.SD to SDenv to deal with Bug #87 reported by Ricardo (Nice catch!) 1507 thisfun = paste0("..FUN", funi) # Fix for #985 1508 assign(thisfun,eval(fun, SDenv, SDenv), SDenv) # to avoid creating function() for each column of .SD 1509 lockBinding(thisfun,SDenv) 1510 txt[[1L]] = as.name(thisfun) 1511 } else { 1512 if (is.character(fun)) fun = as.name(fun) 1513 txt[[1L]] = fun 1514 } 1515 ans = vector("list", length(sdvars)+1L) 1516 ans[[1L]] = as.name("list") 1517 for (ii in seq_along(sdvars)) { 1518 txt[[2L]] = as.name(sdvars[ii]) 1519 ans[[ii+1L]] = as.call(txt) 1520 } 1521 jsub = as.call(ans) # important no names here 1522 jvnames = sdvars # but here instead 1523 list(jsub, jvnames) 1524 # It may seem inefficient to construct a potentially long expression. But, consider calling 1525 # lapply 100000 times. The C code inside lapply does the LCONS stuff anyway, every time it 1526 # is called, involving small memory allocations. 1527 # The R level lapply calls as.list which needs a shallow copy. 1528 # lapply also does a setAttib of names (duplicating the same names over and over again 1529 # for each group) which is terrible for our needs. We replace all that with a 1530 # (ok, long, but not huge in memory terms) list() which is primitive (so avoids symbol 1531 # lookup), and the eval() inside dogroups hardly has to do anything. All this results in 1532 # overhead minimised. We don't need to worry about the env passed to the eval in a possible 1533 # lapply replacement, or how to pass ... efficiently to it. 1534 # Plus we optimize lapply first, so that mean() can be optimized too as well, next. 1535 } 1536 if (is.name(jsub)) { 1537 if (jsub == ".SD") { 1538 jsub = as.call(c(quote(list), lapply(sdvars, as.name))) 1539 jvnames = sdvars 1540 } 1541 } else if (length(as.character(jsub[[1L]])) == 1L) { # Else expect problems with <jsub[[1L]] == > 1542 # g[[ only applies to atomic input, for now, was causing #4159. be sure to eval with enclos=parent.frame() for #4612 1543 subopt = length(jsub) == 3L && 1544 (jsub[[1L]] == "[" || 1545 (jsub[[1L]] == "[[" && is.name(jsub[[2L]]) && eval(call('is.atomic', jsub[[2L]]), x, parent.frame()))) && 1546 (is.numeric(jsub[[3L]]) || jsub[[3L]] == ".N") 1547 headopt = jsub[[1L]] == "head" || jsub[[1L]] == "tail" 1548 firstopt = jsub[[1L]] == "first" || jsub[[1L]] == "last" # fix for #2030 1549 if ((length(jsub) >= 2L && jsub[[2L]] == ".SD") && 1550 (subopt || headopt || firstopt)) { 1551 if (headopt && length(jsub)==2L) jsub[["n"]] = 6L # head-tail n=6 when missing #3462 1552 # optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet. 1553 jsub = as.call(c(quote(list), lapply(sdvars, function(x) { jsub[[2L]] = as.name(x); jsub }))) 1554 jvnames = sdvars 1555 } else if (jsub[[1L]]=="lapply" && jsub[[2L]]==".SD" && length(xcols)) { 1556 deparse_ans = .massageSD(jsub) 1557 jsub = deparse_ans[[1L]] 1558 jvnames = deparse_ans[[2L]] 1559 } else if (jsub[[1L]] == "c" && length(jsub) > 1L) { 1560 # TODO, TO DO: raise the checks for 'jvnames' earlier (where jvnames is set by checking 'jsub') and set 'jvnames' already. 1561 # FR #2722 is just about optimisation of j=c(.N, lapply(.SD, .)) that is taken care of here. 1562 # FR #735 tries to optimise j-expressions of the form c(...) as long as ... contains 1563 # 1) lapply(.SD, ...), 2) simply .SD or .SD[..], 3) .N, 4) list(...) and 5) functions that normally return a single value* 1564 # On 5)* the IMPORTANT point to note is that things that are not wrapped within "list(...)" should *always* 1565 # return length 1 output for us to optimise. Else, there's no equivalent to optimising c(...) to list(...) AFAICT. 1566 # One issue could be that these functions (e.g., mean) can be "re-defined" by the OP to produce a length > 1 output 1567 # Of course this is worrying too much though. If the issue comes up, we'll just remove the relevant optimisations. 1568 # For now, we optimise all functions mentioned in 'optfuns' below. 1569 optfuns = c("max", "min", "mean", "length", "sum", "median", "sd", "var") 1570 is_valid = TRUE 1571 any_SD = FALSE 1572 jsubl = as.list.default(jsub) 1573 oldjvnames = jvnames 1574 jvnames = NULL # TODO: not let jvnames grow, maybe use (number of lapply(.SD, .))*length(sdvars) + other jvars ?? not straightforward. 1575 # Fix for #744. Don't use 'i' in for-loops. It masks the 'i' from the input!! 1576 for (i_ in 2L:length(jsubl)) { 1577 this = jsub[[i_]] 1578 if (is.name(this)) { # no need to check length(this)==1L; is.name() returns single TRUE or FALSE (documented); can't have a vector of names 1579 if (this == ".SD") { # optimise '.SD' alone 1580 any_SD = TRUE 1581 jsubl[[i_]] = lapply(sdvars, as.name) 1582 jvnames = c(jvnames, sdvars) 1583 } else if (this == ".N") { 1584 # don't optimise .I in c(.SD, .I), it's length can be > 1 1585 # only c(.SD, list(.I)) should be optimised!! .N is always length 1. 1586 jvnames = c(jvnames, gsub("^[.]([N])$", "\\1", this)) 1587 } else { 1588 # jvnames = c(jvnames, if (is.null(names(jsubl))) "" else names(jsubl)[i_]) 1589 is_valid=FALSE 1590 break 1591 } 1592 } else if (is.call(this)) { 1593 if (this[[1L]] == "lapply" && this[[2L]] == ".SD" && length(xcols)) { 1594 any_SD = TRUE 1595 deparse_ans = .massageSD(this) 1596 funi = funi + 1L # Fix for #985 1597 jsubl[[i_]] = as.list(deparse_ans[[1L]][-1L]) # just keep the '.' from list(.) 1598 jvnames = c(jvnames, deparse_ans[[2L]]) 1599 } else if (this[[1L]] == "list") { 1600 # also handle c(lapply(.SD, sum), list()) - silly, yes, but can happen 1601 if (length(this) > 1L) { 1602 jl__ = as.list(jsubl[[i_]])[-1L] # just keep the '.' from list(.) 1603 jn__ = if (is.null(names(jl__))) rep("", length(jl__)) else names(jl__) 1604 idx = unlist(lapply(jl__, function(x) is.name(x) && x == ".I")) 1605 if (any(idx)) jn__[idx & (jn__ == "")] = "I" 1606 jvnames = c(jvnames, jn__) 1607 jsubl[[i_]] = jl__ 1608 } 1609 } else if (this %iscall% optfuns && length(this)>1L) { 1610 jvnames = c(jvnames, if (is.null(names(jsubl))) "" else names(jsubl)[i_]) 1611 } else if ( length(this) == 3L && (this[[1L]] == "[" || this[[1L]] == "head") && 1612 this[[2L]] == ".SD" && (is.numeric(this[[3L]]) || this[[3L]] == ".N") ) { 1613 # optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet. 1614 any_SD = TRUE 1615 jsubl[[i_]] = lapply(sdvars, function(x) { this[[2L]] = as.name(x); this }) 1616 jvnames = c(jvnames, sdvars) 1617 } else if (any(all.vars(this) == ".SD")) { 1618 # TODO, TO DO: revisit complex cases (as illustrated below) 1619 # complex cases like DT[, c(.SD[x>1], .SD[J(.)], c(.SD), a + .SD, lapply(.SD, sum)), by=grp] 1620 # hard to optimise such cases (+ difficulty in counting exact columns and therefore names). revert back to no optimisation. 1621 is_valid=FALSE 1622 break 1623 } else { # just to be sure that any other case (I've overlooked) runs smoothly, without optimisation 1624 # TO DO, TODO: maybe a message/warning here so that we can catch the overlooked cases, if any? 1625 is_valid=FALSE 1626 break 1627 } 1628 } else { 1629 is_valid = FALSE 1630 break 1631 } 1632 } 1633 if (!is_valid || !any_SD) { # restore if c(...) doesn't contain lapply(.SD, ..) or if it's just invalid 1634 jvnames = oldjvnames # reset jvnames 1635 jsub = oldjsub # reset jsub 1636 jsubl = as.list.default(jsubl) # reset jsubl 1637 } else { 1638 setattr(jsubl, 'names', NULL) 1639 jsub = as.call(unlist(jsubl, use.names=FALSE)) 1640 jsub[[1L]] = quote(list) 1641 } 1642 } 1643 } 1644 if (verbose) { 1645 if (!identical(oldjsub, jsub)) 1646 cat("lapply optimization changed j from '",deparse(oldjsub),"' to '",deparse(jsub,width.cutoff=200L, nlines=1L),"'\n",sep="") 1647 else 1648 cat("lapply optimization is on, j unchanged as '",deparse(jsub,width.cutoff=200L, nlines=1L),"'\n",sep="") 1649 } 1650 dotN = function(x) is.name(x) && x==".N" # For #334. TODO: Rprof() showed dotN() may be the culprit if iterated (#1470)?; avoid the == which converts each x to character? 1651 # FR #971, GForce kicks in on all subsets, no joins yet. Although joins could work with 1652 # nomatch=0L even now.. but not switching it on yet, will deal it separately. 1653 if (getOption("datatable.optimize")>=2L && !is.data.table(i) && !byjoin && length(f__) && !length(lhs)) { 1654 if (!length(ansvars) && !use.I) { 1655 GForce = FALSE 1656 if ( (is.name(jsub) && jsub==".N") || (jsub %iscall% 'list' && length(jsub)==2L && jsub[[2L]]==".N") ) { 1657 GForce = TRUE 1658 if (verbose) cat("GForce optimized j to '",deparse(jsub, width.cutoff=200L, nlines=1L),"'\n",sep="") 1659 } 1660 } else { 1661 # Apply GForce 1662 .gforce_ok = function(q) { 1663 if (dotN(q)) return(TRUE) # For #334 1664 # run GForce for simple f(x) calls and f(x, na.rm = TRUE)-like calls where x is a column of .SD 1665 # is.symbol() is for #1369, #1974 and #2949 1666 if (!(is.call(q) && is.symbol(q[[1L]]) && is.symbol(q[[2L]]) && (q1 <- q[[1L]]) %chin% gfuns)) return(FALSE) 1667 if (!(q2 <- q[[2L]]) %chin% names(SDenv$.SDall) && q2 != ".I") return(FALSE) # 875 1668 if ((length(q)==2L || identical("na",substring(names(q)[3L], 1L, 2L))) && (!q1 %chin% c("head","tail"))) return(TRUE) 1669 # ... head-tail uses default value n=6 which as of now should not go gforce ^^ 1670 # otherwise there must be three arguments, and only in two cases: 1671 # 1) head/tail(x, 1) or 2) x[n], n>0 1672 length(q)==3L && length(q3 <- q[[3L]])==1L && is.numeric(q3) && 1673 ( (q1 %chin% c("head", "tail") && q3==1L) || ((q1 == "[" || (q1 == "[[" && eval(call('is.atomic', q[[2L]]), envir=x))) && q3>0L) ) 1674 } 1675 if (jsub[[1L]]=="list") { 1676 GForce = TRUE 1677 for (ii in seq.int(from=2L, length.out=length(jsub)-1L)) { 1678 if (!.gforce_ok(jsub[[ii]])) {GForce = FALSE; break} 1679 } 1680 } else GForce = .gforce_ok(jsub) 1681 if (GForce) { 1682 if (jsub[[1L]]=="list") 1683 for (ii in seq_along(jsub)[-1L]) { 1684 if (dotN(jsub[[ii]])) next; # For #334 1685 jsub[[ii]][[1L]] = as.name(paste0("g", jsub[[ii]][[1L]])) 1686 if (length(jsub[[ii]])==3L) jsub[[ii]][[3L]] = eval(jsub[[ii]][[3L]], parent.frame()) # tests 1187.2 & 1187.4 1687 } 1688 else { 1689 jsub[[1L]] = as.name(paste0("g", jsub[[1L]])) 1690 if (length(jsub)==3L) jsub[[3L]] = eval(jsub[[3L]], parent.frame()) # tests 1187.3 & 1187.5 1691 } 1692 if (verbose) cat("GForce optimized j to '",deparse(jsub, width.cutoff=200L, nlines=1L),"'\n",sep="") 1693 } else if (verbose) cat("GForce is on, left j unchanged\n"); 1694 } 1695 } 1696 if (!GForce && !is.name(jsub)) { 1697 # Still do the old speedup for mean, for now 1698 nomeanopt=FALSE # to be set by .optmean() using <<- inside it 1699 oldjsub = jsub 1700 if (jsub[[1L]]=="list") { 1701 # Addressing #1369, #2949 and #1974. This used to be 30s (vs 0.5s) with 30K elements items in j, #1470. Could have been dotN() and/or the for-looped if() 1702 # jsub[[1]]=="list" so the first item of todo will always be FALSE 1703 todo = sapply(jsub, `%iscall%`, 'mean') 1704 if (any(todo)) { 1705 w = which(todo) 1706 jsub[w] = lapply(jsub[w], .optmean) 1707 } 1708 } else if (jsub[[1L]]=="mean") { 1709 jsub = .optmean(jsub) 1710 } 1711 if (nomeanopt) { 1712 warning("Unable to optimize call to mean() and could be very slow. You must name 'na.rm' like that otherwise if you do mean(x,TRUE) the TRUE is taken to mean 'trim' which is the 2nd argument of mean. 'trim' is not yet optimized.",immediate.=TRUE) 1713 } 1714 if (verbose) { 1715 if (!identical(oldjsub, jsub)) 1716 cat("Old mean optimization changed j from '",deparse(oldjsub),"' to '",deparse(jsub, width.cutoff=200L, nlines=1L),"'\n",sep="") 1717 else 1718 cat("Old mean optimization is on, left j unchanged.\n") 1719 } 1720 assign("Cfastmean", Cfastmean, SDenv) 1721 # Old comments still here for now ... 1722 # Here in case nomeanopt=TRUE or some calls to mean weren't detected somehow. Better but still slow. 1723 # Maybe change to : 1724 # assign("mean", fastmean, SDenv) # neater than the hard work above, but slower 1725 # when fastmean can do trim. 1726 } 1727 } else if (verbose) { 1728 if (getOption("datatable.optimize")<1L) cat("All optimizations are turned off\n") 1729 else cat("Optimization is on but left j unchanged (single plain symbol): '",deparse(jsub, width.cutoff=200L, nlines=1L),"'\n",sep="") 1730 } 1731 if (byjoin) { 1732 groups = i 1733 grpcols = leftcols # 'leftcols' are the columns in i involved in the join (either head of key(i) or head along i) 1734 jiscols = chmatch(jisvars, names_i) # integer() if there are no jisvars (usually there aren't, advanced feature) 1735 xjiscols = chmatch(xjisvars, names_x) 1736 SDenv$.xSD = x[min(nrow(i), 1L), xjisvars, with=FALSE] 1737 if (!missing(on)) o__ = xo else o__ = integer(0L) 1738 } else { 1739 groups = byval 1740 grpcols = seq_along(byval) 1741 jiscols = NULL # NULL rather than integer() is used in C to know when using by 1742 xjiscols = NULL 1743 } 1744 lockBinding(".xSD", SDenv) 1745 grporder = o__ 1746 # for #971, added !GForce. if (GForce) we do it much more (memory) efficiently than subset of order vector below. 1747 if (length(irows) && !isTRUE(irows) && !GForce) { 1748 # any zeros in irows were removed by convertNegAndZeroIdx earlier above; no need to check for zeros again. Test 1058-1061 check case #2758. 1749 if (length(o__) && length(irows)!=length(o__)) stop("Internal error: length(irows)!=length(o__)") # nocov 1750 o__ = if (length(o__)) irows[o__] # better do this once up front (even though another alloc) than deep repeated branch in dogroups.c 1751 else irows 1752 } # else grporder is left bound to same o__ memory (no cost of copy) 1753 if (is.null(lhs)) cols=NULL 1754 if (!length(f__)) { 1755 # for consistency of empty case in test 184 1756 f__=len__=0L 1757 } 1758 if (verbose) {last.started.at=proc.time();cat("Making each group and running j (GForce ",GForce,") ... ",sep="");flush.console()} 1759 if (GForce) { 1760 thisEnv = new.env() # not parent=parent.frame() so that gsum is found 1761 for (ii in ansvars) assign(ii, x[[ii]], thisEnv) 1762 assign(".N", len__, thisEnv) # For #334 1763 #fix for #1683 1764 if (use.I) assign(".I", seq_len(nrow(x)), thisEnv) 1765 ans = gforce(thisEnv, jsub, o__, f__, len__, irows) # irows needed for #971. 1766 gi = if (length(o__)) o__[f__] else f__ 1767 g = lapply(grpcols, function(i) groups[[i]][gi]) 1768 ans = c(g, ans) 1769 } else { 1770 ans = .Call(Cdogroups, x, xcols, groups, grpcols, jiscols, xjiscols, grporder, o__, f__, len__, jsub, SDenv, cols, newnames, !missing(on), verbose) 1771 } 1772 # unlock any locked data.table components of the answer, #4159 1773 # MAX_DEPTH prevents possible infinite recursion from truly recursive object, #4173 1774 # TODO: is there an efficient way to get around this MAX_DEPTH limit? 1775 MAX_DEPTH = 5L 1776 runlock = function(x, current_depth = 1L) { 1777 if (is.list(x) && current_depth <= MAX_DEPTH) { # is.list() used to be is.recursive(), #4814 1778 if (inherits(x, 'data.table')) .Call(C_unlock, x) 1779 else return(lapply(x, runlock, current_depth = current_depth + 1L)) 1780 } 1781 return(invisible()) 1782 } 1783 runlock(ans) 1784 if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} 1785 # TO DO: xrows would be a better name for irows: irows means the rows of x that i joins to 1786 # Grouping by i: icols the joins columns (might not need), isdcols (the non join i and used by j), all __ are length x 1787 # Grouping by by: i is by val, icols NULL, o__ may be subset of x, f__ points to o__ (or x if !length o__) 1788 # TO DO: setkey could mark the key whether it is unique or not. 1789 if (!is.null(lhs)) { 1790 if (any(names_x[cols] %chin% key(x))) 1791 setkey(x,NULL) 1792 # fixes #1479. Take care of secondary indices, TODO: cleaner way of doing this 1793 attrs = attr(x, 'index', exact=TRUE) 1794 skeys = names(attributes(attrs)) 1795 if (!is.null(skeys)) { 1796 hits = unlist(lapply(paste0("__", names_x[cols]), function(x) grep(x, skeys, fixed = TRUE))) 1797 hits = skeys[unique(hits)] 1798 for (i in seq_along(hits)) setattr(attrs, hits[i], NULL) # does by reference 1799 } 1800 if (keyby) { 1801 cnames = as.character(bysubl)[-1L] 1802 cnames = gsub('^`|`$', '', cnames) # the wrapping backticks that were added above can be removed now, #3378 1803 if (all(cnames %chin% names_x)) { 1804 if (verbose) {last.started.at=proc.time();cat("setkey() after the := with keyby= ... ");flush.console()} 1805 setkeyv(x,cnames) # TO DO: setkey before grouping to get memcpy benefit. 1806 if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} 1807 } 1808 else warning("The setkey() normally performed by keyby= has been skipped (as if by= was used) because := is being used together with keyby= but the keyby= contains some expressions. To avoid this warning, use by= instead, or provide existing column names to keyby=.\n") 1809 } 1810 return(suppPrint(x)) 1811 } 1812 if (is.null(ans)) { 1813 ans = as.data.table.list(lapply(groups,"[",0L)) # side-effects only such as test 168 1814 setnames(ans,seq_along(bynames),bynames) # TO DO: why doesn't groups have bynames in the first place? 1815 return(ans) 1816 } 1817 setattr(ans,"row.names",.set_row_names(length(ans[[1L]]))) 1818 setattr(ans,"class",class(x)) # fix for #64 1819 if (is.null(names(ans))) { 1820 # Efficiency gain of dropping names has been successful. Ordinarily this will run. 1821 if (is.null(jvnames)) jvnames = character(length(ans)-length(bynames)) 1822 if (length(bynames)+length(jvnames)!=length(ans)) 1823 stop("Internal error: jvnames is length ",length(jvnames), " but ans is ",length(ans)," and bynames is ", length(bynames)) # nocov 1824 ww = which(jvnames=="") 1825 if (any(ww)) jvnames[ww] = paste0("V",ww) 1826 setattr(ans, "names", c(bynames, jvnames)) 1827 } else { 1828 setnames(ans,seq_along(bynames),bynames) # TO DO: reinvestigate bynames flowing from dogroups here and simplify 1829 } 1830 if (byjoin && keyby && !bysameorder) { 1831 if (verbose) {last.started.at=proc.time();cat("setkey() afterwards for keyby=.EACHI ... ");flush.console()} 1832 setkeyv(ans,names(ans)[seq_along(byval)]) 1833 if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} 1834 } else if (keyby || (haskey(x) && bysameorder && (byjoin || (length(allbyvars) && identical(allbyvars,head(key(x),length(allbyvars))))))) { 1835 setattr(ans,"sorted",names(ans)[seq_along(grpcols)]) 1836 } 1837 setalloccol(ans) # TODO: overallocate in dogroups in the first place and remove this line 1838} 1839 1840.optmean = function(expr) { # called by optimization of j inside [.data.table only. Outside for a small speed advantage. 1841 if (length(expr)==2L) # no parameters passed to mean, so defaults of trim=0 and na.rm=FALSE 1842 return(call(".External",quote(Cfastmean),expr[[2L]], FALSE)) 1843 # return(call(".Internal",expr)) # slightly faster than .External, but R now blocks .Internal in coerce.c from apx Sep 2012 1844 if (length(expr)==3L && identical("na",substring(names(expr)[3L], 1L, 2L))) # one parameter passed to mean() 1845 return(call(".External",quote(Cfastmean),expr[[2L]], expr[[3L]])) # faster than .Call 1846 assign("nomeanopt",TRUE,parent.frame()) 1847 expr # e.g. trim is not optimized, just na.rm 1848} 1849 1850# [[.data.frame is now dispatched due to inheritance. 1851# The code below tried to avoid that but made things 1852# very slow (462 times faster down to 1 in the timings test). 1853# TO DO. Reintroduce velow but dispatch straight to 1854# .C("do_subset2") or better. Tests 604-608 test 1855# that this doesn't regress. 1856 1857#"[[.data.table" = function(x,...) { 1858# if (!cedta()) return(`[[.data.frame`(x,...)) 1859# .subset2(x,...) 1860# #class(x)=NULL # awful, copy 1861# #x[[...]] 1862#} 1863 1864#"[[<-.data.table" = function(x,i,j,value) { 1865# if (!cedta()) return(`[[<-.data.frame`(x,i,j,value)) 1866# if (!missing(j)) stop("[[i,j]] assignment not available in data.table, put assignment(s) in [i,{...}] instead, more powerful") 1867# cl = oldClass(x) # [[<-.data.frame uses oldClass rather than class, don't know why but we'll follow suit 1868# class(x) = NULL 1869# x[[i]] = value 1870# class(x) = cl 1871# x 1872#} 1873 1874as.matrix.data.table = function(x, rownames=NULL, rownames.value=NULL, ...) { 1875 # rownames = the rownames column (most common usage) 1876 if (!is.null(rownames)) { 1877 if (!is.null(rownames.value)) stop("rownames and rownames.value cannot both be used at the same time") 1878 if (length(rownames)>1L) { 1879 # TODO in future as warned in NEWS for 1.11.6: 1880 # warning("length(rownames)>1 is deprecated. Please use rownames.value= instead") 1881 if (length(rownames)!=nrow(x)) 1882 stop("length(rownames)==", length(rownames), " but nrow(DT)==", nrow(x), 1883 ". The rownames argument specifies a single column name or number. Consider rownames.value= instead.") 1884 rownames.value = rownames 1885 rownames = NULL 1886 } else if (length(rownames)==0L) { 1887 stop("length(rownames)==0 but should be a single column name or number, or NULL") 1888 } else { 1889 if (isTRUE(rownames)) { 1890 if (length(key(x))>1L) { 1891 warning("rownames is TRUE but key has multiple columns ", 1892 brackify(key(x)), "; taking first column x[,1] as rownames") 1893 } 1894 rownames = if (length(key(x))==1L) chmatch(key(x),names(x)) else 1L 1895 } 1896 else if (is.logical(rownames) || is.na(rownames)) { 1897 # FALSE, NA, NA_character_ all mean the same as NULL 1898 rownames = NULL 1899 } 1900 else if (is.character(rownames)) { 1901 w = chmatch(rownames, names(x)) 1902 if (is.na(w)) stop("'", rownames, "' is not a column of x") 1903 rownames = w 1904 } 1905 else { # rownames is a column number already 1906 rownames = as.integer(rownames) 1907 if (is.na(rownames) || rownames<1L || rownames>ncol(x)) 1908 stop("as.integer(rownames)==", rownames, 1909 " which is outside the column number range [1,ncol=", ncol(x), "].") 1910 } 1911 } 1912 } else if (!is.null(rownames.value)) { 1913 if (length(rownames.value)!=nrow(x)) 1914 stop("length(rownames.value)==", length(rownames.value), 1915 " but should be nrow(x)==", nrow(x)) 1916 } 1917 if (!is.null(rownames)) { 1918 # extract that column and drop it. 1919 rownames.value = x[[rownames]] 1920 dm = dim(x) - 0:1 1921 cn = names(x)[-rownames] 1922 X = x[, .SD, .SDcols = cn] 1923 } else { 1924 dm = dim(x) 1925 cn = names(x) 1926 X = x 1927 } 1928 p = dm[2L] 1929 n = dm[1L] 1930 collabs = as.list(cn) 1931 class(X) = NULL 1932 non.numeric = non.atomic = FALSE 1933 all.logical = TRUE 1934 for (j in seq_len(p)) { 1935 if (is.ff(X[[j]])) X[[j]] = X[[j]][] # nocov to bring the ff into memory, since we need to create a matrix in memory 1936 xj = X[[j]] 1937 if (length(dj <- dim(xj)) == 2L && dj[2L] > 1L) { 1938 if (inherits(xj, "data.table")) 1939 xj = X[[j]] = as.matrix(X[[j]]) 1940 dnj = dimnames(xj)[[2L]] 1941 collabs[[j]] = paste(collabs[[j]], if (length(dnj) > 1942 0L) 1943 dnj 1944 else seq_len(dj[2L]), sep = ".") 1945 } 1946 if (!is.logical(xj)) 1947 all.logical = FALSE 1948 if (length(levels(xj)) > 0L || !(is.numeric(xj) || is.complex(xj) || is.logical(xj)) || 1949 (!is.null(cl <- attr(xj, "class", exact=TRUE)) && any(cl %chin% 1950 c("Date", "POSIXct", "POSIXlt")))) 1951 non.numeric = TRUE 1952 if (!is.atomic(xj)) 1953 non.atomic = TRUE 1954 } 1955 if (non.atomic) { 1956 for (j in seq_len(p)) { 1957 xj = X[[j]] 1958 if (is.recursive(xj)) { } 1959 else X[[j]] = as.list(as.vector(xj)) 1960 } 1961 } 1962 else if (all.logical) { } 1963 else if (non.numeric) { 1964 for (j in seq_len(p)) { 1965 if (is.character(X[[j]])) next 1966 xj = X[[j]] 1967 miss = is.na(xj) 1968 xj = if (length(levels(xj))) as.vector(xj) else format(xj) 1969 is.na(xj) = miss 1970 X[[j]] = xj 1971 } 1972 } 1973 X = unlist(X, recursive = FALSE, use.names = FALSE) 1974 if (any(dm==0L)) { 1975 # retain highest type of input for empty output, #4762 1976 if (length(X)!=0L) 1977 stop("Internal error: as.matrix.data.table length(X)==", length(X), " but a dimension is zero") # nocov 1978 return(array(if (is.null(X)) NA else X, dim = dm, dimnames = list(rownames.value, cn))) 1979 } 1980 dim(X) <- c(n, length(X)/n) 1981 dimnames(X) <- list(rownames.value, unlist(collabs, use.names = FALSE)) 1982 X 1983} 1984 1985# bug #2375. fixed. same as head.data.frame and tail.data.frame to deal with negative indices 1986head.data.table = function(x, n=6L, ...) { 1987 if (!cedta()) return(NextMethod()) # nocov 1988 stopifnot(length(n) == 1L) 1989 i = seq_len(if (n<0L) max(nrow(x)+n, 0L) else min(n,nrow(x))) 1990 x[i, , ] 1991} 1992tail.data.table = function(x, n=6L, ...) { 1993 if (!cedta()) return(NextMethod()) # nocov 1994 stopifnot(length(n) == 1L) 1995 n = if (n<0L) max(nrow(x) + n, 0L) else min(n, nrow(x)) 1996 i = seq.int(to=nrow(x), length.out=n) 1997 x[i] 1998} 1999 2000"[<-.data.table" = function (x, i, j, value) { 2001 # [<- is provided for consistency, but := is preferred as it allows by group and by reference to subsets of columns 2002 # with no copy of the (very large, say 10GB) columns at all. := is like an UPDATE in SQL and we like and want two symbols to change. 2003 if (!cedta()) { 2004 x = if (nargs()<4L) `[<-.data.frame`(x, i, value=value) 2005 else `[<-.data.frame`(x, i, j, value) 2006 return(setalloccol(x)) # over-allocate (again). Avoid all this by using :=. 2007 } 2008 # TO DO: warning("Please use DT[i,j:=value] syntax instead of DT[i,j]<-value, for efficiency. See ?':='") 2009 if (!missing(i)) { 2010 isub=substitute(i) 2011 i = eval(.massagei(isub), x, parent.frame()) 2012 if (is.matrix(i)) { 2013 if (!missing(j)) stop("When i is a matrix in DT[i]<-value syntax, it doesn't make sense to provide j") 2014 x = `[<-.data.frame`(x, i, value=value) 2015 return(setalloccol(x)) 2016 } 2017 i = x[i, which=TRUE] 2018 # Tried adding ... after value above, and passing ... in here (e.g. for mult="first") but R CMD check 2019 # then gives "The argument of a replacement function which corresponds to the right hand side must be 2020 # named 'value'". So, users have to use := for that. 2021 } else i = NULL # meaning (to C code) all rows, without allocating 1L:nrow(x) vector 2022 if (missing(j)) j=names(x) 2023 if (!is.atomic(j)) stop("j must be an atomic vector, see ?is.atomic") 2024 if (anyNA(j)) stop("NA in j") 2025 if (is.character(j)) { 2026 newnames = setdiff(j,names(x)) 2027 cols = as.integer(chmatch(j, c(names(x),newnames))) 2028 # We can now mix existing columns and new columns 2029 } else { 2030 if (!is.numeric(j)) stop("j must be vector of column name or positions") 2031 if (any(j>ncol(x))) stop("Attempt to assign to column position greater than ncol(x). Create the column by name, instead. This logic intends to catch (most likely) user errors.") 2032 cols = as.integer(j) # for convenience e.g. to convert 1 to 1L 2033 newnames = NULL 2034 } 2035 reinstatekey=NULL 2036 if (haskey(x) && identical(key(x),key(value)) && 2037 identical(names(x),names(value)) && 2038 is.sorted(i) && 2039 identical(substitute(x),quote(`*tmp*`))) { 2040 # DT["a",]$y = 1.1 winds up creating `*tmp*` subset of rows and assigning _all_ the columns into x and 2041 # over-writing the key columns with the same value (not just the single 'y' column). 2042 # That isn't good for speed; it's an R thing. Solution is to use := instead to avoid all this, but user 2043 # expects key to be retained in this case because _he_ didn't assign to a key column (the internal base R 2044 # code did). 2045 reinstatekey=key(x) 2046 } 2047 if (!selfrefok(x) || truelength(x) < ncol(x)+length(newnames)) { 2048 x = setalloccol(x, length(x)+length(newnames)) # because [<- copies via *tmp* and main/duplicate.c copies at length but copies truelength over too 2049 # search for one other .Call to assign in [.data.table to see how it differs 2050 } 2051 x = .Call(Cassign,copy(x),i,cols,newnames,value) # From 3.1.0, DF[2,"b"] = 7 no longer copies DF$a (so in this [<-.data.table method we need to copy) 2052 setalloccol(x) # can maybe avoid this realloc, but this is (slow) [<- anyway, so just be safe. 2053 if (length(reinstatekey)) setkeyv(x,reinstatekey) 2054 invisible(x) 2055 # no copy at all if user calls directly; i.e. `[<-.data.table`(x,i,j,value) 2056 # or uses data.table := syntax; i.e. DT[i,j:=value] 2057 # but, there is one copy by R in [<- dispatch to `*tmp*`; i.e. DT[i,j]=value. *Update: not from R > 3.0.2, yay* 2058 # That copy is via main/duplicate.c which preserves truelength but copies length amount. Hence setalloccol(x,length(x)). 2059 # No warn passed to assign here because we know it'll be copied via *tmp*. 2060 # := allows subassign to a column with no copy of the column at all, and by group, etc. 2061} 2062 2063"$<-.data.table" = function(x, name, value) { 2064 if (!cedta()) { 2065 ans = `$<-.data.frame`(x, name, value) 2066 return(setalloccol(ans)) # over-allocate (again) 2067 } 2068 x = copy(x) 2069 set(x,j=name,value=value) # important i is missing here 2070} 2071 2072as.data.frame.data.table = function(x, ...) 2073{ 2074 ans = copy(x) 2075 setattr(ans,"row.names",.set_row_names(nrow(x))) # since R 2.4.0, data.frames can have non-character row names 2076 setattr(ans,"class","data.frame") 2077 setattr(ans,"sorted",NULL) # remove so if you convert to df, do something, and convert back, it is not sorted 2078 setattr(ans,".internal.selfref",NULL) 2079 # leave tl intact, no harm, 2080 ans 2081} 2082 2083as.list.data.table = function(x, ...) { 2084 # Similar to as.list.data.frame in base. Although a data.table/frame is a list, too, it may be 2085 # being coerced to raw list type (by calling code) so that "[" and "[[" work in their raw list form, 2086 # such as lapply does for data.frame. So we do have to remove the class attributes (and thus shallow 2087 # copy is almost instant way to achieve that, without risking compatibility). 2088 #if (sys.call(-2L)[[1L]]=="lapply") 2089 # return(x) 2090 ans = shallow(x) 2091 setattr(ans, "class", NULL) 2092 setattr(ans, "row.names", NULL) 2093 setattr(ans, "sorted", NULL) 2094 setattr(ans,".internal.selfref", NULL) # needed to pass S4 tests for example 2095 ans 2096} 2097 2098 2099dimnames.data.table = function(x) { 2100 if (!cedta()) { 2101 if (!inherits(x, "data.frame")) 2102 stop("data.table inherits from data.frame (from v1.5), but this data.table does not. Has it been created manually (e.g. by using 'structure' rather than 'data.table') or saved to disk using a prior version of data.table?") 2103 return(`dimnames.data.frame`(x)) 2104 } 2105 list(NULL, names(x)) 2106} 2107 2108"dimnames<-.data.table" = function (x, value) # so that can do colnames(dt)=<..> as well as names(dt)=<..> 2109{ 2110 if (!cedta()) return(`dimnames<-.data.frame`(x,value)) # nocov ; will drop key but names<-.data.table (below) is more common usage and does retain the key 2111 if (!is.list(value) || length(value) != 2L) stop("attempting to assign invalid object to dimnames of a data.table") 2112 if (!is.null(value[[1L]])) stop("data.tables do not have rownames") 2113 if (ncol(x) != length(value[[2L]])) stop("Can't assign ", length(value[[2L]]), " colnames to a ", ncol(x), "-column data.table") 2114 setnames(x,as.character(value[[2L]])) 2115 x # this returned value is now shallow copied by R 3.1.0 via *tmp*. A very welcome change. 2116} 2117 2118"names<-.data.table" = function(x,value) 2119{ 2120 # When non data.table aware packages change names, we'd like to maintain the key. 2121 # If call is names(DT)[2]="newname", R will call this names<-.data.table function (notice no i) with 'value' already prepared to be same length as ncol 2122 x = shallow(x) # `names<-` should not modify by reference. Related to #1015, #476 and #825. Needed for R v3.1.0+. TO DO: revisit 2123 if (is.null(value)) 2124 setattr(x,"names",NULL) # e.g. plyr::melt() calls base::unname() 2125 else 2126 setnames(x,value) 2127 x # this returned value is now shallow copied by R 3.1.0 via *tmp*. A very welcome change. 2128} 2129 2130within.data.table = function (data, expr, ...) 2131# basically within.list but retains key (if any) 2132# will be slower than using := or a regular query (see ?within for further info). 2133{ 2134 if (!cedta()) return(NextMethod()) # nocov 2135 parent = parent.frame() 2136 e = evalq(environment(), data, parent) 2137 eval(substitute(expr), e) # might (and it's known that some user code does) contain rm() 2138 l = as.list(e) 2139 l = l[!vapply_1b(l, is.null)] 2140 nD = length(del <- setdiff(names(data), (nl <- names(l)))) 2141 ans = copy(data) 2142 if (length(nl)) ans[,nl] = l 2143 if (nD) ans[,del] = NULL 2144 if (haskey(data) && all(key(data) %chin% names(ans))) { 2145 x = TRUE 2146 for (i in key(data)) { 2147 x = identical(data[[i]],ans[[i]]) 2148 if (!x) break 2149 } 2150 if (x) setattr(ans,"sorted",key(data)) 2151 } 2152 ans 2153} 2154 2155transform.data.table = function (`_data`, ...) 2156# basically transform.data.frame with data.table instead of data.frame, and retains key 2157{ 2158 if (!cedta()) return(NextMethod()) # nocov 2159 e = eval(substitute(list(...)), `_data`, parent.frame()) 2160 tags = names(e) 2161 inx = chmatch(tags, names(`_data`)) 2162 matched = !is.na(inx) 2163 if (any(matched)) { 2164 .Call(C_unlock, `_data`) # fix for #1641, now covered by test 104.2 2165 `_data`[,inx[matched]] = e[matched] 2166 `_data` = as.data.table(`_data`) 2167 } 2168 if (!all(matched)) { 2169 ans = do.call("data.table", c(list(`_data`), e[!matched])) 2170 } else { 2171 ans = `_data` 2172 } 2173 key.cols = key(`_data`) 2174 if (!any(tags %chin% key.cols)) { 2175 setattr(ans, "sorted", key.cols) 2176 } 2177 ans 2178} 2179 2180subset.data.table = function (x, subset, select, ...) 2181{ 2182 key.cols = key(x) 2183 2184 if (missing(subset)) { 2185 r = TRUE 2186 } else { 2187 e = substitute(subset) 2188 r = eval(e, x, parent.frame()) 2189 if (!is.logical(r)) 2190 stop("'subset' must evaluate to logical") 2191 r = r & !is.na(r) 2192 } 2193 2194 if (missing(select)) { 2195 vars = seq_len(ncol(x)) 2196 } else { 2197 nl = as.list(seq_len(ncol(x))) 2198 setattr(nl,"names",names(x)) 2199 vars = eval(substitute(select), nl, parent.frame()) # e.g. select=colF:colP 2200 # #891 fix - don't convert numeric vars to column names - will break when there are duplicate columns 2201 key.cols = intersect(key.cols, names(x)[vars]) ## Only keep key.columns found in the select clause 2202 } 2203 2204 ans = x[r, vars, with = FALSE] 2205 2206 if (nrow(ans) > 0L) { 2207 if (!missing(select) && length(key.cols)) { 2208 ## Set the key on the returned data.table as long as the key 2209 ## columns that "remain" are the same as the original, or a 2210 ## prefix of it. 2211 is.prefix = all(key(x)[seq_len(length(key.cols))] == key.cols) 2212 if (is.prefix) { 2213 setattr(ans, "sorted", key.cols) 2214 } 2215 } 2216 } else { 2217 setkey(ans,NULL) 2218 } 2219 ans 2220} 2221 2222# Equivalent of 'rowSums(is.na(dt) > 0L)' but much faster and memory efficient. 2223# Also called "complete.cases" in base. Unfortunately it's not a S3 generic. 2224# Also handles bit64::integer64. TODO: export this? 2225# For internal use only. 'by' requires integer input. No argument checks here yet. 2226is_na = function(x, by=seq_along(x)) .Call(Cdt_na, x, by) 2227any_na = function(x, by=seq_along(x)) .Call(CanyNA, x, by) 2228 2229na.omit.data.table = function (object, cols = seq_along(object), invert = FALSE, ...) { 2230 # compare to stats:::na.omit.data.frame 2231 if (!cedta()) return(NextMethod()) # nocov 2232 if ( !missing(invert) && is.na(as.logical(invert)) ) 2233 stop("Argument 'invert' must be logical TRUE/FALSE") 2234 cols = colnamesInt(object, cols, check_dups=FALSE) 2235 ix = .Call(Cdt_na, object, cols) 2236 # forgot about invert with no NA case, #2660 2237 if (invert) { 2238 if (all(ix)) 2239 object 2240 else 2241 .Call(CsubsetDT, object, which_(ix, bool = TRUE), seq_along(object)) 2242 } else { 2243 if (any(ix)) 2244 .Call(CsubsetDT, object, which_(ix, bool = FALSE), seq_along(object)) 2245 else 2246 object 2247 } 2248} 2249 2250which_ = function(x, bool = TRUE) { 2251 # fix for #1467, quotes result in "not resolved in current namespace" error 2252 .Call(Cwhichwrapper, x, bool) 2253} 2254 2255is.na.data.table = function (x) { 2256 if (!cedta()) return(`is.na.data.frame`(x)) 2257 do.call("cbind", lapply(x, "is.na")) 2258} 2259 2260# not longer needed as inherits ... 2261# t.data.table = t.data.frame 2262# Math.data.table = Math.data.frame 2263# summary.data.table = summary.data.frame 2264 2265Ops.data.table = function(e1, e2 = NULL) 2266{ 2267 ans = NextMethod() 2268 if (cedta() && is.data.frame(ans)) ans = as.data.table(ans) 2269 else if (is.matrix(ans)) colnames(ans) = copy(colnames(ans)) 2270 ans 2271} 2272 2273split.data.table = function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TRUE, flatten = TRUE, ..., verbose = getOption("datatable.verbose")) { 2274 if (!is.data.table(x)) stop("x argument must be a data.table") 2275 stopifnot(is.logical(drop), is.logical(sorted), is.logical(keep.by), is.logical(flatten)) 2276 # split data.frame way, using `f` and not `by` argument 2277 if (!missing(f)) { 2278 if (!length(f) && nrow(x)) 2279 stop("group length is 0 but data nrow > 0") 2280 if (!missing(by)) 2281 stop("passing 'f' argument together with 'by' is not allowed, use 'by' when split by column in data.table and 'f' when split by external factor") 2282 # same as split.data.frame - handling all exceptions, factor orders etc, in a single stream of processing was a nightmare in factor and drop consistency 2283 return(lapply(split(x = seq_len(nrow(x)), f = f, drop = drop, ...), function(ind) x[ind])) 2284 } 2285 if (missing(by)) stop("Either 'by' or 'f' argument must be supplied") 2286 # check reserved column names during processing 2287 if (".ll.tech.split" %chin% names(x)) stop("Column '.ll.tech.split' is reserved for split.data.table processing") 2288 if (".nm.tech.split" %chin% by) stop("Column '.nm.tech.split' is reserved for split.data.table processing") 2289 if (!all(by %chin% names(x))) stop("Argument 'by' must refer to column names in x") 2290 if (!all(by.atomic <- vapply_1b(by, function(.by) is.atomic(x[[.by]])))) stop("Argument 'by' must refer only to atomic-type columns, but the following columns are non-atomic: ", brackify(by[!by.atomic])) 2291 # list of data.tables (flatten) or list of lists of ... data.tables 2292 make.levels = function(x, cols, sorted) { 2293 by.order = if (!sorted) x[, funique(.SD), .SDcols=cols] # remember order of data, only when not sorted=FALSE 2294 ul = lapply(setNames(cols, nm=cols), function(col) { 2295 if (!is.factor(x[[col]])) unique(x[[col]]) else { 2296 .x_lev = levels(x[[col]]) 2297 #need to keep as a factor or order will be lost, #2082 2298 factor(.x_lev, levels = .x_lev) 2299 } 2300 }) 2301 r = do.call("CJ", c(ul, sorted=sorted, unique=TRUE)) 2302 if (!sorted && nrow(by.order)) { 2303 ii = r[by.order, on=cols, which=TRUE] 2304 r = rbindlist(list( 2305 r[ii], # original order from data 2306 r[-ii] # empty levels at the end 2307 )) 2308 } 2309 r 2310 } 2311 .by = by[1L] 2312 # this builds data.table call - is much more cleaner than handling each case one by one 2313 dtq = as.list(call("[", as.name("x"))) 2314 join = FALSE 2315 flatten_any = flatten && any(vapply_1b(by, function(col) is.factor(x[[col]]))) 2316 nested_current = !flatten && is.factor(x[[.by]]) 2317 if (!drop && (flatten_any || nested_current)) { 2318 # create 'levs' here to avoid lexical scoping glitches, see #3151 2319 levs = make.levels(x=x, cols=if (flatten) by else .by, sorted=sorted) 2320 dtq[["i"]] = quote(levs) 2321 join = TRUE 2322 } 2323 dtq[["j"]] = substitute( 2324 list(.ll.tech.split=list(.expr)), 2325 list(.expr = if (join) quote(if(.N == 0L) .SD[0L] else .SD) else as.name(".SD")) # simplify when `nomatch` accept NULL #857 ? 2326 ) 2327 by.or.keyby = if (join) "by" else c("by"[!sorted], "keyby"[sorted])[1L] 2328 dtq[[by.or.keyby]] = substitute( # retain order, for `join` and `sorted` it will use order of `i` data.table instead of `keyby`. 2329 .expr, 2330 list(.expr = if(join) {as.name(".EACHI")} else if (flatten) by else .by) 2331 ) 2332 dtq[[".SDcols"]] = if (keep.by) names(x) else setdiff(names(x), if (flatten) by else .by) 2333 if (join) dtq[["on"]] = if (flatten) by else .by 2334 dtq = as.call(dtq) 2335 if (isTRUE(verbose)) cat("Processing split.data.table with: ", deparse(dtq, width.cutoff=500L), "\n", sep="") 2336 tmp = eval(dtq) 2337 # add names on list 2338 setattr(ll <- tmp$.ll.tech.split, 2339 "names", 2340 as.character( 2341 if (!flatten) tmp[[.by]] else tmp[, list(.nm.tech.split=paste(unlist(lapply(.SD, as.character)), collapse = ".")), by=by, .SDcols=by]$.nm.tech.split 2342 )) 2343 # handle nested split 2344 if (flatten || length(by) == 1L) { 2345 for (x in ll) .Call(C_unlock, x) 2346 lapply(ll, setDT) 2347 # alloc.col could handle DT in list as done in: c9c4ff80bdd4c600b0c4eff23b207d53677176bd 2348 } else if (length(by) > 1L) { 2349 lapply(ll, split.data.table, drop=drop, by=by[-1L], sorted=sorted, keep.by=keep.by, flatten=flatten) 2350 } 2351} 2352 2353# TO DO, add more warnings e.g. for by.data.table(), telling user what the data.table syntax is but letting them dispatch to data.frame if they want 2354 2355copy = function(x) { 2356 newx = .Call(Ccopy,x) # copies at length but R's duplicate() also copies truelength over. 2357 # TO DO: inside Ccopy it could reset tl to 0 or length, but no matter as selfrefok detects it 2358 # TO DO: revisit duplicate.c in R 3.0.3 and see where it's at 2359 2360 reallocate = function(y) { 2361 if (is.data.table(y)) { 2362 .Call(C_unlock, y) 2363 setalloccol(y) 2364 } else if (is.list(y)) { 2365 oldClass = class(y) 2366 setattr(y, 'class', NULL) # otherwise [[.person method (which returns itself) results in infinite recursion, #4620 2367 y[] = lapply(y, reallocate) 2368 if (!identical(oldClass, 'list')) setattr(y, 'class', oldClass) 2369 } 2370 y 2371 } 2372 2373 reallocate(newx) 2374} 2375 2376.shallow = function(x, cols = NULL, retain.key = FALSE, unlock = FALSE) { 2377 wasnull = is.null(cols) 2378 cols = colnamesInt(x, cols, check_dups=FALSE) 2379 ans = .Call(Cshallowwrapper, x, cols) # copies VECSXP only 2380 2381 if(retain.key){ 2382 if (wasnull) return(ans) # handle most frequent case first 2383 ## get correct key if cols are present 2384 cols = names(x)[cols] 2385 keylength = which.first(!key(ans) %chin% cols) - 1L 2386 if (is.na(keylength)) keylength <- length(key(ans)) 2387 if (!keylength) { 2388 setattr(ans, "sorted", NULL) ## no key remaining 2389 } else { 2390 setattr(ans, "sorted", head(key(ans), keylength)) ## keep what can be kept 2391 } 2392 ## take care of attributes. 2393 indices = names(attributes(attr(ans, "index", exact=TRUE))) 2394 for(index in indices) { 2395 indexcols = strsplit(index, split = "__")[[1L]][-1L] 2396 indexlength = which.first(!indexcols %chin% cols) - 1L 2397 if (is.na(indexlength)) next ## all columns are present, nothing to be done 2398 reducedindex = paste0("__", indexcols[seq_len(indexlength)], collapse="") ## the columns until the first missing from the new index 2399 if (reducedindex %chin% indices || !indexlength) { 2400 ## Either reduced index already present or no columns of the original index remain. 2401 ## Drop the original index completely 2402 setattr(attr(ans, "index", exact=TRUE), index, NULL) 2403 } else if(length(attr(attr(ans, "index", exact=TRUE), index, exact=TRUE))) { 2404 ## index is not length 0. Drop it since shortening could lead to spurious reordering in discarded columns (#2336) 2405 setattr(attr(ans, "index", exact=TRUE), index, NULL) 2406 } else { 2407 ## rename index to reducedindex 2408 names(attributes(attr(ans, "index")))[names(attributes(attr(ans, "index", exact=TRUE))) == index] = reducedindex 2409 } 2410 } 2411 } else { # retain.key == FALSE 2412 setattr(ans, "sorted", NULL) 2413 setattr(ans, "index", NULL) 2414 } 2415 if (unlock) .Call(C_unlock, ans) 2416 ans 2417} 2418 2419shallow = function(x, cols=NULL) { 2420 if (!is.data.table(x)) 2421 stop("x is not a data.table. Shallow copy is a copy of the vector of column pointers (only), so is only meaningful for data.table") 2422 ans = .shallow(x, cols=cols, retain.key = TRUE) 2423 ans 2424} 2425 2426setalloccol = alloc.col = function(DT, n=getOption("datatable.alloccol"), verbose=getOption("datatable.verbose")) 2427{ 2428 name = substitute(DT) 2429 if (identical(name, quote(`*tmp*`))) stop("setalloccol attempting to modify `*tmp*`") 2430 ans = .Call(Calloccolwrapper, DT, eval(n), verbose) 2431 if (is.name(name)) { 2432 name = as.character(name) 2433 assign(name,ans,parent.frame(),inherits=TRUE) 2434 } 2435 ans 2436} 2437 2438selfrefok = function(DT,verbose=getOption("datatable.verbose")) { 2439 .Call(Cselfrefokwrapper,DT,verbose) 2440} 2441 2442truelength = function(x) .Call(Ctruelength,x) 2443# deliberately no "truelength<-" method. setalloccol is the mechanism for that. 2444# settruelength() no longer need (and so removed) now that data.table depends on R 2.14.0 2445# which initializes tl to zero rather than leaving uninitialized. 2446 2447setattr = function(x,name,value) { 2448 # Wrapper for setAttrib internal R function 2449 # Sets attribute by reference (no copy) 2450 # Named setattr (rather than setattrib) at R level to more closely resemble attr<- 2451 # And as from 1.7.8 is made exported in NAMESPACE for use in user attributes. 2452 # User can also call `attr<-` function directly, but that copies (maybe just when NAMED>0, which is always for data.frame, I think). See "Confused by NAMED" thread on r-devel 24 Nov 2011. 2453 # We tend to use setattr() internally in data.table.R because often we construct a data.table and it hasn't 2454 # got names yet. setnames() is the user interface which checks integrity and doesn't let you drop names for example. 2455 if (name=="names" && is.data.table(x) && length(attr(x, "names", exact=TRUE)) && !is.null(value)) 2456 setnames(x,value) 2457 # Using setnames here so that truelength of names can be retained, to carry out integrity checks such as not 2458 # creating names longer than the number of columns of x, and to change the key, too 2459 # For convenience so that setattr(DT,"names",allnames) works as expected without requiring a switch to setnames. 2460 else { 2461 ans = .Call(Csetattrib, x, name, value) 2462 # If name=="names" and this is the first time names are assigned (e.g. in data.table()), this will be grown by setalloccol very shortly afterwards in the caller. 2463 if (!is.null(ans)) { 2464 warning("Input is a length=1 logical that points to the same address as R's global value. Therefore the attribute has not been set by reference, rather on a copy. You will need to assign the result back to a variable. See issue #1281.") 2465 x = ans 2466 } 2467 } 2468 # fix for #1142 - duplicated levels for factors 2469 if (name == "levels" && is.factor(x) && anyDuplicated(value)) 2470 .Call(Csetlevels, x, (value <- as.character(value)), unique(value)) 2471 invisible(x) 2472} 2473 2474setnames = function(x,old,new,skip_absent=FALSE) { 2475 # Sets by reference, maintains truelength, no copy of table at all. 2476 # But also more convenient than names(DT)[i]="newname" because we can also do setnames(DT,"oldname","newname") 2477 # without an onerous match() ourselves. old can be positions, too, but we encourage by name for robustness. 2478 # duplicates are permitted to be created without warning; e.g. in revdeps and for example, and setting spacer columns all with "" 2479 if (!is.data.frame(x)) stop("x is not a data.table or data.frame") 2480 ncol = length(x) 2481 if (length(names(x)) != ncol) stop("x has ",ncol," columns but its names are length ",length(names(x))) 2482 stopifnot(isTRUEorFALSE(skip_absent)) 2483 if (missing(new) || missing(old)) { 2484 # usage: setnames(DT, new = letters[1:n]) 2485 if (missing(old)) { old = new; new = NULL } 2486 # for setnames(DT,new); e.g., setnames(DT,c("A","B")) where ncol(DT)==2 2487 if (is.function(old)) old = old(names(x)) 2488 if (!is.character(old)) stop("Passed a vector of type '",typeof(old),"'. Needs to be type 'character'.") 2489 if (length(old) != ncol) stop("Can't assign ",length(old)," names to a ",ncol," column data.table") 2490 if (anyNA(names(x))) { 2491 # if x somehow has some NA names, which() needs help to return them, #2475 2492 w = which((names(x) != old) | (Encoding(names(x)) != Encoding(old)) | (is.na(names(x)) & !is.na(old))) 2493 } else { 2494 w = which(names(x) != old | (Encoding(names(x)) != Encoding(old))) 2495 } 2496 if (!length(w)) return(invisible(x)) # no changes 2497 new = old[w] 2498 i = w 2499 } else { 2500 if (is.function(new)) new = if (is.numeric(old)) new(names(x)[old]) else new(old) 2501 if (!is.character(new)) stop("'new' is not a character vector or a function") 2502 # if (anyDuplicated(new)) warning("Some duplicates exist in 'new': ", brackify(new[duplicated(new)])) # dups allowed without warning; warn if and when the dup causes an ambiguity 2503 if (anyNA(new)) stop("NA in 'new' at positions ", brackify(which(is.na(new)))) 2504 if (anyDuplicated(old)) stop("Some duplicates exist in 'old': ", brackify(old[duplicated(old)])) 2505 if (is.numeric(old)) i = old = seq_along(x)[old] # leave it to standard R to manipulate bounds and negative numbers 2506 else if (!is.character(old)) stop("'old' is type ",typeof(old)," but should be integer, double or character") 2507 if (length(new)!=length(old)) stop("'old' is length ",length(old)," but 'new' is length ",length(new)) 2508 if (anyNA(old)) stop("NA (or out of bounds) in 'old' at positions ", brackify(which(is.na(old)))) 2509 if (is.character(old)) { 2510 i = chmatchdup(c(old,old), names(x)) # chmatchdup returns the second of any duplicates matched to in names(x) (if any) 2511 if (!all(tt<-is.na(tail(i,length(old))))) warning("Item ",w<-which.first(!tt)," of 'old' is '", old[w],"' which appears several times in column names. Just the first will be changed. There are ", sum(!tt)-1L," other items in old that are also duplicated in column names.") 2512 i = head(i,length(old)) 2513 if (anyNA(i)) { 2514 if (isTRUE(skip_absent)) { 2515 w = !is.na(i) 2516 new = new[w] 2517 i = i[w] 2518 } else { 2519 stop("Items of 'old' not found in column names: ", brackify(old[is.na(i)]), ". Consider skip_absent=TRUE.") 2520 } 2521 } 2522 } 2523 if (any(w <- new==names(x)[i] & Encoding(new)==Encoding(names(x)[i]))) { 2524 w = which(!w) 2525 new = new[w] 2526 i = i[w] 2527 } 2528 if (!length(new)) return(invisible(x)) # no changes 2529 if (length(i) != length(new)) stop("Internal error: length(i)!=length(new)") # nocov 2530 } 2531 # update the key if the column name being change is in the key 2532 m = chmatch(names(x)[i], key(x)) 2533 w = which(!is.na(m)) 2534 if (length(w)) 2535 .Call(Csetcharvec, attr(x, "sorted", exact=TRUE), m[w], new[w]) 2536 2537 # update secondary keys 2538 idx = attr(x, "index", exact=TRUE) 2539 for (k in names(attributes(idx))) { 2540 tt = strsplit(k,split="__")[[1L]][-1L] 2541 m = chmatch(names(x)[i], tt) 2542 w = which(!is.na(m)) 2543 if (length(w)) { 2544 tt[m[w]] = new[w] 2545 newk = paste0("__",tt,collapse="") 2546 setattr(idx, newk, attr(idx, k, exact=TRUE)) 2547 setattr(idx, k, NULL) 2548 } 2549 } 2550 2551 .Call(Csetcharvec, attr(x, "names", exact=TRUE), as.integer(i), new) 2552 invisible(x) 2553} 2554 2555setcolorder = function(x, neworder=key(x)) 2556{ 2557 if (is.character(neworder) && anyDuplicated(names(x))) 2558 stop("x has some duplicated column name(s): ", paste(names(x)[duplicated(names(x))], collapse=","), ". Please remove or rename the duplicate(s) and try again.") 2559 # if (!is.data.table(x)) stop("x is not a data.table") 2560 neworder = colnamesInt(x, neworder, check_dups=FALSE) # dups are now checked inside Csetcolorder below 2561 if (length(neworder) != length(x)) { 2562 #if shorter than length(x), pad by the missing 2563 # elements (checks below will catch other mistakes) 2564 neworder = c(neworder, setdiff(seq_along(x), neworder)) 2565 } 2566 .Call(Csetcolorder, x, neworder) 2567 invisible(x) 2568} 2569 2570set = function(x,i=NULL,j,value) # low overhead, loopable 2571{ 2572 .Call(Cassign,x,i,j,NULL,value) 2573 invisible(x) 2574} 2575 2576chmatch = function(x, table, nomatch=NA_integer_) 2577 .Call(Cchmatch, x, table, as.integer(nomatch[1L])) # [1L] to fix #1672 2578 2579# chmatchdup() behaves like 'pmatch' but only the 'exact' matching part; i.e. a value in 2580# 'x' is matched to 'table' only once. No index will be present more than once. For example: 2581# chmatchdup(c("a", "a"), c("a", "a")) # 1,2 - the second 'a' in 'x' has a 2nd match in 'table' 2582# chmatchdup(c("a", "a"), c("a", "b")) # 1,NA - the second one doesn't 'see' the first 'a' 2583# chmatchdup(c("a", "a"), c("a", "a.1")) # 1,NA - this is where it differs from pmatch - we don't need the partial match. 2584chmatchdup = function(x, table, nomatch=NA_integer_) 2585 .Call(Cchmatchdup, x, table, as.integer(nomatch[1L])) 2586 2587"%chin%" = function(x, table) 2588 .Call(Cchin, x, table) # TO DO if table has 'ul' then match to that 2589 2590chorder = function(x) { 2591 o = forderv(x, sort=TRUE, retGrp=FALSE) 2592 if (length(o)) o else seq_along(x) 2593} 2594 2595chgroup = function(x) { 2596 # TO DO: deprecate and remove this. It's exported but doubt anyone uses it. Think the plan was to use it internally, but forderv superceded. 2597 o = forderv(x, sort=FALSE, retGrp=TRUE) 2598 if (length(o)) as.vector(o) else seq_along(x) # as.vector removes the attributes 2599} 2600 2601# plain rbind and cbind methods are registered using S3method() in NAMESPACE only from R>=4.0.0; #3948 2602rbind.data.table = function(..., use.names=TRUE, fill=FALSE, idcol=NULL) { 2603 l = lapply(list(...), function(x) if (is.list(x)) x else as.data.table(x)) #1626; e.g. psych binds a data.frame|table with a matrix 2604 rbindlist(l, use.names, fill, idcol) 2605} 2606cbind.data.table = data.table 2607.rbind.data.table = rbind.data.table # the workaround using this in FAQ 2.24 is still applied to support R < 4.0.0 2608 2609rbindlist = function(l, use.names="check", fill=FALSE, idcol=NULL) { 2610 if (is.null(l)) return(null.data.table()) 2611 if (!is.list(l) || is.data.frame(l)) stop("Input is ", class(l)[1L]," but should be a plain list of items to be stacked") 2612 if (isFALSE(idcol)) { idcol = NULL } 2613 else if (!is.null(idcol)) { 2614 if (isTRUE(idcol)) idcol = ".id" 2615 if (!is.character(idcol)) stop("idcol must be a logical or character vector of length 1. If logical TRUE the id column will named '.id'.") 2616 idcol = idcol[1L] 2617 } 2618 miss = missing(use.names) 2619 # more checking of use.names happens at C level; this is just minimal to massage 'check' to NA 2620 if (identical(use.names, NA)) stop("use.names=NA invalid") # otherwise use.names=NA could creep in an usage equivalent to use.names='check' 2621 if (identical(use.names,"check")) { 2622 if (!miss) stop("use.names='check' cannot be used explicitly because the value 'check' is new in v1.12.2 and subject to change. It is just meant to convey default behavior. See ?rbindlist.") 2623 use.names = NA 2624 } 2625 ans = .Call(Crbindlist, l, use.names, fill, idcol) 2626 if (!length(ans)) return(null.data.table()) 2627 setDT(ans)[] 2628} 2629 2630vecseq = function(x,y,clamp) .Call(Cvecseq,x,y,clamp) 2631 2632# .Call(Caddress, x) increments NAM() when x is vector with NAM(1). Referring object within non-primitive function is enough to increment reference. 2633address = function(x) .Call(Caddress, eval(substitute(x), parent.frame())) 2634 2635":=" = function(...) { 2636 # this error is detected when eval'ing isub and replaced with a more helpful one when using := in i due to forgetting a comma, #4227 2637 stop('Check that is.data.table(DT) == TRUE. Otherwise, := and `:=`(...) are defined for use in j, once only and in particular ways. See help(":=").') 2638} 2639 2640setDF = function(x, rownames=NULL) { 2641 if (!is.list(x)) stop("setDF only accepts data.table, data.frame or list of equal length as input") 2642 if (anyDuplicated(rownames)) stop("rownames contains duplicates") 2643 if (is.data.table(x)) { 2644 # copied from as.data.frame.data.table 2645 if (is.null(rownames)) { 2646 rn = .set_row_names(nrow(x)) 2647 } else { 2648 if (length(rownames) != nrow(x)) 2649 stop("rownames incorrect length; expected ", nrow(x), " names, got ", length(rownames)) 2650 rn = rownames 2651 } 2652 setattr(x, "row.names", rn) 2653 setattr(x, "class", "data.frame") 2654 setattr(x, "sorted", NULL) 2655 setattr(x, ".internal.selfref", NULL) 2656 } else if (is.data.frame(x)) { 2657 if (!is.null(rownames)) { 2658 if (length(rownames) != nrow(x)) 2659 stop("rownames incorrect length; expected ", nrow(x), " names, got ", length(rownames)) 2660 setattr(x, "row.names", rownames) 2661 } 2662 x 2663 } else { 2664 n = vapply_1i(x, length) 2665 mn = max(n) 2666 if (any(n<mn)) 2667 stop("All elements in argument 'x' to 'setDF' must be of same length") 2668 xn = names(x) 2669 if (is.null(xn)) { 2670 setattr(x, "names", paste0("V",seq_len(length(x)))) 2671 } else { 2672 idx = xn %chin% "" 2673 if (any(idx)) { 2674 xn[idx] = paste0("V", seq_along(which(idx))) 2675 setattr(x, "names", xn) 2676 } 2677 } 2678 if (is.null(rownames)) { 2679 rn = .set_row_names(mn) 2680 } else { 2681 if (length(rownames) != mn) 2682 stop("rownames incorrect length; expected ", mn, " names, got ", length(rownames)) 2683 rn = rownames 2684 } 2685 setattr(x,"row.names", rn) 2686 setattr(x,"class","data.frame") 2687 } 2688 invisible(x) 2689} 2690 2691setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) { 2692 name = substitute(x) 2693 if (is.name(name)) { 2694 home = function(x, env) { 2695 if (identical(env, emptyenv())) 2696 stop("Cannot find symbol ", cname, call. = FALSE) 2697 else if (exists(x, env, inherits=FALSE)) env 2698 else home(x, parent.env(env)) 2699 } 2700 cname = as.character(name) 2701 envir = home(cname, parent.frame()) 2702 if (bindingIsLocked(cname, envir)) { 2703 stop("Cannot convert '", cname, "' to data.table by reference because binding is locked. It is very likely that '", cname, "' resides within a package (or an environment) that is locked to prevent modifying its variable bindings. Try copying the object to your current environment, ex: var <- copy(var) and then using setDT again.") 2704 } 2705 } 2706 # check no matrix-like columns, #3760. Other than a single list(matrix) is unambiguous and depended on by some revdeps, #3581 2707 if (length(x)>1L) { 2708 idx = vapply_1i(x, function(xi) length(dim(xi)))>1L 2709 if (any(idx)) 2710 warning("Some columns are a multi-column type (such as a matrix column): ", brackify(which(idx)),". setDT will retain these columns as-is but subsequent operations like grouping and joining may fail. Please consider as.data.table() instead which will create a new column for each embedded column.") 2711 } 2712 if (is.data.table(x)) { 2713 # fix for #1078 and #1128, see .resetclass() for explanation. 2714 setattr(x, 'class', .resetclass(x, 'data.table')) 2715 if (!missing(key)) setkeyv(x, key) # fix for #1169 2716 if (check.names) setattr(x, "names", make.names(names(x), unique=TRUE)) 2717 if (selfrefok(x) > 0L) return(invisible(x)) else setalloccol(x) 2718 } else if (is.data.frame(x)) { 2719 rn = if (!identical(keep.rownames, FALSE)) rownames(x) else NULL 2720 setattr(x, "row.names", .set_row_names(nrow(x))) 2721 if (check.names) setattr(x, "names", make.names(names(x), unique=TRUE)) 2722 # fix for #1078 and #1128, see .resetclass() for explanation. 2723 setattr(x, "class", .resetclass(x, 'data.frame')) 2724 setalloccol(x) 2725 if (!is.null(rn)) { 2726 nm = c(if (is.character(keep.rownames)) keep.rownames[1L] else "rn", names(x)) 2727 x[, (nm[1L]) := rn] 2728 setcolorder(x, nm) 2729 } 2730 } else if (is.list(x) && length(x)==1L && is.matrix(x[[1L]])) { 2731 # a single list(matrix) is unambiguous and depended on by some revdeps, #3581 2732 x = as.data.table.matrix(x[[1L]]) 2733 } else if (is.null(x) || (is.list(x) && !length(x))) { 2734 x = null.data.table() 2735 } else if (is.list(x)) { 2736 # copied from as.data.table.list - except removed the copy 2737 for (i in seq_along(x)) { 2738 if (is.null(x[[i]])) next # allow NULL columns to be created by setDT(list) even though they are not really allowed 2739 # many operations still work in the presence of NULL columns and it might be convenient 2740 # e.g. in package eplusr which calls setDT on a list when parsing JSON. Operations which 2741 # fail for NULL columns will give helpful error at that point, #3480 and #3471 2742 if (inherits(x[[i]], "POSIXlt")) stop("Column ", i, " is of POSIXlt type. Please convert it to POSIXct using as.POSIXct and run setDT again. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.") 2743 } 2744 n = vapply_1i(x, length) 2745 n_range = range(n) 2746 if (n_range[1L] != n_range[2L]) { 2747 tbl = sort(table(n)) 2748 stop("All elements in argument 'x' to 'setDT' must be of same length, but the profile of input lengths (length:frequency) is: ", 2749 brackify(sprintf('%s:%d', names(tbl), tbl)), "\nThe first entry with fewer than ", n_range[2L], " entries is ", which.max(n<n_range[2L])) 2750 } 2751 xn = names(x) 2752 if (is.null(xn)) { 2753 setattr(x, "names", paste0("V",seq_len(length(x)))) 2754 } else { 2755 idx = xn %chin% "" # names can be NA - test 1006 caught that! 2756 if (any(idx)) { 2757 xn[idx] = paste0("V", seq_along(which(idx))) 2758 setattr(x, "names", xn) 2759 } 2760 if (check.names) setattr(x, "names", make.names(xn, unique=TRUE)) 2761 } 2762 setattr(x,"row.names",.set_row_names(n_range[2L])) 2763 setattr(x,"class",c("data.table","data.frame")) 2764 setalloccol(x) 2765 } else { 2766 stop("Argument 'x' to 'setDT' should be a 'list', 'data.frame' or 'data.table'") 2767 } 2768 if (!is.null(key)) setkeyv(x, key) 2769 if (is.name(name)) { 2770 name = as.character(name) 2771 assign(name, x, parent.frame(), inherits=TRUE) 2772 } else if (name %iscall% c('$', '[[') && is.name(name[[2L]])) { 2773 # common case is call from 'lapply()' 2774 k = eval(name[[2L]], parent.frame(), parent.frame()) 2775 if (is.list(k)) { 2776 origj = j = if (name[[1L]] == "$") as.character(name[[3L]]) else eval(name[[3L]], parent.frame(), parent.frame()) 2777 if (length(j) == 1L) { 2778 if (is.character(j)) { 2779 j = match(j, names(k)) 2780 if (is.na(j)) 2781 stop("Item '", origj, "' not found in names of input list") 2782 } 2783 } 2784 .Call(Csetlistelt,k,as.integer(j), x) 2785 } else if (is.environment(k) && exists(as.character(name[[3L]]), k)) { 2786 assign(as.character(name[[3L]]), x, k, inherits=FALSE) 2787 } 2788 } 2789 .Call(CexpandAltRep, x) # issue#2866 and PR#2882 2790 invisible(x) 2791} 2792 2793as_list = function(x) { 2794 lx = vector("list", 1L) 2795 .Call(Csetlistelt, lx, 1L, x) 2796 lx 2797} 2798 2799# FR #1353 2800rowid = function(..., prefix=NULL) { 2801 rowidv(list(...), prefix=prefix) 2802} 2803 2804rowidv = function(x, cols=seq_along(x), prefix=NULL) { 2805 if (!is.null(prefix) && (!is.character(prefix) || length(prefix) != 1L)) 2806 stop("'prefix' must be NULL or a character vector of length 1.") 2807 if (is.atomic(x)) { 2808 if (!missing(cols) && !is.null(cols)) 2809 stop("x is a single vector, non-NULL 'cols' doesn't make sense.") 2810 cols = 1L 2811 x = as_list(x) 2812 } else if (!length(cols)) { 2813 stop("x is a list, 'cols' cannot be 0-length.") 2814 } 2815 xorder = forderv(x, by=cols, sort=FALSE, retGrp=TRUE) # speedup on char with sort=FALSE 2816 xstart = attr(xorder, 'starts', exact=TRUE) 2817 if (!length(xorder)) xorder = seq_along(x[[1L]]) 2818 ids = .Call(Cfrank, xorder, xstart, uniqlengths(xstart, length(xorder)), "sequence") 2819 if (!is.null(prefix)) 2820 ids = paste0(prefix, ids) 2821 ids 2822} 2823 2824# FR #686 2825rleid = function(..., prefix=NULL) { 2826 rleidv(list(...), prefix=prefix) 2827} 2828 2829rleidv = function(x, cols=seq_along(x), prefix=NULL) { 2830 if (!is.null(prefix) && (!is.character(prefix) || length(prefix) != 1L)) 2831 stop("'prefix' must be NULL or a character vector of length 1.") 2832 if (is.atomic(x)) { 2833 if (!missing(cols) && !is.null(cols)) 2834 stop("x is a single vector, non-NULL 'cols' doesn't make sense.") 2835 cols = 1L 2836 x = as_list(x) 2837 } else if (!length(cols)) { 2838 stop("x is a list, 'cols' cannot be 0-length.") 2839 } 2840 cols = colnamesInt(x, cols, check_dups=FALSE) 2841 ids = .Call(Crleid, x, cols) 2842 if (!is.null(prefix)) ids = paste0(prefix, ids) 2843 ids 2844} 2845 2846# GForce functions 2847# to add a new function to GForce (from the R side -- the easy part!): 2848# (1) add it to gfuns 2849# (2) edit .gforce_ok (defined within `[`) to catch which j will apply the new function 2850# (3) define the gfun = function() R wrapper 2851gfuns = c("[", "[[", "head", "tail", "first", "last", "sum", "mean", "prod", 2852 "median", "min", "max", "var", "sd", ".N") # added .N for #334 2853`g[` = `g[[` = function(x, n) .Call(Cgnthvalue, x, as.integer(n)) # n is of length=1 here. 2854ghead = function(x, n) .Call(Cghead, x, as.integer(n)) # n is not used at the moment 2855gtail = function(x, n) .Call(Cgtail, x, as.integer(n)) # n is not used at the moment 2856gfirst = function(x) .Call(Cgfirst, x) 2857glast = function(x) .Call(Cglast, x) 2858gsum = function(x, na.rm=FALSE) .Call(Cgsum, x, na.rm, TRUE) # warnOverflow=TRUE, #986 2859gmean = function(x, na.rm=FALSE) .Call(Cgmean, x, na.rm) 2860gprod = function(x, na.rm=FALSE) .Call(Cgprod, x, na.rm) 2861gmedian = function(x, na.rm=FALSE) .Call(Cgmedian, x, na.rm) 2862gmin = function(x, na.rm=FALSE) .Call(Cgmin, x, na.rm) 2863gmax = function(x, na.rm=FALSE) .Call(Cgmax, x, na.rm) 2864gvar = function(x, na.rm=FALSE) .Call(Cgvar, x, na.rm) 2865gsd = function(x, na.rm=FALSE) .Call(Cgsd, x, na.rm) 2866gforce = function(env, jsub, o, f, l, rows) .Call(Cgforce, env, jsub, o, f, l, rows) 2867 2868isReallyReal = function(x) { 2869 .Call(CisReallyReal, x) 2870} 2871 2872.prepareFastSubset = function(isub, x, enclos, notjoin, verbose = FALSE){ 2873 ## helper that decides, whether a fast binary search can be performed, if i is a call 2874 ## For details on the supported queries, see \code{\link{datatable-optimize}} 2875 ## Additional restrictions are imposed if x is .SD, or if options indicate that no optimization 2876 ## is to be performed 2877 #' @param isub the substituted i 2878 #' @param x the data.table 2879 #' @param enclos The environment where to evaluate when RHS is not a column of x 2880 #' @param notjoin boolean that is set before, indicating whether i started with '!'. 2881 #' @param verbose TRUE for detailed output 2882 #' @return If i is not fast subsettable, NULL. Else, a list with entries: 2883 #' out$i: a data.table that will be used as i with proper column names and key. 2884 #' out$on: the correct 'on' statement that will be used for x[i, on =...] 2885 #' out$notjoin Bool. In some cases, notjoin is updated within the function. 2886 #' ATTENTION: If nothing else helps, an auto-index is created on x unless options prevent this. 2887 if(getOption("datatable.optimize") < 3L) return(NULL) ## at least level three optimization required. 2888 if (!is.call(isub)) return(NULL) 2889 if (.Call(C_islocked, x)) return(NULL) # fix for #958, don't create auto index on '.SD'. 2890 ## a list of all possible operators with their translations into the 'on' clause 2891 validOps = list(op = c("==", "%in%", "%chin%"), 2892 on = c("==", "==", "==")) 2893 2894 ## Determine, whether the nature of isub in general supports fast binary search 2895 remainingIsub = isub 2896 i = list() 2897 on = character(0L) 2898 nonEqui = FALSE 2899 while(length(remainingIsub)){ 2900 if(is.call(remainingIsub)){ 2901 if (length(remainingIsub[[1L]]) != 1L) return(NULL) ## only single symbol, either '&' or one of validOps allowed. 2902 if (remainingIsub[[1L]] != "&"){ ## only a single expression present or a different connection. 2903 stub = remainingIsub 2904 remainingIsub = NULL ## there is no remainder to be evaluated after stub. 2905 } else { 2906 ## multiple expressions with & connection. 2907 if (notjoin) return(NULL) ## expressions of type DT[!(a==1 & b==2)] currently not supported 2908 stub = remainingIsub[[3L]] ## the single column expression like col == 4 2909 remainingIsub = remainingIsub[[2L]] ## the potentially longer expression with potential additional '&' 2910 } 2911 } else { ## single symbol present 2912 stub = remainingIsub 2913 remainingIsub = NULL 2914 } 2915 ## check the stub if it is fastSubsettable 2916 if(is.symbol(stub)){ 2917 ## something like DT[x & y]. If x and y are logical columns, we can optimize. 2918 col = as.character(stub) 2919 if(!col %chin% names(x)) return(NULL) 2920 if(!is.logical(x[[col]])) return(NULL) 2921 ## redirect to normal DT[x == TRUE] 2922 stub = call("==", as.symbol(col), TRUE) 2923 } 2924 if (length(stub[[1L]]) != 1) return(NULL) # nocov Whatever it is, definitely not one of the valid operators 2925 operator = as.character(stub[[1L]]) 2926 if (!operator %chin% validOps$op) return(NULL) ## operator not supported 2927 if (!is.name(stub[[2L]])) return(NULL) 2928 col = as.character(stub[[2L]]) 2929 if (!col %chin% names(x)) return(NULL) ## any non-column name prevents fast subsetting 2930 if(col %chin% names(i)) return(NULL) ## repeated appearance of the same column not supported (e.g. DT[x < 3 & x < 5]) 2931 ## now check the RHS of stub 2932 RHS = eval(stub[[3L]], x, enclos) 2933 if (is.list(RHS)) RHS = as.character(RHS) # fix for #961 2934 if (length(RHS) != 1L && !operator %chin% c("%in%", "%chin%")){ 2935 if (length(RHS) != nrow(x)) stop(gettextf("RHS of %s is length %d which is not 1 or nrow (%d). For robustness, no recycling is allowed (other than of length 1 RHS). Consider %%in%% instead.", operator, length(RHS), nrow(x), domain="R-data.table"), domain=NA) 2936 return(NULL) # DT[colA == colB] regular element-wise vector scan 2937 } 2938 if ( mode(x[[col]]) != mode(RHS) || # mode() so that doubleLHS/integerRHS and integerLHS/doubleRHS!isReallyReal are optimized (both sides mode 'numeric') 2939 is.factor(x[[col]])+is.factor(RHS) == 1L || # but factor is also mode 'numeric' so treat that separately 2940 is.integer(x[[col]]) && isReallyReal(RHS) ) { # and if RHS contains fractions then don't optimize that as bmerge truncates the fractions to match to the target integer type 2941 # re-direct non-matching type cases to base R, as data.table's binary 2942 # search based join is strict in types. #957, #961 and #1361 2943 # the mode() checks also deals with NULL since mode(NULL)=="NULL" and causes this return, as one CRAN package (eplusr 0.9.1) relies on 2944 return(NULL) 2945 } 2946 if (!operator %chin% c("%in%", "%chin%")) { 2947 # additional requirements for notjoin and NA values. Behaviour is different for %in%, %chin% compared to other operators 2948 # RHS is of length=1 or n 2949 if (any_na(as_list(RHS))) { 2950 ## dt[x == NA] or dt[x <= NA] will always return empty 2951 notjoin = FALSE 2952 RHS = RHS[0L] 2953 } else if (notjoin) { 2954 ## dt[!x == 3] must not return rows where x is NA 2955 RHS = c(RHS, if (is.double(RHS) && is.double(x[[col]])) c(NA, NaN) else NA) 2956 } 2957 } 2958 ## if it passed until here, fast subset can be done for this stub 2959 i = c(i, setNames(list(RHS), col)) 2960 on = c(on, setNames(paste0(col, validOps$on[validOps$op == operator], col), col)) 2961 ## loop continues with remainingIsub 2962 } 2963 if (length(i) == 0L) stop("Internal error in .isFastSubsettable. Please report to data.table developers") # nocov 2964 ## convert i to data.table with all combinations in rows. 2965 if(length(i) > 1L && prod(vapply_1i(i, length)) > 1e4){ 2966 ## CJ would result in more than 1e4 rows. This would be inefficient, especially memory-wise #2635 2967 if (verbose) {cat("Subsetting optimization disabled because the cross-product of RHS values exceeds 1e4, causing memory problems.\n");flush.console()} 2968 return(NULL) 2969 } 2970 ## Care is needed with names as we construct i 2971 ## with 'CJ' and 'do.call' and this would cause problems if colNames were 'sorted' or 'unique' 2972 ## as these two would be interpreted as args for CJ 2973 colNames = names(i) 2974 names(i) = NULL 2975 i$sorted = FALSE 2976 i$unique = TRUE 2977 i = do.call(CJ, i) 2978 setnames(i, colNames) 2979 idx = NULL 2980 if(is.null(idx)){ 2981 ## check whether key fits the columns in i. 2982 ## order of key columns makes no difference, as long as they are all upfront in the key, I believe. 2983 if (all(names(i) %chin% head(key(x), length(i)))){ 2984 if (verbose) {cat("Optimized subsetting with key '", paste0( head(key(x), length(i)), collapse = ", "),"'\n",sep="");flush.console()} 2985 idx = integer(0L) ## integer(0L) not NULL! Indicates that x is ordered correctly. 2986 idxCols = head(key(x), length(i)) ## in correct order! 2987 } 2988 } 2989 if (is.null(idx)){ 2990 if (!getOption("datatable.use.index")) return(NULL) # #1422 2991 ## check whether an existing index can be used 2992 ## An index can be used if it corresponds exactly to the columns in i (similar to the key above) 2993 candidates = indices(x, vectors = TRUE) 2994 idx = NULL 2995 for (cand in candidates){ 2996 if (all(names(i) %chin% cand) && length(cand) == length(i)){ 2997 idx = attr(attr(x, "index", exact=TRUE), paste0("__", cand, collapse = ""), exact = TRUE) 2998 idxCols = cand 2999 break 3000 } 3001 } 3002 if (!is.null(idx)){ 3003 if (verbose) {cat("Optimized subsetting with index '", paste0( idxCols, collapse = "__"),"'\n",sep="");flush.console()} 3004 } 3005 } 3006 if (is.null(idx)){ 3007 ## if nothing else helped, auto create a new index that can be used 3008 if (!getOption("datatable.auto.index")) return(NULL) 3009 if (verbose) {cat("Creating new index '", paste0(names(i), collapse = "__"),"'\n",sep="");flush.console()} 3010 if (verbose) {last.started.at=proc.time();cat("Creating index", paste0(names(i), collapse = "__"), "done in ... ");flush.console()} 3011 setindexv(x, names(i)) 3012 if (verbose) {cat(timetaken(last.started.at),"\n");flush.console()} 3013 if (verbose) {cat("Optimized subsetting with index '", paste0(names(i), collapse = "__"),"'\n",sep="");flush.console()} 3014 idx = attr(attr(x, "index", exact=TRUE), paste0("__", names(i), collapse = ""), exact=TRUE) 3015 idxCols = names(i) 3016 } 3017 if(!is.null(idxCols)){ 3018 setkeyv(i, idxCols) 3019 on = on[idxCols] ## make sure 'on' is in the correct order. Otherwise the logic won't recognise that a key / index already exists. 3020 } 3021 return(list(i = i, 3022 on = on, 3023 notjoin = notjoin 3024 ) 3025 ) 3026} 3027 3028 3029.parse_on = function(onsub, isnull_inames) { 3030 ## helper that takes the 'on' string(s) and extracts comparison operators and column names from it. 3031 #' @param onsub the substituted on 3032 #' @param isnull_inames bool; TRUE if i has no names. 3033 #' @return List with two entries: 3034 #' 'on' : character vector providing the column names for the join. 3035 #' Names correspond to columns in x, entries correspond to columns in i 3036 #' 'ops': integer vector. Gives the indices of the operators that connect the columns in x and i. 3037 ops = c("==", "<=", "<", ">=", ">", "!=") 3038 pat = paste0("(", ops, ")", collapse="|") 3039 if (onsub %iscall% 'eval') { 3040 onsub = eval(onsub[[2L]], parent.frame(2L), parent.frame(2L)) 3041 } 3042 if (onsub %iscall% c('list', '.')) { 3043 spat = paste0("[ ]+(", pat, ")[ ]+") 3044 onsub = lapply(as.list(onsub)[-1L], function(x) gsub(spat, "\\1", deparse(x, width.cutoff=500L))) 3045 onsub = as.call(c(quote(c), onsub)) 3046 } 3047 on = eval(onsub, parent.frame(2L), parent.frame(2L)) 3048 if (length(on) == 0L || !is.character(on)) 3049 stop("'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.") 3050 ## extract the operators and potential variable names from 'on'. 3051 ## split at backticks to take care about variable names like `col1<=`. 3052 pieces = strsplit(on, "(?=[`])", perl = TRUE) 3053 xCols = character(length(on)) 3054 ## if 'on' is named, the names are the xCols for sure 3055 if(!is.null(names(on))){ 3056 xCols = names(on) 3057 } 3058 iCols = character(length(on)) 3059 operators = character(length(on)) 3060 ## loop over the elements and extract operators and column names. 3061 for(i in seq_along(pieces)){ 3062 thisCols = character(0L) 3063 thisOperators = character(0L) 3064 j = 1L 3065 while(j <= length(pieces[[i]])){ 3066 if(pieces[[i]][j] == "`"){ 3067 ## start of a variable name with backtick. 3068 thisCols = c(thisCols, pieces[[i]][j+1L]) 3069 j = j+3L # +1 is the column name, +2 is delimiting "`", +3 is next relevant entry.` 3070 } else { 3071 ## no backtick 3072 ## search for operators 3073 thisOperators = c(thisOperators, 3074 unlist(regmatches(pieces[[i]][j], gregexpr(pat, pieces[[i]][j])), 3075 use.names = FALSE)) 3076 ## search for column names 3077 thisCols = c(thisCols, trimws(strsplit(pieces[[i]][j], pat)[[1L]])) 3078 ## there can be empty string column names because of trimws, remove them 3079 thisCols = thisCols[thisCols != ""] 3080 j = j+1L 3081 } 3082 } 3083 if (length(thisOperators) == 0L) { 3084 ## if no operator is given, it must be == 3085 operators[i] = "==" 3086 } else if (length(thisOperators) == 1L) { 3087 operators[i] = thisOperators 3088 } else { 3089 ## multiple operators found in one 'on' part. Something is wrong. 3090 stop("Found more than one operator in one 'on' statement: ", on[i], ". Please specify a single operator.") 3091 } 3092 if (length(thisCols) == 2L){ 3093 ## two column names found, first is xCol, second is iCol for sure 3094 xCols[i] = thisCols[1L] 3095 iCols[i] = thisCols[2L] 3096 } else if (length(thisCols) == 1L){ 3097 ## a single column name found. Can mean different things 3098 if(xCols[i] != ""){ 3099 ## xCol is given by names(on). thisCols must be iCol 3100 iCols[i] = thisCols[1L] 3101 } else if (isnull_inames){ 3102 ## i has no names. It will be given the names V1, V2, ... automatically. 3103 ## The single column name is the x column. It will match to the ith column in i. 3104 xCols[i] = thisCols[1L] 3105 iCols[i] = paste0("V", i) 3106 } else { 3107 ## i has names and one single column name is given by on. 3108 ## This means that xCol and iCol have the same name. 3109 xCols[i] = thisCols[1L] 3110 iCols[i] = thisCols[1L] 3111 } 3112 } else if (length(thisCols) == 0L){ 3113 stop("'on' contains no column name: ", on[i], ". Each 'on' clause must contain one or two column names.") 3114 } else { 3115 stop("'on' contains more than 2 column names: ", on[i], ". Each 'on' clause must contain one or two column names.") 3116 } 3117 } 3118 idx_op = match(operators, ops, nomatch=0L) 3119 if (any(idx_op %in% c(0L, 6L))) 3120 stop("Invalid operators ", paste(operators[idx_op %in% c(0L, 6L)], collapse=","), ". Only allowed operators are ", paste(ops[1:5], collapse=""), ".") 3121 ## the final on will contain the xCol as name, the iCol as value 3122 on = iCols 3123 names(on) = xCols 3124 return(list(on = on, ops = idx_op)) 3125} 3126