1if (!exists("trimws", "package:base")) {
2  # trimws was new in R 3.2.0. Backport it for internal data.table use in R 3.1.0
3  trimws = function(x) {
4    mysub = function(re, x) sub(re, "", x, perl = TRUE)
5    mysub("[ \t\r\n]+$", mysub("^[ \t\r\n]+", x))
6  }
7}
8
9dim.data.table = function(x)
10{
11  .Call(Cdim, x)
12}
13
14.global = new.env()  # thanks to: http://stackoverflow.com/a/12605694/403310
15setPackageName("data.table",.global)
16.global$print = ""
17
18# NB: if adding to/editing this list, be sure to do the following:
19#   (1) add to man/special-symbols.Rd
20#   (2) export() in NAMESPACE
21#   (3) add to vignettes/datatable-importing.Rmd#globals section
22.SD = .N = .I = .GRP = .NGRP = .BY = .EACHI = NULL
23# These are exported to prevent NOTEs from R CMD check, and checkUsage via compiler.
24# But also exporting them makes it clear (to users and other packages) that data.table uses these as symbols.
25# And NULL makes it clear (to the R's mask check on loading) that they're variables not functions.
26# utils::globalVariables(c(".SD",".N")) was tried as well, but exporting seems better.
27# So even though .BY doesn't appear in this file, it should still be NULL here and exported because it's
28# defined in SDenv and can be used by users.
29
30is.data.table = function(x) inherits(x, "data.table")
31is.ff = function(x) inherits(x, "ff")  # define this in data.table so that we don't have to require(ff), but if user is using ff we'd like it to work
32
33#NCOL = function(x) {
34#    # copied from base, but additionally covers data.table via is.list()
35#    # because NCOL in base explicitly tests using is.data.frame()
36#    if (is.list(x) && !is.ff(x)) return(length(x))
37#    if (is.array(x) && length(dim(x)) > 1L) ncol(x) else as.integer(1L)
38#}
39#NROW = function(x) {
40#    if (is.data.frame(x) || is.data.table(x)) return(nrow(x))
41#    if (is.list(x) && !is.ff(x)) stop("List is not a data.frame or data.table. Convert first before using NROW")   # list may have different length elements, which data.table and data.frame's resolve.
42#    if (is.array(x)) nrow(x) else length(x)
43#}
44
45null.data.table = function() {
46  ans = list()
47  setattr(ans,"class",c("data.table","data.frame"))
48  setattr(ans,"row.names",.set_row_names(0L))
49  setalloccol(ans)
50}
51
52data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFactors=FALSE)
53{
54  # NOTE: It may be faster in some circumstances for users to create a data.table by creating a list l
55  #       first, and then setattr(l,"class",c("data.table","data.frame")) and forgo checking.
56  x = list(...)   # list() doesn't copy named inputs as from R >= 3.1.0 (a very welcome change)
57  nd = name_dots(...)
58  names(x) = nd$vnames
59  if (length(x)==0L) return( null.data.table() )
60  if (length(x)==1L && (is.null(x[[1L]]) || (is.list(x[[1L]]) && length(x[[1L]])==0L))) return( null.data.table() ) #48
61  ans = as.data.table.list(x, keep.rownames=keep.rownames, check.names=check.names, .named=nd$.named)  # see comments inside as.data.table.list re copies
62  if (!is.null(key)) {
63    if (!is.character(key)) stop("key argument of data.table() must be character")
64    if (length(key)==1L) {
65      key = strsplit(key,split=",")[[1L]]
66      # eg key="A,B"; a syntax only useful in key argument to data.table(), really.
67    }
68    setkeyv(ans,key)
69  } else {
70    # retain key of cbind(DT1, DT2, DT3) where DT2 is keyed but not DT1. cbind calls data.table().
71    # If DT inputs with keys have been recycled then can't retain key
72    ckey = NULL
73    for (i in seq_along(x)) {
74      xi = x[[i]]
75      if (is.data.table(xi) && haskey(xi) && nrow(xi)==nrow(ans)) ckey=c(ckey, key(xi))
76    }
77    if (length(ckey) &&
78        !anyDuplicated(ckey) &&
79        identical(is.na(chmatchdup(c(ckey,ckey), names(ans))), rep(c(FALSE,TRUE),each=length(ckey)))) {
80      setattr(ans, "sorted", ckey)
81    }
82  }
83  if (isTRUE(stringsAsFactors)) {
84    for (j in which(vapply_1b(ans, is.character))) set(ans, NULL, j, as_factor(.subset2(ans, j)))
85    # as_factor is internal function in fread.R currently
86  }
87  setalloccol(ans)  # returns a NAMED==0 object, unlike data.frame()
88}
89
90replace_dot_alias = function(e) {
91  # we don't just simply alias .=list because i) list is a primitive (faster to iterate) and ii) we test for use
92  # of "list" in several places so it saves having to remember to write "." || "list" in those places
93  if (is.call(e) && !is.function(e[[1L]])) {
94    # . alias also used within bquote, #1912
95    if (e[[1L]] == 'bquote') return(e)
96    if (e[[1L]] == ".") e[[1L]] = quote(list)
97    for (i in seq_along(e)[-1L]) if (!is.null(e[[i]])) e[[i]] = replace_dot_alias(e[[i]])
98  }
99  e
100}
101
102.massagei = function(x) {
103  # J alias for list as well in i, just if the first symbol
104  # if x = substitute(base::order) then as.character(x[[1L]]) == c("::", "base", "order")
105  if (x %iscall% c("J","."))
106    x[[1L]] = quote(list)
107  x
108}
109
110.checkTypos = function(err, ref) {
111  if (grepl('object.*not found', err$message)) {
112    used = gsub(".*object '([^']+)'.*", "\\1", err$message)
113    found = agrep(used, ref, value=TRUE, ignore.case=TRUE, fixed=TRUE)
114    if (length(found)) {
115      stop("Object '", used, "' not found. Perhaps you intended ",
116           paste(head(found, 5L), collapse=", "),
117           if (length(found)<=5L) "" else paste(" or",length(found)-5L, "more"))
118    } else {
119      stop("Object '", used, "' not found amongst ",
120           paste(head(ref, 5L), collapse=', '),
121           if (length(ref)<=5L) "" else paste(" and", length(ref)-5L, "more"))
122    }
123  } else {
124    stop(err$message, call.=FALSE)
125  }
126}
127
128"[.data.table" = function (x, i, j, by, keyby, with=TRUE, nomatch=getOption("datatable.nomatch", NA), mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL)
129{
130  # ..selfcount <<- ..selfcount+1  # in dev, we check no self calls, each of which doubles overhead, or could
131  # test explicitly if the caller is [.data.table (even stronger test. TO DO.)
132  # the drop=NULL is to sink drop argument when dispatching to [.data.frame; using '...' stops test 147
133  if (!cedta()) {
134    # Fix for #500 (to do)
135    Nargs = nargs() - (!missing(drop))
136    ans = if (Nargs<3L) { `[.data.frame`(x,i) }  # drop ignored anyway by DF[i]
137        else if (missing(drop)) `[.data.frame`(x,i,j)
138        else `[.data.frame`(x,i,j,drop)
139    # added is.data.table(ans) check to fix bug #81
140    if (!missing(i) & is.data.table(ans)) setkey(ans,NULL)  # See test 304
141    return(ans)
142  }
143  if (!missing(verbose)) {
144    stopifnot(isTRUEorFALSE(verbose))
145    # set the global verbose option because that is fetched from C code without having to pass it through
146    oldverbose = options(datatable.verbose=verbose)
147    on.exit(options(oldverbose))
148  }
149  .global$print=""
150  missingby = missing(by) && missing(keyby)  # for tests 359 & 590 where passing by=NULL results in data.table not vector
151  if (!missing(keyby)) {
152    if (!missing(by)) stop("Provide either by= or keyby= but not both")
153    if (missing(j)) { warning("Ignoring keyby= because j= is not supplied"); keyby=NULL; }
154    by=bysub=substitute(keyby)
155    keyby=TRUE
156    # Assign to 'by' so that by is no longer missing and we can proceed as if there were one by
157  } else {
158    if (!missing(by) && missing(j)) { warning("Ignoring by= because j= is not supplied"); by=NULL; }
159    by=bysub= if (missing(by)) NULL else substitute(by)
160    keyby=FALSE
161  }
162  bynull = !missingby && is.null(by) #3530
163  byjoin = !is.null(by) && is.symbol(bysub) && bysub==".EACHI"
164  naturaljoin = FALSE
165  names_x = names(x)
166  if (missing(i) && !missing(on)) {
167    tt = eval.parent(.massagei(substitute(on)))
168    if (!is.list(tt) || !length(names(tt))) {
169      warning("When on= is provided but not i=, on= must be a named list or data.table|frame, and a natural join (i.e. join on common names) is invoked. Ignoring on= which is '",class(tt)[1L],"'.")
170      on = NULL
171    } else {
172      i = tt
173      naturaljoin = TRUE
174    }
175  }
176  if (missing(i) && missing(j)) {
177    tt_isub = substitute(i)
178    tt_jsub = substitute(j)
179    if (!is.null(names(sys.call())) &&  # not relying on nargs() as it considers DT[,] to have 3 arguments, #3163
180        tryCatch(!is.symbol(tt_isub), error=function(e)TRUE) &&   # a symbol that inherits missingness from caller isn't missing for our purpose; test 1974
181        tryCatch(!is.symbol(tt_jsub), error=function(e)TRUE)) {
182      warning("i and j are both missing so ignoring the other arguments. This warning will be upgraded to error in future.")
183    }
184    return(x)
185  }
186  if (!mult %chin% c("first","last","all")) stop("mult argument can only be 'first', 'last' or 'all'")
187  missingroll = missing(roll)
188  if (length(roll)!=1L || is.na(roll)) stop("roll must be a single TRUE, FALSE, positive/negative integer/double including +Inf and -Inf or 'nearest'")
189  if (is.character(roll)) {
190    if (roll!="nearest") stop("roll is '",roll,"' (type character). Only valid character value is 'nearest'.")
191  } else {
192    roll = if (isTRUE(roll)) +Inf else as.double(roll)
193  }
194  force(rollends)
195  if (!is.logical(rollends)) stop("rollends must be a logical vector")
196  if (length(rollends)>2L) stop("rollends must be length 1 or 2")
197  if (length(rollends)==1L) rollends=rep.int(rollends,2L)
198  # TO DO (document/faq/example). Removed for now ... if ((roll || rolltolast) && missing(mult)) mult="last" # for when there is exact match to mult. This does not control cases where the roll is mult, that is always the last one.
199  .unsafe.opt() #3585
200  missingnomatch = missing(nomatch)
201  if (is.null(nomatch)) nomatch = 0L # allow nomatch=NULL API already now, part of: https://github.com/Rdatatable/data.table/issues/857
202  if (!is.na(nomatch) && nomatch!=0L) stop("nomatch= must be either NA or NULL (or 0 for backwards compatibility which is the same as NULL)")
203  nomatch = as.integer(nomatch)
204  if (!is.logical(which) || length(which)>1L) stop("which= must be a logical vector length 1. Either FALSE, TRUE or NA.")
205  if ((isTRUE(which)||is.na(which)) && !missing(j)) stop("which==",which," (meaning return row numbers) but j is also supplied. Either you need row numbers or the result of j, but only one type of result can be returned.")
206  if (!is.na(nomatch) && is.na(which)) stop("which=NA with nomatch=0 would always return an empty vector. Please change or remove either which or nomatch.")
207  if (!with && missing(j)) stop("j must be provided when with=FALSE")
208  irows = NULL  # Meaning all rows. We avoid creating 1:nrow(x) for efficiency.
209  notjoin = FALSE
210  rightcols = leftcols = integer()
211  optimizedSubset = FALSE ## flag: tells whether a normal query was optimized into a join.
212  ..syms = NULL
213  av = NULL
214  jsub = NULL
215  if (!missing(j)) {
216    jsub = replace_dot_alias(substitute(j))
217    root = if (is.call(jsub)) as.character(jsub[[1L]])[1L] else ""
218    if (root == ":" ||
219        (root %chin% c("-","!") && jsub[[2L]] %iscall% '(' && jsub[[2L]][[2L]] %iscall% ':') ||
220        ( (!length(av<-all.vars(jsub)) || all(substring(av,1L,2L)=="..")) &&
221          root %chin% c("","c","paste","paste0","-","!") &&
222          missingby )) {   # test 763. TODO: likely that !missingby iff with==TRUE (so, with can be removed)
223      # When no variable names (i.e. symbols) occur in j, scope doesn't matter because there are no symbols to find.
224      # If variable names do occur, but they are all prefixed with .., then that means look up in calling scope.
225      # Automatically set with=FALSE in this case so that DT[,1], DT[,2:3], DT[,"someCol"] and DT[,c("colB","colD")]
226      # work as expected.  As before, a vector will never be returned, but a single column data.table
227      # for type consistency with >1 cases. To return a single vector use DT[["someCol"]] or DT[[3]].
228      # The root==":" is to allow DT[,colC:colH] even though that contains two variable names.
229      # root == "-" or "!" is for tests 1504.11 and 1504.13 (a : with a ! or - modifier root)
230      # We don't want to evaluate j at all in making this decision because i) evaluating could itself
231      # increment some variable and not intended to be evaluated a 2nd time later on and ii) we don't
232      # want decisions like this to depend on the data or vector lengths since that can introduce
233      # inconsistency reminiscent of drop=TRUE in [.data.frame that we seek to avoid.
234      with=FALSE
235      if (length(av)) {
236        for (..name in av) {
237          name = substring(..name, 3L)
238          if (name=="") stop("The symbol .. is invalid. The .. prefix must be followed by at least one character.")
239          if (!exists(name, where=parent.frame())) {
240            stop("Variable '",name,"' is not found in calling scope. Looking in calling scope because you used the .. prefix.",
241              if (exists(..name, where=parent.frame()))
242                paste0(" Variable '..",name,"' does exist in calling scope though, so please just removed the .. prefix from that variable name in calling scope.")
243                # We have recommended 'manual' .. prefix in the past, so try to be helpful
244              else
245                ""
246            )
247          } else if (exists(..name, where=parent.frame())) {
248            warning("Both '",name,"' and '..", name, "' exist in calling scope. Please remove the '..", name,"' variable in calling scope for clarity.")
249          }
250        }
251        ..syms = av
252      }
253    } else if (is.name(jsub)) {
254      if (substring(jsub, 1L, 2L) == "..") stop("Internal error:  DT[, ..var] should be dealt with by the branch above now.") # nocov
255      if (!with && !exists(as.character(jsub), where=parent.frame()))
256        stop("Variable '",jsub,"' is not found in calling scope. Looking in calling scope because you set with=FALSE. Also, please use .. symbol prefix and remove with=FALSE.")
257    }
258    if (root=="{") {
259      if (length(jsub) == 2L) {
260        jsub = jsub[[2L]]  # to allow {} wrapping of := e.g. [,{`:=`(...)},] [#376]
261        root = if (is.call(jsub)) as.character(jsub[[1L]])[1L] else ""
262      } else if (length(jsub) > 2L && jsub[[2L]] %iscall% ":=") {
263        #2142 -- j can be {} and have length 1
264        stop("You have wrapped := with {} which is ok but then := must be the only thing inside {}. You have something else inside {} as well. Consider placing the {} on the RHS of := instead; e.g. DT[,someCol:={tmpVar1<-...;tmpVar2<-...;tmpVar1*tmpVar2}")
265      }
266    }
267    if (root=="eval" && !any(all.vars(jsub[[2L]]) %chin% names_x)) {
268      # TODO: this && !any depends on data. Can we remove it?
269      # Grab the dynamic expression from calling scope now to give the optimizer a chance to optimize it
270      # Only when top level is eval call.  Not nested like x:=eval(...) or `:=`(x=eval(...), y=eval(...))
271      jsub = eval(jsub[[2L]], parent.frame(), parent.frame())  # this evals the symbol to return the dynamic expression
272      if (is.expression(jsub)) jsub = jsub[[1L]]    # if expression, convert it to call
273      # Note that the dynamic expression could now be := (new in v1.9.7)
274      root = if (is.call(jsub)) {
275        jsub = replace_dot_alias(jsub)
276        as.character(jsub[[1L]])[1L]
277      } else ""
278    }
279    if (root == ":=") {
280      allow.cartesian=TRUE   # (see #800)
281      if (!missing(i) && keyby)
282        stop(":= with keyby is only possible when i is not supplied since you can't setkey on a subset of rows. Either change keyby to by or remove i")
283      if (!missingnomatch) {
284        warning("nomatch isn't relevant together with :=, ignoring nomatch")
285        nomatch=0L
286      }
287    }
288  }
289
290  # setdiff removes duplicate entries, which'll create issues with duplicated names. Use %chin% instead.
291  dupdiff = function(x, y) x[!x %chin% y]
292
293  if (!missing(i)) {
294    xo = NULL
295    isub = substitute(i)
296    if (identical(isub, NA)) {
297      # only possibility *isub* can be NA (logical) is the symbol NA itself; i.e. DT[NA]
298      # replace NA in this case with NA_integer_ as that's almost surely what user intended to
299      # return a single row with NA in all columns. (DT[0] returns an empty table, with correct types.)
300      # Any expression (including length 1 vectors) that evaluates to a single NA logical will
301      # however be left as NA logical since that's important for consistency to return empty in that
302      # case; e.g. DT[Col==3] where DT is 1 row and Col contains NA.
303      # Replacing the NA symbol makes DT[NA] and DT[c(1,NA)] consistent and provides
304      # an easy way to achieve a single row of NA as users expect rather than requiring them
305      # to know and change to DT[NA_integer_].
306      isub=NA_integer_
307    }
308    isnull_inames = FALSE
309    # Fixes 4994: a case where quoted expression with a "!", ex: expr = quote(!dt1); dt[eval(expr)] requires
310    # the "eval" to be checked before `as.name("!")`. Therefore interchanged.
311    restore.N = remove.N = FALSE
312    if (exists(".N", envir=parent.frame(), inherits=FALSE)) {
313      old.N = get(".N", envir=parent.frame(), inherits=FALSE)
314      locked.N = bindingIsLocked(".N", parent.frame())
315      if (locked.N) eval(call("unlockBinding", ".N", parent.frame()))  # eval call to pass R CMD check NOTE until we find cleaner way
316      assign(".N", nrow(x), envir=parent.frame(), inherits=FALSE)
317      restore.N = TRUE
318      # the comment below is invalid hereafter (due to fix for #1145)
319      # binding locked when .SD[.N] but that's ok as that's the .N we want anyway
320
321      # TO DO: change isub at C level s/.N/nrow(x); changing a symbol to a constant should be ok
322    } else {
323       assign(".N", nrow(x), envir=parent.frame(), inherits=FALSE)
324       remove.N = TRUE
325    }
326    if (isub %iscall% "eval") {  # TO DO: or ..()
327      isub = eval(.massagei(isub[[2L]]), parent.frame(), parent.frame())
328      if (is.expression(isub)) isub=isub[[1L]]
329    }
330    if (isub %iscall% "!") {
331      notjoin = TRUE
332      if (!missingnomatch) stop("not-join '!' prefix is present on i but nomatch is provided. Please remove nomatch.");
333      nomatch = 0L
334      isub = isub[[2L]]
335      # #932 related so that !(v1 == 1) becomes v1 == 1 instead of (v1 == 1) after removing "!"
336      if (isub %iscall% "(" && !is.name(isub[[2L]]))
337        isub = isub[[2L]]
338    }
339
340    if (is.null(isub)) return( null.data.table() )
341
342    if (length(o <- .prepareFastSubset(isub = isub, x = x,
343                                              enclos =  parent.frame(),
344                                              notjoin = notjoin, verbose = verbose))){
345      ## redirect to the is.data.table(x) == TRUE branch.
346      ## Additional flag to adapt things after bmerge:
347      optimizedSubset = TRUE
348      notjoin = o$notjoin
349      i = o$i
350      on = o$on
351      ## the following two are ignored if i is not a data.table.
352      ## Since we are converting i to data.table, it is important to set them properly.
353      nomatch = 0L
354      mult = "all"
355    }
356    else if (!is.name(isub)) {
357      ienv = new.env(parent=parent.frame())
358      if (getOption("datatable.optimize")>=1L) assign("order", forder, ienv)
359      i = tryCatch(eval(.massagei(isub), x, ienv), error=function(e) {
360        if (grepl(":=.*defined for use in j.*only", e$message))
361          stop("Operator := detected in i, the first argument inside DT[...], but is only valid in the second argument, j. Most often, this happens when forgetting the first comma (e.g. DT[newvar := 5] instead of DT[ , new_var := 5]). Please double-check the syntax. Run traceback(), and debugger() to get a line number.")
362        else
363          .checkTypos(e, names_x)
364      })
365    } else {
366      # isub is a single symbol name such as B in DT[B]
367      i = try(eval(isub, parent.frame(), parent.frame()), silent=TRUE)
368      if (inherits(i,"try-error")) {
369        # must be "not found" since isub is a mere symbol
370        col = try(eval(isub, x), silent=TRUE)  # is it a column name?
371        msg = if (inherits(col,"try-error")) " and it is not a column name either."
372        else paste0(" but it is a column of type ", typeof(col),". If you wish to select rows where that column contains TRUE",
373                    ", or perhaps that column contains row numbers of itself to select, try DT[(col)], DT[DT$col], or DT[col==TRUE] is particularly clear and is optimized.")
374        stop(as.character(isub), " is not found in calling scope", msg,
375             " When the first argument inside DT[...] is a single symbol (e.g. DT[var]), data.table looks for var in calling scope.")
376      }
377    }
378    if (restore.N) {
379      assign(".N", old.N, envir=parent.frame())
380      if (locked.N) lockBinding(".N", parent.frame())
381    }
382    if (remove.N) rm(list=".N", envir=parent.frame())
383    if (is.matrix(i)) {
384      if (is.numeric(i) && ncol(i)==1L) { # #826 - subset DT on single integer vector stored as matrix
385        i = as.integer(i)
386      } else {
387        stop("i is invalid type (matrix). Perhaps in future a 2 column matrix could return a list of elements of DT (in the spirit of A[B] in FAQ 2.14). Please report to data.table issue tracker if you'd like this, or add your comments to FR #657.")
388      }
389    }
390    if (is.logical(i)) {
391      if (notjoin) {
392        notjoin = FALSE
393        i = !i
394      }
395    }
396    if (is.null(i)) return( null.data.table() )
397    if (is.character(i)) {
398      isnull_inames = TRUE
399      i = data.table(V1=i)   # for user convenience; e.g. DT["foo"] without needing DT[.("foo")]
400    } else if (identical(class(i),"list") && length(i)==1L && is.data.frame(i[[1L]])) { i = as.data.table(i[[1L]]) }
401    else if (identical(class(i),"data.frame")) { i = as.data.table(i) }   # TO DO: avoid these as.data.table() and use a flag instead
402    else if (identical(class(i),"list")) {
403      isnull_inames = is.null(names(i))
404      i = as.data.table(i)
405    }
406
407    if (is.data.table(i)) {
408      if (missing(on)) {
409        if (!haskey(x)) {
410          stop("When i is a data.table (or character vector), the columns to join by must be specified using 'on=' argument (see ?data.table), by keying x (i.e. sorted, and, marked as sorted, see ?setkey), or by sharing column names between x and i (i.e., a natural join). Keyed joins might have further speed benefits on very large data due to x being sorted in RAM.")
411        }
412      } else if (identical(substitute(on), as.name(".NATURAL"))) {
413        naturaljoin = TRUE
414      }
415      if (naturaljoin) { # natural join #629
416        common_names = intersect(names_x, names(i))
417        len_common_names = length(common_names)
418        if (!len_common_names) stop("Attempting to do natural join but no common columns in provided tables")
419        if (verbose) {
420          which_cols_msg = if (len_common_names == length(x)) " all 'x' columns"
421          else paste(":", brackify(common_names))
422          cat("Joining but 'x' has no key, natural join using", which_cols_msg, "\n", sep = "")
423        }
424        on = common_names
425      }
426      if (!missing(on)) {
427        # on = .() is now possible, #1257
428        on_ops = .parse_on(substitute(on), isnull_inames)
429        on = on_ops[[1L]]
430        ops = on_ops[[2L]]
431        if (any(ops > 1L)) { ## fix for #4489;  ops = c("==", "<=", "<", ">=", ">", "!=")
432          allow.cartesian = TRUE
433        }
434        # TODO: collect all '==' ops first to speeden up Cnestedid
435        rightcols = colnamesInt(x, names(on), check_dups=FALSE)
436        leftcols  = colnamesInt(i, unname(on), check_dups=FALSE)
437      } else {
438        ## missing on
439        rightcols = chmatch(key(x), names_x)   # NAs here (i.e. invalid data.table) checked in bmerge()
440        leftcols = if (haskey(i))
441          chmatch(head(key(i), length(rightcols)), names(i))
442        else
443          seq_len(min(length(i),length(rightcols)))
444        rightcols = head(rightcols,length(leftcols))
445        ops = rep(1L, length(leftcols))
446      }
447      # Implementation for not-join along with by=.EACHI, #604
448      if (notjoin && (byjoin || mult != "all")) { # mult != "all" needed for #1571
449        notjoin = FALSE
450        if (verbose) {last.started.at=proc.time();cat("not-join called with 'by=.EACHI'; Replacing !i with i=setdiff_(x,i) ...");flush.console()}
451        orignames = copy(names(i))
452        i = setdiff_(x, i, rightcols, leftcols) # part of #547
453        if (verbose) {cat("done in",timetaken(last.started.at),"\n"); flush.console()}
454        setnames(i, orignames[leftcols])
455        setattr(i, 'sorted', names(i)) # since 'x' has key set, this'll always be sorted
456      }
457      i = .shallow(i, retain.key = TRUE)
458      ans = bmerge(i, x, leftcols, rightcols, roll, rollends, nomatch, mult, ops, verbose=verbose)
459      xo = ans$xo ## to make it available for further use.
460      # temp fix for issue spotted by Jan, test #1653.1. TODO: avoid this
461      # 'setorder', as there's another 'setorder' in generating 'irows' below...
462      if (length(ans$indices)) setorder(setDT(ans[1L:3L]), indices)
463      allLen1 = ans$allLen1
464      f__ = ans$starts
465      len__ = ans$lens
466      allGrp1 = all(ops==1L) # was previously 'ans$allGrp1'. Fixing #1991. TODO: Revisit about allGrp1 possibility for speedups in certain cases when I find some time.
467      indices__ = if (length(ans$indices)) ans$indices else seq_along(f__) # also for #1991 fix
468      # length of input nomatch (single 0 or NA) is 1 in both cases.
469      # When no match, len__ is 0 for nomatch=0 and 1 for nomatch=NA, so len__ isn't .N
470      # If using secondary key of x, f__ will refer to xo
471      if (is.na(which)) {
472        w = if (notjoin) f__!=0L else is.na(f__)
473        return( if (length(xo)) fsort(xo[w], internal=TRUE) else which(w) )
474      }
475      if (mult=="all") {
476        # is by=.EACHI along with non-equi join?
477        nqbyjoin = byjoin && length(ans$indices) && !allGrp1
478        if (!byjoin || nqbyjoin) {
479          # Really, `anyDuplicated` in base is AWESOME!
480          # allow.cartesian shouldn't error if a) not-join, b) 'i' has no duplicates
481          if (verbose) {last.started.at=proc.time();cat("Constructing irows for '!byjoin || nqbyjoin' ... ");flush.console()}
482          irows = if (allLen1) f__ else vecseq(f__,len__,
483            if (allow.cartesian ||
484                notjoin || # #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x).
485                !anyDuplicated(f__, incomparables = c(0L, NA_integer_))) {
486              NULL # #742. If 'i' has no duplicates, ignore
487            } else as.double(nrow(x)+nrow(i))) # rows in i might not match to x so old max(nrow(x),nrow(i)) wasn't enough. But this limit now only applies when there are duplicates present so the reason now for nrow(x)+nrow(i) is just to nail it down and be bigger than max(nrow(x),nrow(i)).
488          if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()}
489          # Fix for #1092 and #1074
490          # TODO: implement better version of "any"/"all"/"which" to avoid
491          # unnecessary construction of logical vectors
492          if (identical(nomatch, 0L) && allLen1) irows = irows[irows != 0L]
493        } else {
494          if (length(xo) && missing(on))
495            stop("Internal error. Cannot by=.EACHI when joining to a secondary key, yet") # nocov
496          # since f__ refers to xo later in grouping, so xo needs to be passed through to dogroups too.
497          if (length(irows))
498            stop("Internal error. irows has length in by=.EACHI") # nocov
499        }
500        if (nqbyjoin) {
501          irows = if (length(xo)) xo[irows] else irows
502          xo = setorder(setDT(list(indices=rep.int(indices__, len__), irows=irows)))[["irows"]]
503          ans = .Call(CnqRecreateIndices, xo, len__, indices__, max(indices__), nomatch) # issue#4388 fix
504          f__ = ans[[1L]]; len__ = ans[[2L]]
505          allLen1 = FALSE # TODO; should this always be FALSE?
506          irows = NULL # important to reset
507          if (any_na(as_list(xo))) xo = xo[!is.na(xo)]
508        }
509      } else {
510        if (!byjoin) { #1287 and #1271
511          irows = f__ # len__ is set to 1 as well, no need for 'pmin' logic
512          if (identical(nomatch,0L)) irows = irows[len__>0L]  # 0s are len 0, so this removes -1 irows
513        }
514        # TODO: when nomatch=NA, len__ need not be allocated / set at all for mult="first"/"last"?
515        # TODO: how about when nomatch=0L, can we avoid allocating then as well?
516      }
517      if (length(xo) && length(irows)) {
518        irows = xo[irows]   # TO DO: fsort here?
519        if (mult=="all" && !allGrp1) { # following #1991 fix, !allGrp1 will always be TRUE. TODO: revisit.
520          if (verbose) {last.started.at=proc.time();cat("Reorder irows for 'mult==\"all\" && !allGrp1' ... ");flush.console()}
521          irows = setorder(setDT(list(indices=rep.int(indices__, len__), irows=irows)))[["irows"]]
522          if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()}
523        }
524      }
525      if (optimizedSubset){
526        ## special treatment for calls like DT[x == 3] that are transformed into DT[J(x=3), on = "x==x"]
527
528        if(!.Call(CisOrderedSubset, irows, nrow(x))){
529          ## restore original order. This is a very expensive operation.
530          ## benchmarks have shown that starting with 1e6 irows, a tweak can significantly reduce time
531          ## (see #2366)
532          if (verbose) {last.started.at=proc.time()[3L];cat("Reordering", length(irows), "rows after bmerge done in ... ");flush.console()}
533          if(length(irows) < 1e6){
534            irows = fsort(irows, internal=TRUE) ## internally, fsort on integer falls back to forderv
535            } else {
536              irows = as.integer(fsort(as.numeric(irows))) ## nocov; parallelized for numeric, but overhead of type conversion
537            }
538          if (verbose) {cat(round(proc.time()[3L]-last.started.at,3L),"secs\n");flush.console()}
539        }
540        ## make sure, all columns are taken from x and not from i.
541        ## This is done by simply telling data.table to continue as if there was a simple subset
542        leftcols  = integer(0L)
543        rightcols = integer(0L)
544        i = irows ## important to make i not a data.table because otherwise Gforce doesn't kick in
545      }
546    }
547    else {
548      if (!missing(on)) {
549        stop("logical error. i is not a data.table, but 'on' argument is provided.")
550      }
551      # TO DO: TODO: Incorporate which_ here on DT[!i] where i is logical. Should avoid i = !i (above) - inefficient.
552      # i is not a data.table
553      if (!is.logical(i) && !is.numeric(i)) stop("i has evaluated to type ", typeof(i), ". Expecting logical, integer or double.")
554      if (is.logical(i)) {
555        if (length(i)==1L  # to avoid unname copy when length(i)==nrow (normal case we don't want to slow down)
556          && isTRUE(unname(i))) { irows=i=NULL }  # unname() for #2152 - length 1 named logical vector.
557        # NULL is efficient signal to avoid creating 1:nrow(x) but still return all rows, fixes #1249
558
559        else if (length(i)<=1L) { irows=i=integer(0L) }
560        # FALSE, NA and empty. All should return empty data.table. The NA here will be result of expression,
561        # where for consistency of edge case #1252 all NA to be removed. If NA is a single NA symbol, it
562        # was auto converted to NA_integer_ higher up for ease of use and convenience. We definitely
563        # don't want base R behaviour where DF[NA,] returns an entire copy filled with NA everywhere.
564
565        else if (length(i)==nrow(x)) { irows=i=which(i) }
566        # The which() here auto removes NA for convenience so user doesn't need to remember "!is.na() & ..."
567        # Also this which() is for consistency of DT[colA>3,which=TRUE] and which(DT[,colA>3])
568        # Assigning to 'i' here as well to save memory, #926.
569
570        else stop("i evaluates to a logical vector length ", length(i), " but there are ", nrow(x), " rows. Recycling of logical i is no longer allowed as it hides more bugs than is worth the rare convenience. Explicitly use rep(...,length=.N) if you really need to recycle.")
571      } else {
572        irows = as.integer(i)  # e.g. DT[c(1,3)] and DT[c(-1,-3)] ok but not DT[c(1,-3)] (caught as error)
573        irows = .Call(CconvertNegAndZeroIdx, irows, nrow(x), is.null(jsub) || root!=":=")  # last argument is allowOverMax (NA when selecting, error when assigning)
574        # simplifies logic from here on: can assume positive subscripts (no zeros)
575        # maintains Arun's fix for #2697 (test 1042)
576        # efficient in C with more detailed helpful messages when user mixes positives and negatives
577        # falls through quickly (no R level allocs) if all items are within range [1,max] with no zeros or negatives
578        # minor TO DO: can we merge this with check_idx in fcast.c/subset ?
579      }
580    }
581    if (notjoin) {
582      if (byjoin || !is.integer(irows) || is.na(nomatch)) stop("Internal error: notjoin but byjoin or !integer or nomatch==NA") # nocov
583      irows = irows[irows!=0L]
584      if (verbose) {last.started.at=proc.time()[3L];cat("Inverting irows for notjoin done in ... ");flush.console()}
585      i = irows = if (length(irows)) seq_len(nrow(x))[-irows] else NULL  # NULL meaning all rows i.e. seq_len(nrow(x))
586      if (verbose) cat(round(proc.time()[3L]-last.started.at, 3L), "sec\n")
587      leftcols = integer()  # proceed as if row subset from now on, length(leftcols) is switched on later
588      rightcols = integer()
589      # Doing this once here, helps speed later when repeatedly subsetting each column. R's [irows] would do this for each
590      # column when irows contains negatives.
591    }
592    if (which) return( if (is.null(irows)) seq_len(nrow(x)) else irows )
593  } else {  # missing(i)
594    i = NULL
595  }
596  names_i = names(i) # value is now stable
597  byval = NULL
598  xnrow = nrow(x)
599  xcols = xcolsAns = icols = icolsAns = integer()
600  xdotcols = FALSE
601  # track which columns appear in j through ansvars, and those
602  #   that belong in .SD through sdvars. Mostly these will be the same,
603  #   except in cases like DT[ , lapply(.SD, function(x) x/V1), .SDcols = !'V1'],
604  #   see e.g. #484, #495, #1744, #1965.
605  non_sdvars = character(0L)
606  if (missing(j)) {
607    # missingby was already checked above before dealing with i
608    if (!length(x)) return(null.data.table())
609    if (!length(leftcols)) {
610      # basic x[i] subset, #2951
611      if (is.null(irows)) return(shallow(x))   # e.g. DT[TRUE] (#3214); otherwise CsubsetDT would materialize a deep copy
612      else                return(.Call(CsubsetDT, x, irows, seq_along(x)) )
613    } else {
614      jisvars = names_i[-leftcols]
615      tt = jisvars %chin% names_x
616      if (length(tt)) jisvars[tt] = paste0("i.",jisvars[tt])
617      if (length(duprightcols <- rightcols[duplicated(rightcols)])) {
618        nx = c(names_x, names_x[duprightcols])
619        rightcols = chmatchdup(names_x[rightcols], nx)
620        nx = make.unique(nx)
621      } else nx = names_x
622      ansvars = make.unique(c(nx, jisvars))
623      icols = c(leftcols, seq_along(i)[-leftcols])
624      icolsAns = c(rightcols, seq.int(length(nx)+1L, length.out=ncol(i)-length(unique(leftcols))))
625      xcols = xcolsAns = seq_along(x)[-rightcols]
626    }
627    ansvals = chmatch(ansvars, nx)
628  }
629  else {
630    if (is.data.table(i)) {
631      idotprefix = paste0("i.", names_i)
632      xdotprefix = paste0("x.", names_x)
633    } else idotprefix = xdotprefix = character(0L)
634
635    # j was substituted before dealing with i so that := can set allow.cartesian=FALSE (#800) (used above in i logic)
636    if (is.null(jsub)) return(NULL)
637
638    if (!with && jsub %iscall% ":=") {
639      # TODO: make these both errors (or single long error in both cases) in next release.
640      # i.e. using with=FALSE together with := at all will become an error. Eventually with will be removed.
641      if (is.null(names(jsub)) && is.name(jsub[[2L]])) {
642        warning("with=FALSE together with := was deprecated in v1.9.4 released Oct 2014. Please wrap the LHS of := with parentheses; e.g., DT[,(myVar):=sum(b),by=a] to assign to column name(s) held in variable myVar. See ?':=' for other examples. As warned in 2014, this is now a warning.")
643        jsub[[2L]] = eval(jsub[[2L]], parent.frame(), parent.frame())
644      } else {
645        warning("with=FALSE ignored, it isn't needed when using :=. See ?':=' for examples.")
646      }
647      with = TRUE
648    }
649
650    if (!with) {
651      # missingby was already checked above before dealing with i
652      if (jsub %iscall% c("!", "-") && length(jsub)==2L) {  # length 2 to only match unary, #2109
653        notj = TRUE
654        jsub = jsub[[2L]]
655      } else notj = FALSE
656      # fix for #1216, make sure the parentheses are peeled from expr of the form (((1:4)))
657      while (jsub %iscall% "(") jsub = as.list(jsub)[[-1L]]
658      if (jsub %iscall% ":" && length(jsub)==3L) {
659        j = eval(jsub, setattr(as.list(seq_along(x)), 'names', names_x), parent.frame()) # else j will be evaluated for the first time on next line
660      } else {
661        names(..syms) = ..syms
662        j = eval(jsub, lapply(substring(..syms,3L), get, pos=parent.frame()), parent.frame())
663      }
664      if (is.logical(j)) j <- which(j)
665      if (!length(j) && !notj) return( null.data.table() )
666      if (is.factor(j)) j = as.character(j)  # fix for FR: #358
667      if (is.character(j)) {
668        if (notj) {
669          if (anyNA(idx <- chmatch(j, names_x))) warning("column(s) not removed because not found: ", brackify(j[is.na(idx)]))
670          # all duplicates of the name in names(x) must be removed; e.g. data.table(x=1, y=2, x=3)[, !"x"] should just output 'y'.
671          w = !names_x %chin% j
672          ansvars = names_x[w]
673          ansvals = which(w)
674        } else {
675          # if DT[, c("x","x")] and "x" is duplicated in names(DT), we still subset only the first. Because dups are unusual and
676          # it's more common to select the same column a few times. A syntax would be needed to distinguish these intents.
677          ansvars = j   # x. and i. prefixes may be in here, they'll result in NA and will be dealt with further below if length(leftcols)
678          ansvals = chmatch(ansvars, names_x)   # not chmatchdup()
679        }
680        if (!length(ansvals)) return(null.data.table())
681        if (!length(leftcols)) {
682          if (!anyNA(ansvals)) return(.Call(CsubsetDT, x, irows, ansvals))
683          else stop("column(s) not found: ", paste(ansvars[is.na(ansvals)],collapse=", "))
684        }
685        # else the NA in ansvals are for join inherited scope (test 1973), and NA could be in irows from join and data in i should be returned (test 1977)
686        #   in both cases leave to the R-level subsetting of i and x together further below
687      } else if (is.numeric(j)) {
688        j = as.integer(j)
689        if (any(w<-(j>ncol(x)))) stop("Item ",which.first(w)," of j is ",j[which.first(w)]," which is outside the column number range [1,ncol=", ncol(x),"]")
690        j = j[j!=0L]
691        if (any(j<0L)) {
692          if (any(j>0L)) stop("j mixes positives and negatives")
693          j = seq_along(x)[j]  # all j are <0 here
694        }
695        # 3013 -- handle !FALSE in column subset in j via logical+with
696        if (notj) j = seq_along(x)[if (length(j)) -j else TRUE]
697        if (!length(j)) return(null.data.table())
698        return(.Call(CsubsetDT, x, irows, j))
699      } else {
700        stop("When with=FALSE, j-argument should be of type logical/character/integer indicating the columns to select.") # fix for #1440.
701      }
702    } else {   # with=TRUE and byjoin could be TRUE
703      bynames = NULL
704      allbyvars = NULL
705      if (byjoin) {
706        bynames = names_x[rightcols]
707      } else if (!missingby) {
708        # deal with by before j because we need byvars when j contains .SD
709        # may evaluate to NULL | character() | "" | list(), likely a result of a user expression where no-grouping is one case being loop'd through
710        bysubl = as.list.default(bysub)
711        bysuborig = bysub
712        if (is.name(bysub) && !(bysub %chin% names_x)) {  # TO DO: names(x),names(i),and i. and x. prefixes
713          bysub = eval(bysub, parent.frame(), parent.frame())
714          # fix for # 5106 - http://stackoverflow.com/questions/19983423/why-by-on-a-vector-not-from-a-data-table-column-is-very-slow
715          # case where by=y where y is not a column name, and not a call/symbol/expression, but an atomic vector outside of DT.
716          # note that if y is a list, this'll return an error (not sure if it should).
717          if (is.atomic(bysub)) bysubl = list(bysuborig) else bysubl = as.list.default(bysub)
718        }
719        if (length(bysubl) && identical(bysubl[[1L]],quote(eval))) {    # TO DO: or by=..()
720          bysub = eval(bysubl[[2L]], parent.frame(), parent.frame())
721          bysub = replace_dot_alias(bysub) # fix for #1298
722          if (is.expression(bysub)) bysub=bysub[[1L]]
723          bysubl = as.list.default(bysub)
724        } else if (bysub %iscall% c("c","key","names", "intersect", "setdiff")) {
725          # catch common cases, so we don't have to copy x[irows] for all columns
726          # *** TO DO ***: try() this eval first (as long as not list() or .()) and see if it evaluates to column names
727          # to avoid the explicit c,key,names which already misses paste("V",1:10) for example
728          #        tried before but since not wrapped in try() it failed on some tests
729          # or look for column names used in this by (since if none it wouldn't find column names anyway
730          # when evaled within full x[irows]).  Trouble is that colA%%2L is a call and should be within frame.
731          tt = eval(bysub, parent.frame(), parent.frame())
732          if (!is.character(tt)) stop("by=c(...), key(...) or names(...) must evaluate to 'character'")
733          bysub=tt
734        } else if (is.call(bysub) && !(bysub[[1L]] %chin% c("list", "as.list", "{", ".", ":"))) {
735          # potential use of function, ex: by=month(date). catch it and wrap with "(", because we need to set "bysameorder" to FALSE as we don't know if the function will return ordered results just because "date" is ordered. Fixes #2670.
736          bysub = as.call(c(as.name('('), list(bysub)))
737          bysubl = as.list.default(bysub)
738        } else if (bysub %iscall% ".") bysub[[1L]] = quote(list)
739
740        if (mode(bysub) == "character") {
741          if (length(grep(",", bysub, fixed = TRUE))) {
742            if (length(bysub)>1L) stop("'by' is a character vector length ",length(bysub)," but one or more items include a comma. Either pass a vector of column names (which can contain spaces, but no commas), or pass a vector length 1 containing comma separated column names. See ?data.table for other possibilities.")
743            bysub = strsplit(bysub,split=",")[[1L]]
744          }
745          backtick_idx = grep("^[^`]+$",bysub)
746          if (length(backtick_idx)) bysub[backtick_idx] = paste0("`",bysub[backtick_idx],"`")
747          backslash_idx = grep("\\", bysub, fixed = TRUE)
748          if (length(backslash_idx)) bysub[backslash_idx] = gsub('\\', '\\\\', bysub[backslash_idx], fixed = TRUE)
749          bysub = parse(text=paste0("list(",paste(bysub,collapse=","),")"))[[1L]]
750          bysubl = as.list.default(bysub)
751        }
752        allbyvars = intersect(all.vars(bysub), names_x)
753        orderedirows = .Call(CisOrderedSubset, irows, nrow(x))  # TRUE when irows is NULL (i.e. no i clause). Similar but better than is.sorted(f__)
754        bysameorder = byindex = FALSE
755        if (!bysub %iscall% ":" && ##Fix #4285
756            all(vapply_1b(bysubl, is.name))) {
757          bysameorder = orderedirows && haskey(x) && length(allbyvars) && identical(allbyvars,head(key(x),length(allbyvars)))
758          # either bysameorder or byindex can be true but not both. TODO: better name for bysameorder might be bykeyx
759          if (!bysameorder && keyby && !length(irows) && isTRUE(getOption("datatable.use.index"))) {
760            # TODO: could be allowed if length(irows)>1 but then the index would need to be squashed for use by uniqlist, #3062
761            # find if allbyvars is leading subset of any of the indices; add a trailing "__" to fix #3498 where a longer column name starts with a shorter column name
762            tt = paste0(c(allbyvars,""), collapse="__")
763            w = which.first(substring(paste0(indices(x),"__"),1L,nchar(tt)) == tt)
764            if (!is.na(w)) {
765              byindex = indices(x)[w]
766              if (!length(getindex(x, byindex))) {
767                if (verbose) cat("by index '", byindex, "' but that index has 0 length. Ignoring.\n", sep="")
768                byindex=FALSE
769              }
770            }
771          }
772        }
773
774        if (is.null(irows)) {
775          if (bysub %iscall% ':' && length(bysub)==3L && is.name(bysub[[2L]]) && is.name(bysub[[3L]])) {
776            byval = eval(bysub, setattr(as.list(seq_along(x)), 'names', names_x), parent.frame())
777            byval = as.list(x)[byval]
778          } else byval = eval(bysub, x, parent.frame())
779        } else {
780          # length 0 when i returns no rows
781          if (!is.integer(irows)) stop("Internal error: irows isn't integer") # nocov
782          # Passing irows as i to x[] below has been troublesome in a rare edge case.
783          # irows may contain NA, 0, negatives and >nrow(x) here. That's all ok.
784          # But we may need i join column values to be retained (where those rows have no match), hence we tried eval(isub)
785          # in 1.8.3, but this failed test 876.
786          # TO DO: Add a test like X[i,sum(v),by=i.x2], or where by includes a join column (both where some i don't match).
787          # TO DO: Make xss directly, rather than recursive call.
788          if (!is.na(nomatch)) irows = irows[irows!=0L]   # TO DO: can be removed now we have CisSortedSubset
789          if (length(allbyvars)) {    ###############  TO DO  TO DO  TO DO  ###############
790            if (verbose) cat("i clause present and columns used in by detected, only these subset:",paste(allbyvars,collapse=","),"\n")
791            xss = x[irows,allbyvars,with=FALSE,nomatch=nomatch,mult=mult,roll=roll,rollends=rollends]
792          } else {
793            if (verbose) cat("i clause present but columns used in by not detected. Having to subset all columns before evaluating 'by': '",deparse(by),"'\n",sep="")
794            xss = x[irows,nomatch=nomatch,mult=mult,roll=roll,rollends=rollends]
795          }
796          if (bysub %iscall% ':' && length(bysub)==3L) {
797            byval = eval(bysub, setattr(as.list(seq_along(xss)), 'names', names(xss)), parent.frame())
798            byval = as.list(xss)[byval]
799          } else byval = eval(bysub, xss, parent.frame())
800          xnrow = nrow(xss)
801          # TO DO: pass xss (x subset) through into dogroups. Still need irows there (for :=), but more condense
802          # and contiguous to use xss to form .SD in dogroups than going via irows
803        }
804        if (!length(byval) && xnrow>0L) {
805          # see missingby up above for comments
806          # by could be NULL or character(0L) for example (e.g. passed in as argument in a loop of different bys)
807          bysameorder = FALSE  # 1st and only group is the entire table, so could be TRUE, but FALSE to avoid
808                     # a key of empty character()
809          byval = list()
810          bynames = allbyvars = NULL
811          # the rest now fall through
812        } else bynames = names(byval)
813        if (is.atomic(byval)) {
814          if (is.character(byval) && length(byval)<=ncol(x) && !(is.name(bysub) && bysub %chin% names_x) ) {
815            stop("'by' appears to evaluate to column names but isn't c() or key(). Use by=list(...) if you can. Otherwise, by=eval",deparse(bysub)," should work. This is for efficiency so data.table can detect which columns are needed.")
816          } else {
817            # by may be a single unquoted column name but it must evaluate to list so this is a convenience to users. Could also be a single expression here such as DT[,sum(v),by=colA%%2]
818            byval = list(byval)
819            bysubl = c(as.name("list"),bysuborig)  # for guessing the column name below
820            if (is.name(bysuborig))
821              bynames = as.character(bysuborig)
822            else
823              bynames = names(byval)
824          }
825        }
826        if (!is.list(byval)) stop("'by' or 'keyby' must evaluate to a vector or a list of vectors (where 'list' includes data.table and data.frame which are lists, too)")
827        if (length(byval)==1L && is.null(byval[[1L]])) bynull=TRUE #3530 when by=(function()NULL)()
828        if (!bynull) for (jj in seq_len(length(byval))) {
829          if (!typeof(byval[[jj]]) %chin% ORDERING_TYPES) stop("column or expression ",jj," of 'by' or 'keyby' is type ",typeof(byval[[jj]]),". Do not quote column names. Usage: DT[,sum(colC),by=list(colA,month(colB))]")
830        }
831        tt = vapply_1i(byval,length)
832        if (any(tt!=xnrow)) stop(gettextf("The items in the 'by' or 'keyby' list are length(s) (%s). Each must be length %d; the same length as there are rows in x (after subsetting if i is provided).", paste(tt, collapse=","), xnrow, domain='R-data.table'))
833        if (is.null(bynames)) bynames = rep.int("",length(byval))
834        if (length(idx <- which(!nzchar(bynames))) && !bynull) {
835          # TODO: improve this and unify auto-naming of jsub and bysub
836          if (is.name(bysubl[[1L]]) && bysubl[[1L]] == '{') bysubl = bysubl[[length(bysubl)]] # fix for #3156
837          for (jj in idx) {
838            # Best guess. Use "month" in the case of by=month(date), use "a" in the case of by=a%%2
839            byvars = all.vars(bysubl[[jj+1L]], functions = TRUE)
840            if (length(byvars) == 1L) tt = byvars
841            else {
842              # take the first variable that is (1) not eval (#3758) and (2) starts with a character that can't start a variable name
843              tt = grep("^eval$|^[^[:alpha:]. ]", byvars, invert=TRUE, value=TRUE)
844              # byvars but exclude functions or `0`+`1` becomes `+`
845              tt = if (length(tt)) tt[1L] else all.vars(bysubl[[jj+1L]])[1L]
846            }
847            # fix for #497
848            if (length(byvars) > 1L && tt %chin% all.vars(jsub, FALSE)) {
849              bynames[jj] = deparse(bysubl[[jj+1L]])
850              if (verbose)
851                cat("by-expression '", bynames[jj], "' is not named, and the auto-generated name '", tt,
852                    "' clashed with variable(s) in j. Therefore assigning the entire by-expression as name.\n", sep="")
853            }
854            else bynames[jj] = tt
855            # if user doesn't like this inferred name, user has to use by=list() to name the column
856          }
857          # Fix for #1334
858          if (any(duplicated(bynames))) {
859            bynames = make.unique(bynames)
860          }
861        }
862        setattr(byval, "names", bynames)  # byval is just a list not a data.table hence setattr not setnames
863      }
864
865      jvnames = NULL
866      drop_dot = function(x) {
867        if (length(x)!=1L) stop("Internal error: drop_dot passed ",length(x)," items")  # nocov
868        if (identical(substring(x<-as.character(x), 1L, 1L), ".") && x %chin% c(".N", ".I", ".GRP", ".NGRP", ".BY"))
869          substring(x, 2L)
870        else
871          x
872      }
873      # handle auto-naming of last item of j (e.g. within {} or if/else, #2478)
874      #   e.g. DT[, .(a=sum(v), v, .N), by=] should create columns named a, v, N
875      do_j_names = function(q) {
876        if (!is.call(q) || !is.name(q[[1L]])) return(q)
877        if (q[[1L]] %chin% c('list', '.')) {
878          q[[1L]] = quote(list)
879          qlen = length(q)
880          if (qlen>1L) {
881            nm = names(q[-1L])   # check list(a=sum(v),v)
882            if (is.null(nm)) nm = rep.int("", qlen-1L)
883            # attempt to auto-name unnamed columns
884            for (jj in which(nm=="")) {
885              thisq = q[[jj + 1L]]
886              if (missing(thisq)) stop(gettextf("Item %d of the .() or list() passed to j is missing", jj, domain="R-data.table")) #3507
887              if (is.name(thisq)) nm[jj] = drop_dot(thisq)
888              # TO DO: if call to a[1] for example, then call it 'a' too
889            }
890            if (!is.null(jvnames) && any(idx <- nm != jvnames))
891              warning("Different branches of j expression produced different auto-named columns: ", brackify(sprintf('%s!=%s', nm[idx], jvnames[idx])), '; using the most "last" names', call. = FALSE)
892            jvnames <<- nm # TODO: handle if() list(a, b) else list(b, a) better
893            setattr(q, "names", NULL)  # drops the names from the list so it's faster to eval the j for each group; reinstated at the end on the result.
894          }
895          return(q) # else empty list is needed for test 468: adding an empty list column
896        }
897        if (q[[1L]] == '{') {
898          if (!is.null(q[[qlen<-length(q)]])) q[[qlen]] = do_j_names(q[[qlen]])
899          return(q)
900        }
901        if (q[[1L]] == 'if') {
902          #explicit NULL would return NULL, assigning NULL would delete that from the expression
903          if (!is.null(q[[3L]])) q[[3L]] = do_j_names(q[[3L]])
904          if (length(q) == 4L && !is.null(q[[4L]])) q[[4L]] = do_j_names(q[[4L]])
905          return(q)
906        }
907        return(q)
908      }
909      if (is.name(jsub)) {
910        # j is a single unquoted column name
911        if (jsub!=".SD") jvnames = drop_dot(jsub)
912        # jsub is list()ed after it's eval'd inside dogroups.
913      } else jsub = do_j_names(jsub) # else maybe a call to transform or something which returns a list.
914      av = all.vars(jsub,TRUE)  # TRUE fixes bug #1294 which didn't see b in j=fns[[b]](c)
915      use.I = ".I" %chin% av
916      if (any(c(".SD","eval","get","mget") %chin% av)) {
917        if (missing(.SDcols)) {
918          # here we need to use 'dupdiff' instead of 'setdiff'. Ex: setdiff(c("x", "x"), NULL) will give 'x'.
919          # slight memory efficiency boost if we only store sdvars if it differs from ansvars, but a bit tricky to examine all the uses here
920          ansvars = sdvars = dupdiff(names_x, union(bynames, allbyvars))   # TO DO: allbyvars here for vars used by 'by'. Document.
921          # just using .SD in j triggers all non-by columns in the subset even if some of
922          # those columns are not used. It would be tricky to detect whether the j expression
923          # really does use all of the .SD columns or not, hence .SDcols for grouping
924          # over a subset of columns
925
926          # all duplicate columns must be matched, because nothing is provided
927          ansvals = chmatchdup(ansvars, names_x)
928        } else {
929          # FR #355 - negative numeric and character indices for SDcols
930          colsub = substitute(.SDcols)
931          # fix for R-Forge #5190. colsub[[1L]] gave error when it's a symbol.
932          if (colsub %iscall% c("!", "-")) {
933            negate_sdcols = TRUE
934            colsub = colsub[[2L]]
935          } else negate_sdcols = FALSE
936          # fix for #1216, make sure the parentheses are peeled from expr of the form (((1:4)))
937          while(colsub %iscall% "(") colsub = as.list(colsub)[[-1L]]
938          if (colsub %iscall% ':' && length(colsub)==3L) {
939            # .SDcols is of the format a:b
940            .SDcols = eval(colsub, setattr(as.list(seq_along(x)), 'names', names_x), parent.frame())
941          } else {
942            if (colsub %iscall% 'patterns') {
943              # each pattern gives a new filter condition, intersect the end result
944              .SDcols = Reduce(intersect, do_patterns(colsub, names_x))
945            } else {
946              .SDcols = eval(colsub, parent.frame(), parent.frame())
947              # allow filtering via function in .SDcols, #3950
948              if (is.function(.SDcols)) {
949                .SDcols = lapply(x, .SDcols)
950                if (any(idx <- vapply_1i(.SDcols, length) > 1L | vapply_1c(.SDcols, typeof) != 'logical' | vapply_1b(.SDcols, anyNA)))
951                  stop("When .SDcols is a function, it is applied to each column; the output of this function must be a non-missing boolean scalar signalling inclusion/exclusion of the column. However, these conditions were not met for: ", brackify(names(x)[idx]))
952                .SDcols = unlist(.SDcols, use.names = FALSE)
953              }
954            }
955          }
956          if (anyNA(.SDcols))
957            stop(".SDcols missing at the following indices: ", brackify(which(is.na(.SDcols))))
958          if (is.logical(.SDcols)) {
959            ansvals = which_(rep(.SDcols, length.out=length(x)), !negate_sdcols)
960            ansvars = sdvars = names_x[ansvals]
961          } else if (is.numeric(.SDcols)) {
962            .SDcols = as.integer(.SDcols)
963            # if .SDcols is numeric, use 'dupdiff' instead of 'setdiff'
964            if (length(unique(sign(.SDcols))) > 1L) stop(".SDcols is numeric but has both +ve and -ve indices")
965            if (any(idx <- abs(.SDcols)>ncol(x) | abs(.SDcols)<1L))
966              stop(".SDcols is numeric but out of bounds [1, ", ncol(x), "] at: ", brackify(which(idx)))
967            ansvars = sdvars = if (negate_sdcols) dupdiff(names_x[-.SDcols], bynames) else names_x[.SDcols]
968            ansvals = if (negate_sdcols) setdiff(seq_along(names(x)), c(.SDcols, which(names(x) %chin% bynames))) else .SDcols
969          } else {
970            if (!is.character(.SDcols)) stop(".SDcols should be column numbers or names")
971            if (!all(idx <- .SDcols %chin% names_x))
972              stop("Some items of .SDcols are not column names: ", brackify(.SDcols[!idx]))
973            ansvars = sdvars = if (negate_sdcols) setdiff(names_x, c(.SDcols, bynames)) else .SDcols
974            # dups = FALSE here. DT[, .SD, .SDcols=c("x", "x")] again doesn't really help with which 'x' to keep (and if '-' which x to remove)
975            ansvals = chmatch(ansvars, names_x)
976          }
977        }
978        # fix for long standing FR/bug, #495 and #484
979        allcols = c(names_x, xdotprefix, names_i, idotprefix)
980        non_sdvars = setdiff(intersect(av, allcols), c(bynames, ansvars))
981
982        # added 'mget' - fix for #994
983        if (any(c("get", "mget") %chin% av)){
984          if (verbose)
985            cat(gettextf("'(m)get' found in j. ansvars being set to all columns. Use .SDcols or a single j=eval(macro) instead. Both will detect the columns used which is important for efficiency.\nOld ansvars: %s \n", brackify(ansvars), domain = "R-data.table"))
986            # get('varname') is too difficult to detect which columns are used in general
987            # eval(macro) column names are detected via the  if jsub[[1]]==eval switch earlier above.
988
989          # Do not include z in .SD when dt[, z := {.SD; get("x")}, .SDcols = "y"] (#2326, #2338)
990          if (jsub %iscall% ":=" && is.symbol(jsub[[2L]])) {
991            jsub_lhs_symbol = as.character(jsub[[2L]])
992            if (jsub_lhs_symbol %chin% non_sdvars) {
993              sdvars = setdiff(sdvars, jsub_lhs_symbol)
994            }
995          }
996
997          if (missing(.SDcols)) {
998            ansvars = setdiff(allcols, bynames) # fix for bug #34
999          } else {
1000            # fixes #4089 - if .SDcols was already evaluated, we do not want the order of the columns to change.
1001            ansvars = union(ansvars, setdiff(setdiff(allcols, ansvars), bynames))
1002          }
1003          non_sdvars = setdiff(ansvars, sdvars)
1004          ansvals = chmatch(ansvars, names_x)
1005          if (verbose) cat(gettextf("New ansvars: %s \n", brackify(ansvars), domain = "R-data.table"))
1006        } else if (length(non_sdvars)) {
1007          # we've a situation like DT[, c(sum(V1), lapply(.SD, mean)), by=., .SDcols=...] or
1008          # DT[, lapply(.SD, function(x) x *v1), by=, .SDcols=...] etc.,
1009          ansvars = union(ansvars, non_sdvars)
1010          ansvals = chmatch(ansvars, names_x)
1011        }
1012        # .SDcols might include grouping columns if users wants that, but normally we expect user not to include them in .SDcols
1013      } else {
1014        if (!missing(.SDcols)) warning("This j doesn't use .SD but .SDcols has been supplied. Ignoring .SDcols. See ?data.table.")
1015        allcols = c(names_x, xdotprefix, names_i, idotprefix)
1016        ansvars = sdvars = setdiff(intersect(av, allcols), bynames)
1017        if (verbose) cat("Detected that j uses these columns:",if (!length(ansvars)) "<none>" else paste(ansvars,collapse=","),"\n")
1018        # using a few named columns will be faster
1019        # Consider:   DT[,max(diff(date)),by=list(month=month(date))]
1020        # and:        DT[,lapply(.SD,sum),by=month(date)]
1021        # We don't want date in .SD in the latter, but we do in the former; hence the union() above.
1022        ansvals = chmatch(ansvars, names_x)
1023      }
1024      # if (!length(ansvars)) Leave ansvars empty. Important for test 607.
1025
1026      lhs = NULL
1027      newnames = NULL
1028      suppPrint = identity
1029      if (length(av) && av[1L] == ":=") {
1030        if (.Call(C_islocked, x)) stop(".SD is locked. Using := in .SD's j is reserved for possible future use; a tortuously flexible way to modify by group. Use := in j directly to modify by group by reference.")
1031        suppPrint = function(x) { .global$print=address(x); x }
1032        # Suppress print when returns ok not on error, bug #2376. Thanks to: http://stackoverflow.com/a/13606880/403310
1033        # All appropriate returns following this point are wrapped; i.e. return(suppPrint(x)).
1034
1035        if (is.null(names(jsub))) {
1036          # regular LHS:=RHS usage, or `:=`(...) with no named arguments (an error)
1037          # `:=`(LHS,RHS) is valid though, but more because can't see how to detect that, than desire
1038          if (length(jsub)!=3L) stop("In `:=`(col1=val1, col2=val2, ...) form, all arguments must be named.")
1039          lhs = jsub[[2L]]
1040          jsub = jsub[[3L]]
1041          if (is.name(lhs)) {
1042            lhs = as.character(lhs)
1043          } else {
1044            # e.g. (MyVar):= or get("MyVar"):=
1045            lhs = eval(lhs, parent.frame(), parent.frame())
1046          }
1047        } else {
1048          # `:=`(c2=1L,c3=2L,...)
1049          lhs = names(jsub)[-1L]
1050          if (any(lhs=="")) stop("In `:=`(col1=val1, col2=val2, ...) form, all arguments must be named.")
1051          names(jsub)=""
1052          jsub[[1L]]=as.name("list")
1053        }
1054        av = all.vars(jsub,TRUE)
1055        if (!is.atomic(lhs)) stop("LHS of := must be a symbol, or an atomic vector (column names or positions).")
1056        if (is.character(lhs)) {
1057          m = chmatch(lhs, names_x)
1058        } else if (is.numeric(lhs)) {
1059          m = as.integer(lhs)
1060          if (any(m<1L | ncol(x)<m)) stop("LHS of := appears to be column positions but are outside [1,ncol] range. New columns can only be added by name.")
1061          lhs = names_x[m]
1062        } else
1063          stop("LHS of := isn't column names ('character') or positions ('integer' or 'numeric')")
1064        if (all(!is.na(m))) {
1065          # updates by reference to existing columns
1066          cols = as.integer(m)
1067          newnames=NULL
1068          if (identical(irows, integer())) {
1069            # Empty integer() means no rows e.g. logical i with only FALSE and NA
1070            # got converted to empty integer() by the which() above
1071            # Short circuit and do-nothing since columns already exist. If some don't
1072            # exist then for consistency with cases where irows is non-empty, we need to create
1073            # them of the right type and populate with NA.  Which will happen via the regular
1074            # alternative branches below, to cover #759.
1075            # We need this short circuit at all just for convenience. Otherwise users may need to
1076            # fix errors in their RHS when called on empty edge cases, even when the result won't be
1077            # used anyway (so it would be annoying to have to fix it.)
1078            if (verbose) {
1079              cat("No rows match i. No new columns to add so not evaluating RHS of :=\n")
1080              cat("Assigning to 0 row subset of",nrow(x),"rows\n")
1081            }
1082            .Call(Cassign, x, irows, NULL, NULL, NULL) # only purpose is to write 0 to .Last.updated
1083            .global$print = address(x)
1084            return(invisible(x))
1085          }
1086        } else {
1087          # Adding new column(s). TO DO: move after the first eval in case the jsub has an error.
1088          newnames=setdiff(lhs, names_x)
1089          m[is.na(m)] = ncol(x)+seq_len(length(newnames))
1090          cols = as.integer(m)
1091          # don't pass verbose to selfrefok here -- only activated when
1092          #   ok=-1 which will trigger setalloccol with verbose in the next
1093          #   branch, which again calls _selfrefok and returns the message then
1094          if ((ok<-selfrefok(x, verbose=FALSE))==0L)   # ok==0 so no warning when loaded from disk (-1) [-1 considered TRUE by R]
1095            warning("Invalid .internal.selfref detected and fixed by taking a (shallow) copy of the data.table so that := can add this new column by reference. At an earlier point, this data.table has been copied by R (or was created manually using structure() or similar). Avoid names<- and attr<- which in R currently (and oddly) may copy the whole data.table. Use set* syntax instead to avoid copying: ?set, ?setnames and ?setattr. If this message doesn't help, please report your use case to the data.table issue tracker so the root cause can be fixed or this message improved.")
1096          if ((ok<1L) || (truelength(x) < ncol(x)+length(newnames))) {
1097            DT = x  # in case getOption contains "ncol(DT)" as it used to.  TODO: warn and then remove
1098            n = length(newnames) + eval(getOption("datatable.alloccol"))  # TODO: warn about expressions and then drop the eval()
1099            # i.e. reallocate at the size as if the new columns were added followed by setalloccol().
1100            name = substitute(x)
1101            if (is.name(name) && ok && verbose) { # && NAMED(x)>0 (TO DO)    # ok here includes -1 (loaded from disk)
1102              cat("Growing vector of column pointers from truelength ", truelength(x), " to ", n, ". A shallow copy has been taken, see ?setalloccol. Only a potential issue if two variables point to the same data (we can't yet detect that well) and if not you can safely ignore this. To avoid this message you could setalloccol() first, deep copy first using copy(), wrap with suppressWarnings() or increase the 'datatable.alloccol' option.\n")
1103              # #1729 -- copying to the wrong environment here can cause some confusion
1104              if (ok == -1L) cat("Note that the shallow copy will assign to the environment from which := was called. That means for example that if := was called within a function, the original table may be unaffected.\n")
1105
1106              # Verbosity should not issue warnings, so cat rather than warning.
1107              # TO DO: Add option 'datatable.pedantic' to turn on warnings like this.
1108
1109              # TO DO ... comments moved up from C ...
1110              # Note that the NAMED(dt)>1 doesn't work because .Call
1111              # always sets to 2 (see R-ints), it seems. Work around
1112              # may be possible but not yet working. When the NAMED test works, we can drop allocwarn argument too
1113              # because that's just passed in as FALSE from [<- where we know `*tmp*` isn't really NAMED=2.
1114              # Note also that this growing will happen for missing columns assigned NULL, too. But so rare, we
1115              # don't mind.
1116            }
1117            setalloccol(x, n, verbose=verbose)   # always assigns to calling scope; i.e. this scope
1118            if (is.name(name)) {
1119              assign(as.character(name),x,parent.frame(),inherits=TRUE)
1120            } else if (name %iscall% c('$', '[[') && is.name(name[[2L]])) {
1121              k = eval(name[[2L]], parent.frame(), parent.frame())
1122              if (is.list(k)) {
1123                origj = j = if (name[[1L]] == "$") as.character(name[[3L]]) else eval(name[[3L]], parent.frame(), parent.frame())
1124                if (is.character(j)) {
1125                  if (length(j)!=1L) stop("Cannot assign to an under-allocated recursively indexed list -- L[[i]][,:=] syntax is only valid when i is length 1, but it's length ", length(j))
1126                  j = match(j, names(k))
1127                  if (is.na(j)) stop("Internal error -- item '", origj, "' not found in names of list") # nocov
1128                }
1129                .Call(Csetlistelt,k,as.integer(j), x)
1130              } else if (is.environment(k) && exists(as.character(name[[3L]]), k)) {
1131                assign(as.character(name[[3L]]), x, k, inherits=FALSE)
1132              }
1133            } # TO DO: else if env$<- or list$<-
1134          }
1135        }
1136      }
1137    }
1138
1139    if (length(ansvars)) {
1140      w = ansvals
1141      if (length(rightcols) && missingby) {
1142        w[ w %in% rightcols ] = NA
1143      }
1144      # patch for #1615. Allow 'x.' syntax. Only useful during join op when x's join col needs to be used.
1145      # Note that I specifically have not implemented x[y, aa, on=c(aa="bb")] to refer to x's join column
1146      # as well because x[i, col] == x[i][, col] will not be TRUE anymore..
1147      if ( any(xdotprefixvals <- ansvars %chin% xdotprefix)) {
1148        w[xdotprefixvals] = chmatch(ansvars[xdotprefixvals], xdotprefix)
1149        xdotcols = TRUE
1150      }
1151      if (!any(wna <- is.na(w))) {
1152        xcols = w
1153        xcolsAns = seq_along(ansvars)
1154        icols = icolsAns = integer()
1155      } else {
1156        if (!length(leftcols)) stop("Internal error -- column(s) not found: ", paste(ansvars[wna],collapse=", ")) # nocov
1157        xcols = w[!wna]
1158        xcolsAns = which(!wna)
1159        map = c(seq_along(i), leftcols)   # this map is to handle dups in leftcols, #3635
1160        names(map) = c(names_i, names_x[rightcols])
1161        w2 = map[ansvars[wna]]
1162        if (any(w2na <- is.na(w2))) {
1163          ivars = paste0("i.", names_i)   # ivars is only used in this branch
1164          ivars[leftcols] = names_i[leftcols]
1165          w2[w2na] = chmatch(ansvars[wna][w2na], ivars)
1166          if (any(w2na <- is.na(w2))) {
1167            ivars[leftcols] = paste0("i.",ivars[leftcols])
1168            w2[w2na] = chmatch(ansvars[wna][w2na], ivars)
1169            if (any(w2na <- is.na(w2))) stop("Internal error -- column(s) not found: ", paste(ansvars[wna][w2na],sep=", ")) # nocov
1170          }
1171        }
1172        icols = w2
1173        icolsAns = which(wna)
1174      }
1175    }
1176  }  # end of  if !missing(j)
1177
1178  SDenv = new.env(parent=parent.frame())
1179  # taking care of warnings for posixlt type, #646
1180  SDenv$strptime = function(x, ...) {
1181    warning("strptime() usage detected and wrapped with as.POSIXct(). This is to minimize the chance of assigning POSIXlt columns, which use 40+ bytes to store one date (versus 8 for POSIXct). Use as.POSIXct() (which will call strptime() as needed internally) to avoid this warning.")
1182    as.POSIXct(base::strptime(x, ...))
1183  }
1184
1185  syms = all.vars(jsub)
1186  syms = syms[ substring(syms,1L,2L)==".." ]
1187  syms = syms[ substring(syms,3L,3L)!="." ]  # exclude ellipsis
1188  for (sym in syms) {
1189    if (sym %chin% names_x) {
1190      # if "..x" exists as column name, use column, for backwards compatibility; e.g. package socialmixr in rev dep checks #2779
1191      next
1192      # TODO in future, as warned in NEWS item for v1.11.0 :
1193      # warning(sym," in j is looking for ",getName," in calling scope, but a column '", sym, "' exists. Column names should not start with ..")
1194    }
1195    getName = substring(sym, 3L)
1196    if (!exists(getName, parent.frame())) {
1197      if (exists(sym, parent.frame())) next  # user did 'manual' prefix; i.e. variable in calling scope has .. prefix
1198      stop("Variable '",getName,"' is not found in calling scope. Looking in calling scope because this symbol was prefixed with .. in the j= parameter.")
1199    }
1200    assign(sym, get(getName, parent.frame()), SDenv)
1201  }
1202  # hash=TRUE (the default) does seem better as expected using e.g. test 645.  TO DO experiment with 'size' argument
1203  if (missingby || bynull || (!byjoin && !length(byval))) {
1204    # No grouping: 'by' = missing | NULL | character() | "" | list()
1205    # Considered passing a one-group to dogroups but it doesn't do the recycling of i within group, that's done here
1206    if (length(ansvars)) {
1207      if (!(length(i) && length(icols))) {
1208        # new in v1.12.0 to redirect to CsubsetDT in this case
1209        if (!identical(xcolsAns, seq_along(xcolsAns)) || length(xcols)!=length(xcolsAns) || length(ansvars)!=length(xcolsAns)) {
1210          stop("Internal error: xcolAns does not pass checks: ", length(xcolsAns), length(ansvars), length(xcols), paste(xcolsAns,collapse=","))   # nocov
1211        }
1212        # Retained from old R way below (test 1542.01 checks shallow at this point)
1213        # ' Temp fix for #921 - skip COPY until after evaluating 'jval' (scroll down).
1214        # ' Unless 'with=FALSE' - can not be expressions but just column names.
1215        ans = if (with && is.null(irows)) shallow(x, xcols) else .Call(CsubsetDT, x, irows, xcols)
1216        setattr(ans, "names", ansvars)
1217      } else {
1218        # length(i) && length(icols)
1219        if (is.null(irows)) {
1220          stop("Internal error: irows is NULL when making join result at R level. Should no longer happen now we use CsubsetDT earlier.")  # nocov
1221          # TODO: Make subsetDT do a shallow copy when irows is NULL (it currently copies). Then copy only when user uses := or set* on the result
1222          # by using NAMED/REFCNT on columns, with warning if they copy. Since then, even foo = DT$b would cause the next set or := to copy that
1223          # column (so the warning is needed). To tackle that, we could have our own DT.NAMED attribute, perhaps.
1224          # Or keep the rule that [.data.table always returns new memory, and create view() or view= as well, maybe cleaner.
1225        }
1226        ans = vector("list", length(ansvars))
1227        ii = rep.int(indices__, len__) # following #1991 fix
1228        # TODO: if (allLen1 && allGrp1 && (is.na(nomatch) || !any(f__==0L))) then ii will be 1:nrow(i)  [nomatch=0 should drop rows in i that have no match]
1229        #       But rather than that complex logic here at R level to catch that and do a shallow copy for efficiency, just do the check inside CsubsetDT
1230        #       to see if it passed 1:nrow(x) and then CsubsetDT should do the shallow copy safely and centrally.
1231        #       That R level branch was taken out in PR #3213
1232
1233        # TO DO: use CsubsetDT twice here and then remove this entire R level branch
1234        for (s in seq_along(icols)) {
1235          target = icolsAns[s]
1236          source = icols[s]
1237          ans[[target]] = .Call(CsubsetVector,i[[source]],ii)  # i.e. i[[source]][ii]
1238        }
1239        for (s in seq_along(xcols)) {
1240          target = xcolsAns[s]
1241          source = xcols[s]
1242          ans[[target]] = .Call(CsubsetVector,x[[source]],irows)   # i.e. x[[source]][irows], but guaranteed new memory even for singleton logicals from R 3.1.0
1243        }
1244        setattr(ans, "names", ansvars)
1245        if (haskey(x)) {
1246          keylen = which.first(!key(x) %chin% ansvars)-1L
1247          if (is.na(keylen)) keylen = length(key(x))
1248          len = length(rightcols)
1249          # fix for #1268, #1704, #1766 and #1823
1250          chk = if (len && !missing(on)) !identical(head(key(x), len), names(on)) else FALSE
1251          if ( (keylen>len || chk) && !.Call(CisOrderedSubset, irows, nrow(x))) {
1252            keylen = if (!chk) len else 0L # fix for #1268
1253          }
1254          ## check key on i as well!
1255          ichk = is.data.table(i) && haskey(i) &&
1256                 identical(head(key(i), length(leftcols)), names_i[leftcols]) # i has the correct key, #3061
1257          if (keylen && (ichk || is.logical(i) || (.Call(CisOrderedSubset, irows, nrow(x)) && ((roll == FALSE) || length(irows) == 1L)))) # see #1010. don't set key when i has no key, but irows is ordered and roll != FALSE
1258            setattr(ans,"sorted",head(key(x),keylen))
1259        }
1260        setattr(ans, "class", class(x)) # fix for #64
1261        setattr(ans, "row.names", .set_row_names(nrow(ans)))
1262        setalloccol(ans)
1263      }
1264
1265      if (!with || missing(j)) return(ans)
1266
1267      SDenv$.SDall = ans
1268      SDenv$.SD = if (length(non_sdvars)) shallow(SDenv$.SDall, sdvars) else SDenv$.SDall
1269      SDenv$.N = nrow(ans)
1270
1271    } else {
1272      SDenv$.SDall = SDenv$.SD = null.data.table()   # no columns used by j so .SD can be empty. Only needs to exist so that we can rely on it being there when locking it below for example. If .SD were used by j, of course then xvars would be the columns and we wouldn't be in this leaf.
1273      SDenv$.N = if (is.null(irows)) nrow(x) else if(!length(irows) || identical(max(irows), 0L)) 0L else length(irows)
1274      # Fix for #963.
1275      # When irows is integer(0L), length(irows) = 0 will result in 0 (as expected).
1276      # Binary search can return all 0 irows when none of the input matches. Instead of doing all(irows==0L) (previous method), which has to allocate a logical vector the size of irows, we can make use of 'max'. If max is 0, we return 0. The condition where only some irows > 0 won't occur.
1277    }
1278    # Temp fix for #921. Allocate `.I` only if j-expression uses it.
1279    SDenv$.I = if (!missing(j) && use.I) seq_len(SDenv$.N) else 0L
1280    SDenv$.GRP = 1L
1281    SDenv$.NGRP = 1L
1282    .Call(C_lock, SDenv$.SD) # used to stop := modifying .SD via j=f(.SD), bug#1727. The more common case of j=.SD[,subcol:=1] was already caught when jsub is inspected for :=.
1283    .Call(C_lock, SDenv$.SDall)
1284    lockBinding(".SD",SDenv)
1285    lockBinding(".SDall",SDenv)
1286    lockBinding(".N",SDenv)
1287    lockBinding(".I",SDenv)
1288    lockBinding(".GRP",SDenv)
1289    lockBinding(".NGRP", SDenv)
1290    for (ii in ansvars) assign(ii, SDenv$.SDall[[ii]], SDenv)
1291    # Since .SD is inside SDenv, alongside its columns as variables, R finds .SD symbol more quickly, if used.
1292    # There isn't a copy of the columns here, the xvar symbols point to the SD columns (copy-on-write).
1293
1294    if (is.name(jsub) && is.null(lhs) && !exists(jsubChar<-as.character(jsub), SDenv, inherits=FALSE)) {
1295      stop("j (the 2nd argument inside [...]) is a single symbol but column name '",jsubChar,"' is not found. Perhaps you intended DT[, ..",jsubChar,"]. This difference to data.frame is deliberate and explained in FAQ 1.1.")
1296    }
1297
1298    jval = eval(jsub, SDenv, parent.frame())
1299    .Call(C_unlock, jval) # in case jval inherits .SD's lock, #1341 #2245. .Call directly (not via an R function like setattr or unlock) to avoid bumping jval's MAYBE_SHARED.
1300
1301    # copy 'jval' when required
1302    # More speedup - only check + copy if irows is NULL
1303    # Temp fix for #921 - check address and copy *after* evaluating 'jval'.  #75 also related.
1304    if (is.null(irows)) {
1305      if (!is.list(jval)) { # performance improvement when i-arg is S4, but not list, #1438, Thanks @DCEmilberg.
1306        jcpy = address(jval) %in% vapply_1c(SDenv$.SD, address) # %chin% errors when RHS is list()
1307        if (jcpy) jval = copy(jval)
1308      } else if (address(jval) == address(SDenv$.SD)) {
1309        jval = copy(jval)
1310      } else if ( length(jcpy <- which(vapply_1c(jval, address) %chin% vapply_1c(SDenv, address))) ) {
1311        for (jidx in jcpy) jval[[jidx]] = copy(jval[[jidx]])
1312      } else if (jsub %iscall% 'get') {
1313        jval = copy(jval) # fix for #1212
1314      }
1315    }
1316
1317    if (!is.null(lhs)) {
1318      # TODO?: use set() here now that it can add new columns. Then remove newnames and alloc logic above.
1319      .Call(Cassign,x,irows,cols,newnames,jval)
1320      return(suppPrint(x))
1321    }
1322    if ((is.call(jsub) && jsub[[1L]] != "get" && is.list(jval) && !is.object(jval)) || !missingby) {
1323      # is.call: selecting from a list column should return list
1324      # is.object: for test 168 and 168.1 (S4 object result from ggplot2::qplot). Just plain list results should result in data.table
1325
1326      # Fix for #813 and #758. Ex: DT[c(FALSE, FALSE), list(integer(0L), y)]
1327      # where DT = data.table(x=1:2, y=3:4) should return an empty data.table!!
1328      if (!is.null(irows) && `||`(
1329          identical(irows, integer(0L)) && !bynull,
1330          length(irows) && !anyNA(irows) && all(irows==0L) ## anyNA() because all() returns NA (not FALSE) when irows is all-NA. TODO: any way to not check all 'irows' values?
1331          ))
1332        jval = lapply(jval, `[`, 0L)
1333      if (is.atomic(jval)) {
1334        setattr(jval,"names",NULL)  # discard names of named vectors otherwise each cell in the column would have a name
1335        jval = list(jval)
1336      }
1337      if (!is.null(jvnames) && !all(jvnames=="")) setattr(jval, 'names', jvnames)  # e.g. jvnames=="N" for DT[,.N,]
1338      jval = as.data.table.list(jval, .named=NULL)
1339    }
1340
1341    if (is.data.table(jval)) {
1342      setattr(jval, 'class', class(x)) # fix for #64
1343      if (haskey(x) && all(key(x) %chin% names(jval)) && is.sorted(jval, by=key(x)))
1344        setattr(jval, 'sorted', key(x))
1345      if (any(sapply(jval, is.null))) stop("Internal error: j has created a data.table result containing a NULL column") # nocov
1346    }
1347    return(jval)
1348  }
1349
1350  ###########################################################################
1351  # Grouping ...
1352  ###########################################################################
1353
1354  o__ = integer()
1355  if (".N" %chin% ansvars) stop("The column '.N' can't be grouped because it conflicts with the special .N variable. Try setnames(DT,'.N','N') first.")
1356  if (".I" %chin% ansvars) stop("The column '.I' can't be grouped because it conflicts with the special .I variable. Try setnames(DT,'.I','I') first.")
1357  SDenv$.iSD = NULL  # null.data.table()
1358  SDenv$.xSD = NULL  # null.data.table() - introducing for FR #2693 and Gabor's post on fixing for FAQ 2.8
1359
1360  SDenv$print = function(x, ...){ base::print(x,...); NULL}
1361  # Now ggplot2 returns data from print, we need a way to throw it away otherwise j accumulates the result
1362
1363  SDenv$.SDall = SDenv$.SD = null.data.table()  # e.g. test 607. Grouping still proceeds even though no .SD e.g. grouping key only tables, or where j consists of .N only
1364  SDenv$.N = vector("integer", 1L)    # explicit new vector (not 0L or as.integer() which might return R's internal small-integer global)
1365  SDenv$.GRP = vector("integer", 1L)  #   because written to by reference at C level (one write per group). TODO: move this alloc to C level
1366
1367  # #3694/#761 common gotcha -- doing t1-t0 by group, but -.POSIXt uses units='auto'
1368  #   independently by group & attr mismatch among groups is ignored. The latter
1369  #   is a more general issue but the former can be fixed by forcing units='secs'
1370  SDenv$`-.POSIXt` = function(e1, e2) {
1371    if (inherits(e2, 'POSIXt')) {
1372      if (verbose && !exists('done_units_report', parent.frame())) {
1373        cat('\nNote: forcing units="secs" on implicit difftime by group; call difftime explicitly to choose custom units')
1374        assign('done_units_report', TRUE, parent.frame())
1375      }
1376      return(difftime(e1, e2, units='secs'))
1377    } else return(base::`-.POSIXt`(e1, e2))
1378  }
1379
1380  if (byjoin) {
1381    # The groupings come instead from each row of the i data.table.
1382    # Much faster for a few known groups vs a 'by' for all followed by a subset
1383    if (!is.data.table(i)) stop("logical error. i is not data.table, but mult='all' and 'by'=.EACHI")
1384    byval = i
1385    bynames = if (missing(on)) head(key(x),length(leftcols)) else names(on)
1386    allbyvars = NULL
1387    bysameorder = haskey(i) || (is.sorted(f__) && ((roll == FALSE) || length(f__) == 1L)) # Fix for #1010
1388    ##  'av' correct here ??  *** TO DO ***
1389    xjisvars = intersect(av, names_x[rightcols])  # no "x." for xvars.
1390    # if 'get' is in 'av' use all cols in 'i', fix for bug #34
1391    # added 'mget' - fix for #994
1392    jisvars = if (any(c("get", "mget") %chin% av)) names_i else intersect(gsub("^i[.]","", setdiff(av, xjisvars)), names_i)
1393    # JIS (non join cols) but includes join columns too (as there are named in i)
1394    if (length(jisvars)) {
1395      tt = min(nrow(i),1L)  # min() is here for when nrow(i)==0
1396      SDenv$.iSD = i[tt,jisvars,with=FALSE]
1397      for (ii in jisvars) {
1398        assign(ii, SDenv$.iSD[[ii]], SDenv)
1399        assign(paste0("i.",ii), SDenv$.iSD[[ii]], SDenv)
1400      }
1401    }
1402
1403  } else {
1404    # Find the groups, using 'byval' ...
1405    if (missingby) stop("Internal error: by= is missing")   # nocov
1406
1407    if (length(byval) && length(byval[[1L]])) {
1408      if (!bysameorder && isFALSE(byindex)) {
1409        if (verbose) {last.started.at=proc.time();cat("Finding groups using forderv ... ");flush.console()}
1410        o__ = forderv(byval, sort=keyby, retGrp=TRUE)
1411        # The sort= argument is called sortGroups at C level. It's primarily for saving the sort of unique strings at
1412        # C level for efficiency when by= not keyby=. Other types also retain appearance order, but at byte level to
1413        # minimize data movement and benefit from skipping subgroups which happen to be grouped but not sorted. This byte
1414        # appearance order is not the same as the order of group values within by= columns, so the 2nd forder below is
1415        # still needed to get the group appearance order. Always passing sort=TRUE above won't change any result at all
1416        # (tested and confirmed), it'll just make by= slower. It must be TRUE when keyby= though since the key is just
1417        # marked afterwards.
1418        # forderv() returns empty integer() if already ordered to save allocating 1:xnrow
1419        bysameorder = orderedirows && !length(o__)
1420        if (verbose) {
1421          cat(timetaken(last.started.at),"\n")
1422          last.started.at=proc.time()
1423          cat("Finding group sizes from the positions (can be avoided to save RAM) ... ")
1424          flush.console()  # for windows
1425        }
1426        f__ = attr(o__, "starts", exact=TRUE)
1427        len__ = uniqlengths(f__, xnrow)
1428        if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()}
1429        if (!bysameorder && !keyby) {
1430          # TO DO: lower this into forder.c
1431          if (verbose) {last.started.at=proc.time();cat("Getting back original order ... ");flush.console()}
1432          firstofeachgroup = o__[f__]
1433          if (length(origorder <- forderv(firstofeachgroup))) {
1434            f__ = f__[origorder]
1435            len__ = len__[origorder]
1436          }
1437          if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()}
1438        }
1439        if (!orderedirows && !length(o__)) o__ = seq_len(xnrow)  # temp fix.  TODO: revist orderedirows
1440      } else {
1441        if (verbose) last.started.at=proc.time();
1442        if (bysameorder) {
1443          if (verbose) {cat("Finding groups using uniqlist on key ... ");flush.console()}
1444          f__ = uniqlist(byval)
1445        } else {
1446          if (!is.character(byindex) || length(byindex)!=1L) stop("Internal error: byindex not the index name")  # nocov
1447          if (verbose) {cat("Finding groups using uniqlist on index '", byindex, "' ... ", sep="");flush.console()}
1448          o__ = getindex(x, byindex)
1449          if (is.null(o__)) stop("Internal error: byindex not found")  # nocov
1450          f__ = uniqlist(byval, order=o__)
1451        }
1452        if (verbose) {
1453          cat(timetaken(last.started.at),"\n")
1454          last.started.at=proc.time()
1455          cat("Finding group sizes from the positions (can be avoided to save RAM) ... ")
1456          flush.console()  # for windows
1457        }
1458        len__ = uniqlengths(f__, xnrow)
1459        # TO DO: combine uniqlist and uniquelengths into one call.  Or, just set len__ to NULL when dogroups infers that.
1460        if (verbose) { cat(timetaken(last.started.at),"\n"); flush.console() }
1461      }
1462    } else {
1463      f__=NULL
1464      len__=0L
1465      bysameorder=TRUE   # for test 724
1466    }
1467    # TO DO: allow secondary keys to be stored, then we see if our by matches one, if so use it, and no need to sort again. TO DO: document multiple keys.
1468  }
1469  if (length(xcols)) {
1470    #  TODO add: if (max(len__)==nrow) stop("There is no need to deep copy x in this case")
1471    #  TODO move down to dogroup.c, too.
1472    SDenv$.SDall = .Call(CsubsetDT, x, if (length(len__)) seq_len(max(len__)) else 0L, xcols)  # must be deep copy when largest group is a subset
1473    if (xdotcols) setattr(SDenv$.SDall, 'names', ansvars[xcolsAns]) # now that we allow 'x.' prefix in 'j', #2313 bug fix - [xcolsAns]
1474    SDenv$.SD = if (length(non_sdvars)) shallow(SDenv$.SDall, sdvars) else SDenv$.SDall
1475  }
1476  if (nrow(SDenv$.SDall)==0L) {
1477    setattr(SDenv$.SDall,"row.names",c(NA_integer_,0L))
1478    setattr(SDenv$.SD,"row.names",c(NA_integer_,0L))
1479  }
1480  # .set_row_names() basically other than not integer() for 0 length, otherwise dogroups has no [1] to modify to -.N
1481  .Call(C_lock, SDenv$.SD)  # stops := modifying .SD via j=f(.SD), bug#1727. The more common case of j=.SD[,subcol:=1] was already caught when jsub is inspected for :=.
1482  .Call(C_lock, SDenv$.SDall)
1483  lockBinding(".SD",SDenv)
1484  lockBinding(".SDall",SDenv)
1485  lockBinding(".N",SDenv)
1486  lockBinding(".GRP",SDenv)
1487  lockBinding(".iSD",SDenv)
1488
1489  SDenv$.NGRP = length(f__)
1490  lockBinding(".NGRP", SDenv)
1491
1492  GForce = FALSE
1493  if ( getOption("datatable.optimize")>=1L && (is.call(jsub) || (is.name(jsub) && jsub %chin% c(".SD", ".N"))) ) {  # Ability to turn off if problems or to benchmark the benefit
1494    # Optimization to reduce overhead of calling lapply over and over for each group
1495    oldjsub = jsub
1496    funi = 1L # Fix for #985
1497    # converted the lapply(.SD, ...) to a function and used below, easier to implement FR #2722 then.
1498    .massageSD = function(jsub) {
1499      txt = as.list(jsub)[-1L]
1500      if (length(names(txt))>1L) .Call(Csetcharvec, names(txt), 2L, "")  # fixes bug #110
1501      fun = txt[[2L]]
1502      if (fun %iscall% "function") {
1503        # Fix for #2381: added SDenv$.SD to 'eval' to take care of cases like: lapply(.SD, function(x) weighted.mean(x, bla)) where "bla" is a column in DT
1504        # http://stackoverflow.com/questions/13441868/data-table-and-stratified-means
1505        # adding this does not compromise in speed (that is, not any lesser than without SDenv$.SD)
1506        # replaced SDenv$.SD to SDenv to deal with Bug #87 reported by Ricardo (Nice catch!)
1507        thisfun = paste0("..FUN", funi) # Fix for #985
1508        assign(thisfun,eval(fun, SDenv, SDenv), SDenv)  # to avoid creating function() for each column of .SD
1509        lockBinding(thisfun,SDenv)
1510        txt[[1L]] = as.name(thisfun)
1511      } else {
1512        if (is.character(fun)) fun = as.name(fun)
1513        txt[[1L]] = fun
1514      }
1515      ans = vector("list", length(sdvars)+1L)
1516      ans[[1L]] = as.name("list")
1517      for (ii in seq_along(sdvars)) {
1518        txt[[2L]] = as.name(sdvars[ii])
1519        ans[[ii+1L]] = as.call(txt)
1520      }
1521      jsub = as.call(ans)  # important no names here
1522      jvnames = sdvars      # but here instead
1523      list(jsub, jvnames)
1524      # It may seem inefficient to construct a potentially long expression. But, consider calling
1525      # lapply 100000 times. The C code inside lapply does the LCONS stuff anyway, every time it
1526      # is called, involving small memory allocations.
1527      # The R level lapply calls as.list which needs a shallow copy.
1528      # lapply also does a setAttib of names (duplicating the same names over and over again
1529      # for each group) which is terrible for our needs. We replace all that with a
1530      # (ok, long, but not huge in memory terms) list() which is primitive (so avoids symbol
1531      # lookup), and the eval() inside dogroups hardly has to do anything. All this results in
1532      # overhead minimised. We don't need to worry about the env passed to the eval in a possible
1533      # lapply replacement, or how to pass ... efficiently to it.
1534      # Plus we optimize lapply first, so that mean() can be optimized too as well, next.
1535    }
1536    if (is.name(jsub)) {
1537      if (jsub == ".SD") {
1538        jsub = as.call(c(quote(list), lapply(sdvars, as.name)))
1539        jvnames = sdvars
1540      }
1541    } else if (length(as.character(jsub[[1L]])) == 1L) {  # Else expect problems with <jsub[[1L]] == >
1542      # g[[ only applies to atomic input, for now, was causing #4159. be sure to eval with enclos=parent.frame() for #4612
1543      subopt = length(jsub) == 3L &&
1544        (jsub[[1L]] == "[" ||
1545           (jsub[[1L]] == "[[" && is.name(jsub[[2L]]) && eval(call('is.atomic', jsub[[2L]]), x, parent.frame()))) &&
1546        (is.numeric(jsub[[3L]]) || jsub[[3L]] == ".N")
1547      headopt = jsub[[1L]] == "head" || jsub[[1L]] == "tail"
1548      firstopt = jsub[[1L]] == "first" || jsub[[1L]] == "last" # fix for #2030
1549      if ((length(jsub) >= 2L && jsub[[2L]] == ".SD") &&
1550          (subopt || headopt || firstopt)) {
1551        if (headopt && length(jsub)==2L) jsub[["n"]] = 6L # head-tail n=6 when missing #3462
1552        # optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet.
1553        jsub = as.call(c(quote(list), lapply(sdvars, function(x) { jsub[[2L]] = as.name(x); jsub })))
1554        jvnames = sdvars
1555      } else if (jsub[[1L]]=="lapply" && jsub[[2L]]==".SD" && length(xcols)) {
1556        deparse_ans = .massageSD(jsub)
1557        jsub = deparse_ans[[1L]]
1558        jvnames = deparse_ans[[2L]]
1559      } else if (jsub[[1L]] == "c" && length(jsub) > 1L) {
1560        # TODO, TO DO: raise the checks for 'jvnames' earlier (where jvnames is set by checking 'jsub') and set 'jvnames' already.
1561        # FR #2722 is just about optimisation of j=c(.N, lapply(.SD, .)) that is taken care of here.
1562        # FR #735 tries to optimise j-expressions of the form c(...) as long as ... contains
1563        # 1) lapply(.SD, ...), 2) simply .SD or .SD[..], 3) .N, 4) list(...) and 5) functions that normally return a single value*
1564        # On 5)* the IMPORTANT point to note is that things that are not wrapped within "list(...)" should *always*
1565        # return length 1 output for us to optimise. Else, there's no equivalent to optimising c(...) to list(...) AFAICT.
1566        # One issue could be that these functions (e.g., mean) can be "re-defined" by the OP to produce a length > 1 output
1567        # Of course this is worrying too much though. If the issue comes up, we'll just remove the relevant optimisations.
1568        # For now, we optimise all functions mentioned in 'optfuns' below.
1569        optfuns = c("max", "min", "mean", "length", "sum", "median", "sd", "var")
1570        is_valid = TRUE
1571        any_SD = FALSE
1572        jsubl = as.list.default(jsub)
1573        oldjvnames = jvnames
1574        jvnames = NULL           # TODO: not let jvnames grow, maybe use (number of lapply(.SD, .))*length(sdvars) + other jvars ?? not straightforward.
1575        # Fix for #744. Don't use 'i' in for-loops. It masks the 'i' from the input!!
1576        for (i_ in 2L:length(jsubl)) {
1577          this = jsub[[i_]]
1578          if (is.name(this)) {  # no need to check length(this)==1L; is.name() returns single TRUE or FALSE (documented); can't have a vector of names
1579            if (this == ".SD") { # optimise '.SD' alone
1580              any_SD = TRUE
1581              jsubl[[i_]] = lapply(sdvars, as.name)
1582              jvnames = c(jvnames, sdvars)
1583            } else if (this == ".N") {
1584              # don't optimise .I in c(.SD, .I), it's length can be > 1
1585              # only c(.SD, list(.I)) should be optimised!! .N is always length 1.
1586              jvnames = c(jvnames, gsub("^[.]([N])$", "\\1", this))
1587            } else {
1588              # jvnames = c(jvnames, if (is.null(names(jsubl))) "" else names(jsubl)[i_])
1589              is_valid=FALSE
1590              break
1591            }
1592          } else if (is.call(this)) {
1593            if (this[[1L]] == "lapply" && this[[2L]] == ".SD" && length(xcols)) {
1594              any_SD = TRUE
1595              deparse_ans = .massageSD(this)
1596              funi = funi + 1L # Fix for #985
1597              jsubl[[i_]] = as.list(deparse_ans[[1L]][-1L]) # just keep the '.' from list(.)
1598              jvnames = c(jvnames, deparse_ans[[2L]])
1599            } else if (this[[1L]] == "list") {
1600              # also handle c(lapply(.SD, sum), list()) - silly, yes, but can happen
1601              if (length(this) > 1L) {
1602                jl__ = as.list(jsubl[[i_]])[-1L] # just keep the '.' from list(.)
1603                jn__ = if (is.null(names(jl__))) rep("", length(jl__)) else names(jl__)
1604                idx  = unlist(lapply(jl__, function(x) is.name(x) && x == ".I"))
1605                if (any(idx)) jn__[idx & (jn__ == "")] = "I"
1606                jvnames = c(jvnames, jn__)
1607                jsubl[[i_]] = jl__
1608              }
1609            } else if (this %iscall% optfuns && length(this)>1L) {
1610              jvnames = c(jvnames, if (is.null(names(jsubl))) "" else names(jsubl)[i_])
1611            } else if ( length(this) == 3L && (this[[1L]] == "[" || this[[1L]] == "head") &&
1612                    this[[2L]] == ".SD" && (is.numeric(this[[3L]]) || this[[3L]] == ".N") ) {
1613              # optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet.
1614              any_SD = TRUE
1615              jsubl[[i_]] = lapply(sdvars, function(x) { this[[2L]] = as.name(x); this })
1616              jvnames = c(jvnames, sdvars)
1617            } else if (any(all.vars(this) == ".SD")) {
1618              # TODO, TO DO: revisit complex cases (as illustrated below)
1619              # complex cases like DT[, c(.SD[x>1], .SD[J(.)], c(.SD), a + .SD, lapply(.SD, sum)), by=grp]
1620              # hard to optimise such cases (+ difficulty in counting exact columns and therefore names). revert back to no optimisation.
1621              is_valid=FALSE
1622              break
1623            } else { # just to be sure that any other case (I've overlooked) runs smoothly, without optimisation
1624              # TO DO, TODO: maybe a message/warning here so that we can catch the overlooked cases, if any?
1625              is_valid=FALSE
1626              break
1627            }
1628          } else {
1629            is_valid = FALSE
1630            break
1631          }
1632        }
1633        if (!is_valid || !any_SD) { # restore if c(...) doesn't contain lapply(.SD, ..) or if it's just invalid
1634          jvnames = oldjvnames           # reset jvnames
1635          jsub = oldjsub                 # reset jsub
1636          jsubl = as.list.default(jsubl) # reset jsubl
1637        } else {
1638          setattr(jsubl, 'names', NULL)
1639          jsub = as.call(unlist(jsubl, use.names=FALSE))
1640          jsub[[1L]] = quote(list)
1641        }
1642      }
1643    }
1644    if (verbose) {
1645      if (!identical(oldjsub, jsub))
1646        cat("lapply optimization changed j from '",deparse(oldjsub),"' to '",deparse(jsub,width.cutoff=200L, nlines=1L),"'\n",sep="")
1647      else
1648        cat("lapply optimization is on, j unchanged as '",deparse(jsub,width.cutoff=200L, nlines=1L),"'\n",sep="")
1649    }
1650    dotN = function(x) is.name(x) && x==".N" # For #334. TODO: Rprof() showed dotN() may be the culprit if iterated (#1470)?; avoid the == which converts each x to character?
1651    # FR #971, GForce kicks in on all subsets, no joins yet. Although joins could work with
1652    # nomatch=0L even now.. but not switching it on yet, will deal it separately.
1653    if (getOption("datatable.optimize")>=2L && !is.data.table(i) && !byjoin && length(f__) && !length(lhs)) {
1654      if (!length(ansvars) && !use.I) {
1655        GForce = FALSE
1656        if ( (is.name(jsub) && jsub==".N") || (jsub %iscall% 'list' && length(jsub)==2L && jsub[[2L]]==".N") ) {
1657          GForce = TRUE
1658          if (verbose) cat("GForce optimized j to '",deparse(jsub, width.cutoff=200L, nlines=1L),"'\n",sep="")
1659        }
1660      } else {
1661        # Apply GForce
1662        .gforce_ok = function(q) {
1663          if (dotN(q)) return(TRUE) # For #334
1664          # run GForce for simple f(x) calls and f(x, na.rm = TRUE)-like calls where x is a column of .SD
1665          # is.symbol() is for #1369, #1974 and #2949
1666          if (!(is.call(q) && is.symbol(q[[1L]]) && is.symbol(q[[2L]]) && (q1 <- q[[1L]]) %chin% gfuns)) return(FALSE)
1667          if (!(q2 <- q[[2L]]) %chin% names(SDenv$.SDall) && q2 != ".I") return(FALSE)  # 875
1668          if ((length(q)==2L || identical("na",substring(names(q)[3L], 1L, 2L))) && (!q1 %chin% c("head","tail"))) return(TRUE)
1669          # ... head-tail uses default value n=6 which as of now should not go gforce ^^
1670          # otherwise there must be three arguments, and only in two cases:
1671          #   1) head/tail(x, 1) or 2) x[n], n>0
1672          length(q)==3L && length(q3 <- q[[3L]])==1L && is.numeric(q3) &&
1673            ( (q1 %chin% c("head", "tail") && q3==1L) || ((q1 == "[" || (q1 == "[[" && eval(call('is.atomic', q[[2L]]), envir=x))) && q3>0L) )
1674        }
1675        if (jsub[[1L]]=="list") {
1676          GForce = TRUE
1677          for (ii in seq.int(from=2L, length.out=length(jsub)-1L)) {
1678            if (!.gforce_ok(jsub[[ii]])) {GForce = FALSE; break}
1679          }
1680        } else GForce = .gforce_ok(jsub)
1681        if (GForce) {
1682          if (jsub[[1L]]=="list")
1683            for (ii in seq_along(jsub)[-1L]) {
1684              if (dotN(jsub[[ii]])) next; # For #334
1685              jsub[[ii]][[1L]] = as.name(paste0("g", jsub[[ii]][[1L]]))
1686              if (length(jsub[[ii]])==3L) jsub[[ii]][[3L]] = eval(jsub[[ii]][[3L]], parent.frame())  # tests 1187.2 & 1187.4
1687            }
1688          else {
1689            jsub[[1L]] = as.name(paste0("g", jsub[[1L]]))
1690            if (length(jsub)==3L) jsub[[3L]] = eval(jsub[[3L]], parent.frame())   # tests 1187.3 & 1187.5
1691          }
1692          if (verbose) cat("GForce optimized j to '",deparse(jsub, width.cutoff=200L, nlines=1L),"'\n",sep="")
1693        } else if (verbose) cat("GForce is on, left j unchanged\n");
1694      }
1695    }
1696    if (!GForce && !is.name(jsub)) {
1697      # Still do the old speedup for mean, for now
1698      nomeanopt=FALSE  # to be set by .optmean() using <<- inside it
1699      oldjsub = jsub
1700      if (jsub[[1L]]=="list") {
1701        # Addressing #1369, #2949 and #1974. This used to be 30s (vs 0.5s) with 30K elements items in j, #1470. Could have been dotN() and/or the for-looped if()
1702        # jsub[[1]]=="list" so the first item of todo will always be FALSE
1703        todo = sapply(jsub, `%iscall%`, 'mean')
1704        if (any(todo)) {
1705          w = which(todo)
1706          jsub[w] = lapply(jsub[w], .optmean)
1707        }
1708      } else if (jsub[[1L]]=="mean") {
1709        jsub = .optmean(jsub)
1710      }
1711      if (nomeanopt) {
1712        warning("Unable to optimize call to mean() and could be very slow. You must name 'na.rm' like that otherwise if you do mean(x,TRUE) the TRUE is taken to mean 'trim' which is the 2nd argument of mean. 'trim' is not yet optimized.",immediate.=TRUE)
1713      }
1714      if (verbose) {
1715        if (!identical(oldjsub, jsub))
1716          cat("Old mean optimization changed j from '",deparse(oldjsub),"' to '",deparse(jsub, width.cutoff=200L, nlines=1L),"'\n",sep="")
1717        else
1718          cat("Old mean optimization is on, left j unchanged.\n")
1719      }
1720      assign("Cfastmean", Cfastmean, SDenv)
1721      # Old comments still here for now ...
1722      # Here in case nomeanopt=TRUE or some calls to mean weren't detected somehow. Better but still slow.
1723      # Maybe change to :
1724      #     assign("mean", fastmean, SDenv)  # neater than the hard work above, but slower
1725      # when fastmean can do trim.
1726    }
1727  } else if (verbose) {
1728    if (getOption("datatable.optimize")<1L) cat("All optimizations are turned off\n")
1729    else cat("Optimization is on but left j unchanged (single plain symbol): '",deparse(jsub, width.cutoff=200L, nlines=1L),"'\n",sep="")
1730  }
1731  if (byjoin) {
1732    groups = i
1733    grpcols = leftcols # 'leftcols' are the columns in i involved in the join (either head of key(i) or head along i)
1734    jiscols = chmatch(jisvars, names_i)  # integer() if there are no jisvars (usually there aren't, advanced feature)
1735    xjiscols = chmatch(xjisvars, names_x)
1736    SDenv$.xSD = x[min(nrow(i), 1L), xjisvars, with=FALSE]
1737    if (!missing(on)) o__ = xo else o__ = integer(0L)
1738  } else {
1739    groups = byval
1740    grpcols = seq_along(byval)
1741    jiscols = NULL   # NULL rather than integer() is used in C to know when using by
1742    xjiscols = NULL
1743  }
1744  lockBinding(".xSD", SDenv)
1745  grporder = o__
1746  # for #971, added !GForce. if (GForce) we do it much more (memory) efficiently than subset of order vector below.
1747  if (length(irows) && !isTRUE(irows) && !GForce) {
1748    # any zeros in irows were removed by convertNegAndZeroIdx earlier above; no need to check for zeros again. Test 1058-1061 check case #2758.
1749    if (length(o__) && length(irows)!=length(o__)) stop("Internal error: length(irows)!=length(o__)") # nocov
1750    o__ = if (length(o__)) irows[o__]  # better do this once up front (even though another alloc) than deep repeated branch in dogroups.c
1751          else irows
1752  } # else grporder is left bound to same o__ memory (no cost of copy)
1753  if (is.null(lhs)) cols=NULL
1754  if (!length(f__)) {
1755    # for consistency of empty case in test 184
1756    f__=len__=0L
1757  }
1758  if (verbose) {last.started.at=proc.time();cat("Making each group and running j (GForce ",GForce,") ... ",sep="");flush.console()}
1759  if (GForce) {
1760    thisEnv = new.env()  # not parent=parent.frame() so that gsum is found
1761    for (ii in ansvars) assign(ii, x[[ii]], thisEnv)
1762    assign(".N", len__, thisEnv) # For #334
1763    #fix for #1683
1764    if (use.I) assign(".I", seq_len(nrow(x)), thisEnv)
1765    ans = gforce(thisEnv, jsub, o__, f__, len__, irows) # irows needed for #971.
1766    gi = if (length(o__)) o__[f__] else f__
1767    g = lapply(grpcols, function(i) groups[[i]][gi])
1768    ans = c(g, ans)
1769  } else {
1770    ans = .Call(Cdogroups, x, xcols, groups, grpcols, jiscols, xjiscols, grporder, o__, f__, len__, jsub, SDenv, cols, newnames, !missing(on), verbose)
1771  }
1772  # unlock any locked data.table components of the answer, #4159
1773  # MAX_DEPTH prevents possible infinite recursion from truly recursive object, #4173
1774  #   TODO: is there an efficient way to get around this MAX_DEPTH limit?
1775  MAX_DEPTH = 5L
1776  runlock = function(x, current_depth = 1L) {
1777    if (is.list(x) && current_depth <= MAX_DEPTH) {  # is.list() used to be is.recursive(), #4814
1778      if (inherits(x, 'data.table')) .Call(C_unlock, x)
1779      else return(lapply(x, runlock, current_depth = current_depth + 1L))
1780    }
1781    return(invisible())
1782  }
1783  runlock(ans)
1784  if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()}
1785  # TO DO: xrows would be a better name for irows: irows means the rows of x that i joins to
1786  # Grouping by i: icols the joins columns (might not need), isdcols (the non join i and used by j), all __ are length x
1787  # Grouping by by: i is by val, icols NULL, o__ may be subset of x, f__ points to o__ (or x if !length o__)
1788  # TO DO: setkey could mark the key whether it is unique or not.
1789  if (!is.null(lhs)) {
1790    if (any(names_x[cols] %chin% key(x)))
1791      setkey(x,NULL)
1792    # fixes #1479. Take care of secondary indices, TODO: cleaner way of doing this
1793    attrs = attr(x, 'index', exact=TRUE)
1794    skeys = names(attributes(attrs))
1795    if (!is.null(skeys)) {
1796      hits  = unlist(lapply(paste0("__", names_x[cols]), function(x) grep(x, skeys, fixed = TRUE)))
1797      hits  = skeys[unique(hits)]
1798      for (i in seq_along(hits)) setattr(attrs, hits[i], NULL) # does by reference
1799    }
1800    if (keyby) {
1801      cnames = as.character(bysubl)[-1L]
1802      cnames = gsub('^`|`$', '', cnames)  # the wrapping backticks that were added above can be removed now, #3378
1803      if (all(cnames %chin% names_x)) {
1804        if (verbose) {last.started.at=proc.time();cat("setkey() after the := with keyby= ... ");flush.console()}
1805        setkeyv(x,cnames)  # TO DO: setkey before grouping to get memcpy benefit.
1806        if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()}
1807      }
1808      else warning("The setkey() normally performed by keyby= has been skipped (as if by= was used) because := is being used together with keyby= but the keyby= contains some expressions. To avoid this warning, use by= instead, or provide existing column names to keyby=.\n")
1809    }
1810    return(suppPrint(x))
1811  }
1812  if (is.null(ans)) {
1813    ans = as.data.table.list(lapply(groups,"[",0L))  # side-effects only such as test 168
1814    setnames(ans,seq_along(bynames),bynames)   # TO DO: why doesn't groups have bynames in the first place?
1815    return(ans)
1816  }
1817  setattr(ans,"row.names",.set_row_names(length(ans[[1L]])))
1818  setattr(ans,"class",class(x)) # fix for #64
1819  if (is.null(names(ans))) {
1820    # Efficiency gain of dropping names has been successful. Ordinarily this will run.
1821    if (is.null(jvnames)) jvnames = character(length(ans)-length(bynames))
1822    if (length(bynames)+length(jvnames)!=length(ans))
1823      stop("Internal error: jvnames is length ",length(jvnames), " but ans is ",length(ans)," and bynames is ", length(bynames)) # nocov
1824    ww = which(jvnames=="")
1825    if (any(ww)) jvnames[ww] = paste0("V",ww)
1826    setattr(ans, "names", c(bynames, jvnames))
1827  } else {
1828    setnames(ans,seq_along(bynames),bynames)   # TO DO: reinvestigate bynames flowing from dogroups here and simplify
1829  }
1830  if (byjoin && keyby && !bysameorder) {
1831    if (verbose) {last.started.at=proc.time();cat("setkey() afterwards for keyby=.EACHI ... ");flush.console()}
1832    setkeyv(ans,names(ans)[seq_along(byval)])
1833    if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()}
1834  } else if (keyby || (haskey(x) && bysameorder && (byjoin || (length(allbyvars) && identical(allbyvars,head(key(x),length(allbyvars))))))) {
1835    setattr(ans,"sorted",names(ans)[seq_along(grpcols)])
1836  }
1837  setalloccol(ans)   # TODO: overallocate in dogroups in the first place and remove this line
1838}
1839
1840.optmean = function(expr) {   # called by optimization of j inside [.data.table only. Outside for a small speed advantage.
1841  if (length(expr)==2L)  # no parameters passed to mean, so defaults of trim=0 and na.rm=FALSE
1842    return(call(".External",quote(Cfastmean),expr[[2L]], FALSE))
1843    # return(call(".Internal",expr))  # slightly faster than .External, but R now blocks .Internal in coerce.c from apx Sep 2012
1844  if (length(expr)==3L && identical("na",substring(names(expr)[3L], 1L, 2L)))   # one parameter passed to mean()
1845    return(call(".External",quote(Cfastmean),expr[[2L]], expr[[3L]]))  # faster than .Call
1846  assign("nomeanopt",TRUE,parent.frame())
1847  expr  # e.g. trim is not optimized, just na.rm
1848}
1849
1850#  [[.data.frame is now dispatched due to inheritance.
1851#  The code below tried to avoid that but made things
1852#  very slow (462 times faster down to 1 in the timings test).
1853#  TO DO. Reintroduce velow but dispatch straight to
1854#  .C("do_subset2") or better. Tests 604-608 test
1855#  that this doesn't regress.
1856
1857#"[[.data.table" = function(x,...) {
1858#    if (!cedta()) return(`[[.data.frame`(x,...))
1859#    .subset2(x,...)
1860#    #class(x)=NULL  # awful, copy
1861#    #x[[...]]
1862#}
1863
1864#"[[<-.data.table" = function(x,i,j,value) {
1865#    if (!cedta()) return(`[[<-.data.frame`(x,i,j,value))
1866#    if (!missing(j)) stop("[[i,j]] assignment not available in data.table, put assignment(s) in [i,{...}] instead, more powerful")
1867#    cl = oldClass(x)  # [[<-.data.frame uses oldClass rather than class, don't know why but we'll follow suit
1868#    class(x) = NULL
1869#    x[[i]] = value
1870#    class(x) = cl
1871#    x
1872#}
1873
1874as.matrix.data.table = function(x, rownames=NULL, rownames.value=NULL, ...) {
1875  # rownames = the rownames column (most common usage)
1876  if (!is.null(rownames)) {
1877    if (!is.null(rownames.value)) stop("rownames and rownames.value cannot both be used at the same time")
1878    if (length(rownames)>1L) {
1879      # TODO in future as warned in NEWS for 1.11.6:
1880      #   warning("length(rownames)>1 is deprecated. Please use rownames.value= instead")
1881      if (length(rownames)!=nrow(x))
1882        stop("length(rownames)==", length(rownames), " but nrow(DT)==", nrow(x),
1883             ". The rownames argument specifies a single column name or number. Consider rownames.value= instead.")
1884      rownames.value = rownames
1885      rownames = NULL
1886    } else if (length(rownames)==0L) {
1887      stop("length(rownames)==0 but should be a single column name or number, or NULL")
1888    } else {
1889      if (isTRUE(rownames)) {
1890        if (length(key(x))>1L) {
1891          warning("rownames is TRUE but key has multiple columns ",
1892                  brackify(key(x)), "; taking first column x[,1] as rownames")
1893        }
1894        rownames = if (length(key(x))==1L) chmatch(key(x),names(x)) else 1L
1895      }
1896      else if (is.logical(rownames) || is.na(rownames)) {
1897        # FALSE, NA, NA_character_ all mean the same as NULL
1898        rownames = NULL
1899      }
1900      else if (is.character(rownames)) {
1901        w = chmatch(rownames, names(x))
1902        if (is.na(w)) stop("'", rownames, "' is not a column of x")
1903        rownames = w
1904      }
1905      else { # rownames is a column number already
1906        rownames = as.integer(rownames)
1907        if (is.na(rownames) || rownames<1L || rownames>ncol(x))
1908          stop("as.integer(rownames)==", rownames,
1909               " which is outside the column number range [1,ncol=", ncol(x), "].")
1910      }
1911    }
1912  } else if (!is.null(rownames.value)) {
1913    if (length(rownames.value)!=nrow(x))
1914      stop("length(rownames.value)==", length(rownames.value),
1915           " but should be nrow(x)==", nrow(x))
1916  }
1917  if (!is.null(rownames)) {
1918    # extract that column and drop it.
1919    rownames.value = x[[rownames]]
1920    dm = dim(x) - 0:1
1921    cn = names(x)[-rownames]
1922    X = x[, .SD, .SDcols = cn]
1923  } else {
1924    dm = dim(x)
1925    cn = names(x)
1926    X = x
1927  }
1928  p = dm[2L]
1929  n = dm[1L]
1930  collabs = as.list(cn)
1931  class(X) = NULL
1932  non.numeric = non.atomic = FALSE
1933  all.logical = TRUE
1934  for (j in seq_len(p)) {
1935    if (is.ff(X[[j]])) X[[j]] = X[[j]][]   # nocov to bring the ff into memory, since we need to create a matrix in memory
1936    xj = X[[j]]
1937    if (length(dj <- dim(xj)) == 2L && dj[2L] > 1L) {
1938      if (inherits(xj, "data.table"))
1939        xj = X[[j]] = as.matrix(X[[j]])
1940      dnj = dimnames(xj)[[2L]]
1941      collabs[[j]] = paste(collabs[[j]], if (length(dnj) >
1942        0L)
1943        dnj
1944      else seq_len(dj[2L]), sep = ".")
1945    }
1946    if (!is.logical(xj))
1947      all.logical = FALSE
1948    if (length(levels(xj)) > 0L || !(is.numeric(xj) || is.complex(xj) || is.logical(xj)) ||
1949        (!is.null(cl <- attr(xj, "class", exact=TRUE)) && any(cl %chin%
1950        c("Date", "POSIXct", "POSIXlt"))))
1951      non.numeric = TRUE
1952    if (!is.atomic(xj))
1953      non.atomic = TRUE
1954  }
1955  if (non.atomic) {
1956    for (j in seq_len(p)) {
1957      xj = X[[j]]
1958      if (is.recursive(xj)) { }
1959      else X[[j]] = as.list(as.vector(xj))
1960    }
1961  }
1962  else if (all.logical) { }
1963  else if (non.numeric) {
1964    for (j in seq_len(p)) {
1965      if (is.character(X[[j]])) next
1966      xj = X[[j]]
1967      miss = is.na(xj)
1968      xj = if (length(levels(xj))) as.vector(xj) else format(xj)
1969      is.na(xj) = miss
1970      X[[j]] = xj
1971    }
1972  }
1973  X = unlist(X, recursive = FALSE, use.names = FALSE)
1974  if (any(dm==0L)) {
1975    # retain highest type of input for empty output, #4762
1976    if (length(X)!=0L)
1977      stop("Internal error: as.matrix.data.table length(X)==", length(X), " but a dimension is zero")  # nocov
1978    return(array(if (is.null(X)) NA else X, dim = dm, dimnames = list(rownames.value, cn)))
1979  }
1980  dim(X) <- c(n, length(X)/n)
1981  dimnames(X) <- list(rownames.value, unlist(collabs, use.names = FALSE))
1982  X
1983}
1984
1985# bug #2375. fixed. same as head.data.frame and tail.data.frame to deal with negative indices
1986head.data.table = function(x, n=6L, ...) {
1987  if (!cedta()) return(NextMethod()) # nocov
1988  stopifnot(length(n) == 1L)
1989  i = seq_len(if (n<0L) max(nrow(x)+n, 0L) else min(n,nrow(x)))
1990  x[i, , ]
1991}
1992tail.data.table = function(x, n=6L, ...) {
1993  if (!cedta()) return(NextMethod()) # nocov
1994  stopifnot(length(n) == 1L)
1995  n = if (n<0L) max(nrow(x) + n, 0L) else min(n, nrow(x))
1996  i = seq.int(to=nrow(x), length.out=n)
1997  x[i]
1998}
1999
2000"[<-.data.table" = function (x, i, j, value) {
2001  # [<- is provided for consistency, but := is preferred as it allows by group and by reference to subsets of columns
2002  # with no copy of the (very large, say 10GB) columns at all. := is like an UPDATE in SQL and we like and want two symbols to change.
2003  if (!cedta()) {
2004    x = if (nargs()<4L) `[<-.data.frame`(x, i, value=value)
2005        else `[<-.data.frame`(x, i, j, value)
2006    return(setalloccol(x))    # over-allocate (again).   Avoid all this by using :=.
2007  }
2008  # TO DO: warning("Please use DT[i,j:=value] syntax instead of DT[i,j]<-value, for efficiency. See ?':='")
2009  if (!missing(i)) {
2010    isub=substitute(i)
2011    i = eval(.massagei(isub), x, parent.frame())
2012    if (is.matrix(i)) {
2013      if (!missing(j)) stop("When i is a matrix in DT[i]<-value syntax, it doesn't make sense to provide j")
2014      x = `[<-.data.frame`(x, i, value=value)
2015      return(setalloccol(x))
2016    }
2017    i = x[i, which=TRUE]
2018    # Tried adding ... after value above, and passing ... in here (e.g. for mult="first") but R CMD check
2019    # then gives "The argument of a replacement function which corresponds to the right hand side must be
2020    # named 'value'".  So, users have to use := for that.
2021  } else i = NULL          # meaning (to C code) all rows, without allocating 1L:nrow(x) vector
2022  if (missing(j)) j=names(x)
2023  if (!is.atomic(j)) stop("j must be an atomic vector, see ?is.atomic")
2024  if (anyNA(j)) stop("NA in j")
2025  if (is.character(j)) {
2026    newnames = setdiff(j,names(x))
2027    cols = as.integer(chmatch(j, c(names(x),newnames)))
2028    # We can now mix existing columns and new columns
2029  } else {
2030    if (!is.numeric(j)) stop("j must be vector of column name or positions")
2031    if (any(j>ncol(x))) stop("Attempt to assign to column position greater than ncol(x). Create the column by name, instead. This logic intends to catch (most likely) user errors.")
2032    cols = as.integer(j)  # for convenience e.g. to convert 1 to 1L
2033    newnames = NULL
2034  }
2035  reinstatekey=NULL
2036  if (haskey(x) && identical(key(x),key(value)) &&
2037    identical(names(x),names(value)) &&
2038    is.sorted(i) &&
2039    identical(substitute(x),quote(`*tmp*`))) {
2040    # DT["a",]$y = 1.1  winds up creating `*tmp*` subset of rows and assigning _all_ the columns into x and
2041    # over-writing the key columns with the same value (not just the single 'y' column).
2042    # That isn't good for speed; it's an R thing. Solution is to use := instead to avoid all this, but user
2043    # expects key to be retained in this case because _he_ didn't assign to a key column (the internal base R
2044    # code did).
2045    reinstatekey=key(x)
2046  }
2047  if (!selfrefok(x) || truelength(x) < ncol(x)+length(newnames)) {
2048    x = setalloccol(x, length(x)+length(newnames)) # because [<- copies via *tmp* and main/duplicate.c copies at length but copies truelength over too
2049    # search for one other .Call to assign in [.data.table to see how it differs
2050  }
2051  x = .Call(Cassign,copy(x),i,cols,newnames,value) # From 3.1.0, DF[2,"b"] = 7 no longer copies DF$a (so in this [<-.data.table method we need to copy)
2052  setalloccol(x)  #  can maybe avoid this realloc, but this is (slow) [<- anyway, so just be safe.
2053  if (length(reinstatekey)) setkeyv(x,reinstatekey)
2054  invisible(x)
2055  # no copy at all if user calls directly; i.e. `[<-.data.table`(x,i,j,value)
2056  # or uses data.table := syntax; i.e. DT[i,j:=value]
2057  # but, there is one copy by R in [<- dispatch to `*tmp*`; i.e. DT[i,j]=value. *Update: not from R > 3.0.2, yay*
2058  # That copy is via main/duplicate.c which preserves truelength but copies length amount. Hence setalloccol(x,length(x)).
2059  # No warn passed to assign here because we know it'll be copied via *tmp*.
2060  # := allows subassign to a column with no copy of the column at all,  and by group, etc.
2061}
2062
2063"$<-.data.table" = function(x, name, value) {
2064  if (!cedta()) {
2065    ans = `$<-.data.frame`(x, name, value)
2066    return(setalloccol(ans))           # over-allocate (again)
2067  }
2068  x = copy(x)
2069  set(x,j=name,value=value)  # important i is missing here
2070}
2071
2072as.data.frame.data.table = function(x, ...)
2073{
2074  ans = copy(x)
2075  setattr(ans,"row.names",.set_row_names(nrow(x)))   # since R 2.4.0, data.frames can have non-character row names
2076  setattr(ans,"class","data.frame")
2077  setattr(ans,"sorted",NULL)  # remove so if you convert to df, do something, and convert back, it is not sorted
2078  setattr(ans,".internal.selfref",NULL)
2079  # leave tl intact, no harm,
2080  ans
2081}
2082
2083as.list.data.table = function(x, ...) {
2084  # Similar to as.list.data.frame in base. Although a data.table/frame is a list, too, it may be
2085  # being coerced to raw list type (by calling code) so that "[" and "[[" work in their raw list form,
2086  # such as lapply does for data.frame. So we do have to remove the class attributes (and thus shallow
2087  # copy is almost instant way to achieve that, without risking compatibility).
2088  #if (sys.call(-2L)[[1L]]=="lapply")
2089  #    return(x)
2090  ans = shallow(x)
2091  setattr(ans, "class", NULL)
2092  setattr(ans, "row.names", NULL)
2093  setattr(ans, "sorted", NULL)
2094  setattr(ans,".internal.selfref", NULL)   # needed to pass S4 tests for example
2095  ans
2096}
2097
2098
2099dimnames.data.table = function(x) {
2100  if (!cedta()) {
2101    if (!inherits(x, "data.frame"))
2102      stop("data.table inherits from data.frame (from v1.5), but this data.table does not. Has it been created manually (e.g. by using 'structure' rather than 'data.table') or saved to disk using a prior version of data.table?")
2103    return(`dimnames.data.frame`(x))
2104  }
2105  list(NULL, names(x))
2106}
2107
2108"dimnames<-.data.table" = function (x, value)   # so that can do  colnames(dt)=<..>  as well as names(dt)=<..>
2109{
2110  if (!cedta()) return(`dimnames<-.data.frame`(x,value))  # nocov ; will drop key but names<-.data.table (below) is more common usage and does retain the key
2111  if (!is.list(value) || length(value) != 2L) stop("attempting to assign invalid object to dimnames of a data.table")
2112  if (!is.null(value[[1L]])) stop("data.tables do not have rownames")
2113  if (ncol(x) != length(value[[2L]])) stop("Can't assign ", length(value[[2L]]), " colnames to a ", ncol(x), "-column data.table")
2114  setnames(x,as.character(value[[2L]]))
2115  x  # this returned value is now shallow copied by R 3.1.0 via *tmp*. A very welcome change.
2116}
2117
2118"names<-.data.table" = function(x,value)
2119{
2120  # When non data.table aware packages change names, we'd like to maintain the key.
2121  # If call is names(DT)[2]="newname", R will call this names<-.data.table function (notice no i) with 'value' already prepared to be same length as ncol
2122  x = shallow(x) # `names<-` should not modify by reference. Related to #1015, #476 and #825. Needed for R v3.1.0+.  TO DO: revisit
2123  if (is.null(value))
2124    setattr(x,"names",NULL)   # e.g. plyr::melt() calls base::unname()
2125  else
2126    setnames(x,value)
2127  x   # this returned value is now shallow copied by R 3.1.0 via *tmp*. A very welcome change.
2128}
2129
2130within.data.table = function (data, expr, ...)
2131# basically within.list but retains key (if any)
2132# will be slower than using := or a regular query (see ?within for further info).
2133{
2134  if (!cedta()) return(NextMethod()) # nocov
2135  parent = parent.frame()
2136  e = evalq(environment(), data, parent)
2137  eval(substitute(expr), e)  # might (and it's known that some user code does) contain rm()
2138  l = as.list(e)
2139  l = l[!vapply_1b(l, is.null)]
2140  nD = length(del <- setdiff(names(data), (nl <- names(l))))
2141  ans = copy(data)
2142  if (length(nl)) ans[,nl] = l
2143  if (nD) ans[,del] = NULL
2144  if (haskey(data) && all(key(data) %chin% names(ans))) {
2145    x = TRUE
2146    for (i in key(data)) {
2147      x = identical(data[[i]],ans[[i]])
2148      if (!x) break
2149    }
2150    if (x) setattr(ans,"sorted",key(data))
2151  }
2152  ans
2153}
2154
2155transform.data.table = function (`_data`, ...)
2156# basically transform.data.frame with data.table instead of data.frame, and retains key
2157{
2158  if (!cedta()) return(NextMethod()) # nocov
2159  e = eval(substitute(list(...)), `_data`, parent.frame())
2160  tags = names(e)
2161  inx = chmatch(tags, names(`_data`))
2162  matched = !is.na(inx)
2163  if (any(matched)) {
2164    .Call(C_unlock, `_data`) # fix for #1641, now covered by test 104.2
2165    `_data`[,inx[matched]] = e[matched]
2166    `_data` = as.data.table(`_data`)
2167  }
2168  if (!all(matched)) {
2169    ans = do.call("data.table", c(list(`_data`), e[!matched]))
2170  } else {
2171    ans = `_data`
2172  }
2173  key.cols = key(`_data`)
2174  if (!any(tags %chin% key.cols)) {
2175    setattr(ans, "sorted", key.cols)
2176  }
2177  ans
2178}
2179
2180subset.data.table = function (x, subset, select, ...)
2181{
2182  key.cols = key(x)
2183
2184  if (missing(subset)) {
2185    r = TRUE
2186  } else {
2187    e = substitute(subset)
2188    r = eval(e, x, parent.frame())
2189    if (!is.logical(r))
2190      stop("'subset' must evaluate to logical")
2191    r = r & !is.na(r)
2192  }
2193
2194  if (missing(select)) {
2195    vars = seq_len(ncol(x))
2196  } else {
2197    nl = as.list(seq_len(ncol(x)))
2198    setattr(nl,"names",names(x))
2199    vars = eval(substitute(select), nl, parent.frame())  # e.g.  select=colF:colP
2200    # #891 fix - don't convert numeric vars to column names - will break when there are duplicate columns
2201    key.cols = intersect(key.cols, names(x)[vars]) ## Only keep key.columns found in the select clause
2202  }
2203
2204  ans = x[r, vars, with = FALSE]
2205
2206  if (nrow(ans) > 0L) {
2207    if (!missing(select) && length(key.cols)) {
2208      ## Set the key on the returned data.table as long as the key
2209      ## columns that "remain" are the same as the original, or a
2210      ## prefix of it.
2211      is.prefix = all(key(x)[seq_len(length(key.cols))] == key.cols)
2212      if (is.prefix) {
2213        setattr(ans, "sorted", key.cols)
2214      }
2215    }
2216  } else {
2217    setkey(ans,NULL)
2218  }
2219  ans
2220}
2221
2222# Equivalent of 'rowSums(is.na(dt) > 0L)' but much faster and memory efficient.
2223# Also called "complete.cases" in base. Unfortunately it's not a S3 generic.
2224# Also handles bit64::integer64. TODO: export this?
2225# For internal use only. 'by' requires integer input. No argument checks here yet.
2226is_na = function(x, by=seq_along(x)) .Call(Cdt_na, x, by)
2227any_na = function(x, by=seq_along(x)) .Call(CanyNA, x, by)
2228
2229na.omit.data.table = function (object, cols = seq_along(object), invert = FALSE, ...) {
2230  # compare to stats:::na.omit.data.frame
2231  if (!cedta()) return(NextMethod()) # nocov
2232  if ( !missing(invert) && is.na(as.logical(invert)) )
2233    stop("Argument 'invert' must be logical TRUE/FALSE")
2234  cols = colnamesInt(object, cols, check_dups=FALSE)
2235  ix = .Call(Cdt_na, object, cols)
2236  # forgot about invert with no NA case, #2660
2237  if (invert) {
2238    if (all(ix))
2239      object
2240    else
2241      .Call(CsubsetDT, object, which_(ix, bool = TRUE), seq_along(object))
2242  } else {
2243    if (any(ix))
2244      .Call(CsubsetDT, object, which_(ix, bool = FALSE), seq_along(object))
2245    else
2246      object
2247  }
2248}
2249
2250which_ = function(x, bool = TRUE) {
2251  # fix for #1467, quotes result in "not resolved in current namespace" error
2252  .Call(Cwhichwrapper, x, bool)
2253}
2254
2255is.na.data.table = function (x) {
2256  if (!cedta()) return(`is.na.data.frame`(x))
2257  do.call("cbind", lapply(x, "is.na"))
2258}
2259
2260# not longer needed as inherits ...
2261#    t.data.table = t.data.frame
2262#    Math.data.table = Math.data.frame
2263#    summary.data.table = summary.data.frame
2264
2265Ops.data.table = function(e1, e2 = NULL)
2266{
2267  ans = NextMethod()
2268  if (cedta() && is.data.frame(ans)) ans = as.data.table(ans)
2269  else if (is.matrix(ans)) colnames(ans) = copy(colnames(ans))
2270  ans
2271}
2272
2273split.data.table = function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TRUE, flatten = TRUE, ..., verbose = getOption("datatable.verbose")) {
2274  if (!is.data.table(x)) stop("x argument must be a data.table")
2275  stopifnot(is.logical(drop), is.logical(sorted), is.logical(keep.by),  is.logical(flatten))
2276  # split data.frame way, using `f` and not `by` argument
2277  if (!missing(f)) {
2278    if (!length(f) && nrow(x))
2279      stop("group length is 0 but data nrow > 0")
2280    if (!missing(by))
2281      stop("passing 'f' argument together with 'by' is not allowed, use 'by' when split by column in data.table and 'f' when split by external factor")
2282    # same as split.data.frame - handling all exceptions, factor orders etc, in a single stream of processing was a nightmare in factor and drop consistency
2283    return(lapply(split(x = seq_len(nrow(x)), f = f, drop = drop, ...), function(ind) x[ind]))
2284  }
2285  if (missing(by)) stop("Either 'by' or 'f' argument must be supplied")
2286  # check reserved column names during processing
2287  if (".ll.tech.split" %chin% names(x)) stop("Column '.ll.tech.split' is reserved for split.data.table processing")
2288  if (".nm.tech.split" %chin% by) stop("Column '.nm.tech.split' is reserved for split.data.table processing")
2289  if (!all(by %chin% names(x))) stop("Argument 'by' must refer to column names in x")
2290  if (!all(by.atomic <- vapply_1b(by, function(.by) is.atomic(x[[.by]])))) stop("Argument 'by' must refer only to atomic-type columns, but the following columns are non-atomic: ", brackify(by[!by.atomic]))
2291  # list of data.tables (flatten) or list of lists of ... data.tables
2292  make.levels = function(x, cols, sorted) {
2293    by.order = if (!sorted) x[, funique(.SD), .SDcols=cols] # remember order of data, only when not sorted=FALSE
2294    ul = lapply(setNames(cols, nm=cols), function(col) {
2295      if (!is.factor(x[[col]])) unique(x[[col]]) else {
2296      .x_lev = levels(x[[col]])
2297      #need to keep as a factor or order will be lost, #2082
2298      factor(.x_lev, levels = .x_lev)
2299      }
2300    })
2301    r = do.call("CJ", c(ul, sorted=sorted, unique=TRUE))
2302    if (!sorted && nrow(by.order)) {
2303      ii = r[by.order, on=cols, which=TRUE]
2304      r = rbindlist(list(
2305        r[ii], # original order from data
2306        r[-ii] # empty levels at the end
2307      ))
2308    }
2309    r
2310  }
2311  .by = by[1L]
2312  # this builds data.table call - is much more cleaner than handling each case one by one
2313  dtq = as.list(call("[", as.name("x")))
2314  join = FALSE
2315  flatten_any = flatten && any(vapply_1b(by, function(col) is.factor(x[[col]])))
2316  nested_current = !flatten && is.factor(x[[.by]])
2317  if (!drop && (flatten_any || nested_current)) {
2318    # create 'levs' here to avoid lexical scoping glitches, see #3151
2319    levs = make.levels(x=x, cols=if (flatten) by else .by, sorted=sorted)
2320    dtq[["i"]] = quote(levs)
2321    join = TRUE
2322  }
2323  dtq[["j"]] = substitute(
2324    list(.ll.tech.split=list(.expr)),
2325    list(.expr = if (join) quote(if(.N == 0L) .SD[0L] else .SD) else as.name(".SD")) # simplify when `nomatch` accept NULL #857 ?
2326  )
2327  by.or.keyby = if (join) "by" else c("by"[!sorted], "keyby"[sorted])[1L]
2328  dtq[[by.or.keyby]] = substitute( # retain order, for `join` and `sorted` it will use order of `i` data.table instead of `keyby`.
2329    .expr,
2330    list(.expr = if(join) {as.name(".EACHI")} else if (flatten) by else .by)
2331  )
2332  dtq[[".SDcols"]] = if (keep.by) names(x) else setdiff(names(x), if (flatten) by else .by)
2333  if (join) dtq[["on"]] = if (flatten) by else .by
2334  dtq = as.call(dtq)
2335  if (isTRUE(verbose)) cat("Processing split.data.table with: ", deparse(dtq, width.cutoff=500L), "\n", sep="")
2336  tmp = eval(dtq)
2337  # add names on list
2338  setattr(ll <- tmp$.ll.tech.split,
2339      "names",
2340      as.character(
2341        if (!flatten) tmp[[.by]] else tmp[, list(.nm.tech.split=paste(unlist(lapply(.SD, as.character)), collapse = ".")), by=by, .SDcols=by]$.nm.tech.split
2342      ))
2343  # handle nested split
2344  if (flatten || length(by) == 1L) {
2345    for (x in ll) .Call(C_unlock, x)
2346    lapply(ll, setDT)
2347    # alloc.col could handle DT in list as done in: c9c4ff80bdd4c600b0c4eff23b207d53677176bd
2348  } else if (length(by) > 1L) {
2349    lapply(ll, split.data.table, drop=drop, by=by[-1L], sorted=sorted, keep.by=keep.by, flatten=flatten)
2350  }
2351}
2352
2353# TO DO, add more warnings e.g. for by.data.table(), telling user what the data.table syntax is but letting them dispatch to data.frame if they want
2354
2355copy = function(x) {
2356  newx = .Call(Ccopy,x)  # copies at length but R's duplicate() also copies truelength over.
2357                         # TO DO: inside Ccopy it could reset tl to 0 or length, but no matter as selfrefok detects it
2358                         # TO DO: revisit duplicate.c in R 3.0.3 and see where it's at
2359
2360  reallocate = function(y) {
2361    if (is.data.table(y)) {
2362      .Call(C_unlock, y)
2363      setalloccol(y)
2364    } else if (is.list(y)) {
2365      oldClass = class(y)
2366      setattr(y, 'class', NULL)  # otherwise [[.person method (which returns itself) results in infinite recursion, #4620
2367      y[] = lapply(y, reallocate)
2368      if (!identical(oldClass, 'list')) setattr(y, 'class', oldClass)
2369    }
2370    y
2371  }
2372
2373  reallocate(newx)
2374}
2375
2376.shallow = function(x, cols = NULL, retain.key = FALSE, unlock = FALSE) {
2377  wasnull = is.null(cols)
2378  cols = colnamesInt(x, cols, check_dups=FALSE)
2379  ans = .Call(Cshallowwrapper, x, cols)  # copies VECSXP only
2380
2381  if(retain.key){
2382    if (wasnull) return(ans) # handle most frequent case first
2383    ## get correct key if cols are present
2384    cols = names(x)[cols]
2385    keylength = which.first(!key(ans) %chin% cols) - 1L
2386    if (is.na(keylength)) keylength <- length(key(ans))
2387    if (!keylength) {
2388      setattr(ans, "sorted", NULL) ## no key remaining
2389    } else {
2390      setattr(ans, "sorted", head(key(ans), keylength)) ## keep what can be kept
2391    }
2392    ## take care of attributes.
2393    indices = names(attributes(attr(ans, "index", exact=TRUE)))
2394    for(index in indices) {
2395      indexcols = strsplit(index, split = "__")[[1L]][-1L]
2396      indexlength = which.first(!indexcols %chin% cols) - 1L
2397      if (is.na(indexlength)) next ## all columns are present, nothing to be done
2398      reducedindex = paste0("__", indexcols[seq_len(indexlength)], collapse="") ## the columns until the first missing from the new index
2399      if (reducedindex %chin% indices || !indexlength) {
2400        ## Either reduced index already present or no columns of the original index remain.
2401        ## Drop the original index completely
2402        setattr(attr(ans, "index", exact=TRUE), index, NULL)
2403      } else if(length(attr(attr(ans, "index", exact=TRUE), index, exact=TRUE))) {
2404        ## index is not length 0. Drop it since shortening could lead to spurious reordering in discarded columns (#2336)
2405        setattr(attr(ans, "index", exact=TRUE), index, NULL)
2406      } else {
2407        ## rename index to reducedindex
2408        names(attributes(attr(ans, "index")))[names(attributes(attr(ans, "index", exact=TRUE))) == index] = reducedindex
2409      }
2410    }
2411  } else { # retain.key == FALSE
2412    setattr(ans, "sorted", NULL)
2413    setattr(ans, "index", NULL)
2414  }
2415  if (unlock) .Call(C_unlock, ans)
2416  ans
2417}
2418
2419shallow = function(x, cols=NULL) {
2420  if (!is.data.table(x))
2421    stop("x is not a data.table. Shallow copy is a copy of the vector of column pointers (only), so is only meaningful for data.table")
2422  ans = .shallow(x, cols=cols, retain.key = TRUE)
2423  ans
2424}
2425
2426setalloccol = alloc.col = function(DT, n=getOption("datatable.alloccol"), verbose=getOption("datatable.verbose"))
2427{
2428  name = substitute(DT)
2429  if (identical(name, quote(`*tmp*`))) stop("setalloccol attempting to modify `*tmp*`")
2430  ans = .Call(Calloccolwrapper, DT, eval(n), verbose)
2431  if (is.name(name)) {
2432    name = as.character(name)
2433    assign(name,ans,parent.frame(),inherits=TRUE)
2434  }
2435  ans
2436}
2437
2438selfrefok = function(DT,verbose=getOption("datatable.verbose")) {
2439  .Call(Cselfrefokwrapper,DT,verbose)
2440}
2441
2442truelength = function(x) .Call(Ctruelength,x)
2443# deliberately no "truelength<-" method.  setalloccol is the mechanism for that.
2444# settruelength() no longer need (and so removed) now that data.table depends on R 2.14.0
2445# which initializes tl to zero rather than leaving uninitialized.
2446
2447setattr = function(x,name,value) {
2448  # Wrapper for setAttrib internal R function
2449  # Sets attribute by reference (no copy)
2450  # Named setattr (rather than setattrib) at R level to more closely resemble attr<-
2451  # And as from 1.7.8 is made exported in NAMESPACE for use in user attributes.
2452  # User can also call `attr<-` function directly, but that copies (maybe just when NAMED>0, which is always for data.frame, I think).  See "Confused by NAMED" thread on r-devel 24 Nov 2011.
2453  # We tend to use setattr() internally in data.table.R because often we construct a data.table and it hasn't
2454  # got names yet. setnames() is the user interface which checks integrity and doesn't let you drop names for example.
2455  if (name=="names" && is.data.table(x) && length(attr(x, "names", exact=TRUE)) && !is.null(value))
2456    setnames(x,value)
2457    # Using setnames here so that truelength of names can be retained, to carry out integrity checks such as not
2458    # creating names longer than the number of columns of x, and to change the key, too
2459    # For convenience so that setattr(DT,"names",allnames) works as expected without requiring a switch to setnames.
2460  else {
2461    ans = .Call(Csetattrib, x, name, value)
2462    # If name=="names" and this is the first time names are assigned (e.g. in data.table()), this will be grown by setalloccol very shortly afterwards in the caller.
2463    if (!is.null(ans)) {
2464      warning("Input is a length=1 logical that points to the same address as R's global value. Therefore the attribute has not been set by reference, rather on a copy. You will need to assign the result back to a variable. See issue #1281.")
2465      x = ans
2466    }
2467  }
2468  # fix for #1142 - duplicated levels for factors
2469  if (name == "levels" && is.factor(x) && anyDuplicated(value))
2470    .Call(Csetlevels, x, (value <- as.character(value)), unique(value))
2471  invisible(x)
2472}
2473
2474setnames = function(x,old,new,skip_absent=FALSE) {
2475  # Sets by reference, maintains truelength, no copy of table at all.
2476  # But also more convenient than names(DT)[i]="newname"  because we can also do setnames(DT,"oldname","newname")
2477  # without an onerous match() ourselves. old can be positions, too, but we encourage by name for robustness.
2478  # duplicates are permitted to be created without warning; e.g. in revdeps and for example, and setting spacer columns all with ""
2479  if (!is.data.frame(x)) stop("x is not a data.table or data.frame")
2480  ncol = length(x)
2481  if (length(names(x)) != ncol) stop("x has ",ncol," columns but its names are length ",length(names(x)))
2482  stopifnot(isTRUEorFALSE(skip_absent))
2483  if (missing(new) || missing(old)) {
2484    # usage: setnames(DT, new = letters[1:n])
2485    if (missing(old)) { old = new; new = NULL }
2486    # for setnames(DT,new); e.g., setnames(DT,c("A","B")) where ncol(DT)==2
2487    if (is.function(old)) old = old(names(x))
2488    if (!is.character(old)) stop("Passed a vector of type '",typeof(old),"'. Needs to be type 'character'.")
2489    if (length(old) != ncol) stop("Can't assign ",length(old)," names to a ",ncol," column data.table")
2490    if (anyNA(names(x))) {
2491      # if x somehow has some NA names, which() needs help to return them, #2475
2492      w = which((names(x) != old) | (Encoding(names(x)) != Encoding(old)) | (is.na(names(x)) & !is.na(old)))
2493    } else {
2494      w = which(names(x) != old | (Encoding(names(x)) != Encoding(old)))
2495    }
2496    if (!length(w)) return(invisible(x))  # no changes
2497    new = old[w]
2498    i = w
2499  } else {
2500    if (is.function(new)) new = if (is.numeric(old)) new(names(x)[old]) else new(old)
2501    if (!is.character(new)) stop("'new' is not a character vector or a function")
2502    #  if (anyDuplicated(new)) warning("Some duplicates exist in 'new': ", brackify(new[duplicated(new)]))  # dups allowed without warning; warn if and when the dup causes an ambiguity
2503    if (anyNA(new)) stop("NA in 'new' at positions ", brackify(which(is.na(new))))
2504    if (anyDuplicated(old)) stop("Some duplicates exist in 'old': ", brackify(old[duplicated(old)]))
2505    if (is.numeric(old)) i = old = seq_along(x)[old]  # leave it to standard R to manipulate bounds and negative numbers
2506    else if (!is.character(old)) stop("'old' is type ",typeof(old)," but should be integer, double or character")
2507    if (length(new)!=length(old)) stop("'old' is length ",length(old)," but 'new' is length ",length(new))
2508    if (anyNA(old)) stop("NA (or out of bounds) in 'old' at positions ", brackify(which(is.na(old))))
2509    if (is.character(old)) {
2510      i = chmatchdup(c(old,old), names(x))  # chmatchdup returns the second of any duplicates matched to in names(x) (if any)
2511      if (!all(tt<-is.na(tail(i,length(old))))) warning("Item ",w<-which.first(!tt)," of 'old' is '", old[w],"' which appears several times in column names. Just the first will be changed. There are ", sum(!tt)-1L," other items in old that are also duplicated in column names.")
2512      i = head(i,length(old))
2513      if (anyNA(i)) {
2514        if (isTRUE(skip_absent)) {
2515          w = !is.na(i)
2516          new = new[w]
2517          i = i[w]
2518        } else {
2519          stop("Items of 'old' not found in column names: ", brackify(old[is.na(i)]), ". Consider skip_absent=TRUE.")
2520        }
2521      }
2522    }
2523    if (any(w <- new==names(x)[i] & Encoding(new)==Encoding(names(x)[i]))) {
2524      w = which(!w)
2525      new = new[w]
2526      i = i[w]
2527    }
2528    if (!length(new)) return(invisible(x)) # no changes
2529    if (length(i) != length(new)) stop("Internal error: length(i)!=length(new)") # nocov
2530  }
2531  # update the key if the column name being change is in the key
2532  m = chmatch(names(x)[i], key(x))
2533  w = which(!is.na(m))
2534  if (length(w))
2535    .Call(Csetcharvec, attr(x, "sorted", exact=TRUE), m[w], new[w])
2536
2537  # update secondary keys
2538  idx = attr(x, "index", exact=TRUE)
2539  for (k in names(attributes(idx))) {
2540    tt = strsplit(k,split="__")[[1L]][-1L]
2541    m = chmatch(names(x)[i], tt)
2542    w = which(!is.na(m))
2543    if (length(w)) {
2544      tt[m[w]] = new[w]
2545      newk = paste0("__",tt,collapse="")
2546      setattr(idx, newk, attr(idx, k, exact=TRUE))
2547      setattr(idx, k, NULL)
2548    }
2549  }
2550
2551  .Call(Csetcharvec, attr(x, "names", exact=TRUE), as.integer(i), new)
2552  invisible(x)
2553}
2554
2555setcolorder = function(x, neworder=key(x))
2556{
2557  if (is.character(neworder) && anyDuplicated(names(x)))
2558    stop("x has some duplicated column name(s): ", paste(names(x)[duplicated(names(x))], collapse=","), ". Please remove or rename the duplicate(s) and try again.")
2559  # if (!is.data.table(x)) stop("x is not a data.table")
2560  neworder = colnamesInt(x, neworder, check_dups=FALSE)  # dups are now checked inside Csetcolorder below
2561  if (length(neworder) != length(x)) {
2562    #if shorter than length(x), pad by the missing
2563    #  elements (checks below will catch other mistakes)
2564    neworder = c(neworder, setdiff(seq_along(x), neworder))
2565  }
2566  .Call(Csetcolorder, x, neworder)
2567  invisible(x)
2568}
2569
2570set = function(x,i=NULL,j,value)  # low overhead, loopable
2571{
2572  .Call(Cassign,x,i,j,NULL,value)
2573  invisible(x)
2574}
2575
2576chmatch = function(x, table, nomatch=NA_integer_)
2577  .Call(Cchmatch, x, table, as.integer(nomatch[1L])) # [1L] to fix #1672
2578
2579# chmatchdup() behaves like 'pmatch' but only the 'exact' matching part; i.e. a value in
2580# 'x' is matched to 'table' only once. No index will be present more than once. For example:
2581# chmatchdup(c("a", "a"), c("a", "a")) # 1,2 - the second 'a' in 'x' has a 2nd match in 'table'
2582# chmatchdup(c("a", "a"), c("a", "b")) # 1,NA - the second one doesn't 'see' the first 'a'
2583# chmatchdup(c("a", "a"), c("a", "a.1")) # 1,NA - this is where it differs from pmatch - we don't need the partial match.
2584chmatchdup = function(x, table, nomatch=NA_integer_)
2585  .Call(Cchmatchdup, x, table, as.integer(nomatch[1L]))
2586
2587"%chin%" = function(x, table)
2588  .Call(Cchin, x, table)  # TO DO  if table has 'ul' then match to that
2589
2590chorder = function(x) {
2591  o = forderv(x, sort=TRUE, retGrp=FALSE)
2592  if (length(o)) o else seq_along(x)
2593}
2594
2595chgroup = function(x) {
2596  # TO DO: deprecate and remove this. It's exported but doubt anyone uses it. Think the plan was to use it internally, but forderv superceded.
2597  o = forderv(x, sort=FALSE, retGrp=TRUE)
2598  if (length(o)) as.vector(o) else seq_along(x)  # as.vector removes the attributes
2599}
2600
2601# plain rbind and cbind methods are registered using S3method() in NAMESPACE only from R>=4.0.0; #3948
2602rbind.data.table = function(..., use.names=TRUE, fill=FALSE, idcol=NULL) {
2603  l = lapply(list(...), function(x) if (is.list(x)) x else as.data.table(x))  #1626; e.g. psych binds a data.frame|table with a matrix
2604  rbindlist(l, use.names, fill, idcol)
2605}
2606cbind.data.table = data.table
2607.rbind.data.table = rbind.data.table  # the workaround using this in FAQ 2.24 is still applied to support R < 4.0.0
2608
2609rbindlist = function(l, use.names="check", fill=FALSE, idcol=NULL) {
2610  if (is.null(l)) return(null.data.table())
2611  if (!is.list(l) || is.data.frame(l)) stop("Input is ", class(l)[1L]," but should be a plain list of items to be stacked")
2612  if (isFALSE(idcol)) { idcol = NULL }
2613  else if (!is.null(idcol)) {
2614    if (isTRUE(idcol)) idcol = ".id"
2615    if (!is.character(idcol)) stop("idcol must be a logical or character vector of length 1. If logical TRUE the id column will named '.id'.")
2616    idcol = idcol[1L]
2617  }
2618  miss = missing(use.names)
2619  # more checking of use.names happens at C level; this is just minimal to massage 'check' to NA
2620  if (identical(use.names, NA)) stop("use.names=NA invalid")  # otherwise use.names=NA could creep in an usage equivalent to use.names='check'
2621  if (identical(use.names,"check")) {
2622    if (!miss) stop("use.names='check' cannot be used explicitly because the value 'check' is new in v1.12.2 and subject to change. It is just meant to convey default behavior. See ?rbindlist.")
2623    use.names = NA
2624  }
2625  ans = .Call(Crbindlist, l, use.names, fill, idcol)
2626  if (!length(ans)) return(null.data.table())
2627  setDT(ans)[]
2628}
2629
2630vecseq = function(x,y,clamp) .Call(Cvecseq,x,y,clamp)
2631
2632# .Call(Caddress, x) increments NAM() when x is vector with NAM(1). Referring object within non-primitive function is enough to increment reference.
2633address = function(x) .Call(Caddress, eval(substitute(x), parent.frame()))
2634
2635":=" = function(...) {
2636  # this error is detected when eval'ing isub and replaced with a more helpful one when using := in i due to forgetting a comma, #4227
2637  stop('Check that is.data.table(DT) == TRUE. Otherwise, := and `:=`(...) are defined for use in j, once only and in particular ways. See help(":=").')
2638}
2639
2640setDF = function(x, rownames=NULL) {
2641  if (!is.list(x)) stop("setDF only accepts data.table, data.frame or list of equal length as input")
2642  if (anyDuplicated(rownames)) stop("rownames contains duplicates")
2643  if (is.data.table(x)) {
2644    # copied from as.data.frame.data.table
2645    if (is.null(rownames)) {
2646      rn = .set_row_names(nrow(x))
2647    } else {
2648      if (length(rownames) != nrow(x))
2649        stop("rownames incorrect length; expected ", nrow(x), " names, got ", length(rownames))
2650      rn = rownames
2651    }
2652    setattr(x, "row.names", rn)
2653    setattr(x, "class", "data.frame")
2654    setattr(x, "sorted", NULL)
2655    setattr(x, ".internal.selfref", NULL)
2656  } else if (is.data.frame(x)) {
2657    if (!is.null(rownames)) {
2658      if (length(rownames) != nrow(x))
2659        stop("rownames incorrect length; expected ", nrow(x), " names, got ", length(rownames))
2660      setattr(x, "row.names", rownames)
2661    }
2662    x
2663  } else {
2664    n = vapply_1i(x, length)
2665    mn = max(n)
2666    if (any(n<mn))
2667      stop("All elements in argument 'x' to 'setDF' must be of same length")
2668    xn = names(x)
2669    if (is.null(xn)) {
2670      setattr(x, "names", paste0("V",seq_len(length(x))))
2671    } else {
2672      idx = xn %chin% ""
2673      if (any(idx)) {
2674        xn[idx] = paste0("V", seq_along(which(idx)))
2675        setattr(x, "names", xn)
2676      }
2677    }
2678    if (is.null(rownames)) {
2679      rn = .set_row_names(mn)
2680    } else {
2681      if (length(rownames) != mn)
2682      stop("rownames incorrect length; expected ", mn, " names, got ", length(rownames))
2683      rn = rownames
2684    }
2685    setattr(x,"row.names", rn)
2686    setattr(x,"class","data.frame")
2687  }
2688  invisible(x)
2689}
2690
2691setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) {
2692  name = substitute(x)
2693  if (is.name(name)) {
2694    home = function(x, env) {
2695      if (identical(env, emptyenv()))
2696        stop("Cannot find symbol ", cname, call. = FALSE)
2697      else if (exists(x, env, inherits=FALSE)) env
2698      else home(x, parent.env(env))
2699    }
2700    cname = as.character(name)
2701    envir = home(cname, parent.frame())
2702    if (bindingIsLocked(cname, envir)) {
2703      stop("Cannot convert '", cname, "' to data.table by reference because binding is locked. It is very likely that '", cname, "' resides within a package (or an environment) that is locked to prevent modifying its variable bindings. Try copying the object to your current environment, ex: var <- copy(var) and then using setDT again.")
2704    }
2705  }
2706  # check no matrix-like columns, #3760. Other than a single list(matrix) is unambiguous and depended on by some revdeps, #3581
2707  if (length(x)>1L) {
2708    idx = vapply_1i(x, function(xi) length(dim(xi)))>1L
2709    if (any(idx))
2710      warning("Some columns are a multi-column type (such as a matrix column): ", brackify(which(idx)),". setDT will retain these columns as-is but subsequent operations like grouping and joining may fail. Please consider as.data.table() instead which will create a new column for each embedded column.")
2711  }
2712  if (is.data.table(x)) {
2713    # fix for #1078 and #1128, see .resetclass() for explanation.
2714    setattr(x, 'class', .resetclass(x, 'data.table'))
2715    if (!missing(key)) setkeyv(x, key) # fix for #1169
2716    if (check.names) setattr(x, "names", make.names(names(x), unique=TRUE))
2717    if (selfrefok(x) > 0L) return(invisible(x)) else setalloccol(x)
2718  } else if (is.data.frame(x)) {
2719    rn = if (!identical(keep.rownames, FALSE)) rownames(x) else NULL
2720    setattr(x, "row.names", .set_row_names(nrow(x)))
2721    if (check.names) setattr(x, "names", make.names(names(x), unique=TRUE))
2722    # fix for #1078 and #1128, see .resetclass() for explanation.
2723    setattr(x, "class", .resetclass(x, 'data.frame'))
2724    setalloccol(x)
2725    if (!is.null(rn)) {
2726      nm = c(if (is.character(keep.rownames)) keep.rownames[1L] else "rn", names(x))
2727      x[, (nm[1L]) := rn]
2728      setcolorder(x, nm)
2729    }
2730  } else if (is.list(x) && length(x)==1L && is.matrix(x[[1L]])) {
2731    # a single list(matrix) is unambiguous and depended on by some revdeps, #3581
2732    x = as.data.table.matrix(x[[1L]])
2733  } else if (is.null(x) || (is.list(x) && !length(x))) {
2734    x = null.data.table()
2735  } else if (is.list(x)) {
2736    # copied from as.data.table.list - except removed the copy
2737    for (i in seq_along(x)) {
2738      if (is.null(x[[i]])) next   # allow NULL columns to be created by setDT(list) even though they are not really allowed
2739                                  # many operations still work in the presence of NULL columns and it might be convenient
2740                                  # e.g. in package eplusr which calls setDT on a list when parsing JSON. Operations which
2741                                  # fail for NULL columns will give helpful error at that point, #3480 and #3471
2742      if (inherits(x[[i]], "POSIXlt")) stop("Column ", i, " is of POSIXlt type. Please convert it to POSIXct using as.POSIXct and run setDT again. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.")
2743    }
2744    n = vapply_1i(x, length)
2745    n_range = range(n)
2746    if (n_range[1L] != n_range[2L]) {
2747      tbl = sort(table(n))
2748      stop("All elements in argument 'x' to 'setDT' must be of same length, but the profile of input lengths (length:frequency) is: ",
2749           brackify(sprintf('%s:%d', names(tbl), tbl)), "\nThe first entry with fewer than ", n_range[2L], " entries is ", which.max(n<n_range[2L]))
2750    }
2751    xn = names(x)
2752    if (is.null(xn)) {
2753      setattr(x, "names", paste0("V",seq_len(length(x))))
2754    } else {
2755      idx = xn %chin% "" # names can be NA - test 1006 caught that!
2756      if (any(idx)) {
2757        xn[idx] = paste0("V", seq_along(which(idx)))
2758        setattr(x, "names", xn)
2759      }
2760      if (check.names) setattr(x, "names", make.names(xn, unique=TRUE))
2761    }
2762    setattr(x,"row.names",.set_row_names(n_range[2L]))
2763    setattr(x,"class",c("data.table","data.frame"))
2764    setalloccol(x)
2765  } else {
2766    stop("Argument 'x' to 'setDT' should be a 'list', 'data.frame' or 'data.table'")
2767  }
2768  if (!is.null(key)) setkeyv(x, key)
2769  if (is.name(name)) {
2770    name = as.character(name)
2771    assign(name, x, parent.frame(), inherits=TRUE)
2772  } else if (name %iscall% c('$', '[[') && is.name(name[[2L]])) {
2773    # common case is call from 'lapply()'
2774    k = eval(name[[2L]], parent.frame(), parent.frame())
2775    if (is.list(k)) {
2776      origj = j = if (name[[1L]] == "$") as.character(name[[3L]]) else eval(name[[3L]], parent.frame(), parent.frame())
2777      if (length(j) == 1L) {
2778        if (is.character(j)) {
2779          j = match(j, names(k))
2780          if (is.na(j))
2781            stop("Item '", origj, "' not found in names of input list")
2782        }
2783      }
2784      .Call(Csetlistelt,k,as.integer(j), x)
2785    } else if (is.environment(k) && exists(as.character(name[[3L]]), k)) {
2786      assign(as.character(name[[3L]]), x, k, inherits=FALSE)
2787    }
2788  }
2789  .Call(CexpandAltRep, x)  # issue#2866 and PR#2882
2790  invisible(x)
2791}
2792
2793as_list = function(x) {
2794  lx = vector("list", 1L)
2795  .Call(Csetlistelt, lx, 1L, x)
2796  lx
2797}
2798
2799# FR #1353
2800rowid = function(..., prefix=NULL) {
2801  rowidv(list(...), prefix=prefix)
2802}
2803
2804rowidv = function(x, cols=seq_along(x), prefix=NULL) {
2805  if (!is.null(prefix) && (!is.character(prefix) || length(prefix) != 1L))
2806    stop("'prefix' must be NULL or a character vector of length 1.")
2807  if (is.atomic(x)) {
2808    if (!missing(cols) && !is.null(cols))
2809      stop("x is a single vector, non-NULL 'cols' doesn't make sense.")
2810    cols = 1L
2811    x = as_list(x)
2812  } else if (!length(cols)) {
2813    stop("x is a list, 'cols' cannot be 0-length.")
2814  }
2815  xorder = forderv(x, by=cols, sort=FALSE, retGrp=TRUE) # speedup on char with sort=FALSE
2816  xstart = attr(xorder, 'starts', exact=TRUE)
2817  if (!length(xorder)) xorder = seq_along(x[[1L]])
2818  ids = .Call(Cfrank, xorder, xstart, uniqlengths(xstart, length(xorder)), "sequence")
2819  if (!is.null(prefix))
2820    ids = paste0(prefix, ids)
2821  ids
2822}
2823
2824# FR #686
2825rleid = function(..., prefix=NULL) {
2826  rleidv(list(...), prefix=prefix)
2827}
2828
2829rleidv = function(x, cols=seq_along(x), prefix=NULL) {
2830  if (!is.null(prefix) && (!is.character(prefix) || length(prefix) != 1L))
2831    stop("'prefix' must be NULL or a character vector of length 1.")
2832  if (is.atomic(x)) {
2833    if (!missing(cols) && !is.null(cols))
2834      stop("x is a single vector, non-NULL 'cols' doesn't make sense.")
2835    cols = 1L
2836    x = as_list(x)
2837  } else if (!length(cols)) {
2838    stop("x is a list, 'cols' cannot be 0-length.")
2839  }
2840  cols = colnamesInt(x, cols, check_dups=FALSE)
2841  ids = .Call(Crleid, x, cols)
2842  if (!is.null(prefix)) ids = paste0(prefix, ids)
2843  ids
2844}
2845
2846# GForce functions
2847#   to add a new function to GForce (from the R side -- the easy part!):
2848#     (1) add it to gfuns
2849#     (2) edit .gforce_ok (defined within `[`) to catch which j will apply the new function
2850#     (3) define the gfun = function() R wrapper
2851gfuns = c("[", "[[", "head", "tail", "first", "last", "sum", "mean", "prod",
2852          "median", "min", "max", "var", "sd", ".N") # added .N for #334
2853`g[` = `g[[` = function(x, n) .Call(Cgnthvalue, x, as.integer(n)) # n is of length=1 here.
2854ghead = function(x, n) .Call(Cghead, x, as.integer(n)) # n is not used at the moment
2855gtail = function(x, n) .Call(Cgtail, x, as.integer(n)) # n is not used at the moment
2856gfirst = function(x) .Call(Cgfirst, x)
2857glast = function(x) .Call(Cglast, x)
2858gsum = function(x, na.rm=FALSE) .Call(Cgsum, x, na.rm, TRUE)  # warnOverflow=TRUE, #986
2859gmean = function(x, na.rm=FALSE) .Call(Cgmean, x, na.rm)
2860gprod = function(x, na.rm=FALSE) .Call(Cgprod, x, na.rm)
2861gmedian = function(x, na.rm=FALSE) .Call(Cgmedian, x, na.rm)
2862gmin = function(x, na.rm=FALSE) .Call(Cgmin, x, na.rm)
2863gmax = function(x, na.rm=FALSE) .Call(Cgmax, x, na.rm)
2864gvar = function(x, na.rm=FALSE) .Call(Cgvar, x, na.rm)
2865gsd = function(x, na.rm=FALSE) .Call(Cgsd, x, na.rm)
2866gforce = function(env, jsub, o, f, l, rows) .Call(Cgforce, env, jsub, o, f, l, rows)
2867
2868isReallyReal = function(x) {
2869  .Call(CisReallyReal, x)
2870}
2871
2872.prepareFastSubset = function(isub, x, enclos, notjoin, verbose = FALSE){
2873  ## helper that decides, whether a fast binary search can be performed, if i is a call
2874  ## For details on the supported queries, see \code{\link{datatable-optimize}}
2875  ## Additional restrictions are imposed if x is .SD, or if options indicate that no optimization
2876  ## is to be performed
2877  #' @param isub the substituted i
2878  #' @param x the data.table
2879  #' @param enclos The environment where to evaluate when RHS is not a column of x
2880  #' @param notjoin boolean that is set before, indicating whether i started with '!'.
2881  #' @param verbose TRUE for detailed output
2882  #' @return If i is not fast subsettable, NULL. Else, a list with entries:
2883  #'        out$i: a data.table that will be used as i with proper column names and key.
2884  #'        out$on: the correct 'on' statement that will be used for x[i, on =...]
2885  #'        out$notjoin Bool. In some cases, notjoin is updated within the function.
2886  #'        ATTENTION: If nothing else helps, an auto-index is created on x unless options prevent this.
2887  if(getOption("datatable.optimize") < 3L) return(NULL) ## at least level three optimization required.
2888  if (!is.call(isub)) return(NULL)
2889  if (.Call(C_islocked, x)) return(NULL)  # fix for #958, don't create auto index on '.SD'.
2890  ## a list of all possible operators with their translations into the 'on' clause
2891  validOps = list(op = c("==", "%in%", "%chin%"),
2892                   on = c("==", "==",   "=="))
2893
2894  ## Determine, whether the nature of isub in general supports fast binary search
2895  remainingIsub = isub
2896  i = list()
2897  on = character(0L)
2898  nonEqui = FALSE
2899  while(length(remainingIsub)){
2900    if(is.call(remainingIsub)){
2901      if (length(remainingIsub[[1L]]) != 1L) return(NULL) ## only single symbol, either '&' or one of validOps allowed.
2902      if (remainingIsub[[1L]] != "&"){ ## only a single expression present or a different connection.
2903        stub = remainingIsub
2904        remainingIsub = NULL ## there is no remainder to be evaluated after stub.
2905      } else {
2906        ## multiple expressions with & connection.
2907        if (notjoin) return(NULL) ## expressions of type DT[!(a==1 & b==2)] currently not supported
2908        stub = remainingIsub[[3L]] ## the single column expression like col == 4
2909        remainingIsub = remainingIsub[[2L]] ## the potentially longer expression with potential additional '&'
2910      }
2911    } else { ## single symbol present
2912      stub = remainingIsub
2913      remainingIsub = NULL
2914    }
2915    ## check the stub if it is fastSubsettable
2916    if(is.symbol(stub)){
2917      ## something like DT[x & y]. If x and y are logical columns, we can optimize.
2918      col = as.character(stub)
2919      if(!col %chin% names(x)) return(NULL)
2920      if(!is.logical(x[[col]])) return(NULL)
2921      ## redirect to normal DT[x == TRUE]
2922      stub = call("==", as.symbol(col), TRUE)
2923    }
2924    if (length(stub[[1L]]) != 1) return(NULL) # nocov Whatever it is, definitely not one of the valid operators
2925    operator = as.character(stub[[1L]])
2926    if (!operator %chin% validOps$op) return(NULL) ## operator not supported
2927    if (!is.name(stub[[2L]])) return(NULL)
2928    col = as.character(stub[[2L]])
2929    if (!col %chin% names(x)) return(NULL) ## any non-column name prevents fast subsetting
2930    if(col %chin% names(i)) return(NULL) ## repeated appearance of the same column not supported (e.g. DT[x < 3 & x < 5])
2931    ## now check the RHS of stub
2932    RHS = eval(stub[[3L]], x, enclos)
2933    if (is.list(RHS)) RHS = as.character(RHS)  # fix for #961
2934    if (length(RHS) != 1L && !operator %chin% c("%in%", "%chin%")){
2935      if (length(RHS) != nrow(x)) stop(gettextf("RHS of %s is length %d which is not 1 or nrow (%d). For robustness, no recycling is allowed (other than of length 1 RHS). Consider %%in%% instead.", operator, length(RHS), nrow(x), domain="R-data.table"), domain=NA)
2936      return(NULL) # DT[colA == colB] regular element-wise vector scan
2937    }
2938    if ( mode(x[[col]]) != mode(RHS) ||                # mode() so that doubleLHS/integerRHS and integerLHS/doubleRHS!isReallyReal are optimized (both sides mode 'numeric')
2939         is.factor(x[[col]])+is.factor(RHS) == 1L ||   # but factor is also mode 'numeric' so treat that separately
2940         is.integer(x[[col]]) && isReallyReal(RHS) ) { # and if RHS contains fractions then don't optimize that as bmerge truncates the fractions to match to the target integer type
2941      # re-direct non-matching type cases to base R, as data.table's binary
2942      # search based join is strict in types. #957, #961 and #1361
2943      # the mode() checks also deals with NULL since mode(NULL)=="NULL" and causes this return, as one CRAN package (eplusr 0.9.1) relies on
2944      return(NULL)
2945    }
2946    if (!operator %chin% c("%in%", "%chin%")) {
2947      # additional requirements for notjoin and NA values. Behaviour is different for %in%, %chin% compared to other operators
2948      # RHS is of length=1 or n
2949      if (any_na(as_list(RHS))) {
2950        ## dt[x == NA] or dt[x <= NA] will always return empty
2951        notjoin = FALSE
2952        RHS = RHS[0L]
2953      } else if (notjoin) {
2954        ## dt[!x == 3] must not return rows where x is NA
2955        RHS = c(RHS, if (is.double(RHS) && is.double(x[[col]])) c(NA, NaN) else NA)
2956      }
2957    }
2958    ## if it passed until here, fast subset can be done for this stub
2959    i = c(i, setNames(list(RHS), col))
2960    on = c(on, setNames(paste0(col, validOps$on[validOps$op == operator], col), col))
2961    ## loop continues with remainingIsub
2962  }
2963  if (length(i) == 0L) stop("Internal error in .isFastSubsettable. Please report to data.table developers") # nocov
2964  ## convert i to data.table with all combinations in rows.
2965  if(length(i) > 1L && prod(vapply_1i(i, length)) > 1e4){
2966    ## CJ would result in more than 1e4 rows. This would be inefficient, especially memory-wise #2635
2967    if (verbose) {cat("Subsetting optimization disabled because the cross-product of RHS values exceeds 1e4, causing memory problems.\n");flush.console()}
2968    return(NULL)
2969  }
2970  ## Care is needed with names as we construct i
2971  ## with 'CJ' and 'do.call' and this would cause problems if colNames were 'sorted' or 'unique'
2972  ## as these two would be interpreted as args for CJ
2973  colNames = names(i)
2974  names(i) = NULL
2975  i$sorted = FALSE
2976  i$unique = TRUE
2977  i = do.call(CJ, i)
2978  setnames(i, colNames)
2979  idx = NULL
2980  if(is.null(idx)){
2981      ## check whether key fits the columns in i.
2982      ## order of key columns makes no difference, as long as they are all upfront in the key, I believe.
2983      if (all(names(i) %chin% head(key(x), length(i)))){
2984          if (verbose) {cat("Optimized subsetting with key '", paste0( head(key(x), length(i)), collapse = ", "),"'\n",sep="");flush.console()}
2985          idx = integer(0L) ## integer(0L) not NULL! Indicates that x is ordered correctly.
2986          idxCols = head(key(x), length(i)) ## in correct order!
2987      }
2988  }
2989  if (is.null(idx)){
2990    if (!getOption("datatable.use.index")) return(NULL) # #1422
2991    ## check whether an existing index can be used
2992    ## An index can be used if it corresponds exactly to the columns in i (similar to the key above)
2993    candidates = indices(x, vectors = TRUE)
2994    idx = NULL
2995    for (cand in candidates){
2996      if (all(names(i) %chin% cand) && length(cand) == length(i)){
2997        idx = attr(attr(x, "index", exact=TRUE), paste0("__", cand, collapse = ""), exact = TRUE)
2998        idxCols = cand
2999        break
3000      }
3001    }
3002    if (!is.null(idx)){
3003      if (verbose) {cat("Optimized subsetting with index '", paste0( idxCols, collapse = "__"),"'\n",sep="");flush.console()}
3004    }
3005  }
3006  if (is.null(idx)){
3007    ## if nothing else helped, auto create a new index that can be used
3008    if (!getOption("datatable.auto.index")) return(NULL)
3009    if (verbose) {cat("Creating new index '", paste0(names(i), collapse = "__"),"'\n",sep="");flush.console()}
3010    if (verbose) {last.started.at=proc.time();cat("Creating index", paste0(names(i), collapse = "__"), "done in ... ");flush.console()}
3011    setindexv(x, names(i))
3012    if (verbose) {cat(timetaken(last.started.at),"\n");flush.console()}
3013    if (verbose) {cat("Optimized subsetting with index '", paste0(names(i), collapse = "__"),"'\n",sep="");flush.console()}
3014    idx = attr(attr(x, "index", exact=TRUE), paste0("__", names(i), collapse = ""), exact=TRUE)
3015    idxCols = names(i)
3016  }
3017  if(!is.null(idxCols)){
3018    setkeyv(i, idxCols)
3019    on = on[idxCols] ## make sure 'on' is in the correct order. Otherwise the logic won't recognise that a key / index already exists.
3020  }
3021  return(list(i  = i,
3022              on = on,
3023              notjoin = notjoin
3024              )
3025         )
3026}
3027
3028
3029.parse_on = function(onsub, isnull_inames) {
3030  ## helper that takes the 'on' string(s) and extracts comparison operators and column names from it.
3031  #' @param onsub the substituted on
3032  #' @param isnull_inames bool; TRUE if i has no names.
3033  #' @return List with two entries:
3034  #'         'on' : character vector providing the column names for the join.
3035  #'                Names correspond to columns in x, entries correspond to columns in i
3036  #'         'ops': integer vector. Gives the indices of the operators that connect the columns in x and i.
3037  ops = c("==", "<=", "<", ">=", ">", "!=")
3038  pat = paste0("(", ops, ")", collapse="|")
3039  if (onsub %iscall% 'eval') {
3040    onsub = eval(onsub[[2L]], parent.frame(2L), parent.frame(2L))
3041  }
3042  if (onsub %iscall% c('list', '.')) {
3043    spat = paste0("[ ]+(", pat, ")[ ]+")
3044    onsub = lapply(as.list(onsub)[-1L], function(x) gsub(spat, "\\1", deparse(x, width.cutoff=500L)))
3045    onsub = as.call(c(quote(c), onsub))
3046  }
3047  on = eval(onsub, parent.frame(2L), parent.frame(2L))
3048  if (length(on) == 0L || !is.character(on))
3049    stop("'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.")
3050  ## extract the operators and potential variable names from 'on'.
3051  ## split at backticks to take care about variable names like `col1<=`.
3052  pieces = strsplit(on, "(?=[`])", perl = TRUE)
3053  xCols  = character(length(on))
3054  ## if 'on' is named, the names are the xCols for sure
3055  if(!is.null(names(on))){
3056    xCols = names(on)
3057  }
3058  iCols     = character(length(on))
3059  operators = character(length(on))
3060  ## loop over the elements and extract operators and column names.
3061  for(i in seq_along(pieces)){
3062    thisCols      = character(0L)
3063    thisOperators = character(0L)
3064    j = 1L
3065    while(j <= length(pieces[[i]])){
3066      if(pieces[[i]][j] == "`"){
3067        ## start of a variable name with backtick.
3068        thisCols = c(thisCols, pieces[[i]][j+1L])
3069        j = j+3L # +1 is the column name, +2 is delimiting "`", +3 is next relevant entry.`
3070      } else {
3071        ## no backtick
3072        ## search for operators
3073        thisOperators = c(thisOperators,
3074                           unlist(regmatches(pieces[[i]][j], gregexpr(pat, pieces[[i]][j])),
3075                                  use.names = FALSE))
3076        ## search for column names
3077        thisCols = c(thisCols, trimws(strsplit(pieces[[i]][j], pat)[[1L]]))
3078        ## there can be empty string column names because of trimws, remove them
3079        thisCols = thisCols[thisCols != ""]
3080        j = j+1L
3081      }
3082    }
3083    if (length(thisOperators) == 0L) {
3084      ## if no operator is given, it must be ==
3085      operators[i] = "=="
3086    } else if (length(thisOperators) == 1L) {
3087      operators[i] = thisOperators
3088    } else {
3089      ## multiple operators found in one 'on' part. Something is wrong.
3090      stop("Found more than one operator in one 'on' statement: ", on[i], ". Please specify a single operator.")
3091    }
3092    if (length(thisCols) == 2L){
3093      ## two column names found, first is xCol, second is iCol for sure
3094      xCols[i] = thisCols[1L]
3095      iCols[i] = thisCols[2L]
3096    } else if (length(thisCols) == 1L){
3097      ## a single column name found. Can mean different things
3098      if(xCols[i] != ""){
3099        ## xCol is given by names(on). thisCols must be iCol
3100        iCols[i] = thisCols[1L]
3101      } else if (isnull_inames){
3102        ## i has no names. It will be given the names V1, V2, ... automatically.
3103        ## The single column name is the x column. It will match to the ith column in i.
3104        xCols[i] = thisCols[1L]
3105        iCols[i] = paste0("V", i)
3106      } else {
3107        ## i has names and one single column name is given by on.
3108        ## This means that xCol and iCol have the same name.
3109        xCols[i] = thisCols[1L]
3110        iCols[i] = thisCols[1L]
3111      }
3112    } else if (length(thisCols) == 0L){
3113      stop("'on' contains no column name: ", on[i], ". Each 'on' clause must contain one or two column names.")
3114    } else {
3115      stop("'on' contains more than 2 column names: ", on[i], ". Each 'on' clause must contain one or two column names.")
3116    }
3117  }
3118  idx_op = match(operators, ops, nomatch=0L)
3119  if (any(idx_op %in% c(0L, 6L)))
3120    stop("Invalid operators ", paste(operators[idx_op %in% c(0L, 6L)], collapse=","), ". Only allowed operators are ", paste(ops[1:5], collapse=""), ".")
3121  ## the final on will contain the xCol as name, the iCol as value
3122  on = iCols
3123  names(on) = xCols
3124  return(list(on = on, ops = idx_op))
3125}
3126