1 #include "data.table.h"
2 #include <Rdefines.h>
3 #include <fcntl.h>
4 #include <time.h>
5 
anySpecialStatic(SEXP x)6 static bool anySpecialStatic(SEXP x) {
7   // Special refers to special symbols .BY, .I, .N, and .GRP; see special-symbols.Rd
8   // Static because these are like C static arrays which are the same memory for each group; e.g., dogroups
9   // creates .SD for the largest group once up front, overwriting the contents for each group. Their
10   // value changes across group but not their memory address. (.NGRP is also special static but its value
11   // is constant across groups so that's excluded here.)
12   // This works well, other than a relatively rare case when two conditions are both true :
13   //   1) the j expression returns a group column as-is without doing any aggregation
14   //   2) that result is placed in a list column result
15   // The list column result can then incorrectly contain the result for the last group repeated for all
16   // groups because the list column ends up holding a pointer to these special static vectors.
17   // See test 2153, and to illustrate here, consider a simplified test 1341
18   // > DT
19   //        x     y
20   //    <int> <int>
21   // 1:     1     1
22   // 2:     2     2
23   // 3:     1     3
24   // 4:     2     4
25   // > DT[, .(list(y)), by=x]
26   //        x     V1
27   //    <int> <list>
28   // 1:     1    2,4  # should be 1,3
29   // 2:     2    2,4
30   //
31   // This has been fixed for a decade but the solution has changed over time.
32   //
33   // We don't wish to inspect the j expression for these cases because there are so many; e.g. user defined functions.
34   // A special symbol does not need to appear in j for the problem to occur. Using a member of .SD is enough as the example above illustrates.
35   // Using R's own reference counting could invoke too many unnecessary copies because these specials are routinely referenced.
36   // Hence we mark these specials (SD, BY, I) here in dogroups and if j's value is being assigned to a list column, we check to
37   // see if any specials are present and copy them if so.
38   // This keeps the special logic in one place in one file here. Previously this copy was done by memrecycle in assign.c but then
39   // with PR#4164 started to copy input list columns too much. Hence PR#4655 in v1.13.2 moved that copy here just where it is needed.
40   // Currently the marker is negative truelength. These specials are protected by us here and before we release them
41   // we restore the true truelength for when R starts to use vector truelength.
42   const int n = length(x);
43   // use length() not LENGTH() because LENGTH() on NULL is segfault in R<3.5 where we still define USE_RINTERNALS
44   // (see data.table.h), and isNewList() is true for NULL
45   if (n==0)
46     return false;
47   if (isVectorAtomic(x))
48     return ALTREP(x) || TRUELENGTH(x)<0;
49   if (isNewList(x)) {
50     if (TRUELENGTH(x)<0)
51       return true;  // test 2158
52     for (int i=0; i<n; ++i) {
53       if (anySpecialStatic(VECTOR_ELT(x,i)))
54         return true;
55     }
56   }
57   return false;
58 }
59 
dogroups(SEXP dt,SEXP dtcols,SEXP groups,SEXP grpcols,SEXP jiscols,SEXP xjiscols,SEXP grporder,SEXP order,SEXP starts,SEXP lens,SEXP jexp,SEXP env,SEXP lhs,SEXP newnames,SEXP on,SEXP verboseArg)60 SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, SEXP on, SEXP verboseArg)
61 {
62   R_len_t ngrp, nrowgroups, njval=0, ngrpcols, ansloc=0, maxn, estn=-1, thisansloc, grpn, thislen, igrp;
63   int nprotect=0;
64   SEXP ans=NULL, jval, thiscol, BY, N, I, GRP, iSD, xSD, rownames, s, RHS, target, source;
65   Rboolean wasvector, firstalloc=FALSE, NullWarnDone=FALSE;
66   clock_t tstart=0, tblock[10]={0}; int nblock[10]={0};
67   const bool verbose = LOGICAL(verboseArg)[0]==1;
68 
69   if (!isInteger(order)) error(_("Internal error: order not integer vector")); // # nocov
70   if (TYPEOF(starts) != INTSXP) error(_("Internal error: starts not integer")); // # nocov
71   if (TYPEOF(lens) != INTSXP) error(_("Internal error: lens not integer")); // # nocov
72   // starts can now be NA (<0): if (INTEGER(starts)[0]<0 || INTEGER(lens)[0]<0) error(_("starts[1]<0 or lens[1]<0"));
73   if (!isNull(jiscols) && LENGTH(order) && !LOGICAL(on)[0]) error(_("Internal error: jiscols not NULL but o__ has length")); // # nocov
74   if (!isNull(xjiscols) && LENGTH(order) && !LOGICAL(on)[0]) error(_("Internal error: xjiscols not NULL but o__ has length")); // # nocov
75   if(!isEnvironment(env)) error(_("'env' should be an environment"));
76   ngrp = length(starts);  // the number of groups  (nrow(groups) will be larger when by)
77   ngrpcols = length(grpcols);
78   nrowgroups = length(VECTOR_ELT(groups,0));
79   // fix for longstanding FR/bug, #495. E.g., DT[, c(sum(v1), lapply(.SD, mean)), by=grp, .SDcols=v2:v3] resulted in error.. the idea is, 1) we create .SDall, which is normally == .SD. But if extra vars are detected in jexp other than .SD, then .SD becomes a shallow copy of .SDall with only .SDcols in .SD. Since internally, we don't make a copy, changing .SDall will reflect in .SD. Hopefully this'll workout :-).
80   SEXP SDall = PROTECT(findVar(install(".SDall"), env)); nprotect++;  // PROTECT for rchk
81   SEXP SD = PROTECT(findVar(install(".SD"), env)); nprotect++;
82 
83   defineVar(sym_BY, BY = PROTECT(allocVector(VECSXP, ngrpcols)), env); nprotect++;  // PROTECT for rchk
84   SEXP bynames = PROTECT(allocVector(STRSXP, ngrpcols));  nprotect++;   // TO DO: do we really need bynames, can we assign names afterwards in one step?
85   for (int i=0; i<ngrpcols; ++i) {
86     int j = INTEGER(grpcols)[i]-1;
87     SET_VECTOR_ELT(BY, i, allocVector(TYPEOF(VECTOR_ELT(groups, j)),
88       nrowgroups ? 1 : 0)); // TODO: might be able to be 1 always but 0 when 'groups' are integer(0) seem sensible. #2440 was involved in the past.
89     // Fix for #36, by cols with attributes when also used in `j` lost the attribute.
90     copyMostAttrib(VECTOR_ELT(groups, j), VECTOR_ELT(BY,i));  // not names, otherwise test 778 would fail
91     SET_STRING_ELT(bynames, i, STRING_ELT(getAttrib(groups,R_NamesSymbol), j));
92     defineVar(install(CHAR(STRING_ELT(bynames,i))), VECTOR_ELT(BY,i), env);      // by vars can be used by name in j as well as via .BY
93     if (SIZEOF(VECTOR_ELT(BY,i))==0)
94       error(_("Internal error: unsupported size-0 type '%s' in column %d of 'by' should have been caught earlier"), type2char(TYPEOF(VECTOR_ELT(BY, i))), i+1); // # nocov
95     SET_TRUELENGTH(VECTOR_ELT(BY,i), -1); // marker for anySpecialStatic(); see its comments
96   }
97   setAttrib(BY, R_NamesSymbol, bynames); // Fix for #42 - BY doesn't retain names anymore
98   R_LockBinding(sym_BY, env);
99   if (isNull(jiscols) && (length(bynames)!=length(groups) || length(bynames)!=length(grpcols))) error(_("!length(bynames)[%d]==length(groups)[%d]==length(grpcols)[%d]"),length(bynames),length(groups),length(grpcols));
100   // TO DO: check this check above.
101 
102   N =   PROTECT(findVar(install(".N"), env));   nprotect++; // PROTECT for rchk
103   SET_TRUELENGTH(N, -1);  // marker for anySpecialStatic(); see its comments
104   GRP = PROTECT(findVar(install(".GRP"), env)); nprotect++;
105   SET_TRUELENGTH(GRP, -1);  // marker for anySpecialStatic(); see its comments
106   iSD = PROTECT(findVar(install(".iSD"), env)); nprotect++; // 1-row and possibly no cols (if no i variables are used via JIS)
107   xSD = PROTECT(findVar(install(".xSD"), env)); nprotect++;
108   R_len_t maxGrpSize = 0;
109   const int *ilens = INTEGER(lens), n=LENGTH(lens);
110   for (R_len_t i=0; i<n; ++i) {
111     if (ilens[i] > maxGrpSize) maxGrpSize = ilens[i];
112   }
113   defineVar(install(".I"), I = PROTECT(allocVector(INTSXP, maxGrpSize)), env); nprotect++;
114   SET_TRUELENGTH(I, -maxGrpSize);  // marker for anySpecialStatic(); see its comments
115   R_LockBinding(install(".I"), env);
116 
117   SEXP dtnames = PROTECT(getAttrib(dt, R_NamesSymbol)); nprotect++; // added here to fix #91 - `:=` did not issue recycling warning during "by"
118   // fetch rownames of .SD.  rownames[1] is set to -thislen for each group, in case .SD is passed to
119   // non data.table aware package that uses rownames
120   for (s = ATTRIB(SD); s != R_NilValue && TAG(s)!=R_RowNamesSymbol; s = CDR(s));  // getAttrib0 basically but that's hidden in attrib.c
121   if (s==R_NilValue) error(_("row.names attribute of .SD not found"));
122   rownames = CAR(s);
123   if (!isInteger(rownames) || LENGTH(rownames)!=2 || INTEGER(rownames)[0]!=NA_INTEGER) error(_("row.names of .SD isn't integer length 2 with NA as first item; i.e., .set_row_names(). [%s %d %d]"),type2char(TYPEOF(rownames)),LENGTH(rownames),INTEGER(rownames)[0]);
124 
125   // fetch names of .SD and prepare symbols. In case they are copied-on-write by user assigning to those variables
126   // using <- in j (which is valid, useful and tested), they are repointed to the .SD cols for each group.
127   SEXP names = PROTECT(getAttrib(SDall, R_NamesSymbol)); nprotect++;
128   if (length(names) != length(SDall)) error(_("length(names)!=length(SD)"));
129   SEXP *nameSyms = (SEXP *)R_alloc(length(names), sizeof(SEXP));
130 
131   for(int i=0; i<length(SDall); ++i) {
132     SEXP this = VECTOR_ELT(SDall, i);
133     if (SIZEOF(this)==0)
134       error(_("Internal error: size-0 type %d in .SD column %d should have been caught earlier"), TYPEOF(this), i); // # nocov
135     if (LENGTH(this) != maxGrpSize)
136       error(_("Internal error: SDall %d length = %d != %d"), i+1, LENGTH(this), maxGrpSize); // # nocov
137     nameSyms[i] = install(CHAR(STRING_ELT(names, i)));
138     // fixes http://stackoverflow.com/questions/14753411/why-does-data-table-lose-class-definition-in-sd-after-group-by
139     copyMostAttrib(VECTOR_ELT(dt,INTEGER(dtcols)[i]-1), this);  // not names, otherwise test 778 would fail
140     SET_TRUELENGTH(this, -maxGrpSize);  // marker for anySpecialStatic(); see its comments
141   }
142 
143   SEXP xknames = PROTECT(getAttrib(xSD, R_NamesSymbol)); nprotect++;
144   if (length(xknames) != length(xSD)) error(_("length(xknames)!=length(xSD)"));
145   SEXP *xknameSyms = (SEXP *)R_alloc(length(xknames), sizeof(SEXP));
146   for(int i=0; i<length(xSD); ++i) {
147     if (SIZEOF(VECTOR_ELT(xSD, i))==0)
148       error(_("Internal error: type %d in .xSD column %d should have been caught by now"), TYPEOF(VECTOR_ELT(xSD, i)), i); // # nocov
149     xknameSyms[i] = install(CHAR(STRING_ELT(xknames, i)));
150   }
151 
152   if (length(iSD)!=length(jiscols)) error(_("length(iSD)[%d] != length(jiscols)[%d]"),length(iSD),length(jiscols));
153   if (length(xSD)!=length(xjiscols)) error(_("length(xSD)[%d] != length(xjiscols)[%d]"),length(xSD),length(xjiscols));
154 
155   SEXP listwrap = PROTECT(allocVector(VECSXP, 1)); nprotect++;
156   Rboolean jexpIsSymbolOtherThanSD = (isSymbol(jexp) && strcmp(CHAR(PRINTNAME(jexp)),".SD")!=0);  // test 559
157 
158   ansloc = 0;
159   const int *istarts = INTEGER(starts);
160   const int *iorder = INTEGER(order);
161   for(int i=0; i<ngrp; ++i) {   // even for an empty i table, ngroup is length 1 (starts is value 0), for consistency of empty cases
162 
163     if (istarts[i]==0 && (i<ngrp-1 || estn>-1)) continue;
164     // Previously had replaced (i>0 || !isNull(lhs)) with i>0 to fix #49
165     // The above is now to fix #1993, see test 1746.
166     // In cases were no i rows match, '|| estn>-1' ensures that the last empty group creates an empty result.
167     // TODO: revisit and tidy
168 
169     if (!isNull(lhs) &&
170         (istarts[i] == NA_INTEGER ||
171          (LENGTH(order) && iorder[ istarts[i]-1 ]==NA_INTEGER)))
172       continue;
173     grpn = ilens[i];
174     INTEGER(N)[0] = istarts[i] == NA_INTEGER ? 0 : grpn;
175     // .N is number of rows matched to ( 0 even when nomatch is NA)
176     INTEGER(GRP)[0] = i+1;  // group counter exposed as .GRP
177     INTEGER(rownames)[1] = -grpn;  // the .set_row_names() of .SD. Not .N when nomatch=NA and this is a nomatch
178     for (int j=0; j<length(SDall); ++j) {
179       SETLENGTH(VECTOR_ELT(SDall,j), grpn);  // before copying data in otherwise assigning after the end could error R API checks
180       defineVar(nameSyms[j], VECTOR_ELT(SDall, j), env);
181       // Redo this defineVar for each group in case user's j assigned to the column names (env is static) (tests 387 and 388)
182       // nameSyms pre-stored to save repeated install() for efficiency, though.
183     }
184     for (int j=0; j<length(xSD); ++j) {
185       defineVar(xknameSyms[j], VECTOR_ELT(xSD, j), env);
186     }
187 
188     if (length(iSD) && length(VECTOR_ELT(iSD, 0))/*#4364*/) for (int j=0; j<length(iSD); ++j) {   // either this or the next for() will run, not both
189       memrecycle(VECTOR_ELT(iSD,j), R_NilValue, 0, 1, VECTOR_ELT(groups, INTEGER(jiscols)[j]-1), i, 1, j+1, "Internal error assigning to iSD");
190       // we're just use memrecycle here to assign a single value
191     }
192     // igrp determines the start of the current group in rows of dt (0 based).
193     // if jiscols is not null, we have a by = .EACHI, so the start is exactly i.
194     // Otherwise, igrp needs to be determined from starts, potentially taking care about the order if present.
195     igrp = !isNull(jiscols) ? i : (length(grporder) ? INTEGER(grporder)[istarts[i]-1]-1 : istarts[i]-1);
196     if (igrp>=0 && nrowgroups) for (int j=0; j<length(BY); ++j) {    // igrp can be -1 so 'if' is important, otherwise memcpy crash
197       memrecycle(VECTOR_ELT(BY,j), R_NilValue, 0, 1, VECTOR_ELT(groups, INTEGER(grpcols)[j]-1), igrp, 1, j+1, "Internal error assigning to BY");
198     }
199     if (istarts[i] == NA_INTEGER || (LENGTH(order) && iorder[ istarts[i]-1 ]==NA_INTEGER)) {
200       for (int j=0; j<length(SDall); ++j) {
201         writeNA(VECTOR_ELT(SDall, j), 0, 1);
202         // writeNA uses SET_ for STR and VEC, and we always use SET_ to assign to SDall always too. Otherwise,
203         // this writeNA could decrement the reference for the old value which wasn't incremented in the first place.
204         // Further, the eval(jval) could feasibly assign to SD although that is currently caught and disallowed. If that
205         // became possible, that assign from user's j expression would decrement the reference which wasn't incremented
206         // in the first place. And finally, upon release of SD, values will be decremented, where they weren't incremented
207         // in the first place. All in all, too risky to write behind the barrier in this section.
208         // Or in the words, this entire section, and this entire dogroups.c file, is now write-barrier compliant from v1.12.10
209         // and we hope that reference counting on by default from R 4.0 will avoid costly gc()s.
210       }
211       grpn = 1;  // it may not be 1 e.g. test 722. TODO: revisit.
212       SETLENGTH(I, grpn);
213       INTEGER(I)[0] = 0;
214       for (int j=0; j<length(xSD); ++j) {
215         writeNA(VECTOR_ELT(xSD, j), 0, 1);
216       }
217     } else {
218       if (verbose) tstart = clock();
219       SETLENGTH(I, grpn);
220       int *iI = INTEGER(I);
221       if (LENGTH(order)==0) {
222         const int rownum = grpn ? istarts[i]-1 : -1;
223         for (int j=0; j<grpn; ++j) iI[j] = rownum+j+1;
224         if (rownum>=0) {
225           for (int j=0; j<length(SDall); ++j)
226             memrecycle(VECTOR_ELT(SDall,j), R_NilValue, 0, grpn, VECTOR_ELT(dt, INTEGER(dtcols)[j]-1), rownum, grpn, j+1, "Internal error assigning to SDall");
227           for (int j=0; j<length(xSD); ++j)
228             memrecycle(VECTOR_ELT(xSD,j), R_NilValue, 0, 1, VECTOR_ELT(dt, INTEGER(xjiscols)[j]-1), rownum, 1, j+1, "Internal error assigning to xSD");
229         }
230         if (verbose) { tblock[0] += clock()-tstart; nblock[0]++; }
231       } else {
232         const int rownum = istarts[i]-1;
233         for (int k=0; k<grpn; ++k) iI[k] = iorder[rownum+k];
234         for (int j=0; j<length(SDall); ++j) {
235           // this is the main non-contiguous gather, and is parallel (within-column) for non-SEXP
236           subsetVectorRaw(VECTOR_ELT(SDall,j), VECTOR_ELT(dt,INTEGER(dtcols)[j]-1), I, /*anyNA=*/false);
237         }
238         if (verbose) { tblock[1] += clock()-tstart; nblock[1]++; }
239         // The two blocks have separate timing statements to make sure which is running
240       }
241     }
242 
243     if (verbose) tstart = clock();  // call to clock() is more expensive than an 'if'
244     PROTECT(jval = eval(jexp, env));
245     if (verbose) { tblock[2] += clock()-tstart; nblock[2]++; }
246 
247     if (isNull(jval))  {
248       // j may be a plot or other side-effect only
249       UNPROTECT(1);
250       continue;
251     }
252     if ((wasvector = (isVectorAtomic(jval) || jexpIsSymbolOtherThanSD))) {  // see test 559
253       // Prior to 1.8.1 the wrap with list() was done up in R after the first group had tested.
254       // This wrap is done to make the code below easier to loop through. listwrap avoids copying jval.
255       SET_VECTOR_ELT(listwrap,0,jval);
256       jval = listwrap;
257     } else {
258       if (!isNewList(jval))    // isNewList tests for VECSXP. isList tests for (old) LISTSXP
259         error(_("j evaluates to type '%s'. Must evaluate to atomic vector or list."),type2char(TYPEOF(jval)));
260       if (!LENGTH(jval)) {
261         UNPROTECT(1);
262         continue;
263       }
264       for (int j=0; j<LENGTH(jval); ++j) {
265         thiscol = VECTOR_ELT(jval,j);
266         if (!isNull(thiscol) && (!isVector(thiscol) || isFrame(thiscol) || isArray(thiscol) ))
267           error(_("All items in j=list(...) should be atomic vectors or lists. If you are trying something like j=list(.SD,newcol=mean(colA)) then use := by group instead (much quicker), or cbind or merge afterwards."));
268       }
269     }
270     if (!isNull(lhs)) {
271       R_len_t origncol = LENGTH(dt);
272       // check jval first before proceeding to add columns, so that if error on the first group, the columns aren't added
273       for (int j=0; j<length(lhs); ++j) {
274         RHS = VECTOR_ELT(jval,j%LENGTH(jval));
275         if (isNull(RHS))
276           error(_("RHS of := is NULL during grouped assignment, but it's not possible to delete parts of a column."));
277         int vlen = length(RHS);
278         if (vlen>1 && vlen!=grpn) {
279           SEXP colname = isNull(VECTOR_ELT(dt, INTEGER(lhs)[j]-1)) ? STRING_ELT(newnames, INTEGER(lhs)[j]-origncol-1) : STRING_ELT(dtnames,INTEGER(lhs)[j]-1);
280           error(_("Supplied %d items to be assigned to group %d of size %d in column '%s'. The RHS length must either be 1 (single values are ok) or match the LHS length exactly. If you wish to 'recycle' the RHS please use rep() explicitly to make this intent clear to readers of your code."),vlen,i+1,grpn,CHAR(colname));
281           // e.g. in #91 `:=` did not issue recycling warning during grouping. Now it is error not warning.
282         }
283       }
284       int n = LENGTH(VECTOR_ELT(dt, 0));
285       for (int j=0; j<length(lhs); ++j) {
286         int colj = INTEGER(lhs)[j]-1;
287         target = VECTOR_ELT(dt, colj);
288         RHS = VECTOR_ELT(jval,j%LENGTH(jval));
289         if (isNull(target)) {
290           // first time adding to new column
291           if (TRUELENGTH(dt) < colj+1) error(_("Internal error: Trying to add new column by reference but tl is full; setalloccol should have run first at R level before getting to this point in dogroups")); // # nocov
292           target = PROTECT(allocNAVectorLike(RHS, n));
293           // Even if we could know reliably to switch from allocNAVectorLike to allocVector for slight speedup, user code could still
294           // contain a switched halt, and in that case we'd want the groups not yet done to have NA rather than 0 or uninitialized.
295           // Increment length only if the allocation passes, #1676. But before SET_VECTOR_ELT otherwise attempt-to-set-index-n/n R error
296           SETLENGTH(dtnames, LENGTH(dtnames)+1);
297           SETLENGTH(dt, LENGTH(dt)+1);
298           SET_VECTOR_ELT(dt, colj, target);
299           UNPROTECT(1);
300           SET_STRING_ELT(dtnames, colj, STRING_ELT(newnames, colj-origncol));
301           copyMostAttrib(RHS, target); // attributes of first group dominate; e.g. initial factor levels come from first group
302         }
303         bool copied = false;
304         if (isNewList(target) && anySpecialStatic(RHS)) {  // see comments in anySpecialStatic()
305           RHS = PROTECT(copyAsPlain(RHS));
306           copied = true;
307         }
308         const char *warn = memrecycle(target, order, INTEGER(starts)[i]-1, grpn, RHS, 0, -1, 0, "");
309         // can't error here because length mismatch already checked for all jval columns before starting to add any new columns
310         if (copied) UNPROTECT(1);
311         if (warn)
312           warning(_("Group %d column '%s': %s"), i+1, CHAR(STRING_ELT(dtnames, colj)), warn);
313       }
314       UNPROTECT(1); // jval
315       continue;
316     }
317     maxn = 0;
318     if (njval==0) njval = LENGTH(jval);   // for first group, then the rest (when non 0) must conform to the first >0 group
319     if (njval!=LENGTH(jval)) error(_("j doesn't evaluate to the same number of columns for each group"));  // this would be a problem even if we unlisted afterwards. This way the user finds out earlier though so he can fix and rerun sooner.
320     for (int j=0; j<njval; ++j) {
321       int k = length(VECTOR_ELT(jval,j));  // might be NULL, so length not LENGTH
322       maxn = k>maxn ? k : maxn;
323     }
324     if (ansloc + maxn > estn) {
325       if (estn == -1) {
326         // Given first group and j's result on it, make a good guess for size of result required.
327         if (grpn==0)
328           estn = maxn = 0;   // empty case e.g. test 184. maxn is 1 here due to sum(integer()) == 0L
329         else if (maxn==1) // including when grpn==1 we default to assuming it's an aggregate
330           estn = LENGTH(starts);
331           // Common case 1 : j is a list of simple aggregates i.e. list of atoms only
332         else if (maxn >= grpn) {
333           estn = 0;
334           for (int j=0; j<LENGTH(lens); ++j) estn+=ilens[j];
335           // Common case 2 : j returns as many rows as there are in the group (maybe a join)
336           // TO DO: this might over allocate if first group has 1 row and j is actually a single row aggregate
337           //        in cases when we're not sure could wait for the first few groups before deciding.
338         } else  // maxn < grpn
339           estn = maxn * LENGTH(starts);
340           // Common case 3 : head or tail of .SD perhaps
341         if (estn<maxn) estn=maxn;  // if the result for the first group is larger than the table itself(!) Unusual case where a join is being done in j via .SD and the 1-row table is an edge case of bigger picture.
342         PROTECT(ans = allocVector(VECSXP, ngrpcols + njval));
343         nprotect++;
344         firstalloc=TRUE;
345         for(int j=0; j<ngrpcols; ++j) {
346           thiscol = VECTOR_ELT(groups, INTEGER(grpcols)[j]-1);
347           SET_VECTOR_ELT(ans, j, allocVector(TYPEOF(thiscol), estn));
348           copyMostAttrib(thiscol, VECTOR_ELT(ans,j));  // not names, otherwise test 778 would fail
349         }
350         for(int j=0; j<njval; ++j) {
351           thiscol = VECTOR_ELT(jval, j);
352           if (isNull(thiscol))
353             error(_("Column %d of j's result for the first group is NULL. We rely on the column types of the first result to decide the type expected for the remaining groups (and require consistency). NULL columns are acceptable for later groups (and those are replaced with NA of appropriate type and recycled) but not for the first. Please use a typed empty vector instead, such as integer() or numeric()."), j+1);
354           if (verbose && !isNull(getAttrib(thiscol, R_NamesSymbol))) {
355             if (wasvector) {
356               Rprintf(_("j appears to be a named vector. The same names will likely be created over and over again for each group and slow things down. Try and pass a named list (which data.table optimizes) or an unnamed list() instead.\n"));
357             } else {
358               Rprintf(_("Column %d of j is a named vector (each item down the rows is named, somehow). Please remove those names for efficiency (to save creating them over and over for each group). They are ignored anyway.\n"), j+1);
359             }
360           }
361           SET_VECTOR_ELT(ans, ngrpcols+j, allocVector(TYPEOF(thiscol), estn));
362           copyMostAttrib(thiscol, VECTOR_ELT(ans,ngrpcols+j));  // not names, otherwise test 276 would fail
363         }
364         SEXP jvalnames = PROTECT(getAttrib(jval, R_NamesSymbol));
365         if (!isNull(jvalnames)) {
366           if (verbose) Rprintf(_("The result of j is a named list. It's very inefficient to create the same names over and over again for each group. When j=list(...), any names are detected, removed and put back after grouping has completed, for efficiency. Using j=transform(), for example, prevents that speedup (consider changing to :=). This message may be upgraded to warning in future.\n"));  // e.g. test 104 has j=transform().
367           // names of result come from the first group and the names of remaining groups are ignored (all that matters for them is that the number of columns (and their types) match the first group.
368           SEXP names2 = PROTECT(allocVector(STRSXP,ngrpcols+njval));
369           //  for (j=0; j<ngrpcols; j++) SET_STRING_ELT(names2, j, STRING_ELT(bynames,j));  // These get set back up in R
370           for (int j=0; j<njval; ++j) SET_STRING_ELT(names2, ngrpcols+j, STRING_ELT(jvalnames,j));
371           setAttrib(ans, R_NamesSymbol, names2);
372           UNPROTECT(1); // names2
373           // setAttrib(SD, R_NamesSymbol, R_NilValue); // so that lapply(.SD,mean) is unnamed from 2nd group on
374         }
375         UNPROTECT(1); // jvalnames
376       } else {
377         estn = ((double)ngrp/i)*1.1*(ansloc+maxn);
378         if (verbose) Rprintf(_("dogroups: growing from %d to %d rows\n"), length(VECTOR_ELT(ans,0)), estn);
379         if (length(ans) != ngrpcols + njval) error(_("dogroups: length(ans)[%d]!=ngrpcols[%d]+njval[%d]"),length(ans),ngrpcols,njval);
380         for (int j=0; j<length(ans); ++j) SET_VECTOR_ELT(ans, j, growVector(VECTOR_ELT(ans,j), estn));
381       }
382     }
383     // write the group values to ans, recycled to match the nrow of the result for this group ...
384     for (int j=0; j<ngrpcols; ++j) {
385       memrecycle(VECTOR_ELT(ans,j), R_NilValue, ansloc, maxn, VECTOR_ELT(groups, INTEGER(grpcols)[j]-1), igrp, 1, j+1, "Internal error recycling group values");
386     }
387     // Now copy jval into ans ...
388     for (int j=0; j<njval; ++j) {
389       thisansloc = ansloc;
390       source = VECTOR_ELT(jval,j);
391       thislen = length(source);
392       target = VECTOR_ELT(ans, j+ngrpcols);
393       if (thislen == 0) {
394         // including NULL and typed empty vectors, fill with NA
395         // A NULL in the first group's jval isn't allowed; caught above after allocating ans
396         if (!NullWarnDone && maxn>1) {  // maxn==1 in tests 172,280,281,282,403,405 and 406
397           warning(_("Item %d of j's result for group %d is zero length. This will be filled with %d NAs to match the longest column in this result. Later groups may have a similar problem but only the first is reported to save filling the warning buffer."), j+1, i+1, maxn);
398           NullWarnDone = TRUE;
399         }
400         writeNA(target, thisansloc, maxn);
401       } else {
402         // thislen>0
403         if (TYPEOF(source) != TYPEOF(target))
404           error(_("Column %d of result for group %d is type '%s' but expecting type '%s'. Column types must be consistent for each group."), j+1, i+1, type2char(TYPEOF(source)), type2char(TYPEOF(target)));
405         if (thislen>1 && thislen!=maxn && grpn>0) {  // grpn>0 for grouping empty tables; test 1986
406           error(_("Supplied %d items for column %d of group %d which has %d rows. The RHS length must either be 1 (single values are ok) or match the LHS length exactly. If you wish to 'recycle' the RHS please use rep() explicitly to make this intent clear to readers of your code."), thislen, j+1, i+1, maxn);
407         }
408         bool copied = false;
409         if (isNewList(target) && anySpecialStatic(source)) {  // see comments in anySpecialStatic()
410           source = PROTECT(copyAsPlain(source));
411           copied = true;
412         }
413         memrecycle(target, R_NilValue, thisansloc, maxn, source, 0, -1, 0, "");
414         if (copied) UNPROTECT(1);
415       }
416     }
417     ansloc += maxn;
418     if (firstalloc) {
419       nprotect++;          //  remember the first jval. If we UNPROTECTed now, we'd unprotect
420       firstalloc = FALSE;  //  ans. The first jval is needed to create the right size and type of ans.
421       // TO DO: could avoid this last 'if' by adding a dummy PROTECT after first alloc for this UNPROTECT(1) to do.
422     }
423     else UNPROTECT(1);  // the jval. Don't want them to build up. The first jval can stay protected till the end ok.
424   }
425   if (isNull(lhs) && ans!=NULL) {
426     if (ansloc < LENGTH(VECTOR_ELT(ans,0))) {
427       if (verbose) Rprintf(_("Wrote less rows (%d) than allocated (%d).\n"),ansloc,LENGTH(VECTOR_ELT(ans,0)));
428       for (int j=0; j<length(ans); j++) SET_VECTOR_ELT(ans, j, growVector(VECTOR_ELT(ans,j), ansloc));
429       // shrinks (misuse of word 'grow') back to the rows written, otherwise leak until ...
430       // ... TO DO: set truelength to LENGTH(VECTOR_ELT(ans,0)), length to ansloc and enhance finalizer to handle over-allocated rows.
431     }
432   } else ans = R_NilValue;
433   // Now reset length of .SD columns and .I to length of largest group, otherwise leak if the last group is smaller (often is).
434   // Also reset truelength on specials; see comments in anySpecialStatic().
435   for (int j=0; j<length(SDall); ++j) {
436     SEXP this = VECTOR_ELT(SDall,j);
437     SETLENGTH(this, maxGrpSize);
438     SET_TRUELENGTH(this, maxGrpSize);
439   }
440   SETLENGTH(I, maxGrpSize);
441   SET_TRUELENGTH(I, maxGrpSize);
442   for (int i=0; i<length(BY); ++i) {
443     SEXP this = VECTOR_ELT(BY, i);
444     SET_TRUELENGTH(this, length(this)); // might be 0 or 1; see its allocVector above
445   }
446   SET_TRUELENGTH(N, 1);
447   SET_TRUELENGTH(GRP, 1);
448   if (verbose) {
449     if (nblock[0] && nblock[1]) error(_("Internal error: block 0 [%d] and block 1 [%d] have both run"), nblock[0], nblock[1]); // # nocov
450     int w = nblock[1]>0;
451     Rprintf(_("\n  %s took %.3fs for %d groups\n"), w ? "collecting discontiguous groups" : "memcpy contiguous groups",
452                           1.0*tblock[w]/CLOCKS_PER_SEC, nblock[w]);
453     Rprintf(_("  eval(j) took %.3fs for %d calls\n"), 1.0*tblock[2]/CLOCKS_PER_SEC, nblock[2]);
454   }
455   UNPROTECT(nprotect);
456   return(ans);
457 }
458 
keepattr(SEXP to,SEXP from)459 SEXP keepattr(SEXP to, SEXP from)
460 {
461   // Same as R_copyDFattr in src/main/attrib.c, but that seems not exposed in R's api
462   // Only difference is that we reverse from and to in the prototype, for easier calling above
463   SET_ATTRIB(to, ATTRIB(from));
464   IS_S4_OBJECT(from) ?  SET_S4_OBJECT(to) : UNSET_S4_OBJECT(to);
465   SET_OBJECT(to, OBJECT(from));
466   return to;
467 }
468 
growVector(SEXP x,const R_len_t newlen)469 SEXP growVector(SEXP x, const R_len_t newlen)
470 {
471   // Similar to EnlargeVector in src/main/subassign.c, with the following changes :
472   // * replaced switch and loops with one memcpy for INTEGER and REAL, but need to age CHAR and VEC.
473   // * no need to cater for names
474   // * much shorter and faster
475   SEXP newx;
476   R_len_t len = length(x);
477   if (isNull(x)) error(_("growVector passed NULL"));
478   PROTECT(newx = allocVector(TYPEOF(x), newlen));   // TO DO: R_realloc(?) here?
479   if (newlen < len) len=newlen;   // i.e. shrink
480   switch (TYPEOF(x)) {
481   case RAWSXP:  memcpy(RAW(newx),     RAW(x),     len*SIZEOF(x)); break;
482   case LGLSXP:  memcpy(LOGICAL(newx), LOGICAL(x), len*SIZEOF(x)); break;
483   case INTSXP:  memcpy(INTEGER(newx), INTEGER(x), len*SIZEOF(x)); break;
484   case REALSXP: memcpy(REAL(newx),    REAL(x),    len*SIZEOF(x)); break;
485   case CPLXSXP: memcpy(COMPLEX(newx), COMPLEX(x), len*SIZEOF(x)); break;
486   case STRSXP : {
487     const SEXP *xd = SEXPPTR_RO(x);
488     for (int i=0; i<len; ++i)
489       SET_STRING_ELT(newx, i, xd[i]);
490   } break;
491   case VECSXP : {
492     const SEXP *xd = SEXPPTR_RO(x);
493     for (int i=0; i<len; ++i)
494       SET_VECTOR_ELT(newx, i, xd[i]);
495   } break;
496   default :
497     error(_("Internal error: growVector doesn't support type '%s'"), type2char(TYPEOF(x)));  // # nocov
498   }
499   // if (verbose) Rprintf(_("Growing vector from %d to %d items of type '%s'\n"), len, newlen, type2char(TYPEOF(x)));
500   // Would print for every column if here. Now just up in dogroups (one msg for each table grow).
501   keepattr(newx,x);
502   UNPROTECT(1);
503   return newx;
504 }
505 
506 
507 // benchmark timings for #481 fix:
508 // old code - no changes, R v3.0.3
509 // > system.time(dt[, list(list(y)), by=x])
510 //    user  system elapsed
511 //  82.593   0.936  84.314
512 // > system.time(dt[, list(list(y)), by=x])
513 //    user  system elapsed
514 //  34.558   0.628  35.658
515 // > system.time(dt[, list(list(y)), by=x])
516 //    user  system elapsed
517 //  37.056   0.315  37.668
518 //
519 // All new changes in place, R v3.0.3
520 // > system.time(dt[, list(list(y)), by=x])
521 //    user  system elapsed
522 //  82.852   0.952  84.575
523 // > system.time(dt[, list(list(y)), by=x])
524 //    user  system elapsed
525 //  34.600   0.356  35.173
526 // > system.time(dt[, list(list(y)), by=x])
527 //    user  system elapsed
528 //  36.865   0.514  37.901
529 
530 // old code - no changes, R v3.1.0 --- BUT RESULTS ARE WRONG!
531 // > system.time(dt[, list(list(y)), by=x])
532 //    user  system elapsed
533 //  11.022   0.352  11.455
534 // > system.time(dt[, list(list(y)), by=x])
535 //    user  system elapsed
536 //  10.397   0.119  10.600
537 // > system.time(dt[, list(list(y)), by=x])
538 //    user  system elapsed
539 //  10.665   0.101  11.013
540 
541 // All new changes in place, R v3.1.0
542 // > system.time(dt[, list(list(y)), by=x])
543 //    user  system elapsed
544 //  83.279   1.057  89.856
545 // > system.time(dt[, list(list(y)), by=x])
546 //    user  system elapsed
547 //  30.569   0.633  31.452
548 // > system.time(dt[, list(list(y)), by=x])
549 //    user  system elapsed
550 //  30.827   0.239  32.306
551