1 #include "data.table.h"
2 #include <Rdefines.h>
3 #include <fcntl.h>
4 #include <time.h>
5
anySpecialStatic(SEXP x)6 static bool anySpecialStatic(SEXP x) {
7 // Special refers to special symbols .BY, .I, .N, and .GRP; see special-symbols.Rd
8 // Static because these are like C static arrays which are the same memory for each group; e.g., dogroups
9 // creates .SD for the largest group once up front, overwriting the contents for each group. Their
10 // value changes across group but not their memory address. (.NGRP is also special static but its value
11 // is constant across groups so that's excluded here.)
12 // This works well, other than a relatively rare case when two conditions are both true :
13 // 1) the j expression returns a group column as-is without doing any aggregation
14 // 2) that result is placed in a list column result
15 // The list column result can then incorrectly contain the result for the last group repeated for all
16 // groups because the list column ends up holding a pointer to these special static vectors.
17 // See test 2153, and to illustrate here, consider a simplified test 1341
18 // > DT
19 // x y
20 // <int> <int>
21 // 1: 1 1
22 // 2: 2 2
23 // 3: 1 3
24 // 4: 2 4
25 // > DT[, .(list(y)), by=x]
26 // x V1
27 // <int> <list>
28 // 1: 1 2,4 # should be 1,3
29 // 2: 2 2,4
30 //
31 // This has been fixed for a decade but the solution has changed over time.
32 //
33 // We don't wish to inspect the j expression for these cases because there are so many; e.g. user defined functions.
34 // A special symbol does not need to appear in j for the problem to occur. Using a member of .SD is enough as the example above illustrates.
35 // Using R's own reference counting could invoke too many unnecessary copies because these specials are routinely referenced.
36 // Hence we mark these specials (SD, BY, I) here in dogroups and if j's value is being assigned to a list column, we check to
37 // see if any specials are present and copy them if so.
38 // This keeps the special logic in one place in one file here. Previously this copy was done by memrecycle in assign.c but then
39 // with PR#4164 started to copy input list columns too much. Hence PR#4655 in v1.13.2 moved that copy here just where it is needed.
40 // Currently the marker is negative truelength. These specials are protected by us here and before we release them
41 // we restore the true truelength for when R starts to use vector truelength.
42 const int n = length(x);
43 // use length() not LENGTH() because LENGTH() on NULL is segfault in R<3.5 where we still define USE_RINTERNALS
44 // (see data.table.h), and isNewList() is true for NULL
45 if (n==0)
46 return false;
47 if (isVectorAtomic(x))
48 return ALTREP(x) || TRUELENGTH(x)<0;
49 if (isNewList(x)) {
50 if (TRUELENGTH(x)<0)
51 return true; // test 2158
52 for (int i=0; i<n; ++i) {
53 if (anySpecialStatic(VECTOR_ELT(x,i)))
54 return true;
55 }
56 }
57 return false;
58 }
59
dogroups(SEXP dt,SEXP dtcols,SEXP groups,SEXP grpcols,SEXP jiscols,SEXP xjiscols,SEXP grporder,SEXP order,SEXP starts,SEXP lens,SEXP jexp,SEXP env,SEXP lhs,SEXP newnames,SEXP on,SEXP verboseArg)60 SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, SEXP on, SEXP verboseArg)
61 {
62 R_len_t ngrp, nrowgroups, njval=0, ngrpcols, ansloc=0, maxn, estn=-1, thisansloc, grpn, thislen, igrp;
63 int nprotect=0;
64 SEXP ans=NULL, jval, thiscol, BY, N, I, GRP, iSD, xSD, rownames, s, RHS, target, source;
65 Rboolean wasvector, firstalloc=FALSE, NullWarnDone=FALSE;
66 clock_t tstart=0, tblock[10]={0}; int nblock[10]={0};
67 const bool verbose = LOGICAL(verboseArg)[0]==1;
68
69 if (!isInteger(order)) error(_("Internal error: order not integer vector")); // # nocov
70 if (TYPEOF(starts) != INTSXP) error(_("Internal error: starts not integer")); // # nocov
71 if (TYPEOF(lens) != INTSXP) error(_("Internal error: lens not integer")); // # nocov
72 // starts can now be NA (<0): if (INTEGER(starts)[0]<0 || INTEGER(lens)[0]<0) error(_("starts[1]<0 or lens[1]<0"));
73 if (!isNull(jiscols) && LENGTH(order) && !LOGICAL(on)[0]) error(_("Internal error: jiscols not NULL but o__ has length")); // # nocov
74 if (!isNull(xjiscols) && LENGTH(order) && !LOGICAL(on)[0]) error(_("Internal error: xjiscols not NULL but o__ has length")); // # nocov
75 if(!isEnvironment(env)) error(_("'env' should be an environment"));
76 ngrp = length(starts); // the number of groups (nrow(groups) will be larger when by)
77 ngrpcols = length(grpcols);
78 nrowgroups = length(VECTOR_ELT(groups,0));
79 // fix for longstanding FR/bug, #495. E.g., DT[, c(sum(v1), lapply(.SD, mean)), by=grp, .SDcols=v2:v3] resulted in error.. the idea is, 1) we create .SDall, which is normally == .SD. But if extra vars are detected in jexp other than .SD, then .SD becomes a shallow copy of .SDall with only .SDcols in .SD. Since internally, we don't make a copy, changing .SDall will reflect in .SD. Hopefully this'll workout :-).
80 SEXP SDall = PROTECT(findVar(install(".SDall"), env)); nprotect++; // PROTECT for rchk
81 SEXP SD = PROTECT(findVar(install(".SD"), env)); nprotect++;
82
83 defineVar(sym_BY, BY = PROTECT(allocVector(VECSXP, ngrpcols)), env); nprotect++; // PROTECT for rchk
84 SEXP bynames = PROTECT(allocVector(STRSXP, ngrpcols)); nprotect++; // TO DO: do we really need bynames, can we assign names afterwards in one step?
85 for (int i=0; i<ngrpcols; ++i) {
86 int j = INTEGER(grpcols)[i]-1;
87 SET_VECTOR_ELT(BY, i, allocVector(TYPEOF(VECTOR_ELT(groups, j)),
88 nrowgroups ? 1 : 0)); // TODO: might be able to be 1 always but 0 when 'groups' are integer(0) seem sensible. #2440 was involved in the past.
89 // Fix for #36, by cols with attributes when also used in `j` lost the attribute.
90 copyMostAttrib(VECTOR_ELT(groups, j), VECTOR_ELT(BY,i)); // not names, otherwise test 778 would fail
91 SET_STRING_ELT(bynames, i, STRING_ELT(getAttrib(groups,R_NamesSymbol), j));
92 defineVar(install(CHAR(STRING_ELT(bynames,i))), VECTOR_ELT(BY,i), env); // by vars can be used by name in j as well as via .BY
93 if (SIZEOF(VECTOR_ELT(BY,i))==0)
94 error(_("Internal error: unsupported size-0 type '%s' in column %d of 'by' should have been caught earlier"), type2char(TYPEOF(VECTOR_ELT(BY, i))), i+1); // # nocov
95 SET_TRUELENGTH(VECTOR_ELT(BY,i), -1); // marker for anySpecialStatic(); see its comments
96 }
97 setAttrib(BY, R_NamesSymbol, bynames); // Fix for #42 - BY doesn't retain names anymore
98 R_LockBinding(sym_BY, env);
99 if (isNull(jiscols) && (length(bynames)!=length(groups) || length(bynames)!=length(grpcols))) error(_("!length(bynames)[%d]==length(groups)[%d]==length(grpcols)[%d]"),length(bynames),length(groups),length(grpcols));
100 // TO DO: check this check above.
101
102 N = PROTECT(findVar(install(".N"), env)); nprotect++; // PROTECT for rchk
103 SET_TRUELENGTH(N, -1); // marker for anySpecialStatic(); see its comments
104 GRP = PROTECT(findVar(install(".GRP"), env)); nprotect++;
105 SET_TRUELENGTH(GRP, -1); // marker for anySpecialStatic(); see its comments
106 iSD = PROTECT(findVar(install(".iSD"), env)); nprotect++; // 1-row and possibly no cols (if no i variables are used via JIS)
107 xSD = PROTECT(findVar(install(".xSD"), env)); nprotect++;
108 R_len_t maxGrpSize = 0;
109 const int *ilens = INTEGER(lens), n=LENGTH(lens);
110 for (R_len_t i=0; i<n; ++i) {
111 if (ilens[i] > maxGrpSize) maxGrpSize = ilens[i];
112 }
113 defineVar(install(".I"), I = PROTECT(allocVector(INTSXP, maxGrpSize)), env); nprotect++;
114 SET_TRUELENGTH(I, -maxGrpSize); // marker for anySpecialStatic(); see its comments
115 R_LockBinding(install(".I"), env);
116
117 SEXP dtnames = PROTECT(getAttrib(dt, R_NamesSymbol)); nprotect++; // added here to fix #91 - `:=` did not issue recycling warning during "by"
118 // fetch rownames of .SD. rownames[1] is set to -thislen for each group, in case .SD is passed to
119 // non data.table aware package that uses rownames
120 for (s = ATTRIB(SD); s != R_NilValue && TAG(s)!=R_RowNamesSymbol; s = CDR(s)); // getAttrib0 basically but that's hidden in attrib.c
121 if (s==R_NilValue) error(_("row.names attribute of .SD not found"));
122 rownames = CAR(s);
123 if (!isInteger(rownames) || LENGTH(rownames)!=2 || INTEGER(rownames)[0]!=NA_INTEGER) error(_("row.names of .SD isn't integer length 2 with NA as first item; i.e., .set_row_names(). [%s %d %d]"),type2char(TYPEOF(rownames)),LENGTH(rownames),INTEGER(rownames)[0]);
124
125 // fetch names of .SD and prepare symbols. In case they are copied-on-write by user assigning to those variables
126 // using <- in j (which is valid, useful and tested), they are repointed to the .SD cols for each group.
127 SEXP names = PROTECT(getAttrib(SDall, R_NamesSymbol)); nprotect++;
128 if (length(names) != length(SDall)) error(_("length(names)!=length(SD)"));
129 SEXP *nameSyms = (SEXP *)R_alloc(length(names), sizeof(SEXP));
130
131 for(int i=0; i<length(SDall); ++i) {
132 SEXP this = VECTOR_ELT(SDall, i);
133 if (SIZEOF(this)==0)
134 error(_("Internal error: size-0 type %d in .SD column %d should have been caught earlier"), TYPEOF(this), i); // # nocov
135 if (LENGTH(this) != maxGrpSize)
136 error(_("Internal error: SDall %d length = %d != %d"), i+1, LENGTH(this), maxGrpSize); // # nocov
137 nameSyms[i] = install(CHAR(STRING_ELT(names, i)));
138 // fixes http://stackoverflow.com/questions/14753411/why-does-data-table-lose-class-definition-in-sd-after-group-by
139 copyMostAttrib(VECTOR_ELT(dt,INTEGER(dtcols)[i]-1), this); // not names, otherwise test 778 would fail
140 SET_TRUELENGTH(this, -maxGrpSize); // marker for anySpecialStatic(); see its comments
141 }
142
143 SEXP xknames = PROTECT(getAttrib(xSD, R_NamesSymbol)); nprotect++;
144 if (length(xknames) != length(xSD)) error(_("length(xknames)!=length(xSD)"));
145 SEXP *xknameSyms = (SEXP *)R_alloc(length(xknames), sizeof(SEXP));
146 for(int i=0; i<length(xSD); ++i) {
147 if (SIZEOF(VECTOR_ELT(xSD, i))==0)
148 error(_("Internal error: type %d in .xSD column %d should have been caught by now"), TYPEOF(VECTOR_ELT(xSD, i)), i); // # nocov
149 xknameSyms[i] = install(CHAR(STRING_ELT(xknames, i)));
150 }
151
152 if (length(iSD)!=length(jiscols)) error(_("length(iSD)[%d] != length(jiscols)[%d]"),length(iSD),length(jiscols));
153 if (length(xSD)!=length(xjiscols)) error(_("length(xSD)[%d] != length(xjiscols)[%d]"),length(xSD),length(xjiscols));
154
155 SEXP listwrap = PROTECT(allocVector(VECSXP, 1)); nprotect++;
156 Rboolean jexpIsSymbolOtherThanSD = (isSymbol(jexp) && strcmp(CHAR(PRINTNAME(jexp)),".SD")!=0); // test 559
157
158 ansloc = 0;
159 const int *istarts = INTEGER(starts);
160 const int *iorder = INTEGER(order);
161 for(int i=0; i<ngrp; ++i) { // even for an empty i table, ngroup is length 1 (starts is value 0), for consistency of empty cases
162
163 if (istarts[i]==0 && (i<ngrp-1 || estn>-1)) continue;
164 // Previously had replaced (i>0 || !isNull(lhs)) with i>0 to fix #49
165 // The above is now to fix #1993, see test 1746.
166 // In cases were no i rows match, '|| estn>-1' ensures that the last empty group creates an empty result.
167 // TODO: revisit and tidy
168
169 if (!isNull(lhs) &&
170 (istarts[i] == NA_INTEGER ||
171 (LENGTH(order) && iorder[ istarts[i]-1 ]==NA_INTEGER)))
172 continue;
173 grpn = ilens[i];
174 INTEGER(N)[0] = istarts[i] == NA_INTEGER ? 0 : grpn;
175 // .N is number of rows matched to ( 0 even when nomatch is NA)
176 INTEGER(GRP)[0] = i+1; // group counter exposed as .GRP
177 INTEGER(rownames)[1] = -grpn; // the .set_row_names() of .SD. Not .N when nomatch=NA and this is a nomatch
178 for (int j=0; j<length(SDall); ++j) {
179 SETLENGTH(VECTOR_ELT(SDall,j), grpn); // before copying data in otherwise assigning after the end could error R API checks
180 defineVar(nameSyms[j], VECTOR_ELT(SDall, j), env);
181 // Redo this defineVar for each group in case user's j assigned to the column names (env is static) (tests 387 and 388)
182 // nameSyms pre-stored to save repeated install() for efficiency, though.
183 }
184 for (int j=0; j<length(xSD); ++j) {
185 defineVar(xknameSyms[j], VECTOR_ELT(xSD, j), env);
186 }
187
188 if (length(iSD) && length(VECTOR_ELT(iSD, 0))/*#4364*/) for (int j=0; j<length(iSD); ++j) { // either this or the next for() will run, not both
189 memrecycle(VECTOR_ELT(iSD,j), R_NilValue, 0, 1, VECTOR_ELT(groups, INTEGER(jiscols)[j]-1), i, 1, j+1, "Internal error assigning to iSD");
190 // we're just use memrecycle here to assign a single value
191 }
192 // igrp determines the start of the current group in rows of dt (0 based).
193 // if jiscols is not null, we have a by = .EACHI, so the start is exactly i.
194 // Otherwise, igrp needs to be determined from starts, potentially taking care about the order if present.
195 igrp = !isNull(jiscols) ? i : (length(grporder) ? INTEGER(grporder)[istarts[i]-1]-1 : istarts[i]-1);
196 if (igrp>=0 && nrowgroups) for (int j=0; j<length(BY); ++j) { // igrp can be -1 so 'if' is important, otherwise memcpy crash
197 memrecycle(VECTOR_ELT(BY,j), R_NilValue, 0, 1, VECTOR_ELT(groups, INTEGER(grpcols)[j]-1), igrp, 1, j+1, "Internal error assigning to BY");
198 }
199 if (istarts[i] == NA_INTEGER || (LENGTH(order) && iorder[ istarts[i]-1 ]==NA_INTEGER)) {
200 for (int j=0; j<length(SDall); ++j) {
201 writeNA(VECTOR_ELT(SDall, j), 0, 1);
202 // writeNA uses SET_ for STR and VEC, and we always use SET_ to assign to SDall always too. Otherwise,
203 // this writeNA could decrement the reference for the old value which wasn't incremented in the first place.
204 // Further, the eval(jval) could feasibly assign to SD although that is currently caught and disallowed. If that
205 // became possible, that assign from user's j expression would decrement the reference which wasn't incremented
206 // in the first place. And finally, upon release of SD, values will be decremented, where they weren't incremented
207 // in the first place. All in all, too risky to write behind the barrier in this section.
208 // Or in the words, this entire section, and this entire dogroups.c file, is now write-barrier compliant from v1.12.10
209 // and we hope that reference counting on by default from R 4.0 will avoid costly gc()s.
210 }
211 grpn = 1; // it may not be 1 e.g. test 722. TODO: revisit.
212 SETLENGTH(I, grpn);
213 INTEGER(I)[0] = 0;
214 for (int j=0; j<length(xSD); ++j) {
215 writeNA(VECTOR_ELT(xSD, j), 0, 1);
216 }
217 } else {
218 if (verbose) tstart = clock();
219 SETLENGTH(I, grpn);
220 int *iI = INTEGER(I);
221 if (LENGTH(order)==0) {
222 const int rownum = grpn ? istarts[i]-1 : -1;
223 for (int j=0; j<grpn; ++j) iI[j] = rownum+j+1;
224 if (rownum>=0) {
225 for (int j=0; j<length(SDall); ++j)
226 memrecycle(VECTOR_ELT(SDall,j), R_NilValue, 0, grpn, VECTOR_ELT(dt, INTEGER(dtcols)[j]-1), rownum, grpn, j+1, "Internal error assigning to SDall");
227 for (int j=0; j<length(xSD); ++j)
228 memrecycle(VECTOR_ELT(xSD,j), R_NilValue, 0, 1, VECTOR_ELT(dt, INTEGER(xjiscols)[j]-1), rownum, 1, j+1, "Internal error assigning to xSD");
229 }
230 if (verbose) { tblock[0] += clock()-tstart; nblock[0]++; }
231 } else {
232 const int rownum = istarts[i]-1;
233 for (int k=0; k<grpn; ++k) iI[k] = iorder[rownum+k];
234 for (int j=0; j<length(SDall); ++j) {
235 // this is the main non-contiguous gather, and is parallel (within-column) for non-SEXP
236 subsetVectorRaw(VECTOR_ELT(SDall,j), VECTOR_ELT(dt,INTEGER(dtcols)[j]-1), I, /*anyNA=*/false);
237 }
238 if (verbose) { tblock[1] += clock()-tstart; nblock[1]++; }
239 // The two blocks have separate timing statements to make sure which is running
240 }
241 }
242
243 if (verbose) tstart = clock(); // call to clock() is more expensive than an 'if'
244 PROTECT(jval = eval(jexp, env));
245 if (verbose) { tblock[2] += clock()-tstart; nblock[2]++; }
246
247 if (isNull(jval)) {
248 // j may be a plot or other side-effect only
249 UNPROTECT(1);
250 continue;
251 }
252 if ((wasvector = (isVectorAtomic(jval) || jexpIsSymbolOtherThanSD))) { // see test 559
253 // Prior to 1.8.1 the wrap with list() was done up in R after the first group had tested.
254 // This wrap is done to make the code below easier to loop through. listwrap avoids copying jval.
255 SET_VECTOR_ELT(listwrap,0,jval);
256 jval = listwrap;
257 } else {
258 if (!isNewList(jval)) // isNewList tests for VECSXP. isList tests for (old) LISTSXP
259 error(_("j evaluates to type '%s'. Must evaluate to atomic vector or list."),type2char(TYPEOF(jval)));
260 if (!LENGTH(jval)) {
261 UNPROTECT(1);
262 continue;
263 }
264 for (int j=0; j<LENGTH(jval); ++j) {
265 thiscol = VECTOR_ELT(jval,j);
266 if (!isNull(thiscol) && (!isVector(thiscol) || isFrame(thiscol) || isArray(thiscol) ))
267 error(_("All items in j=list(...) should be atomic vectors or lists. If you are trying something like j=list(.SD,newcol=mean(colA)) then use := by group instead (much quicker), or cbind or merge afterwards."));
268 }
269 }
270 if (!isNull(lhs)) {
271 R_len_t origncol = LENGTH(dt);
272 // check jval first before proceeding to add columns, so that if error on the first group, the columns aren't added
273 for (int j=0; j<length(lhs); ++j) {
274 RHS = VECTOR_ELT(jval,j%LENGTH(jval));
275 if (isNull(RHS))
276 error(_("RHS of := is NULL during grouped assignment, but it's not possible to delete parts of a column."));
277 int vlen = length(RHS);
278 if (vlen>1 && vlen!=grpn) {
279 SEXP colname = isNull(VECTOR_ELT(dt, INTEGER(lhs)[j]-1)) ? STRING_ELT(newnames, INTEGER(lhs)[j]-origncol-1) : STRING_ELT(dtnames,INTEGER(lhs)[j]-1);
280 error(_("Supplied %d items to be assigned to group %d of size %d in column '%s'. The RHS length must either be 1 (single values are ok) or match the LHS length exactly. If you wish to 'recycle' the RHS please use rep() explicitly to make this intent clear to readers of your code."),vlen,i+1,grpn,CHAR(colname));
281 // e.g. in #91 `:=` did not issue recycling warning during grouping. Now it is error not warning.
282 }
283 }
284 int n = LENGTH(VECTOR_ELT(dt, 0));
285 for (int j=0; j<length(lhs); ++j) {
286 int colj = INTEGER(lhs)[j]-1;
287 target = VECTOR_ELT(dt, colj);
288 RHS = VECTOR_ELT(jval,j%LENGTH(jval));
289 if (isNull(target)) {
290 // first time adding to new column
291 if (TRUELENGTH(dt) < colj+1) error(_("Internal error: Trying to add new column by reference but tl is full; setalloccol should have run first at R level before getting to this point in dogroups")); // # nocov
292 target = PROTECT(allocNAVectorLike(RHS, n));
293 // Even if we could know reliably to switch from allocNAVectorLike to allocVector for slight speedup, user code could still
294 // contain a switched halt, and in that case we'd want the groups not yet done to have NA rather than 0 or uninitialized.
295 // Increment length only if the allocation passes, #1676. But before SET_VECTOR_ELT otherwise attempt-to-set-index-n/n R error
296 SETLENGTH(dtnames, LENGTH(dtnames)+1);
297 SETLENGTH(dt, LENGTH(dt)+1);
298 SET_VECTOR_ELT(dt, colj, target);
299 UNPROTECT(1);
300 SET_STRING_ELT(dtnames, colj, STRING_ELT(newnames, colj-origncol));
301 copyMostAttrib(RHS, target); // attributes of first group dominate; e.g. initial factor levels come from first group
302 }
303 bool copied = false;
304 if (isNewList(target) && anySpecialStatic(RHS)) { // see comments in anySpecialStatic()
305 RHS = PROTECT(copyAsPlain(RHS));
306 copied = true;
307 }
308 const char *warn = memrecycle(target, order, INTEGER(starts)[i]-1, grpn, RHS, 0, -1, 0, "");
309 // can't error here because length mismatch already checked for all jval columns before starting to add any new columns
310 if (copied) UNPROTECT(1);
311 if (warn)
312 warning(_("Group %d column '%s': %s"), i+1, CHAR(STRING_ELT(dtnames, colj)), warn);
313 }
314 UNPROTECT(1); // jval
315 continue;
316 }
317 maxn = 0;
318 if (njval==0) njval = LENGTH(jval); // for first group, then the rest (when non 0) must conform to the first >0 group
319 if (njval!=LENGTH(jval)) error(_("j doesn't evaluate to the same number of columns for each group")); // this would be a problem even if we unlisted afterwards. This way the user finds out earlier though so he can fix and rerun sooner.
320 for (int j=0; j<njval; ++j) {
321 int k = length(VECTOR_ELT(jval,j)); // might be NULL, so length not LENGTH
322 maxn = k>maxn ? k : maxn;
323 }
324 if (ansloc + maxn > estn) {
325 if (estn == -1) {
326 // Given first group and j's result on it, make a good guess for size of result required.
327 if (grpn==0)
328 estn = maxn = 0; // empty case e.g. test 184. maxn is 1 here due to sum(integer()) == 0L
329 else if (maxn==1) // including when grpn==1 we default to assuming it's an aggregate
330 estn = LENGTH(starts);
331 // Common case 1 : j is a list of simple aggregates i.e. list of atoms only
332 else if (maxn >= grpn) {
333 estn = 0;
334 for (int j=0; j<LENGTH(lens); ++j) estn+=ilens[j];
335 // Common case 2 : j returns as many rows as there are in the group (maybe a join)
336 // TO DO: this might over allocate if first group has 1 row and j is actually a single row aggregate
337 // in cases when we're not sure could wait for the first few groups before deciding.
338 } else // maxn < grpn
339 estn = maxn * LENGTH(starts);
340 // Common case 3 : head or tail of .SD perhaps
341 if (estn<maxn) estn=maxn; // if the result for the first group is larger than the table itself(!) Unusual case where a join is being done in j via .SD and the 1-row table is an edge case of bigger picture.
342 PROTECT(ans = allocVector(VECSXP, ngrpcols + njval));
343 nprotect++;
344 firstalloc=TRUE;
345 for(int j=0; j<ngrpcols; ++j) {
346 thiscol = VECTOR_ELT(groups, INTEGER(grpcols)[j]-1);
347 SET_VECTOR_ELT(ans, j, allocVector(TYPEOF(thiscol), estn));
348 copyMostAttrib(thiscol, VECTOR_ELT(ans,j)); // not names, otherwise test 778 would fail
349 }
350 for(int j=0; j<njval; ++j) {
351 thiscol = VECTOR_ELT(jval, j);
352 if (isNull(thiscol))
353 error(_("Column %d of j's result for the first group is NULL. We rely on the column types of the first result to decide the type expected for the remaining groups (and require consistency). NULL columns are acceptable for later groups (and those are replaced with NA of appropriate type and recycled) but not for the first. Please use a typed empty vector instead, such as integer() or numeric()."), j+1);
354 if (verbose && !isNull(getAttrib(thiscol, R_NamesSymbol))) {
355 if (wasvector) {
356 Rprintf(_("j appears to be a named vector. The same names will likely be created over and over again for each group and slow things down. Try and pass a named list (which data.table optimizes) or an unnamed list() instead.\n"));
357 } else {
358 Rprintf(_("Column %d of j is a named vector (each item down the rows is named, somehow). Please remove those names for efficiency (to save creating them over and over for each group). They are ignored anyway.\n"), j+1);
359 }
360 }
361 SET_VECTOR_ELT(ans, ngrpcols+j, allocVector(TYPEOF(thiscol), estn));
362 copyMostAttrib(thiscol, VECTOR_ELT(ans,ngrpcols+j)); // not names, otherwise test 276 would fail
363 }
364 SEXP jvalnames = PROTECT(getAttrib(jval, R_NamesSymbol));
365 if (!isNull(jvalnames)) {
366 if (verbose) Rprintf(_("The result of j is a named list. It's very inefficient to create the same names over and over again for each group. When j=list(...), any names are detected, removed and put back after grouping has completed, for efficiency. Using j=transform(), for example, prevents that speedup (consider changing to :=). This message may be upgraded to warning in future.\n")); // e.g. test 104 has j=transform().
367 // names of result come from the first group and the names of remaining groups are ignored (all that matters for them is that the number of columns (and their types) match the first group.
368 SEXP names2 = PROTECT(allocVector(STRSXP,ngrpcols+njval));
369 // for (j=0; j<ngrpcols; j++) SET_STRING_ELT(names2, j, STRING_ELT(bynames,j)); // These get set back up in R
370 for (int j=0; j<njval; ++j) SET_STRING_ELT(names2, ngrpcols+j, STRING_ELT(jvalnames,j));
371 setAttrib(ans, R_NamesSymbol, names2);
372 UNPROTECT(1); // names2
373 // setAttrib(SD, R_NamesSymbol, R_NilValue); // so that lapply(.SD,mean) is unnamed from 2nd group on
374 }
375 UNPROTECT(1); // jvalnames
376 } else {
377 estn = ((double)ngrp/i)*1.1*(ansloc+maxn);
378 if (verbose) Rprintf(_("dogroups: growing from %d to %d rows\n"), length(VECTOR_ELT(ans,0)), estn);
379 if (length(ans) != ngrpcols + njval) error(_("dogroups: length(ans)[%d]!=ngrpcols[%d]+njval[%d]"),length(ans),ngrpcols,njval);
380 for (int j=0; j<length(ans); ++j) SET_VECTOR_ELT(ans, j, growVector(VECTOR_ELT(ans,j), estn));
381 }
382 }
383 // write the group values to ans, recycled to match the nrow of the result for this group ...
384 for (int j=0; j<ngrpcols; ++j) {
385 memrecycle(VECTOR_ELT(ans,j), R_NilValue, ansloc, maxn, VECTOR_ELT(groups, INTEGER(grpcols)[j]-1), igrp, 1, j+1, "Internal error recycling group values");
386 }
387 // Now copy jval into ans ...
388 for (int j=0; j<njval; ++j) {
389 thisansloc = ansloc;
390 source = VECTOR_ELT(jval,j);
391 thislen = length(source);
392 target = VECTOR_ELT(ans, j+ngrpcols);
393 if (thislen == 0) {
394 // including NULL and typed empty vectors, fill with NA
395 // A NULL in the first group's jval isn't allowed; caught above after allocating ans
396 if (!NullWarnDone && maxn>1) { // maxn==1 in tests 172,280,281,282,403,405 and 406
397 warning(_("Item %d of j's result for group %d is zero length. This will be filled with %d NAs to match the longest column in this result. Later groups may have a similar problem but only the first is reported to save filling the warning buffer."), j+1, i+1, maxn);
398 NullWarnDone = TRUE;
399 }
400 writeNA(target, thisansloc, maxn);
401 } else {
402 // thislen>0
403 if (TYPEOF(source) != TYPEOF(target))
404 error(_("Column %d of result for group %d is type '%s' but expecting type '%s'. Column types must be consistent for each group."), j+1, i+1, type2char(TYPEOF(source)), type2char(TYPEOF(target)));
405 if (thislen>1 && thislen!=maxn && grpn>0) { // grpn>0 for grouping empty tables; test 1986
406 error(_("Supplied %d items for column %d of group %d which has %d rows. The RHS length must either be 1 (single values are ok) or match the LHS length exactly. If you wish to 'recycle' the RHS please use rep() explicitly to make this intent clear to readers of your code."), thislen, j+1, i+1, maxn);
407 }
408 bool copied = false;
409 if (isNewList(target) && anySpecialStatic(source)) { // see comments in anySpecialStatic()
410 source = PROTECT(copyAsPlain(source));
411 copied = true;
412 }
413 memrecycle(target, R_NilValue, thisansloc, maxn, source, 0, -1, 0, "");
414 if (copied) UNPROTECT(1);
415 }
416 }
417 ansloc += maxn;
418 if (firstalloc) {
419 nprotect++; // remember the first jval. If we UNPROTECTed now, we'd unprotect
420 firstalloc = FALSE; // ans. The first jval is needed to create the right size and type of ans.
421 // TO DO: could avoid this last 'if' by adding a dummy PROTECT after first alloc for this UNPROTECT(1) to do.
422 }
423 else UNPROTECT(1); // the jval. Don't want them to build up. The first jval can stay protected till the end ok.
424 }
425 if (isNull(lhs) && ans!=NULL) {
426 if (ansloc < LENGTH(VECTOR_ELT(ans,0))) {
427 if (verbose) Rprintf(_("Wrote less rows (%d) than allocated (%d).\n"),ansloc,LENGTH(VECTOR_ELT(ans,0)));
428 for (int j=0; j<length(ans); j++) SET_VECTOR_ELT(ans, j, growVector(VECTOR_ELT(ans,j), ansloc));
429 // shrinks (misuse of word 'grow') back to the rows written, otherwise leak until ...
430 // ... TO DO: set truelength to LENGTH(VECTOR_ELT(ans,0)), length to ansloc and enhance finalizer to handle over-allocated rows.
431 }
432 } else ans = R_NilValue;
433 // Now reset length of .SD columns and .I to length of largest group, otherwise leak if the last group is smaller (often is).
434 // Also reset truelength on specials; see comments in anySpecialStatic().
435 for (int j=0; j<length(SDall); ++j) {
436 SEXP this = VECTOR_ELT(SDall,j);
437 SETLENGTH(this, maxGrpSize);
438 SET_TRUELENGTH(this, maxGrpSize);
439 }
440 SETLENGTH(I, maxGrpSize);
441 SET_TRUELENGTH(I, maxGrpSize);
442 for (int i=0; i<length(BY); ++i) {
443 SEXP this = VECTOR_ELT(BY, i);
444 SET_TRUELENGTH(this, length(this)); // might be 0 or 1; see its allocVector above
445 }
446 SET_TRUELENGTH(N, 1);
447 SET_TRUELENGTH(GRP, 1);
448 if (verbose) {
449 if (nblock[0] && nblock[1]) error(_("Internal error: block 0 [%d] and block 1 [%d] have both run"), nblock[0], nblock[1]); // # nocov
450 int w = nblock[1]>0;
451 Rprintf(_("\n %s took %.3fs for %d groups\n"), w ? "collecting discontiguous groups" : "memcpy contiguous groups",
452 1.0*tblock[w]/CLOCKS_PER_SEC, nblock[w]);
453 Rprintf(_(" eval(j) took %.3fs for %d calls\n"), 1.0*tblock[2]/CLOCKS_PER_SEC, nblock[2]);
454 }
455 UNPROTECT(nprotect);
456 return(ans);
457 }
458
keepattr(SEXP to,SEXP from)459 SEXP keepattr(SEXP to, SEXP from)
460 {
461 // Same as R_copyDFattr in src/main/attrib.c, but that seems not exposed in R's api
462 // Only difference is that we reverse from and to in the prototype, for easier calling above
463 SET_ATTRIB(to, ATTRIB(from));
464 IS_S4_OBJECT(from) ? SET_S4_OBJECT(to) : UNSET_S4_OBJECT(to);
465 SET_OBJECT(to, OBJECT(from));
466 return to;
467 }
468
growVector(SEXP x,const R_len_t newlen)469 SEXP growVector(SEXP x, const R_len_t newlen)
470 {
471 // Similar to EnlargeVector in src/main/subassign.c, with the following changes :
472 // * replaced switch and loops with one memcpy for INTEGER and REAL, but need to age CHAR and VEC.
473 // * no need to cater for names
474 // * much shorter and faster
475 SEXP newx;
476 R_len_t len = length(x);
477 if (isNull(x)) error(_("growVector passed NULL"));
478 PROTECT(newx = allocVector(TYPEOF(x), newlen)); // TO DO: R_realloc(?) here?
479 if (newlen < len) len=newlen; // i.e. shrink
480 switch (TYPEOF(x)) {
481 case RAWSXP: memcpy(RAW(newx), RAW(x), len*SIZEOF(x)); break;
482 case LGLSXP: memcpy(LOGICAL(newx), LOGICAL(x), len*SIZEOF(x)); break;
483 case INTSXP: memcpy(INTEGER(newx), INTEGER(x), len*SIZEOF(x)); break;
484 case REALSXP: memcpy(REAL(newx), REAL(x), len*SIZEOF(x)); break;
485 case CPLXSXP: memcpy(COMPLEX(newx), COMPLEX(x), len*SIZEOF(x)); break;
486 case STRSXP : {
487 const SEXP *xd = SEXPPTR_RO(x);
488 for (int i=0; i<len; ++i)
489 SET_STRING_ELT(newx, i, xd[i]);
490 } break;
491 case VECSXP : {
492 const SEXP *xd = SEXPPTR_RO(x);
493 for (int i=0; i<len; ++i)
494 SET_VECTOR_ELT(newx, i, xd[i]);
495 } break;
496 default :
497 error(_("Internal error: growVector doesn't support type '%s'"), type2char(TYPEOF(x))); // # nocov
498 }
499 // if (verbose) Rprintf(_("Growing vector from %d to %d items of type '%s'\n"), len, newlen, type2char(TYPEOF(x)));
500 // Would print for every column if here. Now just up in dogroups (one msg for each table grow).
501 keepattr(newx,x);
502 UNPROTECT(1);
503 return newx;
504 }
505
506
507 // benchmark timings for #481 fix:
508 // old code - no changes, R v3.0.3
509 // > system.time(dt[, list(list(y)), by=x])
510 // user system elapsed
511 // 82.593 0.936 84.314
512 // > system.time(dt[, list(list(y)), by=x])
513 // user system elapsed
514 // 34.558 0.628 35.658
515 // > system.time(dt[, list(list(y)), by=x])
516 // user system elapsed
517 // 37.056 0.315 37.668
518 //
519 // All new changes in place, R v3.0.3
520 // > system.time(dt[, list(list(y)), by=x])
521 // user system elapsed
522 // 82.852 0.952 84.575
523 // > system.time(dt[, list(list(y)), by=x])
524 // user system elapsed
525 // 34.600 0.356 35.173
526 // > system.time(dt[, list(list(y)), by=x])
527 // user system elapsed
528 // 36.865 0.514 37.901
529
530 // old code - no changes, R v3.1.0 --- BUT RESULTS ARE WRONG!
531 // > system.time(dt[, list(list(y)), by=x])
532 // user system elapsed
533 // 11.022 0.352 11.455
534 // > system.time(dt[, list(list(y)), by=x])
535 // user system elapsed
536 // 10.397 0.119 10.600
537 // > system.time(dt[, list(list(y)), by=x])
538 // user system elapsed
539 // 10.665 0.101 11.013
540
541 // All new changes in place, R v3.1.0
542 // > system.time(dt[, list(list(y)), by=x])
543 // user system elapsed
544 // 83.279 1.057 89.856
545 // > system.time(dt[, list(list(y)), by=x])
546 // user system elapsed
547 // 30.569 0.633 31.452
548 // > system.time(dt[, list(list(y)), by=x])
549 // user system elapsed
550 // 30.827 0.239 32.306
551