1 #include "data.table.h"
2
subsetVectorRaw(SEXP ans,SEXP source,SEXP idx,const bool anyNA)3 void subsetVectorRaw(SEXP ans, SEXP source, SEXP idx, const bool anyNA)
4 // Used here by subsetDT() and by dogroups.c
5 {
6 const int n = length(idx);
7 if (length(ans)!=n) error(_("Internal error: subsetVectorRaw length(ans)==%d n=%d"), length(ans), n);
8
9 const int *restrict idxp = INTEGER(idx);
10 // anyNA refers to NA _in idx_; if there's NA in the data (source) that's just regular data to be copied
11 // negatives, zeros and out-of-bounds have already been dealt with in convertNegAndZero so we can rely
12 // here on idx in range [1,length(ans)].
13
14 int nth = getDTthreads(n, /*throttle=*/true); // not const for Solaris, #4638
15 // For small n such as 2,3,4 etc we had hoped OpenMP would be sensible inside it and not create a team
16 // with each thread doing just one item. Otherwise, call overhead would be too high for highly iterated
17 // calls on very small subsets. Timings were tested in #3175. However, the overhead does seem to add up
18 // significantly. Hence the throttle was introduced, #4484. And not having the OpenMP region at all here
19 // when nth==1 (the ifs below in PARLOOP) seems to help too, #4200.
20 // To stress test the code for correctness by forcing multi-threading on for small data, the throttle can
21 // be turned off using setDThreads() or R_DATATABLE_THROTTLE environment variable.
22
23 #define PARLOOP(_NAVAL_) \
24 if (anyNA) { \
25 if (nth>1) { \
26 _Pragma("omp parallel for num_threads(nth)") \
27 for (int i=0; i<n; ++i) { \
28 int elem = idxp[i]; \
29 ap[i] = elem==NA_INTEGER ? _NAVAL_ : sp[elem-1]; \
30 } \
31 } else { \
32 for (int i=0; i<n; ++i) { \
33 int elem = idxp[i]; \
34 ap[i] = elem==NA_INTEGER ? _NAVAL_ : sp[elem-1]; \
35 } \
36 } \
37 } else { \
38 if (nth>1) { \
39 _Pragma("omp parallel for num_threads(nth)") \
40 for (int i=0; i<n; ++i) { \
41 ap[i] = sp[idxp[i]-1]; \
42 } \
43 } else { \
44 for (int i=0; i<n; ++i) { \
45 ap[i] = sp[idxp[i]-1]; \
46 } \
47 } \
48 }
49
50 switch(TYPEOF(source)) {
51 case INTSXP: case LGLSXP: {
52 int *sp = INTEGER(source);
53 int *ap = INTEGER(ans);
54 PARLOOP(NA_INTEGER)
55 } break;
56 case REALSXP : {
57 if (INHERITS(source, char_integer64)) {
58 int64_t *sp = (int64_t *)REAL(source);
59 int64_t *ap = (int64_t *)REAL(ans);
60 PARLOOP(INT64_MIN)
61 } else {
62 double *sp = REAL(source);
63 double *ap = REAL(ans);
64 PARLOOP(NA_REAL)
65 }
66 } break;
67 case STRSXP : {
68 // write barrier (assigning strings/lists) is not thread safe. Hence single threaded.
69 // To go parallel here would need access to NODE_IS_OLDER, at least. Given gcgen, mark and named
70 // are upper bounded and max 3, REFCNT==REFCNTMAX could be checked first and then critical SET_ if not.
71 // Inside that critical just before SET_ it could check REFCNT<REFCNTMAX still held. Similarly for gcgen.
72 // TODO - discuss with Luke Tierney. Produce benchmarks on integer/double to see if it's worth making a safe
73 // API interface for package use for STRSXP.
74 // Aside: setkey() is a separate special case (a permutation) and does do this in parallel without using SET_*.
75 const SEXP *sp = SEXPPTR_RO(source);
76 if (anyNA) {
77 for (int i=0; i<n; i++) { int elem = idxp[i]; SET_STRING_ELT(ans, i, elem==NA_INTEGER ? NA_STRING : sp[elem-1]); }
78 } else {
79 for (int i=0; i<n; i++) { SET_STRING_ELT(ans, i, sp[idxp[i]-1]); }
80 }
81 } break;
82 case VECSXP : {
83 const SEXP *sp = SEXPPTR_RO(source);
84 if (anyNA) {
85 for (int i=0; i<n; i++) { int elem = idxp[i]; SET_VECTOR_ELT(ans, i, elem==NA_INTEGER ? R_NilValue : sp[elem-1]); }
86 } else {
87 for (int i=0; i<n; i++) { SET_VECTOR_ELT(ans, i, sp[idxp[i]-1]); }
88 }
89 } break;
90 case CPLXSXP : {
91 Rcomplex *sp = COMPLEX(source);
92 Rcomplex *ap = COMPLEX(ans);
93 PARLOOP(NA_CPLX)
94 } break;
95 case RAWSXP : {
96 Rbyte *sp = RAW(source);
97 Rbyte *ap = RAW(ans);
98 PARLOOP(0)
99 } break;
100 default :
101 error(_("Internal error: column type '%s' not supported by data.table subset. All known types are supported so please report as bug."), type2char(TYPEOF(source))); // # nocov
102 }
103 }
104
check_idx(SEXP idx,int max,bool * anyNA_out,bool * orderedSubset_out)105 static const char *check_idx(SEXP idx, int max, bool *anyNA_out, bool *orderedSubset_out)
106 // set anyNA for branchless subsetVectorRaw
107 // error if any negatives, zeros or >max since they should have been dealt with by convertNegAndZeroIdx() called ealier at R level.
108 // single cache efficient sweep with prefetch, so very low priority to go parallel
109 {
110 if (!isInteger(idx)) error(_("Internal error. 'idx' is type '%s' not 'integer'"), type2char(TYPEOF(idx))); // # nocov
111 bool anyLess=false, anyNA=false;
112 int last = INT32_MIN;
113 int *idxp = INTEGER(idx), n=LENGTH(idx);
114 for (int i=0; i<n; i++) {
115 int elem = idxp[i];
116 if (elem<=0 && elem!=NA_INTEGER) return "Internal inefficiency: idx contains negatives or zeros. Should have been dealt with earlier."; // e.g. test 762 (TODO-fix)
117 if (elem>max) return "Internal inefficiency: idx contains an item out-of-range. Should have been dealt with earlier."; // e.g. test 1639.64
118 anyNA |= elem==NA_INTEGER;
119 anyLess |= elem<last;
120 last = elem;
121 }
122 *anyNA_out = anyNA;
123 *orderedSubset_out = !anyLess; // for the purpose of ordered keys elem==last is allowed
124 return NULL;
125 }
126
convertNegAndZeroIdx(SEXP idx,SEXP maxArg,SEXP allowOverMax)127 SEXP convertNegAndZeroIdx(SEXP idx, SEXP maxArg, SEXP allowOverMax)
128 {
129 // called from [.data.table to massage user input, creating a new strictly positive idx if there are any negatives or zeros
130 // + more precise and helpful error messages telling user exactly where the problem is (saving user debugging time)
131 // + a little more efficient than negativeSubscript in src/main/subscript.c (it's private to R so we can't call it anyway)
132 // allowOverMaxArg is false when := (test 1024), otherwise true for selecting
133
134 if (!isInteger(idx)) error(_("Internal error. 'idx' is type '%s' not 'integer'"), type2char(TYPEOF(idx))); // # nocov
135 if (!isInteger(maxArg) || length(maxArg)!=1) error(_("Internal error. 'maxArg' is type '%s' and length %d, should be an integer singleton"), type2char(TYPEOF(maxArg)), length(maxArg)); // # nocov
136 if (!isLogical(allowOverMax) || LENGTH(allowOverMax)!=1 || LOGICAL(allowOverMax)[0]==NA_LOGICAL) error(_("Internal error: allowOverMax must be TRUE/FALSE")); // # nocov
137 int max = INTEGER(maxArg)[0], n=LENGTH(idx);
138 if (max<0) error(_("Internal error. max is %d, must be >= 0."), max); // # nocov includes NA which will print as INT_MIN
139 int *idxp = INTEGER(idx);
140
141 bool stop = false;
142 #pragma omp parallel for num_threads(getDTthreads(n, true))
143 for (int i=0; i<n; i++) {
144 if (stop) continue;
145 int elem = idxp[i];
146 if ((elem<1 && elem!=NA_INTEGER) || elem>max) stop=true;
147 }
148 if (!stop) return(idx); // most common case to return early: no 0, no negative; all idx either NA or in range [1-max]
149
150 // ---------
151 // else massage the input to a standard idx where all items are either NA or in range [1,max] ...
152
153 int countNeg=0, countZero=0, countNA=0, firstOverMax=0;
154 for (int i=0; i<n; i++) {
155 int elem = idxp[i];
156 if (elem==NA_INTEGER) countNA++;
157 else if (elem<0) countNeg++;
158 else if (elem==0) countZero++;
159 else if (elem>max && firstOverMax==0) firstOverMax=i+1;
160 }
161 if (firstOverMax && LOGICAL(allowOverMax)[0]==FALSE) {
162 error(_("i[%d] is %d which is out of range [1,nrow=%d]"), firstOverMax, idxp[firstOverMax-1], max);
163 }
164
165 int countPos = n-countNeg-countZero-countNA;
166 if (countPos && countNeg) {
167 int i=0, firstNeg=0, firstPos=0;
168 while (i<n && (firstNeg==0 || firstPos==0)) {
169 int elem = idxp[i];
170 if (firstPos==0 && elem>0) firstPos=i+1;
171 if (firstNeg==0 && elem<0 && elem!=NA_INTEGER) firstNeg=i+1;
172 i++;
173 }
174 error(_("Item %d of i is %d and item %d is %d. Cannot mix positives and negatives."), firstNeg, idxp[firstNeg-1], firstPos, idxp[firstPos-1]);
175 }
176 if (countNeg && countNA) {
177 int i=0, firstNeg=0, firstNA=0;
178 while (i<n && (firstNeg==0 || firstNA==0)) {
179 int elem = idxp[i];
180 if (firstNeg==0 && elem<0 && elem!=NA_INTEGER) firstNeg=i+1;
181 if (firstNA==0 && elem==NA_INTEGER) firstNA=i+1;
182 i++;
183 }
184 error(_("Item %d of i is %d and item %d is NA. Cannot mix negatives and NA."), firstNeg, idxp[firstNeg-1], firstNA);
185 }
186
187 SEXP ans;
188 if (countNeg==0) {
189 // just zeros to remove, or >max to convert to NA
190 ans = PROTECT(allocVector(INTSXP, n - countZero));
191 int *ansp = INTEGER(ans);
192 for (int i=0, ansi=0; i<n; i++) {
193 int elem = idxp[i];
194 if (elem==0) continue;
195 ansp[ansi++] = elem>max ? NA_INTEGER : elem;
196 }
197 } else {
198 // idx is all negative without any NA but perhaps some zeros
199 bool *keep = (bool *)R_alloc(max, sizeof(bool)); // 4 times less memory that INTSXP in src/main/subscript.c
200 for (int i=0; i<max; i++) keep[i] = true;
201 int countRemoved=0, countDup=0, countBeyond=0; // idx=c(-10,-5,-10) removing row 10 twice
202 int firstBeyond=0, firstDup=0;
203 for (int i=0; i<n; i++) {
204 int elem = -idxp[i];
205 if (elem==0) continue;
206 if (elem>max) {
207 countBeyond++;
208 if (firstBeyond==0) firstBeyond=i+1;
209 continue;
210 }
211 if (!keep[elem-1]) {
212 countDup++;
213 if (firstDup==0) firstDup=i+1;
214 } else {
215 keep[elem-1] = false;
216 countRemoved++;
217 }
218 }
219 if (countBeyond)
220 warning(_("Item %d of i is %d but there are only %d rows. Ignoring this and %d more like it out of %d."), firstBeyond, idxp[firstBeyond-1], max, countBeyond-1, n);
221 if (countDup)
222 warning(_("Item %d of i is %d which removes that item but that has occurred before. Ignoring this dup and %d other dups."), firstDup, idxp[firstDup-1], countDup-1);
223 int ansn = max-countRemoved;
224 ans = PROTECT(allocVector(INTSXP, ansn));
225 int *ansp = INTEGER(ans);
226 for (int i=0, ansi=0; i<max; i++) {
227 if (keep[i]) ansp[ansi++] = i+1;
228 }
229 }
230 UNPROTECT(1);
231 return ans;
232 }
233
checkCol(SEXP col,int colNum,int nrow,SEXP x)234 static void checkCol(SEXP col, int colNum, int nrow, SEXP x)
235 {
236 if (isNull(col)) error(_("Column %d is NULL; malformed data.table."), colNum);
237 if (isNewList(col) && INHERITS(col, char_dataframe)) {
238 SEXP names = getAttrib(x, R_NamesSymbol);
239 error(_("Column %d ['%s'] is a data.frame or data.table; malformed data.table."),
240 colNum, isNull(names)?"":CHAR(STRING_ELT(names,colNum-1)));
241 }
242 if (length(col)!=nrow) {
243 SEXP names = getAttrib(x, R_NamesSymbol);
244 error(_("Column %d ['%s'] is length %d but column 1 is length %d; malformed data.table."),
245 colNum, isNull(names)?"":CHAR(STRING_ELT(names,colNum-1)), length(col), nrow);
246 }
247 }
248
249 /*
250 * subsetDT - Subsets a data.table
251 * NOTE:
252 * 1) 'rows' and 'cols' are 1-based, passed from R level
253 * 2) Originally for subsetting vectors in fcast and now the beginnings of [.data.table ported to C
254 * 3) Immediate need is for R 3.1 as lglVec[1] now returns R's global TRUE and we don't want := to change that global [think 1 row data.tables]
255 * 4) Could do it other ways but may as well go to C now as we were going to do that anyway
256 */
257
subsetDT(SEXP x,SEXP rows,SEXP cols)258 SEXP subsetDT(SEXP x, SEXP rows, SEXP cols) { // API change needs update NEWS.md and man/cdt.Rd
259 int nprotect=0;
260 if (!isNewList(x)) error(_("Internal error. Argument 'x' to CsubsetDT is type '%s' not 'list'"), type2char(TYPEOF(rows))); // # nocov
261 if (!length(x)) return(x); // return empty list
262
263 const int nrow = length(VECTOR_ELT(x,0));
264 // check index once up front for 0 or NA, for branchless subsetVectorRaw which is repeated for each column
265 bool anyNA=false, orderedSubset=true; // true for when rows==null (meaning all rows)
266 if (!isNull(rows) && check_idx(rows, nrow, &anyNA, &orderedSubset)!=NULL) {
267 SEXP max = PROTECT(ScalarInteger(nrow)); nprotect++;
268 rows = PROTECT(convertNegAndZeroIdx(rows, max, ScalarLogical(TRUE))); nprotect++;
269 const char *err = check_idx(rows, nrow, &anyNA, &orderedSubset);
270 if (err!=NULL) error(err);
271 }
272
273 if (!isInteger(cols)) error(_("Internal error. Argument 'cols' to Csubset is type '%s' not 'integer'"), type2char(TYPEOF(cols))); // # nocov
274 for (int i=0; i<LENGTH(cols); i++) {
275 int this = INTEGER(cols)[i];
276 if (this<1 || this>LENGTH(x)) error(_("Item %d of 'cols' is %d which is outside 1-based range [1,ncol(x)=%d]"), i+1, this, LENGTH(x));
277 }
278
279 int overAlloc = checkOverAlloc(GetOption(install("datatable.alloccol"), R_NilValue));
280 SEXP ans = PROTECT(allocVector(VECSXP, LENGTH(cols)+overAlloc)); nprotect++; // doing alloc.col directly here; eventually alloc.col can be deprecated.
281
282 // user-defined and superclass attributes get copied as from v1.12.0
283 copyMostAttrib(x, ans);
284 // most means all except R_NamesSymbol, R_DimSymbol and R_DimNamesSymbol
285 // includes row.names (oddly, given other dims aren't) and "sorted" dealt with below
286 // class is also copied here which retains superclass name in class vector as has been the case for many years; e.g. tests 1228.* for #64
287
288 SET_TRUELENGTH(ans, LENGTH(ans));
289 SETLENGTH(ans, LENGTH(cols));
290 int ansn;
291 if (isNull(rows)) {
292 ansn = nrow;
293 const int *colD = INTEGER(cols);
294 for (int i=0; i<LENGTH(cols); i++) {
295 SEXP thisCol = VECTOR_ELT(x, colD[i]-1);
296 checkCol(thisCol, colD[i], nrow, x);
297 SET_VECTOR_ELT(ans, i, copyAsPlain(thisCol));
298 // materialize the column subset as we have always done for now, until REFCNT is on by default in R (TODO)
299 }
300 } else {
301 ansn = LENGTH(rows); // has been checked not to contain zeros or negatives, so this length is the length of result
302 const int *colD = INTEGER(cols);
303 for (int i=0; i<LENGTH(cols); i++) {
304 SEXP source = VECTOR_ELT(x, colD[i]-1);
305 checkCol(source, colD[i], nrow, x);
306 SEXP target;
307 SET_VECTOR_ELT(ans, i, target=allocVector(TYPEOF(source), ansn));
308 copyMostAttrib(source, target);
309 subsetVectorRaw(target, source, rows, anyNA); // parallel within column
310 }
311 }
312 SEXP tmp = PROTECT(allocVector(STRSXP, LENGTH(cols)+overAlloc)); nprotect++;
313 SET_TRUELENGTH(tmp, LENGTH(tmp));
314 SETLENGTH(tmp, LENGTH(cols));
315 setAttrib(ans, R_NamesSymbol, tmp);
316 subsetVectorRaw(tmp, getAttrib(x, R_NamesSymbol), cols, /*anyNA=*/false);
317
318 tmp = PROTECT(allocVector(INTSXP, 2)); nprotect++;
319 INTEGER(tmp)[0] = NA_INTEGER;
320 INTEGER(tmp)[1] = -ansn;
321 setAttrib(ans, R_RowNamesSymbol, tmp); // The contents of tmp must be set before being passed to setAttrib(). setAttrib looks at tmp value and copies it in the case of R_RowNamesSymbol. Caused hard to track bug around 28 Sep 2014.
322
323 // clear any index that was copied over by copyMostAttrib() above, e.g. #1760 and #1734 (test 1678)
324 setAttrib(ans, sym_index, R_NilValue);
325 // but maintain key if ordered subset
326 SEXP key = getAttrib(x, sym_sorted);
327 if (length(key)) {
328 SEXP in = PROTECT(chin(key, getAttrib(ans,R_NamesSymbol))); nprotect++;
329 int i = 0; while(i<LENGTH(key) && LOGICAL(in)[i]) i++;
330 // i is now the keylen that can be kept. 2 lines above much easier in C than R
331 if (i==0 || !orderedSubset) {
332 // clear key that was copied over by copyMostAttrib() above
333 setAttrib(ans, sym_sorted, R_NilValue);
334 } else {
335 // make a new key attribute; shorter if i<LENGTH(key) or same length copied so this key is safe to change by ref (setnames)
336 setAttrib(ans, sym_sorted, tmp=allocVector(STRSXP, i));
337 for (int j=0; j<i; j++) SET_STRING_ELT(tmp, j, STRING_ELT(key, j));
338 }
339 }
340 unlock(ans);
341 setselfref(ans);
342 UNPROTECT(nprotect);
343 return ans;
344 }
345
subsetVector(SEXP x,SEXP idx)346 SEXP subsetVector(SEXP x, SEXP idx) { // idx is 1-based passed from R level
347 bool anyNA=false, orderedSubset=false;
348 int nprotect=0;
349 if (isNull(x))
350 error(_("Internal error: NULL can not be subset. It is invalid for a data.table to contain a NULL column.")); // # nocov
351 if (check_idx(idx, length(x), &anyNA, &orderedSubset) != NULL)
352 error(_("Internal error: CsubsetVector is internal-use-only but has received negatives, zeros or out-of-range")); // # nocov
353 SEXP ans = PROTECT(allocVector(TYPEOF(x), length(idx))); nprotect++;
354 copyMostAttrib(x, ans);
355 subsetVectorRaw(ans, x, idx, anyNA);
356 UNPROTECT(nprotect);
357 return ans;
358 }
359
360