1 #include "data.table.h"
2 
subsetVectorRaw(SEXP ans,SEXP source,SEXP idx,const bool anyNA)3 void subsetVectorRaw(SEXP ans, SEXP source, SEXP idx, const bool anyNA)
4 // Used here by subsetDT() and by dogroups.c
5 {
6   const int n = length(idx);
7   if (length(ans)!=n) error(_("Internal error: subsetVectorRaw length(ans)==%d n=%d"), length(ans), n);
8 
9   const int *restrict idxp = INTEGER(idx);
10   // anyNA refers to NA _in idx_; if there's NA in the data (source) that's just regular data to be copied
11   // negatives, zeros and out-of-bounds have already been dealt with in convertNegAndZero so we can rely
12   // here on idx in range [1,length(ans)].
13 
14   int nth = getDTthreads(n, /*throttle=*/true);   // not const for Solaris, #4638
15   // For small n such as 2,3,4 etc we had hoped OpenMP would be sensible inside it and not create a team
16   // with each thread doing just one item. Otherwise, call overhead would be too high for highly iterated
17   // calls on very small subsets. Timings were tested in #3175. However, the overhead does seem to add up
18   // significantly. Hence the throttle was introduced, #4484. And not having the OpenMP region at all here
19   // when nth==1 (the ifs below in PARLOOP) seems to help too, #4200.
20   // To stress test the code for correctness by forcing multi-threading on for small data, the throttle can
21   // be turned off using setDThreads() or R_DATATABLE_THROTTLE environment variable.
22 
23   #define PARLOOP(_NAVAL_)                                        \
24   if (anyNA) {                                                    \
25     if (nth>1) {                                                  \
26       _Pragma("omp parallel for num_threads(nth)")                \
27       for (int i=0; i<n; ++i) {                                   \
28         int elem = idxp[i];                                       \
29         ap[i] = elem==NA_INTEGER ? _NAVAL_ : sp[elem-1];          \
30       }                                                           \
31     } else {                                                      \
32       for (int i=0; i<n; ++i) {                                   \
33         int elem = idxp[i];                                       \
34         ap[i] = elem==NA_INTEGER ? _NAVAL_ : sp[elem-1];          \
35       }                                                           \
36     }                                                             \
37   } else {                                                        \
38     if (nth>1) {                                                  \
39       _Pragma("omp parallel for num_threads(nth)")                \
40       for (int i=0; i<n; ++i) {                                   \
41         ap[i] = sp[idxp[i]-1];                                    \
42       }                                                           \
43     } else {                                                      \
44       for (int i=0; i<n; ++i) {                                   \
45         ap[i] = sp[idxp[i]-1];                                    \
46       }                                                           \
47     }                                                             \
48   }
49 
50   switch(TYPEOF(source)) {
51   case INTSXP: case LGLSXP: {
52     int *sp = INTEGER(source);
53     int *ap = INTEGER(ans);
54     PARLOOP(NA_INTEGER)
55   } break;
56   case REALSXP : {
57     if (INHERITS(source, char_integer64)) {
58       int64_t *sp = (int64_t *)REAL(source);
59       int64_t *ap = (int64_t *)REAL(ans);
60       PARLOOP(INT64_MIN)
61     } else {
62       double *sp = REAL(source);
63       double *ap = REAL(ans);
64       PARLOOP(NA_REAL)
65     }
66   } break;
67   case STRSXP : {
68     // write barrier (assigning strings/lists) is not thread safe. Hence single threaded.
69     // To go parallel here would need access to NODE_IS_OLDER, at least. Given gcgen, mark and named
70     // are upper bounded and max 3, REFCNT==REFCNTMAX could be checked first and then critical SET_ if not.
71     // Inside that critical just before SET_ it could check REFCNT<REFCNTMAX still held. Similarly for gcgen.
72     // TODO - discuss with Luke Tierney. Produce benchmarks on integer/double to see if it's worth making a safe
73     //        API interface for package use for STRSXP.
74     // Aside: setkey() is a separate special case (a permutation) and does do this in parallel without using SET_*.
75     const SEXP *sp = SEXPPTR_RO(source);
76     if (anyNA) {
77       for (int i=0; i<n; i++) { int elem = idxp[i]; SET_STRING_ELT(ans, i, elem==NA_INTEGER ? NA_STRING : sp[elem-1]); }
78     } else {
79       for (int i=0; i<n; i++) {                     SET_STRING_ELT(ans, i, sp[idxp[i]-1]); }
80     }
81   } break;
82   case VECSXP : {
83     const SEXP *sp = SEXPPTR_RO(source);
84     if (anyNA) {
85       for (int i=0; i<n; i++) { int elem = idxp[i]; SET_VECTOR_ELT(ans, i, elem==NA_INTEGER ? R_NilValue : sp[elem-1]); }
86     } else {
87       for (int i=0; i<n; i++) {                     SET_VECTOR_ELT(ans, i, sp[idxp[i]-1]); }
88     }
89   } break;
90   case CPLXSXP : {
91     Rcomplex *sp = COMPLEX(source);
92     Rcomplex *ap = COMPLEX(ans);
93     PARLOOP(NA_CPLX)
94   } break;
95   case RAWSXP : {
96     Rbyte *sp = RAW(source);
97     Rbyte *ap = RAW(ans);
98     PARLOOP(0)
99   } break;
100   default :
101     error(_("Internal error: column type '%s' not supported by data.table subset. All known types are supported so please report as bug."), type2char(TYPEOF(source)));  // # nocov
102   }
103 }
104 
check_idx(SEXP idx,int max,bool * anyNA_out,bool * orderedSubset_out)105 static const char *check_idx(SEXP idx, int max, bool *anyNA_out, bool *orderedSubset_out)
106 // set anyNA for branchless subsetVectorRaw
107 // error if any negatives, zeros or >max since they should have been dealt with by convertNegAndZeroIdx() called ealier at R level.
108 // single cache efficient sweep with prefetch, so very low priority to go parallel
109 {
110   if (!isInteger(idx)) error(_("Internal error. 'idx' is type '%s' not 'integer'"), type2char(TYPEOF(idx))); // # nocov
111   bool anyLess=false, anyNA=false;
112   int last = INT32_MIN;
113   int *idxp = INTEGER(idx), n=LENGTH(idx);
114   for (int i=0; i<n; i++) {
115     int elem = idxp[i];
116     if (elem<=0 && elem!=NA_INTEGER) return "Internal inefficiency: idx contains negatives or zeros. Should have been dealt with earlier.";  // e.g. test 762  (TODO-fix)
117     if (elem>max) return "Internal inefficiency: idx contains an item out-of-range. Should have been dealt with earlier.";                   // e.g. test 1639.64
118     anyNA |= elem==NA_INTEGER;
119     anyLess |= elem<last;
120     last = elem;
121   }
122   *anyNA_out = anyNA;
123   *orderedSubset_out = !anyLess; // for the purpose of ordered keys elem==last is allowed
124   return NULL;
125 }
126 
convertNegAndZeroIdx(SEXP idx,SEXP maxArg,SEXP allowOverMax)127 SEXP convertNegAndZeroIdx(SEXP idx, SEXP maxArg, SEXP allowOverMax)
128 {
129   // called from [.data.table to massage user input, creating a new strictly positive idx if there are any negatives or zeros
130   // + more precise and helpful error messages telling user exactly where the problem is (saving user debugging time)
131   // + a little more efficient than negativeSubscript in src/main/subscript.c (it's private to R so we can't call it anyway)
132   // allowOverMaxArg is false when := (test 1024), otherwise true for selecting
133 
134   if (!isInteger(idx)) error(_("Internal error. 'idx' is type '%s' not 'integer'"), type2char(TYPEOF(idx))); // # nocov
135   if (!isInteger(maxArg) || length(maxArg)!=1) error(_("Internal error. 'maxArg' is type '%s' and length %d, should be an integer singleton"), type2char(TYPEOF(maxArg)), length(maxArg)); // # nocov
136   if (!isLogical(allowOverMax) || LENGTH(allowOverMax)!=1 || LOGICAL(allowOverMax)[0]==NA_LOGICAL) error(_("Internal error: allowOverMax must be TRUE/FALSE"));  // # nocov
137   int max = INTEGER(maxArg)[0], n=LENGTH(idx);
138   if (max<0) error(_("Internal error. max is %d, must be >= 0."), max); // # nocov    includes NA which will print as INT_MIN
139   int *idxp = INTEGER(idx);
140 
141   bool stop = false;
142   #pragma omp parallel for num_threads(getDTthreads(n, true))
143   for (int i=0; i<n; i++) {
144     if (stop) continue;
145     int elem = idxp[i];
146     if ((elem<1 && elem!=NA_INTEGER) || elem>max) stop=true;
147   }
148   if (!stop) return(idx); // most common case to return early: no 0, no negative; all idx either NA or in range [1-max]
149 
150   // ---------
151   // else massage the input to a standard idx where all items are either NA or in range [1,max] ...
152 
153   int countNeg=0, countZero=0, countNA=0, firstOverMax=0;
154   for (int i=0; i<n; i++) {
155     int elem = idxp[i];
156     if (elem==NA_INTEGER) countNA++;
157     else if (elem<0) countNeg++;
158     else if (elem==0) countZero++;
159     else if (elem>max && firstOverMax==0) firstOverMax=i+1;
160   }
161   if (firstOverMax && LOGICAL(allowOverMax)[0]==FALSE) {
162     error(_("i[%d] is %d which is out of range [1,nrow=%d]"), firstOverMax, idxp[firstOverMax-1], max);
163   }
164 
165   int countPos = n-countNeg-countZero-countNA;
166   if (countPos && countNeg) {
167     int i=0, firstNeg=0, firstPos=0;
168     while (i<n && (firstNeg==0 || firstPos==0)) {
169       int elem = idxp[i];
170       if (firstPos==0 && elem>0) firstPos=i+1;
171       if (firstNeg==0 && elem<0 && elem!=NA_INTEGER) firstNeg=i+1;
172       i++;
173     }
174     error(_("Item %d of i is %d and item %d is %d. Cannot mix positives and negatives."), firstNeg, idxp[firstNeg-1], firstPos, idxp[firstPos-1]);
175   }
176   if (countNeg && countNA) {
177     int i=0, firstNeg=0, firstNA=0;
178     while (i<n && (firstNeg==0 || firstNA==0)) {
179       int elem = idxp[i];
180       if (firstNeg==0 && elem<0 && elem!=NA_INTEGER) firstNeg=i+1;
181       if (firstNA==0 && elem==NA_INTEGER) firstNA=i+1;
182       i++;
183     }
184     error(_("Item %d of i is %d and item %d is NA. Cannot mix negatives and NA."), firstNeg, idxp[firstNeg-1], firstNA);
185   }
186 
187   SEXP ans;
188   if (countNeg==0) {
189     // just zeros to remove, or >max to convert to NA
190     ans = PROTECT(allocVector(INTSXP, n - countZero));
191     int *ansp = INTEGER(ans);
192     for (int i=0, ansi=0; i<n; i++) {
193       int elem = idxp[i];
194       if (elem==0) continue;
195       ansp[ansi++] = elem>max ? NA_INTEGER : elem;
196     }
197   } else {
198     // idx is all negative without any NA but perhaps some zeros
199     bool *keep = (bool *)R_alloc(max, sizeof(bool));    // 4 times less memory that INTSXP in src/main/subscript.c
200     for (int i=0; i<max; i++) keep[i] = true;
201     int countRemoved=0, countDup=0, countBeyond=0;   // idx=c(-10,-5,-10) removing row 10 twice
202     int firstBeyond=0, firstDup=0;
203     for (int i=0; i<n; i++) {
204       int elem = -idxp[i];
205       if (elem==0) continue;
206       if (elem>max) {
207         countBeyond++;
208         if (firstBeyond==0) firstBeyond=i+1;
209         continue;
210       }
211       if (!keep[elem-1]) {
212         countDup++;
213         if (firstDup==0) firstDup=i+1;
214       } else {
215         keep[elem-1] = false;
216         countRemoved++;
217       }
218     }
219     if (countBeyond)
220       warning(_("Item %d of i is %d but there are only %d rows. Ignoring this and %d more like it out of %d."), firstBeyond, idxp[firstBeyond-1], max, countBeyond-1, n);
221     if (countDup)
222       warning(_("Item %d of i is %d which removes that item but that has occurred before. Ignoring this dup and %d other dups."), firstDup, idxp[firstDup-1], countDup-1);
223     int ansn = max-countRemoved;
224     ans = PROTECT(allocVector(INTSXP, ansn));
225     int *ansp = INTEGER(ans);
226     for (int i=0, ansi=0; i<max; i++) {
227       if (keep[i]) ansp[ansi++] = i+1;
228     }
229   }
230   UNPROTECT(1);
231   return ans;
232 }
233 
checkCol(SEXP col,int colNum,int nrow,SEXP x)234 static void checkCol(SEXP col, int colNum, int nrow, SEXP x)
235 {
236   if (isNull(col)) error(_("Column %d is NULL; malformed data.table."), colNum);
237   if (isNewList(col) && INHERITS(col, char_dataframe)) {
238     SEXP names = getAttrib(x, R_NamesSymbol);
239     error(_("Column %d ['%s'] is a data.frame or data.table; malformed data.table."),
240           colNum, isNull(names)?"":CHAR(STRING_ELT(names,colNum-1)));
241   }
242   if (length(col)!=nrow) {
243     SEXP names = getAttrib(x, R_NamesSymbol);
244     error(_("Column %d ['%s'] is length %d but column 1 is length %d; malformed data.table."),
245           colNum, isNull(names)?"":CHAR(STRING_ELT(names,colNum-1)), length(col), nrow);
246   }
247 }
248 
249 /*
250 * subsetDT - Subsets a data.table
251 * NOTE:
252 *   1) 'rows' and 'cols' are 1-based, passed from R level
253 *   2) Originally for subsetting vectors in fcast and now the beginnings of [.data.table ported to C
254 *   3) Immediate need is for R 3.1 as lglVec[1] now returns R's global TRUE and we don't want := to change that global [think 1 row data.tables]
255 *   4) Could do it other ways but may as well go to C now as we were going to do that anyway
256 */
257 
subsetDT(SEXP x,SEXP rows,SEXP cols)258 SEXP subsetDT(SEXP x, SEXP rows, SEXP cols) { // API change needs update NEWS.md and man/cdt.Rd
259   int nprotect=0;
260   if (!isNewList(x)) error(_("Internal error. Argument 'x' to CsubsetDT is type '%s' not 'list'"), type2char(TYPEOF(rows))); // # nocov
261   if (!length(x)) return(x);  // return empty list
262 
263   const int nrow = length(VECTOR_ELT(x,0));
264   // check index once up front for 0 or NA, for branchless subsetVectorRaw which is repeated for each column
265   bool anyNA=false, orderedSubset=true;   // true for when rows==null (meaning all rows)
266   if (!isNull(rows) && check_idx(rows, nrow, &anyNA, &orderedSubset)!=NULL) {
267     SEXP max = PROTECT(ScalarInteger(nrow)); nprotect++;
268     rows = PROTECT(convertNegAndZeroIdx(rows, max, ScalarLogical(TRUE))); nprotect++;
269     const char *err = check_idx(rows, nrow, &anyNA, &orderedSubset);
270     if (err!=NULL) error(err);
271   }
272 
273   if (!isInteger(cols)) error(_("Internal error. Argument 'cols' to Csubset is type '%s' not 'integer'"), type2char(TYPEOF(cols))); // # nocov
274   for (int i=0; i<LENGTH(cols); i++) {
275     int this = INTEGER(cols)[i];
276     if (this<1 || this>LENGTH(x)) error(_("Item %d of 'cols' is %d which is outside 1-based range [1,ncol(x)=%d]"), i+1, this, LENGTH(x));
277   }
278 
279   int overAlloc = checkOverAlloc(GetOption(install("datatable.alloccol"), R_NilValue));
280   SEXP ans = PROTECT(allocVector(VECSXP, LENGTH(cols)+overAlloc)); nprotect++;  // doing alloc.col directly here; eventually alloc.col can be deprecated.
281 
282   // user-defined and superclass attributes get copied as from v1.12.0
283   copyMostAttrib(x, ans);
284   // most means all except R_NamesSymbol, R_DimSymbol and R_DimNamesSymbol
285   // includes row.names (oddly, given other dims aren't) and "sorted" dealt with below
286   // class is also copied here which retains superclass name in class vector as has been the case for many years; e.g. tests 1228.* for #64
287 
288   SET_TRUELENGTH(ans, LENGTH(ans));
289   SETLENGTH(ans, LENGTH(cols));
290   int ansn;
291   if (isNull(rows)) {
292     ansn = nrow;
293     const int *colD = INTEGER(cols);
294     for (int i=0; i<LENGTH(cols); i++) {
295       SEXP thisCol = VECTOR_ELT(x, colD[i]-1);
296       checkCol(thisCol, colD[i], nrow, x);
297       SET_VECTOR_ELT(ans, i, copyAsPlain(thisCol));
298       // materialize the column subset as we have always done for now, until REFCNT is on by default in R (TODO)
299     }
300   } else {
301     ansn = LENGTH(rows);  // has been checked not to contain zeros or negatives, so this length is the length of result
302     const int *colD = INTEGER(cols);
303     for (int i=0; i<LENGTH(cols); i++) {
304       SEXP source = VECTOR_ELT(x, colD[i]-1);
305       checkCol(source, colD[i], nrow, x);
306       SEXP target;
307       SET_VECTOR_ELT(ans, i, target=allocVector(TYPEOF(source), ansn));
308       copyMostAttrib(source, target);
309       subsetVectorRaw(target, source, rows, anyNA);  // parallel within column
310     }
311   }
312   SEXP tmp = PROTECT(allocVector(STRSXP, LENGTH(cols)+overAlloc)); nprotect++;
313   SET_TRUELENGTH(tmp, LENGTH(tmp));
314   SETLENGTH(tmp, LENGTH(cols));
315   setAttrib(ans, R_NamesSymbol, tmp);
316   subsetVectorRaw(tmp, getAttrib(x, R_NamesSymbol), cols, /*anyNA=*/false);
317 
318   tmp = PROTECT(allocVector(INTSXP, 2)); nprotect++;
319   INTEGER(tmp)[0] = NA_INTEGER;
320   INTEGER(tmp)[1] = -ansn;
321   setAttrib(ans, R_RowNamesSymbol, tmp);  // The contents of tmp must be set before being passed to setAttrib(). setAttrib looks at tmp value and copies it in the case of R_RowNamesSymbol. Caused hard to track bug around 28 Sep 2014.
322 
323   // clear any index that was copied over by copyMostAttrib() above, e.g. #1760 and #1734 (test 1678)
324   setAttrib(ans, sym_index, R_NilValue);
325   // but maintain key if ordered subset
326   SEXP key = getAttrib(x, sym_sorted);
327   if (length(key)) {
328     SEXP in = PROTECT(chin(key, getAttrib(ans,R_NamesSymbol))); nprotect++;
329     int i = 0;  while(i<LENGTH(key) && LOGICAL(in)[i]) i++;
330     // i is now the keylen that can be kept. 2 lines above much easier in C than R
331     if (i==0 || !orderedSubset) {
332       // clear key that was copied over by copyMostAttrib() above
333       setAttrib(ans, sym_sorted, R_NilValue);
334     } else {
335       // make a new key attribute; shorter if i<LENGTH(key) or same length copied so this key is safe to change by ref (setnames)
336       setAttrib(ans, sym_sorted, tmp=allocVector(STRSXP, i));
337       for (int j=0; j<i; j++) SET_STRING_ELT(tmp, j, STRING_ELT(key, j));
338     }
339   }
340   unlock(ans);
341   setselfref(ans);
342   UNPROTECT(nprotect);
343   return ans;
344 }
345 
subsetVector(SEXP x,SEXP idx)346 SEXP subsetVector(SEXP x, SEXP idx) { // idx is 1-based passed from R level
347   bool anyNA=false, orderedSubset=false;
348   int nprotect=0;
349   if (isNull(x))
350     error(_("Internal error: NULL can not be subset. It is invalid for a data.table to contain a NULL column."));      // # nocov
351   if (check_idx(idx, length(x), &anyNA, &orderedSubset) != NULL)
352     error(_("Internal error: CsubsetVector is internal-use-only but has received negatives, zeros or out-of-range"));  // # nocov
353   SEXP ans = PROTECT(allocVector(TYPEOF(x), length(idx))); nprotect++;
354   copyMostAttrib(x, ans);
355   subsetVectorRaw(ans, x, idx, anyNA);
356   UNPROTECT(nprotect);
357   return ans;
358 }
359 
360