1 #include "data.table.h"
2 #include <Rdefines.h>
3 #include <ctype.h> // for isdigit
4
rbindlist(SEXP l,SEXP usenamesArg,SEXP fillArg,SEXP idcolArg)5 SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
6 {
7 if (!isLogical(fillArg) || LENGTH(fillArg) != 1 || LOGICAL(fillArg)[0] == NA_LOGICAL)
8 error(_("fill= should be TRUE or FALSE"));
9 if (!isLogical(usenamesArg) || LENGTH(usenamesArg)!=1)
10 error(_("use.names= should be TRUE, FALSE, or not used (\"check\" by default)")); // R levels converts "check" to NA
11 if (!length(l)) return(l);
12 if (TYPEOF(l) != VECSXP) error(_("Input to rbindlist must be a list. This list can contain data.tables, data.frames or plain lists."));
13 Rboolean usenames = LOGICAL(usenamesArg)[0];
14 const bool fill = LOGICAL(fillArg)[0];
15 if (fill && usenames!=TRUE) {
16 if (usenames==FALSE) warning(_("use.names= cannot be FALSE when fill is TRUE. Setting use.names=TRUE.")); // else no warning if usenames==NA (default)
17 usenames=TRUE;
18 }
19 const bool idcol = !isNull(idcolArg);
20 if (idcol && (!isString(idcolArg) || LENGTH(idcolArg)!=1)) error(_("Internal error: rbindlist.c idcol is not a single string")); // # nocov
21 int ncol=0, first=0;
22 int64_t nrow=0, upperBoundUniqueNames=1;
23 bool anyNames=false;
24 int numZero=0, firstZeroCol=0, firstZeroItem=0;
25 int *eachMax = (int *)R_alloc(LENGTH(l), sizeof(int));
26 // pre-check for any errors here to save having to get cleanup right below when usenames
27 for (int i=0; i<LENGTH(l); i++) { // length(l)>0 checked above
28 eachMax[i] = 0;
29 SEXP li = VECTOR_ELT(l, i);
30 if (isNull(li)) continue;
31 if (TYPEOF(li) != VECSXP) error(_("Item %d of input is not a data.frame, data.table or list"), i+1);
32 const int thisncol = length(li);
33 if (!thisncol) continue;
34 // delete as now more flexible ... if (fill && isNull(getAttrib(li, R_NamesSymbol))) error(_("When fill=TRUE every item of the input must have column names. Item %d does not."), i+1);
35 if (fill) {
36 if (thisncol>ncol) ncol=thisncol; // this section initializes ncol with max ncol. ncol may be increased when usenames is accounted for further down
37 } else {
38 if (ncol==0) { ncol=thisncol; first=i; }
39 else if (thisncol!=ncol) error(_("Item %d has %d columns, inconsistent with item %d which has %d columns. To fill missing columns use fill=TRUE."), i+1, thisncol, first+1, ncol);
40 }
41 int nNames = length(getAttrib(li, R_NamesSymbol));
42 if (nNames>0 && nNames!=thisncol) error(_("Item %d has %d columns but %d column names. Invalid object."), i+1, thisncol, nNames);
43 if (nNames>0) anyNames=true;
44 upperBoundUniqueNames += nNames;
45 int maxLen=0, whichMax=0;
46 for (int j=0; j<thisncol; ++j) { int tt=length(VECTOR_ELT(li,j)); if (tt>maxLen) { maxLen=tt; whichMax=j; } }
47 for (int j=0; j<thisncol; ++j) {
48 int tt = length(VECTOR_ELT(li, j));
49 if (tt>1 && tt!=maxLen) error(_("Column %d of item %d is length %d inconsistent with column %d which is length %d. Only length-1 columns are recycled."), j+1, i+1, tt, whichMax+1, maxLen);
50 if (tt==0 && maxLen>0 && numZero++==0) { firstZeroCol = j; firstZeroItem=i; }
51 }
52 eachMax[i] = maxLen;
53 nrow += maxLen;
54 }
55 if (numZero) { // #1871
56 SEXP names = getAttrib(VECTOR_ELT(l, firstZeroItem), R_NamesSymbol);
57 const char *ch = names==R_NilValue ? "" : CHAR(STRING_ELT(names, firstZeroCol));
58 warning(_("Column %d ['%s'] of item %d is length 0. This (and %d other%s like it) has been filled with NA (NULL for list columns) to make each item uniform."),
59 firstZeroCol+1, ch, firstZeroItem+1, numZero-1, numZero==2?"":"s");
60 }
61 if (nrow==0 && ncol==0) return(R_NilValue);
62 if (nrow>INT32_MAX) error(_("Total rows in the list is %"PRId64" which is larger than the maximum number of rows, currently %d"), (int64_t)nrow, INT32_MAX);
63 if (usenames==TRUE && !anyNames) error(_("use.names=TRUE but no item of input list has any names"));
64
65 int *colMap=NULL; // maps each column in final result to the column of each list item
66 if (usenames==TRUE || usenames==NA_LOGICAL) {
67 // here we proceed as if fill=true for brevity (accounting for dups is tricky) and then catch any missings after this branch
68 // when use.names==NA we also proceed here as if use.names was TRUE to save new code and then check afterwards the map is 1:ncol for every item
69 // first find number of unique column names present; i.e. length(unique(unlist(lapply(l,names))))
70 SEXP *uniq = (SEXP *)malloc(upperBoundUniqueNames * sizeof(SEXP)); // upperBoundUniqueNames was initialized with 1 to ensure this is defined (otherwise 0 when no item has names)
71 if (!uniq) error(_("Failed to allocate upper bound of %"PRId64" unique column names [sum(lapply(l,ncol))]"), (int64_t)upperBoundUniqueNames);
72 savetl_init();
73 int nuniq=0;
74 for (int i=0; i<LENGTH(l); i++) {
75 SEXP li = VECTOR_ELT(l, i);
76 int thisncol=LENGTH(li);
77 if (isNull(li) || !LENGTH(li)) continue;
78 const SEXP cn = getAttrib(li, R_NamesSymbol);
79 if (!length(cn)) continue;
80 const SEXP *cnp = STRING_PTR(cn);
81 for (int j=0; j<thisncol; j++) {
82 SEXP s = cnp[j];
83 if (TRUELENGTH(s)<0) continue; // seen this name before
84 if (TRUELENGTH(s)>0) savetl(s);
85 uniq[nuniq++] = s;
86 SET_TRUELENGTH(s,-nuniq);
87 }
88 }
89 if (nuniq>0) {
90 SEXP *tt = realloc(uniq, nuniq*sizeof(SEXP)); // shrink to only what we need to release the spare
91 if (!tt) free(uniq); // shrink never fails; just keep codacy happy
92 uniq = tt;
93 }
94 // now count the dups (if any) and how they're distributed across the items
95 int *counts = (int *)calloc(nuniq, sizeof(int)); // counts of names for each colnames
96 int *maxdup = (int *)calloc(nuniq, sizeof(int)); // the most number of dups for any name within one colname vector
97 if (!counts || !maxdup) {
98 // # nocov start
99 for (int i=0; i<nuniq; ++i) SET_TRUELENGTH(uniq[i], 0);
100 free(uniq); free(counts); free(maxdup);
101 savetl_end();
102 error(_("Failed to allocate nuniq=%d items working memory in rbindlist.c"), nuniq);
103 // # nocov end
104 }
105 for (int i=0; i<LENGTH(l); i++) {
106 SEXP li = VECTOR_ELT(l, i);
107 int thisncol=length(li);
108 if (thisncol==0) continue;
109 const SEXP cn = getAttrib(li, R_NamesSymbol);
110 if (!length(cn)) continue;
111 const SEXP *cnp = STRING_PTR(cn);
112 memset(counts, 0, nuniq*sizeof(int));
113 for (int j=0; j<thisncol; j++) {
114 SEXP s = cnp[j];
115 counts[ -TRUELENGTH(s)-1 ]++;
116 }
117 for (int u=0; u<nuniq; u++) {
118 if (counts[u] > maxdup[u]) maxdup[u] = counts[u];
119 }
120 }
121 int ttncol = 0;
122 for (int u=0; u<nuniq; ++u) ttncol+=maxdup[u];
123 if (ttncol>ncol) ncol=ttncol;
124 free(maxdup); maxdup=NULL; // not needed again
125 // ncol is now the final number of columns accounting for unique and dups across all colnames
126 // allocate a matrix: nrows==length(list) each entry contains which column to fetch for that final column
127
128 int *colMapRaw = (int *)malloc(LENGTH(l)*ncol * sizeof(int)); // the result of this scope used later
129 int *uniqMap = (int *)malloc(ncol * sizeof(int)); // maps the ith unique string to the first time it occurs in the final result
130 int *dupLink = (int *)malloc(ncol * sizeof(int)); // if a colname has occurred before (a dup) links from the 1st to the 2nd time in the final result, 2nd to 3rd, etc
131 if (!colMapRaw || !uniqMap || !dupLink) {
132 // # nocov start
133 for (int i=0; i<nuniq; ++i) SET_TRUELENGTH(uniq[i], 0);
134 free(uniq); free(counts); free(colMapRaw); free(uniqMap); free(dupLink);
135 savetl_end();
136 error(_("Failed to allocate ncol=%d items working memory in rbindlist.c"), ncol);
137 // # nocov end
138 }
139 for (int i=0; i<LENGTH(l)*ncol; ++i) colMapRaw[i]=-1; // 0-based so use -1
140 for (int i=0; i<ncol; ++i) {uniqMap[i] = dupLink[i] = -1;}
141 int nextCol=0, lastDup=ncol-1;
142
143 for (int i=0; i<LENGTH(l); ++i) {
144 SEXP li = VECTOR_ELT(l, i);
145 int thisncol=length(li);
146 if (thisncol==0) continue;
147 const SEXP cn = getAttrib(li, R_NamesSymbol);
148 if (!length(cn)) {
149 for (int j=0; j<thisncol; j++) colMapRaw[i*ncol + j] = j;
150 } else {
151 const SEXP *cnp = STRING_PTR(cn);
152 memset(counts, 0, nuniq*sizeof(int));
153 for (int j=0; j<thisncol; j++) {
154 SEXP s = cnp[j];
155 int w = -TRUELENGTH(s)-1;
156 int wi = counts[w]++; // how many dups have we seen before of this name within this item
157 if (uniqMap[w]==-1) {
158 // first time seen this name across all items
159 uniqMap[w] = nextCol++;
160 } else {
161 while (wi && dupLink[w]>0) { w=dupLink[w]; --wi; } // hop through the dups
162 if (wi && dupLink[w]==-1) {
163 // first time we've seen this number of dups of this name
164 w = dupLink[w] = lastDup--;
165 uniqMap[w] = nextCol++;
166 }
167 }
168 colMapRaw[i*ncol + uniqMap[w]] = j;
169 }
170 }
171 }
172 for (int i=0; i<nuniq; ++i) SET_TRUELENGTH(uniq[i], 0); // zero out our usage of tl
173 free(uniq); free(counts); free(uniqMap); free(dupLink); // all local scope so no need to set to NULL
174 savetl_end(); // restore R's usage
175
176 // colMapRaw is still allocated. It was allocated with malloc because we needed to catch if the alloc failed.
177 // move it to R's heap so it gets automatically free'd on exit, and on any error between now and the end of rbindlist.
178 colMap = (int *)R_alloc(LENGTH(l)*ncol, sizeof(int));
179 // This R_alloc could fail with out-of-memory but given it is very small it's very unlikely. If it does fail, colMapRaw will leak.
180 // But colMapRaw leaking now in this very rare situation is better than colMapRaw leaking in the more likely but still rare conditions later.
181 // And it's better than having to trap all exit point from here to the end of rbindlist, which may not be possible; e.g. writeNA() could error inside it with unsupported type.
182 // This very unlikely leak could be fixed by using an on.exit() at R level rbindlist(); R-exts$6.1.2 refers to pwilcox for example. However, that would not
183 // solve the (mere) leak if we ever call rbindlist internally from other C functions.
184 memcpy(colMap, colMapRaw, LENGTH(l)*ncol*sizeof(int));
185 free(colMapRaw); // local scope in this branch to ensure can't be used below
186
187 // to view map when debugging ...
188 // for (int i=0; i<LENGTH(l); ++i) { for (int j=0; j<ncol; ++j) Rprintf(_("%2d "),colMap[i*ncol + j]); Rprintf(_("\n")); }
189 }
190
191 if (fill && usenames==NA_LOGICAL) error(_("Internal error: usenames==NA but fill=TRUE. usenames should have been set to TRUE earlier with warning."));
192 if (!fill && (usenames==TRUE || usenames==NA_LOGICAL)) {
193 // Ensure no missings in both cases, and (when usenames==NA) all columns in same order too
194 // We proceeded earlier as if fill was true, so varying ncol items will have missings here
195 char buff[1001] = "";
196 const char *extra = usenames==TRUE?"":_(" use.names='check' (default from v1.12.2) emits this message and proceeds as if use.names=FALSE for "\
197 " backwards compatibility. See news item 5 in v1.12.2 for options to control this message.");
198 for (int i=0; i<LENGTH(l); ++i) {
199 SEXP li = VECTOR_ELT(l, i);
200 if (!length(li) || !length(getAttrib(li, R_NamesSymbol))) continue;
201 for (int j=0; j<ncol; ++j) {
202 const int w = colMap[i*ncol + j];
203 if (w==-1) {
204 int missi = i;
205 while (colMap[i*ncol + j]==-1 && i<LENGTH(l)) i++;
206 if (i==LENGTH(l)) error(_("Internal error: could not find the first column name not present in earlier item"));
207 SEXP s = getAttrib(VECTOR_ELT(l, i), R_NamesSymbol);
208 int w2 = colMap[i*ncol + j];
209 const char *str = isString(s) ? CHAR(STRING_ELT(s,w2)) : "";
210 snprintf(buff, 1000, _("Column %d ['%s'] of item %d is missing in item %d. Use fill=TRUE to fill with NA (NULL for list columns), or use.names=FALSE to ignore column names.%s"),
211 w2+1, str, i+1, missi+1, extra );
212 if (usenames==TRUE) error(buff);
213 i = LENGTH(l); // break from outer i loop
214 break; // break from inner j loop
215 }
216 if (w!=j && usenames==NA_LOGICAL) {
217 SEXP s = getAttrib(VECTOR_ELT(l, i), R_NamesSymbol);
218 if (!isString(s) || i==0) error(_("Internal error: usenames==NA but an out-of-order name has been found in an item with no names or the first item. [%d]"), i);
219 snprintf(buff, 1000, _("Column %d ['%s'] of item %d appears in position %d in item %d. Set use.names=TRUE to match by column name, or use.names=FALSE to ignore column names.%s"),
220 w+1, CHAR(STRING_ELT(s,w)), i+1, j+1, i, extra);
221 i = LENGTH(l);
222 break;
223 }
224 }
225 if (buff[0]) {
226 SEXP opt = GetOption(install("datatable.rbindlist.check"), R_NilValue);
227 if (!isNull(opt) && !(isString(opt) && length(opt)==1)) {
228 warning(_("options()$datatable.rbindlist.check is set but is not a single string. See news item 5 in v1.12.2."));
229 opt = R_NilValue;
230 }
231 const char *o = isNull(opt) ? "message" : CHAR(STRING_ELT(opt,0));
232 if (strcmp(o,"message")==0) { eval(PROTECT(lang2(install("message"),PROTECT(ScalarString(mkChar(buff))))), R_GlobalEnv); UNPROTECT(2); }
233 else if (strcmp(o,"warning")==0) warning(buff);
234 else if (strcmp(o,"error")==0) error(buff);
235 else if (strcmp(o,"none")!=0) warning(_("options()$datatable.rbindlist.check=='%s' which is not 'message'|'warning'|'error'|'none'. See news item 5 in v1.12.2."), o);
236 }
237 }
238 }
239 if (usenames==NA_LOGICAL) {
240 usenames=FALSE; // for backwards compatibility, see warning above which says this will change to TRUE in future
241 ncol = length(VECTOR_ELT(l, first)); // ncol was increased as if fill=true, so reduce it back given fill=false (fill==false checked above)
242 }
243
244 int nprotect = 0;
245 SEXP ans = PROTECT(allocVector(VECSXP, idcol + ncol)); nprotect++;
246 SEXP ansNames;
247 setAttrib(ans, R_NamesSymbol, ansNames=allocVector(STRSXP, idcol + ncol));
248 if (idcol) {
249 SET_STRING_ELT(ansNames, 0, STRING_ELT(idcolArg, 0));
250 SEXP idval, listNames=getAttrib(l, R_NamesSymbol);
251 if (length(listNames)) {
252 SET_VECTOR_ELT(ans, 0, idval=allocVector(STRSXP, nrow));
253 for (int i=0,ansloc=0; i<LENGTH(l); ++i) {
254 SEXP li = VECTOR_ELT(l, i);
255 if (!length(li)) continue;
256 const int thisnrow = eachMax[i];
257 SEXP thisname = STRING_ELT(listNames, i);
258 for (int k=0; k<thisnrow; ++k) SET_STRING_ELT(idval, ansloc++, thisname);
259 }
260 } else {
261 SET_VECTOR_ELT(ans, 0, idval=allocVector(INTSXP, nrow));
262 int *idvald = INTEGER(idval);
263 for (int i=0,ansloc=0; i<LENGTH(l); ++i) {
264 SEXP li = VECTOR_ELT(l, i);
265 if (!length(li)) continue;
266 const int thisnrow = eachMax[i];
267 for (int k=0; k<thisnrow; ++k) idvald[ansloc++] = i+1;
268 }
269 }
270 }
271
272 SEXP coercedForFactor = NULL;
273 for(int j=0; j<ncol; ++j) {
274 int maxType=LGLSXP; // initialize with LGLSXP for test 2002.3 which has col x NULL in both lists to be filled with NA for #1871
275 bool factor=false, orderedFactor=false; // ordered factor is class c("ordered","factor"). isFactor() is true when isOrdered() is true.
276 int longestLen=0, longestW=-1, longestI=-1; // just for ordered factor
277 SEXP longestLevels=R_NilValue; // just for ordered factor
278 bool int64=false;
279 const char *foundName=NULL;
280 bool anyNotStringOrFactor=false;
281 SEXP firstCol=R_NilValue;
282 int firsti=-1, firstw=-1;
283 for (int i=0; i<LENGTH(l); ++i) {
284 SEXP li = VECTOR_ELT(l, i);
285 if (!length(li)) continue;
286 int w = usenames ? colMap[i*ncol + j] : j; // colMap tells us which item to fetch for each of the final result columns, so we can stack column-by-column
287 if (w==-1) continue; // column j of final result has no input from this item (fill must be true)
288 if (!foundName) {
289 SEXP cn=PROTECT(getAttrib(li, R_NamesSymbol));
290 if (length(cn)) { SEXP tt; SET_STRING_ELT(ansNames, idcol+j, tt=STRING_ELT(cn, w)); foundName=CHAR(tt); }
291 UNPROTECT(1);
292 }
293 SEXP thisCol = VECTOR_ELT(li, w);
294 int thisType = TYPEOF(thisCol);
295 // Use >= for #546 -- TYPEORDER=0 for both LGLSXP and EXPRSXP (but also NULL)
296 if (TYPEORDER(thisType)>=TYPEORDER(maxType) && !isNull(thisCol)) maxType=thisType;
297 if (isFactor(thisCol)) {
298 if (isNull(getAttrib(thisCol,R_LevelsSymbol))) error(_("Column %d of item %d has type 'factor' but has no levels; i.e. malformed."), w+1, i+1);
299 factor = true;
300 if (isOrdered(thisCol)) {
301 orderedFactor = true;
302 int thisLen = length(getAttrib(thisCol, R_LevelsSymbol));
303 if (thisLen>longestLen) { longestLen=thisLen; longestLevels=getAttrib(thisCol, R_LevelsSymbol); /*for warnings later ...*/longestW=w; longestI=i; }
304 }
305 } else if (!isString(thisCol)) anyNotStringOrFactor=true; // even for length 0 columns for consistency; test 2113.3
306 if (INHERITS(thisCol, char_integer64)) {
307 if (firsti>=0 && !length(getAttrib(firstCol, R_ClassSymbol))) { firsti=i; firstw=w; firstCol=thisCol; } // so the integer64 attribute gets copied to target below
308 int64=true;
309 }
310 if (firsti==-1) { firsti=i; firstw=w; firstCol=thisCol; }
311 else {
312 if (!factor && !int64) {
313 if (!R_compute_identical(PROTECT(getAttrib(thisCol, R_ClassSymbol)),
314 PROTECT(getAttrib(firstCol, R_ClassSymbol)),
315 0)) {
316 error(_("Class attribute on column %d of item %d does not match with column %d of item %d."), w+1, i+1, firstw+1, firsti+1);
317 }
318 UNPROTECT(2);
319 }
320 }
321 }
322
323 if (!foundName) { static char buff[12]; snprintf(buff,12,"V%d",j+1), SET_STRING_ELT(ansNames, idcol+j, mkChar(buff)); foundName=buff; }
324 if (factor) maxType=INTSXP; // if any items are factors then a factor is created (could be an option)
325 if (int64 && maxType!=REALSXP)
326 error(_("Internal error: column %d of result is determined to be integer64 but maxType=='%s' != REALSXP"), j+1, type2char(maxType)); // # nocov
327 SEXP target;
328 SET_VECTOR_ELT(ans, idcol+j, target=allocVector(maxType, nrow)); // does not initialize logical & numerics, but does initialize character and list
329 if (!factor) copyMostAttrib(firstCol, target); // all but names,dim and dimnames; mainly for class. And if so, we want a copy here, not keepattr's SET_ATTRIB.
330
331 if (factor && anyNotStringOrFactor) {
332 // in future warn, or use list column instead ... warning(_("Column %d contains a factor but not all items for the column are character or factor"), idcol+j+1);
333 // some coercing from (likely) integer/numeric to character will be needed. But this coerce can feasibly fail with out-of-memory, so we have to do it up-front
334 // before the savetl_init() because we have no hook to clean up tl if coerceVector fails.
335 if (coercedForFactor==NULL) { coercedForFactor=PROTECT(allocVector(VECSXP, LENGTH(l))); nprotect++; }
336 for (int i=0; i<LENGTH(l); ++i) {
337 int w = usenames ? colMap[i*ncol + j] : j;
338 if (w==-1) continue;
339 SEXP thisCol = VECTOR_ELT(VECTOR_ELT(l, i), w);
340 if (!isFactor(thisCol) && !isString(thisCol)) {
341 SET_VECTOR_ELT(coercedForFactor, i, coerceVector(thisCol, STRSXP));
342 }
343 }
344 }
345 int ansloc=0;
346 if (factor) {
347 char warnStr[1000] = "";
348 savetl_init(); // no error from now (or warning given options(warn=2)) until savetl_end
349 int nLevel=0, allocLevel=0;
350 SEXP *levelsRaw = NULL; // growing list of SEXP pointers. Raw since managed with raw realloc.
351 if (orderedFactor) {
352 // If all sets of ordered levels are compatible (no ambiguities or conflicts) then an ordered factor is created, otherwise regular factor.
353 // Currently the longest set of ordered levels is taken and all other ordered levels must be a compatible subset of that.
354 // e.g. c( a<c<b, z<a<c<b, a<b ) => z<a<c<b [ the longest is the middle one, and the other two are ordered subsets of it ]
355 // c( a<c<b, z<c<a<b, a<b ) => regular factor because it contains an ambiguity: is a<c or c<a?
356 // c( a<c<b, c<b, 'c,b' ) => a<c<b because the regular factor/character items c and b exist in the ordered levels
357 // c( a<c<b, c<b, 'c,d' ) => a<c<b<d 'd' from non-ordered item added on the end of longest ordered levels
358 // c( a<c<b, c<b<d<e ) => regular factor because this case isn't yet implemented. a<c<b<d<e would be possible in future (extending longest at the beginning or end)
359 const SEXP *sd = STRING_PTR(longestLevels);
360 nLevel = allocLevel = longestLen;
361 levelsRaw = (SEXP *)malloc(nLevel * sizeof(SEXP));
362 if (!levelsRaw) { savetl_end(); error(_("Failed to allocate working memory for %d ordered factor levels of result column %d"), nLevel, idcol+j+1); }
363 for (int k=0; k<longestLen; ++k) {
364 SEXP s = sd[k];
365 if (TRUELENGTH(s)>0) savetl(s);
366 levelsRaw[k] = s;
367 SET_TRUELENGTH(s,-k-1);
368 }
369 for (int i=0; i<LENGTH(l); ++i) {
370 int w = usenames ? colMap[i*ncol + j] : j;
371 if (w==-1) continue;
372 SEXP thisCol = VECTOR_ELT(VECTOR_ELT(l, i), w);
373 if (isOrdered(thisCol)) {
374 SEXP levels = getAttrib(thisCol, R_LevelsSymbol);
375 const SEXP *levelsD = STRING_PTR(levels);
376 const int n = length(levels);
377 for (int k=0, last=0; k<n; ++k) {
378 SEXP s = levelsD[k];
379 const int tl = TRUELENGTH(s);
380 if (tl>=last) { // if tl>=0 then also tl>=last because last<=0
381 if (tl>=0) {
382 snprintf(warnStr, 1000, // not direct warning as we're inside tl region
383 _("Column %d of item %d is an ordered factor but level %d ['%s'] is missing from the ordered levels from column %d of item %d. " \
384 "Each set of ordered factor levels should be an ordered subset of the first longest. A regular factor will be created for this column."),
385 w+1, i+1, k+1, CHAR(s), longestW+1, longestI+1);
386 } else {
387 snprintf(warnStr, 1000,
388 _("Column %d of item %d is an ordered factor with '%s'<'%s' in its levels. But '%s'<'%s' in the ordered levels from column %d of item %d. " \
389 "A regular factor will be created for this column due to this ambiguity."),
390 w+1, i+1, CHAR(levelsD[k-1]), CHAR(s), CHAR(s), CHAR(levelsD[k-1]), longestW+1, longestI+1);
391 // k>=1 (so k-1 is ok) because when k==0 last==0 and this branch wouldn't happen
392 }
393 orderedFactor=false;
394 i=LENGTH(l); // break outer i loop
395 break; // break inner k loop
396 // we leave the tl set for the longest levels; the regular factor will be created with the longest ordered levels first in case that useful for user
397 }
398 last = tl; // negative ordinal; last should monotonically grow more negative if the levels are an ordered subset of the longest
399 }
400 }
401 }
402 }
403 for (int i=0; i<LENGTH(l); ++i) {
404 const int thisnrow = eachMax[i];
405 SEXP li = VECTOR_ELT(l, i);
406 if (!length(li)) continue; // NULL items in the list() of DT/DF; not if thisnrow==0 because we need to retain (unused) factor levels (#3508)
407 int w = usenames ? colMap[i*ncol + j] : j;
408 if (w==-1) {
409 writeNA(target, ansloc, thisnrow);
410 } else {
411 SEXP thisCol = VECTOR_ELT(li, w);
412 SEXP thisColStr = isFactor(thisCol) ? getAttrib(thisCol, R_LevelsSymbol) : (isString(thisCol) ? thisCol : VECTOR_ELT(coercedForFactor, i));
413 const int n = length(thisColStr);
414 const SEXP *thisColStrD = STRING_PTR(thisColStr); // D for data
415 for (int k=0; k<n; ++k) {
416 SEXP s = thisColStrD[k];
417 if (s==NA_STRING || // remove NA from levels; test 1979 found by package emil when revdep testing 1.12.2 (#3473)
418 TRUELENGTH(s)<0) continue; // seen this level before; handles removing dups from levels as well as finding unique of character columns
419 if (TRUELENGTH(s)>0) savetl(s);
420 if (allocLevel==nLevel) { // including initial time when allocLevel==nLevel==0
421 SEXP *tt = NULL;
422 if (allocLevel<INT_MAX) {
423 int64_t new = (int64_t)allocLevel+n-k+1024; // if all remaining levels in this item haven't been seen before, plus 1024 margin in case of many very short levels
424 allocLevel = (new>(int64_t)INT_MAX) ? INT_MAX : (int)new;
425 tt = (SEXP *)realloc(levelsRaw, allocLevel*sizeof(SEXP)); // first time levelsRaw==NULL and realloc==malloc in that case
426 }
427 if (tt==NULL) {
428 // # nocov start
429 // C spec states that if realloc() fails (above) the original block (levelsRaw) is left untouched: it is not freed or moved. We ...
430 for (int k=0; k<nLevel; k++) SET_TRUELENGTH(levelsRaw[k], 0); // ... rely on that in this loop which uses levelsRaw.
431 free(levelsRaw);
432 savetl_end();
433 error(_("Failed to allocate working memory for %d factor levels of result column %d when reading item %d of item %d"), allocLevel, idcol+j+1, w+1, i+1);
434 // # nocov end
435 }
436 levelsRaw = tt;
437 }
438 SET_TRUELENGTH(s,-(++nLevel));
439 levelsRaw[nLevel-1] = s;
440 }
441 int *targetd = INTEGER(target);
442 if (isFactor(thisCol)) {
443 const int *id = INTEGER(thisCol);
444 if (length(thisCol)<=1) {
445 // recycle length-1, or NA-fill length-0
446 SEXP lev;
447 const int val = (length(thisCol)==1 && id[0]!=NA_INTEGER && (lev=thisColStrD[id[0]-1])!=NA_STRING) ? -TRUELENGTH(lev) : NA_INTEGER;
448 // ^^ #3915 and tests 2015.2-5
449 for (int r=0; r<thisnrow; ++r) targetd[ansloc+r] = val;
450 } else {
451 // length(thisCol)==thisnrow alreay checked before this truelength-clobber region
452 // If all i==truelength(i) then just do a memcpy since hop is identity. Otherwise hop via the integer map.
453 bool hop = false;
454 if (orderedFactor) {
455 // retain the position of NA level (if any) and the integer mappings to it
456 for (int k=0; k<n; ++k) {
457 SEXP s = thisColStrD[k];
458 if (s!=NA_STRING && -TRUELENGTH(s)!=k+1) { hop=true; break; }
459 }
460 } else {
461 for (int k=0; k<n; ++k) {
462 SEXP s = thisColStrD[k];
463 if (s==NA_STRING || -TRUELENGTH(s)!=k+1) { hop=true; break; }
464 }
465 }
466 if (hop) {
467 if (orderedFactor) {
468 for (int r=0; r<thisnrow; ++r)
469 targetd[ansloc+r] = id[r]==NA_INTEGER ? NA_INTEGER : -TRUELENGTH(thisColStrD[id[r]-1]);
470 } else {
471 for (int r=0; r<thisnrow; ++r) {
472 SEXP lev;
473 targetd[ansloc+r] = id[r]==NA_INTEGER || (lev=thisColStrD[id[r]-1])==NA_STRING ? NA_INTEGER : -TRUELENGTH(lev);
474 }
475 }
476 } else {
477 memcpy(targetd+ansloc, id, thisnrow*SIZEOF(thisCol));
478 }
479 }
480 } else {
481 const SEXP *sd = STRING_PTR(thisColStr);
482 if (length(thisCol)<=1) {
483 const int val = (length(thisCol)==1 && sd[0]!=NA_STRING) ? -TRUELENGTH(sd[0]) : NA_INTEGER;
484 for (int r=0; r<thisnrow; ++r) targetd[ansloc+r] = val;
485 } else {
486 for (int r=0; r<thisnrow; ++r) targetd[ansloc+r] = sd[r]==NA_STRING ? NA_INTEGER : -TRUELENGTH(sd[r]);
487 }
488 }
489 }
490 ansloc += thisnrow;
491 }
492 for (int k=0; k<nLevel; ++k) SET_TRUELENGTH(levelsRaw[k], 0);
493 savetl_end();
494 if (warnStr[0]) warning(warnStr); // now savetl_end() has happened it's safe to call warning (could error if options(warn=2))
495 SEXP levelsSxp;
496 setAttrib(target, R_LevelsSymbol, levelsSxp=allocVector(STRSXP, nLevel));
497 for (int k=0; k<nLevel; ++k) SET_STRING_ELT(levelsSxp, k, levelsRaw[k]);
498 free(levelsRaw);
499 if (orderedFactor) {
500 SEXP tt;
501 setAttrib(target, R_ClassSymbol, tt=allocVector(STRSXP, 2));
502 SET_STRING_ELT(tt, 0, char_ordered);
503 SET_STRING_ELT(tt, 1, char_factor);
504 } else {
505 setAttrib(target, R_ClassSymbol, ScalarString(char_factor));
506 }
507 } else { // factor==false
508 for (int i=0; i<LENGTH(l); ++i) {
509 const int thisnrow = eachMax[i];
510 if (thisnrow==0) continue;
511 SEXP li = VECTOR_ELT(l, i);
512 int w = usenames ? colMap[i*ncol + j] : j;
513 SEXP thisCol;
514 if (w==-1 || !length(thisCol=VECTOR_ELT(li, w))) { // !length for zeroCol warning above; #1871
515 writeNA(target, ansloc, thisnrow); // writeNA is integer64 aware and writes INT64_MIN
516 } else {
517 if ((TYPEOF(target)==VECSXP || TYPEOF(target)==EXPRSXP) && TYPEOF(thisCol)!=TYPEOF(target)) {
518 // do an as.list() on the atomic column; #3528
519 thisCol = PROTECT(coerceVector(thisCol, TYPEOF(target))); nprotect++;
520 }
521 // else coerces if needed within memrecycle; with a no-alloc direct coerce from 1.12.4 (PR #3909)
522 const char *ret = memrecycle(target, R_NilValue, ansloc, thisnrow, thisCol, 0, -1, idcol+j+1, foundName);
523 if (ret) warning(_("Column %d of item %d: %s"), w+1, i+1, ret);
524 // e.g. when precision is lost like assigning 3.4 to integer64; test 2007.2
525 // TODO: but maxType should handle that and this should never warn
526 }
527 ansloc += thisnrow;
528 }
529 }
530 }
531 UNPROTECT(nprotect); // ans, coercedForFactor, thisCol
532 return(ans);
533 }
534