1 #include "data.table.h"
2 #include <Rdefines.h>
3 #include <ctype.h>   // for isdigit
4 
rbindlist(SEXP l,SEXP usenamesArg,SEXP fillArg,SEXP idcolArg)5 SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
6 {
7   if (!isLogical(fillArg) || LENGTH(fillArg) != 1 || LOGICAL(fillArg)[0] == NA_LOGICAL)
8     error(_("fill= should be TRUE or FALSE"));
9   if (!isLogical(usenamesArg) || LENGTH(usenamesArg)!=1)
10     error(_("use.names= should be TRUE, FALSE, or not used (\"check\" by default)"));  // R levels converts "check" to NA
11   if (!length(l)) return(l);
12   if (TYPEOF(l) != VECSXP) error(_("Input to rbindlist must be a list. This list can contain data.tables, data.frames or plain lists."));
13   Rboolean usenames = LOGICAL(usenamesArg)[0];
14   const bool fill = LOGICAL(fillArg)[0];
15   if (fill && usenames!=TRUE) {
16     if (usenames==FALSE) warning(_("use.names= cannot be FALSE when fill is TRUE. Setting use.names=TRUE.")); // else no warning if usenames==NA (default)
17     usenames=TRUE;
18   }
19   const bool idcol = !isNull(idcolArg);
20   if (idcol && (!isString(idcolArg) || LENGTH(idcolArg)!=1)) error(_("Internal error: rbindlist.c idcol is not a single string"));  // # nocov
21   int ncol=0, first=0;
22   int64_t nrow=0, upperBoundUniqueNames=1;
23   bool anyNames=false;
24   int numZero=0, firstZeroCol=0, firstZeroItem=0;
25   int *eachMax = (int *)R_alloc(LENGTH(l), sizeof(int));
26   // pre-check for any errors here to save having to get cleanup right below when usenames
27   for (int i=0; i<LENGTH(l); i++) {  // length(l)>0 checked above
28     eachMax[i] = 0;
29     SEXP li = VECTOR_ELT(l, i);
30     if (isNull(li)) continue;
31     if (TYPEOF(li) != VECSXP) error(_("Item %d of input is not a data.frame, data.table or list"), i+1);
32     const int thisncol = length(li);
33     if (!thisncol) continue;
34     // delete as now more flexible ... if (fill && isNull(getAttrib(li, R_NamesSymbol))) error(_("When fill=TRUE every item of the input must have column names. Item %d does not."), i+1);
35     if (fill) {
36       if (thisncol>ncol) ncol=thisncol;  // this section initializes ncol with max ncol. ncol may be increased when usenames is accounted for further down
37     } else {
38       if (ncol==0) { ncol=thisncol; first=i; }
39       else if (thisncol!=ncol) error(_("Item %d has %d columns, inconsistent with item %d which has %d columns. To fill missing columns use fill=TRUE."), i+1, thisncol, first+1, ncol);
40     }
41     int nNames = length(getAttrib(li, R_NamesSymbol));
42     if (nNames>0 && nNames!=thisncol) error(_("Item %d has %d columns but %d column names. Invalid object."), i+1, thisncol, nNames);
43     if (nNames>0) anyNames=true;
44     upperBoundUniqueNames += nNames;
45     int maxLen=0, whichMax=0;
46     for (int j=0; j<thisncol; ++j) { int tt=length(VECTOR_ELT(li,j)); if (tt>maxLen) { maxLen=tt; whichMax=j; } }
47     for (int j=0; j<thisncol; ++j) {
48       int tt = length(VECTOR_ELT(li, j));
49       if (tt>1 && tt!=maxLen) error(_("Column %d of item %d is length %d inconsistent with column %d which is length %d. Only length-1 columns are recycled."), j+1, i+1, tt, whichMax+1, maxLen);
50       if (tt==0 && maxLen>0 && numZero++==0) { firstZeroCol = j; firstZeroItem=i; }
51     }
52     eachMax[i] = maxLen;
53     nrow += maxLen;
54   }
55   if (numZero) {  // #1871
56     SEXP names = getAttrib(VECTOR_ELT(l, firstZeroItem), R_NamesSymbol);
57     const char *ch = names==R_NilValue ? "" : CHAR(STRING_ELT(names, firstZeroCol));
58     warning(_("Column %d ['%s'] of item %d is length 0. This (and %d other%s like it) has been filled with NA (NULL for list columns) to make each item uniform."),
59             firstZeroCol+1, ch, firstZeroItem+1, numZero-1, numZero==2?"":"s");
60   }
61   if (nrow==0 && ncol==0) return(R_NilValue);
62   if (nrow>INT32_MAX) error(_("Total rows in the list is %"PRId64" which is larger than the maximum number of rows, currently %d"), (int64_t)nrow, INT32_MAX);
63   if (usenames==TRUE && !anyNames) error(_("use.names=TRUE but no item of input list has any names"));
64 
65   int *colMap=NULL; // maps each column in final result to the column of each list item
66   if (usenames==TRUE || usenames==NA_LOGICAL) {
67     // here we proceed as if fill=true for brevity (accounting for dups is tricky) and then catch any missings after this branch
68     // when use.names==NA we also proceed here as if use.names was TRUE to save new code and then check afterwards the map is 1:ncol for every item
69     // first find number of unique column names present; i.e. length(unique(unlist(lapply(l,names))))
70     SEXP *uniq = (SEXP *)malloc(upperBoundUniqueNames * sizeof(SEXP));  // upperBoundUniqueNames was initialized with 1 to ensure this is defined (otherwise 0 when no item has names)
71     if (!uniq) error(_("Failed to allocate upper bound of %"PRId64" unique column names [sum(lapply(l,ncol))]"), (int64_t)upperBoundUniqueNames);
72     savetl_init();
73     int nuniq=0;
74     for (int i=0; i<LENGTH(l); i++) {
75       SEXP li = VECTOR_ELT(l, i);
76       int thisncol=LENGTH(li);
77       if (isNull(li) || !LENGTH(li)) continue;
78       const SEXP cn = getAttrib(li, R_NamesSymbol);
79       if (!length(cn)) continue;
80       const SEXP *cnp = STRING_PTR(cn);
81       for (int j=0; j<thisncol; j++) {
82         SEXP s = cnp[j];
83         if (TRUELENGTH(s)<0) continue;  // seen this name before
84         if (TRUELENGTH(s)>0) savetl(s);
85         uniq[nuniq++] = s;
86         SET_TRUELENGTH(s,-nuniq);
87       }
88     }
89     if (nuniq>0) {
90       SEXP *tt = realloc(uniq, nuniq*sizeof(SEXP));  // shrink to only what we need to release the spare
91       if (!tt) free(uniq);  // shrink never fails; just keep codacy happy
92       uniq = tt;
93     }
94     // now count the dups (if any) and how they're distributed across the items
95     int *counts = (int *)calloc(nuniq, sizeof(int)); // counts of names for each colnames
96     int *maxdup = (int *)calloc(nuniq, sizeof(int)); // the most number of dups for any name within one colname vector
97     if (!counts || !maxdup) {
98       // # nocov start
99       for (int i=0; i<nuniq; ++i) SET_TRUELENGTH(uniq[i], 0);
100       free(uniq); free(counts); free(maxdup);
101       savetl_end();
102       error(_("Failed to allocate nuniq=%d items working memory in rbindlist.c"), nuniq);
103       // # nocov end
104     }
105     for (int i=0; i<LENGTH(l); i++) {
106       SEXP li = VECTOR_ELT(l, i);
107       int thisncol=length(li);
108       if (thisncol==0) continue;
109       const SEXP cn = getAttrib(li, R_NamesSymbol);
110       if (!length(cn)) continue;
111       const SEXP *cnp = STRING_PTR(cn);
112       memset(counts, 0, nuniq*sizeof(int));
113       for (int j=0; j<thisncol; j++) {
114         SEXP s = cnp[j];
115         counts[ -TRUELENGTH(s)-1 ]++;
116       }
117       for (int u=0; u<nuniq; u++) {
118         if (counts[u] > maxdup[u]) maxdup[u] = counts[u];
119       }
120     }
121     int ttncol = 0;
122     for (int u=0; u<nuniq; ++u) ttncol+=maxdup[u];
123     if (ttncol>ncol) ncol=ttncol;
124     free(maxdup); maxdup=NULL;  // not needed again
125     // ncol is now the final number of columns accounting for unique and dups across all colnames
126     // allocate a matrix:  nrows==length(list)  each entry contains which column to fetch for that final column
127 
128     int *colMapRaw = (int *)malloc(LENGTH(l)*ncol * sizeof(int));  // the result of this scope used later
129     int *uniqMap = (int *)malloc(ncol * sizeof(int)); // maps the ith unique string to the first time it occurs in the final result
130     int *dupLink = (int *)malloc(ncol * sizeof(int)); // if a colname has occurred before (a dup) links from the 1st to the 2nd time in the final result, 2nd to 3rd, etc
131     if (!colMapRaw || !uniqMap || !dupLink) {
132       // # nocov start
133       for (int i=0; i<nuniq; ++i) SET_TRUELENGTH(uniq[i], 0);
134       free(uniq); free(counts); free(colMapRaw); free(uniqMap); free(dupLink);
135       savetl_end();
136       error(_("Failed to allocate ncol=%d items working memory in rbindlist.c"), ncol);
137       // # nocov end
138     }
139     for (int i=0; i<LENGTH(l)*ncol; ++i) colMapRaw[i]=-1;   // 0-based so use -1
140     for (int i=0; i<ncol; ++i) {uniqMap[i] = dupLink[i] = -1;}
141     int nextCol=0, lastDup=ncol-1;
142 
143     for (int i=0; i<LENGTH(l); ++i) {
144       SEXP li = VECTOR_ELT(l, i);
145       int thisncol=length(li);
146       if (thisncol==0) continue;
147       const SEXP cn = getAttrib(li, R_NamesSymbol);
148       if (!length(cn)) {
149         for (int j=0; j<thisncol; j++) colMapRaw[i*ncol + j] = j;
150       } else {
151         const SEXP *cnp = STRING_PTR(cn);
152         memset(counts, 0, nuniq*sizeof(int));
153         for (int j=0; j<thisncol; j++) {
154           SEXP s = cnp[j];
155           int w = -TRUELENGTH(s)-1;
156           int wi = counts[w]++; // how many dups have we seen before of this name within this item
157           if (uniqMap[w]==-1) {
158             // first time seen this name across all items
159             uniqMap[w] = nextCol++;
160           } else {
161             while (wi && dupLink[w]>0) { w=dupLink[w]; --wi; }  // hop through the dups
162             if (wi && dupLink[w]==-1) {
163               // first time we've seen this number of dups of this name
164               w = dupLink[w] = lastDup--;
165               uniqMap[w] = nextCol++;
166             }
167           }
168           colMapRaw[i*ncol + uniqMap[w]] = j;
169         }
170       }
171     }
172     for (int i=0; i<nuniq; ++i) SET_TRUELENGTH(uniq[i], 0);  // zero out our usage of tl
173     free(uniq); free(counts); free(uniqMap); free(dupLink);  // all local scope so no need to set to NULL
174     savetl_end();  // restore R's usage
175 
176     // colMapRaw is still allocated. It was allocated with malloc because we needed to catch if the alloc failed.
177     // move it to R's heap so it gets automatically free'd on exit, and on any error between now and the end of rbindlist.
178     colMap = (int *)R_alloc(LENGTH(l)*ncol, sizeof(int));
179     // This R_alloc could fail with out-of-memory but given it is very small it's very unlikely. If it does fail, colMapRaw will leak.
180     //   But colMapRaw leaking now in this very rare situation is better than colMapRaw leaking in the more likely but still rare conditions later.
181     //   And it's better than having to trap all exit point from here to the end of rbindlist, which may not be possible; e.g. writeNA() could error inside it with unsupported type.
182     //   This very unlikely leak could be fixed by using an on.exit() at R level rbindlist(); R-exts$6.1.2 refers to pwilcox for example. However, that would not
183     //   solve the (mere) leak if we ever call rbindlist internally from other C functions.
184     memcpy(colMap, colMapRaw, LENGTH(l)*ncol*sizeof(int));
185     free(colMapRaw);  // local scope in this branch to ensure can't be used below
186 
187     // to view map when debugging ...
188     // for (int i=0; i<LENGTH(l); ++i) { for (int j=0; j<ncol; ++j) Rprintf(_("%2d "),colMap[i*ncol + j]);  Rprintf(_("\n")); }
189   }
190 
191   if (fill && usenames==NA_LOGICAL) error(_("Internal error: usenames==NA but fill=TRUE. usenames should have been set to TRUE earlier with warning."));
192   if (!fill && (usenames==TRUE || usenames==NA_LOGICAL)) {
193     // Ensure no missings in both cases, and (when usenames==NA) all columns in same order too
194     // We proceeded earlier as if fill was true, so varying ncol items will have missings here
195     char buff[1001] = "";
196     const char *extra = usenames==TRUE?"":_(" use.names='check' (default from v1.12.2) emits this message and proceeds as if use.names=FALSE for "\
197                                             " backwards compatibility. See news item 5 in v1.12.2 for options to control this message.");
198     for (int i=0; i<LENGTH(l); ++i) {
199       SEXP li = VECTOR_ELT(l, i);
200       if (!length(li) || !length(getAttrib(li, R_NamesSymbol))) continue;
201       for (int j=0; j<ncol; ++j) {
202         const int w = colMap[i*ncol + j];
203         if (w==-1) {
204           int missi = i;
205           while (colMap[i*ncol + j]==-1 && i<LENGTH(l)) i++;
206           if (i==LENGTH(l)) error(_("Internal error: could not find the first column name not present in earlier item"));
207           SEXP s = getAttrib(VECTOR_ELT(l, i), R_NamesSymbol);
208           int w2 = colMap[i*ncol + j];
209           const char *str = isString(s) ? CHAR(STRING_ELT(s,w2)) : "";
210           snprintf(buff, 1000, _("Column %d ['%s'] of item %d is missing in item %d. Use fill=TRUE to fill with NA (NULL for list columns), or use.names=FALSE to ignore column names.%s"),
211                         w2+1, str, i+1, missi+1, extra );
212           if (usenames==TRUE) error(buff);
213           i = LENGTH(l); // break from outer i loop
214           break;         // break from inner j loop
215         }
216         if (w!=j && usenames==NA_LOGICAL) {
217           SEXP s = getAttrib(VECTOR_ELT(l, i), R_NamesSymbol);
218           if (!isString(s) || i==0) error(_("Internal error: usenames==NA but an out-of-order name has been found in an item with no names or the first item. [%d]"), i);
219           snprintf(buff, 1000, _("Column %d ['%s'] of item %d appears in position %d in item %d. Set use.names=TRUE to match by column name, or use.names=FALSE to ignore column names.%s"),
220                                w+1, CHAR(STRING_ELT(s,w)), i+1, j+1, i, extra);
221           i = LENGTH(l);
222           break;
223         }
224       }
225       if (buff[0]) {
226         SEXP opt = GetOption(install("datatable.rbindlist.check"), R_NilValue);
227         if (!isNull(opt) && !(isString(opt) && length(opt)==1)) {
228           warning(_("options()$datatable.rbindlist.check is set but is not a single string. See news item 5 in v1.12.2."));
229           opt = R_NilValue;
230         }
231         const char *o = isNull(opt) ? "message" : CHAR(STRING_ELT(opt,0));
232         if      (strcmp(o,"message")==0) { eval(PROTECT(lang2(install("message"),PROTECT(ScalarString(mkChar(buff))))), R_GlobalEnv); UNPROTECT(2); }
233         else if (strcmp(o,"warning")==0) warning(buff);
234         else if (strcmp(o,"error")==0)   error(buff);
235         else if (strcmp(o,"none")!=0)    warning(_("options()$datatable.rbindlist.check=='%s' which is not 'message'|'warning'|'error'|'none'. See news item 5 in v1.12.2."), o);
236       }
237     }
238   }
239   if (usenames==NA_LOGICAL) {
240     usenames=FALSE;  // for backwards compatibility, see warning above which says this will change to TRUE in future
241     ncol = length(VECTOR_ELT(l, first));  // ncol was increased as if fill=true, so reduce it back given fill=false (fill==false checked above)
242   }
243 
244   int nprotect = 0;
245   SEXP ans = PROTECT(allocVector(VECSXP, idcol + ncol)); nprotect++;
246   SEXP ansNames;
247   setAttrib(ans, R_NamesSymbol, ansNames=allocVector(STRSXP, idcol + ncol));
248   if (idcol) {
249     SET_STRING_ELT(ansNames, 0, STRING_ELT(idcolArg, 0));
250     SEXP idval, listNames=getAttrib(l, R_NamesSymbol);
251     if (length(listNames)) {
252       SET_VECTOR_ELT(ans, 0, idval=allocVector(STRSXP, nrow));
253       for (int i=0,ansloc=0; i<LENGTH(l); ++i) {
254         SEXP li = VECTOR_ELT(l, i);
255         if (!length(li)) continue;
256         const int thisnrow = eachMax[i];
257         SEXP thisname = STRING_ELT(listNames, i);
258         for (int k=0; k<thisnrow; ++k) SET_STRING_ELT(idval, ansloc++, thisname);
259       }
260     } else {
261       SET_VECTOR_ELT(ans, 0, idval=allocVector(INTSXP, nrow));
262       int *idvald = INTEGER(idval);
263       for (int i=0,ansloc=0; i<LENGTH(l); ++i) {
264         SEXP li = VECTOR_ELT(l, i);
265         if (!length(li)) continue;
266         const int thisnrow = eachMax[i];
267         for (int k=0; k<thisnrow; ++k) idvald[ansloc++] = i+1;
268       }
269     }
270   }
271 
272   SEXP coercedForFactor = NULL;
273   for(int j=0; j<ncol; ++j) {
274     int maxType=LGLSXP;  // initialize with LGLSXP for test 2002.3 which has col x NULL in both lists to be filled with NA for #1871
275     bool factor=false, orderedFactor=false;     // ordered factor is class c("ordered","factor"). isFactor() is true when isOrdered() is true.
276     int longestLen=0, longestW=-1, longestI=-1; // just for ordered factor
277     SEXP longestLevels=R_NilValue;              // just for ordered factor
278     bool int64=false;
279     const char *foundName=NULL;
280     bool anyNotStringOrFactor=false;
281     SEXP firstCol=R_NilValue;
282     int firsti=-1, firstw=-1;
283     for (int i=0; i<LENGTH(l); ++i) {
284       SEXP li = VECTOR_ELT(l, i);
285       if (!length(li)) continue;
286       int w = usenames ? colMap[i*ncol + j] : j;  // colMap tells us which item to fetch for each of the final result columns, so we can stack column-by-column
287       if (w==-1) continue;  // column j of final result has no input from this item (fill must be true)
288       if (!foundName) {
289         SEXP cn=PROTECT(getAttrib(li, R_NamesSymbol));
290         if (length(cn)) { SEXP tt; SET_STRING_ELT(ansNames, idcol+j, tt=STRING_ELT(cn, w)); foundName=CHAR(tt); }
291         UNPROTECT(1);
292       }
293       SEXP thisCol = VECTOR_ELT(li, w);
294       int thisType = TYPEOF(thisCol);
295       // Use >= for #546 -- TYPEORDER=0 for both LGLSXP and EXPRSXP (but also NULL)
296       if (TYPEORDER(thisType)>=TYPEORDER(maxType) && !isNull(thisCol)) maxType=thisType;
297       if (isFactor(thisCol)) {
298         if (isNull(getAttrib(thisCol,R_LevelsSymbol))) error(_("Column %d of item %d has type 'factor' but has no levels; i.e. malformed."), w+1, i+1);
299         factor = true;
300         if (isOrdered(thisCol)) {
301           orderedFactor = true;
302           int thisLen = length(getAttrib(thisCol, R_LevelsSymbol));
303           if (thisLen>longestLen) { longestLen=thisLen; longestLevels=getAttrib(thisCol, R_LevelsSymbol); /*for warnings later ...*/longestW=w; longestI=i; }
304         }
305       } else if (!isString(thisCol)) anyNotStringOrFactor=true;  // even for length 0 columns for consistency; test 2113.3
306       if (INHERITS(thisCol, char_integer64)) {
307         if (firsti>=0 && !length(getAttrib(firstCol, R_ClassSymbol))) { firsti=i; firstw=w; firstCol=thisCol; } // so the integer64 attribute gets copied to target below
308         int64=true;
309       }
310       if (firsti==-1) { firsti=i; firstw=w; firstCol=thisCol; }
311       else {
312         if (!factor && !int64) {
313           if (!R_compute_identical(PROTECT(getAttrib(thisCol, R_ClassSymbol)),
314                                    PROTECT(getAttrib(firstCol, R_ClassSymbol)),
315                                    0)) {
316             error(_("Class attribute on column %d of item %d does not match with column %d of item %d."), w+1, i+1, firstw+1, firsti+1);
317           }
318           UNPROTECT(2);
319         }
320       }
321     }
322 
323     if (!foundName) { static char buff[12]; snprintf(buff,12,"V%d",j+1), SET_STRING_ELT(ansNames, idcol+j, mkChar(buff)); foundName=buff; }
324     if (factor) maxType=INTSXP;  // if any items are factors then a factor is created (could be an option)
325     if (int64 && maxType!=REALSXP)
326       error(_("Internal error: column %d of result is determined to be integer64 but maxType=='%s' != REALSXP"), j+1, type2char(maxType)); // # nocov
327     SEXP target;
328     SET_VECTOR_ELT(ans, idcol+j, target=allocVector(maxType, nrow));  // does not initialize logical & numerics, but does initialize character and list
329     if (!factor) copyMostAttrib(firstCol, target); // all but names,dim and dimnames; mainly for class. And if so, we want a copy here, not keepattr's SET_ATTRIB.
330 
331     if (factor && anyNotStringOrFactor) {
332       // in future warn, or use list column instead ... warning(_("Column %d contains a factor but not all items for the column are character or factor"), idcol+j+1);
333       // some coercing from (likely) integer/numeric to character will be needed. But this coerce can feasibly fail with out-of-memory, so we have to do it up-front
334       // before the savetl_init() because we have no hook to clean up tl if coerceVector fails.
335       if (coercedForFactor==NULL) { coercedForFactor=PROTECT(allocVector(VECSXP, LENGTH(l))); nprotect++; }
336       for (int i=0; i<LENGTH(l); ++i) {
337         int w = usenames ? colMap[i*ncol + j] : j;
338         if (w==-1) continue;
339         SEXP thisCol = VECTOR_ELT(VECTOR_ELT(l, i), w);
340         if (!isFactor(thisCol) && !isString(thisCol)) {
341           SET_VECTOR_ELT(coercedForFactor, i, coerceVector(thisCol, STRSXP));
342         }
343       }
344     }
345     int ansloc=0;
346     if (factor) {
347       char warnStr[1000] = "";
348       savetl_init();  // no error from now (or warning given options(warn=2)) until savetl_end
349       int nLevel=0, allocLevel=0;
350       SEXP *levelsRaw = NULL;  // growing list of SEXP pointers. Raw since managed with raw realloc.
351       if (orderedFactor) {
352         // If all sets of ordered levels are compatible (no ambiguities or conflicts) then an ordered factor is created, otherwise regular factor.
353         // Currently the longest set of ordered levels is taken and all other ordered levels must be a compatible subset of that.
354         // e.g. c( a<c<b, z<a<c<b, a<b ) => z<a<c<b  [ the longest is the middle one, and the other two are ordered subsets of it ]
355         //      c( a<c<b, z<c<a<b, a<b ) => regular factor because it contains an ambiguity: is a<c or c<a?
356         //      c( a<c<b, c<b, 'c,b'   ) => a<c<b  because the regular factor/character items c and b exist in the ordered levels
357         //      c( a<c<b, c<b, 'c,d'   ) => a<c<b<d  'd' from non-ordered item added on the end of longest ordered levels
358         //      c( a<c<b, c<b<d<e )  => regular factor because this case isn't yet implemented. a<c<b<d<e would be possible in future (extending longest at the beginning or end)
359         const SEXP *sd = STRING_PTR(longestLevels);
360         nLevel = allocLevel = longestLen;
361         levelsRaw = (SEXP *)malloc(nLevel * sizeof(SEXP));
362         if (!levelsRaw) { savetl_end(); error(_("Failed to allocate working memory for %d ordered factor levels of result column %d"), nLevel, idcol+j+1); }
363         for (int k=0; k<longestLen; ++k) {
364           SEXP s = sd[k];
365           if (TRUELENGTH(s)>0) savetl(s);
366           levelsRaw[k] = s;
367           SET_TRUELENGTH(s,-k-1);
368         }
369         for (int i=0; i<LENGTH(l); ++i) {
370           int w = usenames ? colMap[i*ncol + j] : j;
371           if (w==-1) continue;
372           SEXP thisCol = VECTOR_ELT(VECTOR_ELT(l, i), w);
373           if (isOrdered(thisCol)) {
374             SEXP levels = getAttrib(thisCol, R_LevelsSymbol);
375             const SEXP *levelsD = STRING_PTR(levels);
376             const int n = length(levels);
377             for (int k=0, last=0; k<n; ++k) {
378               SEXP s = levelsD[k];
379               const int tl = TRUELENGTH(s);
380               if (tl>=last) {  // if tl>=0 then also tl>=last because last<=0
381                 if (tl>=0) {
382                   snprintf(warnStr, 1000,   // not direct warning as we're inside tl region
383                   _("Column %d of item %d is an ordered factor but level %d ['%s'] is missing from the ordered levels from column %d of item %d. " \
384                     "Each set of ordered factor levels should be an ordered subset of the first longest. A regular factor will be created for this column."),
385                   w+1, i+1, k+1, CHAR(s), longestW+1, longestI+1);
386                 } else {
387                   snprintf(warnStr, 1000,
388                   _("Column %d of item %d is an ordered factor with '%s'<'%s' in its levels. But '%s'<'%s' in the ordered levels from column %d of item %d. " \
389                     "A regular factor will be created for this column due to this ambiguity."),
390                   w+1, i+1, CHAR(levelsD[k-1]), CHAR(s), CHAR(s), CHAR(levelsD[k-1]), longestW+1, longestI+1);
391                   // k>=1 (so k-1 is ok) because when k==0 last==0 and this branch wouldn't happen
392                 }
393                 orderedFactor=false;
394                 i=LENGTH(l);  // break outer i loop
395                 break;        // break inner k loop
396                 // we leave the tl set for the longest levels; the regular factor will be created with the longest ordered levels first in case that useful for user
397               }
398               last = tl;  // negative ordinal; last should monotonically grow more negative if the levels are an ordered subset of the longest
399             }
400           }
401         }
402       }
403       for (int i=0; i<LENGTH(l); ++i) {
404         const int thisnrow = eachMax[i];
405         SEXP li = VECTOR_ELT(l, i);
406         if (!length(li)) continue;  // NULL items in the list() of DT/DF; not if thisnrow==0 because we need to retain (unused) factor levels (#3508)
407         int w = usenames ? colMap[i*ncol + j] : j;
408         if (w==-1) {
409           writeNA(target, ansloc, thisnrow);
410         } else {
411           SEXP thisCol = VECTOR_ELT(li, w);
412           SEXP thisColStr = isFactor(thisCol) ? getAttrib(thisCol, R_LevelsSymbol) : (isString(thisCol) ? thisCol : VECTOR_ELT(coercedForFactor, i));
413           const int n = length(thisColStr);
414           const SEXP *thisColStrD = STRING_PTR(thisColStr);  // D for data
415           for (int k=0; k<n; ++k) {
416             SEXP s = thisColStrD[k];
417             if (s==NA_STRING ||             // remove NA from levels; test 1979 found by package emil when revdep testing 1.12.2 (#3473)
418                 TRUELENGTH(s)<0) continue;  // seen this level before; handles removing dups from levels as well as finding unique of character columns
419             if (TRUELENGTH(s)>0) savetl(s);
420             if (allocLevel==nLevel) {       // including initial time when allocLevel==nLevel==0
421               SEXP *tt = NULL;
422               if (allocLevel<INT_MAX) {
423                 int64_t new = (int64_t)allocLevel+n-k+1024; // if all remaining levels in this item haven't been seen before, plus 1024 margin in case of many very short levels
424                 allocLevel = (new>(int64_t)INT_MAX) ? INT_MAX : (int)new;
425                 tt = (SEXP *)realloc(levelsRaw, allocLevel*sizeof(SEXP));  // first time levelsRaw==NULL and realloc==malloc in that case
426               }
427               if (tt==NULL) {
428                 // # nocov start
429                 // C spec states that if realloc() fails (above) the original block (levelsRaw) is left untouched: it is not freed or moved. We ...
430                 for (int k=0; k<nLevel; k++) SET_TRUELENGTH(levelsRaw[k], 0);   // ... rely on that in this loop which uses levelsRaw.
431                 free(levelsRaw);
432                 savetl_end();
433                 error(_("Failed to allocate working memory for %d factor levels of result column %d when reading item %d of item %d"), allocLevel, idcol+j+1, w+1, i+1);
434                 // # nocov end
435               }
436               levelsRaw = tt;
437             }
438             SET_TRUELENGTH(s,-(++nLevel));
439             levelsRaw[nLevel-1] = s;
440           }
441           int *targetd = INTEGER(target);
442           if (isFactor(thisCol)) {
443             const int *id = INTEGER(thisCol);
444             if (length(thisCol)<=1) {
445               // recycle length-1, or NA-fill length-0
446               SEXP lev;
447               const int val = (length(thisCol)==1 && id[0]!=NA_INTEGER && (lev=thisColStrD[id[0]-1])!=NA_STRING) ? -TRUELENGTH(lev) : NA_INTEGER;
448               //                                                                                    ^^ #3915 and tests 2015.2-5
449               for (int r=0; r<thisnrow; ++r) targetd[ansloc+r] = val;
450             } else {
451               // length(thisCol)==thisnrow alreay checked before this truelength-clobber region
452               // If all i==truelength(i) then just do a memcpy since hop is identity. Otherwise hop via the integer map.
453               bool hop = false;
454               if (orderedFactor) {
455                 // retain the position of NA level (if any) and the integer mappings to it
456                 for (int k=0; k<n; ++k) {
457                   SEXP s = thisColStrD[k];
458                   if (s!=NA_STRING && -TRUELENGTH(s)!=k+1) { hop=true; break; }
459                 }
460               } else {
461                 for (int k=0; k<n; ++k) {
462                   SEXP s = thisColStrD[k];
463                   if (s==NA_STRING || -TRUELENGTH(s)!=k+1) { hop=true; break; }
464                 }
465               }
466               if (hop) {
467                 if (orderedFactor) {
468                   for (int r=0; r<thisnrow; ++r)
469                     targetd[ansloc+r] = id[r]==NA_INTEGER ? NA_INTEGER : -TRUELENGTH(thisColStrD[id[r]-1]);
470                 } else {
471                   for (int r=0; r<thisnrow; ++r) {
472                     SEXP lev;
473                     targetd[ansloc+r] = id[r]==NA_INTEGER || (lev=thisColStrD[id[r]-1])==NA_STRING ? NA_INTEGER : -TRUELENGTH(lev);
474                   }
475                 }
476               } else {
477                 memcpy(targetd+ansloc, id, thisnrow*SIZEOF(thisCol));
478               }
479             }
480           } else {
481             const SEXP *sd = STRING_PTR(thisColStr);
482             if (length(thisCol)<=1) {
483               const int val = (length(thisCol)==1 && sd[0]!=NA_STRING) ? -TRUELENGTH(sd[0]) : NA_INTEGER;
484               for (int r=0; r<thisnrow; ++r) targetd[ansloc+r] = val;
485             } else {
486               for (int r=0; r<thisnrow; ++r) targetd[ansloc+r] = sd[r]==NA_STRING ? NA_INTEGER : -TRUELENGTH(sd[r]);
487             }
488           }
489         }
490         ansloc += thisnrow;
491       }
492       for (int k=0; k<nLevel; ++k) SET_TRUELENGTH(levelsRaw[k], 0);
493       savetl_end();
494       if (warnStr[0]) warning(warnStr);  // now savetl_end() has happened it's safe to call warning (could error if options(warn=2))
495       SEXP levelsSxp;
496       setAttrib(target, R_LevelsSymbol, levelsSxp=allocVector(STRSXP, nLevel));
497       for (int k=0; k<nLevel; ++k) SET_STRING_ELT(levelsSxp, k, levelsRaw[k]);
498       free(levelsRaw);
499       if (orderedFactor) {
500         SEXP tt;
501         setAttrib(target, R_ClassSymbol, tt=allocVector(STRSXP, 2));
502         SET_STRING_ELT(tt, 0, char_ordered);
503         SET_STRING_ELT(tt, 1, char_factor);
504       } else {
505         setAttrib(target, R_ClassSymbol, ScalarString(char_factor));
506       }
507     } else {  // factor==false
508       for (int i=0; i<LENGTH(l); ++i) {
509         const int thisnrow = eachMax[i];
510         if (thisnrow==0) continue;
511         SEXP li = VECTOR_ELT(l, i);
512         int w = usenames ? colMap[i*ncol + j] : j;
513         SEXP thisCol;
514         if (w==-1 || !length(thisCol=VECTOR_ELT(li, w))) {  // !length for zeroCol warning above; #1871
515           writeNA(target, ansloc, thisnrow);  // writeNA is integer64 aware and writes INT64_MIN
516         } else {
517           if ((TYPEOF(target)==VECSXP || TYPEOF(target)==EXPRSXP) && TYPEOF(thisCol)!=TYPEOF(target)) {
518             // do an as.list() on the atomic column; #3528
519             thisCol = PROTECT(coerceVector(thisCol, TYPEOF(target))); nprotect++;
520           }
521           // else coerces if needed within memrecycle; with a no-alloc direct coerce from 1.12.4 (PR #3909)
522           const char *ret = memrecycle(target, R_NilValue, ansloc, thisnrow, thisCol, 0, -1, idcol+j+1, foundName);
523           if (ret) warning(_("Column %d of item %d: %s"), w+1, i+1, ret);
524           // e.g. when precision is lost like assigning 3.4 to integer64; test 2007.2
525           // TODO: but maxType should handle that and this should never warn
526         }
527         ansloc += thisnrow;
528       }
529     }
530   }
531   UNPROTECT(nprotect);  // ans, coercedForFactor, thisCol
532   return(ans);
533 }
534