1 #include <cstring>
2 
3 #include "cpp11/R.hpp"
4 
5 #include "cpp11/protect.hpp"
6 #include "cpp11/sexp.hpp"
7 #include "cpp11/integers.hpp"
8 #include "cpp11/strings.hpp"
9 #include "cpp11/list.hpp"
10 #include "cpp11/data_frame.hpp"
11 
12 #if R_VERSION < R_Version(3, 5, 0)
13 
DATAPTR(SEXP x)14 void* DATAPTR(SEXP x) {
15   switch(TYPEOF(x)) {
16     case STRSXP:
17       return (char*) CHAR(x);
18       break;
19     case LGLSXP:
20       return (char*) LOGICAL(x);
21     case INTSXP:
22       return (char*) INTEGER(x);
23     case RAWSXP:
24       return (char*) RAW(x);
25     case CPLXSXP:
26       return (char*) COMPLEX(x);
27     case REALSXP:
28       return (char*) REAL(x);
29     default:
30       cpp11::stop("Invalid type %s", Rf_type2char(TYPEOF(x)));
31   }
32   return nullptr;
33 }
34 
35 #endif
36 
37 // A debug macro -- change to 'debug(x) x' for debug output
38 #define debug(x)
39 
40 // An optimized rep
41 #define DO_REP(RTYPE, CTYPE, ACCESSOR)                         \
42   {                                                            \
43     for (int i = 0; i < n; ++i) {                              \
44       memcpy((char*)ACCESSOR(output) + i * xn * sizeof(CTYPE), \
45              (char*)ACCESSOR(x),                               \
46              sizeof(CTYPE) * xn);                              \
47     }                                                          \
48   }
49 
rep_(SEXP x,int n,std::string var_name)50 SEXP rep_(SEXP x, int n, std::string var_name) {
51   if (!Rf_isVectorAtomic(x) && TYPEOF(x) != VECSXP) {
52     cpp11::stop("All columns must be atomic vectors or lists. Problem with '%s'", var_name.c_str());
53   }
54 
55   if (Rf_inherits(x, "POSIXlt")) {
56     cpp11::stop("'%s' is a POSIXlt. Please convert to POSIXct.", var_name.c_str());
57   }
58 
59   int xn = Rf_length(x);
60   int nout = xn * n;
61 
62   cpp11::sexp output(Rf_allocVector(TYPEOF(x), nout));
63   switch (TYPEOF(x)) {
64     case INTSXP:
65       DO_REP(INTSXP, int, INTEGER);
66       break;
67     case REALSXP:
68       DO_REP(REALSXP, double, REAL);
69       break;
70     case LGLSXP:
71       DO_REP(LGLSXP, int, LOGICAL);
72       break;
73     case CPLXSXP:
74       DO_REP(CPLXSXP, Rcomplex, COMPLEX);
75       break;
76     case RAWSXP:
77       DO_REP(RAWSXP, Rbyte, RAW);
78       break;
79     case STRSXP: {
80       int counter = 0;
81       for (int i = 0; i < n; ++i) {
82         for (int j = 0; j < xn; ++j) {
83           SET_STRING_ELT(output, counter, STRING_ELT(x, j));
84           ++counter;
85         }
86       }
87       break;
88     }
89     case VECSXP: {
90       int counter = 0;
91       for (int i = 0; i < n; ++i) {
92         for (int j = 0; j < xn; ++j) {
93           SET_VECTOR_ELT(output, counter, VECTOR_ELT(x, j));
94           ++counter;
95         }
96       }
97       break;
98     }
99     default: {
100                cpp11::stop("Unhandled RTYPE in '%s'", var_name.c_str());
101       return R_NilValue;
102     }
103   }
104 
105   Rf_copyMostAttrib(x, output);
106   return output;
107 }
108 
109 // Optimized factor routine for the case where we want to make
110 // a factor from a vector of names -- used for generating the
111 // 'variable' column in the melted data.frame
make_variable_column_factor(cpp11::strings x,int nrow)112 cpp11::integers make_variable_column_factor(cpp11::strings x, int nrow) {
113   cpp11::writable::integers output(x.size() * nrow);
114 
115   int idx = 0;
116   for (int i = 0; i < x.size(); ++i)
117     for (int j = 0; j < nrow; ++j)
118       output[idx++] = i + 1;
119 
120   output.attr("levels") = x;
121   output.attr("class") = "factor";
122   return output;
123 }
124 
make_variable_column_character(cpp11::strings x,int nrow)125 cpp11::strings make_variable_column_character(cpp11::strings x, int nrow) {
126   cpp11::writable::strings output(x.size() * nrow);
127 
128   int idx = 0;
129   for (int i = 0; i < x.size(); ++i)
130     for (int j = 0; j < nrow; ++j)
131       output[idx++] = x[i];
132 
133   return output;
134 }
135 
136 // Concatenate vectors for the 'value' column
137 #define DO_CONCATENATE(CTYPE)                                \
138   {                                                          \
139     memcpy((char*)DATAPTR(output) + i* nrow * sizeof(CTYPE), \
140            (char*)DATAPTR(tmp),                              \
141            nrow * sizeof(CTYPE));                            \
142     break;                                                   \
143   }
144 
concatenate(const cpp11::data_frame & x,cpp11::integers ind,bool factorsAsStrings)145 SEXP concatenate(const cpp11::data_frame& x, cpp11::integers ind, bool factorsAsStrings) {
146 
147   int nrow = x.nrow();
148   int n_ind = ind.size();
149 
150   // We coerce up to the 'max type' if necessary, using the fact
151   // that R's SEXPTYPEs are also ordered in terms of 'precision'
152   // Note: we convert factors to characters if necessary
153   int max_type = 0;
154   int ctype = 0;
155   for (int i = 0; i < n_ind; ++i) {
156 
157     if (Rf_isFactor(x[ind[i]]) and factorsAsStrings) {
158       ctype = STRSXP;
159     } else {
160       ctype = TYPEOF(x[ind[i]]);
161     }
162     max_type = ctype > max_type ? ctype : max_type;
163   }
164 
165   debug(printf("Max type of value variables is %s\n", Rf_type2char(max_type)));
166 
167   cpp11::sexp tmp;
168   cpp11::sexp output(Rf_allocVector(max_type, nrow * n_ind));
169   for (int i = 0; i < n_ind; ++i) {
170     SEXP col = x[ind[i]];
171 
172     if (Rf_inherits(col, "POSIXlt")) {
173       cpp11::stop("Column %i is a POSIXlt. Please convert to POSIXct.", i + 1);
174     }
175 
176     // a 'tmp' pointer to the current column being iterated over, or
177     // a coerced version if necessary
178     if (TYPEOF(col) == max_type) {
179       tmp = col;
180     } else if (Rf_isFactor(col) and factorsAsStrings) {
181       tmp = Rf_asCharacterFactor(col);
182     } else {
183       tmp = Rf_coerceVector(col, max_type);
184     }
185 
186     switch (max_type) {
187       case INTSXP:
188         DO_CONCATENATE(int);
189       case REALSXP:
190         DO_CONCATENATE(double);
191       case LGLSXP:
192         DO_CONCATENATE(int);
193       case CPLXSXP:
194         DO_CONCATENATE(Rcomplex);
195       case STRSXP: {
196         for (int j = 0; j < nrow; ++j) {
197           SET_STRING_ELT(output, i * nrow + j, STRING_ELT(tmp, j));
198         }
199         break;
200       }
201       case VECSXP: {
202         for (int j = 0; j < nrow; ++j) {
203           SET_VECTOR_ELT(output, i * nrow + j, VECTOR_ELT(tmp, j));
204         }
205         break;
206       }
207     default:
208                    cpp11::stop("All columns be atomic vectors or lists (not %s)", Rf_type2char(max_type));
209     }
210   }
211 
212   return output;
213 }
214 
215 [[cpp11::register]]
melt_dataframe(cpp11::data_frame data,const cpp11::integers & id_ind,const cpp11::integers & measure_ind,cpp11::strings variable_name,cpp11::strings value_name,cpp11::sexp attrTemplate,bool factorsAsStrings,bool valueAsFactor,bool variableAsFactor)216 cpp11::list melt_dataframe(cpp11::data_frame data,
217                     const cpp11::integers& id_ind,
218                     const cpp11::integers& measure_ind,
219                     cpp11::strings variable_name,
220                     cpp11::strings value_name,
221                     cpp11::sexp attrTemplate,
222                     bool factorsAsStrings,
223                     bool valueAsFactor,
224                     bool variableAsFactor) {
225 
226   int nrow = data.nrow();
227 
228   cpp11::strings data_names(data.attr("names"));
229 
230   int n_id = id_ind.size();
231   debug(Rprintf("n_id == %i\n", n_id));
232 
233   int n_measure = measure_ind.size();
234   debug(Rprintf("n_measure == %i\n", n_measure));
235 
236   // Don't melt if the value variables are non-atomic
237   for (int i = 0; i < n_measure; ++i) {
238     if (!Rf_isVector(data[measure_ind[i]]) || Rf_inherits(data[measure_ind[i]], "data.frame")) {
239       cpp11::stop("All columns must be atomic vectors or lists. Problem with column %i.", measure_ind[i] + 1);
240     }
241   }
242 
243   // The output should be a data.frame with:
244   // number of columns == number of id vars + 'variable' + 'value',
245   // with number of rows == data.nrow() * number of value vars
246   cpp11::writable::list output(n_id + 2);
247 
248   // First, allocate the ID variables
249   // we repeat each ID vector n_measure times
250   for (int i = 0; i < n_id; ++i) {
251     SEXP object = data[id_ind[i]];
252     std::string var_name = std::string(data_names[id_ind[i]]);
253     output[i] = rep_(object, n_measure, var_name);
254   }
255 
256   // Now, we assign the 'variable' and 'value' columns
257 
258   // 'variable' is made up of repeating the names of the 'measure' variables,
259   // each nrow times. We want this to be a factor as well.
260   cpp11::writable::strings id_names(n_measure);
261   for (int i = 0; i < n_measure; ++i) {
262     id_names[i] = data_names[measure_ind[i]];
263   }
264   if (variableAsFactor) {
265     output[n_id] = make_variable_column_factor(id_names, nrow);
266   } else {
267     output[n_id] = make_variable_column_character(id_names, nrow);
268   }
269 
270   // 'value' is made by concatenating each of the 'value' variables
271   output[n_id + 1] = concatenate(data, measure_ind, factorsAsStrings);
272   if (!Rf_isNull(attrTemplate)) {
273     Rf_copyMostAttrib(attrTemplate, output[n_id + 1]);
274   }
275 
276   // Make the List more data.frame like
277 
278   // Set the row names
279   output.attr("row.names") = {NA_INTEGER, -(nrow * n_measure)};
280 
281   // Set the names
282   cpp11::writable::strings out_names(n_id + 2);
283   for (int i = 0; i < n_id; ++i) {
284     out_names[i] = data_names[id_ind[i]];
285   }
286   out_names[n_id] = variable_name[0];
287   out_names[n_id + 1] = value_name[0];
288   output.attr("names") = out_names;
289 
290   // Set the class
291   output.attr("class") = "data.frame";
292 
293   return output;
294 }
295