1 #include "cpp11/list.hpp"
2
3 #include "Collector.h"
4 #include "LocaleInfo.h"
5 #include "QiParsers.h"
6 #include "utils.h"
7
create(const cpp11::list & spec,LocaleInfo * pLocale)8 CollectorPtr Collector::create(const cpp11::list& spec, LocaleInfo* pLocale) {
9 std::string subclass(cpp11::as_cpp<cpp11::strings>(spec.attr("class"))[0]);
10
11 if (subclass == "collector_skip") {
12 return CollectorPtr(new CollectorSkip());
13 }
14 if (subclass == "collector_logical") {
15 return CollectorPtr(new CollectorLogical());
16 }
17 if (subclass == "collector_integer") {
18 return CollectorPtr(new CollectorInteger());
19 }
20 if (subclass == "collector_double") {
21 return CollectorPtr(new CollectorDouble(pLocale->decimalMark_));
22 }
23 if (subclass == "collector_number") {
24 return CollectorPtr(
25 new CollectorNumeric(pLocale->decimalMark_, pLocale->groupingMark_));
26 }
27 if (subclass == "collector_character") {
28 return CollectorPtr(new CollectorCharacter(&pLocale->encoder_));
29 }
30 if (subclass == "collector_date") {
31 SEXP format_ = spec["format"];
32 std::string format = (Rf_isNull(format_)) != 0U
33 ? pLocale->dateFormat_
34 : cpp11::as_cpp<std::string>(format_);
35 return CollectorPtr(new CollectorDate(pLocale, format));
36 }
37 if (subclass == "collector_datetime") {
38 std::string format = cpp11::as_cpp<std::string>(spec["format"]);
39 return CollectorPtr(new CollectorDateTime(pLocale, format));
40 }
41 if (subclass == "collector_time") {
42 std::string format = cpp11::as_cpp<std::string>(spec["format"]);
43 return CollectorPtr(new CollectorTime(pLocale, format));
44 }
45 if (subclass == "collector_factor") {
46 cpp11::sexp levels(spec["levels"]);
47 bool ordered = cpp11::as_cpp<bool>(spec["ordered"]);
48 bool includeNa = cpp11::as_cpp<bool>(spec["include_na"]);
49 return CollectorPtr(
50 new CollectorFactor(&pLocale->encoder_, levels, ordered, includeNa));
51 }
52
53 cpp11::stop("Unsupported column type");
54 return CollectorPtr(new CollectorSkip());
55 }
56
57 std::vector<CollectorPtr>
collectorsCreate(const cpp11::list & specs,LocaleInfo * pLocale)58 collectorsCreate(const cpp11::list& specs, LocaleInfo* pLocale) {
59 std::vector<CollectorPtr> collectors;
60 for (auto spec : specs) {
61 CollectorPtr col(Collector::create(SEXP(spec), pLocale));
62 collectors.push_back(col);
63 }
64
65 return collectors;
66 }
67
68 // Implementations ------------------------------------------------------------
69
setValue(int i,const Token & t)70 void CollectorCharacter::setValue(int i, const Token& t) {
71 switch (t.type()) {
72 case TOKEN_STRING: {
73 std::string buffer;
74 SourceIterators string = t.getString(&buffer);
75
76 if (t.hasNull()) {
77 warn(t.row(), t.col(), "", "embedded null");
78 }
79
80 SET_STRING_ELT(
81 column_,
82 i,
83 pEncoder_->makeSEXP(string.first, string.second, t.hasNull()));
84 break;
85 };
86 case TOKEN_MISSING:
87 SET_STRING_ELT(column_, i, NA_STRING);
88 break;
89 case TOKEN_EMPTY:
90 SET_STRING_ELT(column_, i, Rf_mkCharCE("", CE_UTF8));
91 break;
92 case TOKEN_EOF:
93 cpp11::stop("Invalid token");
94 }
95 }
96
setValue(int i,const std::string & s)97 void CollectorCharacter::setValue(int i, const std::string& s) {
98 SET_STRING_ELT(column_, i, Rf_mkCharCE(s.c_str(), CE_UTF8));
99 }
100
setValue(int i,const Token & t)101 void CollectorDate::setValue(int i, const Token& t) {
102 switch (t.type()) {
103 case TOKEN_STRING: {
104 std::string buffer;
105 SourceIterators string = t.getString(&buffer);
106 std::string std_string(string.first, string.second);
107
108 parser_.setDate(std_string.c_str());
109 bool res =
110 (format_.empty()) ? parser_.parseLocaleDate() : parser_.parse(format_);
111
112 if (!res) {
113 warn(t.row(), t.col(), "date like " + format_, std_string);
114 REAL(column_)[i] = NA_REAL;
115 return;
116 }
117
118 DateTime dt = parser_.makeDate();
119 if (!dt.validDate()) {
120 warn(t.row(), t.col(), "valid date", std_string);
121 REAL(column_)[i] = NA_REAL;
122 return;
123 }
124 REAL(column_)[i] = dt.date();
125 return;
126 }
127 case TOKEN_MISSING:
128 case TOKEN_EMPTY:
129 REAL(column_)[i] = NA_REAL;
130 return;
131 case TOKEN_EOF:
132 cpp11::stop("Invalid token");
133 }
134 }
135
setValue(int i,const Token & t)136 void CollectorDateTime::setValue(int i, const Token& t) {
137 switch (t.type()) {
138 case TOKEN_STRING: {
139 std::string buffer;
140 SourceIterators string = t.getString(&buffer);
141 std::string std_string(string.first, string.second);
142
143 parser_.setDate(std_string.c_str());
144 bool res =
145 (format_.empty()) ? parser_.parseISO8601() : parser_.parse(format_);
146
147 if (!res) {
148 warn(t.row(), t.col(), "date like " + format_, std_string);
149 REAL(column_)[i] = NA_REAL;
150 return;
151 }
152
153 DateTime dt = parser_.makeDateTime();
154 if (!dt.validDateTime()) {
155 warn(t.row(), t.col(), "valid date", std_string);
156 REAL(column_)[i] = NA_REAL;
157 return;
158 }
159
160 REAL(column_)[i] = dt.datetime();
161 return;
162 }
163 case TOKEN_MISSING:
164 case TOKEN_EMPTY:
165 REAL(column_)[i] = NA_REAL;
166 return;
167 case TOKEN_EOF:
168 cpp11::stop("Invalid token");
169 }
170 }
171
setValue(int i,const Token & t)172 void CollectorDouble::setValue(int i, const Token& t) {
173 switch (t.type()) {
174 case TOKEN_STRING: {
175 std::string buffer;
176 SourceIterators str = t.getString(&buffer);
177
178 const char* end = str.second;
179 bool ok =
180 parseDouble(decimalMark_, str.first, str.second, REAL(column_)[i]);
181 if (!ok) {
182 REAL(column_)[i] = NA_REAL;
183 SourceIterators org_str = t.getString(&buffer);
184 warn(t.row(), t.col(), "a double", org_str);
185 return;
186 }
187
188 if (str.second != end) {
189 REAL(column_)[i] = NA_REAL;
190 SourceIterators org_str = t.getString(&buffer);
191 warn(t.row(), t.col(), "no trailing characters", org_str);
192 return;
193 }
194
195 return;
196 }
197 case TOKEN_MISSING:
198 case TOKEN_EMPTY:
199 REAL(column_)[i] = NA_REAL;
200 break;
201 case TOKEN_EOF:
202 cpp11::stop("Invalid token");
203 }
204 }
205
setValue(int i,size_t st)206 void CollectorDouble::setValue(int i, size_t st) { REAL(column_)[i] = st; }
207
insert(int i,const cpp11::r_string & str,const Token & t)208 void CollectorFactor::insert(
209 int i, const cpp11::r_string& str, const Token& t) {
210 auto it = levelset_.find(str);
211 if (it == levelset_.end()) {
212 if (implicitLevels_ || (includeNa_ && str == NA_STRING)) {
213 int n = levelset_.size();
214 levelset_.insert(std::make_pair(str, n));
215 levels_.push_back(str);
216 INTEGER(column_)[i] = n + 1;
217 } else {
218 warn(t.row(), t.col(), "value in level set", str);
219 INTEGER(column_)[i] = NA_INTEGER;
220 }
221 } else {
222 INTEGER(column_)[i] = it->second + 1;
223 }
224 }
225
setValue(int i,const Token & t)226 void CollectorFactor::setValue(int i, const Token& t) {
227
228 switch (t.type()) {
229 case TOKEN_EMPTY:
230 case TOKEN_STRING: {
231 std::string buffer;
232 SourceIterators string = t.getString(&buffer);
233
234 cpp11::r_string std_string(
235 pEncoder_->makeSEXP(string.first, string.second, t.hasNull()));
236 insert(i, std_string, t);
237 return;
238 };
239 case TOKEN_MISSING:
240 if (includeNa_) {
241 insert(i, NA_STRING, t);
242 } else {
243 INTEGER(column_)[i] = NA_INTEGER;
244 }
245 return;
246 case TOKEN_EOF:
247 cpp11::stop("Invalid token");
248 }
249 }
250
setValue(int i,const Token & t)251 void CollectorInteger::setValue(int i, const Token& t) {
252
253 switch (t.type()) {
254 case TOKEN_STRING: {
255 std::string buffer;
256 SourceIterators str = t.getString(&buffer);
257
258 bool ok = parseInt(str.first, str.second, INTEGER(column_)[i]);
259 if (!ok) {
260 INTEGER(column_)[i] = NA_INTEGER;
261 SourceIterators org_str = t.getString(&buffer);
262 warn(t.row(), t.col(), "an integer", org_str);
263 return;
264 }
265
266 if (str.first != str.second) {
267 SourceIterators org_str = t.getString(&buffer);
268 warn(t.row(), t.col(), "no trailing characters", org_str);
269 INTEGER(column_)[i] = NA_INTEGER;
270 return;
271 }
272
273 return;
274 };
275 case TOKEN_MISSING:
276 case TOKEN_EMPTY:
277 INTEGER(column_)[i] = NA_INTEGER;
278 break;
279 case TOKEN_EOF:
280 cpp11::stop("Invalid token");
281 }
282 }
283
setValue(int i,const Token & t)284 void CollectorLogical::setValue(int i, const Token& t) {
285
286 switch (t.type()) {
287 case TOKEN_STRING: {
288 std::string buffer;
289 SourceIterators string = t.getString(&buffer);
290 std::string str(string.first, string.second);
291 size_t len = string.second - string.first;
292
293 if (isTrue(string.first, string.second) ||
294 (len == 1 && *string.first == '1')) {
295 LOGICAL(column_)[i] = 1;
296 return;
297 }
298 if (isFalse(string.first, string.second) ||
299 (len == 1 && *string.first == '0')) {
300 LOGICAL(column_)[i] = 0;
301 return;
302 }
303
304 warn(t.row(), t.col(), "1/0/T/F/TRUE/FALSE", string);
305 LOGICAL(column_)[i] = NA_LOGICAL;
306 return;
307 };
308 case TOKEN_MISSING:
309 case TOKEN_EMPTY:
310 LOGICAL(column_)[i] = NA_LOGICAL;
311 return;
312 break;
313 case TOKEN_EOF:
314 cpp11::stop("Invalid token");
315 }
316 }
317
setValue(int i,const Token & t)318 void CollectorNumeric::setValue(int i, const Token& t) {
319 switch (t.type()) {
320 case TOKEN_STRING: {
321 std::string buffer;
322 SourceIterators str = t.getString(&buffer);
323
324 bool ok = parseNumber(
325 decimalMark_, groupingMark_, str.first, str.second, REAL(column_)[i]);
326
327 if (!ok) {
328 SourceIterators org_str = t.getString(&buffer);
329 REAL(column_)[i] = NA_REAL;
330 warn(t.row(), t.col(), "a number", org_str);
331 return;
332 }
333
334 break;
335 }
336 case TOKEN_MISSING:
337 case TOKEN_EMPTY:
338 REAL(column_)[i] = NA_REAL;
339 break;
340 case TOKEN_EOF:
341 cpp11::stop("Invalid token");
342 }
343 }
344
setValue(int i,const Token & t)345 void CollectorTime::setValue(int i, const Token& t) {
346 switch (t.type()) {
347 case TOKEN_STRING: {
348 std::string buffer;
349 SourceIterators string = t.getString(&buffer);
350 std::string std_string(string.first, string.second);
351
352 parser_.setDate(std_string.c_str());
353 bool res =
354 (format_.empty()) ? parser_.parseLocaleTime() : parser_.parse(format_);
355
356 if (!res) {
357 warn(t.row(), t.col(), "time like " + format_, std_string);
358 REAL(column_)[i] = NA_REAL;
359 return;
360 }
361
362 DateTime dt = parser_.makeTime();
363 if (!dt.validDuration()) {
364 warn(t.row(), t.col(), "valid duration", std_string);
365 REAL(column_)[i] = NA_REAL;
366 return;
367 }
368 REAL(column_)[i] = dt.time();
369 return;
370 }
371 case TOKEN_MISSING:
372 case TOKEN_EMPTY:
373 REAL(column_)[i] = NA_REAL;
374 return;
375 case TOKEN_EOF:
376 cpp11::stop("Invalid token");
377 }
378 }
379
setValue(int i,const Token & t)380 void CollectorRaw::setValue(int i, const Token& t) {
381 if (t.type() == TOKEN_EOF) {
382 cpp11::stop("Invalid token");
383 }
384 SET_VECTOR_ELT(column_, i, t.asRaw());
385 }
386