1 #include "cpp11/R.hpp"
2 #include "cpp11/integers.hpp"
3 #include "cpp11/list.hpp"
4 #include "cpp11/sexp.hpp"
5 #include <memory>
6 
7 #include "Collector.h"
8 #include "LocaleInfo.h"
9 #include "Source.h"
10 #include "Tokenizer.h"
11 #include "TokenizerLine.h"
12 #include "Warnings.h"
13 
14 [[cpp11::register]] cpp11::integers
dim_tokens_(const cpp11::list & sourceSpec,const cpp11::list & tokenizerSpec)15 dim_tokens_(const cpp11::list& sourceSpec, const cpp11::list& tokenizerSpec) {
16   SourcePtr source = Source::create(sourceSpec);
17   TokenizerPtr tokenizer = Tokenizer::create(tokenizerSpec);
18   tokenizer->tokenize(source->begin(), source->end());
19 
20   int rows = -1;
21 
22   int cols = -1;
23 
24   for (Token t = tokenizer->nextToken(); t.type() != TOKEN_EOF;
25        t = tokenizer->nextToken()) {
26     rows = t.row();
27 
28     if ((int)t.col() > cols) {
29       cols = t.col();
30     }
31   }
32 
33   cpp11::writable::integers out(rows + 1);
34   for (auto&& x : out) {
35     x = cols + 1;
36   }
37   return out;
38 }
39 
count_fields_(const cpp11::list & sourceSpec,const cpp11::list & tokenizerSpec,int n_max)40 [[cpp11::register]] std::vector<int> count_fields_(
41     const cpp11::list& sourceSpec,
42     const cpp11::list& tokenizerSpec,
43     int n_max) {
44   SourcePtr source = Source::create(sourceSpec);
45   TokenizerPtr tokenizer = Tokenizer::create(tokenizerSpec);
46   tokenizer->tokenize(source->begin(), source->end());
47 
48   std::vector<int> fields;
49 
50   for (Token t = tokenizer->nextToken(); t.type() != TOKEN_EOF;
51        t = tokenizer->nextToken()) {
52     if (n_max > 0 && t.row() >= (size_t)n_max) {
53       break;
54     }
55 
56     if (t.row() >= fields.size()) {
57       fields.resize(t.row() + 1);
58     }
59 
60     fields[t.row()] = t.col() + 1;
61   }
62 
63   return fields;
64 }
65 
// Reads the first row of tokens and returns it as a candidate header.
//
// Tokens are consumed while they share the row index of the very first
// token. Each TOKEN_STRING token is stored at its column index in a
// character collector; tokens of other types only contribute to the width.
//
// Returns a list with two named elements:
//   "header" - the collector's vector, sized to (largest column index + 1);
//              with no tokens at all this is still one element long.
//   "skip"   - source->skippedRows() + 1.
[[cpp11::register]] cpp11::list guess_header_(
    const cpp11::list& sourceSpec,
    const cpp11::list& tokenizerSpec,
    const cpp11::list& locale_) {
  Warnings warnings;
  LocaleInfo locale(locale_);
  SourcePtr input = Source::create(sourceSpec);
  TokenizerPtr lexer = Tokenizer::create(tokenizerSpec);
  lexer->tokenize(input->begin(), input->end());
  lexer->setWarnings(&warnings);

  CollectorCharacter header(&locale.encoder_);
  header.setWarnings(&warnings);

  Token tok = lexer->nextToken();
  const size_t headerRow = tok.row();

  size_t lastCol = 0;    // largest column index seen so far
  size_t allocated = 0;  // current size of the collector

  while (tok.type() != TOKEN_EOF && tok.row() == headerRow) {
    const auto col = tok.col();
    if (col > lastCol) {
      lastCol = col;
    }

    // Over-allocate geometrically; the collector is trimmed after the loop.
    if (lastCol >= allocated) {
      allocated = (lastCol + 1) * 2;
      header.resize(allocated);
    }

    if (tok.type() == TOKEN_STRING) {
      header.setValue(col, tok);
    }

    tok = lexer->nextToken();
  }

  // Shrink to exactly the number of columns observed.
  header.resize(lastCol + 1);

  using namespace cpp11::literals;
  return cpp11::writable::list(
      {"header"_nm = header.vector(), "skip"_nm = input->skippedRows() + 1});
}
107 
tokenize_(const cpp11::list & sourceSpec,const cpp11::list & tokenizerSpec,int n_max)108 [[cpp11::register]] SEXP tokenize_(
109     const cpp11::list& sourceSpec,
110     const cpp11::list& tokenizerSpec,
111     int n_max) {
112   Warnings warnings;
113 
114   SourcePtr source = Source::create(sourceSpec);
115   TokenizerPtr tokenizer = Tokenizer::create(tokenizerSpec);
116   tokenizer->tokenize(source->begin(), source->end());
117   tokenizer->setWarnings(&warnings);
118 
119   std::vector<std::vector<std::string>> rows;
120 
121   for (Token t = tokenizer->nextToken(); t.type() != TOKEN_EOF;
122        t = tokenizer->nextToken()) {
123     if (n_max > 0 && t.row() >= (size_t)n_max) {
124       break;
125     }
126 
127     if (t.row() >= rows.size()) {
128       rows.resize(t.row() + 1);
129     }
130 
131     std::vector<std::string>& row = rows[t.row()];
132     if (t.col() >= row.size()) {
133       row.resize(t.col() + 1);
134     }
135 
136     row[t.col()] = t.asString();
137   }
138 
139   cpp11::writable::list out;
140   out.reserve(rows.size());
141 
142   for (auto&& row : rows) {
143     cpp11::sexp row_data(cpp11::as_sexp(row));
144     out.push_back(row_data);
145   }
146 
147   return warnings.addAsAttribute(out);
148 }
149 
parse_vector_(const cpp11::strings & x,const cpp11::list & collectorSpec,const cpp11::list & locale_,const std::vector<std::string> & na,bool trim_ws)150 [[cpp11::register]] SEXP parse_vector_(
151     const cpp11::strings& x,
152     const cpp11::list& collectorSpec,
153     const cpp11::list& locale_,
154     const std::vector<std::string>& na,
155     bool trim_ws) {
156   Warnings warnings;
157   int n = x.size();
158 
159   LocaleInfo locale(locale_);
160 
161   std::shared_ptr<Collector> col(Collector::create(collectorSpec, &locale));
162   col->setWarnings(&warnings);
163   col->resize(n);
164 
165   for (int i = 0; i < n; ++i) {
166     Token t;
167     if (x[i] == NA_STRING) {
168       t = Token(TOKEN_MISSING, i, -1);
169     } else {
170       SEXP string = x[i];
171       t = Token(CHAR(string), CHAR(string) + Rf_length(string), i, -1, false);
172       if (trim_ws) {
173         t.trim();
174       }
175       t.flagNA(na);
176     }
177     col->setValue(i, t);
178   }
179 
180   return warnings.addAsAttribute(static_cast<SEXP>(col->vector()));
181 }
182