1 #include "cpp11/R.hpp"
2 #include "cpp11/integers.hpp"
3 #include "cpp11/list.hpp"
4 #include "cpp11/sexp.hpp"
5 #include <memory>
6
7 #include "Collector.h"
8 #include "LocaleInfo.h"
9 #include "Source.h"
10 #include "Tokenizer.h"
11 #include "TokenizerLine.h"
12 #include "Warnings.h"
13
14 [[cpp11::register]] cpp11::integers
dim_tokens_(const cpp11::list & sourceSpec,const cpp11::list & tokenizerSpec)15 dim_tokens_(const cpp11::list& sourceSpec, const cpp11::list& tokenizerSpec) {
16 SourcePtr source = Source::create(sourceSpec);
17 TokenizerPtr tokenizer = Tokenizer::create(tokenizerSpec);
18 tokenizer->tokenize(source->begin(), source->end());
19
20 int rows = -1;
21
22 int cols = -1;
23
24 for (Token t = tokenizer->nextToken(); t.type() != TOKEN_EOF;
25 t = tokenizer->nextToken()) {
26 rows = t.row();
27
28 if ((int)t.col() > cols) {
29 cols = t.col();
30 }
31 }
32
33 cpp11::writable::integers out(rows + 1);
34 for (auto&& x : out) {
35 x = cols + 1;
36 }
37 return out;
38 }
39
count_fields_(const cpp11::list & sourceSpec,const cpp11::list & tokenizerSpec,int n_max)40 [[cpp11::register]] std::vector<int> count_fields_(
41 const cpp11::list& sourceSpec,
42 const cpp11::list& tokenizerSpec,
43 int n_max) {
44 SourcePtr source = Source::create(sourceSpec);
45 TokenizerPtr tokenizer = Tokenizer::create(tokenizerSpec);
46 tokenizer->tokenize(source->begin(), source->end());
47
48 std::vector<int> fields;
49
50 for (Token t = tokenizer->nextToken(); t.type() != TOKEN_EOF;
51 t = tokenizer->nextToken()) {
52 if (n_max > 0 && t.row() >= (size_t)n_max) {
53 break;
54 }
55
56 if (t.row() >= fields.size()) {
57 fields.resize(t.row() + 1);
58 }
59
60 fields[t.row()] = t.col() + 1;
61 }
62
63 return fields;
64 }
65
// Reads the first row of tokens from the source and returns it as a header.
//
// Only tokens sharing the row index of the very first token are consumed.
// String tokens are stored into a character collector at their column index;
// tokens of any other type leave their slot as `resize` created it.
//
// Returns a list with two named elements:
//   - "header": the collector's vector, sized to the largest column index
//               seen plus one (so length 1 when no token was read);
//   - "skip":   `source->skippedRows() + 1`.
[[cpp11::register]] cpp11::list guess_header_(
    const cpp11::list& sourceSpec,
    const cpp11::list& tokenizerSpec,
    const cpp11::list& locale_) {
  Warnings warnings;
  LocaleInfo locale(locale_);
  SourcePtr source = Source::create(sourceSpec);
  TokenizerPtr tokenizer = Tokenizer::create(tokenizerSpec);
  tokenizer->tokenize(source->begin(), source->end());
  tokenizer->setWarnings(&warnings);

  CollectorCharacter out(&locale.encoder_);
  out.setWarnings(&warnings);

  Token t = tokenizer->nextToken();
  const size_t headerRow = t.row();

  size_t lastCol = 0;   // largest column index seen so far
  size_t reserved = 0;  // current size of the collector

  while (t.type() != TOKEN_EOF && t.row() == headerRow) {
    const size_t col = t.col();
    if (col >= lastCol) {
      lastCol = col;
    }

    // Grow geometrically so the collector is not resized for every column.
    if (lastCol >= reserved) {
      reserved = (lastCol + 1) * 2;
      out.resize(reserved);
    }

    if (t.type() == TOKEN_STRING) {
      out.setValue(col, t);
    }

    t = tokenizer->nextToken();
  }

  // Trim the over-allocation back to the actual number of columns.
  out.resize(lastCol + 1);

  using namespace cpp11::literals;
  return cpp11::writable::list(
      {"header"_nm = out.vector(), "skip"_nm = source->skippedRows() + 1});
}
107
tokenize_(const cpp11::list & sourceSpec,const cpp11::list & tokenizerSpec,int n_max)108 [[cpp11::register]] SEXP tokenize_(
109 const cpp11::list& sourceSpec,
110 const cpp11::list& tokenizerSpec,
111 int n_max) {
112 Warnings warnings;
113
114 SourcePtr source = Source::create(sourceSpec);
115 TokenizerPtr tokenizer = Tokenizer::create(tokenizerSpec);
116 tokenizer->tokenize(source->begin(), source->end());
117 tokenizer->setWarnings(&warnings);
118
119 std::vector<std::vector<std::string>> rows;
120
121 for (Token t = tokenizer->nextToken(); t.type() != TOKEN_EOF;
122 t = tokenizer->nextToken()) {
123 if (n_max > 0 && t.row() >= (size_t)n_max) {
124 break;
125 }
126
127 if (t.row() >= rows.size()) {
128 rows.resize(t.row() + 1);
129 }
130
131 std::vector<std::string>& row = rows[t.row()];
132 if (t.col() >= row.size()) {
133 row.resize(t.col() + 1);
134 }
135
136 row[t.col()] = t.asString();
137 }
138
139 cpp11::writable::list out;
140 out.reserve(rows.size());
141
142 for (auto&& row : rows) {
143 cpp11::sexp row_data(cpp11::as_sexp(row));
144 out.push_back(row_data);
145 }
146
147 return warnings.addAsAttribute(out);
148 }
149
parse_vector_(const cpp11::strings & x,const cpp11::list & collectorSpec,const cpp11::list & locale_,const std::vector<std::string> & na,bool trim_ws)150 [[cpp11::register]] SEXP parse_vector_(
151 const cpp11::strings& x,
152 const cpp11::list& collectorSpec,
153 const cpp11::list& locale_,
154 const std::vector<std::string>& na,
155 bool trim_ws) {
156 Warnings warnings;
157 int n = x.size();
158
159 LocaleInfo locale(locale_);
160
161 std::shared_ptr<Collector> col(Collector::create(collectorSpec, &locale));
162 col->setWarnings(&warnings);
163 col->resize(n);
164
165 for (int i = 0; i < n; ++i) {
166 Token t;
167 if (x[i] == NA_STRING) {
168 t = Token(TOKEN_MISSING, i, -1);
169 } else {
170 SEXP string = x[i];
171 t = Token(CHAR(string), CHAR(string) + Rf_length(string), i, -1, false);
172 if (trim_ws) {
173 t.trim();
174 }
175 t.flagNA(na);
176 }
177 col->setValue(i, t);
178 }
179
180 return warnings.addAsAttribute(static_cast<SEXP>(col->vector()));
181 }
182