1 #include "cpp11/list.hpp"
2 #include "cpp11/protect.hpp"
3
4 #include "Tokenizer.h"
5 #include "TokenizerFwf.h"
6 #include "utils.h"
7
8 #include "Source.h"
9
10 #include <sstream>
11 #include <utility>
12
// Result of skip_comments(): where the real data starts and how many
// leading comment lines were skipped to get there.
struct skip_t {
  SourceIterator begin; // first character after the skipped comment lines
  int lines;            // number of comment lines that were skipped
};
17
skip_comments(SourceIterator begin,SourceIterator end,const std::string & comment="")18 skip_t skip_comments(
19 SourceIterator begin, SourceIterator end, const std::string& comment = "") {
20 skip_t out;
21 if (comment.length() == 0) {
22 out.begin = begin;
23 out.lines = 0;
24 return out;
25 }
26
27 SourceIterator cur = begin;
28 int skip = 0;
29 while (starts_with_comment(cur, end, comment)) {
30 // Skip rest of line
31 while (cur != end && *cur != '\n' && *cur != '\r') {
32 ++cur;
33 }
34
35 advanceForLF(&cur, end);
36 ++cur;
37 ++skip;
38 }
39
40 out.begin = cur;
41 out.lines = skip;
42 return out;
43 }
44
45 std::vector<bool>
emptyCols_(SourceIterator begin,SourceIterator end,size_t n=100)46 emptyCols_(SourceIterator begin, SourceIterator end, size_t n = 100) {
47
48 std::vector<bool> is_white;
49
50 size_t row = 0;
51
52 size_t col = 0;
53 for (SourceIterator cur = begin; cur != end; ++cur) {
54 if (row > n) {
55 break;
56 }
57
58 switch (*cur) {
59 case '\n':
60 case '\r':
61 advanceForLF(&cur, end);
62 col = 0;
63 row++;
64 break;
65 case ' ':
66 col++;
67 break;
68 default:
69 // Make sure there's enough room
70 if (col >= is_white.size()) {
71 is_white.resize(col + 1, true);
72 }
73 is_white[col] = false;
74 col++;
75 }
76 }
77
78 return is_white;
79 }
80
81 [[cpp11::register]] cpp11::list
whitespaceColumns(const cpp11::list & sourceSpec,int n,std::string comment)82 whitespaceColumns(const cpp11::list& sourceSpec, int n, std::string comment) {
83 SourcePtr source = Source::create(sourceSpec);
84
85 skip_t s = skip_comments(source->begin(), source->end(), std::move(comment));
86
87 std::vector<bool> empty = emptyCols_(s.begin, source->end(), n);
88 std::vector<int> begin;
89
90 std::vector<int> end;
91
92 bool in_col = false;
93
94 for (size_t i = 0; i < empty.size(); ++i) {
95 if (in_col && empty[i]) {
96 end.push_back(i);
97 in_col = false;
98 } else if (!in_col && !empty[i]) {
99 begin.push_back(i);
100 in_col = true;
101 }
102 }
103
104 if (in_col) {
105 end.push_back(empty.size());
106 }
107
108 using namespace cpp11::literals;
109 return cpp11::writable::list(
110 {"begin"_nm = begin, "end"_nm = end, "skip"_nm = s.lines});
111 }
112
113 // TokenizerFwf --------------------------------------------------------------
114
115 #include "TokenizerFwf.h"
116
TokenizerFwf(const std::vector<int> & beginOffset,const std::vector<int> & endOffset,std::vector<std::string> NA,const std::string & comment,bool trimWS,bool skipEmptyRows)117 TokenizerFwf::TokenizerFwf(
118 const std::vector<int>& beginOffset,
119 const std::vector<int>& endOffset,
120 std::vector<std::string> NA,
121 const std::string& comment,
122 bool trimWS,
123 bool skipEmptyRows)
124 : beginOffset_(beginOffset),
125 endOffset_(endOffset),
126 NA_(std::move(NA)),
127 cols_(beginOffset.size()),
128 comment_(comment),
129 moreTokens_(false),
130 hasComment_(!comment.empty()),
131 trimWS_(trimWS),
132 skipEmptyRows_(skipEmptyRows) {
133 if (beginOffset_.size() != endOffset_.size()) {
134 cpp11::stop(
135 "Begin (%i) and end (%i) specifications must have equal length",
136 beginOffset_.size(),
137 endOffset_.size());
138 }
139
140 if (beginOffset_.empty()) {
141 cpp11::stop("Zero-length begin and end specifications not supported");
142 }
143
144 // File is assumed to be ragged (last column can have variable width)
145 // when the last element of endOffset_ is NA
146 isRagged_ = endOffset_[endOffset_.size() - 1L] == NA_INTEGER;
147
148 max_ = 0;
149 for (int j = 0; j < (cols_ - static_cast<int>(isRagged_)); ++j) {
150 if (endOffset_[j] <= beginOffset_[j]) {
151 cpp11::stop(
152 "Begin offset (%i) must be smaller than end offset (%i)",
153 beginOffset_[j],
154 endOffset_[j]);
155 }
156
157 if (beginOffset_[j] < 0) {
158 cpp11::stop("Begin offset (%i) must be greater than 0", beginOffset_[j]);
159 }
160
161 if (endOffset_[j] < 0) {
162 cpp11::stop("End offset (%i) must be greater than 0", endOffset_[j]);
163 }
164
165 if (endOffset_[j] > max_) {
166 max_ = endOffset_[j];
167 }
168 }
169 }
170
tokenize(SourceIterator begin,SourceIterator end)171 void TokenizerFwf::tokenize(SourceIterator begin, SourceIterator end) {
172 cur_ = begin;
173 curLine_ = begin;
174
175 begin_ = begin;
176 end_ = end;
177
178 row_ = 0;
179 col_ = 0;
180 moreTokens_ = true;
181 }
182
progress()183 std::pair<double, size_t> TokenizerFwf::progress() {
184 size_t bytes = cur_ - begin_;
185 return std::make_pair(bytes / (double)(end_ - begin_), bytes);
186 }
187
nextToken()188 Token TokenizerFwf::nextToken() {
189 if (!moreTokens_) {
190 return {TOKEN_EOF, 0, 0};
191 }
192
193 // Check for comments only at start of line
194 while (cur_ != end_ && col_ == 0 &&
195 (isComment(cur_) || (isEmpty() && skipEmptyRows_))) {
196 // Skip rest of line
197 while (cur_ != end_ && *cur_ != '\n' && *cur_ != '\r') {
198 ++cur_;
199 }
200 advanceForLF(&cur_, end_);
201 if (cur_ != end_) {
202 ++cur_;
203 }
204 curLine_ = cur_;
205 }
206
207 // Find start of field
208 SourceIterator fieldBegin = cur_;
209 findBeginning:
210 int skip = beginOffset_[col_] - (cur_ - curLine_);
211 if (skip < 0) { // overlapping case
212 fieldBegin += skip;
213 } else if (skip > 0) { // skipped columns case
214 for (int i = 0; i < skip; ++i) {
215 if (fieldBegin == end_) {
216 break;
217 }
218
219 if (*fieldBegin == '\n' || *fieldBegin == '\r') {
220 std::stringstream ss1;
221 ss1 << skip << " chars betwen fields";
222 std::stringstream ss2;
223 ss2 << skip << " chars until end of line";
224 warn(row_, col_, ss1.str(), ss2.str());
225
226 row_++;
227 col_ = 0;
228
229 advanceForLF(&fieldBegin, end_);
230 if (fieldBegin != end_) {
231 fieldBegin++;
232 }
233 cur_ = curLine_ = fieldBegin;
234 goto findBeginning;
235 }
236 fieldBegin++;
237 }
238 }
239
240 if (fieldBegin == end_) {
241 // need to warn here if col != 0/cols - 1
242 moreTokens_ = false;
243 return {TOKEN_EOF, 0, 0};
244 }
245
246 // Find end of field
247 SourceIterator fieldEnd = fieldBegin;
248 bool lastCol = (col_ == cols_ - 1);
249
250 bool tooShort = false;
251
252 bool hasNull = false;
253
254 if (lastCol && isRagged_) {
255 // Last column is ragged, so read until end of line (ignoring width)
256 while (fieldEnd != end_ && *fieldEnd != '\r' && *fieldEnd != '\n') {
257 if (*fieldEnd == '\0') {
258 hasNull = true;
259 }
260 fieldEnd++;
261 }
262 } else {
263 int width = endOffset_[col_] - beginOffset_[col_];
264 // Find the end of the field, stopping for newlines
265 for (int i = 0; i < width; ++i) {
266 if (fieldEnd == end_ || *fieldEnd == '\n' || *fieldEnd == '\r') {
267 if (!(col_ == 0 && !skipEmptyRows_)) {
268 std::stringstream ss1;
269 ss1 << i << " chars";
270 std::stringstream ss2;
271 ss2 << i;
272 warn(row_, col_, ss1.str(), ss2.str());
273 }
274
275 tooShort = true;
276 break;
277 }
278 if (*fieldEnd == '\0') {
279 hasNull = true;
280 }
281
282 fieldEnd++;
283 }
284 }
285
286 Token t = fieldToken(fieldBegin, fieldEnd, hasNull);
287
288 if (lastCol || tooShort) {
289 row_++;
290 col_ = 0;
291
292 if (!(tooShort || isRagged_)) {
293 // Proceed to the end of the line when you are possibly not there.
294 // This is needed in case the last column in the file is not being read.
295 while (fieldEnd != end_ && *fieldEnd != '\r' && *fieldEnd != '\n') {
296 fieldEnd++;
297 }
298 }
299
300 curLine_ = fieldEnd;
301 advanceForLF(&curLine_, end_);
302 if (curLine_ != end_) {
303 curLine_++;
304 }
305 cur_ = curLine_;
306 } else {
307 col_++;
308 cur_ = fieldEnd;
309 }
310
311 return t;
312 }
313
fieldToken(SourceIterator begin,SourceIterator end,bool hasNull)314 Token TokenizerFwf::fieldToken(
315 SourceIterator begin, SourceIterator end, bool hasNull) {
316 if (begin == end) {
317 return {TOKEN_MISSING, row_, col_};
318 }
319
320 Token t = Token(begin, end, row_, col_, hasNull);
321 if (trimWS_) {
322 t.trim();
323 }
324 t.flagNA(NA_);
325
326 return t;
327 }
328
isComment(const char * cur) const329 bool TokenizerFwf::isComment(const char* cur) const {
330 if (!hasComment_) {
331 return false;
332 }
333
334 return starts_with_comment(cur, end_, comment_);
335 }
336
isEmpty() const337 bool TokenizerFwf::isEmpty() const {
338 return cur_ == end_ || *cur_ == '\r' || *cur_ == '\n';
339 }
340