1 #include "cpp11/list.hpp"
2 #include "cpp11/protect.hpp"
3 
4 #include "Tokenizer.h"
5 #include "TokenizerFwf.h"
6 #include "utils.h"
7 
8 #include "Source.h"
9 
10 #include <sstream>
11 #include <utility>
12 
13 struct skip_t {
14   SourceIterator begin;
15   int lines;
16 };
17 
skip_comments(SourceIterator begin,SourceIterator end,const std::string & comment="")18 skip_t skip_comments(
19     SourceIterator begin, SourceIterator end, const std::string& comment = "") {
20   skip_t out;
21   if (comment.length() == 0) {
22     out.begin = begin;
23     out.lines = 0;
24     return out;
25   }
26 
27   SourceIterator cur = begin;
28   int skip = 0;
29   while (starts_with_comment(cur, end, comment)) {
30     // Skip rest of line
31     while (cur != end && *cur != '\n' && *cur != '\r') {
32       ++cur;
33     }
34 
35     advanceForLF(&cur, end);
36     ++cur;
37     ++skip;
38   }
39 
40   out.begin = cur;
41   out.lines = skip;
42   return out;
43 }
44 
45 std::vector<bool>
emptyCols_(SourceIterator begin,SourceIterator end,size_t n=100)46 emptyCols_(SourceIterator begin, SourceIterator end, size_t n = 100) {
47 
48   std::vector<bool> is_white;
49 
50   size_t row = 0;
51 
52   size_t col = 0;
53   for (SourceIterator cur = begin; cur != end; ++cur) {
54     if (row > n) {
55       break;
56     }
57 
58     switch (*cur) {
59     case '\n':
60     case '\r':
61       advanceForLF(&cur, end);
62       col = 0;
63       row++;
64       break;
65     case ' ':
66       col++;
67       break;
68     default:
69       // Make sure there's enough room
70       if (col >= is_white.size()) {
71         is_white.resize(col + 1, true);
72       }
73       is_white[col] = false;
74       col++;
75     }
76   }
77 
78   return is_white;
79 }
80 
81 [[cpp11::register]] cpp11::list
whitespaceColumns(const cpp11::list & sourceSpec,int n,std::string comment)82 whitespaceColumns(const cpp11::list& sourceSpec, int n, std::string comment) {
83   SourcePtr source = Source::create(sourceSpec);
84 
85   skip_t s = skip_comments(source->begin(), source->end(), std::move(comment));
86 
87   std::vector<bool> empty = emptyCols_(s.begin, source->end(), n);
88   std::vector<int> begin;
89 
90   std::vector<int> end;
91 
92   bool in_col = false;
93 
94   for (size_t i = 0; i < empty.size(); ++i) {
95     if (in_col && empty[i]) {
96       end.push_back(i);
97       in_col = false;
98     } else if (!in_col && !empty[i]) {
99       begin.push_back(i);
100       in_col = true;
101     }
102   }
103 
104   if (in_col) {
105     end.push_back(empty.size());
106   }
107 
108   using namespace cpp11::literals;
109   return cpp11::writable::list(
110       {"begin"_nm = begin, "end"_nm = end, "skip"_nm = s.lines});
111 }
112 
113 // TokenizerFwf --------------------------------------------------------------
114 
115 #include "TokenizerFwf.h"
116 
TokenizerFwf(const std::vector<int> & beginOffset,const std::vector<int> & endOffset,std::vector<std::string> NA,const std::string & comment,bool trimWS,bool skipEmptyRows)117 TokenizerFwf::TokenizerFwf(
118     const std::vector<int>& beginOffset,
119     const std::vector<int>& endOffset,
120     std::vector<std::string> NA,
121     const std::string& comment,
122     bool trimWS,
123     bool skipEmptyRows)
124     : beginOffset_(beginOffset),
125       endOffset_(endOffset),
126       NA_(std::move(NA)),
127       cols_(beginOffset.size()),
128       comment_(comment),
129       moreTokens_(false),
130       hasComment_(!comment.empty()),
131       trimWS_(trimWS),
132       skipEmptyRows_(skipEmptyRows) {
133   if (beginOffset_.size() != endOffset_.size()) {
134     cpp11::stop(
135         "Begin (%i) and end (%i) specifications must have equal length",
136         beginOffset_.size(),
137         endOffset_.size());
138   }
139 
140   if (beginOffset_.empty()) {
141     cpp11::stop("Zero-length begin and end specifications not supported");
142   }
143 
144   // File is assumed to be ragged (last column can have variable width)
145   // when the last element of endOffset_ is NA
146   isRagged_ = endOffset_[endOffset_.size() - 1L] == NA_INTEGER;
147 
148   max_ = 0;
149   for (int j = 0; j < (cols_ - static_cast<int>(isRagged_)); ++j) {
150     if (endOffset_[j] <= beginOffset_[j]) {
151       cpp11::stop(
152           "Begin offset (%i) must be smaller than end offset (%i)",
153           beginOffset_[j],
154           endOffset_[j]);
155     }
156 
157     if (beginOffset_[j] < 0) {
158       cpp11::stop("Begin offset (%i) must be greater than 0", beginOffset_[j]);
159     }
160 
161     if (endOffset_[j] < 0) {
162       cpp11::stop("End offset (%i) must be greater than 0", endOffset_[j]);
163     }
164 
165     if (endOffset_[j] > max_) {
166       max_ = endOffset_[j];
167     }
168   }
169 }
170 
tokenize(SourceIterator begin,SourceIterator end)171 void TokenizerFwf::tokenize(SourceIterator begin, SourceIterator end) {
172   cur_ = begin;
173   curLine_ = begin;
174 
175   begin_ = begin;
176   end_ = end;
177 
178   row_ = 0;
179   col_ = 0;
180   moreTokens_ = true;
181 }
182 
progress()183 std::pair<double, size_t> TokenizerFwf::progress() {
184   size_t bytes = cur_ - begin_;
185   return std::make_pair(bytes / (double)(end_ - begin_), bytes);
186 }
187 
nextToken()188 Token TokenizerFwf::nextToken() {
189   if (!moreTokens_) {
190     return {TOKEN_EOF, 0, 0};
191   }
192 
193   // Check for comments only at start of line
194   while (cur_ != end_ && col_ == 0 &&
195          (isComment(cur_) || (isEmpty() && skipEmptyRows_))) {
196     // Skip rest of line
197     while (cur_ != end_ && *cur_ != '\n' && *cur_ != '\r') {
198       ++cur_;
199     }
200     advanceForLF(&cur_, end_);
201     if (cur_ != end_) {
202       ++cur_;
203     }
204     curLine_ = cur_;
205   }
206 
207   // Find start of field
208   SourceIterator fieldBegin = cur_;
209 findBeginning:
210   int skip = beginOffset_[col_] - (cur_ - curLine_);
211   if (skip < 0) { // overlapping case
212     fieldBegin += skip;
213   } else if (skip > 0) { // skipped columns case
214     for (int i = 0; i < skip; ++i) {
215       if (fieldBegin == end_) {
216         break;
217       }
218 
219       if (*fieldBegin == '\n' || *fieldBegin == '\r') {
220         std::stringstream ss1;
221         ss1 << skip << " chars betwen fields";
222         std::stringstream ss2;
223         ss2 << skip << " chars until end of line";
224         warn(row_, col_, ss1.str(), ss2.str());
225 
226         row_++;
227         col_ = 0;
228 
229         advanceForLF(&fieldBegin, end_);
230         if (fieldBegin != end_) {
231           fieldBegin++;
232         }
233         cur_ = curLine_ = fieldBegin;
234         goto findBeginning;
235       }
236       fieldBegin++;
237     }
238   }
239 
240   if (fieldBegin == end_) {
241     // need to warn here if col != 0/cols - 1
242     moreTokens_ = false;
243     return {TOKEN_EOF, 0, 0};
244   }
245 
246   // Find end of field
247   SourceIterator fieldEnd = fieldBegin;
248   bool lastCol = (col_ == cols_ - 1);
249 
250   bool tooShort = false;
251 
252   bool hasNull = false;
253 
254   if (lastCol && isRagged_) {
255     // Last column is ragged, so read until end of line (ignoring width)
256     while (fieldEnd != end_ && *fieldEnd != '\r' && *fieldEnd != '\n') {
257       if (*fieldEnd == '\0') {
258         hasNull = true;
259       }
260       fieldEnd++;
261     }
262   } else {
263     int width = endOffset_[col_] - beginOffset_[col_];
264     // Find the end of the field, stopping for newlines
265     for (int i = 0; i < width; ++i) {
266       if (fieldEnd == end_ || *fieldEnd == '\n' || *fieldEnd == '\r') {
267         if (!(col_ == 0 && !skipEmptyRows_)) {
268           std::stringstream ss1;
269           ss1 << i << " chars";
270           std::stringstream ss2;
271           ss2 << i;
272           warn(row_, col_, ss1.str(), ss2.str());
273         }
274 
275         tooShort = true;
276         break;
277       }
278       if (*fieldEnd == '\0') {
279         hasNull = true;
280       }
281 
282       fieldEnd++;
283     }
284   }
285 
286   Token t = fieldToken(fieldBegin, fieldEnd, hasNull);
287 
288   if (lastCol || tooShort) {
289     row_++;
290     col_ = 0;
291 
292     if (!(tooShort || isRagged_)) {
293       // Proceed to the end of the line when you are possibly not there.
294       // This is needed in case the last column in the file is not being read.
295       while (fieldEnd != end_ && *fieldEnd != '\r' && *fieldEnd != '\n') {
296         fieldEnd++;
297       }
298     }
299 
300     curLine_ = fieldEnd;
301     advanceForLF(&curLine_, end_);
302     if (curLine_ != end_) {
303       curLine_++;
304     }
305     cur_ = curLine_;
306   } else {
307     col_++;
308     cur_ = fieldEnd;
309   }
310 
311   return t;
312 }
313 
fieldToken(SourceIterator begin,SourceIterator end,bool hasNull)314 Token TokenizerFwf::fieldToken(
315     SourceIterator begin, SourceIterator end, bool hasNull) {
316   if (begin == end) {
317     return {TOKEN_MISSING, row_, col_};
318   }
319 
320   Token t = Token(begin, end, row_, col_, hasNull);
321   if (trimWS_) {
322     t.trim();
323   }
324   t.flagNA(NA_);
325 
326   return t;
327 }
328 
isComment(const char * cur) const329 bool TokenizerFwf::isComment(const char* cur) const {
330   if (!hasComment_) {
331     return false;
332   }
333 
334   return starts_with_comment(cur, end_, comment_);
335 }
336 
isEmpty() const337 bool TokenizerFwf::isEmpty() const {
338   return cur_ == end_ || *cur_ == '\r' || *cur_ == '\n';
339 }
340