1 #include <cpp11/list.hpp>
2 #include <cpp11/sexp.hpp>
3 #include <cpp11/strings.hpp>
4 #include <utility>
5 
6 #include "LocaleInfo.h"
7 #include "columns.h"
8 #include "fixed_width_index.h"
9 #include "unicode_fopen.h"
10 
vroom_fwf_(const cpp11::list & inputs,const std::vector<int> & col_starts,const std::vector<int> & col_ends,bool trim_ws,cpp11::sexp col_names,cpp11::sexp col_types,cpp11::sexp col_select,cpp11::sexp name_repair,size_t skip,const char * comment,bool skip_empty_rows,ptrdiff_t n_max,SEXP id,const cpp11::strings & na,const cpp11::list & locale,ptrdiff_t guess_max,size_t num_threads,size_t altrep,bool progress)11 [[cpp11::register]] cpp11::list vroom_fwf_(
12     const cpp11::list& inputs,
13     const std::vector<int>& col_starts,
14     const std::vector<int>& col_ends,
15     bool trim_ws,
16     cpp11::sexp col_names,
17     cpp11::sexp col_types,
18     cpp11::sexp col_select,
19     cpp11::sexp name_repair,
20     size_t skip,
21     const char* comment,
22     bool skip_empty_rows,
23     ptrdiff_t n_max,
24     SEXP id,
25     const cpp11::strings& na,
26     const cpp11::list& locale,
27     ptrdiff_t guess_max,
28     size_t num_threads,
29     size_t altrep,
30     bool progress) {
31 
32   std::vector<std::string> filenames;
33 
34   bool add_filename = !Rf_isNull(id);
35 
36   // We need to retrieve filenames now before the connection objects are read,
37   // as they are invalid afterwards.
38   if (add_filename) {
39     filenames = get_filenames(inputs);
40   }
41 
42   auto idx = std::make_shared<vroom::index_collection>(
43       inputs,
44       col_starts,
45       col_ends,
46       trim_ws,
47       skip,
48       comment,
49       skip_empty_rows,
50       n_max,
51       progress);
52 
53   auto errors = new std::shared_ptr<vroom_errors>(new vroom_errors());
54 
55   return create_columns(
56       idx,
57       std::move(col_names),
58       std::move(col_types),
59       std::move(col_select),
60       std::move(name_repair),
61       id,
62       filenames,
63       na,
64       locale,
65       altrep,
66       guess_max,
67       errors,
68       num_threads);
69 }
70 
71 template <typename Iterator>
find_empty_cols(Iterator begin,Iterator end,ptrdiff_t n)72 std::vector<bool> find_empty_cols(Iterator begin, Iterator end, ptrdiff_t n) {
73 
74   std::vector<bool> is_white;
75 
76   size_t row = 0, col = 0;
77   for (Iterator cur = begin; cur != end; ++cur) {
78     if (n > 0 && row > static_cast<size_t>(n)) {
79       break;
80     }
81 
82     switch (*cur) {
83     case '\n':
84       col = 0;
85       row++;
86       break;
87     case '\r':
88     case ' ':
89       col++;
90       break;
91     default:
92       // Make sure there's enough room
93       if (col >= is_white.size())
94         is_white.resize(col + 1, true);
95       is_white[col] = false;
96       col++;
97     }
98   }
99 
100   return is_white;
101 }
102 
whitespace_columns_(const std::string & filename,size_t skip,ptrdiff_t n,const std::string & comment)103 [[cpp11::register]] cpp11::list whitespace_columns_(
104     const std::string& filename,
105     size_t skip,
106     ptrdiff_t n,
107     const std::string& comment) {
108 
109   std::error_code error;
110   auto mmap = make_mmap_source(filename.c_str(), error);
111   if (error) {
112     // We cannot actually portably compare error messages due to a bug in
113     // libstdc++ (https://stackoverflow.com/a/54316671/2055486), so just print
114     // the message on stderr return
115     REprintf("mapping error: %s", error.message().c_str());
116     return cpp11::list();
117   }
118 
119   size_t s = find_first_line(
120       mmap,
121       skip,
122       comment.data(),
123       /* skip_empty_rows */ true,
124       /* embedded_nl */ false,
125       /* quote */ '\0');
126 
127   std::vector<bool> empty = find_empty_cols(mmap.begin() + s, mmap.end(), n);
128   std::vector<int> begin, end;
129 
130   bool in_col = false;
131 
132   for (size_t i = 0; i < empty.size(); ++i) {
133     if (in_col && empty[i]) {
134       end.push_back(i);
135       in_col = false;
136     } else if (!in_col && !empty[i]) {
137       begin.push_back(i);
138       in_col = true;
139     }
140   }
141 
142   if (in_col)
143     end.push_back(empty.size());
144 
145   using namespace cpp11::literals;
146   return cpp11::writable::list({"begin"_nm = begin, "end"_nm = end});
147 }
148