1 #include <cpp11/list.hpp>
2 #include <cpp11/sexp.hpp>
3 #include <cpp11/strings.hpp>
4 #include <utility>
5
6 #include "LocaleInfo.h"
7 #include "columns.h"
8 #include "fixed_width_index.h"
9 #include "unicode_fopen.h"
10
// Entry point for reading fixed-width inputs.
//
// Builds a vroom::index_collection over `inputs` from the column boundaries
// (`col_starts` / `col_ends`) and the line-handling options, then forwards it
// with the column specification arguments to create_columns() and returns the
// list that call produces.
[[cpp11::register]] cpp11::list vroom_fwf_(
    const cpp11::list& inputs,
    const std::vector<int>& col_starts,
    const std::vector<int>& col_ends,
    bool trim_ws,
    cpp11::sexp col_names,
    cpp11::sexp col_types,
    cpp11::sexp col_select,
    cpp11::sexp name_repair,
    size_t skip,
    const char* comment,
    bool skip_empty_rows,
    ptrdiff_t n_max,
    SEXP id,
    const cpp11::strings& na,
    const cpp11::list& locale,
    ptrdiff_t guess_max,
    size_t num_threads,
    size_t altrep,
    bool progress) {

  // Filenames are only collected when the caller supplied a non-NULL `id`.
  const bool wants_filenames = !Rf_isNull(id);

  // This has to happen before the index below is built: connection objects
  // are invalid once they have been read, so their names must be captured
  // up front.
  std::vector<std::string> filenames;
  if (wants_filenames) {
    filenames = get_filenames(inputs);
  }

  auto fwf_index = std::make_shared<vroom::index_collection>(
      inputs,
      col_starts,
      col_ends,
      trim_ws,
      skip,
      comment,
      skip_empty_rows,
      n_max,
      progress);

  // NOTE(review): the shared_ptr itself lives on the heap and is passed on as
  // a raw pointer; presumably create_columns() takes ownership of it --
  // confirm against its definition before changing this.
  auto error_store = new std::shared_ptr<vroom_errors>(new vroom_errors());

  return create_columns(
      fwf_index,
      std::move(col_names),
      std::move(col_types),
      std::move(col_select),
      std::move(name_repair),
      id,
      filenames,
      na,
      locale,
      altrep,
      guess_max,
      error_store,
      num_threads);
}
70
// Scans the text in [begin, end) and reports, per character column, whether
// that column held only whitespace on every scanned line.
//
// begin, end: iterators over the characters to inspect.
// n:          maximum number of lines to scan; a value <= 0 scans everything.
//
// Returns a vector indexed by 0-based column position. An entry is `true`
// when only spaces / carriage returns (or nothing) were seen at that column
// and `false` when at least one scanned line had another character there.
// The vector only extends to the right-most column in which a non-whitespace
// character was found, so it is empty when no such character exists.
template <typename Iterator>
std::vector<bool> find_empty_cols(Iterator begin, Iterator end, ptrdiff_t n) {

  std::vector<bool> is_white;

  const bool limited = n > 0;
  const size_t max_rows = limited ? static_cast<size_t>(n) : 0;

  size_t row = 0;
  size_t col = 0;
  for (Iterator cur = begin; cur != end; ++cur) {
    // `row` is the number of lines already fully consumed, so scanning must
    // stop as soon as it reaches the limit (using `>` here would read one
    // line more than requested).
    if (limited && row >= max_rows) {
      break;
    }

    switch (*cur) {
    case '\n':
      col = 0;
      row++;
      break;
    case '\r':
    case ' ':
      // Whitespace never grows the vector; columns skipped over this way are
      // filled in as `true` by the resize below if a later column needs it.
      col++;
      break;
    default:
      // Make sure there's enough room
      if (col >= is_white.size()) {
        is_white.resize(col + 1, true);
      }
      is_white[col] = false;
      col++;
      break;
    }
  }

  return is_white;
}
102
whitespace_columns_(const std::string & filename,size_t skip,ptrdiff_t n,const std::string & comment)103 [[cpp11::register]] cpp11::list whitespace_columns_(
104 const std::string& filename,
105 size_t skip,
106 ptrdiff_t n,
107 const std::string& comment) {
108
109 std::error_code error;
110 auto mmap = make_mmap_source(filename.c_str(), error);
111 if (error) {
112 // We cannot actually portably compare error messages due to a bug in
113 // libstdc++ (https://stackoverflow.com/a/54316671/2055486), so just print
114 // the message on stderr return
115 REprintf("mapping error: %s", error.message().c_str());
116 return cpp11::list();
117 }
118
119 size_t s = find_first_line(
120 mmap,
121 skip,
122 comment.data(),
123 /* skip_empty_rows */ true,
124 /* embedded_nl */ false,
125 /* quote */ '\0');
126
127 std::vector<bool> empty = find_empty_cols(mmap.begin() + s, mmap.end(), n);
128 std::vector<int> begin, end;
129
130 bool in_col = false;
131
132 for (size_t i = 0; i < empty.size(); ++i) {
133 if (in_col && empty[i]) {
134 end.push_back(i);
135 in_col = false;
136 } else if (!in_col && !empty[i]) {
137 begin.push_back(i);
138 in_col = true;
139 }
140 }
141
142 if (in_col)
143 end.push_back(empty.size());
144
145 using namespace cpp11::literals;
146 return cpp11::writable::list({"begin"_nm = begin, "end"_nm = end});
147 }
148