1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 #include <cstdint>
19 #include <string>
20 #include <utility>
21 #include <vector>
22
23 #include <gtest/gtest.h>
24
25 #include "arrow/csv/options.h"
26 #include "arrow/csv/parser.h"
27 #include "arrow/csv/test_common.h"
28 #include "arrow/status.h"
29 #include "arrow/testing/gtest_util.h"
30
31 namespace arrow {
32 namespace csv {
33
CheckSkipRows(const std::string & rows,int32_t num_rows,int32_t expected_skipped_rows,int32_t expected_skipped_bytes)34 void CheckSkipRows(const std::string& rows, int32_t num_rows,
35 int32_t expected_skipped_rows, int32_t expected_skipped_bytes) {
36 const uint8_t* start = reinterpret_cast<const uint8_t*>(rows.data());
37 const uint8_t* data;
38 int32_t skipped_rows =
39 SkipRows(start, static_cast<int32_t>(rows.size()), num_rows, &data);
40 ASSERT_EQ(skipped_rows, expected_skipped_rows);
41 ASSERT_EQ(data - start, expected_skipped_bytes);
42 }
43
TEST(SkipRows, Basics) {
  // Arguments after the input are: rows requested, rows expected to be
  // skipped, bytes expected to be skipped.

  // Empty input
  const std::string empty;
  CheckSkipRows(empty, 0, 0, 0);
  CheckSkipRows(empty, 15, 0, 0);

  // LF line endings, last line unterminated
  const std::string lf_unterminated = "a\nb\nc\nd";
  CheckSkipRows(lf_unterminated, 1, 1, 2);
  CheckSkipRows(lf_unterminated, 2, 2, 4);
  CheckSkipRows(lf_unterminated, 3, 3, 6);
  CheckSkipRows(lf_unterminated, 4, 3, 6);

  // LF line endings, last line terminated
  const std::string lf_terminated = "a\nb\nc\nd\n";
  CheckSkipRows(lf_terminated, 3, 3, 6);
  CheckSkipRows(lf_terminated, 4, 4, 8);
  CheckSkipRows(lf_terminated, 5, 4, 8);

  // Tabs as line contents
  const std::string tab_lines = "\t\n\t\n\t\n\t";
  CheckSkipRows(tab_lines, 1, 1, 2);
  CheckSkipRows(tab_lines, 3, 3, 6);
  CheckSkipRows(tab_lines, 4, 3, 6);

  // Mixed CRLF / LF / CR line endings, ending with CRLF
  const std::string mixed_crlf_end = "a\r\nb\nc\rd\r\n";
  CheckSkipRows(mixed_crlf_end, 1, 1, 3);
  CheckSkipRows(mixed_crlf_end, 2, 2, 5);
  CheckSkipRows(mixed_crlf_end, 3, 3, 7);
  CheckSkipRows(mixed_crlf_end, 4, 4, 10);
  CheckSkipRows(mixed_crlf_end, 5, 4, 10);

  // Same, ending with a lone CR
  const std::string mixed_cr_end = "a\r\nb\nc\rd\r";
  CheckSkipRows(mixed_cr_end, 4, 4, 9);
  CheckSkipRows(mixed_cr_end, 5, 4, 9);

  // Same, with unterminated data after the lone CR
  const std::string mixed_cr_then_data = "a\r\nb\nc\rd\re";
  CheckSkipRows(mixed_cr_then_data, 4, 4, 9);
  CheckSkipRows(mixed_cr_then_data, 5, 4, 9);

  // Only line endings
  const std::string only_newlines = "\n\r\n\r\r\n\n\r\n\r";
  CheckSkipRows(only_newlines, 1, 1, 1);
  CheckSkipRows(only_newlines, 2, 2, 3);
  CheckSkipRows(only_newlines, 3, 3, 4);
  CheckSkipRows(only_newlines, 4, 4, 6);
  CheckSkipRows(only_newlines, 5, 5, 7);
  CheckSkipRows(only_newlines, 6, 6, 9);
  CheckSkipRows(only_newlines, 7, 7, 10);
  CheckSkipRows(only_newlines, 8, 7, 10);
}
81
82 ////////////////////////////////////////////////////////////////////////////
83 // BlockParser tests
84
85 // Read the column with the given index out of the BlockParser.
GetColumn(const BlockParser & parser,int32_t col_index,std::vector<std::string> * out,std::vector<bool> * out_quoted=nullptr)86 void GetColumn(const BlockParser& parser, int32_t col_index,
87 std::vector<std::string>* out, std::vector<bool>* out_quoted = nullptr) {
88 std::vector<std::string> values;
89 std::vector<bool> quoted_values;
90 auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
91 values.push_back(std::string(reinterpret_cast<const char*>(data), size));
92 if (out_quoted) {
93 quoted_values.push_back(quoted);
94 }
95 return Status::OK();
96 };
97 ASSERT_OK(parser.VisitColumn(col_index, visit));
98 *out = std::move(values);
99 if (out_quoted) {
100 *out_quoted = std::move(quoted_values);
101 }
102 }
103
GetLastRow(const BlockParser & parser,std::vector<std::string> * out,std::vector<bool> * out_quoted=nullptr)104 void GetLastRow(const BlockParser& parser, std::vector<std::string>* out,
105 std::vector<bool>* out_quoted = nullptr) {
106 std::vector<std::string> values;
107 std::vector<bool> quoted_values;
108 auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
109 values.push_back(std::string(reinterpret_cast<const char*>(data), size));
110 if (out_quoted) {
111 quoted_values.push_back(quoted);
112 }
113 return Status::OK();
114 };
115 ASSERT_OK(parser.VisitLastRow(visit));
116 *out = std::move(values);
117 if (out_quoted) {
118 *out_quoted = std::move(quoted_values);
119 }
120 }
121
TotalViewLength(const std::vector<util::string_view> & views)122 size_t TotalViewLength(const std::vector<util::string_view>& views) {
123 size_t total_view_length = 0;
124 for (const auto& view : views) {
125 total_view_length += view.length();
126 }
127 return total_view_length;
128 }
129
Parse(BlockParser & parser,const std::string & str,uint32_t * out_size)130 Status Parse(BlockParser& parser, const std::string& str, uint32_t* out_size) {
131 return parser.Parse(util::string_view(str), out_size);
132 }
133
ParseFinal(BlockParser & parser,const std::string & str,uint32_t * out_size)134 Status ParseFinal(BlockParser& parser, const std::string& str, uint32_t* out_size) {
135 return parser.ParseFinal(util::string_view(str), out_size);
136 }
137
AssertParseOk(BlockParser & parser,const std::string & str)138 void AssertParseOk(BlockParser& parser, const std::string& str) {
139 uint32_t parsed_size = static_cast<uint32_t>(-1);
140 ASSERT_OK(Parse(parser, str, &parsed_size));
141 ASSERT_EQ(parsed_size, str.size());
142 }
143
AssertParseOk(BlockParser & parser,const std::vector<util::string_view> & data)144 void AssertParseOk(BlockParser& parser, const std::vector<util::string_view>& data) {
145 uint32_t parsed_size = static_cast<uint32_t>(-1);
146 ASSERT_OK(parser.Parse(data, &parsed_size));
147 ASSERT_EQ(parsed_size, TotalViewLength(data));
148 }
149
AssertParseFinal(BlockParser & parser,const std::string & str)150 void AssertParseFinal(BlockParser& parser, const std::string& str) {
151 uint32_t parsed_size = static_cast<uint32_t>(-1);
152 ASSERT_OK(ParseFinal(parser, str, &parsed_size));
153 ASSERT_EQ(parsed_size, str.size());
154 }
155
AssertParseFinal(BlockParser & parser,const std::vector<util::string_view> & data)156 void AssertParseFinal(BlockParser& parser, const std::vector<util::string_view>& data) {
157 uint32_t parsed_size = static_cast<uint32_t>(-1);
158 ASSERT_OK(parser.ParseFinal(data, &parsed_size));
159 ASSERT_EQ(parsed_size, TotalViewLength(data));
160 }
161
AssertParsePartial(BlockParser & parser,const std::string & str,uint32_t expected_size)162 void AssertParsePartial(BlockParser& parser, const std::string& str,
163 uint32_t expected_size) {
164 uint32_t parsed_size = static_cast<uint32_t>(-1);
165 ASSERT_OK(Parse(parser, str, &parsed_size));
166 ASSERT_EQ(parsed_size, expected_size);
167 }
168
AssertLastRowEq(const BlockParser & parser,const std::vector<std::string> expected)169 void AssertLastRowEq(const BlockParser& parser, const std::vector<std::string> expected) {
170 std::vector<std::string> values;
171 GetLastRow(parser, &values);
172 ASSERT_EQ(parser.num_rows(), expected.size());
173 ASSERT_EQ(values, expected);
174 }
175
AssertLastRowEq(const BlockParser & parser,const std::vector<std::string> expected,const std::vector<bool> expected_quoted)176 void AssertLastRowEq(const BlockParser& parser, const std::vector<std::string> expected,
177 const std::vector<bool> expected_quoted) {
178 std::vector<std::string> values;
179 std::vector<bool> quoted;
180 GetLastRow(parser, &values, "ed);
181 ASSERT_EQ(parser.num_cols(), expected.size());
182 ASSERT_EQ(values, expected);
183 ASSERT_EQ(quoted, expected_quoted);
184 }
185
AssertColumnEq(const BlockParser & parser,int32_t col_index,const std::vector<std::string> expected)186 void AssertColumnEq(const BlockParser& parser, int32_t col_index,
187 const std::vector<std::string> expected) {
188 std::vector<std::string> values;
189 GetColumn(parser, col_index, &values);
190 ASSERT_EQ(parser.num_rows(), expected.size());
191 ASSERT_EQ(values, expected);
192 }
193
AssertColumnEq(const BlockParser & parser,int32_t col_index,const std::vector<std::string> expected,const std::vector<bool> expected_quoted)194 void AssertColumnEq(const BlockParser& parser, int32_t col_index,
195 const std::vector<std::string> expected,
196 const std::vector<bool> expected_quoted) {
197 std::vector<std::string> values;
198 std::vector<bool> quoted;
199 GetColumn(parser, col_index, &values, "ed);
200 ASSERT_EQ(parser.num_rows(), expected.size());
201 ASSERT_EQ(values, expected);
202 ASSERT_EQ(quoted, expected_quoted);
203 }
204
AssertColumnsEq(const BlockParser & parser,const std::vector<std::vector<std::string>> expected)205 void AssertColumnsEq(const BlockParser& parser,
206 const std::vector<std::vector<std::string>> expected) {
207 ASSERT_EQ(parser.num_cols(), expected.size());
208 for (int32_t col_index = 0; col_index < parser.num_cols(); ++col_index) {
209 AssertColumnEq(parser, col_index, expected[col_index]);
210 }
211 }
212
AssertColumnsEq(const BlockParser & parser,const std::vector<std::vector<std::string>> expected,const std::vector<std::vector<bool>> quoted)213 void AssertColumnsEq(const BlockParser& parser,
214 const std::vector<std::vector<std::string>> expected,
215 const std::vector<std::vector<bool>> quoted) {
216 ASSERT_EQ(parser.num_cols(), expected.size());
217 for (int32_t col_index = 0; col_index < parser.num_cols(); ++col_index) {
218 AssertColumnEq(parser, col_index, expected[col_index], quoted[col_index]);
219 }
220 uint32_t total_bytes = 0;
221 for (const auto& col : expected) {
222 for (const auto& field : col) {
223 total_bytes += static_cast<uint32_t>(field.size());
224 }
225 }
226 ASSERT_EQ(total_bytes, parser.num_bytes());
227 }
228
TEST(BlockParser, Basics) {
  // Three rows of three columns, passed as a single block
  {
    auto csv = MakeCSVData({"ab,cd,\n", "ef,,gh\n", ",ij,kl\n"});
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, csv);
    AssertColumnsEq(parser, {{"ab", "ef", ""}, {"cd", "", "ij"}, {"", "gh", "kl"}});
    AssertLastRowEq(parser, {"", "ij", "kl"}, {false, false, false});
  }
  // Same data, passed as two blocks in one call
  {
    auto csv1 = MakeCSVData({"ab,cd,\n", "ef,,gh\n"});
    auto csv2 = MakeCSVData({",ij,kl\n"});
    std::vector<util::string_view> csvs = {csv1, csv2};
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, csvs);
    AssertColumnsEq(parser, {{"ab", "ef", ""}, {"cd", "", "ij"}, {"", "gh", "kl"}});
    AssertLastRowEq(parser, {"", "ij", "kl"}, {false, false, false});
  }
}
247
TEST(BlockParser, EmptyHeader) {
  // Cannot infer number of columns
  uint32_t out_size;
  // Completely empty input
  {
    BlockParser parser(ParseOptions::Defaults());
    const auto empty_input = MakeCSVData({""});
    const Status st = ParseFinal(parser, empty_input, &out_size);
    ASSERT_RAISES(Invalid, st);
  }
  // Input made of a single newline
  {
    BlockParser parser(ParseOptions::Defaults());
    const auto newline_only = MakeCSVData({"\n"});
    const Status st = ParseFinal(parser, newline_only, &out_size);
    ASSERT_RAISES(Invalid, st);
  }
}
262
TEST(BlockParser, Empty) {
  // One row of two empty values
  {
    BlockParser parser(ParseOptions::Defaults());
    const auto one_row = MakeCSVData({",\n"});
    AssertParseOk(parser, one_row);
    AssertColumnsEq(parser, {{""}, {""}});
    AssertLastRowEq(parser, {"", ""}, {false, false});
  }
  // Two rows of two empty values
  {
    BlockParser parser(ParseOptions::Defaults());
    const auto two_rows = MakeCSVData({",\n,\n"});
    AssertParseOk(parser, two_rows);
    AssertColumnsEq(parser, {{"", ""}, {"", ""}});
    AssertLastRowEq(parser, {"", ""}, {false, false});
  }
}
279
TEST(BlockParser, Whitespace) {
  // Non-newline whitespace is preserved
  BlockParser parser(ParseOptions::Defaults());
  const auto input = MakeCSVData({"a b, cd, \n", " ef, \t,gh\n"});
  AssertParseOk(parser, input);
  AssertColumnsEq(parser, {{"a b", " ef"}, {" cd", " \t"}, {" ", "gh"}});
}
287
TEST(BlockParser, Newlines) {
  // Rows ending with LF, CRLF and lone CR all parse as separate rows
  BlockParser parser(ParseOptions::Defaults());
  const auto input = MakeCSVData({"a,b\n", "c,d\r\n", "e,f\r", "g,h\r"});

  AssertParseOk(parser, input);
  AssertColumnsEq(parser, {{"a", "c", "e", "g"}, {"b", "d", "f", "h"}});
}
295
TEST(BlockParser, MaxNumRows) {
  BlockParser parser(ParseOptions::Defaults(), -1, 3 /* max_num_rows */);
  const auto input = MakeCSVData({"a\n", "b\n", "c\n", "d\n"});

  // First call: parsing stops after 3 rows (6 bytes)
  AssertParsePartial(parser, input, 6);
  AssertColumnsEq(parser, {{"a", "b", "c"}});

  // Second call, on the remainder: the last row
  const auto remainder = input.substr(6);
  AssertParseOk(parser, remainder);
  AssertColumnsEq(parser, {{"d"}});

  // Third call, on what is left after 8 bytes: no rows
  const auto exhausted = input.substr(8);
  AssertParseOk(parser, exhausted);
  AssertColumnsEq(parser, {{}});
}
309
TEST(BlockParser, EmptyLinesWithOneColumn) {
  const auto input = MakeCSVData({"a\n", "\n", "b\r", "\r", "c\r\n", "\r\n", "d\n"});
  // With default options, empty lines produce no values
  {
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"a", "b", "c", "d"}});
  }
  // With ignore_empty_lines disabled, each empty line yields an empty value
  {
    auto keep_empty_lines = ParseOptions::Defaults();
    keep_empty_lines.ignore_empty_lines = false;
    BlockParser parser(keep_empty_lines);
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"a", "", "b", "", "c", "", "d"}});
  }
}
325
TEST(BlockParser, EmptyLinesWithSeveralColumns) {
  const auto input =
      MakeCSVData({"a,b\n", "\n", "c,d\r", "\r", "e,f\r\n", "\r\n", "g,h\n"});
  // With default options, empty lines produce no values
  {
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"a", "c", "e", "g"}, {"b", "d", "f", "h"}});
  }
  // Non-ignored empty lines get turned into empty values
  {
    auto keep_empty_lines = ParseOptions::Defaults();
    keep_empty_lines.ignore_empty_lines = false;
    BlockParser parser(keep_empty_lines);
    AssertParseOk(parser, input);
    AssertColumnsEq(parser,
                    {{"a", "", "c", "", "e", "", "g"}, {"b", "", "d", "", "f", "", "h"}});
  }
}
343
TEST(BlockParser, EmptyLineFirst) {
  // The input starts with two empty lines
  const auto input = MakeCSVData({"\n", "\n", "a\n", "b\n"});
  // With default options, the leading empty lines produce no values
  {
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"a", "b"}});
  }
  // With ignore_empty_lines disabled, they yield empty values
  {
    auto keep_empty_lines = ParseOptions::Defaults();
    keep_empty_lines.ignore_empty_lines = false;
    BlockParser parser(keep_empty_lines);
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"", "", "a", "b"}});
  }
}
359
TEST(BlockParser, TruncatedData) {
  // Dropping 1 to 3 trailing bytes leaves the second row incomplete:
  // only the first row (4 bytes) is parsed. The same parser is reused.
  const auto input = MakeCSVData({"a,b\n", "c,d\n"});
  BlockParser parser(ParseOptions::Defaults());
  for (int trim = 1; trim <= 3; ++trim) {
    const auto truncated = input.substr(0, input.length() - trim);
    AssertParsePartial(parser, truncated, 4);
    AssertColumnsEq(parser, {{"a"}, {"b"}});
  }
}
368
TEST(BlockParser, Final) {
  // Tests for ParseFinal(); one parser is reused for all inputs
  BlockParser parser(ParseOptions::Defaults());

  // Every row ends with a newline
  const auto all_terminated = MakeCSVData({"ab,cd\n", "ef,gh\n"});
  AssertParseFinal(parser, all_terminated);
  AssertColumnsEq(parser, {{"ab", "ef"}, {"cd", "gh"}});

  // Same without newline
  const auto last_unterminated = MakeCSVData({"ab,cd\n", "ef,gh"});
  AssertParseFinal(parser, last_unterminated);
  AssertColumnsEq(parser, {{"ab", "ef"}, {"cd", "gh"}});

  // Same with empty last item
  const auto empty_last_item = MakeCSVData({"ab,cd\n", "ef,"});
  AssertParseFinal(parser, empty_last_item);
  AssertColumnsEq(parser, {{"ab", "ef"}, {"cd", ""}});

  // Same with single line
  const auto single_line = MakeCSVData({"ab,cd"});
  AssertParseFinal(parser, single_line);
  AssertColumnsEq(parser, {{"ab"}, {"cd"}});

  // Two blocks
  const auto first_block = MakeCSVData({"ab,cd\n"});
  const auto second_block = MakeCSVData({"ef,"});
  AssertParseFinal(parser, {{first_block}, {second_block}});
  AssertColumnsEq(parser, {{"ab", "ef"}, {"cd", ""}});
}
397
TEST(BlockParser, FinalTruncatedData) {
  // Test ParseFinal() with truncated data: the last row stops after its
  // first field.
  BlockParser parser(ParseOptions::Defaults());
  const auto truncated = MakeCSVData({"ab,cd\n", "ef"});
  uint32_t out_size;
  ASSERT_RAISES(Invalid, ParseFinal(parser, truncated, &out_size));
}
406
TEST(BlockParser, QuotingSimple) {
  // The middle field is enclosed in double quotes and contains commas
  const auto input = MakeCSVData({"1,\",3,\",5\n"});

  // Default options: the quoted part is a single value
  {
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"1"}, {",3,"}, {"5"}},
                    {{false}, {true}, {false}} /* quoted */);
  }
  // Quoting disabled: the double quotes are ordinary characters
  {
    auto no_quoting = ParseOptions::Defaults();
    no_quoting.quoting = false;
    BlockParser parser(no_quoting);
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"1"}, {"\""}, {"3"}, {"\""}, {"5"}},
                    {{false}, {false}, {false}, {false}, {false}} /* quoted */);
  }
  // Another quote character: the double quotes are ordinary characters
  {
    auto other_quote_char = ParseOptions::Defaults();
    other_quote_char.quote_char = 'Z';
    BlockParser parser(other_quote_char);
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"1"}, {"\""}, {"3"}, {"\""}, {"5"}},
                    {{false}, {false}, {false}, {false}, {false}} /* quoted */);
  }
}
433
TEST(BlockParser, QuotingNewline) {
  // A newline inside a quoted field is part of the value
  BlockParser parser(ParseOptions::Defaults());
  const auto input = MakeCSVData({"a,\"c \n d\",e\n"});
  AssertParseOk(parser, input);
  AssertColumnsEq(parser, {{"a"}, {"c \n d"}, {"e"}},
                  {{false}, {true}, {false}} /* quoted */);
}
441
TEST(BlockParser, QuotingUnbalanced) {
  // Quote introduces a quoted field that doesn't end:
  // only the first row (4 bytes) is parsed
  BlockParser parser(ParseOptions::Defaults());
  const auto input = MakeCSVData({"a,b\n", "1,\",3,,5\n"});
  AssertParsePartial(parser, input, 4);
  AssertColumnsEq(parser, {{"a"}, {"b"}}, {{false}, {false}} /* quoted */);
}
449
TEST(BlockParser, QuotingEmpty) {
  // A single quoted empty value
  {
    const auto input = MakeCSVData({"\"\"\n"});
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{""}}, {{true}} /* quoted */);
    AssertLastRowEq(parser, {""}, {true});
  }
  // An unquoted empty value followed by a quoted one
  {
    const auto input = MakeCSVData({",\"\"\n"});
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{""}, {""}}, {{false}, {true}} /* quoted */);
    AssertLastRowEq(parser, {"", ""}, {false, true});
  }
  // A quoted empty value followed by an unquoted one
  {
    const auto input = MakeCSVData({"\"\",\n"});
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{""}, {""}}, {{true}, {false}} /* quoted */);
    AssertLastRowEq(parser, {"", ""}, {true, false});
  }
}
473
TEST(BlockParser, QuotingDouble) {
  // 4 quotes is a quoted quote (alone on the row)
  {
    const auto input = MakeCSVData({"\"\"\"\"\n"});
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"\""}}, {{true}} /* quoted */);
  }
  // 4 quotes is a quoted quote (between other values)
  {
    const auto input = MakeCSVData({"a,\"\"\"\",b\n"});
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"a"}, {"\""}, {"b"}},
                    {{false}, {true}, {false}} /* quoted */);
  }
  // 6 quotes is two quoted quotes (alone on the row)
  {
    const auto input = MakeCSVData({"\"\"\"\"\"\"\n"});
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"\"\""}}, {{true}} /* quoted */);
  }
  // 6 quotes is two quoted quotes (between other values)
  {
    const auto input = MakeCSVData({"a,\"\"\"\"\"\",b\n"});
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"a"}, {"\"\""}, {"b"}},
                    {{false}, {true}, {false}} /* quoted */);
  }
}
506
TEST(BlockParser, QuotesAndMore) {
  // There may be trailing data after the quoted part of a field
  {
    const auto input = MakeCSVData({"a,\"b\"c,d\n"});
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"a"}, {"bc"}, {"d"}},
                    {{false}, {true}, {false}} /* quoted */);
  }
}
517
TEST(BlockParser, QuotesSpecial) {
  // Some non-trivial cases

  // A quote in the middle of an unquoted field is kept as-is
  {
    const auto input = MakeCSVData({"a,b\"c,d\n"});
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"a"}, {"b\"c"}, {"d"}},
                    {{false}, {false}, {false}} /* quoted */);
  }
  // Quotes appearing after the quoted part of a field are kept as-is
  {
    const auto input = MakeCSVData({"a,\"b\" \"c\",d\n"});
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"a"}, {"b \"c\""}, {"d"}},
                    {{false}, {true}, {false}} /* quoted */);
  }
}
535
TEST(BlockParser, MismatchingNumColumns) {
  uint32_t out_size;
  // Second row has fewer values than the first
  {
    const auto input = MakeCSVData({"a,b\nc\n"});
    BlockParser parser(ParseOptions::Defaults());
    ASSERT_RAISES(Invalid, Parse(parser, input, &out_size));
  }
  // Row has fewer values than the explicit number of columns
  {
    const auto input = MakeCSVData({"a\n"});
    BlockParser parser(ParseOptions::Defaults(), 2 /* num_cols */);
    ASSERT_RAISES(Invalid, Parse(parser, input, &out_size));
  }
  // Row has more values than the explicit number of columns
  {
    const auto input = MakeCSVData({"a,b,c\n"});
    BlockParser parser(ParseOptions::Defaults(), 2 /* num_cols */);
    ASSERT_RAISES(Invalid, Parse(parser, input, &out_size));
  }
}
557
TEST(BlockParser, Escaping) {
  auto with_escaping = ParseOptions::Defaults();
  with_escaping.escaping = true;

  // Backslash before an ordinary character
  {
    const auto input = MakeCSVData({"a\\b,c\n"});
    // Default options: the backslash is kept
    {
      BlockParser parser(ParseOptions::Defaults());
      AssertParseOk(parser, input);
      AssertColumnsEq(parser, {{"a\\b"}, {"c"}});
    }
    // Escaping enabled: the backslash is dropped
    {
      BlockParser parser(with_escaping);
      AssertParseOk(parser, input);
      AssertColumnsEq(parser, {{"ab"}, {"c"}});
    }
  }
  // Escaping enabled, backslash before a comma: the comma is part of the value
  {
    const auto input = MakeCSVData({"a\\,b,c\n"});
    BlockParser parser(with_escaping);
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"a,b"}, {"c"}});
  }
}
582
583 // Generate test data with the given number of columns.
MakeLotsOfCsvColumns(int32_t num_columns)584 std::string MakeLotsOfCsvColumns(int32_t num_columns) {
585 std::string values, header;
586 header.reserve(num_columns * 10);
587 values.reserve(num_columns * 10);
588 for (int x = 0; x < num_columns; x++) {
589 if (x != 0) {
590 header += ",";
591 values += ",";
592 }
593 header += "c" + std::to_string(x);
594 values += std::to_string(x);
595 }
596
597 header += "\n";
598 values += "\n";
599 return MakeCSVData({header, values});
600 }
601
TEST(BlockParser, LotsOfColumns) {
  // Data with 1024 * 100 columns must parse entirely
  const auto input = MakeLotsOfCsvColumns(1024 * 100);
  BlockParser parser(ParseOptions::Defaults());
  AssertParseOk(parser, input);
}
607
TEST(BlockParser, QuotedEscape) {
  auto with_escaping = ParseOptions::Defaults();
  with_escaping.escaping = true;

  // Escaped comma inside a quoted field
  {
    const auto input = MakeCSVData({"\"a\\,b\",c\n"});
    BlockParser parser(with_escaping);
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"a,b"}, {"c"}}, {{true}, {false}} /* quoted */);
  }
  // Escaped quote inside a quoted field
  {
    const auto input = MakeCSVData({"\"a\\\"b\",c\n"});
    BlockParser parser(with_escaping);
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"a\"b"}, {"c"}}, {{true}, {false}} /* quoted */);
  }
}
625
626 } // namespace csv
627 } // namespace arrow
628