1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #include <cstdint>
19 #include <string>
20 #include <utility>
21 #include <vector>
22 
23 #include <gtest/gtest.h>
24 
25 #include "arrow/csv/options.h"
26 #include "arrow/csv/parser.h"
27 #include "arrow/csv/test_common.h"
28 #include "arrow/status.h"
29 #include "arrow/testing/gtest_util.h"
30 
31 namespace arrow {
32 namespace csv {
33 
CheckSkipRows(const std::string & rows,int32_t num_rows,int32_t expected_skipped_rows,int32_t expected_skipped_bytes)34 void CheckSkipRows(const std::string& rows, int32_t num_rows,
35                    int32_t expected_skipped_rows, int32_t expected_skipped_bytes) {
36   const uint8_t* start = reinterpret_cast<const uint8_t*>(rows.data());
37   const uint8_t* data;
38   int32_t skipped_rows =
39       SkipRows(start, static_cast<int32_t>(rows.size()), num_rows, &data);
40   ASSERT_EQ(skipped_rows, expected_skipped_rows);
41   ASSERT_EQ(data - start, expected_skipped_bytes);
42 }
43 
TEST(SkipRows, Basics) {
  // Empty input: no row can be skipped, whatever count is requested
  const std::string no_data;
  CheckSkipRows(no_data, 0, 0, 0);
  CheckSkipRows(no_data, 15, 0, 0);

  // LF line endings; the final "d" has no line ending and is never skipped
  const std::string lf_open = "a\nb\nc\nd";
  CheckSkipRows(lf_open, 1, 1, 2);
  CheckSkipRows(lf_open, 2, 2, 4);
  CheckSkipRows(lf_open, 3, 3, 6);
  CheckSkipRows(lf_open, 4, 3, 6);

  // Same rows with a trailing LF: all four rows can be skipped
  const std::string lf_closed = "a\nb\nc\nd\n";
  CheckSkipRows(lf_closed, 3, 3, 6);
  CheckSkipRows(lf_closed, 4, 4, 8);
  CheckSkipRows(lf_closed, 5, 4, 8);

  // Rows made of a single tab
  const std::string tabs = "\t\n\t\n\t\n\t";
  CheckSkipRows(tabs, 1, 1, 2);
  CheckSkipRows(tabs, 3, 3, 6);
  CheckSkipRows(tabs, 4, 3, 6);

  // Mixed CRLF / LF / CR line endings
  const std::string mixed = "a\r\nb\nc\rd\r\n";
  CheckSkipRows(mixed, 1, 1, 3);
  CheckSkipRows(mixed, 2, 2, 5);
  CheckSkipRows(mixed, 3, 3, 7);
  CheckSkipRows(mixed, 4, 4, 10);
  CheckSkipRows(mixed, 5, 4, 10);

  // Input ending with a lone CR
  const std::string mixed_cr_end = "a\r\nb\nc\rd\r";
  CheckSkipRows(mixed_cr_end, 4, 4, 9);
  CheckSkipRows(mixed_cr_end, 5, 4, 9);
  // Lone CR followed by an unterminated row
  const std::string mixed_cr_then_data = "a\r\nb\nc\rd\re";
  CheckSkipRows(mixed_cr_then_data, 4, 4, 9);
  CheckSkipRows(mixed_cr_then_data, 5, 4, 9);

  // Only line endings: LF, CRLF, CR, CRLF, LF, CRLF, CR
  const std::string only_eols = "\n\r\n\r\r\n\n\r\n\r";
  CheckSkipRows(only_eols, 1, 1, 1);
  CheckSkipRows(only_eols, 2, 2, 3);
  CheckSkipRows(only_eols, 3, 3, 4);
  CheckSkipRows(only_eols, 4, 4, 6);
  CheckSkipRows(only_eols, 5, 5, 7);
  CheckSkipRows(only_eols, 6, 6, 9);
  CheckSkipRows(only_eols, 7, 7, 10);
  CheckSkipRows(only_eols, 8, 7, 10);
}
81 
82 ////////////////////////////////////////////////////////////////////////////
83 // BlockParser tests
84 
85 // Read the column with the given index out of the BlockParser.
GetColumn(const BlockParser & parser,int32_t col_index,std::vector<std::string> * out,std::vector<bool> * out_quoted=nullptr)86 void GetColumn(const BlockParser& parser, int32_t col_index,
87                std::vector<std::string>* out, std::vector<bool>* out_quoted = nullptr) {
88   std::vector<std::string> values;
89   std::vector<bool> quoted_values;
90   auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
91     values.push_back(std::string(reinterpret_cast<const char*>(data), size));
92     if (out_quoted) {
93       quoted_values.push_back(quoted);
94     }
95     return Status::OK();
96   };
97   ASSERT_OK(parser.VisitColumn(col_index, visit));
98   *out = std::move(values);
99   if (out_quoted) {
100     *out_quoted = std::move(quoted_values);
101   }
102 }
103 
GetLastRow(const BlockParser & parser,std::vector<std::string> * out,std::vector<bool> * out_quoted=nullptr)104 void GetLastRow(const BlockParser& parser, std::vector<std::string>* out,
105                 std::vector<bool>* out_quoted = nullptr) {
106   std::vector<std::string> values;
107   std::vector<bool> quoted_values;
108   auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
109     values.push_back(std::string(reinterpret_cast<const char*>(data), size));
110     if (out_quoted) {
111       quoted_values.push_back(quoted);
112     }
113     return Status::OK();
114   };
115   ASSERT_OK(parser.VisitLastRow(visit));
116   *out = std::move(values);
117   if (out_quoted) {
118     *out_quoted = std::move(quoted_values);
119   }
120 }
121 
TotalViewLength(const std::vector<util::string_view> & views)122 size_t TotalViewLength(const std::vector<util::string_view>& views) {
123   size_t total_view_length = 0;
124   for (const auto& view : views) {
125     total_view_length += view.length();
126   }
127   return total_view_length;
128 }
129 
Parse(BlockParser & parser,const std::string & str,uint32_t * out_size)130 Status Parse(BlockParser& parser, const std::string& str, uint32_t* out_size) {
131   return parser.Parse(util::string_view(str), out_size);
132 }
133 
ParseFinal(BlockParser & parser,const std::string & str,uint32_t * out_size)134 Status ParseFinal(BlockParser& parser, const std::string& str, uint32_t* out_size) {
135   return parser.ParseFinal(util::string_view(str), out_size);
136 }
137 
AssertParseOk(BlockParser & parser,const std::string & str)138 void AssertParseOk(BlockParser& parser, const std::string& str) {
139   uint32_t parsed_size = static_cast<uint32_t>(-1);
140   ASSERT_OK(Parse(parser, str, &parsed_size));
141   ASSERT_EQ(parsed_size, str.size());
142 }
143 
AssertParseOk(BlockParser & parser,const std::vector<util::string_view> & data)144 void AssertParseOk(BlockParser& parser, const std::vector<util::string_view>& data) {
145   uint32_t parsed_size = static_cast<uint32_t>(-1);
146   ASSERT_OK(parser.Parse(data, &parsed_size));
147   ASSERT_EQ(parsed_size, TotalViewLength(data));
148 }
149 
AssertParseFinal(BlockParser & parser,const std::string & str)150 void AssertParseFinal(BlockParser& parser, const std::string& str) {
151   uint32_t parsed_size = static_cast<uint32_t>(-1);
152   ASSERT_OK(ParseFinal(parser, str, &parsed_size));
153   ASSERT_EQ(parsed_size, str.size());
154 }
155 
AssertParseFinal(BlockParser & parser,const std::vector<util::string_view> & data)156 void AssertParseFinal(BlockParser& parser, const std::vector<util::string_view>& data) {
157   uint32_t parsed_size = static_cast<uint32_t>(-1);
158   ASSERT_OK(parser.ParseFinal(data, &parsed_size));
159   ASSERT_EQ(parsed_size, TotalViewLength(data));
160 }
161 
AssertParsePartial(BlockParser & parser,const std::string & str,uint32_t expected_size)162 void AssertParsePartial(BlockParser& parser, const std::string& str,
163                         uint32_t expected_size) {
164   uint32_t parsed_size = static_cast<uint32_t>(-1);
165   ASSERT_OK(Parse(parser, str, &parsed_size));
166   ASSERT_EQ(parsed_size, expected_size);
167 }
168 
AssertLastRowEq(const BlockParser & parser,const std::vector<std::string> expected)169 void AssertLastRowEq(const BlockParser& parser, const std::vector<std::string> expected) {
170   std::vector<std::string> values;
171   GetLastRow(parser, &values);
172   ASSERT_EQ(parser.num_rows(), expected.size());
173   ASSERT_EQ(values, expected);
174 }
175 
AssertLastRowEq(const BlockParser & parser,const std::vector<std::string> expected,const std::vector<bool> expected_quoted)176 void AssertLastRowEq(const BlockParser& parser, const std::vector<std::string> expected,
177                      const std::vector<bool> expected_quoted) {
178   std::vector<std::string> values;
179   std::vector<bool> quoted;
180   GetLastRow(parser, &values, &quoted);
181   ASSERT_EQ(parser.num_cols(), expected.size());
182   ASSERT_EQ(values, expected);
183   ASSERT_EQ(quoted, expected_quoted);
184 }
185 
AssertColumnEq(const BlockParser & parser,int32_t col_index,const std::vector<std::string> expected)186 void AssertColumnEq(const BlockParser& parser, int32_t col_index,
187                     const std::vector<std::string> expected) {
188   std::vector<std::string> values;
189   GetColumn(parser, col_index, &values);
190   ASSERT_EQ(parser.num_rows(), expected.size());
191   ASSERT_EQ(values, expected);
192 }
193 
AssertColumnEq(const BlockParser & parser,int32_t col_index,const std::vector<std::string> expected,const std::vector<bool> expected_quoted)194 void AssertColumnEq(const BlockParser& parser, int32_t col_index,
195                     const std::vector<std::string> expected,
196                     const std::vector<bool> expected_quoted) {
197   std::vector<std::string> values;
198   std::vector<bool> quoted;
199   GetColumn(parser, col_index, &values, &quoted);
200   ASSERT_EQ(parser.num_rows(), expected.size());
201   ASSERT_EQ(values, expected);
202   ASSERT_EQ(quoted, expected_quoted);
203 }
204 
AssertColumnsEq(const BlockParser & parser,const std::vector<std::vector<std::string>> expected)205 void AssertColumnsEq(const BlockParser& parser,
206                      const std::vector<std::vector<std::string>> expected) {
207   ASSERT_EQ(parser.num_cols(), expected.size());
208   for (int32_t col_index = 0; col_index < parser.num_cols(); ++col_index) {
209     AssertColumnEq(parser, col_index, expected[col_index]);
210   }
211 }
212 
AssertColumnsEq(const BlockParser & parser,const std::vector<std::vector<std::string>> expected,const std::vector<std::vector<bool>> quoted)213 void AssertColumnsEq(const BlockParser& parser,
214                      const std::vector<std::vector<std::string>> expected,
215                      const std::vector<std::vector<bool>> quoted) {
216   ASSERT_EQ(parser.num_cols(), expected.size());
217   for (int32_t col_index = 0; col_index < parser.num_cols(); ++col_index) {
218     AssertColumnEq(parser, col_index, expected[col_index], quoted[col_index]);
219   }
220   uint32_t total_bytes = 0;
221   for (const auto& col : expected) {
222     for (const auto& field : col) {
223       total_bytes += static_cast<uint32_t>(field.size());
224     }
225   }
226   ASSERT_EQ(total_bytes, parser.num_bytes());
227 }
228 
TEST(BlockParser, Basics) {
  {
    // Whole input passed as a single block
    auto csv = MakeCSVData({"ab,cd,\n", "ef,,gh\n", ",ij,kl\n"});
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, csv);
    AssertColumnsEq(parser, {{"ab", "ef", ""}, {"cd", "", "ij"}, {"", "gh", "kl"}});
    AssertLastRowEq(parser, {"", "ij", "kl"}, {false, false, false});
  }
  {
    // Same rows, split into two blocks that are parsed together
    auto csv1 = MakeCSVData({"ab,cd,\n", "ef,,gh\n"});
    auto csv2 = MakeCSVData({",ij,kl\n"});
    std::vector<util::string_view> csvs = {csv1, csv2};
    BlockParser parser(ParseOptions::Defaults());
    // Pass the named vector of views (it was previously built but left unused)
    AssertParseOk(parser, csvs);
    AssertColumnsEq(parser, {{"ab", "ef", ""}, {"cd", "", "ij"}, {"", "gh", "kl"}});
    AssertLastRowEq(parser, {"", "ij", "kl"}, {false, false, false});
  }
}
247 
TEST(BlockParser, EmptyHeader) {
  // The number of columns cannot be inferred from these inputs, so ParseFinal()
  // is expected to fail with Invalid.
  uint32_t consumed;
  {
    // Completely empty input
    BlockParser parser(ParseOptions::Defaults());
    const auto input = MakeCSVData({""});
    ASSERT_RAISES(Invalid, ParseFinal(parser, input, &consumed));
  }
  {
    // A single empty line
    BlockParser parser(ParseOptions::Defaults());
    const auto input = MakeCSVData({"\n"});
    ASSERT_RAISES(Invalid, ParseFinal(parser, input, &consumed));
  }
}
262 
TEST(BlockParser, Empty) {
  // Rows made of a lone delimiter yield two empty, unquoted values each
  {
    BlockParser parser(ParseOptions::Defaults());
    const auto one_row = MakeCSVData({",\n"});
    AssertParseOk(parser, one_row);
    AssertColumnsEq(parser, {{""}, {""}});
    AssertLastRowEq(parser, {"", ""}, {false, false});
  }
  {
    BlockParser parser(ParseOptions::Defaults());
    const auto two_rows = MakeCSVData({",\n,\n"});
    AssertParseOk(parser, two_rows);
    AssertColumnsEq(parser, {{"", ""}, {"", ""}});
    AssertLastRowEq(parser, {"", ""}, {false, false});
  }
}
279 
TEST(BlockParser, Whitespace) {
  // Spaces and tabs inside values must come through untouched
  BlockParser parser(ParseOptions::Defaults());
  const auto input = MakeCSVData({"a b, cd, \n", " ef, \t,gh\n"});
  AssertParseOk(parser, input);
  AssertColumnsEq(parser, {{"a b", " ef"}, {" cd", " \t"}, {" ", "gh"}});
}
287 
TEST(BlockParser, Newlines) {
  // Rows terminated by LF, CRLF and lone CR all parse as separate rows
  BlockParser parser(ParseOptions::Defaults());
  const auto input = MakeCSVData({"a,b\n", "c,d\r\n", "e,f\r", "g,h\r"});

  AssertParseOk(parser, input);
  AssertColumnsEq(parser, {{"a", "c", "e", "g"}, {"b", "d", "f", "h"}});
}
295 
TEST(BlockParser, MaxNumRows) {
  BlockParser parser(ParseOptions::Defaults(), -1, 3 /* max_num_rows */);
  const auto input = MakeCSVData({"a\n", "b\n", "c\n", "d\n"});

  // Only the first three rows (6 bytes) are consumed
  AssertParsePartial(parser, input, 6);
  AssertColumnsEq(parser, {{"a", "b", "c"}});

  // Parsing the remainder yields the fourth row
  const auto remainder = input.substr(6);
  AssertParseOk(parser, remainder);
  AssertColumnsEq(parser, {{"d"}});

  // Nothing is left after that
  const auto nothing = input.substr(8);
  AssertParseOk(parser, nothing);
  AssertColumnsEq(parser, {{}});
}
309 
TEST(BlockParser, EmptyLinesWithOneColumn) {
  const auto input = MakeCSVData({"a\n", "\n", "b\r", "\r", "c\r\n", "\r\n", "d\n"});
  {
    // With default options, empty lines produce no rows
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"a", "b", "c", "d"}});
  }
  {
    // With ignore_empty_lines disabled, each empty line is an empty value
    auto keep_empty_lines = ParseOptions::Defaults();
    keep_empty_lines.ignore_empty_lines = false;
    BlockParser parser(keep_empty_lines);
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"a", "", "b", "", "c", "", "d"}});
  }
}
325 
TEST(BlockParser, EmptyLinesWithSeveralColumns) {
  const auto input =
      MakeCSVData({"a,b\n", "\n", "c,d\r", "\r", "e,f\r\n", "\r\n", "g,h\n"});
  {
    // With default options, empty lines produce no rows
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"a", "c", "e", "g"}, {"b", "d", "f", "h"}});
  }
  {
    // Non-ignored empty lines get turned into empty values
    auto keep_empty_lines = ParseOptions::Defaults();
    keep_empty_lines.ignore_empty_lines = false;
    BlockParser parser(keep_empty_lines);
    AssertParseOk(parser, input);
    AssertColumnsEq(parser,
                    {{"a", "", "c", "", "e", "", "g"}, {"b", "", "d", "", "f", "", "h"}});
  }
}
343 
TEST(BlockParser, EmptyLineFirst) {
  const auto input = MakeCSVData({"\n", "\n", "a\n", "b\n"});
  {
    // With default options, the leading empty lines produce no rows
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"a", "b"}});
  }
  {
    // With ignore_empty_lines disabled, they become empty values
    auto keep_empty_lines = ParseOptions::Defaults();
    keep_empty_lines.ignore_empty_lines = false;
    BlockParser parser(keep_empty_lines);
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"", "", "a", "b"}});
  }
}
359 
TEST(BlockParser, TruncatedData) {
  const auto input = MakeCSVData({"a,b\n", "c,d\n"});
  BlockParser parser(ParseOptions::Defaults());
  // Whatever part of the second row is cut off, only the first row (4 bytes)
  // is expected to be parsed.
  for (auto trim : {1, 2, 3}) {
    const auto truncated = input.substr(0, input.length() - trim);
    AssertParsePartial(parser, truncated, 4);
    AssertColumnsEq(parser, {{"a"}, {"b"}});
  }
}
368 
TEST(BlockParser, Final) {
  // Tests for ParseFinal(); the same parser is reused for every input
  BlockParser parser(ParseOptions::Defaults());

  // Last row ends with a newline
  const auto with_newline = MakeCSVData({"ab,cd\n", "ef,gh\n"});
  AssertParseFinal(parser, with_newline);
  AssertColumnsEq(parser, {{"ab", "ef"}, {"cd", "gh"}});

  // Same without newline
  const auto without_newline = MakeCSVData({"ab,cd\n", "ef,gh"});
  AssertParseFinal(parser, without_newline);
  AssertColumnsEq(parser, {{"ab", "ef"}, {"cd", "gh"}});

  // Same with empty last item
  const auto empty_last_item = MakeCSVData({"ab,cd\n", "ef,"});
  AssertParseFinal(parser, empty_last_item);
  AssertColumnsEq(parser, {{"ab", "ef"}, {"cd", ""}});

  // Same with single line
  const auto single_line = MakeCSVData({"ab,cd"});
  AssertParseFinal(parser, single_line);
  AssertColumnsEq(parser, {{"ab"}, {"cd"}});

  // Two blocks
  const auto first_block = MakeCSVData({"ab,cd\n"});
  const auto second_block = MakeCSVData({"ef,"});
  AssertParseFinal(parser, {{first_block}, {second_block}});
  AssertColumnsEq(parser, {{"ab", "ef"}, {"cd", ""}});
}
397 
TEST(BlockParser, FinalTruncatedData) {
  // Test ParseFinal() with truncated data: the last row has fewer values than
  // the first one, which is expected to fail with Invalid.
  BlockParser parser(ParseOptions::Defaults());
  const auto input = MakeCSVData({"ab,cd\n", "ef"});
  uint32_t consumed;
  ASSERT_RAISES(Invalid, ParseFinal(parser, input, &consumed));
}
406 
TEST(BlockParser, QuotingSimple) {
  const auto input = MakeCSVData({"1,\",3,\",5\n"});

  // Default options: the quoted value keeps its embedded delimiters
  {
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"1"}, {",3,"}, {"5"}},
                    {{false}, {true}, {false}} /* quoted */);
  }
  // Quoting disabled: double quotes are plain data
  {
    auto no_quoting = ParseOptions::Defaults();
    no_quoting.quoting = false;
    BlockParser parser(no_quoting);
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"1"}, {"\""}, {"3"}, {"\""}, {"5"}},
                    {{false}, {false}, {false}, {false}, {false}} /* quoted */);
  }
  // Another quote character: double quotes are plain data as well
  {
    auto other_quote_char = ParseOptions::Defaults();
    other_quote_char.quote_char = 'Z';
    BlockParser parser(other_quote_char);
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"1"}, {"\""}, {"3"}, {"\""}, {"5"}},
                    {{false}, {false}, {false}, {false}, {false}} /* quoted */);
  }
}
433 
TEST(BlockParser, QuotingNewline) {
  // A newline inside a quoted value is part of the value
  BlockParser parser(ParseOptions::Defaults());
  const auto input = MakeCSVData({"a,\"c \n d\",e\n"});
  AssertParseOk(parser, input);
  AssertColumnsEq(parser, {{"a"}, {"c \n d"}, {"e"}},
                  {{false}, {true}, {false}} /* quoted */);
}
441 
TEST(BlockParser, QuotingUnbalanced) {
  // Quote introduces a quoted field that doesn't end: only the first row
  // (4 bytes) is expected to be parsed
  BlockParser parser(ParseOptions::Defaults());
  const auto input = MakeCSVData({"a,b\n", "1,\",3,,5\n"});
  AssertParsePartial(parser, input, 4);
  AssertColumnsEq(parser, {{"a"}, {"b"}}, {{false}, {false}} /* quoted */);
}
449 
TEST(BlockParser, QuotingEmpty) {
  // Empty quoted values are reported as empty *and* quoted
  {
    // Single quoted empty value
    const auto input = MakeCSVData({"\"\"\n"});
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{""}}, {{true}} /* quoted */);
    AssertLastRowEq(parser, {""}, {true});
  }
  {
    // Unquoted empty value followed by a quoted one
    const auto input = MakeCSVData({",\"\"\n"});
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{""}, {""}}, {{false}, {true}} /* quoted */);
    AssertLastRowEq(parser, {"", ""}, {false, true});
  }
  {
    // Quoted empty value followed by an unquoted one
    const auto input = MakeCSVData({"\"\",\n"});
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{""}, {""}}, {{true}, {false}} /* quoted */);
    AssertLastRowEq(parser, {"", ""}, {true, false});
  }
}
473 
TEST(BlockParser, QuotingDouble) {
  {
    // 4 quotes is a quoted quote
    const auto input = MakeCSVData({"\"\"\"\"\n"});
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"\""}}, {{true}} /* quoted */);
  }
  {
    // 4 quotes is a quoted quote (here between two other values)
    const auto input = MakeCSVData({"a,\"\"\"\",b\n"});
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"a"}, {"\""}, {"b"}},
                    {{false}, {true}, {false}} /* quoted */);
  }
  {
    // 6 quotes is two quoted quotes
    const auto input = MakeCSVData({"\"\"\"\"\"\"\n"});
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"\"\""}}, {{true}} /* quoted */);
  }
  {
    // 6 quotes is two quoted quotes (here between two other values)
    const auto input = MakeCSVData({"a,\"\"\"\"\"\",b\n"});
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"a"}, {"\"\""}, {"b"}},
                    {{false}, {true}, {false}} /* quoted */);
  }
}
506 
TEST(BlockParser, QuotesAndMore) {
  // There may be trailing data after the quoted part of a field; it is appended
  // to the value, which is still reported as quoted
  {
    const auto input = MakeCSVData({"a,\"b\"c,d\n"});
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"a"}, {"bc"}, {"d"}},
                    {{false}, {true}, {false}} /* quoted */);
  }
}
517 
TEST(BlockParser, QuotesSpecial) {
  // Some non-trivial cases
  {
    // A quote in the middle of an unquoted value is plain data
    const auto input = MakeCSVData({"a,b\"c,d\n"});
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"a"}, {"b\"c"}, {"d"}},
                    {{false}, {false}, {false}} /* quoted */);
  }
  {
    // Quotes appearing after the closing quote are kept verbatim
    const auto input = MakeCSVData({"a,\"b\" \"c\",d\n"});
    BlockParser parser(ParseOptions::Defaults());
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"a"}, {"b \"c\""}, {"d"}},
                    {{false}, {true}, {false}} /* quoted */);
  }
}
535 
TEST(BlockParser, MismatchingNumColumns) {
  uint32_t consumed;
  {
    // Second row has fewer values than the first one
    const auto input = MakeCSVData({"a,b\nc\n"});
    BlockParser parser(ParseOptions::Defaults());
    ASSERT_RAISES(Invalid, Parse(parser, input, &consumed));
  }
  {
    // Fewer values than the explicitly requested number of columns
    const auto input = MakeCSVData({"a\n"});
    BlockParser parser(ParseOptions::Defaults(), 2 /* num_cols */);
    ASSERT_RAISES(Invalid, Parse(parser, input, &consumed));
  }
  {
    // More values than the explicitly requested number of columns
    const auto input = MakeCSVData({"a,b,c\n"});
    BlockParser parser(ParseOptions::Defaults(), 2 /* num_cols */);
    ASSERT_RAISES(Invalid, Parse(parser, input, &consumed));
  }
}
557 
TEST(BlockParser, Escaping) {
  auto escaping_options = ParseOptions::Defaults();
  escaping_options.escaping = true;

  {
    const auto input = MakeCSVData({"a\\b,c\n"});
    {
      // Default options: the backslash is kept as plain data
      BlockParser parser(ParseOptions::Defaults());
      AssertParseOk(parser, input);
      AssertColumnsEq(parser, {{"a\\b"}, {"c"}});
    }
    {
      // Escaping enabled: the backslash is dropped from the value
      BlockParser parser(escaping_options);
      AssertParseOk(parser, input);
      AssertColumnsEq(parser, {{"ab"}, {"c"}});
    }
  }
  {
    // An escaped delimiter is part of the value
    const auto input = MakeCSVData({"a\\,b,c\n"});
    BlockParser parser(escaping_options);
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"a,b"}, {"c"}});
  }
}
582 
583 // Generate test data with the given number of columns.
MakeLotsOfCsvColumns(int32_t num_columns)584 std::string MakeLotsOfCsvColumns(int32_t num_columns) {
585   std::string values, header;
586   header.reserve(num_columns * 10);
587   values.reserve(num_columns * 10);
588   for (int x = 0; x < num_columns; x++) {
589     if (x != 0) {
590       header += ",";
591       values += ",";
592     }
593     header += "c" + std::to_string(x);
594     values += std::to_string(x);
595   }
596 
597   header += "\n";
598   values += "\n";
599   return MakeCSVData({header, values});
600 }
601 
TEST(BlockParser, LotsOfColumns) {
  // A very wide input (1024 * 100 columns) must parse completely
  const auto wide_input = MakeLotsOfCsvColumns(1024 * 100);
  auto options = ParseOptions::Defaults();
  BlockParser parser(options);
  AssertParseOk(parser, wide_input);
}
607 
TEST(BlockParser, QuotedEscape) {
  auto escaping_options = ParseOptions::Defaults();
  escaping_options.escaping = true;

  {
    // Escaped delimiter inside a quoted value
    const auto input = MakeCSVData({"\"a\\,b\",c\n"});
    BlockParser parser(escaping_options);
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"a,b"}, {"c"}}, {{true}, {false}} /* quoted */);
  }
  {
    // Escaped quote inside a quoted value
    const auto input = MakeCSVData({"\"a\\\"b\",c\n"});
    BlockParser parser(escaping_options);
    AssertParseOk(parser, input);
    AssertColumnsEq(parser, {{"a\"b"}, {"c"}}, {{true}, {false}} /* quoted */);
  }
}
625 
626 }  // namespace csv
627 }  // namespace arrow
628