1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #include "benchmark/benchmark.h"
19 
20 #include <memory>
21 #include <sstream>
22 #include <string>
23 
24 #include "arrow/csv/chunker.h"
25 #include "arrow/csv/options.h"
26 #include "arrow/csv/parser.h"
27 #include "arrow/testing/gtest_util.h"
28 #include "arrow/util/string_view.h"
29 
30 namespace arrow {
31 namespace csv {
32 
// A small CSV sample that serves as the repeating unit of benchmark input.
//
// Kept as a plain aggregate so that samples can be brace-initialized as
// `{num_rows, csv_rows}`.
struct Example {
  // Number of CSV rows contained in `csv_rows`.
  int32_t num_rows;
  // NUL-terminated CSV text holding those rows.
  const char* csv_rows;
};
37 
38 const Example quoted_example{1, "abc,\"d,f\",12.34,\n"};
39 const Example escaped_example{1, "abc,d\\,f,12.34,\n"};
40 
41 const Example flights_example{
42     8,
43     R"(2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,0010,0002,-8,12,0014,280,279,263,2330,0737,4,0750,0741,-9,0,0,,,,,,
44 2015,1,1,4,US,840,N171US,SFO,CLT,0020,0018,-2,16,0034,286,293,266,2296,0800,11,0806,0811,5,0,0,,,,,,
45 2015,1,1,4,AA,258,N3HYAA,LAX,MIA,0020,0015,-5,15,0030,285,281,258,2342,0748,8,0805,0756,-9,0,0,,,,,,
46 2015,1,1,4,AS,135,N527AS,SEA,ANC,0025,0024,-1,11,0035,235,215,199,1448,0254,5,0320,0259,-21,0,0,,,,,,
47 2015,1,1,4,DL,806,N3730B,SFO,MSP,0025,0020,-5,18,0038,217,230,206,1589,0604,6,0602,0610,8,0,0,,,,,,
48 2015,1,1,4,NK,612,N635NK,LAS,MSP,0025,0019,-6,11,0030,181,170,154,1299,0504,5,0526,0509,-17,0,0,,,,,,
49 2015,1,1,4,US,2013,N584UW,LAX,CLT,0030,0044,14,13,0057,273,249,228,2125,0745,8,0803,0753,-10,0,0,,,,,,
50 2015,1,1,4,AA,1112,N3LAAA,SFO,DFW,0030,0019,-11,17,0036,195,193,173,1464,0529,3,0545,0532,-13,0,0,,,,,,
51 )"};
52 
53 // NOTE: quoted
54 const Example vehicles_example{
55     2,
56     R"(7088743681,https://greensboro.craigslist.org/ctd/d/cary-2004-honda-element-lx-4dr-suv/7088743681.html,greensboro,https://greensboro.craigslist.org,3995,2004,honda,element,,,gas,212526,clean,automatic,5J6YH18314L006498,fwd,,SUV,orange,https://images.craigslist.org/00E0E_eAUnhFF86M4_600x450.jpg,"2004 Honda Element LX 4dr SUV     Offered by: Best Import Auto Sales Inc — (919) 800-0650 — $3,995     EXCELLENT SHAPE INSIDE AND OUT FULLY SERVICED AND READY TO GO ,RUNS AND DRIVES PERFECT ,PLEASE CALL OR TEXT 919 454 4848 OR CALL 919 380 0380 IF INTERESTED.   Best Import Auto Sales Inc    Year: 2004 Make: Honda Model: Element Series: LX 4dr SUV VIN: 5J6YH18314L006498 Stock #: 4L006498 Condition: Used Mileage: 212,526  Exterior: Orange Interior: Black Body: SUV Transmission: Automatic 4-Speed Engine: 2.4L I4      **** Best Import Auto Sales Inc. �� Raleigh Auto Dealer *****  ⚡️⚡️⚡️ Call Or Text (919) 800-0650 ⚡️⚡️⚡️  ✅ - We can arrange Financing Options with most banks and credit unions!!!!     ✅ Extended Warranties Available on most vehicles!! ""Call To Inquire""Full Service ASE-Certified Shop Onsite!       More vehicle details: best-import-auto-sales-inc.hammerwebsites.net/v/3kE08kSD     Address: 1501 Buck Jones Rd Raleigh, NC 27606   Phone: (919) 800-0650     Website: www.bestimportsonline.com      �� ☎️ Call or text (919) 800-0650 for quick answers to your questions about this Honda Element Your message will always be answered by a real humannever an automated system.     Disclaimer: Best Import Auto Sales Inc will never sell, share, or spam your mobile number. Standard text messaging rates may apply.       2004 Honda Element LX 4dr SUV   6fbc204ebd7e4a32a30dcf2c8c3bcdea",,nc,35.7636,-78.7443
57   7088744126,https://greensboro.craigslist.org/cto/d/greensboro-2011-jaguar-xf-premier/7088744126.html,greensboro,https://greensboro.craigslist.org,9500,2011,jaguar,xf,excellent,,gas,85000,clean,automatic,,,,,blue,https://images.craigslist.org/00505_f22HGItCRpc_600x450.jpg,"2011 jaguar XF premium - estate sale. Retired lady executive. Like new, garaged and maintained. Very nice leather, heated seats, electric sunroof, metallic blue paint. 85K miles bumper-to-bumper warranty. Premium radio sound system. Built-in phone connection. Please call  show contact info  cell or  show contact info .  Asking Price $9500",,nc,36.1032,-79.8794
58 )"};
59 
60 const Example stocks_example{
61     3,
62     R"(2,2010-01-27 00:00:00,002204,华锐铸钢,536498.0,135378.0,2652784.2001924426,14160629.45,5.382023337513902,5.288274712474071,5.382023337513902,5.341540976701248,,5.338025403262254,1.01364599,0.21306505690870553
63 3,2010-02-05 00:00:00,600266,北京城建,1122615.0,1122615.0,8102476.086666377,57695471.0,7.236029036381633,7.025270909108382,7.170459841229955,7.095523618199466,,7.120720923193468,2.3025570905818964,0.4683513939405588
64 4,2010-01-04 00:00:00,600289,亿阳信通,602926.359,602926.359,16393247.138998777,167754890.0,10.381817699665978,9.960037526145015,10.092597009251604,10.321563389162982,,10.233170315655089,4.436963485334562,0.6025431050299465
65 )"};
66 
// Row budget shared by the benchmarks: it bounds how many rows of input are
// generated and is the row count handed to the parsing benchmark.
static constexpr int32_t kNumRows{10000};
68 
BuildCSVData(const Example & example)69 static std::string BuildCSVData(const Example& example) {
70   std::stringstream ss;
71   for (int32_t i = 0; i < kNumRows; i += example.num_rows) {
72     ss << example.csv_rows;
73   }
74   return ss.str();
75 }
76 
BenchmarkCSVChunking(benchmark::State & state,const std::string & csv,ParseOptions options)77 static void BenchmarkCSVChunking(benchmark::State& state,  // NOLINT non-const reference
78                                  const std::string& csv, ParseOptions options) {
79   auto chunker = MakeChunker(options);
80   auto block = std::make_shared<Buffer>(util::string_view(csv));
81 
82   while (state.KeepRunning()) {
83     std::shared_ptr<Buffer> whole, partial;
84     ABORT_NOT_OK(chunker->Process(block, &whole, &partial));
85     benchmark::DoNotOptimize(whole->size());
86   }
87 
88   state.SetBytesProcessed(state.iterations() * csv.length());
89 }
90 
ChunkCSVQuotedBlock(benchmark::State & state)91 static void ChunkCSVQuotedBlock(benchmark::State& state) {  // NOLINT non-const reference
92   auto csv = BuildCSVData(quoted_example);
93   auto options = ParseOptions::Defaults();
94   options.quoting = true;
95   options.escaping = false;
96   options.newlines_in_values = true;
97 
98   BenchmarkCSVChunking(state, csv, options);
99 }
100 
ChunkCSVEscapedBlock(benchmark::State & state)101 static void ChunkCSVEscapedBlock(benchmark::State& state) {  // NOLINT non-const reference
102   auto csv = BuildCSVData(escaped_example);
103   auto options = ParseOptions::Defaults();
104   options.quoting = false;
105   options.escaping = true;
106   options.newlines_in_values = true;
107 
108   BenchmarkCSVChunking(state, csv, options);
109 }
110 
ChunkCSVNoNewlinesBlock(benchmark::State & state)111 static void ChunkCSVNoNewlinesBlock(
112     benchmark::State& state) {  // NOLINT non-const reference
113   auto csv = BuildCSVData(escaped_example);
114   auto options = ParseOptions::Defaults();
115   options.quoting = true;
116   options.escaping = false;
117   options.newlines_in_values = false;
118 
119   BenchmarkCSVChunking(state, csv, options);
120   // Provides better regression stability with timings rather than bogus
121   // bandwidth.
122   state.SetBytesProcessed(0);
123 }
124 
BenchmarkCSVParsing(benchmark::State & state,const std::string & csv,int32_t num_rows,ParseOptions options)125 static void BenchmarkCSVParsing(benchmark::State& state,  // NOLINT non-const reference
126                                 const std::string& csv, int32_t num_rows,
127                                 ParseOptions options) {
128   BlockParser parser(options, -1, num_rows + 1);
129 
130   while (state.KeepRunning()) {
131     uint32_t parsed_size = 0;
132     ABORT_NOT_OK(parser.Parse(util::string_view(csv), &parsed_size));
133 
134     // Include performance of visiting the parsed values, as that might
135     // vary depending on the parser's internal data structures.
136     bool dummy_quoted = false;
137     uint32_t dummy_size = 0;
138     auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) {
139       dummy_size += size;
140       dummy_quoted ^= quoted;
141       return Status::OK();
142     };
143     for (int32_t col = 0; col < parser.num_cols(); ++col) {
144       ABORT_NOT_OK(parser.VisitColumn(col, visit));
145       benchmark::DoNotOptimize(dummy_size);
146       benchmark::DoNotOptimize(dummy_quoted);
147     }
148   }
149 
150   state.SetBytesProcessed(state.iterations() * csv.size());
151 }
152 
BenchmarkCSVParsing(benchmark::State & state,const Example & example,ParseOptions options)153 static void BenchmarkCSVParsing(benchmark::State& state,  // NOLINT non-const reference
154                                 const Example& example, ParseOptions options) {
155   auto csv = BuildCSVData(example);
156   BenchmarkCSVParsing(state, csv, kNumRows, options);
157 }
158 
ParseCSVQuotedBlock(benchmark::State & state)159 static void ParseCSVQuotedBlock(benchmark::State& state) {  // NOLINT non-const reference
160   auto options = ParseOptions::Defaults();
161   options.quoting = true;
162   options.escaping = false;
163 
164   BenchmarkCSVParsing(state, quoted_example, options);
165 }
166 
ParseCSVEscapedBlock(benchmark::State & state)167 static void ParseCSVEscapedBlock(benchmark::State& state) {  // NOLINT non-const reference
168   auto options = ParseOptions::Defaults();
169   options.quoting = false;
170   options.escaping = true;
171 
172   BenchmarkCSVParsing(state, escaped_example, options);
173 }
174 
ParseCSVFlightsExample(benchmark::State & state)175 static void ParseCSVFlightsExample(
176     benchmark::State& state) {  // NOLINT non-const reference
177   BenchmarkCSVParsing(state, flights_example, ParseOptions::Defaults());
178 }
179 
ParseCSVVehiclesExample(benchmark::State & state)180 static void ParseCSVVehiclesExample(
181     benchmark::State& state) {  // NOLINT non-const reference
182   auto options = ParseOptions::Defaults();
183   options.quoting = true;
184   options.escaping = false;
185 
186   BenchmarkCSVParsing(state, vehicles_example, options);
187 }
188 
ParseCSVStocksExample(benchmark::State & state)189 static void ParseCSVStocksExample(
190     benchmark::State& state) {  // NOLINT non-const reference
191   BenchmarkCSVParsing(state, stocks_example, ParseOptions::Defaults());
192 }
193 
194 BENCHMARK(ChunkCSVQuotedBlock);
195 BENCHMARK(ChunkCSVEscapedBlock);
196 BENCHMARK(ChunkCSVNoNewlinesBlock);
197 
198 BENCHMARK(ParseCSVQuotedBlock);
199 BENCHMARK(ParseCSVEscapedBlock);
200 BENCHMARK(ParseCSVFlightsExample);
201 BENCHMARK(ParseCSVVehiclesExample);
202 BENCHMARK(ParseCSVStocksExample);
203 
204 }  // namespace csv
205 }  // namespace arrow
206