1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 #include "benchmark/benchmark.h"
19
20 #include <memory>
21 #include <sstream>
22 #include <string>
23
24 #include "arrow/csv/chunker.h"
25 #include "arrow/csv/options.h"
26 #include "arrow/csv/parser.h"
27 #include "arrow/testing/gtest_util.h"
28 #include "arrow/util/string_view.h"
29
30 namespace arrow {
31 namespace csv {
32
// A small CSV sample that the benchmarks repeat to build their input data.
struct Example {
  // Number of CSV rows contained in `csv_rows`.
  int32_t num_rows;

  // The CSV text itself, as a NUL-terminated C string.
  const char* csv_rows;
};
37
38 const Example quoted_example{1, "abc,\"d,f\",12.34,\n"};
39 const Example escaped_example{1, "abc,d\\,f,12.34,\n"};
40
// Eight rows of flight records. All fields are unquoted and mostly short
// numeric or code-like values; the trailing fields of every row are empty.
const Example flights_example{
    8,
    R"(2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,0010,0002,-8,12,0014,280,279,263,2330,0737,4,0750,0741,-9,0,0,,,,,,
2015,1,1,4,US,840,N171US,SFO,CLT,0020,0018,-2,16,0034,286,293,266,2296,0800,11,0806,0811,5,0,0,,,,,,
2015,1,1,4,AA,258,N3HYAA,LAX,MIA,0020,0015,-5,15,0030,285,281,258,2342,0748,8,0805,0756,-9,0,0,,,,,,
2015,1,1,4,AS,135,N527AS,SEA,ANC,0025,0024,-1,11,0035,235,215,199,1448,0254,5,0320,0259,-21,0,0,,,,,,
2015,1,1,4,DL,806,N3730B,SFO,MSP,0025,0020,-5,18,0038,217,230,206,1589,0604,6,0602,0610,8,0,0,,,,,,
2015,1,1,4,NK,612,N635NK,LAS,MSP,0025,0019,-6,11,0030,181,170,154,1299,0504,5,0526,0509,-17,0,0,,,,,,
2015,1,1,4,US,2013,N584UW,LAX,CLT,0030,0044,14,13,0057,273,249,228,2125,0745,8,0803,0753,-10,0,0,,,,,,
2015,1,1,4,AA,1112,N3LAAA,SFO,DFW,0030,0019,-11,17,0036,195,193,173,1464,0529,3,0545,0532,-13,0,0,,,,,,
)"};
52
// Two rows of vehicle listings.
// NOTE: each row has a long double-quoted description field that contains
// commas and non-ASCII characters; the first row also contains doubled quote
// characters ("") inside the quoted field. Benchmarks using this sample must
// therefore enable quoting.
const Example vehicles_example{
    2,
    R"(7088743681,https://greensboro.craigslist.org/ctd/d/cary-2004-honda-element-lx-4dr-suv/7088743681.html,greensboro,https://greensboro.craigslist.org,3995,2004,honda,element,,,gas,212526,clean,automatic,5J6YH18314L006498,fwd,,SUV,orange,https://images.craigslist.org/00E0E_eAUnhFF86M4_600x450.jpg,"2004 Honda Element LX 4dr SUV Offered by: Best Import Auto Sales Inc — (919) 800-0650 — $3,995 EXCELLENT SHAPE INSIDE AND OUT FULLY SERVICED AND READY TO GO ,RUNS AND DRIVES PERFECT ,PLEASE CALL OR TEXT 919 454 4848 OR CALL 919 380 0380 IF INTERESTED. Best Import Auto Sales Inc Year: 2004 Make: Honda Model: Element Series: LX 4dr SUV VIN: 5J6YH18314L006498 Stock #: 4L006498 Condition: Used Mileage: 212,526 Exterior: Orange Interior: Black Body: SUV Transmission: Automatic 4-Speed Engine: 2.4L I4 **** Best Import Auto Sales Inc. Raleigh Auto Dealer ***** ⚡️⚡️⚡️ Call Or Text (919) 800-0650 ⚡️⚡️⚡️ ✅ - We can arrange Financing Options with most banks and credit unions!!!! ✅ Extended Warranties Available on most vehicles!! ""Call To Inquire"" ✅ Full Service ASE-Certified Shop Onsite! More vehicle details: best-import-auto-sales-inc.hammerwebsites.net/v/3kE08kSD Address: 1501 Buck Jones Rd Raleigh, NC 27606 Phone: (919) 800-0650 Website: www.bestimportsonline.com ☎️ Call or text (919) 800-0650 for quick answers to your questions about this Honda Element Your message will always be answered by a real human — never an automated system. Disclaimer: Best Import Auto Sales Inc will never sell, share, or spam your mobile number. Standard text messaging rates may apply. 2004 Honda Element LX 4dr SUV 6fbc204ebd7e4a32a30dcf2c8c3bcdea",,nc,35.7636,-78.7443
7088744126,https://greensboro.craigslist.org/cto/d/greensboro-2011-jaguar-xf-premier/7088744126.html,greensboro,https://greensboro.craigslist.org,9500,2011,jaguar,xf,excellent,,gas,85000,clean,automatic,,,,,blue,https://images.craigslist.org/00505_f22HGItCRpc_600x450.jpg,"2011 jaguar XF premium - estate sale. Retired lady executive. Like new, garaged and maintained. Very nice leather, heated seats, electric sunroof, metallic blue paint. 85K miles bumper-to-bumper warranty. Premium radio sound system. Built-in phone connection. Please call show contact info cell or show contact info . Asking Price $9500",,nc,36.1032,-79.8794
)"};
59
// Three rows of stock records: unquoted fields that mix timestamps, long
// floating-point literals and non-ASCII (Chinese) text.
const Example stocks_example{
    3,
    R"(2,2010-01-27 00:00:00,002204,华锐铸钢,536498.0,135378.0,2652784.2001924426,14160629.45,5.382023337513902,5.288274712474071,5.382023337513902,5.341540976701248,,5.338025403262254,1.01364599,0.21306505690870553
3,2010-02-05 00:00:00,600266,北京城建,1122615.0,1122615.0,8102476.086666377,57695471.0,7.236029036381633,7.025270909108382,7.170459841229955,7.095523618199466,,7.120720923193468,2.3025570905818964,0.4683513939405588
4,2010-01-04 00:00:00,600289,亿阳信通,602926.359,602926.359,16393247.138998777,167754890.0,10.381817699665978,9.960037526145015,10.092597009251604,10.321563389162982,,10.233170315655089,4.436963485334562,0.6025431050299465
)"};
66
// Target row count for generated benchmark inputs: BuildCSVData() keeps
// appending copies of an example until at least this many rows were emitted.
static constexpr int32_t kNumRows{10000};
68
BuildCSVData(const Example & example)69 static std::string BuildCSVData(const Example& example) {
70 std::stringstream ss;
71 for (int32_t i = 0; i < kNumRows; i += example.num_rows) {
72 ss << example.csv_rows;
73 }
74 return ss.str();
75 }
76
BenchmarkCSVChunking(benchmark::State & state,const std::string & csv,ParseOptions options)77 static void BenchmarkCSVChunking(benchmark::State& state, // NOLINT non-const reference
78 const std::string& csv, ParseOptions options) {
79 auto chunker = MakeChunker(options);
80 auto block = std::make_shared<Buffer>(util::string_view(csv));
81
82 while (state.KeepRunning()) {
83 std::shared_ptr<Buffer> whole, partial;
84 ABORT_NOT_OK(chunker->Process(block, &whole, &partial));
85 benchmark::DoNotOptimize(whole->size());
86 }
87
88 state.SetBytesProcessed(state.iterations() * csv.length());
89 }
90
ChunkCSVQuotedBlock(benchmark::State & state)91 static void ChunkCSVQuotedBlock(benchmark::State& state) { // NOLINT non-const reference
92 auto csv = BuildCSVData(quoted_example);
93 auto options = ParseOptions::Defaults();
94 options.quoting = true;
95 options.escaping = false;
96 options.newlines_in_values = true;
97
98 BenchmarkCSVChunking(state, csv, options);
99 }
100
ChunkCSVEscapedBlock(benchmark::State & state)101 static void ChunkCSVEscapedBlock(benchmark::State& state) { // NOLINT non-const reference
102 auto csv = BuildCSVData(escaped_example);
103 auto options = ParseOptions::Defaults();
104 options.quoting = false;
105 options.escaping = true;
106 options.newlines_in_values = true;
107
108 BenchmarkCSVChunking(state, csv, options);
109 }
110
ChunkCSVNoNewlinesBlock(benchmark::State & state)111 static void ChunkCSVNoNewlinesBlock(
112 benchmark::State& state) { // NOLINT non-const reference
113 auto csv = BuildCSVData(escaped_example);
114 auto options = ParseOptions::Defaults();
115 options.quoting = true;
116 options.escaping = false;
117 options.newlines_in_values = false;
118
119 BenchmarkCSVChunking(state, csv, options);
120 // Provides better regression stability with timings rather than bogus
121 // bandwidth.
122 state.SetBytesProcessed(0);
123 }
124
BenchmarkCSVParsing(benchmark::State & state,const std::string & csv,int32_t num_rows,ParseOptions options)125 static void BenchmarkCSVParsing(benchmark::State& state, // NOLINT non-const reference
126 const std::string& csv, int32_t num_rows,
127 ParseOptions options) {
128 BlockParser parser(options, -1, num_rows + 1);
129
130 while (state.KeepRunning()) {
131 uint32_t parsed_size = 0;
132 ABORT_NOT_OK(parser.Parse(util::string_view(csv), &parsed_size));
133
134 // Include performance of visiting the parsed values, as that might
135 // vary depending on the parser's internal data structures.
136 bool dummy_quoted = false;
137 uint32_t dummy_size = 0;
138 auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) {
139 dummy_size += size;
140 dummy_quoted ^= quoted;
141 return Status::OK();
142 };
143 for (int32_t col = 0; col < parser.num_cols(); ++col) {
144 ABORT_NOT_OK(parser.VisitColumn(col, visit));
145 benchmark::DoNotOptimize(dummy_size);
146 benchmark::DoNotOptimize(dummy_quoted);
147 }
148 }
149
150 state.SetBytesProcessed(state.iterations() * csv.size());
151 }
152
BenchmarkCSVParsing(benchmark::State & state,const Example & example,ParseOptions options)153 static void BenchmarkCSVParsing(benchmark::State& state, // NOLINT non-const reference
154 const Example& example, ParseOptions options) {
155 auto csv = BuildCSVData(example);
156 BenchmarkCSVParsing(state, csv, kNumRows, options);
157 }
158
ParseCSVQuotedBlock(benchmark::State & state)159 static void ParseCSVQuotedBlock(benchmark::State& state) { // NOLINT non-const reference
160 auto options = ParseOptions::Defaults();
161 options.quoting = true;
162 options.escaping = false;
163
164 BenchmarkCSVParsing(state, quoted_example, options);
165 }
166
ParseCSVEscapedBlock(benchmark::State & state)167 static void ParseCSVEscapedBlock(benchmark::State& state) { // NOLINT non-const reference
168 auto options = ParseOptions::Defaults();
169 options.quoting = false;
170 options.escaping = true;
171
172 BenchmarkCSVParsing(state, escaped_example, options);
173 }
174
ParseCSVFlightsExample(benchmark::State & state)175 static void ParseCSVFlightsExample(
176 benchmark::State& state) { // NOLINT non-const reference
177 BenchmarkCSVParsing(state, flights_example, ParseOptions::Defaults());
178 }
179
ParseCSVVehiclesExample(benchmark::State & state)180 static void ParseCSVVehiclesExample(
181 benchmark::State& state) { // NOLINT non-const reference
182 auto options = ParseOptions::Defaults();
183 options.quoting = true;
184 options.escaping = false;
185
186 BenchmarkCSVParsing(state, vehicles_example, options);
187 }
188
ParseCSVStocksExample(benchmark::State & state)189 static void ParseCSVStocksExample(
190 benchmark::State& state) { // NOLINT non-const reference
191 BenchmarkCSVParsing(state, stocks_example, ParseOptions::Defaults());
192 }
193
// Register the chunking benchmarks with Google Benchmark.
BENCHMARK(ChunkCSVQuotedBlock);
BENCHMARK(ChunkCSVEscapedBlock);
BENCHMARK(ChunkCSVNoNewlinesBlock);

// Register the parsing benchmarks with Google Benchmark.
BENCHMARK(ParseCSVQuotedBlock);
BENCHMARK(ParseCSVEscapedBlock);
BENCHMARK(ParseCSVFlightsExample);
BENCHMARK(ParseCSVVehiclesExample);
BENCHMARK(ParseCSVStocksExample);
203
204 } // namespace csv
205 } // namespace arrow
206