#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <map>
#include <sstream>
#include <utility>
#include <vector>

#include "simdjson.h"
8
// Number of timed repetitions for the "best batch size" speed test.
#define NB_ITERATION 20
// Batch-size sweep range (in bytes) for the per-batch benchmark.
#define MIN_BATCH_SIZE 10000
#define MAX_BATCH_SIZE 10000000

// Compile-time toggles selecting which benchmark phases run in main().
bool test_baseline = false;
bool test_per_batch = true;
bool test_best_batch = false;
16
/// Orders (batch_size, speed) pairs by *descending* speed, so that
/// std::min_element with this comparator yields the fastest entry.
bool compare(std::pair<size_t, double> i, std::pair<size_t, double> j) {
  const double lhs_speed = i.second;
  const double rhs_speed = j.second;
  return rhs_speed < lhs_speed;
}
20
main(int argc,char * argv[])21 int main(int argc, char *argv[]) {
22
23 if (argc <= 1) {
24 std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
25 exit(1);
26 }
27 const char *filename = argv[1];
28 auto v = simdjson::padded_string::load(filename);
29 if (v.error()) {
30 std::cerr << "Could not load the file " << filename << std::endl;
31 return EXIT_FAILURE;
32 }
33 const simdjson::padded_string& p = v.value_unsafe();
34 if (test_baseline) {
35 std::wclog << "Baseline: Getline + normal parse... " << std::endl;
36 std::cout << "Gigabytes/second\t"
37 << "Nb of documents parsed" << std::endl;
38 for (auto i = 0; i < 3; i++) {
39 // Actual test
40 simdjson::dom::parser parser;
41 simdjson::error_code alloc_error = parser.allocate(p.size());
42 if (alloc_error) {
43 std::cerr << alloc_error << std::endl;
44 return EXIT_FAILURE;
45 }
46 std::istringstream ss(std::string(p.data(), p.size()));
47
48 auto start = std::chrono::steady_clock::now();
49 int count = 0;
50 std::string line;
51 int parse_res = simdjson::SUCCESS;
52 while (getline(ss, line)) {
53 // TODO we're likely triggering simdjson's padding reallocation here. Is
54 // that intentional?
55 parser.parse(line);
56 count++;
57 }
58
59 auto end = std::chrono::steady_clock::now();
60
61 std::chrono::duration<double> secs = end - start;
62 double speedinGBs = static_cast<double>(p.size()) /
63 (static_cast<double>(secs.count()) * 1000000000.0);
64 std::cout << speedinGBs << "\t\t\t\t" << count << std::endl;
65
66 if (parse_res != simdjson::SUCCESS) {
67 std::cerr << "Parsing failed" << std::endl;
68 exit(1);
69 }
70 }
71 }
72
73 std::map<size_t, double> batch_size_res;
74 if (test_per_batch) {
75 std::wclog << "parse_many: Speed per batch_size... from " << MIN_BATCH_SIZE
76 << " bytes to " << MAX_BATCH_SIZE << " bytes..." << std::endl;
77 std::cout << "Batch Size\t"
78 << "Gigabytes/second\t"
79 << "Nb of documents parsed" << std::endl;
80 for (size_t i = MIN_BATCH_SIZE; i <= MAX_BATCH_SIZE;
81 i += (MAX_BATCH_SIZE - MIN_BATCH_SIZE) / 100) {
82 batch_size_res.insert(std::pair<size_t, double>(i, 0));
83 int count;
84 for (size_t j = 0; j < 5; j++) {
85 // Actual test
86 simdjson::dom::parser parser;
87 simdjson::error_code error;
88
89 auto start = std::chrono::steady_clock::now();
90 count = 0;
91 simdjson::dom::document_stream docs;
92 if ((error = parser.parse_many(p, i).get(docs))) {
93 std::wcerr << "Parsing failed with: " << error << std::endl;
94 exit(1);
95 }
96 for (auto result : docs) {
97 error = result.error();
98 if (error) {
99 std::wcerr << "Parsing failed with: " << error << std::endl;
100 exit(1);
101 }
102 count++;
103 }
104 auto end = std::chrono::steady_clock::now();
105
106 std::chrono::duration<double> secs = end - start;
107 double speedinGBs = static_cast<double>(p.size()) /
108 (static_cast<double>(secs.count()) * 1000000000.0);
109 if (speedinGBs > batch_size_res.at(i))
110 batch_size_res[i] = speedinGBs;
111 }
112 std::cout << i << "\t\t" << std::fixed << std::setprecision(3)
113 << batch_size_res.at(i) << "\t\t\t\t" << count << std::endl;
114 }
115 }
116 size_t optimal_batch_size{};
117 double best_speed{};
118 if (test_per_batch) {
119 std::pair<size_t, double> best_results;
120 best_results =
121 (*min_element(batch_size_res.begin(), batch_size_res.end(), compare));
122 optimal_batch_size = best_results.first;
123 best_speed = best_results.second;
124 } else {
125 optimal_batch_size = MIN_BATCH_SIZE;
126 }
127 std::wclog << "Seemingly optimal batch_size: " << optimal_batch_size << "..."
128 << std::endl;
129 std::wclog << "Best speed: " << best_speed << "..." << std::endl;
130
131 if (test_best_batch) {
132 std::wclog << "Starting speed test... Best of " << NB_ITERATION
133 << " iterations..." << std::endl;
134 std::vector<double> res;
135 for (int i = 0; i < NB_ITERATION; i++) {
136
137 // Actual test
138 simdjson::dom::parser parser;
139 simdjson::error_code error;
140
141 auto start = std::chrono::steady_clock::now();
142 // This includes allocation of the parser
143 simdjson::dom::document_stream docs;
144 if ((error = parser.parse_many(p, optimal_batch_size).get(docs))) {
145 std::wcerr << "Parsing failed with: " << error << std::endl;
146 exit(1);
147 }
148 for (auto result : docs) {
149 error = result.error();
150 if (error) {
151 std::wcerr << "Parsing failed with: " << error << std::endl;
152 exit(1);
153 }
154 }
155 auto end = std::chrono::steady_clock::now();
156
157 std::chrono::duration<double> secs = end - start;
158 res.push_back(secs.count());
159 }
160
161 double min_result = *min_element(res.begin(), res.end());
162 double speedinGBs =
163 static_cast<double>(p.size()) / (min_result * 1000000000.0);
164
165 std::cout << "Min: " << min_result << " bytes read: " << p.size()
166 << " Gigabytes/second: " << speedinGBs << std::endl;
167 }
168 #ifdef SIMDJSON_THREADS_ENABLED
169 // Multithreading probably does not help matters for small files (less than 10
170 // MB).
171 if (p.size() < 10000000) {
172 std::cout << std::endl;
173
174 std::cout << "Warning: your file is small and the performance results are "
175 "probably meaningless"
176 << std::endl;
177 std::cout << "as far as multithreaded performance goes." << std::endl;
178
179 std::cout << std::endl;
180
181 std::cout
182 << "Try to concatenate the file with itself to generate a large one."
183 << std::endl;
184 std::cout << "In bash: " << std::endl;
185 std::cout << "for i in {1..1000}; do cat '" << filename
186 << "' >> bar.ndjson; done" << std::endl;
187 std::cout << argv[0] << " bar.ndjson" << std::endl;
188 }
189 #endif
190
191 return 0;
192 }
193