#include <algorithm>
#include <chrono>
#include <cstddef>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <map>
#include <sstream>
#include <utility>
#include <vector>

#include "simdjson.h"
8 
// Number of timed repetitions for the final best-of-N measurement.
constexpr int NB_ITERATION = 20;
// Range of parse_many() batch sizes (in bytes) swept by the per-batch test.
constexpr size_t MIN_BATCH_SIZE = 10000;
constexpr size_t MAX_BATCH_SIZE = 10000000;

// Manual toggles selecting which benchmark phases run.
bool test_baseline = false;   // getline + one dom::parser::parse per line
bool test_per_batch = true;   // sweep batch sizes, record best speed for each
bool test_best_batch = false; // re-run best-of-NB_ITERATION at the optimal size
16 
// Orders (batch_size, speed) pairs by descending speed, so feeding this to
// std::min_element yields the entry with the highest throughput.
bool compare(std::pair<size_t, double> i, std::pair<size_t, double> j) {
  const double lhs_speed = i.second;
  const double rhs_speed = j.second;
  return rhs_speed < lhs_speed;
}
20 
main(int argc,char * argv[])21 int main(int argc, char *argv[]) {
22 
23   if (argc <= 1) {
24     std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
25     exit(1);
26   }
27   const char *filename = argv[1];
28   auto v = simdjson::padded_string::load(filename);
29   if (v.error()) {
30     std::cerr << "Could not load the file " << filename << std::endl;
31     return EXIT_FAILURE;
32   }
33   const simdjson::padded_string& p = v.value_unsafe();
34   if (test_baseline) {
35     std::wclog << "Baseline: Getline + normal parse... " << std::endl;
36     std::cout << "Gigabytes/second\t"
37               << "Nb of documents parsed" << std::endl;
38     for (auto i = 0; i < 3; i++) {
39       // Actual test
40       simdjson::dom::parser parser;
41       simdjson::error_code alloc_error = parser.allocate(p.size());
42       if (alloc_error) {
43         std::cerr << alloc_error << std::endl;
44         return EXIT_FAILURE;
45       }
46       std::istringstream ss(std::string(p.data(), p.size()));
47 
48       auto start = std::chrono::steady_clock::now();
49       int count = 0;
50       std::string line;
51       int parse_res = simdjson::SUCCESS;
52       while (getline(ss, line)) {
53         // TODO we're likely triggering simdjson's padding reallocation here. Is
54         // that intentional?
55         parser.parse(line);
56         count++;
57       }
58 
59       auto end = std::chrono::steady_clock::now();
60 
61       std::chrono::duration<double> secs = end - start;
62       double speedinGBs = static_cast<double>(p.size()) /
63                           (static_cast<double>(secs.count()) * 1000000000.0);
64       std::cout << speedinGBs << "\t\t\t\t" << count << std::endl;
65 
66       if (parse_res != simdjson::SUCCESS) {
67         std::cerr << "Parsing failed" << std::endl;
68         exit(1);
69       }
70     }
71   }
72 
73   std::map<size_t, double> batch_size_res;
74   if (test_per_batch) {
75     std::wclog << "parse_many: Speed per batch_size... from " << MIN_BATCH_SIZE
76                << " bytes to " << MAX_BATCH_SIZE << " bytes..." << std::endl;
77     std::cout << "Batch Size\t"
78               << "Gigabytes/second\t"
79               << "Nb of documents parsed" << std::endl;
80     for (size_t i = MIN_BATCH_SIZE; i <= MAX_BATCH_SIZE;
81          i += (MAX_BATCH_SIZE - MIN_BATCH_SIZE) / 100) {
82       batch_size_res.insert(std::pair<size_t, double>(i, 0));
83       int count;
84       for (size_t j = 0; j < 5; j++) {
85         // Actual test
86         simdjson::dom::parser parser;
87         simdjson::error_code error;
88 
89         auto start = std::chrono::steady_clock::now();
90         count = 0;
91         simdjson::dom::document_stream docs;
92         if ((error = parser.parse_many(p, i).get(docs))) {
93           std::wcerr << "Parsing failed with: " << error << std::endl;
94           exit(1);
95         }
96         for (auto result : docs) {
97           error = result.error();
98           if (error) {
99             std::wcerr << "Parsing failed with: " << error << std::endl;
100             exit(1);
101           }
102           count++;
103         }
104         auto end = std::chrono::steady_clock::now();
105 
106         std::chrono::duration<double> secs = end - start;
107         double speedinGBs = static_cast<double>(p.size()) /
108                             (static_cast<double>(secs.count()) * 1000000000.0);
109         if (speedinGBs > batch_size_res.at(i))
110           batch_size_res[i] = speedinGBs;
111       }
112       std::cout << i << "\t\t" << std::fixed << std::setprecision(3)
113                 << batch_size_res.at(i) << "\t\t\t\t" << count << std::endl;
114     }
115   }
116   size_t optimal_batch_size{};
117   double best_speed{};
118   if (test_per_batch) {
119     std::pair<size_t, double> best_results;
120     best_results =
121         (*min_element(batch_size_res.begin(), batch_size_res.end(), compare));
122     optimal_batch_size = best_results.first;
123     best_speed = best_results.second;
124   } else {
125     optimal_batch_size = MIN_BATCH_SIZE;
126   }
127   std::wclog << "Seemingly optimal batch_size: " << optimal_batch_size << "..."
128              << std::endl;
129   std::wclog << "Best speed: " << best_speed << "..." << std::endl;
130 
131   if (test_best_batch) {
132     std::wclog << "Starting speed test... Best of " << NB_ITERATION
133                << " iterations..." << std::endl;
134     std::vector<double> res;
135     for (int i = 0; i < NB_ITERATION; i++) {
136 
137       // Actual test
138       simdjson::dom::parser parser;
139       simdjson::error_code error;
140 
141       auto start = std::chrono::steady_clock::now();
142       // This includes allocation of the parser
143       simdjson::dom::document_stream docs;
144       if ((error = parser.parse_many(p, optimal_batch_size).get(docs))) {
145         std::wcerr << "Parsing failed with: " << error << std::endl;
146         exit(1);
147       }
148       for (auto result : docs) {
149         error = result.error();
150         if (error) {
151           std::wcerr << "Parsing failed with: " << error << std::endl;
152           exit(1);
153         }
154       }
155       auto end = std::chrono::steady_clock::now();
156 
157       std::chrono::duration<double> secs = end - start;
158       res.push_back(secs.count());
159     }
160 
161     double min_result = *min_element(res.begin(), res.end());
162     double speedinGBs =
163         static_cast<double>(p.size()) / (min_result * 1000000000.0);
164 
165     std::cout << "Min:  " << min_result << " bytes read: " << p.size()
166               << " Gigabytes/second: " << speedinGBs << std::endl;
167   }
168 #ifdef SIMDJSON_THREADS_ENABLED
169   // Multithreading probably does not help matters for small files (less than 10
170   // MB).
171   if (p.size() < 10000000) {
172     std::cout << std::endl;
173 
174     std::cout << "Warning: your file is small and the performance results are "
175                  "probably meaningless"
176               << std::endl;
177     std::cout << "as far as multithreaded performance goes." << std::endl;
178 
179     std::cout << std::endl;
180 
181     std::cout
182         << "Try to concatenate the file with itself to generate a large one."
183         << std::endl;
184     std::cout << "In bash: " << std::endl;
185     std::cout << "for i in {1..1000}; do cat '" << filename
186               << "' >> bar.ndjson; done" << std::endl;
187     std::cout << argv[0] << " bar.ndjson" << std::endl;
188   }
189 #endif
190 
191   return 0;
192 }
193