1 #ifndef __BENCHMARKER_H
2 #define __BENCHMARKER_H
3 
4 #include "event_counter.h"
5 #include "simdjson.h" // For SIMDJSON_DISABLE_DEPRECATED_WARNINGS
6 
7 #include <cassert>
8 #include <cctype>
9 #ifndef _MSC_VER
10 #include <dirent.h>
11 #endif
12 #include <unistd.h>
13 #include <cinttypes>
14 
15 #include <cstdio>
16 #include <cstdlib>
17 #include <cstring>
18 
19 #include <algorithm>
20 #include <chrono>
21 #include <cstring>
22 #include <fstream>
23 #include <iomanip>
24 #include <iostream>
25 #include <map>
26 #include <set>
27 #include <sstream>
28 #include <string>
29 #include <vector>
30 
31 #include "linux-perf-events.h"
32 #ifdef __linux__
33 #include <libgen.h>
34 #endif
35 #include "simdjson.h"
36 
37 #include <functional>
38 
39 using namespace simdjson;
40 using std::cerr;
41 using std::cout;
42 using std::endl;
43 using std::string;
44 using std::to_string;
45 using std::vector;
46 using std::ostream;
47 using std::ofstream;
48 using std::exception;
49 using std::min;
50 using std::max;
51 
52 // Initialize "verbose" to go nowhere. We'll read options in main() and set to cout if verbose is true.
53 std::ofstream dev_null;
54 ostream *verbose_stream = &dev_null;
55 const size_t BYTES_PER_BLOCK = 64;
56 
verbose()57 ostream& verbose() {
58   return *verbose_stream;
59 }
60 
exit_error(string message)61 void exit_error(string message) {
62   cerr << message << endl;
63   exit(EXIT_FAILURE);
64 }
65 
66 struct json_stats {
67   size_t bytes = 0;
68   size_t blocks = 0;
69   size_t structurals = 0;
70   size_t blocks_with_utf8 = 0;
71   size_t blocks_with_utf8_flipped = 0;
72   size_t blocks_with_escapes = 0;
73   size_t blocks_with_escapes_flipped = 0;
74   size_t blocks_with_0_structurals = 0;
75   size_t blocks_with_0_structurals_flipped = 0;
76   size_t blocks_with_1_structural = 0;
77   size_t blocks_with_1_structural_flipped = 0;
78   size_t blocks_with_8_structurals = 0;
79   size_t blocks_with_8_structurals_flipped = 0;
80   size_t blocks_with_16_structurals = 0;
81   size_t blocks_with_16_structurals_flipped = 0;
82 
json_statsjson_stats83   json_stats(const padded_string& json, const dom::parser& parser) {
84     bytes = json.size();
85     blocks = bytes / BYTES_PER_BLOCK;
86     if (bytes % BYTES_PER_BLOCK > 0) { blocks++; } // Account for remainder block
87     structurals = parser.implementation->n_structural_indexes-1;
88 
89     // Calculate stats on blocks that will trigger utf-8 if statements / mispredictions
90     bool last_block_has_utf8 = false;
91     for (size_t block=0; block<blocks; block++) {
92       // Find utf-8 in the block
93       size_t block_start = block*BYTES_PER_BLOCK;
94       size_t block_end = block_start+BYTES_PER_BLOCK;
95       if (block_end > json.size()) { block_end = json.size(); }
96       bool block_has_utf8 = false;
97       for (size_t i=block_start; i<block_end; i++) {
98         if (json.data()[i] & 0x80) {
99           block_has_utf8 = true;
100           break;
101         }
102       }
103       if (block_has_utf8) {
104         blocks_with_utf8++;
105       }
106       if (block > 0 && last_block_has_utf8 != block_has_utf8) {
107         blocks_with_utf8_flipped++;
108       }
109       last_block_has_utf8 = block_has_utf8;
110     }
111 
112     // Calculate stats on blocks that will trigger escape if statements / mispredictions
113     bool last_block_has_escapes = false;
114     for (size_t block=0; block<blocks; block++) {
115       // Find utf-8 in the block
116       size_t block_start = block*BYTES_PER_BLOCK;
117       size_t block_end = block_start+BYTES_PER_BLOCK;
118       if (block_end > json.size()) { block_end = json.size(); }
119       bool block_has_escapes = false;
120       for (size_t i=block_start; i<block_end; i++) {
121         if (json.data()[i] == '\\') {
122           block_has_escapes = true;
123           break;
124         }
125       }
126       if (block_has_escapes) {
127         blocks_with_escapes++;
128       }
129       if (block > 0 && last_block_has_escapes != block_has_escapes) {
130         blocks_with_escapes_flipped++;
131       }
132       last_block_has_escapes = block_has_escapes;
133     }
134 
135     // Calculate stats on blocks that will trigger structural count if statements / mispredictions
136     bool last_block_has_0_structurals = false;
137     bool last_block_has_1_structural = false;
138     bool last_block_has_8_structurals = false;
139     bool last_block_has_16_structurals = false;
140     size_t structural=0;
141     for (size_t block=0; block<blocks; block++) {
142       // Count structurals in the block
143       int block_structurals=0;
144       while (structural < parser.implementation->n_structural_indexes && parser.implementation->structural_indexes[structural] < (block+1)*BYTES_PER_BLOCK) {
145         block_structurals++;
146         structural++;
147       }
148 
149       bool block_has_0_structurals = block_structurals == 0;
150       if (block_has_0_structurals) {
151         blocks_with_0_structurals++;
152       }
153       if (block > 0 && last_block_has_0_structurals != block_has_0_structurals) {
154         blocks_with_0_structurals_flipped++;
155       }
156       last_block_has_0_structurals = block_has_0_structurals;
157 
158       bool block_has_1_structural = block_structurals >= 1;
159       if (block_has_1_structural) {
160         blocks_with_1_structural++;
161       }
162       if (block > 0 && last_block_has_1_structural != block_has_1_structural) {
163         blocks_with_1_structural_flipped++;
164       }
165       last_block_has_1_structural = block_has_1_structural;
166 
167       bool block_has_8_structurals = block_structurals >= 8;
168       if (block_has_8_structurals) {
169         blocks_with_8_structurals++;
170       }
171       if (block > 0 && last_block_has_8_structurals != block_has_8_structurals) {
172         blocks_with_8_structurals_flipped++;
173       }
174       last_block_has_8_structurals = block_has_8_structurals;
175 
176       bool block_has_16_structurals = block_structurals >= 16;
177       if (block_has_16_structurals) {
178         blocks_with_16_structurals++;
179       }
180       if (block > 0 && last_block_has_16_structurals != block_has_16_structurals) {
181         blocks_with_16_structurals_flipped++;
182       }
183       last_block_has_16_structurals = block_has_16_structurals;
184     }
185   }
186 };
187 
188 struct progress_bar {
189   int max_value;
190   int total_ticks;
191   double ticks_per_value;
192   int next_tick;
progress_barprogress_bar193   progress_bar(int _max_value, int _total_ticks) : max_value(_max_value), total_ticks(_total_ticks), ticks_per_value(double(_total_ticks)/_max_value), next_tick(0) {
194     fprintf(stderr, "[");
195     for (int i=0;i<total_ticks;i++) {
196       fprintf(stderr, " ");
197     }
198     fprintf(stderr, "]");
199     for (int i=0;i<total_ticks+1;i++) {
200       fprintf(stderr, "\b");
201     }
202   }
203 
printprogress_bar204   void print(int value) {
205     double ticks = value*ticks_per_value;
206     if (ticks >= total_ticks) {
207       ticks = total_ticks-1;
208     }
209     int tick;
210     for (tick=next_tick; tick <= ticks && tick <= total_ticks; tick++) {
211       fprintf(stderr, "=");
212     }
213     next_tick = tick;
214   }
eraseprogress_bar215   void erase() const {
216     for (int i=0;i<next_tick+1;i++) {
217       fprintf(stderr, "\b");
218     }
219     for (int tick=0; tick<=total_ticks+2; tick++) {
220       fprintf(stderr, " ");
221     }
222     for (int tick=0; tick<=total_ticks+2; tick++) {
223       fprintf(stderr, "\b");
224     }
225   }
226 };
227 
228 /**
229  * The speed at which we can allocate memory is strictly system specific.
230  * It depends on the OS and the runtime library. It is subject to various
231  * system-specific knobs. It is not something that we can reasonably
232  * benchmark with crude timings.
233  * If someone wants to optimize how simdjson allocate memory, then it will
234  * almost surely require a distinct benchmarking tool. What is meant by
235  * "memory allocation" also requires a definition. Doing "new char[size]" can
236  * do many different things depending on the system.
237  */
238 
239 enum class BenchmarkStage {
240   ALL, // This excludes allocation
241   ALLOCATE,
242   STAGE1,
243   STAGE2
244 };
245 
benchmark_stage_name(BenchmarkStage stage)246 const char* benchmark_stage_name(BenchmarkStage stage) {
247   switch (stage) {
248     case BenchmarkStage::ALL: return "All (Without Allocation)";
249     case BenchmarkStage::ALLOCATE: return "Allocate";
250     case BenchmarkStage::STAGE1: return "Stage 1";
251     case BenchmarkStage::STAGE2: return "Stage 2";
252     default: return "Unknown";
253   }
254 }
255 
256 struct benchmarker {
257   // JSON text from loading the file. Owns the memory.
258   padded_string json{};
259   // JSON filename
260   const char *filename;
261   // Event collector that can be turned on to measure cycles, missed branches, etc.
262   event_collector& collector;
263 
264   // Statistics about the JSON file independent of its speed (amount of utf-8, structurals, etc.).
265   // Loaded on first parse.
266   json_stats* stats;
267   // Speed and event summary for full parse (stage 1 and stage 2, but *excluding* allocation)
268   event_aggregate all_stages_without_allocation{};
269   // Speed and event summary for stage 1
270   event_aggregate stage1{};
271   // Speed and event summary for stage 2
272   event_aggregate stage2{};
273   // Speed and event summary for allocation
274   event_aggregate allocate_stage{};
275   // Speed and event summary for the repeatly-parsing mode
276   event_aggregate loop{};
277 
benchmarkerbenchmarker278   benchmarker(const char *_filename, event_collector& _collector)
279     : filename(_filename), collector(_collector), stats(NULL) {
280     verbose() << "[verbose] loading " << filename << endl;
281     auto error = padded_string::load(filename).get(json);
282     if (error) {
283       exit_error(string("Could not load the file ") + filename);
284     }
285     verbose() << "[verbose] loaded " << filename << endl;
286   }
287 
~benchmarkerbenchmarker288   ~benchmarker() {
289     if (stats) {
290       delete stats;
291     }
292   }
293 
294   benchmarker(const benchmarker&) = delete;
295   benchmarker& operator=(const benchmarker&) = delete;
296 
297   const event_aggregate& operator[](BenchmarkStage stage) const {
298     switch (stage) {
299       case BenchmarkStage::ALL: return this->all_stages_without_allocation;
300       case BenchmarkStage::STAGE1: return this->stage1;
301       case BenchmarkStage::STAGE2: return this->stage2;
302       case BenchmarkStage::ALLOCATE: return this->allocate_stage;
303       default: exit_error("Unknown stage"); return this->all_stages_without_allocation;
304     }
305   }
306 
iterationsbenchmarker307   int iterations() const {
308     return all_stages_without_allocation.iterations;
309   }
310 
311   simdjson_really_inline void run_iteration(bool stage1_only, bool hotbuffers=false) {
312     // Allocate dom::parser
313     collector.start();
314     dom::parser parser;
315     // We always allocate at least 64KB. Smaller allocations may actually be slower under some systems.
316     error_code error = parser.allocate(json.size() < 65536 ? 65536 : json.size());
317     if (error) {
318       exit_error(string("Unable to allocate_stage ") + to_string(json.size()) + " bytes for the JSON text: " + error_message(error));
319     }
320     event_count allocate_count = collector.end();
321     allocate_stage << allocate_count;
322     // Run it once to get hot buffers
323     if(hotbuffers) {
324       auto result = parser.parse(reinterpret_cast<const uint8_t *>(json.data()), json.size());
325       if (result.error()) {
326         exit_error(string("Failed to parse ") + filename + string(":") + error_message(result.error()));
327       }
328     }
329 
330     verbose() << "[verbose] allocated memory for parsed JSON " << endl;
331 
332     // Stage 1 (find structurals)
333     collector.start();
334     error = parser.implementation->stage1(reinterpret_cast<const uint8_t *>(json.data()), json.size(), false);
335     event_count stage1_count = collector.end();
336     stage1 << stage1_count;
337     if (error) {
338       exit_error(string("Failed to parse ") + filename + " during stage 1: " + error_message(error));
339     }
340 
341     // Stage 2 (unified machine) and the rest
342 
343     if (stage1_only) {
344       all_stages_without_allocation << stage1_count;
345     } else {
346       event_count stage2_count;
347       collector.start();
348       error = parser.implementation->stage2(parser.doc);
349       if (error) {
350         exit_error(string("Failed to parse ") + filename + " during stage 2 parsing " + error_message(error));
351       }
352       stage2_count = collector.end();
353       stage2 << stage2_count;
354       all_stages_without_allocation << stage1_count + stage2_count;
355     }
356     // Calculate stats the first time we parse
357     if (stats == NULL) {
358       if (stage1_only) { //  we need stage 2 once
359         error = parser.implementation->stage2(parser.doc);
360         if (error) {
361           printf("Warning: failed to parse during stage 2. Unable to acquire statistics.\n");
362         }
363       }
364       stats = new json_stats(json, parser);
365     }
366   }
367 
run_loopbenchmarker368   void run_loop(size_t iterations) {
369     dom::parser parser;
370     auto firstresult = parser.parse(reinterpret_cast<const uint8_t *>(json.data()), json.size());
371     if (firstresult.error()) {
372       exit_error(string("Failed to parse ") + filename + string(":") + error_message(firstresult.error()));
373     }
374 
375     collector.start();
376     // some users want something closer to "number of documents per second"
377     for(size_t i = 0; i < iterations; i++) {
378       auto result = parser.parse(reinterpret_cast<const uint8_t *>(json.data()), json.size());
379       if (result.error()) {
380         exit_error(string("Failed to parse ") + filename + string(":") + error_message(result.error()));
381       }
382     }
383     event_count all_loop_count = collector.end();
384     loop << all_loop_count;
385   }
386 
387   simdjson_really_inline void run_iterations(size_t iterations, bool stage1_only, bool hotbuffers=false) {
388     for (size_t i = 0; i<iterations; i++) {
389       run_iteration(stage1_only, hotbuffers);
390     }
391     run_loop(iterations);
392   }
393 
394   // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte
395   template<typename T>
print_aggregatebenchmarker396   void print_aggregate(const char* prefix, const T& stage) const {
397     printf("%s%-13s: %8.4f ns per block (%6.2f%%) - %8.4f ns per byte - %8.4f ns per structural - %8.4f GB/s\n",
398       prefix,
399       "Speed",
400       stage.elapsed_ns() / static_cast<double>(stats->blocks), // per block
401       percent(stage.elapsed_sec(), all_stages_without_allocation.elapsed_sec()), // %
402       stage.elapsed_ns() / static_cast<double>(stats->bytes), // per byte
403       stage.elapsed_ns() / static_cast<double>(stats->structurals), // per structural
404       (static_cast<double>(json.size()) / 1000000000.0) / stage.elapsed_sec() // GB/s
405     );
406 
407     if (collector.has_events()) {
408       printf("%s%-13s: %8.4f per block    (%6.2f%%) - %8.4f per byte    - %8.4f per structural    - %8.3f GHz est. frequency\n",
409         prefix,
410         "Cycles",
411         stage.cycles() / static_cast<double>(stats->blocks),
412         percent(stage.cycles(), all_stages_without_allocation.cycles()),
413         stage.cycles() / static_cast<double>(stats->bytes),
414         stage.cycles() / static_cast<double>(stats->structurals),
415         (stage.cycles() / stage.elapsed_sec()) / 1000000000.0
416       );
417       printf("%s%-13s: %8.4f per block    (%6.2f%%) - %8.4f per byte    - %8.4f per structural    - %8.3f per cycle\n",
418         prefix,
419         "Instructions",
420         stage.instructions() / static_cast<double>(stats->blocks),
421         percent(stage.instructions(), all_stages_without_allocation.instructions()),
422         stage.instructions() / static_cast<double>(stats->bytes),
423         stage.instructions() / static_cast<double>(stats->structurals),
424         stage.instructions() / static_cast<double>(stage.cycles())
425       );
426 
427       // NOTE: removed cycles/miss because it is a somewhat misleading stat
428       printf("%s%-13s: %7.0f branch misses (%6.2f%%) - %.0f cache misses (%6.2f%%) - %.2f cache references\n",
429         prefix,
430         "Misses",
431         stage.branch_misses(),
432         percent(stage.branch_misses(), all_stages_without_allocation.branch_misses()),
433         stage.cache_misses(),
434         percent(stage.cache_misses(), all_stages_without_allocation.cache_misses()),
435         stage.cache_references()
436       );
437     }
438   }
439 
percentbenchmarker440   static double percent(size_t a, size_t b) {
441     return 100.0 * static_cast<double>(a) / static_cast<double>(b);
442   }
percentbenchmarker443   static double percent(double a, double b) {
444     return 100.0 * a / b;
445   }
446 
printbenchmarker447   void print(bool tabbed_output) const {
448     if (tabbed_output) {
449       char* filename_copy = reinterpret_cast<char*>(malloc(strlen(filename)+1));
450       SIMDJSON_PUSH_DISABLE_WARNINGS
451       SIMDJSON_DISABLE_DEPRECATED_WARNING // Validated CRT_SECURE safe here
452       strcpy(filename_copy, filename);
453       SIMDJSON_POP_DISABLE_WARNINGS
454 
455       #if defined(__linux__)
456       char* base = ::basename(filename_copy);
457       #else
458       char* base = filename_copy;
459       #endif
460       if (strlen(base) >= 5 && !strcmp(base+strlen(base)-5, ".json")) {
461         base[strlen(base)-5] = '\0';
462       }
463 
464       double gb = static_cast<double>(json.size()) / 1000000000.0;
465       if (collector.has_events()) {
466         printf("\"%s\"\t%f\t%f\t%f\t%f\t%f\t%f\t%f\n",
467                 base,
468                 allocate_stage.best.cycles() / static_cast<double>(json.size()),
469                 stage1.best.cycles() / static_cast<double>(json.size()),
470                 stage2.best.cycles() / static_cast<double>(json.size()),
471                 all_stages_without_allocation.best.cycles() / static_cast<double>(json.size()),
472                 gb / all_stages_without_allocation.best.elapsed_sec(),
473                 gb / stage1.best.elapsed_sec(),
474                 gb / stage2.best.elapsed_sec());
475       } else {
476         printf("\"%s\"\t\t\t\t\t%f\t%f\t%f\n",
477                 base,
478                 gb / all_stages_without_allocation.best.elapsed_sec(),
479                 gb / stage1.best.elapsed_sec(),
480                 gb / stage2.best.elapsed_sec());
481       }
482       free(filename_copy);
483     } else {
484       printf("\n");
485       printf("%s\n", filename);
486       printf("%s\n", string(strlen(filename), '=').c_str());
487       printf("%9zu blocks - %10zu bytes - %5zu structurals (%5.1f %%)\n", stats->bytes / BYTES_PER_BLOCK, stats->bytes, stats->structurals, percent(stats->structurals, stats->bytes));
488       if (stats) {
489         printf("special blocks with: utf8 %9zu (%5.1f %%) - escape %9zu (%5.1f %%) - 0 structurals %9zu (%5.1f %%) - 1+ structurals %9zu (%5.1f %%) - 8+ structurals %9zu (%5.1f %%) - 16+ structurals %9zu (%5.1f %%)\n",
490           stats->blocks_with_utf8, percent(stats->blocks_with_utf8, stats->blocks),
491           stats->blocks_with_escapes, percent(stats->blocks_with_escapes, stats->blocks),
492           stats->blocks_with_0_structurals, percent(stats->blocks_with_0_structurals, stats->blocks),
493           stats->blocks_with_1_structural, percent(stats->blocks_with_1_structural, stats->blocks),
494           stats->blocks_with_8_structurals, percent(stats->blocks_with_8_structurals, stats->blocks),
495           stats->blocks_with_16_structurals, percent(stats->blocks_with_16_structurals, stats->blocks));
496         printf("special block flips: utf8 %9zu (%5.1f %%) - escape %9zu (%5.1f %%) - 0 structurals %9zu (%5.1f %%) - 1+ structurals %9zu (%5.1f %%) - 8+ structurals %9zu (%5.1f %%) - 16+ structurals %9zu (%5.1f %%)\n",
497           stats->blocks_with_utf8_flipped, percent(stats->blocks_with_utf8_flipped, stats->blocks),
498           stats->blocks_with_escapes_flipped, percent(stats->blocks_with_escapes_flipped, stats->blocks),
499           stats->blocks_with_0_structurals_flipped, percent(stats->blocks_with_0_structurals_flipped, stats->blocks),
500           stats->blocks_with_1_structural_flipped, percent(stats->blocks_with_1_structural_flipped, stats->blocks),
501           stats->blocks_with_8_structurals_flipped, percent(stats->blocks_with_8_structurals_flipped, stats->blocks),
502           stats->blocks_with_16_structurals_flipped, percent(stats->blocks_with_16_structurals_flipped, stats->blocks));
503       }
504       printf("\n");
505       printf("All Stages (excluding allocation)\n");
506       print_aggregate("|    "   , all_stages_without_allocation.best);
507       // frequently, allocation is a tiny fraction of the running time so we omit it
508       if(allocate_stage.best.elapsed_sec() > 0.01 * all_stages_without_allocation.best.elapsed_sec()) {
509         printf("|- Allocation\n");
510         print_aggregate("|    ", allocate_stage.best);
511       }
512       printf("|- Stage 1\n");
513       print_aggregate("|    ", stage1.best);
514       printf("|- Stage 2\n");
515       print_aggregate("|    ", stage2.best);
516       if (collector.has_events()) {
517         double freq1 = (stage1.best.cycles() / stage1.best.elapsed_sec()) / 1000000000.0;
518         double freq2 = (stage2.best.cycles() / stage2.best.elapsed_sec()) / 1000000000.0;
519         double freqall = (all_stages_without_allocation.best.cycles() / all_stages_without_allocation.best.elapsed_sec()) / 1000000000.0;
520         double freqmin = min(freq1, freq2);
521         double freqmax = max(freq1, freq2);
522         if((freqall < 0.95 * freqmin) or (freqall > 1.05 * freqmax)) {
523           printf("\nWarning: The processor frequency fluctuates in an expected way!!!\n"
524           "Range for stage 1 and stage 2 : [%.3f GHz, %.3f GHz], overall: %.3f GHz.\n",
525           freqmin, freqmax, freqall);
526         }
527       }
528       printf("\n%.1f documents parsed per second (best)\n", 1.0/static_cast<double>(all_stages_without_allocation.best.elapsed_sec()));
529     }
530   }
531 };
532 
533 #endif
534