1 #ifndef __BENCHMARKER_H
2 #define __BENCHMARKER_H
3
4 #include "event_counter.h"
5 #include "simdjson.h" // For SIMDJSON_DISABLE_DEPRECATED_WARNINGS
6
7 #include <cassert>
8 #include <cctype>
9 #ifndef _MSC_VER
10 #include <dirent.h>
11 #endif
12 #include <unistd.h>
13 #include <cinttypes>
14
15 #include <cstdio>
16 #include <cstdlib>
17 #include <cstring>
18
19 #include <algorithm>
20 #include <chrono>
21 #include <cstring>
22 #include <fstream>
23 #include <iomanip>
24 #include <iostream>
25 #include <map>
26 #include <set>
27 #include <sstream>
28 #include <string>
29 #include <vector>
30
31 #include "linux-perf-events.h"
32 #ifdef __linux__
33 #include <libgen.h>
34 #endif
35 #include "simdjson.h"
36
37 #include <functional>
38
39 using namespace simdjson;
40 using std::cerr;
41 using std::cout;
42 using std::endl;
43 using std::string;
44 using std::to_string;
45 using std::vector;
46 using std::ostream;
47 using std::ofstream;
48 using std::exception;
49 using std::min;
50 using std::max;
51
52 // Initialize "verbose" to go nowhere. We'll read options in main() and set to cout if verbose is true.
53 std::ofstream dev_null;
54 ostream *verbose_stream = &dev_null;
55 const size_t BYTES_PER_BLOCK = 64;
56
verbose()57 ostream& verbose() {
58 return *verbose_stream;
59 }
60
exit_error(string message)61 void exit_error(string message) {
62 cerr << message << endl;
63 exit(EXIT_FAILURE);
64 }
65
66 struct json_stats {
67 size_t bytes = 0;
68 size_t blocks = 0;
69 size_t structurals = 0;
70 size_t blocks_with_utf8 = 0;
71 size_t blocks_with_utf8_flipped = 0;
72 size_t blocks_with_escapes = 0;
73 size_t blocks_with_escapes_flipped = 0;
74 size_t blocks_with_0_structurals = 0;
75 size_t blocks_with_0_structurals_flipped = 0;
76 size_t blocks_with_1_structural = 0;
77 size_t blocks_with_1_structural_flipped = 0;
78 size_t blocks_with_8_structurals = 0;
79 size_t blocks_with_8_structurals_flipped = 0;
80 size_t blocks_with_16_structurals = 0;
81 size_t blocks_with_16_structurals_flipped = 0;
82
json_statsjson_stats83 json_stats(const padded_string& json, const dom::parser& parser) {
84 bytes = json.size();
85 blocks = bytes / BYTES_PER_BLOCK;
86 if (bytes % BYTES_PER_BLOCK > 0) { blocks++; } // Account for remainder block
87 structurals = parser.implementation->n_structural_indexes-1;
88
89 // Calculate stats on blocks that will trigger utf-8 if statements / mispredictions
90 bool last_block_has_utf8 = false;
91 for (size_t block=0; block<blocks; block++) {
92 // Find utf-8 in the block
93 size_t block_start = block*BYTES_PER_BLOCK;
94 size_t block_end = block_start+BYTES_PER_BLOCK;
95 if (block_end > json.size()) { block_end = json.size(); }
96 bool block_has_utf8 = false;
97 for (size_t i=block_start; i<block_end; i++) {
98 if (json.data()[i] & 0x80) {
99 block_has_utf8 = true;
100 break;
101 }
102 }
103 if (block_has_utf8) {
104 blocks_with_utf8++;
105 }
106 if (block > 0 && last_block_has_utf8 != block_has_utf8) {
107 blocks_with_utf8_flipped++;
108 }
109 last_block_has_utf8 = block_has_utf8;
110 }
111
112 // Calculate stats on blocks that will trigger escape if statements / mispredictions
113 bool last_block_has_escapes = false;
114 for (size_t block=0; block<blocks; block++) {
115 // Find utf-8 in the block
116 size_t block_start = block*BYTES_PER_BLOCK;
117 size_t block_end = block_start+BYTES_PER_BLOCK;
118 if (block_end > json.size()) { block_end = json.size(); }
119 bool block_has_escapes = false;
120 for (size_t i=block_start; i<block_end; i++) {
121 if (json.data()[i] == '\\') {
122 block_has_escapes = true;
123 break;
124 }
125 }
126 if (block_has_escapes) {
127 blocks_with_escapes++;
128 }
129 if (block > 0 && last_block_has_escapes != block_has_escapes) {
130 blocks_with_escapes_flipped++;
131 }
132 last_block_has_escapes = block_has_escapes;
133 }
134
135 // Calculate stats on blocks that will trigger structural count if statements / mispredictions
136 bool last_block_has_0_structurals = false;
137 bool last_block_has_1_structural = false;
138 bool last_block_has_8_structurals = false;
139 bool last_block_has_16_structurals = false;
140 size_t structural=0;
141 for (size_t block=0; block<blocks; block++) {
142 // Count structurals in the block
143 int block_structurals=0;
144 while (structural < parser.implementation->n_structural_indexes && parser.implementation->structural_indexes[structural] < (block+1)*BYTES_PER_BLOCK) {
145 block_structurals++;
146 structural++;
147 }
148
149 bool block_has_0_structurals = block_structurals == 0;
150 if (block_has_0_structurals) {
151 blocks_with_0_structurals++;
152 }
153 if (block > 0 && last_block_has_0_structurals != block_has_0_structurals) {
154 blocks_with_0_structurals_flipped++;
155 }
156 last_block_has_0_structurals = block_has_0_structurals;
157
158 bool block_has_1_structural = block_structurals >= 1;
159 if (block_has_1_structural) {
160 blocks_with_1_structural++;
161 }
162 if (block > 0 && last_block_has_1_structural != block_has_1_structural) {
163 blocks_with_1_structural_flipped++;
164 }
165 last_block_has_1_structural = block_has_1_structural;
166
167 bool block_has_8_structurals = block_structurals >= 8;
168 if (block_has_8_structurals) {
169 blocks_with_8_structurals++;
170 }
171 if (block > 0 && last_block_has_8_structurals != block_has_8_structurals) {
172 blocks_with_8_structurals_flipped++;
173 }
174 last_block_has_8_structurals = block_has_8_structurals;
175
176 bool block_has_16_structurals = block_structurals >= 16;
177 if (block_has_16_structurals) {
178 blocks_with_16_structurals++;
179 }
180 if (block > 0 && last_block_has_16_structurals != block_has_16_structurals) {
181 blocks_with_16_structurals_flipped++;
182 }
183 last_block_has_16_structurals = block_has_16_structurals;
184 }
185 }
186 };
187
188 struct progress_bar {
189 int max_value;
190 int total_ticks;
191 double ticks_per_value;
192 int next_tick;
progress_barprogress_bar193 progress_bar(int _max_value, int _total_ticks) : max_value(_max_value), total_ticks(_total_ticks), ticks_per_value(double(_total_ticks)/_max_value), next_tick(0) {
194 fprintf(stderr, "[");
195 for (int i=0;i<total_ticks;i++) {
196 fprintf(stderr, " ");
197 }
198 fprintf(stderr, "]");
199 for (int i=0;i<total_ticks+1;i++) {
200 fprintf(stderr, "\b");
201 }
202 }
203
printprogress_bar204 void print(int value) {
205 double ticks = value*ticks_per_value;
206 if (ticks >= total_ticks) {
207 ticks = total_ticks-1;
208 }
209 int tick;
210 for (tick=next_tick; tick <= ticks && tick <= total_ticks; tick++) {
211 fprintf(stderr, "=");
212 }
213 next_tick = tick;
214 }
eraseprogress_bar215 void erase() const {
216 for (int i=0;i<next_tick+1;i++) {
217 fprintf(stderr, "\b");
218 }
219 for (int tick=0; tick<=total_ticks+2; tick++) {
220 fprintf(stderr, " ");
221 }
222 for (int tick=0; tick<=total_ticks+2; tick++) {
223 fprintf(stderr, "\b");
224 }
225 }
226 };
227
228 /**
229 * The speed at which we can allocate memory is strictly system specific.
230 * It depends on the OS and the runtime library. It is subject to various
231 * system-specific knobs. It is not something that we can reasonably
232 * benchmark with crude timings.
233 * If someone wants to optimize how simdjson allocate memory, then it will
234 * almost surely require a distinct benchmarking tool. What is meant by
235 * "memory allocation" also requires a definition. Doing "new char[size]" can
236 * do many different things depending on the system.
237 */
238
239 enum class BenchmarkStage {
240 ALL, // This excludes allocation
241 ALLOCATE,
242 STAGE1,
243 STAGE2
244 };
245
benchmark_stage_name(BenchmarkStage stage)246 const char* benchmark_stage_name(BenchmarkStage stage) {
247 switch (stage) {
248 case BenchmarkStage::ALL: return "All (Without Allocation)";
249 case BenchmarkStage::ALLOCATE: return "Allocate";
250 case BenchmarkStage::STAGE1: return "Stage 1";
251 case BenchmarkStage::STAGE2: return "Stage 2";
252 default: return "Unknown";
253 }
254 }
255
256 struct benchmarker {
257 // JSON text from loading the file. Owns the memory.
258 padded_string json{};
259 // JSON filename
260 const char *filename;
261 // Event collector that can be turned on to measure cycles, missed branches, etc.
262 event_collector& collector;
263
264 // Statistics about the JSON file independent of its speed (amount of utf-8, structurals, etc.).
265 // Loaded on first parse.
266 json_stats* stats;
267 // Speed and event summary for full parse (stage 1 and stage 2, but *excluding* allocation)
268 event_aggregate all_stages_without_allocation{};
269 // Speed and event summary for stage 1
270 event_aggregate stage1{};
271 // Speed and event summary for stage 2
272 event_aggregate stage2{};
273 // Speed and event summary for allocation
274 event_aggregate allocate_stage{};
275 // Speed and event summary for the repeatly-parsing mode
276 event_aggregate loop{};
277
benchmarkerbenchmarker278 benchmarker(const char *_filename, event_collector& _collector)
279 : filename(_filename), collector(_collector), stats(NULL) {
280 verbose() << "[verbose] loading " << filename << endl;
281 auto error = padded_string::load(filename).get(json);
282 if (error) {
283 exit_error(string("Could not load the file ") + filename);
284 }
285 verbose() << "[verbose] loaded " << filename << endl;
286 }
287
~benchmarkerbenchmarker288 ~benchmarker() {
289 if (stats) {
290 delete stats;
291 }
292 }
293
294 benchmarker(const benchmarker&) = delete;
295 benchmarker& operator=(const benchmarker&) = delete;
296
297 const event_aggregate& operator[](BenchmarkStage stage) const {
298 switch (stage) {
299 case BenchmarkStage::ALL: return this->all_stages_without_allocation;
300 case BenchmarkStage::STAGE1: return this->stage1;
301 case BenchmarkStage::STAGE2: return this->stage2;
302 case BenchmarkStage::ALLOCATE: return this->allocate_stage;
303 default: exit_error("Unknown stage"); return this->all_stages_without_allocation;
304 }
305 }
306
iterationsbenchmarker307 int iterations() const {
308 return all_stages_without_allocation.iterations;
309 }
310
311 simdjson_really_inline void run_iteration(bool stage1_only, bool hotbuffers=false) {
312 // Allocate dom::parser
313 collector.start();
314 dom::parser parser;
315 // We always allocate at least 64KB. Smaller allocations may actually be slower under some systems.
316 error_code error = parser.allocate(json.size() < 65536 ? 65536 : json.size());
317 if (error) {
318 exit_error(string("Unable to allocate_stage ") + to_string(json.size()) + " bytes for the JSON text: " + error_message(error));
319 }
320 event_count allocate_count = collector.end();
321 allocate_stage << allocate_count;
322 // Run it once to get hot buffers
323 if(hotbuffers) {
324 auto result = parser.parse(reinterpret_cast<const uint8_t *>(json.data()), json.size());
325 if (result.error()) {
326 exit_error(string("Failed to parse ") + filename + string(":") + error_message(result.error()));
327 }
328 }
329
330 verbose() << "[verbose] allocated memory for parsed JSON " << endl;
331
332 // Stage 1 (find structurals)
333 collector.start();
334 error = parser.implementation->stage1(reinterpret_cast<const uint8_t *>(json.data()), json.size(), false);
335 event_count stage1_count = collector.end();
336 stage1 << stage1_count;
337 if (error) {
338 exit_error(string("Failed to parse ") + filename + " during stage 1: " + error_message(error));
339 }
340
341 // Stage 2 (unified machine) and the rest
342
343 if (stage1_only) {
344 all_stages_without_allocation << stage1_count;
345 } else {
346 event_count stage2_count;
347 collector.start();
348 error = parser.implementation->stage2(parser.doc);
349 if (error) {
350 exit_error(string("Failed to parse ") + filename + " during stage 2 parsing " + error_message(error));
351 }
352 stage2_count = collector.end();
353 stage2 << stage2_count;
354 all_stages_without_allocation << stage1_count + stage2_count;
355 }
356 // Calculate stats the first time we parse
357 if (stats == NULL) {
358 if (stage1_only) { // we need stage 2 once
359 error = parser.implementation->stage2(parser.doc);
360 if (error) {
361 printf("Warning: failed to parse during stage 2. Unable to acquire statistics.\n");
362 }
363 }
364 stats = new json_stats(json, parser);
365 }
366 }
367
run_loopbenchmarker368 void run_loop(size_t iterations) {
369 dom::parser parser;
370 auto firstresult = parser.parse(reinterpret_cast<const uint8_t *>(json.data()), json.size());
371 if (firstresult.error()) {
372 exit_error(string("Failed to parse ") + filename + string(":") + error_message(firstresult.error()));
373 }
374
375 collector.start();
376 // some users want something closer to "number of documents per second"
377 for(size_t i = 0; i < iterations; i++) {
378 auto result = parser.parse(reinterpret_cast<const uint8_t *>(json.data()), json.size());
379 if (result.error()) {
380 exit_error(string("Failed to parse ") + filename + string(":") + error_message(result.error()));
381 }
382 }
383 event_count all_loop_count = collector.end();
384 loop << all_loop_count;
385 }
386
387 simdjson_really_inline void run_iterations(size_t iterations, bool stage1_only, bool hotbuffers=false) {
388 for (size_t i = 0; i<iterations; i++) {
389 run_iteration(stage1_only, hotbuffers);
390 }
391 run_loop(iterations);
392 }
393
394 // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte
395 template<typename T>
print_aggregatebenchmarker396 void print_aggregate(const char* prefix, const T& stage) const {
397 printf("%s%-13s: %8.4f ns per block (%6.2f%%) - %8.4f ns per byte - %8.4f ns per structural - %8.4f GB/s\n",
398 prefix,
399 "Speed",
400 stage.elapsed_ns() / static_cast<double>(stats->blocks), // per block
401 percent(stage.elapsed_sec(), all_stages_without_allocation.elapsed_sec()), // %
402 stage.elapsed_ns() / static_cast<double>(stats->bytes), // per byte
403 stage.elapsed_ns() / static_cast<double>(stats->structurals), // per structural
404 (static_cast<double>(json.size()) / 1000000000.0) / stage.elapsed_sec() // GB/s
405 );
406
407 if (collector.has_events()) {
408 printf("%s%-13s: %8.4f per block (%6.2f%%) - %8.4f per byte - %8.4f per structural - %8.3f GHz est. frequency\n",
409 prefix,
410 "Cycles",
411 stage.cycles() / static_cast<double>(stats->blocks),
412 percent(stage.cycles(), all_stages_without_allocation.cycles()),
413 stage.cycles() / static_cast<double>(stats->bytes),
414 stage.cycles() / static_cast<double>(stats->structurals),
415 (stage.cycles() / stage.elapsed_sec()) / 1000000000.0
416 );
417 printf("%s%-13s: %8.4f per block (%6.2f%%) - %8.4f per byte - %8.4f per structural - %8.3f per cycle\n",
418 prefix,
419 "Instructions",
420 stage.instructions() / static_cast<double>(stats->blocks),
421 percent(stage.instructions(), all_stages_without_allocation.instructions()),
422 stage.instructions() / static_cast<double>(stats->bytes),
423 stage.instructions() / static_cast<double>(stats->structurals),
424 stage.instructions() / static_cast<double>(stage.cycles())
425 );
426
427 // NOTE: removed cycles/miss because it is a somewhat misleading stat
428 printf("%s%-13s: %7.0f branch misses (%6.2f%%) - %.0f cache misses (%6.2f%%) - %.2f cache references\n",
429 prefix,
430 "Misses",
431 stage.branch_misses(),
432 percent(stage.branch_misses(), all_stages_without_allocation.branch_misses()),
433 stage.cache_misses(),
434 percent(stage.cache_misses(), all_stages_without_allocation.cache_misses()),
435 stage.cache_references()
436 );
437 }
438 }
439
percentbenchmarker440 static double percent(size_t a, size_t b) {
441 return 100.0 * static_cast<double>(a) / static_cast<double>(b);
442 }
percentbenchmarker443 static double percent(double a, double b) {
444 return 100.0 * a / b;
445 }
446
printbenchmarker447 void print(bool tabbed_output) const {
448 if (tabbed_output) {
449 char* filename_copy = reinterpret_cast<char*>(malloc(strlen(filename)+1));
450 SIMDJSON_PUSH_DISABLE_WARNINGS
451 SIMDJSON_DISABLE_DEPRECATED_WARNING // Validated CRT_SECURE safe here
452 strcpy(filename_copy, filename);
453 SIMDJSON_POP_DISABLE_WARNINGS
454
455 #if defined(__linux__)
456 char* base = ::basename(filename_copy);
457 #else
458 char* base = filename_copy;
459 #endif
460 if (strlen(base) >= 5 && !strcmp(base+strlen(base)-5, ".json")) {
461 base[strlen(base)-5] = '\0';
462 }
463
464 double gb = static_cast<double>(json.size()) / 1000000000.0;
465 if (collector.has_events()) {
466 printf("\"%s\"\t%f\t%f\t%f\t%f\t%f\t%f\t%f\n",
467 base,
468 allocate_stage.best.cycles() / static_cast<double>(json.size()),
469 stage1.best.cycles() / static_cast<double>(json.size()),
470 stage2.best.cycles() / static_cast<double>(json.size()),
471 all_stages_without_allocation.best.cycles() / static_cast<double>(json.size()),
472 gb / all_stages_without_allocation.best.elapsed_sec(),
473 gb / stage1.best.elapsed_sec(),
474 gb / stage2.best.elapsed_sec());
475 } else {
476 printf("\"%s\"\t\t\t\t\t%f\t%f\t%f\n",
477 base,
478 gb / all_stages_without_allocation.best.elapsed_sec(),
479 gb / stage1.best.elapsed_sec(),
480 gb / stage2.best.elapsed_sec());
481 }
482 free(filename_copy);
483 } else {
484 printf("\n");
485 printf("%s\n", filename);
486 printf("%s\n", string(strlen(filename), '=').c_str());
487 printf("%9zu blocks - %10zu bytes - %5zu structurals (%5.1f %%)\n", stats->bytes / BYTES_PER_BLOCK, stats->bytes, stats->structurals, percent(stats->structurals, stats->bytes));
488 if (stats) {
489 printf("special blocks with: utf8 %9zu (%5.1f %%) - escape %9zu (%5.1f %%) - 0 structurals %9zu (%5.1f %%) - 1+ structurals %9zu (%5.1f %%) - 8+ structurals %9zu (%5.1f %%) - 16+ structurals %9zu (%5.1f %%)\n",
490 stats->blocks_with_utf8, percent(stats->blocks_with_utf8, stats->blocks),
491 stats->blocks_with_escapes, percent(stats->blocks_with_escapes, stats->blocks),
492 stats->blocks_with_0_structurals, percent(stats->blocks_with_0_structurals, stats->blocks),
493 stats->blocks_with_1_structural, percent(stats->blocks_with_1_structural, stats->blocks),
494 stats->blocks_with_8_structurals, percent(stats->blocks_with_8_structurals, stats->blocks),
495 stats->blocks_with_16_structurals, percent(stats->blocks_with_16_structurals, stats->blocks));
496 printf("special block flips: utf8 %9zu (%5.1f %%) - escape %9zu (%5.1f %%) - 0 structurals %9zu (%5.1f %%) - 1+ structurals %9zu (%5.1f %%) - 8+ structurals %9zu (%5.1f %%) - 16+ structurals %9zu (%5.1f %%)\n",
497 stats->blocks_with_utf8_flipped, percent(stats->blocks_with_utf8_flipped, stats->blocks),
498 stats->blocks_with_escapes_flipped, percent(stats->blocks_with_escapes_flipped, stats->blocks),
499 stats->blocks_with_0_structurals_flipped, percent(stats->blocks_with_0_structurals_flipped, stats->blocks),
500 stats->blocks_with_1_structural_flipped, percent(stats->blocks_with_1_structural_flipped, stats->blocks),
501 stats->blocks_with_8_structurals_flipped, percent(stats->blocks_with_8_structurals_flipped, stats->blocks),
502 stats->blocks_with_16_structurals_flipped, percent(stats->blocks_with_16_structurals_flipped, stats->blocks));
503 }
504 printf("\n");
505 printf("All Stages (excluding allocation)\n");
506 print_aggregate("| " , all_stages_without_allocation.best);
507 // frequently, allocation is a tiny fraction of the running time so we omit it
508 if(allocate_stage.best.elapsed_sec() > 0.01 * all_stages_without_allocation.best.elapsed_sec()) {
509 printf("|- Allocation\n");
510 print_aggregate("| ", allocate_stage.best);
511 }
512 printf("|- Stage 1\n");
513 print_aggregate("| ", stage1.best);
514 printf("|- Stage 2\n");
515 print_aggregate("| ", stage2.best);
516 if (collector.has_events()) {
517 double freq1 = (stage1.best.cycles() / stage1.best.elapsed_sec()) / 1000000000.0;
518 double freq2 = (stage2.best.cycles() / stage2.best.elapsed_sec()) / 1000000000.0;
519 double freqall = (all_stages_without_allocation.best.cycles() / all_stages_without_allocation.best.elapsed_sec()) / 1000000000.0;
520 double freqmin = min(freq1, freq2);
521 double freqmax = max(freq1, freq2);
522 if((freqall < 0.95 * freqmin) or (freqall > 1.05 * freqmax)) {
523 printf("\nWarning: The processor frequency fluctuates in an expected way!!!\n"
524 "Range for stage 1 and stage 2 : [%.3f GHz, %.3f GHz], overall: %.3f GHz.\n",
525 freqmin, freqmax, freqall);
526 }
527 }
528 printf("\n%.1f documents parsed per second (best)\n", 1.0/static_cast<double>(all_stages_without_allocation.best.elapsed_sec()));
529 }
530 }
531 };
532
533 #endif
534