1 /*********************************************************************
2   Copyright (C) 2021  The Blosc Developers <blosc@blosc.org>
3   https://blosc.org
4   License: BSD 3-Clause (see LICENSE.txt)
5 
6   Small benchmark for testing basic capabilities of Blosc.
7 
8   You can select different degrees of 'randomness' in input buffer, as
9   well as external datafiles (uncomment the lines after "For data
10   coming from a file" comment).
11 
12   For usage instructions of this benchmark, please see:
13 
14     https://www.blosc.org/pages/synthetic-benchmarks/
15 
16   I'm collecting speeds for different machines, so the output of your
17   benchmarks and your processor specifications are welcome!
18 
  Note: Compiling this with VS2008 does not work well with cmake.  Here
  is a way to compile the benchmark (with added support for LZ4):
21 
22   > cl /arch:SSE2 /Ox /Febench.exe /Iblosc /Iinternal-complibs\lz4-1.7.0 bench\bench.c blosc\blosc.c blosc\blosclz.c blosc\shuffle.c blosc\shuffle-sse2.c blosc\shuffle-generic.c blosc\bitshuffle-generic.c blosc\bitshuffle-sse2.c internal-complibs\lz4-1.7.0\*.c
23 
24   See LICENSE.txt for details about copyright and rights to use.
25 **********************************************************************/
26 
27 #include <stdlib.h>
28 #include <stdio.h>
29 #include <string.h>
30 #include <sys/stat.h>
31 #include "blosc2.h"
32 
/* Byte-size units.  KB is unsigned, so MB and GB are unsigned as well. */
#define KB  1024u
#define MB  (KB * 1024)
#define GB  (MB * 1024)

/* Upper bound on the number of chunks used in a benchmark run */
#define NCHUNKS (1024 * 32)


/* Number of chunks per run; starts at the maximum */
int nchunks = NCHUNKS;
/* Default number of iterations: niter drives the memcpy loops, niter_c the
   compression loops and niter_d the decompression loops */
int niter = 1, niter_c = 1, niter_d = 1;
/* Total compressed/decompressed size, accumulated by do_bench() */
double totalsize = 0.;
46 
/* Define posix_memalign for Windows */
#if defined(_WIN32)
#include <errno.h>
#include <malloc.h>

/* Windows stand-in for posix_memalign(), built on _aligned_malloc().
 *
 * memptr:    receives the allocated block (NULL when the allocation fails).
 * alignment: requested alignment in bytes, forwarded to _aligned_malloc().
 * size:      number of bytes to allocate.
 *
 * Returns 0 on success and ENOMEM when _aligned_malloc() returns NULL, so
 * that callers testing the return code actually notice a failed allocation. */
int posix_memalign(void **memptr, size_t alignment, size_t size)
{
    /* Note the swapped argument order with respect to posix_memalign */
    void *block = _aligned_malloc(size, alignment);

    *memptr = block;
    return (block == NULL) ? ENOMEM : 0;
}

/* Buffers allocated with _aligned_malloc need to be freed with _aligned_free. */
#define aligned_free(memptr) _aligned_free(memptr)
#else
/* On other platforms, aligned memory can be freed in the usual way. */
#define aligned_free(memptr) free(memptr)
#endif  /* defined(_WIN32) */
63 
64 /* Given two timeval stamps, return the time per chunk in usec */
get_usec_chunk(blosc_timestamp_t last,blosc_timestamp_t current,int niter_,int nchunks_)65 double get_usec_chunk(blosc_timestamp_t last, blosc_timestamp_t current,
66                       int niter_, int nchunks_) {
67   double elapsed_usecs = 1e-3 * blosc_elapsed_nsecs(last, current);
68   return elapsed_usecs / (double)(niter_ * nchunks_);
69 }
70 
71 
get_value(int i,int rshift)72 int get_value(int i, int rshift) {
73   int v;
74 
75   v = (i << 26) ^ (i << 18) ^ (i << 11) ^ (i << 3) ^ i;
76   if (rshift < 32) {
77     v &= (1 << rshift) - 1;
78   }
79   return v;
80 }
81 
82 
/* Fill `src` with synthetic benchmark data.
   Every complete int-sized slot within the first `size` bytes receives
   get_value(index, rshift); trailing bytes that do not make up a whole int
   are left untouched.  Alternative fill patterns (constants, the raw index,
   rand()-based values) can be swapped in on the assignment line. */
void init_buffer(void* src, size_t size, int rshift) {
  int* words = (int*)src;
  const size_t nwords = size / sizeof(int);
  unsigned int idx = 0;

  /* Fixed seed, to have reproducible results */
  srand(1);

  /* Initialize the original buffer */
  while (idx < nwords) {
    words[idx] = get_value((int)idx, rshift);
    ++idx;
  }
}
102 
103 
do_bench(char * compressor,char * shuffle,int nthreads,int size_,int elsize,int rshift,FILE * ofile)104 void do_bench(char* compressor, char* shuffle, int nthreads, int size_, int elsize,
105               int rshift, FILE* ofile) {
106   size_t size = (size_t)size_;
107   void* src, *srccpy;
108   void* dest[NCHUNKS], *dest2;
109   int nbytes = 0, cbytes = 0;
110   int i, j, retcode;
111   unsigned char* orig, * round;
112   blosc_timestamp_t last, current;
113   double tmemcpy, tshuf, tunshuf;
114   int clevel, doshuffle = BLOSC_NOFILTER;
115 
116   if (strcmp(shuffle, "shuffle") == 0) {
117     doshuffle = BLOSC_SHUFFLE;
118   }
119   else if (strcmp(shuffle, "bitshuffle") == 0) {
120     doshuffle = BLOSC_BITSHUFFLE;
121   }
122   else if (strcmp(shuffle, "noshuffle") == 0) {
123     doshuffle = BLOSC_NOSHUFFLE;
124   }
125 
126   blosc_set_nthreads((int16_t)nthreads);
127   if (blosc_set_compressor(compressor) < 0) {
128     printf("Compiled w/o support for compressor: '%s', so sorry.\n",
129            compressor);
130     exit(1);
131   }
132 
133   /* Initialize buffers */
134   srccpy = malloc(size);
135   retcode = posix_memalign(&src, 32, size);
136   if (retcode != 0) {
137     printf("Error in allocating memory!");
138   }
139   retcode = posix_memalign(&dest2, 32, size);
140   if (retcode != 0) {
141     printf("Error in allocating memory!");
142   }
143 
144   /* zero src to initialize all bytes on it, and not only multiples of 4 */
145   memset(src, 0, size);
146   init_buffer(src, size, rshift);
147   memcpy(srccpy, src, size);
148   for (j = 0; j < nchunks; j++) {
149     retcode = posix_memalign(&dest[j], 32, size + BLOSC_MAX_OVERHEAD);
150     if (retcode != 0) {
151       printf("Error in allocating memory!");
152     }
153   }
154   memset(dest2, 0, size);  // just to avoid some GCC compiler warnings
155 
156   fprintf(ofile, "--> %d, %d, %d, %d, %s, %s\n", nthreads, (int)size, elsize, rshift, compressor, shuffle);
157   fprintf(ofile, "********************** Run info ******************************\n");
158   fprintf(ofile, "Blosc version: %s (%s)\n", BLOSC_VERSION_STRING, BLOSC_VERSION_DATE);
159   fprintf(ofile, "Using synthetic data with %d significant bits (out of 32)\n", rshift);
160   fprintf(ofile, "Dataset size: %d bytes\tType size: %d bytes\n", (int)size, elsize);
161   fprintf(ofile, "Working set: %.1f MB\t\t", (size * nchunks) / (float)MB);
162   fprintf(ofile, "Number of threads: %d\n", nthreads);
163   fprintf(ofile, "********************** Running benchmarks *********************\n");
164 
165   blosc_set_timestamp(&last);
166   for (i = 0; i < niter; i++) {
167     for (j = 0; j < nchunks; j++) {
168       memcpy(dest[j], src, size);
169     }
170   }
171   blosc_set_timestamp(&current);
172   tmemcpy = get_usec_chunk(last, current, niter, nchunks);
173   fprintf(ofile, "memcpy(write):\t\t %6.1f us, %.1f MB/s\n",
174           tmemcpy, (size * 1e6) / (tmemcpy * MB));
175 
176   blosc_set_timestamp(&last);
177   for (i = 0; i < niter; i++) {
178     for (j = 0; j < nchunks; j++) {
179       memcpy(dest2, dest[j], size);
180     }
181   }
182   blosc_set_timestamp(&current);
183   tmemcpy = get_usec_chunk(last, current, niter, nchunks);
184   fprintf(ofile, "memcpy(read):\t\t %6.1f us, %.1f MB/s\n",
185           tmemcpy, (size * 1e6) / (tmemcpy * MB));
186 
187   for (clevel = 0; clevel < 10; clevel++) {
188 
189     fprintf(ofile, "Compression level: %d\n", clevel);
190 
191     blosc_set_timestamp(&last);
192     for (i = 0; i < niter_c; i++) {
193       for (j = 0; j < nchunks; j++) {
194         cbytes = blosc_compress(clevel, doshuffle, (size_t)elsize, size, src,
195                                 dest[j], size + BLOSC_MAX_OVERHEAD);
196       }
197     }
198     blosc_set_timestamp(&current);
199     tshuf = get_usec_chunk(last, current, niter_c, nchunks);
200     fprintf(ofile, "comp(write):\t %6.1f us, %.1f MB/s\t  ",
201             tshuf, (size * 1e6) / (tshuf * MB));
202     fprintf(ofile, "Final bytes: %d  ", cbytes);
203     if (cbytes > 0) {
204       fprintf(ofile, "Ratio: %3.2f", size / (float)cbytes);
205     }
206     fprintf(ofile, "\n");
207 
208     /* Compressor was unable to compress.  Copy the buffer manually. */
209     if (cbytes == 0) {
210       for (j = 0; j < nchunks; j++) {
211         memcpy(dest[j], src, size);
212       }
213     }
214 
215     blosc_set_timestamp(&last);
216     for (i = 0; i < niter_d; i++) {
217       for (j = 0; j < nchunks; j++) {
218         if (cbytes == 0) {
219           memcpy(dest2, dest[j], size);
220           nbytes = (int)size;
221         }
222         else {
223           nbytes = blosc_decompress(dest[j], dest2, size);
224         }
225       }
226     }
227     blosc_set_timestamp(&current);
228     tunshuf = get_usec_chunk(last, current, niter_d, nchunks);
229     fprintf(ofile, "decomp(read):\t %6.1f us, %.1f MB/s\t  ",
230             tunshuf, (nbytes * 1e6) / (tunshuf * MB));
231     if (nbytes < 0) {
232       fprintf(ofile, "FAILED.  Error code: %d\n", nbytes);
233     }
234     /* fprintf(ofile, "Orig bytes: %d\tFinal bytes: %d\n", cbytes, nbytes); */
235 
236     /* Check if data has had a good roundtrip.
237        Byte-by-byte comparison is slow, so use 'memcmp' to check whether the
238        roundtripped data is correct. If not, fall back to the slow path to
239        print diagnostic messages. */
240     orig = (unsigned char*)srccpy;
241     round = (unsigned char*)dest2;
242     if (memcmp(orig, round, size) != 0) {
243       for (i = 0; i < (int)size; ++i) {
244         if (orig[i] != round[i]) {
245           fprintf(ofile, "\nError: Original data and round-trip do not match in pos %d\n", i);
246           fprintf(ofile, "Orig--> %x, round-trip--> %x\n", orig[i], round[i]);
247           break;
248         }
249       }
250     }
251     else {
252       i = (int)size;
253     }
254 
255     if (i == (int)size) fprintf(ofile, "OK\n");
256 
257   } /* End clevel loop */
258 
259 
260   /* To compute the totalsize, we should take into account the 10
261      compression levels */
262   totalsize += (size * nchunks * niter * 10.);
263 
264   aligned_free(src);
265   free(srccpy);
266   aligned_free(dest2);
267   for (i = 0; i < nchunks; i++) {
268     aligned_free(dest[i]);
269   }
270 }
271 
272 
273 /* Compute a sensible value for nchunks */
get_nchunks(int size_,int ws)274 int get_nchunks(int size_, int ws) {
275   int nchunks_;
276 
277   nchunks_ = ws / size_;
278   if (nchunks_ > NCHUNKS) nchunks_ = NCHUNKS;
279   if (nchunks_ < 1) nchunks_ = 1;
280   return nchunks_;
281 }
282 
print_compress_info(void)283 void print_compress_info(void) {
284   char* name = NULL, * version = NULL;
285   int ret;
286 
287   printf("Blosc version: %s (%s)\n", BLOSC_VERSION_STRING, BLOSC_VERSION_DATE);
288 
289   printf("List of supported compressors in this build: %s\n",
290          blosc_list_compressors());
291 
292   printf("Supported compression libraries:\n");
293   ret = blosc_get_complib_info("blosclz", &name, &version);
294   if (ret >= 0) printf("  %s: %s\n", name, version);
295   free(name); free(version);
296   ret = blosc_get_complib_info("lz4", &name, &version);
297   if (ret >= 0) printf("  %s: %s\n", name, version);
298   free(name); free(version);
299   ret = blosc_get_complib_info("zlib", &name, &version);
300   if (ret >= 0) printf("  %s: %s\n", name, version);
301   free(name); free(version);
302   ret = blosc_get_complib_info("zstd", &name, &version);
303   if (ret >= 0) printf("  %s: %s\n", name, version);
304   free(name); free(version);
305 }
306 
307 
main(int argc,char * argv[])308 int main(int argc, char* argv[]) {
309   char compressor[32];
310   char shuffle[32] = "shuffle";
311   char bsuite[32];
312   int single = 1;
313   int suite = 0;
314   int hard_suite = 0;
315   int extreme_suite = 0;
316   int debug_suite = 0;
317   int nthreads = 8;                     /* The number of threads */
318   int size = 8 * MB;                    /* Buffer size */
319   int elsize = 4;                       /* Datatype size */
320   int rshift = 19;                      /* Significant bits */
321   unsigned int workingset = 256 * MB;            /* The maximum allocated memory */
322   int nthreads_, size_, elsize_, rshift_, i;
323   FILE* output_file = stdout;
324   blosc_timestamp_t last, current;
325   double totaltime;
326   char usage[256];
327 
328   print_compress_info();
329 
330   strncpy(usage, "Usage: bench [blosclz | lz4 | lz4hc | zlib | zstd] "
331       "[noshuffle | shuffle | bitshuffle] "
332       "[single | suite | hardsuite | extremesuite | debugsuite] "
333       "[nthreads] [bufsize(bytes)] [typesize] [sbits]", 255);
334 
335   if (argc < 1) {
336     printf("%s\n", usage);
337     exit(1);
338   }
339 
340   if (argc >= 2) {
341     strcpy(compressor, argv[1]);
342   }
343   else {
344     strcpy(compressor, "blosclz");
345   }
346 
347   if (strcmp(compressor, "blosclz") != 0 &&
348       strcmp(compressor, "lz4") != 0 &&
349       strcmp(compressor, "lz4hc") != 0 &&
350       strcmp(compressor, "zlib") != 0 &&
351       strcmp(compressor, "zstd") != 0) {
352     printf("No such compressor: '%s'\n", compressor);
353     printf("%s\n", usage);
354     exit(2);
355   }
356 
357   if (argc >= 3) {
358     strcpy(shuffle, argv[2]);
359     if (strcmp(shuffle, "shuffle") != 0 &&
360         strcmp(shuffle, "bitshuffle") != 0 &&
361         strcmp(shuffle, "noshuffle") != 0) {
362       printf("No such shuffler: '%s'\n", shuffle);
363       printf("%s\n", usage);
364       exit(2);
365     }
366   }
367 
368   if (argc < 4)
369     strcpy(bsuite, "single");
370   else
371     strcpy(bsuite, argv[3]);
372 
373   if (strcmp(bsuite, "single") == 0) {
374     single = 1;
375   }
376   else if (strcmp(bsuite, "test") == 0) {
377     single = 1;
378     workingset /= 2;
379   }
380   else if (strcmp(bsuite, "suite") == 0) {
381     suite = 1;
382   }
383   else if (strcmp(bsuite, "hardsuite") == 0) {
384     hard_suite = 1;
385     workingset /= 4;
386     /* Values here are ending points for loops */
387     nthreads = 2;
388     size = 8 * MB;
389     elsize = 32;
390     rshift = 32;
391   }
392   else if (strcmp(bsuite, "extremesuite") == 0) {
393     extreme_suite = 1;
394     workingset /= 8;
395     niter = 1;
396     /* Values here are ending points for loops */
397     nthreads = 4;
398     size = 16 * MB;
399     elsize = 32;
400     rshift = 32;
401   }
402   else if (strcmp(bsuite, "debugsuite") == 0) {
403     debug_suite = 1;
404     workingset /= 8;
405     niter = 1;
406     /* Warning: values here are starting points for loops.  This is
407        useful for debugging. */
408     nthreads = 1;
409     size = 16 * KB;
410     elsize = 1;
411     rshift = 0;
412   }
413   else {
414     printf("%s\n", usage);
415     exit(1);
416   }
417 
418   printf("Using compressor: %s\n", compressor);
419   printf("Using shuffle type: %s\n", shuffle);
420   printf("Running suite: %s\n", bsuite);
421 
422   if (argc >= 5) {
423     nthreads = (int)strtol(argv[4], NULL, 10);
424   }
425   if (argc >= 6) {
426     size = (int)strtol(argv[5], NULL, 10);
427   }
428   if (argc >= 7) {
429     elsize = (int)strtol(argv[6], NULL, 10);
430   }
431   if (argc >= 8) {
432     rshift = (int)strtol(argv[7], NULL, 10);
433   }
434 
435   if ((argc >= 9) || !(single || suite || hard_suite || extreme_suite)) {
436     printf("%s\n", usage);
437     exit(1);
438   }
439 
440   nchunks = get_nchunks(size, workingset);
441   blosc_set_timestamp(&last);
442 
443   blosc_init();
444 
445   if (suite) {
446     for (nthreads_ = 1; nthreads_ <= nthreads; nthreads_++) {
447       do_bench(compressor, shuffle, nthreads_, size, elsize, rshift, output_file);
448     }
449   }
450   else if (hard_suite) {
451     /* Let's start the rshift loop by 4 so that 19 is visited.  This
452        is to allow a direct comparison with the plain suite, that runs
453        precisely at 19 significant bits. */
454     for (rshift_ = 4; rshift_ <= rshift; rshift_ += 5) {
455       for (elsize_ = 1; elsize_ <= elsize; elsize_ *= 2) {
456         /* The next loop is for getting sizes that are not power of 2 */
457         for (i = -elsize_; i <= elsize_; i += elsize_) {
458           for (size_ = 32 * KB; size_ <= size; size_ *= 2) {
459             nchunks = get_nchunks(size_ + i, workingset);
460             niter = 1;
461             for (nthreads_ = 1; nthreads_ <= nthreads; nthreads_++) {
462               do_bench(compressor, shuffle, nthreads_, size_ + i, elsize_, rshift_, output_file);
463               blosc_set_timestamp(&current);
464               totaltime = blosc_elapsed_secs(last, current);
465               printf("Elapsed time:\t %6.1f s.  Processed data: %.1f GB\n",
466                      totaltime, totalsize / GB);
467             }
468           }
469         }
470       }
471     }
472   }
473   else if (extreme_suite) {
474     for (rshift_ = 0; rshift_ <= rshift; rshift_++) {
475       for (elsize_ = 1; elsize_ <= elsize; elsize_++) {
476         /* The next loop is for getting sizes that are not power of 2 */
477         for (i = -elsize_ * 2; i <= elsize_ * 2; i += elsize_) {
478           for (size_ = 32 * KB; size_ <= size; size_ *= 2) {
479             nchunks = get_nchunks(size_ + i, workingset);
480             for (nthreads_ = 1; nthreads_ <= nthreads; nthreads_++) {
481               do_bench(compressor, shuffle, nthreads_, size_ + i, elsize_, rshift_, output_file);
482               blosc_set_timestamp(&current);
483               totaltime = blosc_elapsed_secs(last, current);
484               printf("Elapsed time:\t %6.1f s.  Processed data: %.1f GB\n",
485                      totaltime, totalsize / GB);
486             }
487           }
488         }
489       }
490     }
491   }
492   else if (debug_suite) {
493     for (rshift_ = rshift; rshift_ <= 32; rshift_++) {
494       for (elsize_ = elsize; elsize_ <= 32; elsize_++) {
495         /* The next loop is for getting sizes that are not power of 2 */
496         for (i = -elsize_ * 2; i <= elsize_ * 2; i += elsize_) {
497           for (size_ = size; size_ <= 16 * MB; size_ *= 2) {
498             nchunks = get_nchunks(size_ + i, workingset);
499             for (nthreads_ = nthreads; nthreads_ <= 6; nthreads_++) {
500               do_bench(compressor, shuffle, nthreads_, size_ + i, elsize_, rshift_, output_file);
501               blosc_set_timestamp(&current);
502               totaltime = blosc_elapsed_secs(last, current);
503               printf("Elapsed time:\t %6.1f s.  Processed data: %.1f GB\n",
504                      totaltime, totalsize / GB);
505             }
506           }
507         }
508       }
509     }
510   }
511     /* Single mode */
512   else {
513     do_bench(compressor, shuffle, nthreads, size, elsize, rshift, output_file);
514   }
515 
516   /* Print out some statistics */
517   blosc_set_timestamp(&current);
518   totaltime = (float)blosc_elapsed_secs(last, current);
519   printf("\nRound-trip compr/decompr on %.1f GB\n", totalsize / GB);
520   printf("Elapsed time:\t %6.1f s, %.1f MB/s\n",
521          totaltime, totalsize * 2 * 1.1 / (MB * totaltime));
522 
523   /* Free blosc resources */
524   blosc_free_resources();
525   blosc_destroy();
526   return 0;
527 }
528