1 /******************************************************************************
2  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *     * Redistributions of source code must retain the above copyright
7  *       notice, this list of conditions and the following disclaimer.
8  *     * Redistributions in binary form must reproduce the above copyright
9  *       notice, this list of conditions and the following disclaimer in the
10  *       documentation and/or other materials provided with the distribution.
11  *     * Neither the name of the NVIDIA CORPORATION nor the
12  *       names of its contributors may be used to endorse or promote products
13  *       derived from this software without specific prior written permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  ******************************************************************************/
27 
28 #include <stdio.h>
29 #include <map>
30 #include <vector>
31 #include <algorithm>
32 #include <cstdio>
33 #include <fstream>
34 
35 #include "histogram/histogram_gmem_atomics.h"
36 #include "histogram/histogram_smem_atomics.h"
37 #include "histogram/histogram_cub.h"
38 
39 #include <cub/util_allocator.cuh>
40 #include <test/test_util.h>
41 
42 using namespace cub;
43 
44 //---------------------------------------------------------------------
45 // Globals, constants, and type declarations
46 //---------------------------------------------------------------------
47 
48 // Ensure printing of CUDA runtime errors to console
49 #define CUB_STDERR
50 
51 bool                    g_verbose = false;  // Whether to display input/output to console
52 bool                    g_report = false;   // Whether to display a full report in CSV format
53 CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
54 
55 struct less_than_value
56 {
operator ()less_than_value57     inline bool operator()(
58         const std::pair<std::string, double> &a,
59         const std::pair<std::string, double> &b)
60     {
61         return a.second < b.second;
62     }
63 };
64 
65 
66 //---------------------------------------------------------------------
67 // Targa (.tga) image file parsing
68 //---------------------------------------------------------------------
69 
70 /**
71  * TGA image header info
72  */
73 struct TgaHeader
74 {
75     char idlength;
76     char colormaptype;
77     char datatypecode;
78     short colormaporigin;
79     short colormaplength;
80     char colormapdepth;
81     short x_origin;
82     short y_origin;
83     short width;
84     short height;
85     char bitsperpixel;
86     char imagedescriptor;
87 
ParseTgaHeader88     void Parse (FILE *fptr)
89     {
90         idlength = fgetc(fptr);
91         colormaptype = fgetc(fptr);
92         datatypecode = fgetc(fptr);
93         fread(&colormaporigin, 2, 1, fptr);
94         fread(&colormaplength, 2, 1, fptr);
95         colormapdepth = fgetc(fptr);
96         fread(&x_origin, 2, 1, fptr);
97         fread(&y_origin, 2, 1, fptr);
98         fread(&width, 2, 1, fptr);
99         fread(&height, 2, 1, fptr);
100         bitsperpixel = fgetc(fptr);
101         imagedescriptor = fgetc(fptr);
102     }
103 
DisplayTgaHeader104     void Display (FILE *fptr)
105     {
106         fprintf(fptr, "ID length:           %d\n", idlength);
107         fprintf(fptr, "Color map type:      %d\n", colormaptype);
108         fprintf(fptr, "Image type:          %d\n", datatypecode);
109         fprintf(fptr, "Color map offset:    %d\n", colormaporigin);
110         fprintf(fptr, "Color map length:    %d\n", colormaplength);
111         fprintf(fptr, "Color map depth:     %d\n", colormapdepth);
112         fprintf(fptr, "X origin:            %d\n", x_origin);
113         fprintf(fptr, "Y origin:            %d\n", y_origin);
114         fprintf(fptr, "Width:               %d\n", width);
115         fprintf(fptr, "Height:              %d\n", height);
116         fprintf(fptr, "Bits per pixel:      %d\n", bitsperpixel);
117         fprintf(fptr, "Descriptor:          %d\n", imagedescriptor);
118     }
119 };
120 
121 
122 /**
123  * Decode image byte data into pixel
124  */
ParseTgaPixel(uchar4 & pixel,unsigned char * tga_pixel,int bytes)125 void ParseTgaPixel(uchar4 &pixel, unsigned char *tga_pixel, int bytes)
126 {
127     if (bytes == 4)
128     {
129         pixel.x = tga_pixel[2];
130         pixel.y = tga_pixel[1];
131         pixel.z = tga_pixel[0];
132         pixel.w = tga_pixel[3];
133     }
134     else if (bytes == 3)
135     {
136         pixel.x = tga_pixel[2];
137         pixel.y = tga_pixel[1];
138         pixel.z = tga_pixel[0];
139         pixel.w = 0;
140     }
141     else if (bytes == 2)
142     {
143         pixel.x = (tga_pixel[1] & 0x7c) << 1;
144         pixel.y = ((tga_pixel[1] & 0x03) << 6) | ((tga_pixel[0] & 0xe0) >> 2);
145         pixel.z = (tga_pixel[0] & 0x1f) << 3;
146         pixel.w = (tga_pixel[1] & 0x80);
147     }
148 }
149 
150 
151 /**
152  * Reads a .tga image file
153  */
ReadTga(uchar4 * & pixels,int & width,int & height,const char * filename)154 void ReadTga(uchar4* &pixels, int &width, int &height, const char *filename)
155 {
156     // Open the file
157     FILE *fptr;
158     if ((fptr = fopen(filename, "rb")) == NULL)
159     {
160         fprintf(stderr, "File open failed\n");
161         exit(-1);
162     }
163 
164     // Parse header
165     TgaHeader header;
166     header.Parse(fptr);
167 //    header.Display(stdout);
168     width = header.width;
169     height = header.height;
170 
171     // Verify compatibility
172     if (header.datatypecode != 2 && header.datatypecode != 10)
173     {
174         fprintf(stderr, "Can only handle image type 2 and 10\n");
175         exit(-1);
176     }
177     if (header.bitsperpixel != 16 && header.bitsperpixel != 24 && header.bitsperpixel != 32)
178     {
179         fprintf(stderr, "Can only handle pixel depths of 16, 24, and 32\n");
180         exit(-1);
181     }
182     if (header.colormaptype != 0 && header.colormaptype != 1)
183     {
184         fprintf(stderr, "Can only handle color map types of 0 and 1\n");
185         exit(-1);
186     }
187 
188     // Skip unnecessary header info
189     int skip_bytes = header.idlength + (header.colormaptype * header.colormaplength);
190     fseek(fptr, skip_bytes, SEEK_CUR);
191 
192     // Read the image
193     int pixel_bytes = header.bitsperpixel / 8;
194 
195     // Allocate and initialize pixel data
196     size_t image_bytes = width * height * sizeof(uchar4);
197     if ((pixels == NULL) && ((pixels = (uchar4*) malloc(image_bytes)) == NULL))
198     {
199         fprintf(stderr, "malloc of image failed\n");
200         exit(-1);
201     }
202     memset(pixels, 0, image_bytes);
203 
204     // Parse pixels
205     unsigned char   tga_pixel[5];
206     int             current_pixel = 0;
207     while (current_pixel < header.width * header.height)
208     {
209         if (header.datatypecode == 2)
210         {
211             // Uncompressed
212             if (fread(tga_pixel, 1, pixel_bytes, fptr) != pixel_bytes)
213             {
214                 fprintf(stderr, "Unexpected end of file at pixel %d  (uncompressed)\n", current_pixel);
215                 exit(-1);
216             }
217             ParseTgaPixel(pixels[current_pixel], tga_pixel, pixel_bytes);
218             current_pixel++;
219         }
220         else if (header.datatypecode == 10)
221         {
222             // Compressed
223             if (fread(tga_pixel, 1, pixel_bytes + 1, fptr) != pixel_bytes + 1)
224             {
225                 fprintf(stderr, "Unexpected end of file at pixel %d (compressed)\n", current_pixel);
226                 exit(-1);
227             }
228             int run_length = tga_pixel[0] & 0x7f;
229             ParseTgaPixel(pixels[current_pixel], &(tga_pixel[1]), pixel_bytes);
230             current_pixel++;
231 
232             if (tga_pixel[0] & 0x80)
233             {
234                 // RLE chunk
235                 for (int i = 0; i < run_length; i++)
236                 {
237                     ParseTgaPixel(pixels[current_pixel], &(tga_pixel[1]), pixel_bytes);
238                     current_pixel++;
239                 }
240             }
241             else
242             {
243                 // Normal chunk
244                 for (int i = 0; i < run_length; i++)
245                 {
246                     if (fread(tga_pixel, 1, pixel_bytes, fptr) != pixel_bytes)
247                     {
248                         fprintf(stderr, "Unexpected end of file at pixel %d (normal)\n", current_pixel);
249                         exit(-1);
250                     }
251                     ParseTgaPixel(pixels[current_pixel], tga_pixel, pixel_bytes);
252                     current_pixel++;
253                 }
254             }
255         }
256     }
257 
258     // Close file
259     fclose(fptr);
260 }
261 
262 
263 
264 //---------------------------------------------------------------------
265 // Random image generation
266 //---------------------------------------------------------------------
267 
268 /**
269  * Generate a random image with specified entropy
270  */
GenerateRandomImage(uchar4 * & pixels,int width,int height,int entropy_reduction)271 void GenerateRandomImage(uchar4* &pixels, int width, int height, int entropy_reduction)
272 {
273     int num_pixels = width * height;
274     size_t image_bytes = num_pixels * sizeof(uchar4);
275     if ((pixels == NULL) && ((pixels = (uchar4*) malloc(image_bytes)) == NULL))
276     {
277         fprintf(stderr, "malloc of image failed\n");
278         exit(-1);
279     }
280 
281     for (int i = 0; i < num_pixels; ++i)
282     {
283         RandomBits(pixels[i].x, entropy_reduction);
284         RandomBits(pixels[i].y, entropy_reduction);
285         RandomBits(pixels[i].z, entropy_reduction);
286         RandomBits(pixels[i].w, entropy_reduction);
287     }
288 }
289 
290 
291 
292 //---------------------------------------------------------------------
293 // Histogram verification
294 //---------------------------------------------------------------------
295 
296 // Decode float4 pixel into bins
297 template <int NUM_BINS, int ACTIVE_CHANNELS>
DecodePixelGold(float4 pixel,unsigned int (& bins)[ACTIVE_CHANNELS])298 void DecodePixelGold(float4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
299 {
300     float* samples = reinterpret_cast<float*>(&pixel);
301 
302     for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
303         bins[CHANNEL] = (unsigned int) (samples[CHANNEL] * float(NUM_BINS));
304 }
305 
306 // Decode uchar4 pixel into bins
307 template <int NUM_BINS, int ACTIVE_CHANNELS>
DecodePixelGold(uchar4 pixel,unsigned int (& bins)[ACTIVE_CHANNELS])308 void DecodePixelGold(uchar4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
309 {
310     unsigned char* samples = reinterpret_cast<unsigned char*>(&pixel);
311 
312     for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
313         bins[CHANNEL] = (unsigned int) (samples[CHANNEL]);
314 }
315 
316 // Decode uchar1 pixel into bins
317 template <int NUM_BINS, int ACTIVE_CHANNELS>
DecodePixelGold(uchar1 pixel,unsigned int (& bins)[ACTIVE_CHANNELS])318 void DecodePixelGold(uchar1 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
319 {
320     bins[0] = (unsigned int) pixel.x;
321 }
322 
323 
324 // Compute reference histogram.  Specialized for uchar4
325 template <
326     int         ACTIVE_CHANNELS,
327     int         NUM_BINS,
328     typename    PixelType>
HistogramGold(PixelType * image,int width,int height,unsigned int * hist)329 void HistogramGold(PixelType *image, int width, int height, unsigned int* hist)
330 {
331     memset(hist, 0, ACTIVE_CHANNELS * NUM_BINS * sizeof(unsigned int));
332 
333     for (int i = 0; i < width; i++)
334     {
335         for (int j = 0; j < height; j++)
336         {
337             PixelType pixel = image[i + j * width];
338 
339             unsigned int bins[ACTIVE_CHANNELS];
340             DecodePixelGold<NUM_BINS>(pixel, bins);
341 
342             for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
343             {
344                 hist[(NUM_BINS * CHANNEL) + bins[CHANNEL]]++;
345             }
346         }
347     }
348 }
349 
350 
351 //---------------------------------------------------------------------
352 // Test execution
353 //---------------------------------------------------------------------
354 
355 /**
356  * Run a specific histogram implementation
357  */
358 template <
359     int         ACTIVE_CHANNELS,
360     int         NUM_BINS,
361     typename    PixelType>
RunTest(std::vector<std::pair<std::string,double>> & timings,PixelType * d_pixels,const int width,const int height,unsigned int * d_hist,unsigned int * h_hist,int timing_iterations,const char * long_name,const char * short_name,double (* f)(PixelType *,int,int,unsigned int *,bool))362 void RunTest(
363     std::vector<std::pair<std::string, double> >&   timings,
364     PixelType*                                      d_pixels,
365     const int                                       width,
366     const int                                       height,
367     unsigned int *                                  d_hist,
368     unsigned int *                                  h_hist,
369     int                                             timing_iterations,
370     const char *                                    long_name,
371     const char *                                    short_name,
372     double (*f)(PixelType*, int, int, unsigned int*, bool))
373 {
374     if (!g_report) printf("%s ", long_name); fflush(stdout);
375 
376     // Run single test to verify (and code cache)
377     (*f)(d_pixels, width, height, d_hist, !g_report);
378 
379     int compare = CompareDeviceResults(h_hist, d_hist, ACTIVE_CHANNELS * NUM_BINS, true, g_verbose);
380     if (!g_report) printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
381 
382     double elapsed_ms = 0;
383     for (int i = 0; i < timing_iterations; i++)
384     {
385         elapsed_ms += (*f)(d_pixels, width, height, d_hist, false);
386     }
387     double avg_us = (elapsed_ms / timing_iterations) * 1000;    // average in us
388     timings.push_back(std::pair<std::string, double>(short_name, avg_us));
389 
390     if (!g_report)
391     {
392         printf("Avg time %.3f us (%d iterations)\n", avg_us, timing_iterations); fflush(stdout);
393     }
394     else
395     {
396         printf("%.3f, ", avg_us); fflush(stdout);
397     }
398 
399     AssertEquals(0, compare);
400 }
401 
402 
403 /**
404  * Evaluate corpus of histogram implementations
405  */
406 template <
407     int         NUM_CHANNELS,
408     int         ACTIVE_CHANNELS,
409     int         NUM_BINS,
410     typename    PixelType>
TestMethods(PixelType * h_pixels,int height,int width,int timing_iterations,double bandwidth_GBs)411 void TestMethods(
412     PixelType*  h_pixels,
413     int         height,
414     int         width,
415     int         timing_iterations,
416     double      bandwidth_GBs)
417 {
418     // Copy data to gpu
419     PixelType* d_pixels;
420     size_t pixel_bytes = width * height * sizeof(PixelType);
421     CubDebugExit(g_allocator.DeviceAllocate((void**) &d_pixels, pixel_bytes));
422     CubDebugExit(cudaMemcpy(d_pixels, h_pixels, pixel_bytes, cudaMemcpyHostToDevice));
423 
424     if (g_report) printf("%.3f, ", double(pixel_bytes) / bandwidth_GBs / 1000);
425 
426     // Allocate results arrays on cpu/gpu
427     unsigned int *h_hist;
428     unsigned int *d_hist;
429     size_t histogram_bytes = NUM_BINS * ACTIVE_CHANNELS * sizeof(unsigned int);
430     h_hist = (unsigned int *) malloc(histogram_bytes);
431     g_allocator.DeviceAllocate((void **) &d_hist, histogram_bytes);
432 
433     // Compute reference cpu histogram
434     HistogramGold<ACTIVE_CHANNELS, NUM_BINS>(h_pixels, width, height, h_hist);
435 
436     // Store timings
437     std::vector<std::pair<std::string, double> > timings;
438 
439     // Run experiments
440     RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations,
441         "CUB", "CUB", run_cub_histogram<NUM_CHANNELS, ACTIVE_CHANNELS, NUM_BINS, PixelType>);
442     RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations,
443         "Shared memory atomics", "smem atomics", run_smem_atomics<ACTIVE_CHANNELS, NUM_BINS, PixelType>);
444     RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations,
445         "Global memory atomics", "gmem atomics", run_gmem_atomics<ACTIVE_CHANNELS, NUM_BINS, PixelType>);
446 
447     // Report timings
448     if (!g_report)
449     {
450         std::sort(timings.begin(), timings.end(), less_than_value());
451         printf("Timings (us):\n");
452         for (int i = 0; i < timings.size(); i++)
453         {
454             double bandwidth = height * width * sizeof(PixelType) / timings[i].second / 1000;
455             printf("\t %.3f %s (%.3f GB/s, %.3f%% peak)\n", timings[i].second, timings[i].first.c_str(), bandwidth, bandwidth / bandwidth_GBs * 100);
456         }
457         printf("\n");
458     }
459 
460     // Free data
461     CubDebugExit(g_allocator.DeviceFree(d_pixels));
462     CubDebugExit(g_allocator.DeviceFree(d_hist));
463     free(h_hist);
464 }
465 
466 
467 /**
468  * Test different problem genres
469  */
TestGenres(uchar4 * uchar4_pixels,int height,int width,int timing_iterations,double bandwidth_GBs)470 void TestGenres(
471     uchar4*     uchar4_pixels,
472     int         height,
473     int         width,
474     int         timing_iterations,
475     double      bandwidth_GBs)
476 {
477     int num_pixels = width * height;
478 
479     {
480         if (!g_report) printf("1 channel uchar1 tests (256-bin):\n\n"); fflush(stdout);
481 
482         size_t      image_bytes     = num_pixels * sizeof(uchar1);
483         uchar1*     uchar1_pixels   = (uchar1*) malloc(image_bytes);
484 
485         // Convert to 1-channel (averaging first 3 channels)
486         for (int i = 0; i < num_pixels; ++i)
487         {
488             uchar1_pixels[i].x = (unsigned char)
489                 (((unsigned int) uchar4_pixels[i].x +
490                   (unsigned int) uchar4_pixels[i].y +
491                   (unsigned int) uchar4_pixels[i].z) / 3);
492         }
493 
494         TestMethods<1, 1, 256>(uchar1_pixels, width, height, timing_iterations, bandwidth_GBs);
495         free(uchar1_pixels);
496         if (g_report) printf(", ");
497     }
498 
499     {
500         if (!g_report) printf("3/4 channel uchar4 tests (256-bin):\n\n"); fflush(stdout);
501         TestMethods<4, 3, 256>(uchar4_pixels, width, height, timing_iterations, bandwidth_GBs);
502         if (g_report) printf(", ");
503     }
504 
505     {
506         if (!g_report) printf("3/4 channel float4 tests (256-bin):\n\n"); fflush(stdout);
507         size_t      image_bytes     = num_pixels * sizeof(float4);
508         float4*     float4_pixels   = (float4*) malloc(image_bytes);
509 
510         // Convert to float4 with range [0.0, 1.0)
511         for (int i = 0; i < num_pixels; ++i)
512         {
513             float4_pixels[i].x = float(uchar4_pixels[i].x) / 256;
514             float4_pixels[i].y = float(uchar4_pixels[i].y) / 256;
515             float4_pixels[i].z = float(uchar4_pixels[i].z) / 256;
516             float4_pixels[i].w = float(uchar4_pixels[i].w) / 256;
517         }
518         TestMethods<4, 3, 256>(float4_pixels, width, height, timing_iterations, bandwidth_GBs);
519         free(float4_pixels);
520         if (g_report) printf("\n");
521     }
522 }
523 
524 
525 /**
526  * Main
527  */
main(int argc,char ** argv)528 int main(int argc, char **argv)
529 {
530     // Initialize command line
531     CommandLineArgs args(argc, argv);
532     if (args.CheckCmdLineFlag("help"))
533     {
534         printf(
535             "%s "
536             "[--device=<device-id>] "
537             "[--v] "
538             "[--i=<timing iterations>] "
539             "\n\t"
540                 "--file=<.tga filename> "
541             "\n\t"
542                 "--entropy=<-1 (0%), 0 (100%), 1 (81%), 2 (54%), 3 (34%), 4 (20%), ..."
543                 "[--height=<default: 1080>] "
544                 "[--width=<default: 1920>] "
545             "\n", argv[0]);
546         exit(0);
547     }
548 
549     std::string         filename;
550     int                 timing_iterations   = 100;
551     int                 entropy_reduction   = 0;
552     int                 height              = 1080;
553     int                 width               = 1920;
554 
555     g_verbose = args.CheckCmdLineFlag("v");
556     g_report = args.CheckCmdLineFlag("report");
557     args.GetCmdLineArgument("i", timing_iterations);
558     args.GetCmdLineArgument("file", filename);
559     args.GetCmdLineArgument("height", height);
560     args.GetCmdLineArgument("width", width);
561     args.GetCmdLineArgument("entropy", entropy_reduction);
562 
563     // Initialize device
564     CubDebugExit(args.DeviceInit());
565 
566     // Get GPU device bandwidth (GB/s)
567     int device_ordinal, bus_width, mem_clock_khz;
568     CubDebugExit(cudaGetDevice(&device_ordinal));
569     CubDebugExit(cudaDeviceGetAttribute(&bus_width, cudaDevAttrGlobalMemoryBusWidth, device_ordinal));
570     CubDebugExit(cudaDeviceGetAttribute(&mem_clock_khz, cudaDevAttrMemoryClockRate, device_ordinal));
571     double bandwidth_GBs = double(bus_width) * mem_clock_khz * 2 / 8 / 1000 / 1000;
572 
573     // Run test(s)
574     uchar4* uchar4_pixels = NULL;
575     if (!g_report)
576     {
577         if (!filename.empty())
578         {
579             // Parse targa file
580             ReadTga(uchar4_pixels, width, height, filename.c_str());
581             printf("File %s: width(%d) height(%d)\n\n", filename.c_str(), width, height); fflush(stdout);
582         }
583         else
584         {
585             // Generate image
586             GenerateRandomImage(uchar4_pixels, width, height, entropy_reduction);
587             printf("Random image: entropy-reduction(%d) width(%d) height(%d)\n\n", entropy_reduction, width, height); fflush(stdout);
588         }
589 
590         TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
591     }
592     else
593     {
594         // Run test suite
595         printf("Test, MIN, RLE CUB, SMEM, GMEM, , MIN, RLE_CUB, SMEM, GMEM, , MIN, RLE_CUB, SMEM, GMEM\n");
596 
597         // Entropy reduction tests
598         for (entropy_reduction = 0; entropy_reduction < 5; ++entropy_reduction)
599         {
600             printf("entropy reduction %d, ", entropy_reduction);
601             GenerateRandomImage(uchar4_pixels, width, height, entropy_reduction);
602             TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
603         }
604         printf("entropy reduction -1, ");
605         GenerateRandomImage(uchar4_pixels, width, height, -1);
606         TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
607         printf("\n");
608 
609         // File image tests
610         std::vector<std::string> file_tests;
611         file_tests.push_back("animals");
612         file_tests.push_back("apples");
613         file_tests.push_back("sunset");
614         file_tests.push_back("cheetah");
615         file_tests.push_back("nature");
616         file_tests.push_back("operahouse");
617         file_tests.push_back("austin");
618         file_tests.push_back("cityscape");
619 
620         for (int i = 0; i < file_tests.size(); ++i)
621         {
622             printf("%s, ", file_tests[i].c_str());
623             std::string filename = std::string("histogram/benchmark/") + file_tests[i] + ".tga";
624             ReadTga(uchar4_pixels, width, height, filename.c_str());
625             TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
626         }
627     }
628 
629     free(uchar4_pixels);
630 
631     CubDebugExit(cudaDeviceSynchronize());
632     printf("\n\n");
633 
634     return 0;
635 }
636