1 /*
2   Copyright (c) 2020-2021, Intel Corporation
3   All rights reserved.
4 
5   Redistribution and use in source and binary forms, with or without
6   modification, are permitted provided that the following conditions are
7   met:
8 
9     * Redistributions of source code must retain the above copyright
10       notice, this list of conditions and the following disclaimer.
11 
12     * Redistributions in binary form must reproduce the above copyright
13       notice, this list of conditions and the following disclaimer in the
14       documentation and/or other materials provided with the distribution.
15 
16     * Neither the name of Intel Corporation nor the names of its
17       contributors may be used to endorse or promote products derived from
18       this software without specific prior written permission.
19 
20 
21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
22    IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23    TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24    PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
25    OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33 #ifdef _MSC_VER
34 #define NOMINMAX
35 #pragma warning(disable : 4244)
36 #pragma warning(disable : 4305)
37 // preventing MSVC fopen() deprecation complaints
38 #define _CRT_SECURE_NO_DEPRECATE
39 #endif
40 
#include <algorithm>
#include <cerrno>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <vector>

#include "common_helpers.h"
#include "timing.h"

// ispcrt
#include "ispcrt.hpp"
53 
54 
// Largest absolute per-element difference tolerated when a computed image is
// compared against the serial reference image.
#define CORRECTNESS_THRESHOLD 0.0002
// Image dimensions, in pixels.
#define WIDTH 768
#define HEIGHT 768
// Total number of pixels in the image. The expansion is parenthesized so that it
// behaves as a single operand inside larger expressions (e.g. `n / SZ`).
#define SZ (WIDTH * HEIGHT)
// NOTE(review): not referenced anywhere in this file; presumably a timeout in
// milliseconds (40 s) — confirm before relying on it.
#define TIMEOUT (40 * 1000)
60 
// Serial implementation of the noise computation, defined in another translation unit.
// In this file it is called with the region corners (x0, y0) and (x1, y1), the image
// dimensions, and a buffer of width * height floats; the contents of that buffer are
// afterwards used as the reference ("gold") image for the correctness check.
extern void noise_serial(float x0, float y0, float x1, float y1, int width, int height, float output[]);

// NOTE(review): presumably this is what makes reset_and_start_timer() and
// get_elapsed_mcycles() visible unqualified below — confirm against the project headers.
using namespace hostutil;
64 
// Argument block for the `noise_ispc` kernel. run() fills one instance on the host,
// wraps it in an ispcrt::Array, copies it to the device and passes it to the launch.
// NOTE(review): member order and types presumably have to mirror the kernel-side
// declaration of the same structure — confirm before changing the layout.
//
// Every member has a default initializer so that a default-constructed instance never
// carries indeterminate values.
struct Parameters {
    float x0{0.0f};         // first corner of the sampled region, x coordinate
    float y0{0.0f};         // first corner of the sampled region, y coordinate
    float x1{0.0f};         // opposite corner of the sampled region, x coordinate
    float y1{0.0f};         // opposite corner of the sampled region, y coordinate
    int width{0};           // image width in pixels
    int height{0};          // image height in pixels
    float *output{nullptr}; // device pointer to the output image buffer
};
74 
run(int niter,int gx,int gy)75 static int run(int niter, int gx, int gy) {
76     std::cout.setf(std::ios::unitbuf);
77     const unsigned int height = HEIGHT;
78     const unsigned int width = WIDTH;
79 
80     const float x0 = -10;
81     const float y0 = -10;
82     const float x1 = 10;
83     const float y1 = 10;
84 
85     std::vector<float> buf(SZ);
86     std::vector<float> gold(SZ);
87 
88     auto run_kernel = [&](ISPCRTDeviceType type) {
89         ispcrt::Device device(type);
90 
91         // Setup output array
92         ispcrt::Array<float> buf_dev(device, buf);
93 
94         // Setup parameters structure
95         Parameters p;
96 
97         p.x0 = x0;
98         p.y0 = y0;
99         p.x1 = x1;
100         p.y1 = y1;
101         p.width = width;
102         p.height = height;
103         p.output = buf_dev.devicePtr();
104 
105         auto p_dev = ispcrt::Array<Parameters>(device, p);
106 
107         // Create module and kernel to execute
108         ispcrt::Module module(device, "genx_noise");
109         ispcrt::Kernel kernel(device, module, "noise_ispc");
110 
111         // Create task queue and execute kernel
112         ispcrt::TaskQueue queue(device);
113         double minCyclesISPC = 1e30;
114         double kernelTicks = 1e30;
115         const char *device_str = (type == ISPCRT_DEVICE_TYPE_GPU) ? "GPU" : "CPU";
116         std::fill(buf.begin(), buf.end(), 0);
117         for (unsigned int i = 0; i < niter; i++) {
118             reset_and_start_timer();
119             queue.copyToDevice(p_dev);
120             queue.barrier();
121             auto res = queue.launch(kernel, p_dev, gx, gy);
122             queue.barrier();
123             queue.copyToHost(buf_dev);
124             queue.barrier();
125             queue.sync();
126             if (res.valid()) {
127                 kernelTicks = res.time() * 1e-6;
128             }
129             double mcycles = get_elapsed_mcycles();
130             // Print resulting time
131             printf("@time of %s run:\t\t\t[%.3f] milliseconds\n", device_str, kernelTicks);
132             printf("@time of %s run:\t\t\t[%.3f] million cycles\n", device_str, mcycles);
133             minCyclesISPC = std::min(minCyclesISPC, mcycles);
134         }
135         printf("[noise ISPC %s]:\t\t[%.3f] million cycles (%d x %d image)\n", device_str, minCyclesISPC, width, height);
136     };
137 
138     run_kernel(ISPCRT_DEVICE_TYPE_CPU);
139     run_kernel(ISPCRT_DEVICE_TYPE_GPU);
140 
141     double minCyclesSerial = 1e30;
142     std::fill(gold.begin(), gold.end(), 0);
143     for (unsigned int i = 0; i < niter; i++) {
144         reset_and_start_timer();
145         auto wct = std::chrono::system_clock::now();
146         noise_serial(x0, y0, x1, y1, width, height, gold.data());
147         double mcycles = get_elapsed_mcycles();
148         auto dur = (std::chrono::system_clock::now() - wct);
149         auto secs = std::chrono::duration_cast<std::chrono::milliseconds>(dur);
150 
151         // Print resulting time
152         printf("@time of serial run:\t\t\t[%ld] milliseconds\n", secs.count());
153         printf("@time of serial run:\t\t\t[%.3f] million cycles\n", mcycles);
154         minCyclesSerial = std::min(minCyclesSerial, mcycles);
155     }
156 
157     printf("[noise serial]:\t\t[%.3f] million cycles (%d x %d image)\n", minCyclesSerial, width, height);
158 
159     // Result check
160     bool pass = true;
161     double err = 0.0;
162     double max_err = 0.0;
163 
164     int i = 0;
165     for (; i < width * height; i++) {
166         err = std::fabs(buf.at(i) - gold.at(i));
167         max_err = std::max(err, max_err);
168         if (err > CORRECTNESS_THRESHOLD) {
169             pass = false;
170             break;
171         }
172     }
173     if (!pass) {
174         std::cout << "Mismatch on " << i << "th value." << std::endl;
175         std::cout << "Was " << buf.at(i) << ", should be " << gold.at(i) << std::endl;
176     } else {
177         std::cout << "No issues found, max error:" << max_err << std::endl;
178     }
179 
180     return (pass) ? 0 : 1;
181 }
182 
// Writes the command-line synopsis to stderr and terminates the process with
// exit status 1. Does not return.
static void usage() {
    static const char synopsis[] = "usage: noise [niterations] [group threads width] [group threads height]\n";
    std::fputs(synopsis, stderr);
    std::exit(1);
}
187 
main(int argc,char * argv[])188 int main(int argc, char *argv[]) {
189     int niterations = 1;
190     int gx = 1, gy = 8;
191     if (argc == 4) {
192         niterations = atoi(argv[1]);
193         gx = atoi(argv[2]);
194         gy = atoi(argv[3]);
195     }
196     if (niterations < 1 || gx < 1 || gy < 1) {
197         usage();
198     }
199     int success = 0;
200 
201     std::cout << "Running test with " << niterations << " iterations of ISPC on GEN and CPU using " << gx << " * " << gy
202               << " threads." << std::endl;
203     success = run(niterations, gx, gy);
204 
205     return success;
206 }
207