1 #include "tests.h"
2 #include <sys/time.h>
3 
4 #define TEX_SIZE 2048
5 #define CUBE_SIZE 64
6 #define NUM_FBOS 16
7 #define BENCH_DUR 3
8 
create_test_img(pl_gpu gpu)9 static pl_tex create_test_img(pl_gpu gpu)
10 {
11     pl_fmt fmt = pl_find_fmt(gpu, PL_FMT_FLOAT, 4, 16, 32, PL_FMT_CAP_LINEAR);
12     REQUIRE(fmt);
13 
14     int cube_stride = TEX_SIZE / CUBE_SIZE;
15     int cube_count  = cube_stride * cube_stride;
16 
17     assert(cube_count * CUBE_SIZE * CUBE_SIZE == TEX_SIZE * TEX_SIZE);
18     float *data = malloc(TEX_SIZE * TEX_SIZE * sizeof(float[4]));
19     for (int n = 0; n < cube_count; n++) {
20         int xbase = (n % cube_stride) * CUBE_SIZE;
21         int ybase = (n / cube_stride) * CUBE_SIZE;
22         for (int g = 0; g < CUBE_SIZE; g++) {
23             for (int r = 0; r < CUBE_SIZE; r++) {
24                 int xpos = xbase + r;
25                 int ypos = ybase + g;
26                 assert(xpos < TEX_SIZE && ypos < TEX_SIZE);
27 
28                 float *color = &data[(ypos * TEX_SIZE + xpos) * 4];
29                 color[0] = (float) r / CUBE_SIZE;
30                 color[1] = (float) g / CUBE_SIZE;
31                 color[2] = (float) n / cube_count;
32                 color[3] = 1.0;
33             }
34         }
35     }
36 
37     pl_tex tex = pl_tex_create(gpu, &(struct pl_tex_params) {
38         .format         = fmt,
39         .w              = TEX_SIZE,
40         .h              = TEX_SIZE,
41         .sampleable     = true,
42         .initial_data   = data,
43     });
44 
45     free(data);
46     REQUIRE(tex);
47     return tex;
48 }
49 
50 struct bench {
51     void (*run_sh)(pl_shader sh, pl_shader_obj *state,
52                    pl_tex src);
53 
54     void (*run_tex)(pl_gpu gpu, pl_tex tex);
55 };
56 
// Executes one iteration of `bench` against `fbo`.
//
// When the benchmark has no shader callback, its texture callback is invoked
// on `fbo`. Otherwise a shader is begun on `dp`, filled in by the shader
// callback (reading `src`, with `state` passed through) and dispatched to
// `fbo` with `timer` attached.
static void run_bench(pl_gpu gpu, pl_dispatch dp,
                      pl_shader_obj *state, pl_tex src,
                      pl_tex fbo, pl_timer timer,
                      const struct bench *bench)
{
    if (!bench->run_sh) {
        bench->run_tex(gpu, fbo);
        return;
    }

    pl_shader sh = pl_dispatch_begin(dp);
    bench->run_sh(sh, state, src);

    struct pl_dispatch_params dispatch = {
        .shader = &sh,
        .target = fbo,
        .timer  = timer,
    };
    pl_dispatch_finish(dp, &dispatch);
}
75 
// Runs `bench` repeatedly for roughly BENCH_DUR seconds and prints one result
// line labelled `name`: frame count, wall-clock time, ms/frame, FPS and, when
// any GPU timer samples were collected, the mean GPU time.
//
// Iterations cycle through NUM_FBOS render targets. All resources created
// here are destroyed before returning.
static void benchmark(pl_gpu gpu, const char *name,
                      const struct bench *bench)
{
    pl_dispatch dp = pl_dispatch_create(gpu->log, gpu);
    pl_shader_obj state = NULL;
    pl_tex src = create_test_img(gpu);

    // Create the FBOs
    pl_fmt fmt = pl_find_fmt(gpu, PL_FMT_FLOAT, 4, 16, 32,
                             PL_FMT_CAP_RENDERABLE | PL_FMT_CAP_BLITTABLE);
    REQUIRE(fmt);

    pl_tex fbos[NUM_FBOS] = {0};
    for (int i = 0; i < NUM_FBOS; i++) {
        fbos[i] = pl_tex_create(gpu, &(struct pl_tex_params) {
            .format         = fmt,
            .w              = TEX_SIZE,
            .h              = TEX_SIZE,
            .renderable     = true,
            .blit_dst       = true,
            .host_writable  = true,
            .host_readable  = true,
            .storable       = !!(fmt->caps & PL_FMT_CAP_STORABLE),
        });
        REQUIRE(fbos[i]);

        pl_tex_clear(gpu, fbos[i], (float[4]){ 0.0 });
    }

    // Run the benchmark and flush+block once to force shader compilation etc.
    // No timer is attached, so this warm-up pass is not counted.
    run_bench(gpu, dp, &state, src, fbos[0], NULL, bench);
    pl_gpu_finish(gpu);

    // Perform the actual benchmark
    struct timeval start = {0}, stop = {0};
    unsigned long frames = 0;
    int index = 0;

    pl_timer timer = pl_timer_create(gpu);
    uint64_t gputime_total = 0;
    unsigned long gputime_count = 0;
    uint64_t gputime;

    gettimeofday(&start, NULL);
    do {
        frames++;
        run_bench(gpu, dp, &state, src, fbos[index++], timer, bench);
        index %= NUM_FBOS;

        // Only flush and sample the clock after a full pass over the FBOs
        if (index == 0) {
            pl_gpu_flush(gpu);
            gettimeofday(&stop, NULL);
        }

        // Drain every timer result that is already available
        while ((gputime = pl_timer_query(gpu, timer))) {
            gputime_total += gputime;
            gputime_count++;
        }
    } while (stop.tv_sec - start.tv_sec < BENCH_DUR);

    // Force the GPU to finish execution and re-measure the final stop time
    pl_gpu_finish(gpu);

    gettimeofday(&stop, NULL);
    while ((gputime = pl_timer_query(gpu, timer))) {
        gputime_total += gputime;
        gputime_count++;
    }

    // Keep the elapsed time in double precision: a float cannot hold a
    // multi-second value to the six decimals that are printed below.
    double secs = (double) (stop.tv_sec - start.tv_sec) +
                  1e-6 * (double) (stop.tv_usec - start.tv_usec);
    printf("'%s':\t%4lu frames in %1.6f seconds => %2.6f ms/frame (%5.2f FPS)",
           name, frames, secs, 1000 * secs / frames, frames / secs);
    if (gputime_count) {
        // Average in floating point; dividing the integers first would
        // truncate the mean before it is scaled to milliseconds.
        double gpu_ms = 1e-6 * (double) gputime_total / (double) gputime_count;
        printf(", gpu time: %2.6f ms", gpu_ms);
    }
    printf("\n");

    pl_timer_destroy(gpu, &timer);
    pl_shader_obj_destroy(&state);
    pl_dispatch_destroy(&dp);
    pl_tex_destroy(gpu, &src);
    for (int i = 0; i < NUM_FBOS; i++)
        pl_tex_destroy(gpu, &fbos[i]);
}
158 
159 // List of benchmarks
// Debanding of `src`, passing NULL for the deband parameters.
// `state` is not used by this benchmark.
static void bench_deband(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    struct pl_sample_src sample = { .tex = src };
    pl_shader_deband(sh, &sample, NULL);
}
164 
// Debanding of `src` with explicit parameters: 4 iterations, threshold 4.0,
// radius 4.0 and grain 16.0. `state` is not used by this benchmark.
static void bench_deband_heavy(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    struct pl_sample_src sample = { .tex = src };
    struct pl_deband_params heavy = {
        .iterations = 4,
        .threshold  = 4.0,
        .radius     = 4.0,
        .grain      = 16.0,
    };

    pl_shader_deband(sh, &sample, &heavy);
}
175 
// Bilinear sampling of `src`. `state` is not used by this benchmark.
static void bench_bilinear(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    struct pl_sample_src sample = { .tex = src };
    pl_shader_sample_bilinear(sh, &sample);
}
180 
// Bicubic sampling of `src`. `state` is not used by this benchmark.
static void bench_bicubic(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    struct pl_sample_src sample = { .tex = src };
    pl_shader_sample_bicubic(sh, &sample);
}
185 
// Direct sampling of `src` followed by dithering to a depth of 8 using the
// default dither parameters with the method set to PL_DITHER_BLUE_NOISE.
static void bench_dither_blue(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    struct pl_sample_src sample = { .tex = src };
    struct pl_dither_params dither = pl_dither_default_params;
    dither.method = PL_DITHER_BLUE_NOISE;

    pl_shader_sample_direct(sh, &sample);
    pl_shader_dither(sh, 8, state, &dither);
}
194 
// Direct sampling of `src` followed by dithering to a depth of 8 using the
// default dither parameters with the method set to PL_DITHER_WHITE_NOISE.
static void bench_dither_white(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    struct pl_sample_src sample = { .tex = src };
    struct pl_dither_params dither = pl_dither_default_params;
    dither.method = PL_DITHER_WHITE_NOISE;

    pl_shader_sample_direct(sh, &sample);
    pl_shader_dither(sh, 8, state, &dither);
}
203 
// Direct sampling of `src` followed by dithering to a depth of 8 using the
// default dither parameters with the method set to PL_DITHER_ORDERED_FIXED.
static void bench_dither_ordered_fix(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    struct pl_sample_src sample = { .tex = src };
    struct pl_dither_params dither = pl_dither_default_params;
    dither.method = PL_DITHER_ORDERED_FIXED;

    pl_shader_sample_direct(sh, &sample);
    pl_shader_dither(sh, 8, state, &dither);
}
212 
// Polar sampling of `src` with the pl_filter_ewa_lanczos filter; `state` is
// handed to the sampler as its LUT slot.
static void bench_polar(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    struct pl_sample_src sample = { .tex = src };
    struct pl_sample_filter_params filter_params = {
        .filter = pl_filter_ewa_lanczos,
        .lut    = state,
    };

    pl_shader_sample_polar(sh, &sample, &filter_params);
}
222 
// Same as the polar benchmark (pl_filter_ewa_lanczos, `state` as the LUT
// slot) but with the `no_compute` flag set in the filter parameters.
static void bench_polar_nocompute(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    struct pl_sample_src sample = { .tex = src };
    struct pl_sample_filter_params filter_params = {
        .filter     = pl_filter_ewa_lanczos,
        .no_compute = true,
        .lut        = state,
    };

    pl_shader_sample_polar(sh, &sample, &filter_params);
}
233 
234 
// Direct sampling of `src` followed by peak detection for the
// pl_color_space_hdr10 color space, with NULL peak-detection parameters.
static void bench_hdr_peak(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    struct pl_sample_src sample = { .tex = src };

    pl_shader_sample_direct(sh, &sample);
    pl_shader_detect_peak(sh, pl_color_space_hdr10, state, NULL);
}
240 
// AV1 film grain applied to `src`: three components mapped 0/1/2, a
// zero-initialized color representation, and the shared av1_grain_data with
// a grain seed taken from rand() on every call.
static void bench_av1_grain(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    struct pl_color_repr repr = {0};
    struct pl_av1_grain_params grain = {
        .data = av1_grain_data,
        .tex = src,
        .components = 3,
        .component_mapping = {0, 1, 2},
        .repr = &repr,
    };

    grain.data.grain_seed = rand();
    pl_shader_av1_grain(sh, state, &grain);
}
254 
// AV1 film grain applied to `src` with the `overlap` flag enabled: three
// components mapped 0/1/2, a zero-initialized color representation, and the
// shared av1_grain_data with a grain seed taken from rand() on every call.
static void bench_av1_grain_lap(pl_shader sh, pl_shader_obj *state, pl_tex src)
{
    struct pl_color_repr repr = {0};
    struct pl_av1_grain_params grain = {
        .data = av1_grain_data,
        .tex = src,
        .components = 3,
        .component_mapping = {0, 1, 2},
        .repr = &repr,
    };

    grain.data.overlap = true;
    grain.data.grain_seed = rand();
    pl_shader_av1_grain(sh, state, &grain);
}
269 
270 static float data[TEX_SIZE * TEX_SIZE * 4 + 8192];
271 
// Downloads `tex` into the shared host buffer, with no completion callback.
static void bench_download(pl_gpu gpu, pl_tex tex)
{
    uint8_t *host_ptr = (uint8_t *) PL_ALIGN((uintptr_t) data, 4096);
    struct pl_tex_transfer_params xfer = {
        .tex = tex,
        .ptr = host_ptr,
    };

    pl_tex_download(gpu, &xfer);
}
279 
// Uploads the shared host buffer into `tex`, with no completion callback.
static void bench_upload(pl_gpu gpu, pl_tex tex)
{
    uint8_t *host_ptr = (uint8_t *) PL_ALIGN((uintptr_t) data, 4096);
    struct pl_tex_transfer_params xfer = {
        .tex = tex,
        .ptr = host_ptr,
    };

    pl_tex_upload(gpu, &xfer);
}
287 
// No-op callback handed to the asynchronous transfer benchmarks; `arg` is
// ignored.
static void dummy_cb(void *arg)
{
    (void) arg;
}
289 
// Downloads `tex` into the shared host buffer, supplying dummy_cb as the
// transfer callback.
static void bench_download_async(pl_gpu gpu, pl_tex tex)
{
    uint8_t *host_ptr = (uint8_t *) PL_ALIGN((uintptr_t) data, 4096);
    struct pl_tex_transfer_params xfer = {
        .tex      = tex,
        .ptr      = host_ptr,
        .callback = dummy_cb,
    };

    pl_tex_download(gpu, &xfer);
}
298 
// Uploads the shared host buffer into `tex`, supplying dummy_cb as the
// transfer callback.
static void bench_upload_async(pl_gpu gpu, pl_tex tex)
{
    uint8_t *host_ptr = (uint8_t *) PL_ALIGN((uintptr_t) data, 4096);
    struct pl_tex_transfer_params xfer = {
        .tex      = tex,
        .ptr      = host_ptr,
        .callback = dummy_cb,
    };

    pl_tex_upload(gpu, &xfer);
}
307 
main()308 int main()
309 {
310     setbuf(stdout, NULL);
311     setbuf(stderr, NULL);
312 
313     pl_log log = pl_log_create(PL_API_VER, &(struct pl_log_params) {
314         .log_cb     = isatty(fileno(stdout)) ? pl_log_color : pl_log_simple,
315         .log_level  = PL_LOG_WARN,
316     });
317 
318     pl_vulkan vk = pl_vulkan_create(log, &(struct pl_vulkan_params) {
319         .allow_software = true,
320         .async_compute = true,
321         .queue_count = NUM_FBOS,
322     });
323 
324     if (!vk)
325         return SKIP;
326 
327 #define BENCH_SH(fn) &(struct bench) { .run_sh = fn }
328 #define BENCH_TEX(fn) &(struct bench) { .run_tex = fn }
329 
330     printf("= Running benchmarks =\n");
331     benchmark(vk->gpu, "tex_download ptr", BENCH_TEX(bench_download));
332     benchmark(vk->gpu, "tex_download ptr async", BENCH_TEX(bench_download_async));
333     benchmark(vk->gpu, "tex_upload ptr", BENCH_TEX(bench_upload));
334     benchmark(vk->gpu, "tex_upload ptr async", BENCH_TEX(bench_upload_async));
335     benchmark(vk->gpu, "bilinear", BENCH_SH(bench_bilinear));
336     benchmark(vk->gpu, "bicubic", BENCH_SH(bench_bicubic));
337     benchmark(vk->gpu, "deband", BENCH_SH(bench_deband));
338     benchmark(vk->gpu, "deband_heavy", BENCH_SH(bench_deband_heavy));
339 
340     // Polar sampling
341     benchmark(vk->gpu, "polar", BENCH_SH(bench_polar));
342     if (vk->gpu->glsl.compute)
343         benchmark(vk->gpu, "polar_nocompute", BENCH_SH(bench_polar_nocompute));
344 
345     // Dithering algorithms
346     benchmark(vk->gpu, "dither_blue", BENCH_SH(bench_dither_blue));
347     benchmark(vk->gpu, "dither_white", BENCH_SH(bench_dither_white));
348     benchmark(vk->gpu, "dither_ordered_fixed", BENCH_SH(bench_dither_ordered_fix));
349 
350     // HDR peak detection
351     if (vk->gpu->glsl.compute)
352         benchmark(vk->gpu, "hdr_peakdetect", BENCH_SH(bench_hdr_peak));
353 
354     // Misc stuff
355     benchmark(vk->gpu, "av1_grain", BENCH_SH(bench_av1_grain));
356     benchmark(vk->gpu, "av1_grain_lap", BENCH_SH(bench_av1_grain_lap));
357 
358     pl_vulkan_destroy(&vk);
359     pl_log_destroy(&log);
360     return 0;
361 }
362