1 /* Presented are two hypothetical scenarios of how one might use libplacebo
2 * as something like an FFmpeg or mpv video filter. We examine two example
3 * APIs (loosely modeled after real video filtering APIs) and how each style
4 * would like to use libplacebo.
5 *
6 * For sake of a simple example, let's assume this is a debanding filter.
7 * For those of you too lazy to compile/run this file but still want to see
8 * results, these are from my machine (RX 5700 XT + 1950X, as of 2020-05-25):
9 *
10 * RADV+ACO:
11 * api1: 10000 frames in 16.328440 s => 1.632844 ms/frame (612.43 fps)
12 * render: 0.113524 ms, upload: 0.127551 ms, download: 0.146097 ms
13 * api2: 10000 frames in 5.335634 s => 0.533563 ms/frame (1874.19 fps)
14 * render: 0.064378 ms, upload: 0.000000 ms, download: 0.189719 ms
15 *
16 * AMDVLK:
17 * api1: 10000 frames in 14.921859 s => 1.492186 ms/frame (670.16 fps)
18 * render: 0.110603 ms, upload: 0.114412 ms, download: 0.115375 ms
19 * api2: 10000 frames in 4.667386 s => 0.466739 ms/frame (2142.53 fps)
20 * render: 0.030781 ms, upload: 0.000000 ms, download: 0.075237 ms
21 *
22 * You can see that AMDVLK is still better at doing texture streaming than
23 * RADV - this is because as of writing RADV still does not support
24 * asynchronous texture queues / DMA engine transfers. If we disable the
25 * `async_transfer` option with AMDVLK we get this:
26 *
27 * api1: 10000 frames in 16.087723 s => 1.608772 ms/frame (621.59 fps)
28 * render: 0.111154 ms, upload: 0.122476 ms, download: 0.133162 ms
29 * api2: 10000 frames in 6.344959 s => 0.634496 ms/frame (1576.05 fps)
30 * render: 0.031307 ms, upload: 0.000000 ms, download: 0.083520 ms
31 *
32 * License: CC0 / Public Domain
33 */
34
35 #include <assert.h>
36 #include <stdlib.h>
37 #include <stdbool.h>
38 #include <stdio.h>
39 #include <string.h>
40 #include <sys/time.h>
41 #include <time.h>
42
43 #include "common.h"
44
45 #ifdef _WIN32
46 #include <windows.h>
47 #endif
48
49 #include <libplacebo/dispatch.h>
50 #include <libplacebo/shaders/sampling.h>
51 #include <libplacebo/utils/upload.h>
52 #include <libplacebo/vulkan.h>
53
54 ///////////////////////
55 /// API definitions ///
56 ///////////////////////
57
58 // Stuff that would be common to each API
59
60 void *init(void);
61 void uninit(void *priv);
62
struct format {
    // For simplicity let's make a few assumptions here, since configuring the
    // texture format is not the point of this example. (In practice you can
    // go nuts with the `utils/upload.h` helpers)
    //
    // - All formats contain unsigned integers only
    // - All components have the same size in bits
    // - All components are in the "canonical" order
    // - All formats have power of two sizes only (2 or 4 components, not 3)
    // - All plane strides are a multiple of the pixel size
    int num_comps;  // number of components per pixel (1, 2 or 4)
    int bitdepth;   // bits per component
};

struct plane {
    int subx, suby; // subsampling shift
    struct format fmt;
    size_t stride;  // bytes per row (assumed multiple of the pixel size)
    void *data;     // pointer to the first pixel of this plane
};

#define MAX_PLANES 4

struct image {
    int width, height;
    int num_planes;
    struct plane planes[MAX_PLANES];

    // For API #2, the associated mapped buffer (if any)
    struct api2_buf *associated_buf;
};
94
95
96 // Example API design #1: synchronous, blocking, double-copy (bad!)
97 //
98 // In this API, `api1_filter` must immediately return with the new data.
99 // This prevents parallelism on the GPU and should be avoided if possible,
100 // but sometimes that's what you have to work with. So this is what it
101 // would look like.
102 //
103 // Also, let's assume this API design reconfigures the filter chain (using
104 // a blank `proxy` image every time the image format or dimensions change,
105 // and doesn't expect us to fail due to format mismatches or resource
106 // exhaustion afterwards.
107
108 bool api1_reconfig(void *priv, const struct image *proxy);
109 bool api1_filter(void *priv, struct image *dst, struct image *src);
110
111
112 // Example API design #2: asynchronous, streaming, queued, zero-copy (good!)
113 //
114 // In this API, `api2_process` will run by the calling code every so often
115 // (e.g. when new data is available or expected). This function has access
116 // to non-blocking functions `get_image` and `put_image` that interface
117 // with the video filtering engine's internal queueing system.
118 //
119 // This API is also designed to feed multiple frames ahead of time, i.e.
120 // it will feed us as many frames as it can while we're still returning
121 // `API2_WANT_MORE`. To drain the filter chain, it would continue running
122 // the process function until `API2_HAVE_MORE` is no longer present
123 // in the output.
124 //
125 // This API is also designed to do zero-copy where possible. When it wants
126 // to create a data buffer of a given size, it will call our function
127 // `api2_alloc` which will return a buffer that we can process directly.
128 // We can use this to do zero-copy uploading to the GPU, by creating
129 // host-visible persistently mapped buffers. In order to prevent the video
130 // filtering system from re-using our buffers while copies are happening, we
131 // use special functions `image_lock` and `image_unlock` to increase a
132 // refcount on the image's backing storage. (As is typical of such APIs)
133 //
134 // Finally, this API is designed to be fully dynamic: The image parameters
135 // could change at any time, and we must be equipped to handle that.
136
enum api2_status {
    // Negative values are used to signal error conditions
    API2_ERR_FMT = -2,          // incompatible / unsupported format
    API2_ERR_UNKNOWN = -1,      // some other error happened
    API2_OK = 0,                // no error, no status - everything's good

    // Positive values represent a mask of status conditions
    API2_WANT_MORE = (1 << 0),  // we want more frames, please feed some more!
    API2_HAVE_MORE = (1 << 1),  // we have more frames but they're not ready
};

// Drive the filter: dequeue finished frames, enqueue new work. May be called
// at any time; returns a (positive) status mask or a negative error code.
enum api2_status api2_process(void *priv);

// Functions for creating persistently mapped buffers
struct api2_buf {
    void *data;   // CPU-visible mapping of the buffer
    size_t size;  // total size of the mapping, in bytes
    void *priv;   // opaque backing handle (here: a pl_buf)
};
156
157 bool api2_alloc(void *priv, size_t size, struct api2_buf *out);
158 void api2_free(void *priv, const struct api2_buf *buf);
159
160 // These functions are provided by the API. The exact details of how images
161 // are enqueued, dequeued and locked are not really important here, so just
162 // do something unrealistic but simple to demonstrate with.
163 struct image *get_image(void);
164 void put_image(struct image *img);
165 void image_lock(struct image *img);
166 void image_unlock(struct image *img);
167
168
169 /////////////////////////////////
170 /// libplacebo implementation ///
171 /////////////////////////////////
172
173
// For API #2: depth of the work ring buffer (number of frames in flight)
#define PARALLELISM 8

struct entry {
    pl_buf buf; // to stream the download
    pl_tex tex_in[MAX_PLANES];
    pl_tex tex_out[MAX_PLANES];
    struct image image; // the output image handed back via put_image()

    // For entries that are associated with a held image, so we can unlock them
    // as soon as possible
    struct image *held_image;
    pl_buf held_buf; // upload buffer backing `held_image`
};

// For both APIs:
struct priv {
    pl_log log;
    pl_vulkan vk;
    pl_gpu gpu;
    pl_dispatch dp;
    pl_shader_obj dither_state;

    // Timer objects
    pl_timer render_timer;
    pl_timer upload_timer;
    pl_timer download_timer;
    uint64_t render_sum;
    uint64_t upload_sum;
    uint64_t download_sum;
    int render_count;
    int upload_count;
    int download_count;

    // API #1: A simple pair of input and output textures
    pl_tex tex_in[MAX_PLANES];
    pl_tex tex_out[MAX_PLANES];

    // API #2: A ring buffer of textures/buffers for streaming
    int idx_in;  // points the next free entry
    int idx_out; // points to the first entry still in progress
    struct entry entries[PARALLELISM];
};
217
init(void)218 void *init(void) {
219 struct priv *p = calloc(1, sizeof(struct priv));
220 if (!p)
221 return NULL;
222
223 p->log = pl_log_create(PL_API_VER, &(struct pl_log_params) {
224 .log_cb = pl_log_simple,
225 .log_level = PL_LOG_WARN,
226 });
227
228 p->vk = pl_vulkan_create(p->log, &(struct pl_vulkan_params) {
229 // Note: This is for API #2. In API #1 you could just pass params=NULL
230 // and it wouldn't really matter much.
231 .async_transfer = true,
232 .async_compute = true,
233 .queue_count = PARALLELISM,
234 });
235
236 if (!p->vk) {
237 fprintf(stderr, "Failed creating vulkan context\n");
238 goto error;
239 }
240
241 // Give this a shorter name for convenience
242 p->gpu = p->vk->gpu;
243
244 p->dp = pl_dispatch_create(p->log, p->gpu);
245 if (!p->dp) {
246 fprintf(stderr, "Failed creating shader dispatch object\n");
247 goto error;
248 }
249
250 p->render_timer = pl_timer_create(p->gpu);
251 p->upload_timer = pl_timer_create(p->gpu);
252 p->download_timer = pl_timer_create(p->gpu);
253
254 return p;
255
256 error:
257 uninit(p);
258 return NULL;
259 }
260
uninit(void * priv)261 void uninit(void *priv)
262 {
263 struct priv *p = priv;
264
265 // API #1
266 for (int i = 0; i < MAX_PLANES; i++) {
267 pl_tex_destroy(p->gpu, &p->tex_in[i]);
268 pl_tex_destroy(p->gpu, &p->tex_out[i]);
269 }
270
271 // API #2
272 for (int i = 0; i < PARALLELISM; i++) {
273 pl_buf_destroy(p->gpu, &p->entries[i].buf);
274 for (int j = 0; j < MAX_PLANES; j++) {
275 pl_tex_destroy(p->gpu, &p->entries[i].tex_in[j]);
276 pl_tex_destroy(p->gpu, &p->entries[i].tex_out[j]);
277 }
278 if (p->entries[i].held_image)
279 image_unlock(p->entries[i].held_image);
280 }
281
282 pl_timer_destroy(p->gpu, &p->render_timer);
283 pl_timer_destroy(p->gpu, &p->upload_timer);
284 pl_timer_destroy(p->gpu, &p->download_timer);
285
286 pl_shader_obj_destroy(&p->dither_state);
287 pl_dispatch_destroy(&p->dp);
288 pl_vulkan_destroy(&p->vk);
289 pl_log_destroy(&p->log);
290
291 free(p);
292 }
293
294 // Helper function to set up the `pl_plane_data` struct from the image params
setup_plane_data(const struct image * img,struct pl_plane_data out[MAX_PLANES])295 static void setup_plane_data(const struct image *img,
296 struct pl_plane_data out[MAX_PLANES])
297 {
298 for (int i = 0; i < img->num_planes; i++) {
299 const struct plane *plane = &img->planes[i];
300
301 out[i] = (struct pl_plane_data) {
302 .type = PL_FMT_UNORM,
303 .width = img->width >> plane->subx,
304 .height = img->height >> plane->suby,
305 .pixel_stride = plane->fmt.num_comps * plane->fmt.bitdepth / 8,
306 .row_stride = plane->stride,
307 .pixels = plane->data,
308 };
309
310 // For API 2 (direct rendering)
311 if (img->associated_buf) {
312 pl_buf buf = img->associated_buf->priv;
313 out[i].pixels = NULL;
314 out[i].buf = buf;
315 out[i].buf_offset = (uintptr_t) plane->data - (uintptr_t) buf->data;
316 }
317
318 for (int c = 0; c < plane->fmt.num_comps; c++) {
319 out[i].component_size[c] = plane->fmt.bitdepth;
320 out[i].component_pad[c] = 0;
321 out[i].component_map[c] = c;
322 }
323 }
324 }
325
// Run the actual filter (deband + dither) for one plane, rendering from
// `src` into `dst`. Returns false on dispatch failure.
static bool do_plane(struct priv *p, pl_tex dst, pl_tex src)
{
    // Dither to the target depth so the debanded gradients are actually
    // preserved well, rather than being re-quantized away
    const int target_depth = dst->params.format->component_depth[0];

    pl_shader sh = pl_dispatch_begin(p->dp);
    pl_shader_deband(sh, &(struct pl_sample_src){ .tex = src }, NULL);
    pl_shader_dither(sh, target_depth, &p->dither_state, NULL);

    struct pl_dispatch_params params = {
        .shader = &sh,
        .target = dst,
        .timer = p->render_timer,
    };
    return pl_dispatch_finish(p->dp, &params);
}
341
check_timers(struct priv * p)342 static void check_timers(struct priv *p)
343 {
344 uint64_t ret;
345
346 while ((ret = pl_timer_query(p->gpu, p->render_timer))) {
347 p->render_sum += ret;
348 p->render_count++;
349 }
350
351 while ((ret = pl_timer_query(p->gpu, p->upload_timer))) {
352 p->upload_sum += ret;
353 p->upload_count++;
354 }
355
356 while ((ret = pl_timer_query(p->gpu, p->download_timer))) {
357 p->download_sum += ret;
358 p->download_count++;
359 }
360 }
361
362 // API #1 implementation:
363 //
364 // In this design, we will create all GPU resources inside `reconfig`, based on
365 // the texture format configured from the proxy image. This will avoid failing
366 // later on due to e.g. resource exhaustion or texture format mismatch, and
367 // thereby falls within the intended semantics of this style of API.
368
api1_reconfig(void * priv,const struct image * proxy)369 bool api1_reconfig(void *priv, const struct image *proxy)
370 {
371 struct priv *p = priv;
372 struct pl_plane_data data[MAX_PLANES];
373 setup_plane_data(proxy, data);
374
375 for (int i = 0; i < proxy->num_planes; i++) {
376 pl_fmt fmt = pl_plane_find_fmt(p->gpu, NULL, &data[i]);
377 if (!fmt) {
378 fprintf(stderr, "Failed configuring filter: no good texture format!\n");
379 return false;
380 }
381
382 bool ok = true;
383 ok &= pl_tex_recreate(p->gpu, &p->tex_in[i], &(struct pl_tex_params) {
384 .w = data[i].width,
385 .h = data[i].height,
386 .format = fmt,
387 .sampleable = true,
388 .host_writable = true,
389 });
390
391 ok &= pl_tex_recreate(p->gpu, &p->tex_out[i], &(struct pl_tex_params) {
392 .w = data[i].width,
393 .h = data[i].height,
394 .format = fmt,
395 .renderable = true,
396 .host_readable = true,
397 });
398
399 if (!ok) {
400 fprintf(stderr, "Failed creating GPU textures!\n");
401 return false;
402 }
403 }
404
405 return true;
406 }
407
api1_filter(void * priv,struct image * dst,struct image * src)408 bool api1_filter(void *priv, struct image *dst, struct image *src)
409 {
410 struct priv *p = priv;
411 struct pl_plane_data data[MAX_PLANES];
412 setup_plane_data(src, data);
413
414 // Upload planes
415 for (int i = 0; i < src->num_planes; i++) {
416 bool ok = pl_tex_upload(p->gpu, &(struct pl_tex_transfer_params) {
417 .tex = p->tex_in[i],
418 .stride_w = data[i].row_stride / data[i].pixel_stride,
419 .ptr = src->planes[i].data,
420 .timer = p->upload_timer,
421 });
422
423 if (!ok) {
424 fprintf(stderr, "Failed uploading data to the GPU!\n");
425 return false;
426 }
427 }
428
429 // Process planes
430 for (int i = 0; i < src->num_planes; i++) {
431 if (!do_plane(p, p->tex_out[i], p->tex_in[i])) {
432 fprintf(stderr, "Failed processing planes!\n");
433 return false;
434 }
435 }
436
437 // Download planes
438 for (int i = 0; i < src->num_planes; i++) {
439 bool ok = pl_tex_download(p->gpu, &(struct pl_tex_transfer_params) {
440 .tex = p->tex_out[i],
441 .stride_w = dst->planes[i].stride / data[i].pixel_stride,
442 .ptr = dst->planes[i].data,
443 .timer = p->download_timer,
444 });
445
446 if (!ok) {
447 fprintf(stderr, "Failed downloading data from the GPU!\n");
448 return false;
449 }
450 }
451
452 check_timers(p);
453 return true;
454 }
455
456
457 // API #2 implementation:
458 //
459 // In this implementation we maintain a queue (implemented as ring buffer)
460 // of "work entries", which are isolated structs that hold independent GPU
461 // resources - so that the GPU has no cross-entry dependencies on any of the
462 // textures or other resources. (Side note: It still has a dependency on the
463 // dither state, but this is just a shared LUT anyway)
464
465 // Align up to the nearest multiple of a power of two
466 #define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1))
467
// Submit one image's worth of work (upload -> filter -> async download) to
// the ring buffer entry `e`. On success, `e->image` describes the output
// frame whose pixel data will become valid once `e->buf` finishes its
// download. Returns API2_OK or a negative API2_ERR_* code.
static enum api2_status submit_work(struct priv *p, struct entry *e,
                                    struct image *img)
{
    // If the image comes from a mapped buffer, we have to take a lock
    // while our upload is in progress
    if (img->associated_buf) {
        assert(!e->held_image);
        image_lock(img);
        e->held_image = img;
        e->held_buf = img->associated_buf->priv;
    }

    // Upload this image's data
    struct pl_plane_data data[MAX_PLANES];
    setup_plane_data(img, data);

    for (int i = 0; i < img->num_planes; i++) {
        pl_fmt fmt = pl_plane_find_fmt(p->gpu, NULL, &data[i]);
        if (!fmt)
            return API2_ERR_FMT;

        // FIXME: can we plumb a `pl_timer` in here somehow?
        if (!pl_upload_plane(p->gpu, NULL, &e->tex_in[i], &data[i]))
            return API2_ERR_UNKNOWN;

        // Re-create the target FBO as well with this format if necessary
        // (no-op if the parameters are unchanged from the previous frame)
        bool ok = pl_tex_recreate(p->gpu, &e->tex_out[i], &(struct pl_tex_params) {
            .w = data[i].width,
            .h = data[i].height,
            .format = fmt,
            .renderable = true,
            .host_readable = true,
        });
        if (!ok)
            return API2_ERR_UNKNOWN;
    }

    // Dispatch the work for this image
    for (int i = 0; i < img->num_planes; i++) {
        if (!do_plane(p, e->tex_out[i], e->tex_in[i]))
            return API2_ERR_UNKNOWN;
    }

    // Set up the resulting `struct image` that will hold our target
    // data. We just copy the format etc. from the source image
    memcpy(&e->image, img, sizeof(struct image));

    // Compute a packed layout (offset/stride per plane) for the download
    // buffer that holds all planes back to back
    size_t offset[MAX_PLANES], stride[MAX_PLANES], total_size = 0;
    for (int i = 0; i < img->num_planes; i++) {
        // For performance, we want to make sure we align the stride
        // to a multiple of the GPU's preferred texture transfer stride
        // (This is entirely optional)
        stride[i] = ALIGN2(img->planes[i].stride,
                           p->gpu->limits.align_tex_xfer_stride);
        int height = img->height >> img->planes[i].suby;

        // Round up the offset to the nearest multiple of the optimal
        // transfer alignment. (This is also entirely optional)
        offset[i] = ALIGN2(total_size, p->gpu->limits.align_tex_xfer_offset);
        total_size = offset[i] + stride[i] * height;
    }

    // Dispatch the asynchronous download into a mapped buffer
    bool ok = pl_buf_recreate(p->gpu, &e->buf, &(struct pl_buf_params) {
        .size = total_size,
        .host_mapped = true,
    });
    if (!ok)
        return API2_ERR_UNKNOWN;

    for (int i = 0; i < img->num_planes; i++) {
        ok = pl_tex_download(p->gpu, &(struct pl_tex_transfer_params) {
            .tex = e->tex_out[i],
            .stride_w = stride[i] / data[i].pixel_stride,
            .buf = e->buf,
            .buf_offset = offset[i],
            .timer = p->download_timer,
        });
        if (!ok)
            return API2_ERR_UNKNOWN;

        // Update the output fields to point into the mapped download buffer
        e->image.planes[i].data = e->buf->data + offset[i];
        e->image.planes[i].stride = stride[i];
    }

    // Make sure this work starts processing in the background, and especially
    // so we can move on to the next queue on the GPU
    pl_gpu_flush(p->gpu);
    return API2_OK;
}
559
// One iteration of the streaming event loop: release finished upload locks,
// dequeue completed frames in FIFO order, then fill the ring buffer with new
// work. Returns a status mask (API2_WANT_MORE / API2_HAVE_MORE) or a
// negative error code.
enum api2_status api2_process(void *priv)
{
    struct priv *p = priv;
    enum api2_status ret = 0;

    // Opportunistically release any held images. We do this across the ring
    // buffer, rather than doing this as part of the following loop, because
    // we want to release images ahead-of-time (no FIFO constraints)
    for (int i = 0; i < PARALLELISM; i++) {
        struct entry *e = &p->entries[i];
        if (e->held_image && !pl_buf_poll(p->gpu, e->held_buf, 0)) {
            // upload buffer is no longer in use, release it
            image_unlock(e->held_image);
            e->held_image = NULL;
            e->held_buf = NULL;
        }
    }

    // Poll the status of existing entries and dequeue the ones that are done
    while (p->idx_out != p->idx_in) {
        struct entry *e = &p->entries[p->idx_out];
        // Stop at the first entry whose download is still in flight, to
        // preserve FIFO output ordering
        if (pl_buf_poll(p->gpu, e->buf, 0))
            break;

        if (e->held_image) {
            image_unlock(e->held_image);
            e->held_image = NULL;
            e->held_buf = NULL;
        }

        // download buffer is no longer busy, dequeue the frame
        put_image(&e->image);
        p->idx_out = (p->idx_out + 1) % PARALLELISM;
    }

    // Fill up the queue with more work. We deliberately leave one entry
    // unused, so that idx_in == idx_out unambiguously means "empty"
    int last_free_idx = (p->idx_out ? p->idx_out : PARALLELISM) - 1;
    while (p->idx_in != last_free_idx) {
        struct image *img = get_image();
        if (!img) {
            ret |= API2_WANT_MORE;
            break;
        }

        enum api2_status err = submit_work(p, &p->entries[p->idx_in], img);
        if (err < 0)
            return err;

        p->idx_in = (p->idx_in + 1) % PARALLELISM;
    }

    // Any in-flight entries mean more output is coming
    if (p->idx_out != p->idx_in)
        ret |= API2_HAVE_MORE;

    return ret;
}
616
api2_alloc(void * priv,size_t size,struct api2_buf * out)617 bool api2_alloc(void *priv, size_t size, struct api2_buf *out)
618 {
619 struct priv *p = priv;
620 if (!p->gpu->limits.buf_transfer || size > p->gpu->limits.max_mapped_size)
621 return false;
622
623 pl_buf buf = pl_buf_create(p->gpu, &(struct pl_buf_params) {
624 .size = size,
625 .host_mapped = true,
626 });
627
628 if (!buf)
629 return false;
630
631 *out = (struct api2_buf) {
632 .data = buf->data,
633 .size = size,
634 .priv = (void *) buf,
635 };
636 return true;
637 }
638
api2_free(void * priv,const struct api2_buf * buf)639 void api2_free(void *priv, const struct api2_buf *buf)
640 {
641 struct priv *p = priv;
642 pl_buf plbuf = buf->priv;
643 pl_buf_destroy(p->gpu, &plbuf);
644 }
645
646
647 ////////////////////////////////////
648 /// Proof of Concept / Benchmark ///
649 ////////////////////////////////////
650
#define FRAMES 10000

// Let's say we're processing a 1920x1080 4:2:0 8-bit NV12 video, arbitrarily
// with a stride aligned to 256 bytes. (For no particular reason)
#define TEXELSZ sizeof(uint8_t)
#define WIDTH 1920
#define HEIGHT 1080
#define STRIDE (ALIGN2(WIDTH, 256) * TEXELSZ)
// Subsampled planes
#define SWIDTH (WIDTH >> 1)
#define SHEIGHT (HEIGHT >> 1)
#define SSTRIDE (ALIGN2(SWIDTH, 256) * TEXELSZ)
// Plane offsets / sizes (SIZE1 is doubled for the interleaved CbCr pair)
#define SIZE0 (HEIGHT * STRIDE)
#define SIZE1 (2 * SHEIGHT * SSTRIDE)
#define OFFSET0 0
#define OFFSET1 SIZE0
#define BUFSIZE (OFFSET1 + SIZE1)

// Skeleton of an example image
static const struct image example_image = {
    .width = WIDTH,
    .height = HEIGHT,
    .num_planes = 2,
    .planes = {
        {
            // Luma plane: full resolution, one component
            .subx = 0,
            .suby = 0,
            .stride = STRIDE,
            .fmt = {
                .num_comps = 1,
                .bitdepth = 8 * TEXELSZ,
            },
        }, {
            // Chroma plane: half resolution, interleaved Cb+Cr (NV12-style)
            .subx = 1,
            .suby = 1,
            .stride = SSTRIDE * 2,
            .fmt = {
                .num_comps = 2,
                .bitdepth = 8 * TEXELSZ,
            },
        },
    },
};
695
696 // API #1: Nice and simple (but slow)
api1_example(void)697 static void api1_example(void)
698 {
699 struct priv *vf = init();
700 if (!vf)
701 return;
702
703 if (!api1_reconfig(vf, &example_image)) {
704 fprintf(stderr, "api1: Failed configuring video filter!\n");
705 return;
706 }
707
708 // Allocate two buffers to hold the example data, and fill the source
709 // buffer arbitrarily with a "simple" pattern. (Decoding the data into
710 // the buffer is not meant to be part of this benchmark)
711 uint8_t *srcbuf = malloc(BUFSIZE),
712 *dstbuf = malloc(BUFSIZE);
713 if (!srcbuf || !dstbuf)
714 goto done;
715
716 for (size_t i = 0; i < BUFSIZE; i++)
717 srcbuf[i] = i;
718
719 struct image src = example_image, dst = example_image;
720 src.planes[0].data = srcbuf + OFFSET0;
721 src.planes[1].data = srcbuf + OFFSET1;
722 dst.planes[0].data = dstbuf + OFFSET0;
723 dst.planes[1].data = dstbuf + OFFSET1;
724
725 struct timeval start = {0}, stop = {0};
726 gettimeofday(&start, NULL);
727
728 // Process this dummy frame a bunch of times
729 unsigned frames = 0;
730 for (frames = 0; frames < FRAMES; frames++) {
731 if (!api1_filter(vf, &dst, &src)) {
732 fprintf(stderr, "api1: Failed filtering frame... aborting\n");
733 break;
734 }
735 }
736
737 gettimeofday(&stop, NULL);
738 float secs = (float) (stop.tv_sec - start.tv_sec) +
739 1e-6 * (stop.tv_usec - start.tv_usec);
740
741 printf("api1: %4u frames in %1.6f s => %2.6f ms/frame (%5.2f fps)\n",
742 frames, secs, 1000 * secs / frames, frames / secs);
743
744 if (vf->render_count) {
745 printf(" render: %f ms, upload: %f ms, download: %f ms\n",
746 1e-6 * vf->render_sum / vf->render_count,
747 vf->upload_count ? (1e-6 * vf->upload_sum / vf->upload_count) : 0.0,
748 vf->download_count ? (1e-6 * vf->download_sum / vf->download_count) : 0.0);
749 }
750
751 done:
752 free(srcbuf);
753 free(dstbuf);
754 uninit(vf);
755 }
756
757
// API #2: Pretend we have some fancy pool of images.
// One extra slot beyond the ring depth so a free image is usually available
#define POOLSIZE (PARALLELISM + 1)

static struct api2_buf buffers[POOLSIZE] = {0}; // mapped buffers (if supported)
static struct image images[POOLSIZE] = {0};     // the image pool itself
static int refcount[POOLSIZE] = {0};            // lock counts per pool slot
static unsigned api2_frames_in = 0;             // frames handed out via get_image()
static unsigned api2_frames_out = 0;            // frames returned via put_image()
766
api2_example(void)767 static void api2_example(void)
768 {
769 struct priv *vf = init();
770 if (!vf)
771 return;
772
773 // Set up a bunch of dummy images
774 for (int i = 0; i < POOLSIZE; i++) {
775 uint8_t *data;
776 images[i] = example_image;
777 if (api2_alloc(vf, BUFSIZE, &buffers[i])) {
778 data = buffers[i].data;
779 images[i].associated_buf = &buffers[i];
780 } else {
781 // Fall back in case mapped buffers are unsupported
782 fprintf(stderr, "warning: falling back to malloc, may be slow\n");
783 data = malloc(BUFSIZE);
784 }
785 // Fill with some "data" (like in API #1)
786 for (size_t n = 0; n < BUFSIZE; n++)
787 data[i] = n;
788 images[i].planes[0].data = data + OFFSET0;
789 images[i].planes[1].data = data + OFFSET1;
790 }
791
792 struct timeval start = {0}, stop = {0};
793 gettimeofday(&start, NULL);
794
795 // Just keep driving the event loop regardless of the return status
796 // until we reach the critical number of frames. (Good enough for this PoC)
797 while (api2_frames_out < FRAMES) {
798 enum api2_status ret = api2_process(vf);
799 if (ret < 0) {
800 fprintf(stderr, "api2: Failed processing... aborting\n");
801 break;
802 }
803
804 // Sleep a short time (100us) to prevent busy waiting the CPU
805 #ifdef _WIN32
806 Sleep(0);
807 #else
808 nanosleep(&(struct timespec) { .tv_nsec = 100000 }, NULL);
809 #endif
810 check_timers(vf);
811 }
812
813 gettimeofday(&stop, NULL);
814 float secs = (float) (stop.tv_sec - start.tv_sec) +
815 1e-6 * (stop.tv_usec - start.tv_usec);
816
817 printf("api2: %4u frames in %1.6f s => %2.6f ms/frame (%5.2f fps)\n",
818 api2_frames_out, secs, 1000 * secs / api2_frames_out,
819 api2_frames_out / secs);
820
821 if (vf->render_count) {
822 printf(" render: %f ms, upload: %f ms, download: %f ms\n",
823 1e-6 * vf->render_sum / vf->render_count,
824 vf->upload_count ? (1e-6 * vf->upload_sum / vf->upload_count) : 0.0,
825 vf->download_count ? (1e-6 * vf->download_sum / vf->download_count) : 0.0);
826 }
827
828 for (int i = 0; i < POOLSIZE; i++) {
829 if (images[i].associated_buf) {
830 api2_free(vf, images[i].associated_buf);
831 } else {
832 // This is what we originally malloc'd
833 free(images[i].planes[0].data);
834 }
835 }
836
837 uninit(vf);
838 }
839
get_image(void)840 struct image *get_image(void)
841 {
842 if (api2_frames_in == FRAMES)
843 return NULL; // simulate EOF, to avoid queueing up "extra" work
844
845 // if we can find a free (unlocked) image, give it that
846 for (int i = 0; i < POOLSIZE; i++) {
847 if (refcount[i] == 0) {
848 api2_frames_in++;
849 return &images[i];
850 }
851 }
852
853 return NULL; // no free image available
854 }
855
put_image(struct image * img)856 void put_image(struct image *img)
857 {
858 (void)img;
859 api2_frames_out++;
860 }
861
image_lock(struct image * img)862 void image_lock(struct image *img)
863 {
864 int index = img - images; // cheat, for lack of having actual image management
865 refcount[index]++;
866 }
867
image_unlock(struct image * img)868 void image_unlock(struct image *img)
869 {
870 int index = img - images;
871 refcount[index]--;
872 }
873
int main(void)
{
    // Run both benchmark scenarios back to back; each one creates and
    // tears down its own GPU context
    printf("Running benchmarks...\n");
    api1_example();
    api2_example();
    return 0;
}
881