/* Presented are two hypothetical scenarios of how one might use libplacebo
 * as something like an FFmpeg or mpv video filter. We examine two example
 * APIs (loosely modeled after real video filtering APIs) and how each style
 * of API would make use of libplacebo.
 *
 * For the sake of a simple example, let's assume this is a debanding filter.
 * For those of you too lazy to compile/run this file but still want to see
 * results, these are from my machine (RX 5700 XT + 1950X, as of 2020-05-25):
 *
 * RADV+ACO:
 *   api1: 10000 frames in 16.328440 s => 1.632844 ms/frame (612.43 fps)
 *         render: 0.113524 ms, upload: 0.127551 ms, download: 0.146097 ms
 *   api2: 10000 frames in 5.335634 s => 0.533563 ms/frame (1874.19 fps)
 *         render: 0.064378 ms, upload: 0.000000 ms, download: 0.189719 ms
 *
 * AMDVLK:
 *   api1: 10000 frames in 14.921859 s => 1.492186 ms/frame (670.16 fps)
 *         render: 0.110603 ms, upload: 0.114412 ms, download: 0.115375 ms
 *   api2: 10000 frames in 4.667386 s => 0.466739 ms/frame (2142.53 fps)
 *         render: 0.030781 ms, upload: 0.000000 ms, download: 0.075237 ms
 *
 * You can see that AMDVLK is still better at texture streaming than RADV -
 * this is because, as of writing, RADV still does not support asynchronous
 * texture queues / DMA engine transfers. If we disable the `async_transfer`
 * option with AMDVLK, we get this:
 *
 *   api1: 10000 frames in 16.087723 s => 1.608772 ms/frame (621.59 fps)
 *         render: 0.111154 ms, upload: 0.122476 ms, download: 0.133162 ms
 *   api2: 10000 frames in 6.344959 s => 0.634496 ms/frame (1576.05 fps)
 *         render: 0.031307 ms, upload: 0.000000 ms, download: 0.083520 ms
 *
 * License: CC0 / Public Domain
 */

#include <assert.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>
#include <time.h>

#include "common.h"

#ifdef _WIN32
#include <windows.h>
#endif

#include <libplacebo/dispatch.h>
#include <libplacebo/shaders/sampling.h>
#include <libplacebo/utils/upload.h>
#include <libplacebo/vulkan.h>

///////////////////////
/// API definitions ///
///////////////////////

// Stuff that would be common to each API

void *init(void);
void uninit(void *priv);

struct format {
    // For simplicity let's make a few assumptions here, since configuring the
    // texture format is not the point of this example. (In practice you can
    // go nuts with the `utils/upload.h` helpers)
    //
    // - All formats contain unsigned integers only
    // - All components have the same size in bits
    // - All components are in the "canonical" order
    // - All formats have power of two sizes only (2 or 4 components, not 3)
    // - All plane strides are a multiple of the pixel size
    int num_comps;
    int bitdepth;
};

struct plane {
    int subx, suby; // subsampling shift
    struct format fmt;
    size_t stride;
    void *data;
};

#define MAX_PLANES 4

struct image {
    int width, height;
    int num_planes;
    struct plane planes[MAX_PLANES];

    // For API #2, the associated mapped buffer (if any)
    struct api2_buf *associated_buf;
};


// Example API design #1: synchronous, blocking, double-copy (bad!)
//
// In this API, `api1_filter` must immediately return with the new data.
// This prevents parallelism on the GPU and should be avoided if possible,
// but sometimes that's what you have to work with. So this is what it
// would look like.
//
// Also, let's assume this API design reconfigures the filter chain (using
// a blank `proxy` image) every time the image format or dimensions change,
// and doesn't expect us to fail due to format mismatches or resource
// exhaustion afterwards.

bool api1_reconfig(void *priv, const struct image *proxy);
bool api1_filter(void *priv, struct image *dst, struct image *src);
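// As a rough illustration (hypothetical, not part of this example's code),
// a host engine using this design might drive the filter like so; all
// `engine_*` names below are made up:
//
//     engine_describe_format(&proxy);          // once, or on format change
//     api1_reconfig(filter, &proxy);
//     while (engine_get_frame(&src, &dst)) {
//         // blocks until `dst` holds the filtered result
//         if (!api1_filter(filter, &dst, &src))
//             break;
//         engine_output_frame(&dst);
//     }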


// Example API design #2: asynchronous, streaming, queued, zero-copy (good!)
//
// In this API, `api2_process` will be run by the calling code every so often
// (e.g. when new data is available or expected). This function has access
// to non-blocking functions `get_image` and `put_image` that interface
// with the video filtering engine's internal queueing system.
//
// This API is also designed to feed multiple frames ahead of time, i.e.
// it will feed us as many frames as it can while we're still returning
// `API2_WANT_MORE`. To drain the filter chain, it would continue running
// the process function until `API2_HAVE_MORE` is no longer present
// in the output.
//
// This API is also designed to do zero-copy where possible. When it wants
// to create a data buffer of a given size, it will call our function
// `api2_alloc`, which will return a buffer that we can process directly.
// We can use this to do zero-copy uploading to the GPU, by creating
// host-visible persistently mapped buffers. In order to prevent the video
// filtering system from re-using our buffers while copies are happening, we
// use special functions `image_lock` and `image_unlock` to increase a
// refcount on the image's backing storage. (As is typical of such APIs)
//
// Finally, this API is designed to be fully dynamic: The image parameters
// could change at any time, and we must be equipped to handle that.
// (A sketch of how an engine might drive this API follows the declarations
// below.)

enum api2_status {
    // Negative values are used to signal error conditions
    API2_ERR_FMT = -2,          // incompatible / unsupported format
    API2_ERR_UNKNOWN = -1,      // some other error happened
    API2_OK = 0,                // no error, no status - everything's good

    // Positive values represent a mask of status conditions
    API2_WANT_MORE = (1 << 0),  // we want more frames, please feed some more!
    API2_HAVE_MORE = (1 << 1),  // we have more frames but they're not ready
};

enum api2_status api2_process(void *priv);

// Functions for creating persistently mapped buffers
struct api2_buf {
    void *data;
    size_t size;
    void *priv;
};

bool api2_alloc(void *priv, size_t size, struct api2_buf *out);
void api2_free(void *priv, const struct api2_buf *buf);

// These functions are provided by the API. The exact details of how images
// are enqueued, dequeued and locked are not really important here, so just
// do something unrealistic but simple to demonstrate with.
struct image *get_image(void);
void put_image(struct image *img);
void image_lock(struct image *img);
void image_unlock(struct image *img);
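// Again as a rough illustration (hypothetical, not part of this example's
// code), an engine driving this design might pump the filter like so, where
// `engine_have_input` and `engine_wait_a_bit` are made-up helpers:
//
//     enum api2_status st = API2_OK;
//     while (engine_have_input() || (st & API2_HAVE_MORE)) {
//         st = api2_process(filter);
//         if (st < 0)
//             break;                 // error condition, give up
//         if (!(st & API2_WANT_MORE))
//             engine_wait_a_bit();   // queue is full, back off briefly
//     }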


/////////////////////////////////
/// libplacebo implementation ///
/////////////////////////////////


// For API #2:
#define PARALLELISM 8

struct entry {
    pl_buf buf; // to stream the download
    pl_tex tex_in[MAX_PLANES];
    pl_tex tex_out[MAX_PLANES];
    struct image image;

    // For entries that are associated with a held image, so we can unlock them
    // as soon as possible
    struct image *held_image;
    pl_buf held_buf;
};

// For both APIs:
struct priv {
    pl_log log;
    pl_vulkan vk;
    pl_gpu gpu;
    pl_dispatch dp;
    pl_shader_obj dither_state;

    // Timer objects
    pl_timer render_timer;
    pl_timer upload_timer;
    pl_timer download_timer;
    uint64_t render_sum;
    uint64_t upload_sum;
    uint64_t download_sum;
    int render_count;
    int upload_count;
    int download_count;

    // API #1: A simple pair of input and output textures
    pl_tex tex_in[MAX_PLANES];
    pl_tex tex_out[MAX_PLANES];

    // API #2: A ring buffer of textures/buffers for streaming
    int idx_in;  // points to the next free entry
    int idx_out; // points to the first entry still in progress
    struct entry entries[PARALLELISM];
};
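// Note on the ring buffer indices above (this mirrors the logic used in
// `api2_process` below): the queue is empty when idx_in == idx_out, and full
// when advancing idx_in would make it equal to idx_out, i.e. when
// (idx_in + 1) % PARALLELISM == idx_out. For example, with PARALLELISM == 8,
// idx_out == 1 and idx_in == 3, entries 1 and 2 are still in flight and
// entry 3 is the next one to be filled.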

void *init(void) {
    struct priv *p = calloc(1, sizeof(struct priv));
    if (!p)
        return NULL;

    p->log = pl_log_create(PL_API_VER, &(struct pl_log_params) {
        .log_cb = pl_log_simple,
        .log_level = PL_LOG_WARN,
    });

    p->vk = pl_vulkan_create(p->log, &(struct pl_vulkan_params) {
        // Note: This is for API #2. In API #1 you could just pass params=NULL
        // and it wouldn't really matter much.
        .async_transfer = true,
        .async_compute = true,
        .queue_count = PARALLELISM,
    });

    if (!p->vk) {
        fprintf(stderr, "Failed creating vulkan context\n");
        goto error;
    }

    // Give this a shorter name for convenience
    p->gpu = p->vk->gpu;

    p->dp = pl_dispatch_create(p->log, p->gpu);
    if (!p->dp) {
        fprintf(stderr, "Failed creating shader dispatch object\n");
        goto error;
    }

    p->render_timer = pl_timer_create(p->gpu);
    p->upload_timer = pl_timer_create(p->gpu);
    p->download_timer = pl_timer_create(p->gpu);

    return p;

error:
    uninit(p);
    return NULL;
}

void uninit(void *priv)
{
    struct priv *p = priv;

    // API #1
    for (int i = 0; i < MAX_PLANES; i++) {
        pl_tex_destroy(p->gpu, &p->tex_in[i]);
        pl_tex_destroy(p->gpu, &p->tex_out[i]);
    }

    // API #2
    for (int i = 0; i < PARALLELISM; i++) {
        pl_buf_destroy(p->gpu, &p->entries[i].buf);
        for (int j = 0; j < MAX_PLANES; j++) {
            pl_tex_destroy(p->gpu, &p->entries[i].tex_in[j]);
            pl_tex_destroy(p->gpu, &p->entries[i].tex_out[j]);
        }
        if (p->entries[i].held_image)
            image_unlock(p->entries[i].held_image);
    }

    pl_timer_destroy(p->gpu, &p->render_timer);
    pl_timer_destroy(p->gpu, &p->upload_timer);
    pl_timer_destroy(p->gpu, &p->download_timer);

    pl_shader_obj_destroy(&p->dither_state);
    pl_dispatch_destroy(&p->dp);
    pl_vulkan_destroy(&p->vk);
    pl_log_destroy(&p->log);

    free(p);
}

// Helper function to set up the `pl_plane_data` struct from the image params
static void setup_plane_data(const struct image *img,
                             struct pl_plane_data out[MAX_PLANES])
{
    for (int i = 0; i < img->num_planes; i++) {
        const struct plane *plane = &img->planes[i];

        out[i] = (struct pl_plane_data) {
            .type = PL_FMT_UNORM,
            .width = img->width >> plane->subx,
            .height = img->height >> plane->suby,
            .pixel_stride = plane->fmt.num_comps * plane->fmt.bitdepth / 8,
            .row_stride = plane->stride,
            .pixels = plane->data,
        };

        // For API 2 (direct rendering)
        if (img->associated_buf) {
            pl_buf buf = img->associated_buf->priv;
            out[i].pixels = NULL;
            out[i].buf = buf;
            out[i].buf_offset = (uintptr_t) plane->data - (uintptr_t) buf->data;
        }

        for (int c = 0; c < plane->fmt.num_comps; c++) {
            out[i].component_size[c] = plane->fmt.bitdepth;
            out[i].component_pad[c] = 0;
            out[i].component_map[c] = c;
        }
    }
}
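// For reference, with the NV12-like `example_image` defined near the bottom
// of this file, the chroma plane (plane 1) comes out as roughly:
// .type = PL_FMT_UNORM, .width = 960, .height = 540, .pixel_stride = 2
// (two interleaved 8-bit components), .row_stride = 2048,
// .component_size = {8, 8}, .component_map = {0, 1}.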

static bool do_plane(struct priv *p, pl_tex dst, pl_tex src)
{
    int new_depth = dst->params.format->component_depth[0];

    // Do some debanding, and then also make sure to dither to the new depth
    // so that our debanded gradients are actually preserved well
    pl_shader sh = pl_dispatch_begin(p->dp);
    pl_shader_deband(sh, &(struct pl_sample_src){ .tex = src }, NULL);
    pl_shader_dither(sh, new_depth, &p->dither_state, NULL);
    return pl_dispatch_finish(p->dp, &(struct pl_dispatch_params) {
        .shader = &sh,
        .target = dst,
        .timer  = p->render_timer,
    });
}
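// Passing NULL to `pl_shader_deband` and `pl_shader_dither` above selects
// their default parameters. To tweak the debanding strength, you could for
// example pass a custom &(struct pl_deband_params) { .iterations = 2,
// .threshold = 8.0, .radius = 16.0, .grain = 4.0 } instead - these numbers
// are purely illustrative, not recommendations.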

static void check_timers(struct priv *p)
{
    uint64_t ret;

    while ((ret = pl_timer_query(p->gpu, p->render_timer))) {
        p->render_sum += ret;
        p->render_count++;
    }

    while ((ret = pl_timer_query(p->gpu, p->upload_timer))) {
        p->upload_sum += ret;
        p->upload_count++;
    }

    while ((ret = pl_timer_query(p->gpu, p->download_timer))) {
        p->download_sum += ret;
        p->download_count++;
    }
}

// API #1 implementation:
//
// In this design, we will create all GPU resources inside `reconfig`, based on
// the texture format configured from the proxy image. This will avoid failing
// later on due to e.g. resource exhaustion or texture format mismatch, and
// thereby falls within the intended semantics of this style of API.

bool api1_reconfig(void *priv, const struct image *proxy)
{
    struct priv *p = priv;
    struct pl_plane_data data[MAX_PLANES];
    setup_plane_data(proxy, data);

    for (int i = 0; i < proxy->num_planes; i++) {
        pl_fmt fmt = pl_plane_find_fmt(p->gpu, NULL, &data[i]);
        if (!fmt) {
            fprintf(stderr, "Failed configuring filter: no good texture format!\n");
            return false;
        }

        bool ok = true;
        ok &= pl_tex_recreate(p->gpu, &p->tex_in[i], &(struct pl_tex_params) {
            .w = data[i].width,
            .h = data[i].height,
            .format = fmt,
            .sampleable = true,
            .host_writable = true,
        });

        ok &= pl_tex_recreate(p->gpu, &p->tex_out[i], &(struct pl_tex_params) {
            .w = data[i].width,
            .h = data[i].height,
            .format = fmt,
            .renderable = true,
            .host_readable = true,
        });

        if (!ok) {
            fprintf(stderr, "Failed creating GPU textures!\n");
            return false;
        }
    }

    return true;
}

bool api1_filter(void *priv, struct image *dst, struct image *src)
{
    struct priv *p = priv;
    struct pl_plane_data data[MAX_PLANES];
    setup_plane_data(src, data);

    // Upload planes
    for (int i = 0; i < src->num_planes; i++) {
        bool ok = pl_tex_upload(p->gpu, &(struct pl_tex_transfer_params) {
            .tex = p->tex_in[i],
            .stride_w = data[i].row_stride / data[i].pixel_stride,
            .ptr = src->planes[i].data,
            .timer = p->upload_timer,
        });

        if (!ok) {
            fprintf(stderr, "Failed uploading data to the GPU!\n");
            return false;
        }
    }

    // Process planes
    for (int i = 0; i < src->num_planes; i++) {
        if (!do_plane(p, p->tex_out[i], p->tex_in[i])) {
            fprintf(stderr, "Failed processing planes!\n");
            return false;
        }
    }

    // Download planes
    for (int i = 0; i < src->num_planes; i++) {
        bool ok = pl_tex_download(p->gpu, &(struct pl_tex_transfer_params) {
            .tex = p->tex_out[i],
            .stride_w = dst->planes[i].stride / data[i].pixel_stride,
            .ptr = dst->planes[i].data,
            .timer = p->download_timer,
        });

        if (!ok) {
            fprintf(stderr, "Failed downloading data from the GPU!\n");
            return false;
        }
    }

    check_timers(p);
    return true;
}


// API #2 implementation:
//
// In this implementation we maintain a queue (implemented as a ring buffer)
// of "work entries", which are isolated structs that hold independent GPU
// resources - so that the GPU has no cross-entry dependencies on any of the
// textures or other resources. (Side note: It still has a dependency on the
// dither state, but this is just a shared LUT anyway)

// Align up to the nearest multiple of a power of two
#define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1))
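// e.g. ALIGN2(100, 64) == 128, and ALIGN2(2048, 256) == 2048 (already
// aligned). `align` must be a power of two for this bit trick to work.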

static enum api2_status submit_work(struct priv *p, struct entry *e,
                                    struct image *img)
{
    // If the image comes from a mapped buffer, we have to take a lock
    // while our upload is in progress
    if (img->associated_buf) {
        assert(!e->held_image);
        image_lock(img);
        e->held_image = img;
        e->held_buf = img->associated_buf->priv;
    }

    // Upload this image's data
    struct pl_plane_data data[MAX_PLANES];
    setup_plane_data(img, data);

    for (int i = 0; i < img->num_planes; i++) {
        pl_fmt fmt = pl_plane_find_fmt(p->gpu, NULL, &data[i]);
        if (!fmt)
            return API2_ERR_FMT;

        // FIXME: can we plumb a `pl_timer` in here somehow?
        if (!pl_upload_plane(p->gpu, NULL, &e->tex_in[i], &data[i]))
            return API2_ERR_UNKNOWN;

        // Re-create the target FBO as well with this format if necessary
        bool ok = pl_tex_recreate(p->gpu, &e->tex_out[i], &(struct pl_tex_params) {
            .w = data[i].width,
            .h = data[i].height,
            .format = fmt,
            .renderable = true,
            .host_readable = true,
        });
        if (!ok)
            return API2_ERR_UNKNOWN;
    }

    // Dispatch the work for this image
    for (int i = 0; i < img->num_planes; i++) {
        if (!do_plane(p, e->tex_out[i], e->tex_in[i]))
            return API2_ERR_UNKNOWN;
    }

    // Set up the resulting `struct image` that will hold our target
    // data. We just copy the format etc. from the source image
    memcpy(&e->image, img, sizeof(struct image));

    size_t offset[MAX_PLANES], stride[MAX_PLANES], total_size = 0;
    for (int i = 0; i < img->num_planes; i++) {
        // For performance, we want to make sure we align the stride
        // to a multiple of the GPU's preferred texture transfer stride
        // (This is entirely optional)
        stride[i] = ALIGN2(img->planes[i].stride,
                           p->gpu->limits.align_tex_xfer_stride);
        int height = img->height >> img->planes[i].suby;

        // Round up the offset to the nearest multiple of the optimal
        // transfer alignment. (This is also entirely optional)
        offset[i] = ALIGN2(total_size, p->gpu->limits.align_tex_xfer_offset);
        total_size = offset[i] + stride[i] * height;
    }
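    // As a concrete illustration: for the 1080p NV12 layout used in the
    // benchmark below, and assuming both alignment limits happen to be 256
    // bytes, this gives stride[0] == stride[1] == 2048, offset[0] == 0,
    // offset[1] == 2211840 and total_size == 3317760 (~3.2 MiB). The real
    // values depend on the limits reported by the GPU.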

    // Dispatch the asynchronous download into a mapped buffer
    bool ok = pl_buf_recreate(p->gpu, &e->buf, &(struct pl_buf_params) {
        .size = total_size,
        .host_mapped = true,
    });
    if (!ok)
        return API2_ERR_UNKNOWN;

    for (int i = 0; i < img->num_planes; i++) {
        ok = pl_tex_download(p->gpu, &(struct pl_tex_transfer_params) {
            .tex = e->tex_out[i],
            .stride_w = stride[i] / data[i].pixel_stride,
            .buf = e->buf,
            .buf_offset = offset[i],
            .timer = p->download_timer,
        });
        if (!ok)
            return API2_ERR_UNKNOWN;

        // Update the output fields
        e->image.planes[i].data = e->buf->data + offset[i];
        e->image.planes[i].stride = stride[i];
    }

    // Make sure this work starts processing in the background, and especially
    // so that we can move on to the next queue on the GPU
    pl_gpu_flush(p->gpu);
    return API2_OK;
}

enum api2_status api2_process(void *priv)
{
    struct priv *p = priv;
    enum api2_status ret = 0;

    // Opportunistically release any held images. We do this across the ring
    // buffer, rather than doing this as part of the following loop, because
    // we want to release images ahead-of-time (no FIFO constraints)
    for (int i = 0; i < PARALLELISM; i++) {
        struct entry *e = &p->entries[i];
        if (e->held_image && !pl_buf_poll(p->gpu, e->held_buf, 0)) {
            // upload buffer is no longer in use, release it
            image_unlock(e->held_image);
            e->held_image = NULL;
            e->held_buf = NULL;
        }
    }

    // Poll the status of existing entries and dequeue the ones that are done
    while (p->idx_out != p->idx_in) {
        struct entry *e = &p->entries[p->idx_out];
        if (pl_buf_poll(p->gpu, e->buf, 0))
            break;

        if (e->held_image) {
            image_unlock(e->held_image);
            e->held_image = NULL;
            e->held_buf = NULL;
        }

        // download buffer is no longer busy, dequeue the frame
        put_image(&e->image);
        p->idx_out = (p->idx_out + 1) % PARALLELISM;
    }

    // Fill up the queue with more work
    int last_free_idx = (p->idx_out ? p->idx_out : PARALLELISM) - 1;
    while (p->idx_in != last_free_idx) {
        struct image *img = get_image();
        if (!img) {
            ret |= API2_WANT_MORE;
            break;
        }

        enum api2_status err = submit_work(p, &p->entries[p->idx_in], img);
        if (err < 0)
            return err;

        p->idx_in = (p->idx_in + 1) % PARALLELISM;
    }

    if (p->idx_out != p->idx_in)
        ret |= API2_HAVE_MORE;

    return ret;
}

bool api2_alloc(void *priv, size_t size, struct api2_buf *out)
{
    struct priv *p = priv;
    if (!p->gpu->limits.buf_transfer || size > p->gpu->limits.max_mapped_size)
        return false;

    pl_buf buf = pl_buf_create(p->gpu, &(struct pl_buf_params) {
        .size = size,
        .host_mapped = true,
    });

    if (!buf)
        return false;

    *out = (struct api2_buf) {
        .data = buf->data,
        .size = size,
        .priv = (void *) buf,
    };
    return true;
}

void api2_free(void *priv, const struct api2_buf *buf)
{
    struct priv *p = priv;
    pl_buf plbuf = buf->priv;
    pl_buf_destroy(p->gpu, &plbuf);
}


////////////////////////////////////
/// Proof of Concept / Benchmark ///
////////////////////////////////////

#define FRAMES 10000

// Let's say we're processing a 1920x1080 4:2:0 8-bit NV12 video, arbitrarily
// with a stride aligned to 256 bytes. (For no particular reason)
#define TEXELSZ sizeof(uint8_t)
#define WIDTH   1920
#define HEIGHT  1080
#define STRIDE  (ALIGN2(WIDTH, 256) * TEXELSZ)
// Subsampled planes
#define SWIDTH  (WIDTH >> 1)
#define SHEIGHT (HEIGHT >> 1)
#define SSTRIDE (ALIGN2(SWIDTH, 256) * TEXELSZ)
// Plane offsets / sizes
#define SIZE0   (HEIGHT * STRIDE)
#define SIZE1   (2 * SHEIGHT * SSTRIDE)
#define OFFSET0 0
#define OFFSET1 SIZE0
#define BUFSIZE (OFFSET1 + SIZE1)
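// With the values above this works out to STRIDE == 2048, SSTRIDE == 1024,
// SIZE0 == 2211840, SIZE1 == 1105920 and BUFSIZE == 3317760 bytes (~3.2 MiB)
// per frame.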

// Skeleton of an example image
static const struct image example_image = {
    .width = WIDTH,
    .height = HEIGHT,
    .num_planes = 2,
    .planes = {
        {
            .subx = 0,
            .suby = 0,
            .stride = STRIDE,
            .fmt = {
                .num_comps = 1,
                .bitdepth = 8 * TEXELSZ,
            },
        }, {
            .subx = 1,
            .suby = 1,
            .stride = SSTRIDE * 2,
            .fmt = {
                .num_comps = 2,
                .bitdepth = 8 * TEXELSZ,
            },
        },
    },
};

// API #1: Nice and simple (but slow)
static void api1_example(void)
{
    struct priv *vf = init();
    if (!vf)
        return;

    if (!api1_reconfig(vf, &example_image)) {
        fprintf(stderr, "api1: Failed configuring video filter!\n");
        return;
    }

    // Allocate two buffers to hold the example data, and fill the source
    // buffer arbitrarily with a "simple" pattern. (Decoding the data into
    // the buffer is not meant to be part of this benchmark)
    uint8_t *srcbuf = malloc(BUFSIZE),
            *dstbuf = malloc(BUFSIZE);
    if (!srcbuf || !dstbuf)
        goto done;

    for (size_t i = 0; i < BUFSIZE; i++)
        srcbuf[i] = i;

    struct image src = example_image, dst = example_image;
    src.planes[0].data = srcbuf + OFFSET0;
    src.planes[1].data = srcbuf + OFFSET1;
    dst.planes[0].data = dstbuf + OFFSET0;
    dst.planes[1].data = dstbuf + OFFSET1;

    struct timeval start = {0}, stop = {0};
    gettimeofday(&start, NULL);

    // Process this dummy frame a bunch of times
    unsigned frames = 0;
    for (frames = 0; frames < FRAMES; frames++) {
        if (!api1_filter(vf, &dst, &src)) {
            fprintf(stderr, "api1: Failed filtering frame... aborting\n");
            break;
        }
    }

    gettimeofday(&stop, NULL);
    float secs = (float) (stop.tv_sec - start.tv_sec) +
                 1e-6 * (stop.tv_usec - start.tv_usec);

    printf("api1: %4u frames in %1.6f s => %2.6f ms/frame (%5.2f fps)\n",
           frames, secs, 1000 * secs / frames, frames / secs);

    if (vf->render_count) {
        printf("      render: %f ms, upload: %f ms, download: %f ms\n",
               1e-6 * vf->render_sum / vf->render_count,
               vf->upload_count ? (1e-6 * vf->upload_sum / vf->upload_count) : 0.0,
               vf->download_count ? (1e-6 * vf->download_sum / vf->download_count) : 0.0);
    }

done:
    free(srcbuf);
    free(dstbuf);
    uninit(vf);
}


// API #2: Pretend we have some fancy pool of images.
#define POOLSIZE (PARALLELISM + 1)

static struct api2_buf buffers[POOLSIZE] = {0};
static struct image images[POOLSIZE] = {0};
static int refcount[POOLSIZE] = {0};
static unsigned api2_frames_in = 0;
static unsigned api2_frames_out = 0;

static void api2_example(void)
{
    struct priv *vf = init();
    if (!vf)
        return;

    // Set up a bunch of dummy images
    for (int i = 0; i < POOLSIZE; i++) {
        uint8_t *data;
        images[i] = example_image;
        if (api2_alloc(vf, BUFSIZE, &buffers[i])) {
            data = buffers[i].data;
            images[i].associated_buf = &buffers[i];
        } else {
            // Fall back in case mapped buffers are unsupported
            fprintf(stderr, "warning: falling back to malloc, may be slow\n");
            data = malloc(BUFSIZE);
        }
        // Fill with some "data" (like in API #1)
        for (size_t n = 0; n < BUFSIZE; n++)
            data[n] = n;
        images[i].planes[0].data = data + OFFSET0;
        images[i].planes[1].data = data + OFFSET1;
    }

    struct timeval start = {0}, stop = {0};
    gettimeofday(&start, NULL);

    // Just keep driving the event loop regardless of the return status
    // until we reach the critical number of frames. (Good enough for this PoC)
    while (api2_frames_out < FRAMES) {
        enum api2_status ret = api2_process(vf);
        if (ret < 0) {
            fprintf(stderr, "api2: Failed processing... aborting\n");
            break;
        }

        // Sleep a short time (100us) to avoid busy-waiting the CPU
    #ifdef _WIN32
        Sleep(0);
    #else
        nanosleep(&(struct timespec) { .tv_nsec = 100000 }, NULL);
    #endif
        check_timers(vf);
    }

    gettimeofday(&stop, NULL);
    float secs = (float) (stop.tv_sec - start.tv_sec) +
                 1e-6 * (stop.tv_usec - start.tv_usec);

    printf("api2: %4u frames in %1.6f s => %2.6f ms/frame (%5.2f fps)\n",
           api2_frames_out, secs, 1000 * secs / api2_frames_out,
           api2_frames_out / secs);

    if (vf->render_count) {
        printf("      render: %f ms, upload: %f ms, download: %f ms\n",
               1e-6 * vf->render_sum / vf->render_count,
               vf->upload_count ? (1e-6 * vf->upload_sum / vf->upload_count) : 0.0,
               vf->download_count ? (1e-6 * vf->download_sum / vf->download_count) : 0.0);
    }

    for (int i = 0; i < POOLSIZE; i++) {
        if (images[i].associated_buf) {
            api2_free(vf, images[i].associated_buf);
        } else {
            // This is what we originally malloc'd
            free(images[i].planes[0].data);
        }
    }

    uninit(vf);
}

struct image *get_image(void)
{
    if (api2_frames_in == FRAMES)
        return NULL; // simulate EOF, to avoid queueing up "extra" work

    // If we can find a free (unlocked) image, return it
    for (int i = 0; i < POOLSIZE; i++) {
        if (refcount[i] == 0) {
            api2_frames_in++;
            return &images[i];
        }
    }

    return NULL; // no free image available
}

void put_image(struct image *img)
{
    (void)img;
    api2_frames_out++;
}

void image_lock(struct image *img)
{
    int index = img - images; // cheat, for lack of having actual image management
    refcount[index]++;
}

void image_unlock(struct image *img)
{
    int index = img - images;
    refcount[index]--;
}

int main(void)
{
    printf("Running benchmarks...\n");
    api1_example();
    api2_example();
    return 0;
}