#include "Halide.h"
#include "halide_benchmark.h"
#include <cstdio>
#include <cstdlib>
#include <memory>
5 
6 using namespace Halide;
7 using namespace Halide::Tools;
8 
test_deinterleave()9 void test_deinterleave() {
10     ImageParam src(UInt(8), 3);
11     Func dst;
12     Var x, y, c;
13 
14     dst(x, y, c) = src(x, y, c);
15 
16     src.dim(0).set_stride(3).dim(2).set_stride(1).set_bounds(0, 3);
17 
18     // This is the default format for Halide, but made explicit for illustration.
19     dst.output_buffer()
20         .dim(0)
21         .set_stride(1)
22         .dim(2)
23         .set_extent(3);
24 
25     dst.reorder(c, x, y).unroll(c);
26     dst.vectorize(x, 16);
27 
28     // Allocate two 16 megapixel, 3 channel, 8-bit images -- input and output
29 
30     // Setup src to be RGB interleaved, with no extra padding between channels or rows.
31     Buffer<uint8_t> src_image = Buffer<uint8_t>::make_interleaved(1 << 12, 1 << 12, 3);
32 
33     // Setup dst to be planar, with no extra padding between channels or rows.
34     Buffer<uint8_t> dst_image(1 << 12, 1 << 12, 3);
35 
36     src_image.for_each_element([&](int x, int y) {
37         src_image(x, y, 0) = 0;
38         src_image(x, y, 1) = 128;
39         src_image(x, y, 2) = 255;
40     });
41     dst_image.fill(0);
42 
43     src.set(src_image);
44 
45     dst.compile_jit();
46 
47     // Warm up caches, etc.
48     dst.realize(dst_image);
49 
50     double t1 = benchmark([&]() {
51         dst.realize(dst_image);
52     });
53 
54     printf("Interleaved to planar bandwidth %.3e byte/s.\n",
55            dst_image.number_of_elements() / t1);
56 
57     dst_image.for_each_element([&](int x, int y) {
58         assert(dst_image(x, y, 0) == 0);
59         assert(dst_image(x, y, 1) == 128);
60         assert(dst_image(x, y, 2) == 255);
61     });
62 
63     // Setup a semi-planar output case.
64     dst_image = Buffer<uint8_t>(1 << 12, 3, 1 << 12);
65     dst_image.transpose(1, 2);
66     dst_image.fill(0);
67 
68     double t2 = benchmark([&]() {
69         dst.realize(dst_image);
70     });
71 
72     dst_image.for_each_element([&](int x, int y) {
73         assert(dst_image(x, y, 0) == 0);
74         assert(dst_image(x, y, 1) == 128);
75         assert(dst_image(x, y, 2) == 255);
76     });
77 
78     printf("Interleaved to semi-planar bandwidth %.3e byte/s.\n",
79            dst_image.number_of_elements() / t2);
80 }
81 
test_interleave(bool fast)82 void test_interleave(bool fast) {
83     ImageParam src(UInt(8), 3);
84     Func dst;
85     Var x, y, c;
86 
87     dst(x, y, c) = src(x, y, c);
88 
89     // This is the default format for Halide, but made explicit for illustration.
90     src.dim(0).set_stride(1).dim(2).set_extent(3);
91 
92     dst.output_buffer()
93         .dim(0)
94         .set_stride(3)
95         .dim(2)
96         .set_stride(1)
97         .set_bounds(0, 3);
98 
99     if (fast) {
100         dst.reorder(c, x, y).bound(c, 0, 3).unroll(c);
101         dst.vectorize(x, 16);
102     } else {
103         dst.reorder(c, x, y).vectorize(x, 16);
104     }
105 
106     // Allocate two 16 megapixel, 3 channel, 8-bit images -- input and output
107 
108     // Setup src to be planar
109     Buffer<uint8_t> src_image(1 << 12, 1 << 12, 3);
110 
111     // Setup dst to be interleaved
112     Buffer<uint8_t> dst_image = Buffer<uint8_t>::make_interleaved(1 << 12, 1 << 12, 3);
113 
114     src_image.for_each_element([&](int x, int y) {
115         src_image(x, y, 0) = 0;
116         src_image(x, y, 1) = 128;
117         src_image(x, y, 2) = 255;
118     });
119     dst_image.fill(0);
120 
121     src.set(src_image);
122 
123     if (fast) {
124         dst.compile_to_lowered_stmt("rgb_interleave_fast.stmt", dst.infer_arguments());
125     } else {
126         dst.compile_to_lowered_stmt("rgb_interleave_slow.stmt", dst.infer_arguments());
127     }
128 
129     // Warm up caches, etc.
130     dst.realize(dst_image);
131 
132     double t = benchmark([&]() {
133         dst.realize(dst_image);
134     });
135 
136     printf("Planar to interleaved bandwidth %.3e byte/s.\n",
137            dst_image.number_of_elements() / t);
138 
139     dst_image.for_each_element([&](int x, int y) {
140         assert(dst_image(x, y, 0) == 0);
141         assert(dst_image(x, y, 1) == 128);
142         assert(dst_image(x, y, 2) == 255);
143     });
144 }
145 
main(int argc,char ** argv)146 int main(int argc, char **argv) {
147     Target target = get_jit_target_from_environment();
148     if (target.arch == Target::WebAssembly) {
149         printf("[SKIP] Performance tests are meaningless and/or misleading under WebAssembly interpreter.\n");
150         return 0;
151     }
152 
153     test_deinterleave();
154     test_interleave(false);
155     test_interleave(true);
156     printf("Success!\n");
157     return 0;
158 }
159