# implement elementwise addition with/without broadcast using BinaryOp operation

* input must be fp32 storage without packing
* output is expected to be fp32 storage without packing

```cpp
void binary_add(const ncnn::Mat& a, const ncnn::Mat& b, ncnn::Mat& c)
{
    ncnn::Option opt;
    opt.num_threads = 2;
    opt.use_fp16_storage = false;
    opt.use_packing_layout = false;

    ncnn::Layer* op = ncnn::create_layer("BinaryOp");

    // set param
    ncnn::ParamDict pd;
    pd.set(0, 0);// op_type

    op->load_param(pd);

    op->create_pipeline(opt);

    // forward
    std::vector<ncnn::Mat> bottoms(2);
    bottoms[0] = a;
    bottoms[1] = b;

    std::vector<ncnn::Mat> tops(1);
    op->forward(bottoms, tops, opt);

    c = tops[0];

    op->destroy_pipeline(opt);

    delete op;
}
```
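
A minimal caller sketch for the helper above; the Mat shapes and fill values here are made up purely for illustration:

```cpp
ncnn::Mat a(4, 3, 2); // w=4 h=3 c=2, fp32 without packing
ncnn::Mat b(4, 3, 2);
a.fill(1.f);
b.fill(2.f);

ncnn::Mat c;
binary_add(a, b, c); // every element of c becomes 3.f

// broadcast case: the second blob holds one value per channel
ncnn::Mat s(1, 1, 2);
s.fill(0.5f);
ncnn::Mat d;
binary_add(a, s, d);
```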

# implement 3x3 box blur on three channel image using ConvolutionDepthWise operation

* input must be fp32 storage without packing
* output is expected to be fp32 storage without packing

```cpp
void convolution_3x3_boxblur_RGB(const ncnn::Mat& rgb, ncnn::Mat& out)
{
    ncnn::Option opt;
    opt.num_threads = 2;
    opt.use_fp16_storage = false;
    opt.use_packing_layout = false;

    ncnn::Layer* op = ncnn::create_layer("ConvolutionDepthWise");

    // set param
    ncnn::ParamDict pd;
    pd.set(0, 3);// num_output
    pd.set(1, 3);// kernel_w
    pd.set(5, 0);// bias_term
    pd.set(6, 3*3*3);// weight_data_size
    pd.set(7, 3);// group

    op->load_param(pd);

    // set weights
    ncnn::Mat weights[1];
    weights[0].create(3*3*3);// weight_data

    for (int i=0; i<3*3*3; i++)
    {
        weights[0][i] = 1.f / 9;
    }

    op->load_model(ncnn::ModelBinFromMatArray(weights));

    op->create_pipeline(opt);

    // forward
    op->forward(rgb, out, opt);

    op->destroy_pipeline(opt);

    delete op;
}
```
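
A possible way to drive the blur from an interleaved RGB buffer; `pixels`, `w` and `h` are assumptions provided by the caller and are not part of the snippet above:

```cpp
// pixels is assumed to point at w*h*3 bytes of interleaved RGB data
ncnn::Mat rgb = ncnn::Mat::from_pixels(pixels, ncnn::Mat::PIXEL_RGB, w, h);

ncnn::Mat blurred;
convolution_3x3_boxblur_RGB(rgb, blurred);

// with the default pad 0 / stride 1, each output channel is (w-2) x (h-2)
```
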
# transpose Mat, chw to cwh

* input must be fp32 storage with/without packing
* output is expected to be fp32 storage packed

```cpp
void transpose(const ncnn::Mat& in, ncnn::Mat& out)
{
    ncnn::Option opt;
    opt.num_threads = 2;
    opt.use_fp16_storage = false;
    opt.use_packing_layout = true;

    ncnn::Layer* op = ncnn::create_layer("Permute");

    // set param
    ncnn::ParamDict pd;
    pd.set(0, 1);// order_type

    op->load_param(pd);

    op->create_pipeline(opt);

    ncnn::Mat in_packed = in;
    {
        // resolve dst_elempack
        int dims = in.dims;
        int elemcount = 0;
        if (dims == 1) elemcount = in.elempack * in.w;
        if (dims == 2) elemcount = in.elempack * in.h;
        if (dims == 3) elemcount = in.elempack * in.c;

        int dst_elempack = 1;
        if (op->support_packing)
        {
            if (elemcount % 8 == 0 && ncnn::cpu_support_x86_avx2())
                dst_elempack = 8;
            else if (elemcount % 4 == 0)
                dst_elempack = 4;
        }

        if (in.elempack != dst_elempack)
        {
            convert_packing(in, in_packed, dst_elempack, opt);
        }
    }

    // forward
    op->forward(in_packed, out, opt);

    op->destroy_pipeline(opt);

    delete op;
}
```
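
A small caller sketch; the shape below is arbitrary and only meant to show that w and h are swapped per channel (with c=2 the input is not repacked, so the output comes back with elempack 1):

```cpp
ncnn::Mat x(5, 3, 2); // w=5 h=3 c=2
x.fill(1.f);

ncnn::Mat y;
transpose(x, y);
// y.w == 3, y.h == 5, y.c == 2
```
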
# apply instance normalization
// x = (x - mean) / sqrt(var)

* input can be fp32/fp16 storage with/without packing
* output is expected to be fp16 storage packed when supported, or fp32 storage packed otherwise

```cpp
void normalize(const ncnn::Mat& in, ncnn::Mat& out)
{
    ncnn::Option opt;
    opt.num_threads = 2;
    opt.use_fp16_storage = true;
    opt.use_packing_layout = true;

    ncnn::Layer* op = ncnn::create_layer("InstanceNorm");

    // set param
    ncnn::ParamDict pd;
    pd.set(0, in.c);// channels
    pd.set(1, 0.f);// eps

    op->load_param(pd);

    // set weights
    ncnn::Mat weights[2];
    weights[0].create(in.c);// gamma_data
    weights[1].create(in.c);// beta_data

    weights[0].fill(1.f);
    weights[1].fill(0.f);

    op->load_model(ncnn::ModelBinFromMatArray(weights));

    op->create_pipeline(opt);

    ncnn::Mat in_fp16 = in;
    if (in.elembits() == 32 && op->support_fp16_storage)
    {
        cast_float32_to_float16(in, in_fp16, opt);
    }
    if (in.elembits() == 16 && !op->support_fp16_storage)
    {
        cast_float16_to_float32(in, in_fp16, opt);
    }

    ncnn::Mat in_fp16_packed = in_fp16;
    {
        // resolve dst_elempack
        int dims = in_fp16.dims;
        int elemcount = 0;
        if (dims == 1) elemcount = in_fp16.elempack * in_fp16.w;
        if (dims == 2) elemcount = in_fp16.elempack * in_fp16.h;
        if (dims == 3) elemcount = in_fp16.elempack * in_fp16.c;

        int dst_elempack = 1;
        if (op->support_packing)
        {
            if (elemcount % 8 == 0 && ncnn::cpu_support_x86_avx2())
                dst_elempack = 8;
            else if (elemcount % 4 == 0)
                dst_elempack = 4;
        }

        if (in_fp16.elempack != dst_elempack)
        {
            convert_packing(in_fp16, in_fp16_packed, dst_elempack, opt);
        }
    }

    // forward
    op->forward(in_fp16_packed, out, opt);

    op->destroy_pipeline(opt);

    delete op;
}
```
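
Because the result may come back as packed fp16, a follow-up sketch (not part of the original helper) for turning it back into a flat fp32 Mat, e.g. for printing, could look like this:

```cpp
void to_flat_fp32(const ncnn::Mat& out, ncnn::Mat& out_flat)
{
    ncnn::Option opt;
    opt.num_threads = 2;

    // cast back to fp32 if the layer produced fp16 storage
    ncnn::Mat out_fp32 = out;
    if (out.elembits() == 16)
    {
        ncnn::cast_float16_to_float32(out, out_fp32, opt);
    }

    // undo the packing layout
    out_flat = out_fp32;
    if (out_fp32.elempack != 1)
    {
        ncnn::convert_packing(out_fp32, out_flat, 1, opt);
    }
}
```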

# cpu -> gpu -> forward -> gpu -> cpu

```cpp
// outch, inch, ksize, w, h and the random_mat() helper are assumed to be defined by the caller
ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device();

ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator();
ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator();

ncnn::VkWeightAllocator* weight_vkallocator = new ncnn::VkWeightAllocator(vkdev);
ncnn::VkWeightStagingAllocator* weight_staging_vkallocator = new ncnn::VkWeightStagingAllocator(vkdev);

// create layer
ncnn::Layer* convolution = ncnn::create_layer("Convolution");
convolution->vkdev = vkdev;

// set option
ncnn::Option opt;
opt.num_threads = 4;
opt.use_vulkan_compute = true;
opt.blob_vkallocator = blob_vkallocator;
opt.workspace_vkallocator = blob_vkallocator;
opt.staging_vkallocator = staging_vkallocator;

// load param
{
    ncnn::ParamDict pd;
    pd.set(0, outch);// num_output
    pd.set(1, ksize);// kernel_w
    pd.set(5, 1);// bias_term
    pd.set(6, outch*inch*ksize*ksize);// weight_data_size

    convolution->load_param(pd);
}

// load model
{
    ncnn::Mat weights[2];
    weights[0] = random_mat(outch*inch*ksize*ksize);
    weights[1] = random_mat(outch);

    ncnn::ModelBinFromMatArray mb(weights);
    convolution->load_model(mb);
}

// create pipeline
convolution->create_pipeline(opt);

// upload model
{
    ncnn::VkTransfer cmd(vkdev);

    ncnn::Option opt_upload = opt;
    opt_upload.blob_vkallocator = weight_vkallocator;
    opt_upload.workspace_vkallocator = weight_vkallocator;
    opt_upload.staging_vkallocator = weight_staging_vkallocator;

    convolution->upload_model(cmd, opt_upload);

    cmd.submit_and_wait();
}

ncnn::Mat bottom = random_mat(w, h, inch);

ncnn::Mat top;

// forward
{
    ncnn::VkCompute cmd(vkdev);

    ncnn::VkMat bottom_gpu;
    cmd.record_upload(bottom, bottom_gpu, opt);

    ncnn::VkMat top_gpu;
    convolution->forward(bottom_gpu, top_gpu, cmd, opt);

    cmd.record_download(top_gpu, top, opt);

    cmd.submit_and_wait();
}

convolution->destroy_pipeline(opt);

delete convolution;

vkdev->reclaim_blob_allocator(blob_vkallocator);
vkdev->reclaim_staging_allocator(staging_vkallocator);

weight_vkallocator->clear();
weight_staging_vkallocator->clear();
delete weight_vkallocator;
delete weight_staging_vkallocator;
```
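
The snippet assumes a usable Vulkan device is present; a defensive variant (a sketch, not part of the original flow) could check the device count first and stay on the CPU path otherwise:

```cpp
// hypothetical guard before running the gpu path above
if (ncnn::get_gpu_count() == 0)
{
    // no Vulkan device available, use the cpu forward path instead
    return;
}
```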