# implement elementwise addition with/without broadcast using BinaryOp operation

* input must be fp32 storage without packing
* output is expected to be fp32 storage without packing

```cpp
void binary_add(const ncnn::Mat& a, const ncnn::Mat& b, ncnn::Mat& c)
{
    ncnn::Option opt;
    opt.num_threads = 2;
    opt.use_fp16_storage = false;
    opt.use_packing_layout = false;

    ncnn::Layer* op = ncnn::create_layer("BinaryOp");

    // set param
    ncnn::ParamDict pd;
    pd.set(0, 0);// op_type

    op->load_param(pd);

    op->create_pipeline(opt);

    // forward
    std::vector<ncnn::Mat> bottoms(2);
    bottoms[0] = a;
    bottoms[1] = b;

    std::vector<ncnn::Mat> tops(1);
    op->forward(bottoms, tops, opt);

    c = tops[0];

    op->destroy_pipeline(opt);

    delete op;
}
```

# implement 3x3 box blur on three channel image using ConvolutionDepthWise operation

* input must be fp32 storage without packing
* output is expected to be fp32 storage without packing

```cpp
void convolution_3x3_boxblur_RGB(const ncnn::Mat& rgb, ncnn::Mat& out)
{
    ncnn::Option opt;
    opt.num_threads = 2;
    opt.use_fp16_storage = false;
    opt.use_packing_layout = false;

    ncnn::Layer* op = ncnn::create_layer("ConvolutionDepthWise");

    // set param
    ncnn::ParamDict pd;
    pd.set(0, 3);// num_output
    pd.set(1, 3);// kernel_w
    pd.set(5, 0);// bias_term
    pd.set(6, 3*3*3);// weight_data_size
    pd.set(7, 3);// group

    op->load_param(pd);

    // set weights
    ncnn::Mat weights[1];
    weights[0].create(3*3*3);// weight_data

    for (int i=0; i<3*3*3; i++)
    {
        weights[0][i] = 1.f / 9;
    }

    op->load_model(ncnn::ModelBinFromMatArray(weights));

    op->create_pipeline(opt);

    // forward
    op->forward(rgb, out, opt);

    op->destroy_pipeline(opt);

    delete op;
}
```
# transpose Mat, chw to cwh

* input must be fp32 storage with/without packing
* output is expected to be fp32 storage packed
```cpp
void transpose(const ncnn::Mat& in, ncnn::Mat& out)
{
    ncnn::Option opt;
    opt.num_threads = 2;
    opt.use_fp16_storage = false;
    opt.use_packing_layout = true;

    ncnn::Layer* op = ncnn::create_layer("Permute");

    // set param
    ncnn::ParamDict pd;
    pd.set(0, 1);// order_type

    op->load_param(pd);

    op->create_pipeline(opt);

    ncnn::Mat in_packed = in;
    {
        // resolve dst_elempack
        int dims = in.dims;
        int elemcount = 0;
        if (dims == 1) elemcount = in.elempack * in.w;
        if (dims == 2) elemcount = in.elempack * in.h;
        if (dims == 3) elemcount = in.elempack * in.c;

        int dst_elempack = 1;
        if (op->support_packing)// FIX: was `layer->support_packing`, but no `layer` variable exists — the layer pointer is named `op`
        {
            if (elemcount % 8 == 0 && ncnn::cpu_support_x86_avx2())
                dst_elempack = 8;
            else if (elemcount % 4 == 0)
                dst_elempack = 4;
        }

        if (in.elempack != dst_elempack)
        {
            convert_packing(in, in_packed, dst_elempack, opt);
        }
    }

    // forward
    op->forward(in_packed, out, opt);

    op->destroy_pipeline(opt);

    delete op;
}
```
# apply instance normalization
// x = (x - mean) / sqrt(var)

* input can be fp32/fp16 storage with/without packing
* output is expected to be fp16 storage packed when supported, or fp32 storage packed otherwise

```cpp
void normalize(const ncnn::Mat& in, ncnn::Mat& out)
{
    ncnn::Option opt;
    opt.num_threads = 2;
    opt.use_fp16_storage = true;
    opt.use_packing_layout = true;

    ncnn::Layer* op = ncnn::create_layer("InstanceNorm");

    // set param
    ncnn::ParamDict pd;
    pd.set(0, in.c);// channels
    pd.set(1, 0.f);// eps

    op->load_param(pd);

    // set weights
    ncnn::Mat weights[2];
    weights[0].create(in.c);// gamma_data
    weights[1].create(in.c);// beta_data

    weights[0].fill(1.f);
    weights[1].fill(0.f);

    op->load_model(ncnn::ModelBinFromMatArray(weights));

    op->create_pipeline(opt);

    ncnn::Mat in_fp16 = in;
    if (in.elembits() == 32 && op->support_fp16_storage)
    {
        cast_float32_to_float16(in, in_fp16, opt);
    }
    if (in.elembits() == 16 && !op->support_fp16_storage)
    {
        cast_float16_to_float32(in, in_fp16, opt);
    }

    ncnn::Mat in_fp16_packed = in_fp16;
    {
        // resolve dst_elempack
        int dims = in_fp16.dims;
        int elemcount = 0;
        if (dims == 1) elemcount = in_fp16.elempack * in_fp16.w;
        if (dims == 2) elemcount = in_fp16.elempack * in_fp16.h;
        if (dims == 3) elemcount = in_fp16.elempack * in_fp16.c;

        int dst_elempack = 1;
        if (op->support_packing)// FIX: was `layer->support_packing`, but no `layer` variable exists — the layer pointer is named `op`
        {
            if (elemcount % 8 == 0 && ncnn::cpu_support_x86_avx2())
                dst_elempack = 8;
            else if (elemcount % 4 == 0)
                dst_elempack = 4;
        }

        if (in_fp16.elempack != dst_elempack)
        {
            convert_packing(in_fp16, in_fp16_packed, dst_elempack, opt);
        }
    }

    // forward
    op->forward(in_fp16_packed, out, opt);

    op->destroy_pipeline(opt);

    delete op;
}
```

# cpu -> gpu -> forward -> gpu -> cpu

```cpp
ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device();

ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator();
ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator();

ncnn::VkWeightAllocator* weight_vkallocator = new ncnn::VkWeightAllocator(vkdev);
ncnn::VkWeightStagingAllocator* weight_staging_vkallocator = new ncnn::VkWeightStagingAllocator(vkdev);

// create layer
ncnn::Layer* convolution = ncnn::create_layer("Convolution");
convolution->vkdev = vkdev;

// set option
// NOTE: vulkan compute is enabled through Option (opt.use_vulkan_compute),
// not through ParamDict — ncnn::ParamDict has no use_vulkan_compute member.
ncnn::Option opt;
opt.num_threads = 4;
opt.use_vulkan_compute = true;
opt.blob_vkallocator = blob_vkallocator;
opt.workspace_vkallocator = blob_vkallocator;
opt.staging_vkallocator = staging_vkallocator;

// load param
{
    ncnn::ParamDict pd;
    pd.set(0, outch);// num_output
    pd.set(1, ksize);// kernel_w
    pd.set(5, 1);// bias_term — FIX: bias weights are supplied below, so bias_term must be enabled or weights[1] is never loaded
    pd.set(6, outch*inch*ksize*ksize);// weight_data_size

    convolution->load_param(pd);
}

// load model
{
    ncnn::Mat weights[2];
    weights[0] = random_mat(outch*inch*ksize*ksize);// weight_data
    weights[1] = random_mat(outch);// bias_data

    ncnn::ModelBinFromMatArray mb(weights);
    convolution->load_model(mb);
}

// create pipeline
convolution->create_pipeline(opt);

// upload model
{
    ncnn::VkTransfer cmd(vkdev);

    // weights go through the dedicated weight allocators
    ncnn::Option opt_upload = opt;
    opt_upload.blob_vkallocator = weight_vkallocator;
    opt_upload.workspace_vkallocator = weight_vkallocator;
    opt_upload.staging_vkallocator = weight_staging_vkallocator;

    convolution->upload_model(cmd, opt_upload);

    cmd.submit_and_wait();
}

ncnn::Mat bottom = random_mat(w, h, inch);

ncnn::Mat top;

// forward
{
    ncnn::VkCompute cmd(vkdev);

    ncnn::VkMat bottom_gpu;
    cmd.record_upload(bottom, bottom_gpu, opt);

    ncnn::VkMat top_gpu;
    convolution->forward(bottom_gpu, top_gpu, cmd, opt);

    cmd.record_download(top_gpu, top, opt);

    cmd.submit_and_wait();
}

convolution->destroy_pipeline(opt);

delete convolution;

vkdev->reclaim_blob_allocator(blob_vkallocator);
vkdev->reclaim_staging_allocator(staging_vkallocator);

weight_vkallocator->clear();
weight_staging_vkallocator->clear();
delete weight_vkallocator;
delete weight_staging_vkallocator;
```