// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "convolutiondepthwise_vulkan.h"

#include "layer_shader_type.h"
#include "layer_type.h"

namespace ncnn {

ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan()
{
    support_vulkan = true;
    support_image_storage = true;

    padding = 0;

    pipeline_convolutiondepthwise = 0;
    pipeline_convolutiondepthwise_pack4 = 0;
    pipeline_convolutiondepthwise_pack8 = 0;

    pipeline_convolutiondepthwise_group = 0;
    pipeline_convolutiondepthwise_group_pack4 = 0;
    pipeline_convolutiondepthwise_group_pack1to4 = 0;
    pipeline_convolutiondepthwise_group_pack4to1 = 0;
    pipeline_convolutiondepthwise_group_pack8 = 0;
    pipeline_convolutiondepthwise_group_pack1to8 = 0;
    pipeline_convolutiondepthwise_group_pack4to8 = 0;
    pipeline_convolutiondepthwise_group_pack8to4 = 0;
    pipeline_convolutiondepthwise_group_pack8to1 = 0;
}

int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
{
    Option opt = _opt;
    const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
    const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];

    // the shape after padding
    Mat shape_bordered;
    if (shape.dims != 0)
    {
        if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
        {
            shape_bordered = Mat(shape.w + pad_left + pad_right, shape.h + pad_top + pad_bottom, shape.c, (void*)0);
        }
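        // pad == -233 requests tensorflow-style SAME padding with the extra
        // pixel on the bottom/right, -234 the variant with the extra pixel on
        // the top/left; the actual split is applied at forward time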
        else if ((pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233)
                 || (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234))
        {
            const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
            const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

            int wpad = kernel_extent_w + (shape.w - 1) / stride_w * stride_w - shape.w;
            int hpad = kernel_extent_h + (shape.h - 1) / stride_h * stride_h - shape.h;
            if (wpad > 0 || hpad > 0)
            {
                shape_bordered = Mat(shape.w + wpad, shape.h + hpad, shape.c, (void*)0);
            }
        }
        else
        {
            shape_bordered = shape;
        }
    }

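    // weight_data stores maxk * (channels / group) * (num_output / group) * group
    // floats, so the input channel count can be recovered from weight_data_size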
    const int maxk = kernel_w * kernel_h;
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

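    // pick the widest packing the channel counts allow: pack8 when the pack8
    // shaders are enabled, then pack4, then scalar pack1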
    int elempack = opt.use_shader_pack8 && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1;
    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;

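    // bytes per packed element: 2 bytes per lane with fp16 storage, fp16 only
    // for packed layouts with fp16_packed, otherwise 4-byte fp32 lanes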
    size_t elemsize;
    size_t out_elemsize;
    if (opt.use_fp16_storage)
    {
        elemsize = elempack * 2u;
        out_elemsize = out_elempack * 2u;
    }
    else if (opt.use_fp16_packed)
    {
        elemsize = elempack == 1 ? 4u : elempack * 2u;
        out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u;
    }
    else
    {
        elemsize = elempack * 4u;
        out_elemsize = out_elempack * 4u;
    }

    Mat shape_bordered_packed;
    if (shape_bordered.dims == 3) shape_bordered_packed = Mat(shape_bordered.w, shape_bordered.h, shape_bordered.c / elempack, (void*)0, elemsize, elempack);

    Mat out_shape_packed;
    if (out_shape.dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack);

    // group convolution
    const int channels_g = channels / group;
    const int num_output_g = num_output / group;

    int elempack_g = opt.use_shader_pack8 && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
    int out_elempack_g = opt.use_shader_pack8 && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;

    size_t elemsize_g;
    size_t out_elemsize_g;
    if (opt.use_fp16_storage)
    {
        elemsize_g = elempack_g * 2u;
        out_elemsize_g = out_elempack_g * 2u;
    }
    else if (opt.use_fp16_packed)
    {
        elemsize_g = elempack_g == 1 ? 4u : elempack_g * 2u;
        out_elemsize_g = out_elempack_g == 1 ? 4u : out_elempack_g * 2u;
    }
    else
    {
        elemsize_g = elempack_g * 4u;
        out_elemsize_g = out_elempack_g * 4u;
    }

    Mat shape_bordered_g_packed;
    if (shape_bordered.dims == 3) shape_bordered_g_packed = Mat(shape_bordered.w, shape_bordered.h, shape_bordered.c / elempack_g, (void*)0, elemsize_g, elempack_g);

    Mat out_shape_g_packed;
    if (out_shape.dims == 3) out_shape_g_packed = Mat(out_shape.w, out_shape.h, out_shape.c / out_elempack_g, (void*)0, out_elemsize_g, out_elempack_g);

    // check blob shape
    if (!vkdev->shape_support_image_storage(shape_bordered_packed) || !vkdev->shape_support_image_storage(out_shape_packed))
    {
        support_image_storage = false;
        opt.use_image_storage = false;
    }

    // check weight shape
    if (channels == group && group == num_output)
    {
        Mat weight_data_packed(maxk, group / elempack, (void*)0, (size_t)4 * elempack, elempack);
        if (!vkdev->shape_support_image_storage(weight_data_packed))
        {
            support_image_storage = false;
            opt.use_image_storage = false;
        }
    }
    else
    {
        // check blob shape
        if (!vkdev->shape_support_image_storage(shape_bordered_g_packed) || !vkdev->shape_support_image_storage(out_shape_g_packed))
        {
            support_image_storage = false;
            opt.use_image_storage = false;
        }

        Mat weight_data_packed_groups(maxk, channels_g / elempack_g, num_output_g / out_elempack_g * group, (size_t)4 * elempack_g * out_elempack_g, elempack_g * out_elempack_g);
        if (!vkdev->shape_support_image_storage(weight_data_packed_groups))
        {
            support_image_storage = false;
            opt.use_image_storage = false;
        }
    }

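    // instantiate the inner Padding layer that borders the input before convolution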
    {
        padding = ncnn::create_layer(ncnn::LayerType::Padding);
        padding->vkdev = vkdev;

        padding->bottom_shapes.resize(1);
        padding->bottom_shapes[0] = shape;
        padding->top_shapes.resize(1);
        padding->top_shapes[0] = shape_bordered;

        ncnn::ParamDict pd;
        pd.set(0, pad_top);
        pd.set(1, pad_bottom);
        pd.set(2, pad_left);
        pd.set(3, pad_right);
        pd.set(4, 0);
        pd.set(5, pad_value);

        padding->load_param(pd);

        padding->create_pipeline(opt);
    }

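    // specialization constants: 11 convolution parameters followed by 10 shape
    // hints (dims/w/h/c/cstep of the bordered input and of the output)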
    std::vector<vk_specialization_type> specializations(11 + 10);
    specializations[0].i = kernel_w;
    specializations[1].i = kernel_h;
    specializations[2].i = dilation_w;
    specializations[3].i = dilation_h;
    specializations[4].i = stride_w;
    specializations[5].i = stride_h;
    specializations[6].i = bias_term;
    specializations[7].i = group;
    specializations[8].i = activation_type;
    specializations[9].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
    specializations[10].f = activation_params.w == 2 ? activation_params[1] : 0.f;

    // depth-wise
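    // (channels == group == num_output means one filter per input channel)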
    if (channels == group && group == num_output)
    {
        specializations[11 + 0].i = shape_bordered_packed.dims;
        specializations[11 + 1].i = shape_bordered_packed.w;
        specializations[11 + 2].i = shape_bordered_packed.h;
        specializations[11 + 3].i = shape_bordered_packed.c;
        specializations[11 + 4].i = shape_bordered_packed.cstep;
        specializations[11 + 5].i = out_shape_packed.dims;
        specializations[11 + 6].i = out_shape_packed.w;
        specializations[11 + 7].i = out_shape_packed.h;
        specializations[11 + 8].i = out_shape_packed.c;
        specializations[11 + 9].i = out_shape_packed.cstep;

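        // default to an 8x8x4 workgroup, clamped to the output extents when
        // the output shape is known at pipeline creation time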
        Mat local_size_xyz(8, 8, std::min(4, num_output / out_elempack), (void*)0);
        if (out_shape_packed.dims != 0)
        {
            local_size_xyz.w = std::min(8, out_shape_packed.w);
            local_size_xyz.h = std::min(8, out_shape_packed.h);
            local_size_xyz.c = std::min(4, out_shape_packed.c);
        }

        // pack1
        if (elempack == 1)
        {
            pipeline_convolutiondepthwise = new Pipeline(vkdev);
            pipeline_convolutiondepthwise->set_optimal_local_size_xyz(local_size_xyz);
            pipeline_convolutiondepthwise->create(LayerShaderType::convolutiondepthwise, opt, specializations);
        }

        // pack4
        if (elempack == 4)
        {
            pipeline_convolutiondepthwise_pack4 = new Pipeline(vkdev);
            pipeline_convolutiondepthwise_pack4->set_optimal_local_size_xyz(local_size_xyz);
            pipeline_convolutiondepthwise_pack4->create(LayerShaderType::convolutiondepthwise_pack4, opt, specializations);
        }

        // pack8
        if (elempack == 8)
        {
            pipeline_convolutiondepthwise_pack8 = new Pipeline(vkdev);
            pipeline_convolutiondepthwise_pack8->set_optimal_local_size_xyz(local_size_xyz);
            pipeline_convolutiondepthwise_pack8->create(LayerShaderType::convolutiondepthwise_pack8, opt, specializations);
        }

        return 0;
    }

    specializations[11 + 0].i = shape_bordered_g_packed.dims;
    specializations[11 + 1].i = shape_bordered_g_packed.w;
    specializations[11 + 2].i = shape_bordered_g_packed.h;
    specializations[11 + 3].i = shape_bordered_g_packed.c;
    specializations[11 + 4].i = shape_bordered_g_packed.cstep;
    specializations[11 + 5].i = out_shape_g_packed.dims;
    specializations[11 + 6].i = out_shape_g_packed.w;
    specializations[11 + 7].i = out_shape_g_packed.h;
    specializations[11 + 8].i = out_shape_g_packed.c;
    specializations[11 + 9].i = out_shape_g_packed.cstep;

    Mat local_size_xyz(8, 8, std::min(4, num_output / out_elempack_g), (void*)0);
    if (out_shape_g_packed.dims != 0)
    {
        local_size_xyz.w = std::min(8, out_shape_g_packed.w);
        local_size_xyz.h = std::min(8, out_shape_g_packed.h);
        local_size_xyz.c = std::min(4, out_shape_g_packed.c);
    }

    // pack1
    if (elempack_g == 1 && out_elempack_g == 1)
    {
        pipeline_convolutiondepthwise_group = new Pipeline(vkdev);
        pipeline_convolutiondepthwise_group->set_optimal_local_size_xyz(local_size_xyz);
        pipeline_convolutiondepthwise_group->create(LayerShaderType::convolutiondepthwise_group, opt, specializations);
    }

    // pack4
    if (elempack_g == 4 && out_elempack_g == 4)
    {
        pipeline_convolutiondepthwise_group_pack4 = new Pipeline(vkdev);
        pipeline_convolutiondepthwise_group_pack4->set_optimal_local_size_xyz(local_size_xyz);
        pipeline_convolutiondepthwise_group_pack4->create(LayerShaderType::convolutiondepthwise_group_pack4, opt, specializations);
    }

    // pack1to4
    if (elempack_g == 1 && out_elempack_g == 4)
    {
        pipeline_convolutiondepthwise_group_pack1to4 = new Pipeline(vkdev);
        pipeline_convolutiondepthwise_group_pack1to4->set_optimal_local_size_xyz(local_size_xyz);
        pipeline_convolutiondepthwise_group_pack1to4->create(LayerShaderType::convolutiondepthwise_group_pack1to4, opt, specializations);
    }

    // pack4to1
    if (elempack_g == 4 && out_elempack_g == 1)
    {
        pipeline_convolutiondepthwise_group_pack4to1 = new Pipeline(vkdev);
        pipeline_convolutiondepthwise_group_pack4to1->set_optimal_local_size_xyz(local_size_xyz);
        pipeline_convolutiondepthwise_group_pack4to1->create(LayerShaderType::convolutiondepthwise_group_pack4to1, opt, specializations);
    }

    // pack8
    if (elempack_g == 8 && out_elempack_g == 8)
    {
        pipeline_convolutiondepthwise_group_pack8 = new Pipeline(vkdev);
        pipeline_convolutiondepthwise_group_pack8->set_optimal_local_size_xyz(local_size_xyz);
        pipeline_convolutiondepthwise_group_pack8->create(LayerShaderType::convolutiondepthwise_group_pack8, opt, specializations);
    }

    // pack1to8
    if (elempack_g == 1 && out_elempack_g == 8)
    {
        pipeline_convolutiondepthwise_group_pack1to8 = new Pipeline(vkdev);
        pipeline_convolutiondepthwise_group_pack1to8->set_optimal_local_size_xyz(local_size_xyz);
        pipeline_convolutiondepthwise_group_pack1to8->create(LayerShaderType::convolutiondepthwise_group_pack1to8, opt, specializations);
    }

    // pack4to8
    if (elempack_g == 4 && out_elempack_g == 8)
    {
        pipeline_convolutiondepthwise_group_pack4to8 = new Pipeline(vkdev);
        pipeline_convolutiondepthwise_group_pack4to8->set_optimal_local_size_xyz(local_size_xyz);
        pipeline_convolutiondepthwise_group_pack4to8->create(LayerShaderType::convolutiondepthwise_group_pack4to8, opt, specializations);
    }

    // pack8to4
    if (elempack_g == 8 && out_elempack_g == 4)
    {
        pipeline_convolutiondepthwise_group_pack8to4 = new Pipeline(vkdev);
        pipeline_convolutiondepthwise_group_pack8to4->set_optimal_local_size_xyz(local_size_xyz);
        pipeline_convolutiondepthwise_group_pack8to4->create(LayerShaderType::convolutiondepthwise_group_pack8to4, opt, specializations);
    }

    // pack8to1
    if (elempack_g == 8 && out_elempack_g == 1)
    {
        pipeline_convolutiondepthwise_group_pack8to1 = new Pipeline(vkdev);
        pipeline_convolutiondepthwise_group_pack8to1->set_optimal_local_size_xyz(local_size_xyz);
        pipeline_convolutiondepthwise_group_pack8to1->create(LayerShaderType::convolutiondepthwise_group_pack8to1, opt, specializations);
    }

    return 0;
}

int ConvolutionDepthWise_vulkan::destroy_pipeline(const Option& opt)
{
    if (padding)
    {
        padding->destroy_pipeline(opt);
        delete padding;
        padding = 0;
    }

    delete pipeline_convolutiondepthwise;
    pipeline_convolutiondepthwise = 0;

    delete pipeline_convolutiondepthwise_pack4;
    pipeline_convolutiondepthwise_pack4 = 0;

    delete pipeline_convolutiondepthwise_pack8;
    pipeline_convolutiondepthwise_pack8 = 0;

    delete pipeline_convolutiondepthwise_group;
    pipeline_convolutiondepthwise_group = 0;

    delete pipeline_convolutiondepthwise_group_pack4;
    pipeline_convolutiondepthwise_group_pack4 = 0;

    delete pipeline_convolutiondepthwise_group_pack1to4;
    pipeline_convolutiondepthwise_group_pack1to4 = 0;

    delete pipeline_convolutiondepthwise_group_pack4to1;
    pipeline_convolutiondepthwise_group_pack4to1 = 0;

    delete pipeline_convolutiondepthwise_group_pack8;
    pipeline_convolutiondepthwise_group_pack8 = 0;

    delete pipeline_convolutiondepthwise_group_pack1to8;
    pipeline_convolutiondepthwise_group_pack1to8 = 0;

    delete pipeline_convolutiondepthwise_group_pack4to8;
    pipeline_convolutiondepthwise_group_pack4to8 = 0;

    delete pipeline_convolutiondepthwise_group_pack8to4;
    pipeline_convolutiondepthwise_group_pack8to4 = 0;

    delete pipeline_convolutiondepthwise_group_pack8to1;
    pipeline_convolutiondepthwise_group_pack8to1 = 0;

    return 0;
}

int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
{
    if (padding)
    {
        padding->upload_model(cmd, opt);
    }

    const int maxk = kernel_w * kernel_h;
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

    int elempack = opt.use_shader_pack8 && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1;
    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;

    // depth-wise
    if (channels == group && group == num_output)
    {
        Mat weight_data_packed;
        Mat weight_data_r2 = weight_data.reshape(maxk, group);
        convert_packing(weight_data_r2, weight_data_packed, elempack, opt);

        if (support_image_storage && opt.use_image_storage)
        {
            cmd.record_upload(weight_data_packed, weight_data_gpu_image, opt);
        }
        else
        {
            cmd.record_upload(weight_data_packed, weight_data_gpu, opt);
        }

        if (bias_term)
        {
            Mat bias_data_packed;
            convert_packing(bias_data, bias_data_packed, out_elempack, opt);

            if (support_image_storage && opt.use_image_storage)
            {
                cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt);
            }
            else
            {
                cmd.record_upload(bias_data_packed, bias_data_gpu, opt);
            }
        }

        return 0;
    }

    // group convolution
    const int channels_g = channels / group;
    const int num_output_g = num_output / group;

    int elempack_g = opt.use_shader_pack8 && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
    int out_elempack_g = opt.use_shader_pack8 && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;

    // src = kw-kh-inch-outch
    // dst = pa-pb-kw-kh-inch/pa-outch/pb
    Mat weight_data_packed_groups;
    {
        Mat weight_data_r2_groups = weight_data.reshape(maxk, channels_g, num_output_g * group);

        weight_data_packed_groups.create(maxk, channels_g / elempack_g, num_output_g / out_elempack_g * group, (size_t)4 * elempack_g * out_elempack_g, elempack_g * out_elempack_g);

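        // interleave so that the elempack_g * out_elempack_g coefficients of
        // one kernel tap end up contiguous, matching the packed shader layout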
        for (int g = 0; g < group; g++)
        {
            const Mat weight_data_r2 = weight_data_r2_groups.channel_range(num_output_g * g, num_output_g);

            Mat weight_data_packed = weight_data_packed_groups.channel_range(num_output_g / out_elempack_g * g, num_output_g / out_elempack_g);

            for (int q = 0; q + (out_elempack_g - 1) < num_output_g; q += out_elempack_g)
            {
                Mat g0 = weight_data_packed.channel(q / out_elempack_g);

                for (int p = 0; p + (elempack_g - 1) < channels_g; p += elempack_g)
                {
                    float* g00 = g0.row(p / elempack_g);

                    for (int k = 0; k < maxk; k++)
                    {
                        for (int i = 0; i < out_elempack_g; i++)
                        {
                            const Mat k0 = weight_data_r2.channel(q + i);

                            for (int j = 0; j < elempack_g; j++)
                            {
                                const float* k00 = k0.row(p + j);

                                g00[0] = k00[k];

                                g00++;
                            }
                        }
                    }
                }
            }
        }
    }

    if (support_image_storage && opt.use_image_storage)
    {
        cmd.record_upload(weight_data_packed_groups, weight_data_gpu_image, opt);
    }
    else
    {
        cmd.record_upload(weight_data_packed_groups, weight_data_gpu, opt);
    }

    if (bias_term)
    {
        Mat bias_data_packed;
        convert_packing(bias_data, bias_data_packed, out_elempack_g, opt);

        if (support_image_storage && opt.use_image_storage)
        {
            cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt);
        }
        else
        {
            cmd.record_upload(bias_data_packed, bias_data_gpu, opt);
        }
    }

    return 0;
}

int ConvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    VkMat bottom_blob_bordered = bottom_blob;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
    {
        Option opt_pad = opt;
        opt_pad.blob_vkallocator = opt.workspace_vkallocator;

        padding->forward(bottom_blob, bottom_blob_bordered, cmd, opt_pad);
    }
    else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233)
    {
        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
        if (wpad > 0 || hpad > 0)
        {
            Option opt_pad = opt;
            opt_pad.blob_vkallocator = opt.workspace_vkallocator;

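            // six runtime padding parameters for the inner Padding layer:
            // top, bottom, left, right, and two trailing fields left at 0 here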
            VkMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator);
            int* padding_params = padding_param_blob.mapped();

            padding_params[0] = hpad / 2;
            padding_params[1] = hpad - hpad / 2;
            padding_params[2] = wpad / 2;
            padding_params[3] = wpad - wpad / 2;
            padding_params[4] = 0;
            padding_params[5] = 0;

            std::vector<VkMat> padding_inputs(2);
            padding_inputs[0] = bottom_blob;
            padding_inputs[1] = padding_param_blob;

            std::vector<VkMat> padding_outputs(1);
            padding->forward(padding_inputs, padding_outputs, cmd, opt_pad);
            bottom_blob_bordered = padding_outputs[0];
        }
    }
    else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234)
    {
        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
        if (wpad > 0 || hpad > 0)
        {
            Option opt_pad = opt;
            opt_pad.blob_vkallocator = opt.workspace_vkallocator;

            VkMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator);
            int* padding_params = padding_param_blob.mapped();

            padding_params[0] = hpad - hpad / 2;
            padding_params[1] = hpad / 2;
            padding_params[2] = wpad - wpad / 2;
            padding_params[3] = wpad / 2;
            padding_params[4] = 0;
            padding_params[5] = 0;

            std::vector<VkMat> padding_inputs(2);
            padding_inputs[0] = bottom_blob;
            padding_inputs[1] = padding_param_blob;

            std::vector<VkMat> padding_outputs(1);
            padding->forward(padding_inputs, padding_outputs, cmd, opt_pad);
            bottom_blob_bordered = padding_outputs[0];
        }
    }

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

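    // standard convolution output size: out = (in - dilated kernel extent) / stride + 1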
    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;
    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

    if (opt.use_fp16_packed && !opt.use_fp16_storage)
    {
        if (out_elempack == 8) out_elemsize = 8 * 2u;
        if (out_elempack == 4) out_elemsize = 4 * 2u;
        if (out_elempack == 1) out_elemsize = 4u;
    }

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
    if (top_blob.empty())
        return -100;

    // depth-wise
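    // bottom_blob.c counts packed channels, hence the division by elempack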
    if (channels == group / elempack && group / elempack == num_output / elempack)
    {
        std::vector<VkMat> bindings(4);
        bindings[0] = bottom_blob_bordered;
        bindings[1] = top_blob;
        bindings[2] = weight_data_gpu;
        bindings[3] = bias_data_gpu;

        std::vector<vk_constant_type> constants(10);
        constants[0].i = bottom_blob_bordered.dims;
        constants[1].i = bottom_blob_bordered.w;
        constants[2].i = bottom_blob_bordered.h;
        constants[3].i = bottom_blob_bordered.c;
        constants[4].i = bottom_blob_bordered.cstep;
        constants[5].i = top_blob.dims;
        constants[6].i = top_blob.w;
        constants[7].i = top_blob.h;
        constants[8].i = top_blob.c;
        constants[9].i = top_blob.cstep;

        const Pipeline* pipeline = elempack == 8 ? pipeline_convolutiondepthwise_pack8
                                   : elempack == 4 ? pipeline_convolutiondepthwise_pack4
                                   : pipeline_convolutiondepthwise;

        cmd.record_pipeline(pipeline, bindings, constants, top_blob);

        return 0;
    }

    const int channels_g = channels * elempack / group;
    const int num_output_g = num_output / group;

    int elempack_g = opt.use_shader_pack8 && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
    int out_elempack_g = opt.use_shader_pack8 && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
    size_t out_elemsize_g = elemsize / elempack * out_elempack_g;

    if (opt.use_fp16_packed && !opt.use_fp16_storage)
    {
        if (out_elempack_g == 8) out_elemsize_g = 8 * 2u;
        if (out_elempack_g == 4) out_elemsize_g = 4 * 2u;
        if (out_elempack_g == 1) out_elemsize_g = 4u;
    }

    // unpacking
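    // the group shaders address channels per group, so convert the input down
    // to the per-group packing before dispatch and repack the output afterwards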
    VkMat bottom_blob_bordered_unpacked = bottom_blob_bordered;
    if (elempack > elempack_g)
    {
        Option opt_pack1 = opt;
        opt_pack1.blob_vkallocator = opt.workspace_vkallocator;

        vkdev->convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, elempack_g, cmd, opt_pack1);
    }

    VkMat top_blob_unpacked = top_blob;
    if (out_elempack_g < out_elempack)
    {
        top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator);
        if (top_blob_unpacked.empty())
            return -100;
    }

    std::vector<VkMat> bindings(4);
    bindings[0] = bottom_blob_bordered_unpacked;
    bindings[1] = top_blob_unpacked;
    bindings[2] = weight_data_gpu;
    bindings[3] = bias_data_gpu;

    std::vector<vk_constant_type> constants(10);
    constants[0].i = bottom_blob_bordered_unpacked.dims;
    constants[1].i = bottom_blob_bordered_unpacked.w;
    constants[2].i = bottom_blob_bordered_unpacked.h;
    constants[3].i = bottom_blob_bordered_unpacked.c;
    constants[4].i = bottom_blob_bordered_unpacked.cstep;
    constants[5].i = top_blob_unpacked.dims;
    constants[6].i = top_blob_unpacked.w;
    constants[7].i = top_blob_unpacked.h;
    constants[8].i = top_blob_unpacked.c;
    constants[9].i = top_blob_unpacked.cstep;

    const Pipeline* pipeline = 0;
    if (elempack_g == 1 && out_elempack_g == 1)
    {
        pipeline = pipeline_convolutiondepthwise_group;
    }
    else if (elempack_g == 4 && out_elempack_g == 4)
    {
        pipeline = pipeline_convolutiondepthwise_group_pack4;
    }
    else if (elempack_g == 1 && out_elempack_g == 4)
    {
        pipeline = pipeline_convolutiondepthwise_group_pack1to4;
    }
    else if (elempack_g == 4 && out_elempack_g == 1)
    {
        pipeline = pipeline_convolutiondepthwise_group_pack4to1;
    }
    else if (elempack_g == 8 && out_elempack_g == 8)
    {
        pipeline = pipeline_convolutiondepthwise_group_pack8;
    }
    else if (elempack_g == 1 && out_elempack_g == 8)
    {
        pipeline = pipeline_convolutiondepthwise_group_pack1to8;
    }
    else if (elempack_g == 4 && out_elempack_g == 8)
    {
        pipeline = pipeline_convolutiondepthwise_group_pack4to8;
    }
    else if (elempack_g == 8 && out_elempack_g == 4)
    {
        pipeline = pipeline_convolutiondepthwise_group_pack8to4;
    }
    else if (elempack_g == 8 && out_elempack_g == 1)
    {
        pipeline = pipeline_convolutiondepthwise_group_pack8to1;
    }

    cmd.record_pipeline(pipeline, bindings, constants, top_blob_unpacked);

    // packing
    if (out_elempack_g < out_elempack)
    {
        vkdev->convert_packing(top_blob_unpacked, top_blob, out_elempack, cmd, opt);
    }
    else
    {
        top_blob = top_blob_unpacked;
    }

    return 0;
}

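// image storage variant: mirrors the buffer path above, but binds the image
// copies of the weights/bias and passes 0 for the unused cstep constants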
int ConvolutionDepthWise_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    VkImageMat bottom_blob_bordered = bottom_blob;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
    {
        Option opt_pad = opt;
        opt_pad.blob_vkallocator = opt.workspace_vkallocator;

        padding->forward(bottom_blob, bottom_blob_bordered, cmd, opt_pad);
    }
    else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233)
    {
        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
        if (wpad > 0 || hpad > 0)
        {
            Option opt_pad = opt;
            opt_pad.blob_vkallocator = opt.workspace_vkallocator;

            VkImageMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator);
            int* padding_params = padding_param_blob.mapped();

            padding_params[0] = hpad / 2;
            padding_params[1] = hpad - hpad / 2;
            padding_params[2] = wpad / 2;
            padding_params[3] = wpad - wpad / 2;
            padding_params[4] = 0;
            padding_params[5] = 0;

            std::vector<VkImageMat> padding_inputs(2);
            padding_inputs[0] = bottom_blob;
            padding_inputs[1] = padding_param_blob;

            std::vector<VkImageMat> padding_outputs(1);
            padding->forward(padding_inputs, padding_outputs, cmd, opt_pad);
            bottom_blob_bordered = padding_outputs[0];
        }
    }
    else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234)
    {
        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
        if (wpad > 0 || hpad > 0)
        {
            Option opt_pad = opt;
            opt_pad.blob_vkallocator = opt.workspace_vkallocator;

            VkImageMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator);
            int* padding_params = padding_param_blob.mapped();

            padding_params[0] = hpad - hpad / 2;
            padding_params[1] = hpad / 2;
            padding_params[2] = wpad - wpad / 2;
            padding_params[3] = wpad / 2;
            padding_params[4] = 0;
            padding_params[5] = 0;

            std::vector<VkImageMat> padding_inputs(2);
            padding_inputs[0] = bottom_blob;
            padding_inputs[1] = padding_param_blob;

            std::vector<VkImageMat> padding_outputs(1);
            padding->forward(padding_inputs, padding_outputs, cmd, opt_pad);
            bottom_blob_bordered = padding_outputs[0];
        }
    }

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;
    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

    if (opt.use_fp16_packed && !opt.use_fp16_storage)
    {
        if (out_elempack == 8) out_elemsize = 8 * 2u;
        if (out_elempack == 4) out_elemsize = 4 * 2u;
        if (out_elempack == 1) out_elemsize = 4u;
    }

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
    if (top_blob.empty())
        return -100;

    // depth-wise
    if (channels == group / elempack && group / elempack == num_output / elempack)
    {
        std::vector<VkImageMat> bindings(4);
        bindings[0] = bottom_blob_bordered;
        bindings[1] = top_blob;
        bindings[2] = weight_data_gpu_image;
        bindings[3] = bias_data_gpu_image;

        std::vector<vk_constant_type> constants(10);
        constants[0].i = bottom_blob_bordered.dims;
        constants[1].i = bottom_blob_bordered.w;
        constants[2].i = bottom_blob_bordered.h;
        constants[3].i = bottom_blob_bordered.c;
        constants[4].i = 0; //bottom_blob_bordered.cstep;
        constants[5].i = top_blob.dims;
        constants[6].i = top_blob.w;
        constants[7].i = top_blob.h;
        constants[8].i = top_blob.c;
        constants[9].i = 0; //top_blob.cstep;

        const Pipeline* pipeline = elempack == 8 ? pipeline_convolutiondepthwise_pack8
                                   : elempack == 4 ? pipeline_convolutiondepthwise_pack4
                                   : pipeline_convolutiondepthwise;

        cmd.record_pipeline(pipeline, bindings, constants, top_blob);

        return 0;
    }

    const int channels_g = channels * elempack / group;
    const int num_output_g = num_output / group;

    int elempack_g = opt.use_shader_pack8 && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
    int out_elempack_g = opt.use_shader_pack8 && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
    size_t out_elemsize_g = elemsize / elempack * out_elempack_g;

    if (opt.use_fp16_packed && !opt.use_fp16_storage)
    {
        if (out_elempack_g == 8) out_elemsize_g = 8 * 2u;
        if (out_elempack_g == 4) out_elemsize_g = 4 * 2u;
        if (out_elempack_g == 1) out_elemsize_g = 4u;
    }

    // unpacking
    VkImageMat bottom_blob_bordered_unpacked = bottom_blob_bordered;
    if (elempack > elempack_g)
    {
        Option opt_pack1 = opt;
        opt_pack1.blob_vkallocator = opt.workspace_vkallocator;

        vkdev->convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, elempack_g, cmd, opt_pack1);
    }

    VkImageMat top_blob_unpacked = top_blob;
    if (out_elempack_g < out_elempack)
    {
        top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator);
        if (top_blob_unpacked.empty())
            return -100;
    }

    std::vector<VkImageMat> bindings(4);
    bindings[0] = bottom_blob_bordered_unpacked;
    bindings[1] = top_blob_unpacked;
    bindings[2] = weight_data_gpu_image;
    bindings[3] = bias_data_gpu_image;

    std::vector<vk_constant_type> constants(10);
    constants[0].i = bottom_blob_bordered_unpacked.dims;
    constants[1].i = bottom_blob_bordered_unpacked.w;
    constants[2].i = bottom_blob_bordered_unpacked.h;
    constants[3].i = bottom_blob_bordered_unpacked.c;
    constants[4].i = 0; //bottom_blob_bordered_unpacked.cstep;
    constants[5].i = top_blob_unpacked.dims;
    constants[6].i = top_blob_unpacked.w;
    constants[7].i = top_blob_unpacked.h;
    constants[8].i = top_blob_unpacked.c;
    constants[9].i = 0; //top_blob_unpacked.cstep;

    const Pipeline* pipeline = 0;
    if (elempack_g == 1 && out_elempack_g == 1)
    {
        pipeline = pipeline_convolutiondepthwise_group;
    }
    else if (elempack_g == 4 && out_elempack_g == 4)
    {
        pipeline = pipeline_convolutiondepthwise_group_pack4;
    }
    else if (elempack_g == 1 && out_elempack_g == 4)
    {
        pipeline = pipeline_convolutiondepthwise_group_pack1to4;
    }
    else if (elempack_g == 4 && out_elempack_g == 1)
    {
        pipeline = pipeline_convolutiondepthwise_group_pack4to1;
    }
    else if (elempack_g == 8 && out_elempack_g == 8)
    {
        pipeline = pipeline_convolutiondepthwise_group_pack8;
    }
    else if (elempack_g == 1 && out_elempack_g == 8)
    {
        pipeline = pipeline_convolutiondepthwise_group_pack1to8;
    }
    else if (elempack_g == 4 && out_elempack_g == 8)
    {
        pipeline = pipeline_convolutiondepthwise_group_pack4to8;
    }
    else if (elempack_g == 8 && out_elempack_g == 4)
    {
        pipeline = pipeline_convolutiondepthwise_group_pack8to4;
    }
    else if (elempack_g == 8 && out_elempack_g == 1)
    {
        pipeline = pipeline_convolutiondepthwise_group_pack8to1;
    }

    cmd.record_pipeline(pipeline, bindings, constants, top_blob_unpacked);

    // packing
    if (out_elempack_g < out_elempack)
    {
        vkdev->convert_packing(top_blob_unpacked, top_blob, out_elempack, cmd, opt);
    }
    else
    {
        top_blob = top_blob_unpacked;
    }

    return 0;
}

} // namespace ncnn