1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 #include "innerproduct_vulkan.h"
16 
17 #include "layer_shader_type.h"
18 #include "layer_type.h"
19 
20 namespace ncnn {
21 
InnerProduct_vulkan()22 InnerProduct_vulkan::InnerProduct_vulkan()
23 {
24     support_vulkan = true;
25     support_image_storage = true;
26 
27     flatten = 0;
28 
29     pipeline_innerproduct = 0;
30     pipeline_innerproduct_pack4 = 0;
31     pipeline_innerproduct_pack1to4 = 0;
32     pipeline_innerproduct_pack4to1 = 0;
33     pipeline_innerproduct_pack8 = 0;
34     pipeline_innerproduct_pack1to8 = 0;
35     pipeline_innerproduct_pack4to8 = 0;
36     pipeline_innerproduct_pack8to4 = 0;
37     pipeline_innerproduct_pack8to1 = 0;
38 
39     pipeline_innerproduct_gemm = 0;
40 }
41 
// Build the vulkan compute pipelines for this InnerProduct layer.
//
// Two dispatch strategies are prepared depending on the bottom shape hint:
//   * gemm path: shape hint is 2D with w == num_input and h > 1 -> a single
//     pipeline_innerproduct_gemm is created and the function returns early.
//   * vector path: the bottom blob will be flattened to 1D -> a Flatten helper
//     layer plus one packNtoM innerproduct pipeline are created.  When there
//     is no shape hint at all (shape.dims == 0) a gemm pipeline is also
//     created as a fallback, since the runtime shape is unknown.
//
// The shader variant (pack1/pack4/pack8 and the wpN gemm variants) is chosen
// from in_elempack / out_elempack, i.e. from the weight packing layout that
// upload_model() will produce.
int InnerProduct_vulkan::create_pipeline(const Option& _opt)
{
    // work on a copy: image storage may be disabled below if any blob or
    // weight shape exceeds the device's image size limits
    Option opt = _opt;
    const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
    const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];

    const int num_input = weight_data_size / num_output;

    // channel packing granularity, driven by divisibility of the counts
    int in_elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1;
    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;

    if (shape.dims == 2 && shape.w == num_input && shape.h > 1)
    {
        // gemm
        // multiple rows of num_input each -> matrix x weight product
        // row packing for the hinted blobs (only used for the image checks)
        int elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1;

        size_t elemsize;
        if (opt.use_fp16_storage)
        {
            elemsize = elempack * 2u;
        }
        else if (opt.use_fp16_packed)
        {
            // pack1 stays fp32; packed layouts store fp16
            elemsize = elempack == 1 ? 4u : elempack * 2u;
        }
        else
        {
            elemsize = elempack * 4u;
        }

        Mat shape_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack);
        Mat out_shape_packed = Mat(out_shape.w, out_shape.h / elempack, (void*)0, elemsize, elempack);

        // check blob shape (unpacked hints)
        if (!vkdev->shape_support_image_storage(shape) || !vkdev->shape_support_image_storage(out_shape))
        {
            support_image_storage = false;
            opt.use_image_storage = false;
        }

        // check blob shape (packed hints)
        if (!vkdev->shape_support_image_storage(shape_packed) || !vkdev->shape_support_image_storage(out_shape_packed))
        {
            support_image_storage = false;
            opt.use_image_storage = false;
        }

        // compile-time constants baked into the shader: activation config
        // plus the (unpacked) shapes used at dispatch time by forward()
        std::vector<vk_specialization_type> specializations(4 + 10);
        specializations[0].i = bias_term;
        specializations[1].i = activation_type;
        specializations[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
        specializations[3].f = activation_params.w == 2 ? activation_params[1] : 0.f;
        specializations[4 + 0].i = shape.dims;
        specializations[4 + 1].i = shape.w;
        specializations[4 + 2].i = shape.h;
        specializations[4 + 3].i = shape.c;
        specializations[4 + 4].i = shape.cstep;
        specializations[4 + 5].i = out_shape.dims;
        specializations[4 + 6].i = out_shape.w;
        specializations[4 + 7].i = out_shape.h;
        specializations[4 + 8].i = out_shape.c;
        specializations[4 + 9].i = out_shape.cstep;

        // workgroup size heuristic: up to 16 output groups wide, 4 rows tall
        Mat local_size_xyz(std::min(16, num_output / out_elempack), 4, 1, (void*)0);
        if (out_shape.dims != 0)
        {
            local_size_xyz.w = std::min(16, out_shape.w / out_elempack);
            local_size_xyz.h = std::min(4, out_shape.h);
            local_size_xyz.c = 1;
        }

        {
            pipeline_innerproduct_gemm = new Pipeline(vkdev);
            pipeline_innerproduct_gemm->set_optimal_local_size_xyz(local_size_xyz);

            // exactly one of the following matches the weight packing layout

            // pack1
            if (in_elempack == 1 && out_elempack == 1)
            {
                pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm, opt, specializations);
            }

            // pack4
            if (in_elempack == 4 && out_elempack == 4)
            {
                pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp4, opt, specializations);
            }

            // pack1to4
            if (in_elempack == 1 && out_elempack == 4)
            {
                pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp1to4, opt, specializations);
            }

            // pack4to1
            if (in_elempack == 4 && out_elempack == 1)
            {
                pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp4to1, opt, specializations);
            }

            // pack8
            if (in_elempack == 8 && out_elempack == 8)
            {
                pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp8, opt, specializations);
            }

            // pack1to8
            if (in_elempack == 1 && out_elempack == 8)
            {
                pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp1to8, opt, specializations);
            }

            // pack4to8
            if (in_elempack == 4 && out_elempack == 8)
            {
                pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp4to8, opt, specializations);
            }

            // pack8to4
            if (in_elempack == 8 && out_elempack == 4)
            {
                pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp8to4, opt, specializations);
            }

            // pack8to1
            if (in_elempack == 8 && out_elempack == 1)
            {
                pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp8to1, opt, specializations);
            }
        }

        // gemm hint fully handled - no vector pipelines needed
        return 0;
    }

    // vector path: the shader consumes a 1D blob of all input elements
    Mat shape_flatten;
    if (shape.dims != 0)
    {
        shape_flatten = Mat(shape.w * shape.h * shape.c, (void*)0);
    }

    size_t elemsize;
    size_t out_elemsize;
    if (opt.use_fp16_storage)
    {
        elemsize = in_elempack * 2u;
        out_elemsize = out_elempack * 2u;
    }
    else if (opt.use_fp16_packed)
    {
        // pack1 stays fp32; packed layouts store fp16
        elemsize = in_elempack == 1 ? 4u : in_elempack * 2u;
        out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u;
    }
    else
    {
        elemsize = in_elempack * 4u;
        out_elemsize = out_elempack * 4u;
    }

    Mat shape_flatten_packed;
    if (shape_flatten.dims == 1) shape_flatten_packed = Mat(shape_flatten.w / in_elempack, (void*)0, elemsize, in_elempack);

    Mat out_shape_packed;
    if (out_shape.dims == 1) out_shape_packed = Mat(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack);

    // check blob shape
    if (!vkdev->shape_support_image_storage(shape_flatten_packed) || !vkdev->shape_support_image_storage(out_shape_packed))
    {
        support_image_storage = false;
        opt.use_image_storage = false;
    }

    // check weight shape (packed layout produced by upload_model)
    Mat weight_data_packed(num_input / in_elempack, num_output / out_elempack, (void*)0, (size_t)4 * in_elempack * out_elempack, in_elempack * out_elempack);
    if (!vkdev->shape_support_image_storage(weight_data_packed))
    {
        support_image_storage = false;
        opt.use_image_storage = false;
    }

    if (shape.dims == 0)
    {
        // no shape hint: also check the fully unpacked weight shape
        // (presumably covering the gemm fallback's access pattern - verify)
        // check weight shape
        Mat weight_data_packed(num_input, num_output, (void*)0, (size_t)4u, 1);
        if (!vkdev->shape_support_image_storage(weight_data_packed))
        {
            support_image_storage = false;
            opt.use_image_storage = false;
        }
    }

    // helper layer that flattens the bottom blob before the dot-product shader
    {
        flatten = ncnn::create_layer(ncnn::LayerType::Flatten);
        flatten->vkdev = vkdev;

        flatten->bottom_shapes.resize(1);
        flatten->bottom_shapes[0] = shape;
        flatten->top_shapes.resize(1);
        flatten->top_shapes[0] = shape_flatten;

        ncnn::ParamDict pd;

        flatten->load_param(pd);

        flatten->create_pipeline(opt);
    }

    // activation config plus the packed 1D shapes used at dispatch time
    std::vector<vk_specialization_type> specializations(4 + 10);
    specializations[0].i = bias_term;
    specializations[1].i = activation_type;
    specializations[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
    specializations[3].f = activation_params.w == 2 ? activation_params[1] : 0.f;
    specializations[4 + 0].i = shape_flatten_packed.dims;
    specializations[4 + 1].i = shape_flatten_packed.w;
    specializations[4 + 2].i = shape_flatten_packed.h;
    specializations[4 + 3].i = shape_flatten_packed.c;
    specializations[4 + 4].i = shape_flatten_packed.cstep;
    specializations[4 + 5].i = out_shape_packed.dims;
    specializations[4 + 6].i = out_shape_packed.w;
    specializations[4 + 7].i = out_shape_packed.h;
    specializations[4 + 8].i = out_shape_packed.c;
    specializations[4 + 9].i = out_shape_packed.cstep;

    // workgroup size heuristic: 1D dispatch over packed outputs
    Mat local_size_xyz(std::min(64, num_output / out_elempack), 1, 1, (void*)0);
    if (out_shape_packed.dims != 0)
    {
        local_size_xyz.w = std::min(64, out_shape_packed.w);
        local_size_xyz.h = 1;
        local_size_xyz.c = 1;
    }

    // create the single pipeline matching the weight packing layout

    // pack1
    if (in_elempack == 1 && out_elempack == 1)
    {
        pipeline_innerproduct = new Pipeline(vkdev);
        pipeline_innerproduct->set_optimal_local_size_xyz(local_size_xyz);
        pipeline_innerproduct->create(LayerShaderType::innerproduct, opt, specializations);
    }

    // pack4
    if (in_elempack == 4 && out_elempack == 4)
    {
        pipeline_innerproduct_pack4 = new Pipeline(vkdev);
        pipeline_innerproduct_pack4->set_optimal_local_size_xyz(local_size_xyz);
        pipeline_innerproduct_pack4->create(LayerShaderType::innerproduct_pack4, opt, specializations);
    }

    // pack1to4
    if (in_elempack == 1 && out_elempack == 4)
    {
        pipeline_innerproduct_pack1to4 = new Pipeline(vkdev);
        pipeline_innerproduct_pack1to4->set_optimal_local_size_xyz(local_size_xyz);
        pipeline_innerproduct_pack1to4->create(LayerShaderType::innerproduct_pack1to4, opt, specializations);
    }

    // pack4to1
    if (in_elempack == 4 && out_elempack == 1)
    {
        pipeline_innerproduct_pack4to1 = new Pipeline(vkdev);
        pipeline_innerproduct_pack4to1->set_optimal_local_size_xyz(local_size_xyz);
        pipeline_innerproduct_pack4to1->create(LayerShaderType::innerproduct_pack4to1, opt, specializations);
    }

    // pack8
    if (in_elempack == 8 && out_elempack == 8)
    {
        pipeline_innerproduct_pack8 = new Pipeline(vkdev);
        pipeline_innerproduct_pack8->set_optimal_local_size_xyz(local_size_xyz);
        pipeline_innerproduct_pack8->create(LayerShaderType::innerproduct_pack8, opt, specializations);
    }

    // pack1to8
    if (in_elempack == 1 && out_elempack == 8)
    {
        pipeline_innerproduct_pack1to8 = new Pipeline(vkdev);
        pipeline_innerproduct_pack1to8->set_optimal_local_size_xyz(local_size_xyz);
        pipeline_innerproduct_pack1to8->create(LayerShaderType::innerproduct_pack1to8, opt, specializations);
    }

    // pack4to8
    if (in_elempack == 4 && out_elempack == 8)
    {
        pipeline_innerproduct_pack4to8 = new Pipeline(vkdev);
        pipeline_innerproduct_pack4to8->set_optimal_local_size_xyz(local_size_xyz);
        pipeline_innerproduct_pack4to8->create(LayerShaderType::innerproduct_pack4to8, opt, specializations);
    }

    // pack8to4
    if (in_elempack == 8 && out_elempack == 4)
    {
        pipeline_innerproduct_pack8to4 = new Pipeline(vkdev);
        pipeline_innerproduct_pack8to4->set_optimal_local_size_xyz(local_size_xyz);
        pipeline_innerproduct_pack8to4->create(LayerShaderType::innerproduct_pack8to4, opt, specializations);
    }

    // pack8to1
    if (in_elempack == 8 && out_elempack == 1)
    {
        pipeline_innerproduct_pack8to1 = new Pipeline(vkdev);
        pipeline_innerproduct_pack8to1->set_optimal_local_size_xyz(local_size_xyz);
        pipeline_innerproduct_pack8to1->create(LayerShaderType::innerproduct_pack8to1, opt, specializations);
    }

    // gemm for no shape hint
    if (shape.dims == 0)
    {
        // all shape specializations are zero: the shader must read the
        // actual dimensions from push constants at dispatch time
        std::vector<vk_specialization_type> specializations(4 + 10);
        specializations[0].i = bias_term;
        specializations[1].i = activation_type;
        specializations[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
        specializations[3].f = activation_params.w == 2 ? activation_params[1] : 0.f;
        specializations[4 + 0].i = 0;
        specializations[4 + 1].i = 0;
        specializations[4 + 2].i = 0;
        specializations[4 + 3].i = 0;
        specializations[4 + 4].i = 0;
        specializations[4 + 5].i = 0;
        specializations[4 + 6].i = 0;
        specializations[4 + 7].i = 0;
        specializations[4 + 8].i = 0;
        specializations[4 + 9].i = 0;

        Mat local_size_xyz(std::min(16, num_output / out_elempack), 4, 1, (void*)0);

        {
            pipeline_innerproduct_gemm = new Pipeline(vkdev);
            pipeline_innerproduct_gemm->set_optimal_local_size_xyz(local_size_xyz);

            // pack1
            if (in_elempack == 1 && out_elempack == 1)
            {
                pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm, opt, specializations);
            }

            // pack4
            if (in_elempack == 4 && out_elempack == 4)
            {
                pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp4, opt, specializations);
            }

            // pack1to4
            if (in_elempack == 1 && out_elempack == 4)
            {
                pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp1to4, opt, specializations);
            }

            // pack4to1
            if (in_elempack == 4 && out_elempack == 1)
            {
                pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp4to1, opt, specializations);
            }

            // pack8
            if (in_elempack == 8 && out_elempack == 8)
            {
                pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp8, opt, specializations);
            }

            // pack1to8
            if (in_elempack == 1 && out_elempack == 8)
            {
                pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp1to8, opt, specializations);
            }

            // pack4to8
            if (in_elempack == 4 && out_elempack == 8)
            {
                pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp4to8, opt, specializations);
            }

            // pack8to4
            if (in_elempack == 8 && out_elempack == 4)
            {
                pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp8to4, opt, specializations);
            }

            // pack8to1
            if (in_elempack == 8 && out_elempack == 1)
            {
                pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp8to1, opt, specializations);
            }
        }

        return 0;
    }

    return 0;
}
428 
destroy_pipeline(const Option & opt)429 int InnerProduct_vulkan::destroy_pipeline(const Option& opt)
430 {
431     if (flatten)
432     {
433         flatten->destroy_pipeline(opt);
434         delete flatten;
435         flatten = 0;
436     }
437 
438     delete pipeline_innerproduct;
439     pipeline_innerproduct = 0;
440 
441     delete pipeline_innerproduct_pack4;
442     pipeline_innerproduct_pack4 = 0;
443 
444     delete pipeline_innerproduct_pack1to4;
445     pipeline_innerproduct_pack1to4 = 0;
446 
447     delete pipeline_innerproduct_pack4to1;
448     pipeline_innerproduct_pack4to1 = 0;
449 
450     delete pipeline_innerproduct_pack8;
451     pipeline_innerproduct_pack8 = 0;
452 
453     delete pipeline_innerproduct_pack1to8;
454     pipeline_innerproduct_pack1to8 = 0;
455 
456     delete pipeline_innerproduct_pack4to8;
457     pipeline_innerproduct_pack4to8 = 0;
458 
459     delete pipeline_innerproduct_pack8to4;
460     pipeline_innerproduct_pack8to4 = 0;
461 
462     delete pipeline_innerproduct_pack8to1;
463     pipeline_innerproduct_pack8to1 = 0;
464 
465     delete pipeline_innerproduct_gemm;
466     pipeline_innerproduct_gemm = 0;
467 
468     return 0;
469 }
470 
// Repack the fp32 weight and bias data into the GPU-friendly layout and
// record the upload commands.  The packing factors must match those derived
// in create_pipeline(), since the shader variant is chosen from them.
int InnerProduct_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
{
    const int num_input = weight_data_size / num_output;

    int in_elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1;
    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;

    // src = inch-outch
    // dst = pa-pb-inch/pa-outch/pb
    Mat weight_data_packed;
    {
        Mat weight_data_r2 = weight_data.reshape(num_input, num_output);

        weight_data_packed.create(num_input / in_elempack, num_output / out_elempack, (size_t)4 * in_elempack * out_elempack, in_elempack * out_elempack);

        // for each group of out_elempack output channels ...
        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
        {
            float* g00 = weight_data_packed.row(q / out_elempack);

            // ... and each group of in_elempack input channels, write
            // out_elempack runs of in_elempack consecutive weights so the
            // shader can fetch one packed tile per load
            for (int p = 0; p + (in_elempack - 1) < num_input; p += in_elempack)
            {
                for (int i = 0; i < out_elempack; i++)
                {
                    const float* k0 = weight_data_r2.row(q + i);
                    k0 += p;

                    for (int j = 0; j < in_elempack; j++)
                    {
                        g00[0] = k0[j];

                        g00++;
                    }
                }
            }
        }
    }

    // upload to image or buffer storage, matching what forward() will bind
    if (support_image_storage && opt.use_image_storage)
    {
        cmd.record_upload(weight_data_packed, weight_data_gpu_image, opt);
    }
    else
    {
        cmd.record_upload(weight_data_packed, weight_data_gpu, opt);
    }

    if (bias_term)
    {
        // bias only needs packing conversion, no interleave
        Mat bias_data_packed;
        convert_packing(bias_data, bias_data_packed, out_elempack);

        if (support_image_storage && opt.use_image_storage)
        {
            cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt);
        }
        else
        {
            cmd.record_upload(bias_data_packed, bias_data_gpu, opt);
        }
    }

    return 0;
}
534 
// Buffer-storage forward pass.
//
// Routes to one of two strategies:
//   * gemm path when the bottom blob is 2D with w == num_input and more than
//     one logical row: the blob is converted to pack1, a single gemm dispatch
//     computes all rows, and the result is converted back to the original
//     packing.
//   * vector path otherwise: the blob is flattened to 1D and one packNtoM
//     pipeline computes the num_output dot products directly into top_blob.
int InnerProduct_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
{
    const int num_input = weight_data_size / num_output;

    // must match the packing chosen in create_pipeline()/upload_model()
    int in_elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1;
    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;

    // h * elempack counts logical rows regardless of packing
    if (bottom_blob.dims == 2 && bottom_blob.w == num_input && bottom_blob.h * bottom_blob.elempack > 1)
    {
        // gemm
        int h = bottom_blob.h;
        size_t elemsize = bottom_blob.elemsize;
        int elempack = bottom_blob.elempack;

        // unpacking
        // gemm shader consumes pack1 data; scratch goes to the workspace allocator
        VkMat bottom_blob_unpacked = bottom_blob;
        if (elempack > 1)
        {
            Option opt_pack1 = opt;
            opt_pack1.blob_vkallocator = opt.workspace_vkallocator;

            vkdev->convert_packing(bottom_blob, bottom_blob_unpacked, 1, cmd, opt_pack1);
        }

        // final output keeps the input's packing and element size
        top_blob.create(num_output, h, elemsize, elempack, opt.blob_vkallocator);
        if (top_blob.empty())
            return -100;

        // pack1 staging output for the shader when the input was packed
        VkMat top_blob_unpacked = top_blob;
        if (elempack > 1)
        {
            top_blob_unpacked.create(num_output, h * elempack, bottom_blob_unpacked.elemsize, 1, opt.workspace_vkallocator);
            if (top_blob_unpacked.empty())
                return -100;
        }

        std::vector<VkMat> bindings(4);
        bindings[0] = bottom_blob_unpacked;
        bindings[1] = top_blob_unpacked;
        bindings[2] = weight_data_gpu;
        bindings[3] = bias_data_gpu;

        // push constants: runtime shapes (needed when built without shape hints)
        std::vector<vk_constant_type> constants(10);
        constants[0].i = bottom_blob_unpacked.dims;
        constants[1].i = bottom_blob_unpacked.w;
        constants[2].i = bottom_blob_unpacked.h;
        constants[3].i = bottom_blob_unpacked.c;
        constants[4].i = bottom_blob_unpacked.cstep;
        constants[5].i = top_blob_unpacked.dims;
        constants[6].i = top_blob_unpacked.w;
        constants[7].i = top_blob_unpacked.h;
        constants[8].i = top_blob_unpacked.c;
        constants[9].i = top_blob_unpacked.cstep;

        const Pipeline* pipeline = pipeline_innerproduct_gemm;

        // one invocation per out_elempack outputs per row (weight packing
        // granularity of the wpN shader variants)
        VkMat dispatcher;
        dispatcher.w = top_blob_unpacked.w / out_elempack;
        dispatcher.h = top_blob_unpacked.h;
        dispatcher.c = 1;

        cmd.record_pipeline(pipeline, bindings, constants, dispatcher);

        // packing
        // convert the pack1 result back to the caller's packing
        if (elempack > 1)
        {
            vkdev->convert_packing(top_blob_unpacked, top_blob, elempack, cmd, opt);
        }

        return 0;
    }

    // flatten
    // collapse the bottom blob to 1D using the helper layer
    VkMat bottom_blob_flattened = bottom_blob;
    {
        Option opt_flatten = opt;
        opt_flatten.blob_vkallocator = opt.workspace_vkallocator;

        flatten->forward(bottom_blob, bottom_blob_flattened, cmd, opt_flatten);
    }

    // derive the output element size from the input's, scaled by packing
    size_t elemsize = bottom_blob_flattened.elemsize;
    size_t out_elemsize = elemsize / in_elempack * out_elempack;

    if (opt.use_fp16_packed && !opt.use_fp16_storage)
    {
        // fp16-packed mode: packed outputs are fp16, pack1 stays fp32
        if (out_elempack == 8) out_elemsize = 8 * 2u;
        if (out_elempack == 4) out_elemsize = 4 * 2u;
        if (out_elempack == 1) out_elemsize = 4u;
    }

    top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
    if (top_blob.empty())
        return -100;

    std::vector<VkMat> bindings(4);
    bindings[0] = bottom_blob_flattened;
    bindings[1] = top_blob;
    bindings[2] = weight_data_gpu;
    bindings[3] = bias_data_gpu;

    // push constants: runtime shapes (needed when built without shape hints)
    std::vector<vk_constant_type> constants(10);
    constants[0].i = bottom_blob_flattened.dims;
    constants[1].i = bottom_blob_flattened.w;
    constants[2].i = bottom_blob_flattened.h;
    constants[3].i = bottom_blob_flattened.c;
    constants[4].i = bottom_blob_flattened.cstep;
    constants[5].i = top_blob.dims;
    constants[6].i = top_blob.w;
    constants[7].i = top_blob.h;
    constants[8].i = top_blob.c;
    constants[9].i = top_blob.cstep;

    // select the pipeline matching the weight packing; create_pipeline()
    // built exactly one of these for the current in/out packing
    const Pipeline* pipeline = 0;
    if (in_elempack == 1 && out_elempack == 1)
    {
        pipeline = pipeline_innerproduct;
    }
    else if (in_elempack == 4 && out_elempack == 4)
    {
        pipeline = pipeline_innerproduct_pack4;
    }
    else if (in_elempack == 1 && out_elempack == 4)
    {
        pipeline = pipeline_innerproduct_pack1to4;
    }
    else if (in_elempack == 4 && out_elempack == 1)
    {
        pipeline = pipeline_innerproduct_pack4to1;
    }
    else if (in_elempack == 8 && out_elempack == 8)
    {
        pipeline = pipeline_innerproduct_pack8;
    }
    else if (in_elempack == 1 && out_elempack == 8)
    {
        pipeline = pipeline_innerproduct_pack1to8;
    }
    else if (in_elempack == 4 && out_elempack == 8)
    {
        pipeline = pipeline_innerproduct_pack4to8;
    }
    else if (in_elempack == 8 && out_elempack == 4)
    {
        pipeline = pipeline_innerproduct_pack8to4;
    }
    else if (in_elempack == 8 && out_elempack == 1)
    {
        pipeline = pipeline_innerproduct_pack8to1;
    }

    // dispatch over the packed 1D output
    cmd.record_pipeline(pipeline, bindings, constants, top_blob);

    return 0;
}
690 
// Image-storage forward pass.
//
// Mirrors the buffer-storage forward() above: gemm path for a 2D bottom blob
// with multiple rows, vector path (flatten + packNtoM dot product) otherwise.
// The only differences are the VkImageMat bindings and that cstep push
// constants are 0, since image storage has no linear step.
int InnerProduct_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const
{
    const int num_input = weight_data_size / num_output;

    // must match the packing chosen in create_pipeline()/upload_model()
    int in_elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1;
    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;

    // h * elempack counts logical rows regardless of packing
    if (bottom_blob.dims == 2 && bottom_blob.w == num_input && bottom_blob.h * bottom_blob.elempack > 1)
    {
        // gemm
        int h = bottom_blob.h;
        size_t elemsize = bottom_blob.elemsize;
        int elempack = bottom_blob.elempack;

        // unpacking
        // gemm shader consumes pack1 data; scratch goes to the workspace allocator
        VkImageMat bottom_blob_unpacked = bottom_blob;
        if (elempack > 1)
        {
            Option opt_pack1 = opt;
            opt_pack1.blob_vkallocator = opt.workspace_vkallocator;

            vkdev->convert_packing(bottom_blob, bottom_blob_unpacked, 1, cmd, opt_pack1);
        }

        // final output keeps the input's packing and element size
        top_blob.create(num_output, h, elemsize, elempack, opt.blob_vkallocator);
        if (top_blob.empty())
            return -100;

        // pack1 staging output for the shader when the input was packed
        VkImageMat top_blob_unpacked = top_blob;
        if (elempack > 1)
        {
            top_blob_unpacked.create(num_output, h * elempack, bottom_blob_unpacked.elemsize, 1, opt.workspace_vkallocator);
            if (top_blob_unpacked.empty())
                return -100;
        }

        std::vector<VkImageMat> bindings(4);
        bindings[0] = bottom_blob_unpacked;
        bindings[1] = top_blob_unpacked;
        bindings[2] = weight_data_gpu_image;
        bindings[3] = bias_data_gpu_image;

        // push constants: runtime shapes; cstep is meaningless for images
        std::vector<vk_constant_type> constants(10);
        constants[0].i = bottom_blob_unpacked.dims;
        constants[1].i = bottom_blob_unpacked.w;
        constants[2].i = bottom_blob_unpacked.h;
        constants[3].i = bottom_blob_unpacked.c;
        constants[4].i = 0; //bottom_blob_unpacked.cstep;
        constants[5].i = top_blob_unpacked.dims;
        constants[6].i = top_blob_unpacked.w;
        constants[7].i = top_blob_unpacked.h;
        constants[8].i = top_blob_unpacked.c;
        constants[9].i = 0; //top_blob_unpacked.cstep;

        const Pipeline* pipeline = pipeline_innerproduct_gemm;

        // one invocation per out_elempack outputs per row (weight packing
        // granularity of the wpN shader variants)
        VkImageMat dispatcher;
        dispatcher.w = top_blob_unpacked.w / out_elempack;
        dispatcher.h = top_blob_unpacked.h;
        dispatcher.c = 1;

        cmd.record_pipeline(pipeline, bindings, constants, dispatcher);

        // packing
        // convert the pack1 result back to the caller's packing
        if (elempack > 1)
        {
            vkdev->convert_packing(top_blob_unpacked, top_blob, elempack, cmd, opt);
        }

        return 0;
    }

    // flatten
    // collapse the bottom blob to 1D using the helper layer
    VkImageMat bottom_blob_flattened = bottom_blob;
    {
        Option opt_flatten = opt;
        opt_flatten.blob_vkallocator = opt.workspace_vkallocator;

        flatten->forward(bottom_blob, bottom_blob_flattened, cmd, opt_flatten);
    }

    // derive the output element size from the input's, scaled by packing
    size_t elemsize = bottom_blob_flattened.elemsize;
    size_t out_elemsize = elemsize / in_elempack * out_elempack;

    if (opt.use_fp16_packed && !opt.use_fp16_storage)
    {
        // fp16-packed mode: packed outputs are fp16, pack1 stays fp32
        if (out_elempack == 8) out_elemsize = 8 * 2u;
        if (out_elempack == 4) out_elemsize = 4 * 2u;
        if (out_elempack == 1) out_elemsize = 4u;
    }

    top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
    if (top_blob.empty())
        return -100;

    std::vector<VkImageMat> bindings(4);
    bindings[0] = bottom_blob_flattened;
    bindings[1] = top_blob;
    bindings[2] = weight_data_gpu_image;
    bindings[3] = bias_data_gpu_image;

    // push constants: runtime shapes; cstep is meaningless for images
    std::vector<vk_constant_type> constants(10);
    constants[0].i = bottom_blob_flattened.dims;
    constants[1].i = bottom_blob_flattened.w;
    constants[2].i = bottom_blob_flattened.h;
    constants[3].i = bottom_blob_flattened.c;
    constants[4].i = 0; //bottom_blob_flattened.cstep;
    constants[5].i = top_blob.dims;
    constants[6].i = top_blob.w;
    constants[7].i = top_blob.h;
    constants[8].i = top_blob.c;
    constants[9].i = 0; //top_blob.cstep;

    // select the pipeline matching the weight packing; create_pipeline()
    // built exactly one of these for the current in/out packing
    const Pipeline* pipeline = 0;
    if (in_elempack == 1 && out_elempack == 1)
    {
        pipeline = pipeline_innerproduct;
    }
    else if (in_elempack == 4 && out_elempack == 4)
    {
        pipeline = pipeline_innerproduct_pack4;
    }
    else if (in_elempack == 1 && out_elempack == 4)
    {
        pipeline = pipeline_innerproduct_pack1to4;
    }
    else if (in_elempack == 4 && out_elempack == 1)
    {
        pipeline = pipeline_innerproduct_pack4to1;
    }
    else if (in_elempack == 8 && out_elempack == 8)
    {
        pipeline = pipeline_innerproduct_pack8;
    }
    else if (in_elempack == 1 && out_elempack == 8)
    {
        pipeline = pipeline_innerproduct_pack1to8;
    }
    else if (in_elempack == 4 && out_elempack == 8)
    {
        pipeline = pipeline_innerproduct_pack4to8;
    }
    else if (in_elempack == 8 && out_elempack == 4)
    {
        pipeline = pipeline_innerproduct_pack8to4;
    }
    else if (in_elempack == 8 && out_elempack == 1)
    {
        pipeline = pipeline_innerproduct_pack8to1;
    }

    // dispatch over the packed 1D output
    cmd.record_pipeline(pipeline, bindings, constants, top_blob);

    return 0;
}
846 
847 } // namespace ncnn
848