// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "layer/cast.h"
#include "testutil.h"

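// Cast layer tests. The type codes here follow the ncnn Cast convention used below:
// 1 = float32, 2 = float16, 4 = bfloat16 (param id 0 is type_from, param id 1 is type_to).
// Each helper builds a reference result with the base ncnn::Cast::forward and compares it
// against an optimized path (packed layout on CPU, or Vulkan buffer/image storage on GPU).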
static int test_cast_cpu(const ncnn::Mat& a, int type_from, int type_to)
{
    ncnn::ParamDict pd;
    pd.set(0, type_from);
    pd.set(1, type_to);

    std::vector<ncnn::Mat> weights(0);

    ncnn::Option opt;
    opt.num_threads = 1;
    opt.use_vulkan_compute = false;
    opt.use_int8_inference = false;
    opt.use_packing_layout = false;

    ncnn::Layer* op = ncnn::create_layer("Cast");

    op->load_param(pd);

    ncnn::ModelBinFromMatArray mb(weights.data());

    op->load_model(mb);

    op->create_pipeline(opt);

    ncnn::Mat a_fp16;
    if (type_from == 2)
    {
        ncnn::cast_float32_to_float16(a, a_fp16, opt);
    }
    else if (type_from == 4)
    {
        ncnn::cast_float32_to_bfloat16(a, a_fp16, opt);
    }
    else
    {
        a_fp16 = a;
    }

    ncnn::Mat b;
    ((ncnn::Cast*)op)->ncnn::Cast::forward(a_fp16, b, opt);

    ncnn::Mat c;
    op->forward(a_fp16, c, opt);

    op->destroy_pipeline(opt);

    delete op;

    if (CompareMat(b, c, 0.001) != 0)
    {
        fprintf(stderr, "test_cast_cpu failed a.dims=%d a=(%d %d %d %d) type_from=%d type_to=%d\n", a.dims, a.w, a.h, a.d, a.c, type_from, type_to);
        return -1;
    }

    return 0;
}

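// Same as test_cast_cpu, but the second forward pass runs on input repacked to
// elempack=4, so the packed code path is checked against the elempack=1 reference.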
static int test_cast_cpu_packed(const ncnn::Mat& a, int type_from, int type_to)
{
    ncnn::ParamDict pd;
    pd.set(0, type_from);
    pd.set(1, type_to);

    std::vector<ncnn::Mat> weights(0);

    ncnn::Option opt;
    opt.num_threads = 1;
    opt.use_vulkan_compute = false;
    opt.use_packing_layout = false;

    ncnn::Layer* op = ncnn::create_layer("Cast");

    op->load_param(pd);

    ncnn::ModelBinFromMatArray mb(weights.data());

    op->load_model(mb);

    op->create_pipeline(opt);

    ncnn::Mat a_fp16;
    if (type_from == 2)
    {
        ncnn::cast_float32_to_float16(a, a_fp16, opt);
    }
    else if (type_from == 4)
    {
        ncnn::cast_float32_to_bfloat16(a, a_fp16, opt);
    }
    else
    {
        a_fp16 = a;
    }

    ncnn::Mat b;
    ((ncnn::Cast*)op)->ncnn::Cast::forward(a_fp16, b, opt);

    ncnn::Mat a4;
    ncnn::convert_packing(a, a4, 4, opt);

    ncnn::Mat a4_fp16;
    if (type_from == 2)
    {
        ncnn::cast_float32_to_float16(a4, a4_fp16, opt);
    }
    else if (type_from == 4)
    {
        ncnn::cast_float32_to_bfloat16(a4, a4_fp16, opt);
    }
    else
    {
        a4_fp16 = a4;
    }

    ncnn::Mat c;
    op->forward(a4_fp16, c, opt);

    op->destroy_pipeline(opt);

    delete op;

    if (CompareMat(b, c, 0.001) != 0)
    {
        fprintf(stderr, "test_cast_cpu_packed failed a.dims=%d a=(%d %d %d %d) type_from=%d type_to=%d\n", a.dims, a.w, a.h, a.d, a.c, type_from, type_to);
        return -1;
    }

    return 0;
}

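// GPU variants: the same reference-vs-optimized comparison, with the optimized pass
// recorded on a Vulkan command buffer. bfloat16 (type 4) is skipped in all GPU
// variants below; only fp32/fp16 conversions are exercised on the Vulkan path here.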
#if NCNN_VULKAN
static int test_cast_gpu_fp16p(const ncnn::Mat& a, int type_from, int type_to)
{
    if (type_to == 4 || type_from == 4)
        return 0;
    ncnn::ParamDict pd;
    pd.set(0, type_from);
    pd.set(1, type_to);

    std::vector<ncnn::Mat> weights(0);

    ncnn::Option opt;
    opt.num_threads = 1;
    opt.use_vulkan_compute = true;
    opt.use_int8_inference = false;
    opt.use_fp16_packed = true;
    opt.use_fp16_storage = false;
    opt.use_fp16_arithmetic = false;
    opt.use_int8_storage = false;
    opt.use_int8_arithmetic = false;
    opt.use_packing_layout = true;
    opt.use_image_storage = false;

    ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device();

    ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator();
    ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator();

    opt.blob_vkallocator = blob_vkallocator;
    opt.workspace_vkallocator = blob_vkallocator;
    opt.staging_vkallocator = staging_vkallocator;

    if (!vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false;
    if (!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false;

    ncnn::Layer* op = ncnn::create_layer("Cast");

    op->vkdev = vkdev;

    op->load_param(pd);

    ncnn::ModelBinFromMatArray mb(weights.data());

    op->load_model(mb);

    op->create_pipeline(opt);

    ncnn::Mat a_fp16;
    if (type_from == 2)
    {
        ncnn::cast_float32_to_float16(a, a_fp16, opt);
    }
    else
    {
        a_fp16 = a;
    }

    ncnn::Mat b;
    ((ncnn::Cast*)op)->ncnn::Cast::forward(a_fp16, b, opt);

    ncnn::Mat d;

    // pack
    ncnn::Mat a4;
    ncnn::convert_packing(a, a4, 4, opt);

    ncnn::Mat a4_fp16;
    if (type_from == 2 && a4.elempack == 4)
    {
        ncnn::cast_float32_to_float16(a4, a4_fp16, opt);
    }
    else
    {
        a4_fp16 = a4;
    }

    // forward
    ncnn::VkCompute cmd(vkdev);

    // upload
    ncnn::VkMat a4_gpu;
    cmd.record_clone(a4_fp16, a4_gpu, opt);

    ncnn::VkMat d4_gpu;
    if (op->support_inplace)
    {
        op->forward_inplace(a4_gpu, cmd, opt);

        d4_gpu = a4_gpu;
    }
    else
    {
        op->forward(a4_gpu, d4_gpu, cmd, opt);
    }

    // download
    cmd.record_clone(d4_gpu, d, opt);

    cmd.submit_and_wait();

    op->destroy_pipeline(opt);

    delete op;

    vkdev->reclaim_blob_allocator(blob_vkallocator);
    vkdev->reclaim_staging_allocator(staging_vkallocator);

    if (CompareMat(b, d, 0.001) != 0)
    {
        fprintf(stderr, "test_cast_gpu_fp16p failed a.dims=%d a=(%d %d %d %d) type_from=%d type_to=%d\n", a.dims, a.w, a.h, a.d, a.c, type_from, type_to);
        return -1;
    }

    return 0;
}

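// Same GPU buffer test, but with use_shader_pack8 enabled; the input is repacked to
// elempack=8 when possible and falls back to elempack=4 otherwise.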
static int test_cast_gpu_fp16p_pack8(const ncnn::Mat& a, int type_from, int type_to)
{
    if (type_to == 4 || type_from == 4)
        return 0;
    ncnn::ParamDict pd;
    pd.set(0, type_from);
    pd.set(1, type_to);

    std::vector<ncnn::Mat> weights(0);

    ncnn::Option opt;
    opt.num_threads = 1;
    opt.use_vulkan_compute = true;
    opt.use_int8_inference = false;
    opt.use_fp16_packed = true;
    opt.use_fp16_storage = false;
    opt.use_fp16_arithmetic = false;
    opt.use_int8_storage = false;
    opt.use_int8_arithmetic = false;
    opt.use_packing_layout = true;
    opt.use_shader_pack8 = true;
    opt.use_image_storage = false;

    ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device();

    ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator();
    ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator();

    opt.blob_vkallocator = blob_vkallocator;
    opt.workspace_vkallocator = blob_vkallocator;
    opt.staging_vkallocator = staging_vkallocator;

    if (!vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false;
    if (!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false;

    ncnn::Layer* op = ncnn::create_layer("Cast");

    op->vkdev = vkdev;

    op->load_param(pd);

    ncnn::ModelBinFromMatArray mb(weights.data());

    op->load_model(mb);

    op->create_pipeline(opt);

    ncnn::Mat a_fp16;
    if (type_from == 2)
    {
        ncnn::cast_float32_to_float16(a, a_fp16, opt);
    }
    else
    {
        a_fp16 = a;
    }

    ncnn::Mat b;
    ((ncnn::Cast*)op)->ncnn::Cast::forward(a_fp16, b, opt);

    ncnn::Mat d;

    // pack
    ncnn::Mat a4;
    ncnn::convert_packing(a, a4, 8, opt);
    if (a4.elempack != 8)
        ncnn::convert_packing(a, a4, 4, opt);

    ncnn::Mat a4_fp16;
    if (type_from == 2 && (a4.elempack == 4 || a4.elempack == 8))
    {
        ncnn::cast_float32_to_float16(a4, a4_fp16, opt);
    }
    else
    {
        a4_fp16 = a4;
    }

    // forward
    ncnn::VkCompute cmd(vkdev);

    // upload
    ncnn::VkMat a4_gpu;
    cmd.record_clone(a4_fp16, a4_gpu, opt);

    ncnn::VkMat d4_gpu;
    if (op->support_inplace)
    {
        op->forward_inplace(a4_gpu, cmd, opt);

        d4_gpu = a4_gpu;
    }
    else
    {
        op->forward(a4_gpu, d4_gpu, cmd, opt);
    }

    // download
    cmd.record_clone(d4_gpu, d, opt);

    cmd.submit_and_wait();

    op->destroy_pipeline(opt);

    delete op;

    vkdev->reclaim_blob_allocator(blob_vkallocator);
    vkdev->reclaim_staging_allocator(staging_vkallocator);

    if (CompareMat(b, d, 0.001) != 0)
    {
        fprintf(stderr, "test_cast_gpu_fp16p_pack8 failed a.dims=%d a=(%d %d %d %d) type_from=%d type_to=%d\n", a.dims, a.w, a.h, a.d, a.c, type_from, type_to);
        return -1;
    }

    return 0;
}

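// Same comparison as test_cast_gpu_fp16p, but the GPU blob is held in a VkImageMat
// (use_image_storage = true) instead of a buffer.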
static int test_cast_gpu_image_fp16p(const ncnn::Mat& a, int type_from, int type_to)
{
    if (type_to == 4 || type_from == 4)
        return 0;
    ncnn::ParamDict pd;
    pd.set(0, type_from);
    pd.set(1, type_to);

    std::vector<ncnn::Mat> weights(0);

    ncnn::Option opt;
    opt.num_threads = 1;
    opt.use_vulkan_compute = true;
    opt.use_int8_inference = false;
    opt.use_fp16_packed = true;
    opt.use_fp16_storage = false;
    opt.use_fp16_arithmetic = false;
    opt.use_int8_storage = false;
    opt.use_int8_arithmetic = false;
    opt.use_packing_layout = true;
    opt.use_image_storage = true;

    ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device();

    ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator();
    ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator();

    opt.blob_vkallocator = blob_vkallocator;
    opt.workspace_vkallocator = blob_vkallocator;
    opt.staging_vkallocator = staging_vkallocator;

    if (!vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false;
    if (!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false;

    ncnn::Layer* op = ncnn::create_layer("Cast");

    op->vkdev = vkdev;

    op->load_param(pd);

    ncnn::ModelBinFromMatArray mb(weights.data());

    op->load_model(mb);

    op->create_pipeline(opt);

    ncnn::Mat a_fp16;
    if (type_from == 2)
    {
        ncnn::cast_float32_to_float16(a, a_fp16, opt);
    }
    else
    {
        a_fp16 = a;
    }

    ncnn::Mat b;
    ((ncnn::Cast*)op)->ncnn::Cast::forward(a_fp16, b, opt);

    ncnn::Mat d;

    // pack
    ncnn::Mat a4;
    ncnn::convert_packing(a, a4, 4, opt);

    ncnn::Mat a4_fp16;
    if (type_from == 2 && a4.elempack == 4)
    {
        ncnn::cast_float32_to_float16(a4, a4_fp16, opt);
    }
    else
    {
        a4_fp16 = a4;
    }

    // forward
    ncnn::VkCompute cmd(vkdev);

    // upload
    ncnn::VkImageMat a4_gpu;
    cmd.record_clone(a4_fp16, a4_gpu, opt);

    ncnn::VkImageMat d4_gpu;
    if (op->support_inplace)
    {
        op->forward_inplace(a4_gpu, cmd, opt);

        d4_gpu = a4_gpu;
    }
    else
    {
        op->forward(a4_gpu, d4_gpu, cmd, opt);
    }

    // download
    cmd.record_clone(d4_gpu, d, opt);

    cmd.submit_and_wait();

    op->destroy_pipeline(opt);

    delete op;

    vkdev->reclaim_blob_allocator(blob_vkallocator);
    vkdev->reclaim_staging_allocator(staging_vkallocator);

    if (CompareMat(b, d, 0.001) != 0)
    {
        fprintf(stderr, "test_cast_gpu_image_fp16p failed a.dims=%d a=(%d %d %d %d) type_from=%d type_to=%d\n", a.dims, a.w, a.h, a.d, a.c, type_from, type_to);
        return -1;
    }

    return 0;
}

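// Image-storage variant with use_shader_pack8 enabled, combining the two options above.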
static int test_cast_gpu_image_fp16p_pack8(const ncnn::Mat& a, int type_from, int type_to)
{
    if (type_to == 4 || type_from == 4)
        return 0;
    ncnn::ParamDict pd;
    pd.set(0, type_from);
    pd.set(1, type_to);

    std::vector<ncnn::Mat> weights(0);

    ncnn::Option opt;
    opt.num_threads = 1;
    opt.use_vulkan_compute = true;
    opt.use_int8_inference = false;
    opt.use_fp16_packed = true;
    opt.use_fp16_storage = false;
    opt.use_fp16_arithmetic = false;
    opt.use_int8_storage = false;
    opt.use_int8_arithmetic = false;
    opt.use_packing_layout = true;
    opt.use_shader_pack8 = true;
    opt.use_image_storage = true;

    ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device();

    ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator();
    ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator();

    opt.blob_vkallocator = blob_vkallocator;
    opt.workspace_vkallocator = blob_vkallocator;
    opt.staging_vkallocator = staging_vkallocator;

    if (!vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false;
    if (!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false;

    ncnn::Layer* op = ncnn::create_layer("Cast");

    op->vkdev = vkdev;

    op->load_param(pd);

    ncnn::ModelBinFromMatArray mb(weights.data());

    op->load_model(mb);

    op->create_pipeline(opt);

    ncnn::Mat a_fp16;
    if (type_from == 2)
    {
        ncnn::cast_float32_to_float16(a, a_fp16, opt);
    }
    else
    {
        a_fp16 = a;
    }

    ncnn::Mat b;
    ((ncnn::Cast*)op)->ncnn::Cast::forward(a_fp16, b, opt);

    ncnn::Mat d;

    // pack
    ncnn::Mat a4;
    ncnn::convert_packing(a, a4, 8, opt);
    if (a4.elempack != 8)
        ncnn::convert_packing(a, a4, 4, opt);

    ncnn::Mat a4_fp16;
    if (type_from == 2 && (a4.elempack == 4 || a4.elempack == 8))
    {
        ncnn::cast_float32_to_float16(a4, a4_fp16, opt);
    }
    else
    {
        a4_fp16 = a4;
    }

    // forward
    ncnn::VkCompute cmd(vkdev);

    // upload
    ncnn::VkImageMat a4_gpu;
    cmd.record_clone(a4_fp16, a4_gpu, opt);

    ncnn::VkImageMat d4_gpu;
    if (op->support_inplace)
    {
        op->forward_inplace(a4_gpu, cmd, opt);

        d4_gpu = a4_gpu;
    }
    else
    {
        op->forward(a4_gpu, d4_gpu, cmd, opt);
    }

    // download
    cmd.record_clone(d4_gpu, d, opt);

    cmd.submit_and_wait();

    op->destroy_pipeline(opt);

    delete op;

    vkdev->reclaim_blob_allocator(blob_vkallocator);
    vkdev->reclaim_staging_allocator(staging_vkallocator);

    if (CompareMat(b, d, 0.001) != 0)
    {
        fprintf(stderr, "test_cast_gpu_image_fp16p_pack8 failed a.dims=%d a=(%d %d %d %d) type_from=%d type_to=%d\n", a.dims, a.w, a.h, a.d, a.c, type_from, type_to);
        return -1;
    }

    return 0;
}
#endif // NCNN_VULKAN

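// Run one input through every available variant; the chained || stops at the first failure.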
static int test_cast(const ncnn::Mat& a, int type_from, int type_to)
{
    return 0
           || test_cast_cpu(a, type_from, type_to)
           || test_cast_cpu_packed(a, type_from, type_to)
#if NCNN_VULKAN
           || test_cast_gpu_fp16p(a, type_from, type_to)
           || test_cast_gpu_fp16p_pack8(a, type_from, type_to)
           || test_cast_gpu_image_fp16p(a, type_from, type_to)
           || test_cast_gpu_image_fp16p_pack8(a, type_from, type_to)
#endif // NCNN_VULKAN
           ;
}

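// test_cast_0..test_cast_3 cover 4D, 3D, 2D and 1D inputs, converting
// fp32<->fp16 (1<->2) and fp32<->bf16 (1<->4) in both directions.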
static int test_cast_0()
{
    return 0
           || test_cast(RandomMat(5, 6, 7, 16), 1, 2)
           || test_cast(RandomMat(3, 4, 5, 13), 1, 2)
           || test_cast(RandomMat(5, 6, 7, 16), 2, 1)
           || test_cast(RandomMat(3, 4, 5, 13), 2, 1)
           || test_cast(RandomMat(5, 6, 7, 16), 1, 4)
           || test_cast(RandomMat(3, 4, 5, 13), 1, 4)
           || test_cast(RandomMat(5, 6, 7, 16), 4, 1)
           || test_cast(RandomMat(3, 4, 5, 13), 4, 1);
}

static int test_cast_1()
{
    return 0
           || test_cast(RandomMat(5, 7, 16), 1, 2)
           || test_cast(RandomMat(3, 5, 13), 1, 2)
           || test_cast(RandomMat(5, 7, 16), 2, 1)
           || test_cast(RandomMat(3, 5, 13), 2, 1)
           || test_cast(RandomMat(5, 7, 16), 1, 4)
           || test_cast(RandomMat(3, 5, 13), 1, 4)
           || test_cast(RandomMat(5, 7, 16), 4, 1)
           || test_cast(RandomMat(3, 5, 13), 4, 1);
}

static int test_cast_2()
{
    return 0
           || test_cast(RandomMat(6, 16), 1, 2)
           || test_cast(RandomMat(7, 15), 1, 2)
           || test_cast(RandomMat(6, 16), 2, 1)
           || test_cast(RandomMat(7, 15), 2, 1)
           || test_cast(RandomMat(6, 16), 1, 4)
           || test_cast(RandomMat(7, 15), 1, 4)
           || test_cast(RandomMat(6, 16), 4, 1)
           || test_cast(RandomMat(7, 15), 4, 1);
}

static int test_cast_3()
{
    return 0
           || test_cast(RandomMat(128), 1, 2)
           || test_cast(RandomMat(127), 1, 2)
           || test_cast(RandomMat(128), 2, 1)
           || test_cast(RandomMat(127), 2, 1)
           || test_cast(RandomMat(128), 1, 4)
           || test_cast(RandomMat(127), 1, 4)
           || test_cast(RandomMat(128), 4, 1)
           || test_cast(RandomMat(127), 4, 1);
}

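// A fixed RNG seed keeps the random test inputs reproducible across runs.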
int main()
{
    SRAND(7767517);

    return 0
           || test_cast_0()
           || test_cast_1()
           || test_cast_2()
           || test_cast_3();
}