1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2017, Intel Corporation, all rights reserved.
14 // Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 //   * Redistribution's of source code must retain the above copyright notice,
21 //     this list of conditions and the following disclaimer.
22 //
23 //   * Redistribution's in binary form must reproduce the above copyright notice,
24 //     this list of conditions and the following disclaimer in the documentation
25 //     and/or other materials provided with the distribution.
26 //
27 //   * The name of the copyright holders may not be used to endorse or promote products
28 //     derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42 
43 #include "../../precomp.hpp"
44 
45 #include <opencv2/core/utils/configuration.private.hpp>
46 
47 #include <string>
48 #include <vector>
49 #include <fstream>
50 #include <sys/stat.h>
51 #include <assert.h>
52 #include "../include/common.hpp"
53 #include "../include/ocl4dnn.hpp"
54 #include "opencl_kernels_dnn.hpp"
55 #include "../include/math_functions.hpp"
56 #include "../include/default_kernel_config.hpp"
57 #include "opencv2/dnn/shape_utils.hpp"
58 #include "opencv2/core/utils/logger.hpp"
59 
60 #if defined WIN32 || defined _WIN32
61 #include <windows.h>
62 #include <direct.h>
63 #undef min
64 #undef max
65 #endif
66 
67 namespace cv { namespace dnn { namespace ocl4dnn {
68 static cv::Mutex kernelConfigMutex;
69 typedef std::map<std::string, std::string> kernel_hash_t;
70 static kernel_hash_t kernelConfigMap;
71 static bool defaultConfigLoaded = false;
72 
73 static bool enableWorkaroundIDLF()
74 {
75     static bool param = utils::getConfigurationParameterBool("OPENCV_OCL4DNN_WORKAROUND_IDLF", true);
76     return param;
77 }
78 
79 static bool dumpFailedResult()
80 {
81     static bool param = utils::getConfigurationParameterBool("OPENCV_OCL4DNN_DUMP_FAILED_RESULT", false);
82     return param;
83 }
84 
85 static size_t testAllKernels()
86 {
87     static size_t param = utils::getConfigurationParameterSizeT("OPENCV_OCL4DNN_TEST_ALL_KERNELS", 0);
88     return param;
89 }
90 
91 static bool raiseOnCheckError()
92 {
93     static bool param = utils::getConfigurationParameterBool("OPENCV_OCL4DNN_TUNING_RAISE_CHECK_ERROR", false);
94     return param;
95 }
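// The helpers above (and the cache/tuning settings read in the constructor below) are all
// driven by environment variables. An illustrative way to enable the on-disk kernel config
// cache and stricter verification might look like this (assumed POSIX shell; the variable
// names are the ones used in this file):
//   export OPENCV_OCL4DNN_CONFIG_PATH=/tmp/ocl4dnn-cache   # directory must already exist
//   export OPENCV_OCL4DNN_TEST_ALL_KERNELS=1               # see testAllKernels() above
//   export OPENCV_OCL4DNN_TUNING_RAISE_CHECK_ERROR=1       # turn verification failures into exceptions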
96 
97 static std::string sanitize(const std::string& s)
98 {
99     std::string s_ = s;
100     for (size_t i = 0; i < s_.size(); i++)
101     {
102         char c = s_[i];
103         if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'))
104         {
105             s_[i] = '_';
106         }
107     }
108     // TODO add hash?
109     // s_ = s_ + cv::format("_%08llx", crc64((uchar*)s.c_str(), s.size()));
110     return s_;
111 }
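// Illustrative example: sanitize("Intel(R) Corporation_EU24_k1x1") returns
// "Intel_R__Corporation_EU24_k1x1" - every character outside [0-9A-Za-z_] is replaced
// by '_' so the key can be used directly as a cache file name.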
112 
113 static void initializeGlobalBuiltinConfigurations(const std::string& cache_path)
114 {
115     CV_Assert(defaultConfigLoaded == false);
116     CV_Assert(kernelConfigMap.empty());
117 
118     /* fp32 config */
119     size_t numConfigs = sizeof(default_kernel_config_intel_fp32) /
120                         sizeof(default_kernel_config_intel_fp32[0]) / 2;
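    // default_kernel_config_intel_fp32 is a flat array of {kernel key, config string}
    // string pairs, hence the division by 2 and the 2*i / 2*i+1 indexing below.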
121     for (size_t i = 0; i < numConfigs; i++)
122     {
123         std::string key = std::string("Intel(R) Corporation_") + default_kernel_config_intel_fp32[2 * i];
124         if (!cache_path.empty())
125         {
126             std::string cacheFile = cache_path + sanitize(key);
127             std::ifstream cachedKernel(cacheFile.c_str());
128             if (cachedKernel)
129                 continue;  // external configuration found, skip builtin
130         }
131         std::pair<std::string, std::string> entry(
132                 key,
133                 default_kernel_config_intel_fp32[2 * i + 1]);
134         kernelConfigMap.insert(entry);
135     }
136 
137     /* fp16 config */
138     numConfigs = sizeof(default_kernel_config_intel_fp16) /
139                  sizeof(default_kernel_config_intel_fp16[0]) / 2;
140     for (size_t i = 0; i < numConfigs; i++)
141     {
142         std::string key = std::string("Intel(R) Corporation_") + default_kernel_config_intel_fp16[2 * i];
143         if (!cache_path.empty())
144         {
145             std::string cacheFile = cache_path + sanitize(key);
146             std::ifstream cachedKernel(cacheFile.c_str());
147             if (cachedKernel)
148                 continue;  // external configuration found, skip builtin
149         }
150         std::pair<std::string, std::string> entry(
151                 key,
152                 default_kernel_config_intel_fp16[2 * i + 1]);
153         kernelConfigMap.insert(entry);
154     }
155 
156     defaultConfigLoaded = true;
157 }
158 
159 
160 template<typename Dtype>
161 OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
162 {
163     bias_term_ = config.bias_term;
164     int dims = config.in_shape.size();
165     int spatial_dims = 2;
166 
167     channels_   = config.in_shape[dims - spatial_dims - 1];
168     num_output_ = config.out_shape[dims - spatial_dims - 1];
169     group_ = config.group;
170 
171     fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
172     fused_eltwise_ = false;
173     power_ = 1.f;
174     negative_slope_ = 0;
175     min_value_ = 0;
176     max_value_ = 0;
177     prev_kernel_type_ = -1;
178     tuned_ = false;
179     use_half_ = config.use_half;
180 
181     // assumption: spatial dimension is 2.
182     kernel_h_ = config.kernel.height;
183     kernel_w_ = config.kernel.width;
184     pad_h_ = config.pad.height;
185     pad_w_ = config.pad.width;
186     stride_h_ = config.stride.height;
187     stride_w_ = config.stride.width;
188     dilation_h_ = config.dilation.height;
189     dilation_w_ = config.dilation.width;
190     M_ = num_output_ / group_;
191     height_ = config.in_shape[dims - spatial_dims + 0];
192     width_ = config.in_shape[dims - spatial_dims + 1];
193     output_h_ = config.out_shape[dims - spatial_dims + 0];
194     output_w_ = config.out_shape[dims - spatial_dims + 1];
195     bottom_dim_ = channels_ * width_ * height_;
196     top_dim_ = num_output_ * output_w_ * output_h_;
197     int Ph = (output_h_ - 1) * stride_h_ + (dilation_h_ * (kernel_h_ - 1) + 1) - height_;
198     int Pw = (output_w_ - 1) * stride_w_ + (dilation_w_ * (kernel_w_ - 1) + 1) - width_;
199     Ph = (Ph > 0) ? Ph : 0;
200     Pw = (Pw > 0) ? Pw : 0;
201     pad_right_  = (Pw + 1) / 2;
202     pad_bottom_ = (Ph + 1) / 2;
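    // Worked example (illustrative numbers): for a 3x3 kernel, stride 1, dilation 1,
    // input 224x224 and output 224x224 this gives Ph = 223*1 + (1*2 + 1) - 224 = 2,
    // so pad_bottom_ = (2 + 1) / 2 = 1, and pad_right_ = 1 by the same computation for the width.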
203 
204     cache_path_ = utils::getConfigurationParameterString("OPENCV_OCL4DNN_CONFIG_PATH", "");
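    // Depthwise convolution: one filter per input channel (num_output_ == channels_ == group_),
    // e.g. the 3x3 depthwise layers of MobileNet-style networks.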
205     dwconv_ = (num_output_ == channels_ && channels_ == group_);
206 
207     use_cache_path_ = false;
208     if (!cache_path_.empty())
209     {
210 #if defined _WIN32
211         struct _stat file_stat;
212         use_cache_path_ = _stat(cache_path_.c_str(), &file_stat) == 0 &&
213                       ((_S_IFDIR & file_stat.st_mode) != 0);
214 #else
215         struct stat file_stat;
216         use_cache_path_ = stat(cache_path_.c_str(), &file_stat) == 0 &&
217                       S_ISDIR(file_stat.st_mode);
218 #endif
219         if (!use_cache_path_)
220         {
221             static int warn_ = 0;
222             if (!warn_)
223             {
224                 std::cerr
225                     << "OpenCV(ocl4dnn): Kernel configuration cache directory doesn't exist: " << cache_path_ << std::endl
226                     << std::endl;
227                 warn_ = true;
228             }
229         }
230     }
231 
232     run_auto_tuning_ = use_cache_path_ && !utils::getConfigurationParameterBool("OPENCV_OCL4DNN_DISABLE_AUTO_TUNING", false);
233     force_auto_tuning_ = utils::getConfigurationParameterBool("OPENCV_OCL4DNN_FORCE_AUTO_TUNING", false);
234 }
235 
236 template<typename Dtype>
237 OCL4DNNConvSpatial<Dtype>::~OCL4DNNConvSpatial()
238 {
239     if (!swizzled_weights_umat.empty()) {
240         swizzled_weights_umat.release();
241     }
242 }
243 
244 template<typename Dtype>
245 void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise)
246 {
247     if (fused_eltwise)
248         addDef("FUSED_CONV_ELTWISE", 1);
249 
250     switch (fused_activ) {
251         case OCL4DNN_CONV_FUSED_ACTIV_RELU:
252             addDef("FUSED_CONV_RELU", 1);
253             break;
254         case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
255             addDef("FUSED_CONV_PRELU", 1);
256             break;
257         case OCL4DNN_CONV_FUSED_ACTIV_POWER:
258             addDef("FUSED_CONV_POWER", 1);
259             break;
260         case OCL4DNN_CONV_FUSED_ACTIV_TANH:
261             addDef("FUSED_CONV_TANH", 1);
262             break;
263         case OCL4DNN_CONV_FUSED_ACTIV_RELU6:
264             addDef("FUSED_CONV_RELU6", 1);
265             break;
266         default:
267             ;
268     }
269     return;
270 }
271 
272 template<typename Dtype>
273 void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx)
274 {
275     if (fused_eltwise)
276         kernel.set(argIdx++, (cl_mem)bottom_data2_.handle(ACCESS_READ));
277 
278     switch (fused_activ) {
279         case OCL4DNN_CONV_FUSED_ACTIV_RELU:
280             kernel.set(argIdx++, (float)negative_slope_);
281             break;
282         case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
283             kernel.set(argIdx++, (cl_mem)negative_slope_umat_.handle(ACCESS_READ));
284             break;
285         case OCL4DNN_CONV_FUSED_ACTIV_POWER:
286             kernel.set(argIdx++, (float)power_);
287             break;
288         case OCL4DNN_CONV_FUSED_ACTIV_RELU6:
289             kernel.set(argIdx++, (float)min_value_);
290             kernel.set(argIdx++, (float)max_value_);
291             break;
292         default:
293             ;
294     }
295     return;
296 }
297 
298 typedef enum {
299     TYPE_FLOAT = 1,
300     TYPE_HALF = 2
301 } ocl4dnnConvSpatialType_t;
302 
303 template<typename Dtype>
304 void OCL4DNNConvSpatial<Dtype>::collectCommonInformation()
305 {
306     if (use_half_)
307     {
308         addDef("TYPE", TYPE_HALF);
309         addDef("Dtype", "half");
310         addDef("Dtype2", "half2");
311         addDef("Dtype4", "half4");
312         addDef("Dtype8", "half8");
313         addDef("Dtype16", "half16");
314         addDef("as_Dtype", "as_half");
315         addDef("as_Dtype2", "as_half2");
316         addDef("as_Dtype4", "as_half4");
317         addDef("as_Dtype8", "as_half8");
318     }
319     else
320     {
321         addDef("TYPE", TYPE_FLOAT);
322         addDef("Dtype", "float");
323         addDef("Dtype2", "float2");
324         addDef("Dtype4", "float4");
325         addDef("Dtype8", "float8");
326         addDef("Dtype16", "float16");
327         addDef("as_Dtype", "as_float");
328         addDef("as_Dtype2", "as_float2");
329         addDef("as_Dtype4", "as_float4");
330         addDef("as_Dtype8", "as_float8");
331     }
332 }
333 
334 typedef enum {
335     KERNEL_TYPE_INTEL_IDLF = 2,
336     KERNEL_TYPE_BASIC = 4,
337     KERNEL_TYPE_GEMM_LIKE = 5,
338     KERNEL_TYPE_DWCONV = 6
339 } ocl4dnnConvSpatialKernelType_t;
340 
341 template<typename Dtype>
342 void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
343                                                    int32_t blockM,
344                                                    int32_t blockK,
345                                                    int32_t blockN)
346 {
347     std::string kernelUKey;
348     int32_t simd_size;
349 
350     if (kernelType == KERNEL_TYPE_INTEL_IDLF) {
351         simd_size = blockN;
352         kernelUKey = generateSpecificKey(KERNEL_TYPE_INTEL_IDLF, blockM, blockK, 1);
353 
354         // kernel name
355         kernel_name_ = "IDLF_";
356         kernel_name_ += kernelUKey;
357         if (simd_size == 16)
358             kernel_name_ += "_SIMD16";
359         else
360             kernel_name_ += "_SIMD8";
361 
362         // options
363         options_ << " -cl-fast-relaxed-math -D KERNEL_IDLF -D convolve_simd=" << kernel_name_;
364         options_ << " -cl-mad-enable";
365         if (clOptionSupport("-cl-no-subgroup-ifp"))
366             options_ << " -cl-no-subgroup-ifp ";
367 
368         // defs
369         int32_t output_block_width = blockM;
370         int32_t output_block_height = blockK;
371         int tile_x = (output_block_width - 1) * stride_w_ + kernel_w_ * dilation_w_;
372         int tile_y = (output_block_height - 1) * stride_h_ + kernel_h_ * dilation_h_;
373         int invec_size = tile_y;
374 
375         addDef("SIMD_SIZE", simd_size);
376         addDef("OUT_BLOCK_WIDTH", output_block_width);
377         addDef("OUT_BLOCK_HEIGHT", output_block_height);
378         addDef("INPUT_DEPTH", channels_ / group_);
379         addDef("TOTAL_INPUT_DEPTH_SIZE", channels_);
380         addDef("TOTAL_OUTPUT_DEPTH", num_output_);
381         addDef("NUM_FILTERS", M_);
382         addDef("TILE_X", tile_x);
383         addDef("TILE_Y", tile_y);
384         addDef("INVEC_SIZE", invec_size);
385         addDef("ALIGNED_NUM_FILTERS", (int)alignSize(M_, simd_size));
386         addDef("OUT_BLOCK_SIZE", (output_block_width*output_block_height));
387         addDef("APPLY_BIAS", bias_term_);
388         addDef("WEIGHT_PREF", ((kernel_w_ * kernel_h_) == 1) ? 1 : 8);
389         addDef("INPUT_PITCH", (width_ * height_));
390         addDef("OUTPUT_PITCH", (output_w_ * output_h_));
391         addDef("LEFT_FILTERS", ((int)alignSize(M_, simd_size) - M_));
392         addDef("INPUT_WIDTH", width_);
393         addDef("INPUT_HEIGHT", height_);
394         addDef("FILTERS_IN_GROUP", ((int)alignSize(M_, simd_size) / simd_size));
395 
396         setFusionDefine(fused_activ_, fused_eltwise_);
397 
398         src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
399     }
400     else if (kernelType == KERNEL_TYPE_BASIC)
401     {
402         addDef("KERNEL_BASIC");
403 
404         kernelUKey = generateSpecificKey(KERNEL_TYPE_BASIC, blockM, blockK, blockN);
405         kernel_name_ = "BASIC_";
406         kernel_name_ += kernelUKey;
407 
408         // opts
409         options_ << " -cl-fast-relaxed-math -D ConvolveBasic=" << kernel_name_;
410         if (clOptionSupport("-cl-no-subgroup-ifp"))
411             options_ << " -cl-no-subgroup-ifp ";
412 
413         // defs
414         addDef("CHANNELS", channels_ / group_);
415         addDef("APPLY_BIAS", bias_term_);
416         addDef("OUTPUT_Z", M_);
417         addDef("ZPAR", 1);
418         setFusionDefine(fused_activ_, fused_eltwise_);
419 
420         src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
421     }
422     else if (kernelType == KERNEL_TYPE_GEMM_LIKE)
423     {
424         simd_size = blockK;
425         kernelUKey = generateSpecificKey(KERNEL_TYPE_GEMM_LIKE, blockM, blockK, blockN);
426 
427         kernel_name_ = "U_GEMM_LIKE_CONV_";
428         kernel_name_ += kernelUKey.c_str();
429         kernel_name_ += (blockK == 8) ? "_SIMD8" : "_SIMD16";
430         std::stringstream kernelDef;
431         kernelDef << "GEMM_LIKE_CONV_" << blockN << "_" << blockM;
432         if (blockK == 16)
433             kernelDef << "_SIMD16";
434 
435         // Build list of options and defines
436         options_ << " -cl-fast-relaxed-math " << " -D " << kernelDef.str()
437             << " -D Conv_Interleaved=" << kernel_name_.c_str();
438         options_ << " -cl-mad-enable";
439         if (clOptionSupport("-cl-no-subgroup-ifp"))
440             options_ << " -cl-no-subgroup-ifp ";
441 
442         addDef("KERNEL_GEMM_LIKE");
443         addDef("INPUT_DEPTH", channels_);
444         addDef("WIDTH1", M_);
445         addDef("OUT_PADDING_LEFT", 0);
446         addDef("OUT_PADDING_HEIGHT", 0);
447         addDef("OUT_DEPTH", M_);
448         addDef("NUM_BATCHES", num_);
449         addDef("DY", blockM);
450         addDef("DX", blockN);
451         addDef("KERNEL_WIDTH_DIV2", kernel_w_ / 2);
452         addDef("KERNEL_SLICE_DIV2", (kernel_w_ * kernel_h_) / 2);
453         addDef("TILE_N_LAST", M_ % 32);
454         addDef("TILE_N_LAST_DIV8", (M_ % 32) / 8);
455         addDef("APPLY_BIAS", bias_term_);
456         setFusionDefine(fused_activ_, fused_eltwise_);
457         src_ = ocl::dnn::conv_layer_spatial_oclsrc;
458     }
459     else if (kernelType == KERNEL_TYPE_DWCONV)
460     {
461         kernelUKey = generateSpecificKey(KERNEL_TYPE_DWCONV, blockM, blockK, blockN);
462         kernel_name_ = "DWCONV_";
463         kernel_name_ += kernelUKey.c_str();
464 
465         options_ << " -cl-fast-relaxed-math ";
466         if (clOptionSupport("-cl-no-subgroup-ifp"))
467             options_ << " -cl-no-subgroup-ifp ";
468 
469         addDef("KERNEL_DWCONV");
470         addDef("KERNEL_SIZE", kernel_w_ * kernel_h_);
471         addDef("KERNEL_W", kernel_w_);
472         addDef("KERNEL_H", kernel_h_);
473         addDef("APPLY_BIAS", bias_term_);
474         addDef("OUTPUT_Z", num_output_ * num_);
475         addDef("CHANNELS", num_output_);
476         setFusionDefine(fused_activ_, fused_eltwise_);
477 
478         options_ << " -D DWCONV=" << kernel_name_;
479         src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
480     }
481 }
482 
483 template<typename Dtype>
484 void OCL4DNNConvSpatial<Dtype>::setupKernel()
485 {
486     collectCommonInformation();
487 
488     addDef("KERNEL_WIDTH", kernel_w_);
489     addDef("KERNEL_HEIGHT" , kernel_h_);
490     addDef("STRIDE_X", stride_w_);
491     addDef("STRIDE_Y", stride_h_);
492     addDef("DILATION_X", dilation_w_);
493     addDef("DILATION_Y", dilation_h_);
494     if (kernelType_ != KERNEL_TYPE_BASIC)
495     {
496         addDef("INPUT_PAD_W", pad_w_);
497         addDef("INPUT_PAD_H", pad_h_);
498         addDef("INPUT_PAD_RIGHT", pad_right_);
499         addDef("INPUT_PAD_BOTTOM", pad_bottom_);
500     }
501 
502     setupKernelDetails(kernelType_, blockM_, blockK_, blockN_);
503 }
504 
505 template<typename Dtype>
506 void OCL4DNNConvSpatial<Dtype>::setBias(bool bias_term)
507 {
508     bias_term_ = bias_term;
509 }
510 
511 template<typename Dtype>
512 void OCL4DNNConvSpatial<Dtype>::setActivReLU(bool fuse_activ, float slope)
513 {
514     if ( fuse_activ )
515     {
516         fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_RELU;
517         negative_slope_ = slope;
518     }
519     else
520         fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
521 }
522 
523 template<typename Dtype>
524 void OCL4DNNConvSpatial<Dtype>::setActivReLU6(bool fuse_activ, float min, float max)
525 {
526     if ( fuse_activ )
527     {
528         fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_RELU6;
529         min_value_ = min;
530         max_value_ = max;
531     }
532     else
533         fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
534 }
535 
536 template<typename Dtype>
537 void OCL4DNNConvSpatial<Dtype>::setActivPReLU(bool fuse_activ, std::vector<float> &slope)
538 {
539     if ( fuse_activ )
540     {
541         fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_PRELU;
542         Mat tmpMat = Mat(num_output_, 1, CV_32FC1, (uchar*)&slope[0]);
543         tmpMat.copyTo(negative_slope_umat_);
544     }
545     else
546         fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
547 }
548 
549 template<typename Dtype>
550 void OCL4DNNConvSpatial<Dtype>::setActivPower(bool fuse_activ, float power)
551 {
552     if ( fuse_activ )
553     {
554         fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_POWER;
555         power_ = power;
556     }
557     else
558         fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
559 }
560 
561 template<typename Dtype>
562 void OCL4DNNConvSpatial<Dtype>::setActivTanh(bool fuse_activ)
563 {
564     if ( fuse_activ )
565     {
566         fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_TANH;
567     }
568     else
569         fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
570 }
571 
572 template<typename Dtype>
573 bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
574                                         const UMat& bottom2,
575                                         const UMat& weight,
576                                         const UMat& bias,
577                                         UMat& top,
578                                         int32_t numImages)
579 {
580     num_ = numImages;
581     if (!bottom2.empty())
582     {
583         fused_eltwise_ = true;
584         bottom_data2_ = bottom2;
585     }
586     else
587     {
588         fused_eltwise_ = false;
589     }
590 
591     if (use_half_ && !bias.empty())
592         CV_CheckTypeEQ(bias.type(), CV_16SC1, "");
593 
594     if (use_half_)
595         CV_CheckTypeEQ(weight.type(), CV_16SC1, "");
596 
597     prepareKernel(bottom, top, weight, bias, numImages);
598     if (bestKernelConfig.empty())
599         return false;
600     return convolve(bottom, top, weight, bias, numImages, bestKernelConfig);
601 }
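// A minimal usage sketch (illustrative only; the config field names are those read by the
// constructor above, in_shape/out_shape assumed to be NCHW MatShape values, error handling omitted):
//
//   OCL4DNNConvConfig cfg;
//   cfg.in_shape  = shape(1, 64, 56, 56);
//   cfg.out_shape = shape(1, 64, 56, 56);
//   cfg.kernel = Size(3, 3);  cfg.pad = Size(1, 1);
//   cfg.stride = Size(1, 1);  cfg.dilation = Size(1, 1);
//   cfg.group = 1;  cfg.bias_term = true;  cfg.use_half = false;
//   Ptr<OCL4DNNConvSpatial<float> > conv = makePtr<OCL4DNNConvSpatial<float> >(cfg);
//   conv->Forward(bottom, UMat(), weights, bias, top, 1);   // empty bottom2 => no fused eltwise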
602 
603 template<typename Dtype>
604 void OCL4DNNConvSpatial<Dtype>::calculateBenchmark(const UMat &bottom, UMat &verifyTop,
605                                                    const UMat &weight, const UMat &bias,
606                                                    int32_t numImages)
607 {
608     options_.str(""); options_.clear(); // clear contents and state flags
609     createBasicKernel(1, 1, 1);
610     CV_Assert(!kernelQueue.empty());  // basic kernel must be available
611     kernel_index_ = kernelQueue.size() - 1;
612     convolve(bottom, verifyTop, weight, bias, numImages, kernelQueue[kernel_index_]);
613     CV_Assert(phash.find(kernelQueue[kernel_index_]->kernelName) != phash.end());
614     //unloadProgram(kernelQueue[kernel_index_]->kernelName);
615     kernelQueue.pop_back();
616     return;
617 }
618 
619 // For large enough input sizes, we do not need to tune kernels for different
620 // sizes. The reason is that with a large input size there will be enough work
621 // items to feed all the EUs.
622 // FIXME for the GEMM-like convolution, switch back to the exact image size.
623 
624 #define TUNING_SIZE(x) ((x) > 256 ? 256 : (alignSize(x, 16)))
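// Illustrative values: TUNING_SIZE(64) == 64, TUNING_SIZE(100) == 112 (rounded up to a
// multiple of 16), TUNING_SIZE(300) == 256 (clamped), so many large inputs share one key.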
625 
626 template<typename Dtype>
627 void OCL4DNNConvSpatial<Dtype>::generateKey()
628 {
629     std::string precision = (use_half_) ? "FP16" : "FP32";
630     std::stringstream keyBuilder;
631     // FIXME: to support fuse?
632     keyBuilder << "k" << kernel_w_ << "x" << kernel_h_ << "_"
633                << "cn" << channels_ << "_"
634                << "g" << group_ << "_"
635                << "s" << stride_w_ << "x" << stride_h_ << "_"
636                << "d" << dilation_w_ << "x" << dilation_h_ << "_"
637                << "b" << bias_term_ << "_"
638                << "in" << TUNING_SIZE(width_) << "x" << TUNING_SIZE(height_) << "_"
639                << "p" << pad_w_ << "x" << pad_h_ << "_"
640                << "num" << num_ << "_"
641                << "M" << M_ << "_"
642                << "activ" << (int)fused_activ_ << "_"
643                << "eltwise" << fused_eltwise_ << "_"
644                << precision;
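    // An illustrative short key for a 3x3 convolution with 64 input channels, stride 1,
    // pad 1, a 224x224 input, no fused eltwise and FP32 would look roughly like
    // "k3x3_cn64_g1_s1x1_d1x1_b1_in224x224_p1x1_num1_M64_activ<n>_eltwise0_FP32",
    // where <n> is the numeric value of fused_activ_. key_ below additionally gets a
    // "<vendor>_EU<compute units>_" prefix, and key_sanitized_ is its sanitize()'d form.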
645 
646 
647     key_ = ocl::Device::getDefault().vendorName() + "_EU" + cv::format("%d", ocl::Device::getDefault().maxComputeUnits()) + "_" + keyBuilder.str();
648     key_sanitized_ = sanitize(key_);
649     short_key_ = keyBuilder.str();
650 }
651 
652 template<typename Dtype>
653 std::string OCL4DNNConvSpatial<Dtype>::generateSpecificKey(int32_t type, int32_t blockWidth,
654                                                            int32_t blockHeight, int32_t blockDepth)
655 {
656     std::stringstream keyBuilder;
657     keyBuilder << short_key_
658                << "_" << type
659                << "_" << blockWidth
660                << "_" << blockHeight
661                << "_" << blockDepth;
662 
663     return keyBuilder.str();
664 }
665 
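// Helper used by the GEMM-like path of swizzleWeight() below: pairs of consecutive rows of
// the r x c source matrix are interleaved blockWidth elements at a time (for interleavedRows
// rows), while the remaining nonInterleavedRows rows are copied in rowAlignment-sized chunks
// written at twice their source offset, producing the weight layout the GEMM_LIKE kernels read.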
666 template<typename Dtype>
667 void interleaveMatrix(Dtype* mem_dst, const Dtype *mem,
668                       int r, int c, int interleavedRows, int nonInterleavedRows,
669                       int blockWidth, int rowAlignment )
670 {
671     CHECK_EQ(interleavedRows % 2, 0) <<
672              "interleaveMatrix only supports even values for interleavedRows.";
673 
674     size_t memSize = r * c * sizeof(float);
675     size_t dstSize = memSize *
676                      (interleavedRows + nonInterleavedRows * 2) /
677                      (interleavedRows + nonInterleavedRows);
678     memset(mem_dst, 0, dstSize);    // NOLINT
679 
680     const int xStride = blockWidth;
681     const int yStride = c * 2;
682     const Dtype *pSrc = mem;
683     Dtype* pDst = mem_dst;
684     for (int y = 0; y < r;) {
685         for (int rows = 0; rows < interleavedRows; rows += 2) {
686             if ( y >= r ) break;
687             if ((c % xStride) == 0) {
688                 for (int x = 0; x < c / xStride; x++) {
689                     memcpy(pDst + x * xStride * 2,                         // NOLINT
690                            pSrc + x * xStride,     xStride * sizeof(Dtype));
691                     memcpy(pDst + x * xStride * 2 + xStride,               // NOLINT
692                            pSrc + x * xStride + c, xStride * sizeof(Dtype));
693                 }
694             } else {
695                 const int count = c / xStride;
696                 int x = 0;
697                 for (; x < count - 1; x++) {
698                     memcpy(pDst + x * xStride * 2,                          // NOLINT
699                            pSrc + x * xStride, xStride * sizeof(Dtype));
700                     memcpy(pDst + x * xStride * 2 + xStride,                // NOLINT
701                            pSrc + x * xStride + c, xStride * sizeof(Dtype));
702                 }
703                 memcpy(pDst + x * xStride * 2,                            // NOLINT
704                        pSrc + x * xStride, xStride * sizeof(Dtype));
705             }
706             pSrc += yStride;
707             pDst += yStride;
708             y += 2;
709         }
710 
711         for (int rows = 0; rows < nonInterleavedRows; rows++) {
712             if (y >= r) break;
713             const int stride = rowAlignment;
714             int remaining = c;
715             for (int x = 0; x < c; x += stride) {
716                 if (remaining >= stride) {
717                     memcpy(pDst + x * 2, pSrc + x, stride * sizeof(Dtype));    // NOLINT
718                     remaining -=stride;
719                 } else {
720                     memcpy(pDst + x * 2, pSrc + x, remaining * sizeof(Dtype));  // NOLINT
721                 }
722             }
723             pSrc += yStride / 2;
724             pDst += yStride;
725             y++;
726         }
727     }
728 }
729 
730 template<typename Dtype>
731 bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
732                                               int32_t swizzled_factor,
733                                               bool interleave)
734 {
735     // Simply skip the weight swizzle if we already have a swizzled_weights_umat
736     // from the test phase and we are not auto-tuning.
737     // This requires that we always call convolve again with the winning configuration
738     // during the auto-tuning stage.
739     if (tuned_ && !swizzled_weights_umat.empty())
740         return true;
741 
742     if (swizzled_weights_umat.empty())
743         swizzled_weights_umat.create(1, (int)alignSize(num_output_, 16) * channels_ *
744                                      kernel_h_ * (int)alignSize(kernel_w_, 2),
745                                      (use_half_) ? CV_16SC1 : CV_32FC1);
746 
747     if (!interleave) {
748         int32_t channels = channels_ / group_;
749 
750         ocl::Kernel oclk_copy_weight(
751             use_half_ ? "copyWeightsSwizzled_half" : "copyWeightsSwizzled_float",
752             cv::ocl::dnn::conv_spatial_helper_oclsrc,
753             use_half_ ? "-DHALF_SUPPORT=1 -DDtype=half" : "-DDtype=float"
754         );
755         if (oclk_copy_weight.empty())
756             return false;
757 
758         oclk_copy_weight.args(
759             ocl::KernelArg::PtrReadOnly(weight),
760             ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat),
761             kernel_w_,
762             kernel_h_,
763             channels,
764             num_output_,
765             swizzled_factor
766         );
767 
768         size_t global_work_size_copy[3] = {
769             (size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 };
770 
771         if (!oclk_copy_weight.run(3, global_work_size_copy, NULL, false))
772         {
773             std::cout << "Swizzle kernel run failed." << std::endl;
774             return false;
775         }
776     } else {
777         // assumption: kernel dimension is 2
778         Mat weightMat;
779         Mat swizzledWeightMat;
780         UMat weight_tmp; // FP32 in half mode, TODO implement FP16 repack
781         if (use_half_)
782         {
783             CV_CheckTypeEQ(weight.type(), CV_16SC1, "");
784             convertFp16(weight, weight_tmp);
785             weightMat = weight_tmp.getMat(ACCESS_READ);
786             swizzledWeightMat.create(shape(swizzled_weights_umat), CV_32F);
787         }
788         else
789         {
790             weightMat = weight.getMat(ACCESS_READ);
791             swizzledWeightMat = swizzled_weights_umat.getMat(ACCESS_WRITE);
792         }
793 
794         CV_CheckTypeEQ(weightMat.type(), CV_32FC1, "");
795         Dtype* cpu_weight = (Dtype *)weightMat.ptr<float>();
796         Dtype* cpu_swizzled_weight = (Dtype *)swizzledWeightMat.ptr<float>();
797 
798         int interleavedRows = (kernel_w_ / 2) * 2;
799         int nonInterleavedRows = kernel_w_ % 2;
800         int blockWidth = swizzled_factor;  // should be equal to the SIMD size.
801         int rowAlignment = 32;
802         size_t interleaved_filter_size = M_ * kernel_w_ * kernel_h_ * channels_ * sizeof(Dtype);
803         cv::AutoBuffer<Dtype, 0> tmpSwizzledWeight(interleaved_filter_size);
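        // The loop below transposes the weights from the [M_][channels_][kernel_h_][kernel_w_]
        // (output-major, OIHW) layout into [channels_ * kernel_h_ * kernel_w_][M_], i.e. the
        // output-channel index becomes the fastest-varying dimension before interleaving.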
804         for (int od = 0; od < M_; od++)
805             for (int id = 0; id < channels_; id++)
806                 for (int r = 0; r < kernel_h_; r++)
807                     for (int c = 0; c < kernel_w_; c++)
808                         tmpSwizzledWeight[((id * kernel_h_ + r)* kernel_w_ + c) * M_ + od] =
809                             cpu_weight[((od * channels_ + id) * kernel_h_ + r)*kernel_w_+c];
810 
811         interleaveMatrix(cpu_swizzled_weight,
812                          tmpSwizzledWeight.data(),
813                          kernel_w_ * kernel_h_ * channels_, M_,
814                          interleavedRows,
815                          nonInterleavedRows,
816                          blockWidth,
817                          rowAlignment);
818 
819         // unmap OpenCL buffers
820         weightMat.release();
821 
822         if (use_half_)
823             convertFp16(swizzledWeightMat, swizzled_weights_umat);
824     }
825 
826     return true;
827 }
828 
829 template<>
830 bool OCL4DNNConvSpatial<float>::createBasicKernel(int32_t blockWidth,
831                                                   int32_t blockHeight, int32_t blockDepth)
832 {
833     kernelType_ = KERNEL_TYPE_BASIC;
834     blockM_ = blockWidth;
835     blockK_ = blockHeight;
836     blockN_ = blockDepth;
837     setupKernel();
838 
839     ocl::Program program = compileKernel();
840     if (program.ptr())
841     {
842         int32_t workItemOutput[3] = { 1, 1, 1 };
843         size_t globalSize[3] = { (size_t)output_w_, (size_t)output_h_, (size_t)M_ };
844         kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &globalSize[0], (const size_t*)NULL, &workItemOutput[0],
845                                                     false, KERNEL_TYPE_BASIC));
846         return true;
847     }
848     else
849         return false;
850 }
851 
852 template<>
853 void OCL4DNNConvSpatial<float>::CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
854                                                 int32_t offset, int32_t size, bool write_only)
855 {
856     cl_mem sub_mem;
857     cl_buffer_region region;
858     cl_int err;
859     size_t element_size = (use_half_) ? sizeof(short) : sizeof(float);
860 
861     region.origin = offset * element_size + buffer.offset;
862     region.size = size * element_size;
863     sub_mem = clCreateSubBuffer((cl_mem)buffer.handle(ACCESS_READ),
864                                 write_only ? CL_MEM_WRITE_ONLY : CL_MEM_READ_ONLY,
865                                 CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
866     if (err)
867     {
868         std::cout << "Failed to create sub buffer." << std::endl;
869         return;
870     }
871 
872     int step = element_size, rows = size, cols = 1;
873     ocl::convertFromBuffer(sub_mem, step, rows, cols,
874                            (use_half_) ? CV_16SC1 : CV_32FC1, sub_buffer);
875 
876     //decrease ocl mem refcount
877     clReleaseMemObject(sub_mem);
878 }
879 
880 template<>
881 bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
882                                          const UMat &weight, const UMat &bias,
883                                          int32_t numImages, kernelConfig* config)
884 {
885     ocl::Program program;
886     phash_t::iterator it = phash.find(config->kernelName);
887     if (it != phash.end())
888         program = it->second;
889     else
890         return false;
891 
892     int32_t bias_offset;
893     int32_t element_size = use_half_ ? sizeof(short) : sizeof(float);
894 
895     if (config->kernelType == KERNEL_TYPE_INTEL_IDLF) {
896         if (!swizzleWeight(weight, config->workItem_output[2], false))
897             return false;
898         size_t total_bottom_size = bottom_dim_ * numImages;
899         size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_;
900         size_t total_bias_size = M_ * group_;
901         size_t total_top_size = top_dim_ * numImages;
902         for (int32_t g = 0; g < group_; ++g) {
903             bias_offset = M_ * g;
904             int32_t image_offset = width_ * height_ * (channels_ / group_) * g;
905             int32_t output_image_offset = output_w_ * output_h_ * M_ * g;
906             int32_t kernel_offset = kernel_h_ * kernel_w_ * (channels_ / group_) * M_ * g;
907 
908             ocl::Kernel kernel(config->kernelName.c_str(), program);
909             if (kernel.empty())
910                 return false;
911 
912             cl_uint argIdx = 0;
913             setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
914 
915             UMat img_buffer;
916             if (image_offset)
917             {
918                 CreateSubBuffer(bottom, img_buffer, image_offset,
919                                 total_bottom_size - image_offset, false);
920                 if (img_buffer.empty())
921                     return false;
922 
923                 kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer));
924             }
925             else
926             {
927                 kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
928             }
929 
930             UMat kernel_buffer;
931             if (kernel_offset)
932             {
933                 CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset,
934                                 total_kernel_size - kernel_offset, false);
935                 if (kernel_buffer.empty())
936                     return false;
937 
938                 kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer));
939             }
940             else
941             {
942                 kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
943             }
944 
945             UMat bias_buffer;
946             if (bias_term_)
947             {
948                 if (bias_offset)
949                 {
950                     CreateSubBuffer(bias, bias_buffer, bias_offset,
951                                     total_bias_size - bias_offset, false);
952                     if (bias_buffer.empty())
953                         return false;
954 
955                     kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer));
956                 }
957                 else
958                 {
959                     kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
960                 }
961             }
962 
963             UMat out_buffer;
964             if (output_image_offset)
965             {
966                 CreateSubBuffer(top, out_buffer, output_image_offset,
967                                 total_top_size - output_image_offset, true);
968                 if (out_buffer.empty())
969                     return false;
970 
971                 kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
972                 kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
973             }
974             else
975             {
976                 kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
977                 kernel.set(argIdx++, (int)(top.offset / element_size));
978             }
979 
980             kernel.set(argIdx++, (uint16_t)width_);
981             kernel.set(argIdx++, (uint16_t)height_);
982             kernel.set(argIdx++, (uint16_t)output_w_);
983             kernel.set(argIdx++, (uint16_t)output_h_);
984             if (!kernel.run(3, config->global_work_size, config->local_work_size, false))
985             {
986                 std::cout << "IDLF kernel run failed." << std::endl;
987                 return false;
988             }
989         }
990     } else if (config->kernelType == KERNEL_TYPE_GEMM_LIKE) {
991         if (!swizzleWeight(weight, config->workItem_output[1], true))
992             return false;
993         size_t total_bottom_size = bottom_dim_ * numImages;
994         size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_;
995         size_t total_bias_size = M_ * group_;
996         size_t total_top_size = top_dim_ * numImages;
997         for (int32_t g = 0; g < group_; ++g) {
998             bias_offset = M_ * g;
999             int32_t image_offset = width_ * height_ * (channels_ / group_) * g;
1000             int32_t output_image_offset = output_w_ * output_h_ * M_ * g;
1001             int32_t kernel_offset = kernel_h_ * kernel_w_ * (channels_ / group_) * M_ * g;
1002 
1003             ocl::Kernel kernel(config->kernelName.c_str(), program);
1004             if (kernel.empty())
1005                 return false;
1006 
1007             cl_uint argIdx = 0;
1008             setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
1009 
1010             UMat img_buffer;
1011             if (image_offset)
1012             {
1013                 CreateSubBuffer(bottom, img_buffer, image_offset,
1014                                 total_bottom_size - image_offset, false);
1015                 if (img_buffer.empty())
1016                     return false;
1017 
1018                 kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer));
1019             }
1020             else
1021             {
1022                 kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
1023             }
1024 
1025             UMat kernel_buffer;
1026             if (kernel_offset)
1027             {
1028                 CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset,
1029                                 total_kernel_size - kernel_offset, false);
1030                 if (kernel_buffer.empty())
1031                     return false;
1032 
1033                 kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer));
1034             }
1035             else
1036             {
1037                 kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
1038             }
1039 
1040             UMat bias_buffer;
1041             if (bias_term_)
1042             {
1043                 if (bias_offset)
1044                 {
1045                     CreateSubBuffer(bias, bias_buffer, bias_offset,
1046                                     total_bias_size - bias_offset, false);
1047                     if (bias_buffer.empty())
1048                         return false;
1049 
1050                     kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer));
1051                 }
1052                 else
1053                 {
1054                     kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
1055                 }
1056             }
1057 
1058             UMat out_buffer;
1059             if (output_image_offset)
1060             {
1061                 CreateSubBuffer(top, out_buffer, output_image_offset,
1062                                 total_top_size - output_image_offset, true);
1063                 if (out_buffer.empty())
1064                     return false;
1065 
1066                 kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
1067                 kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
1068             }
1069             else
1070             {
1071                 kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
1072                 kernel.set(argIdx++, (int)(top.offset / element_size));
1073             }
1074 
1075             kernel.set(argIdx++, (uint16_t)width_);
1076             kernel.set(argIdx++, (uint16_t)height_);
1077             kernel.set(argIdx++, (uint16_t)output_w_);
1078             kernel.set(argIdx++, (uint16_t)output_h_);
1079 
1080             int out_pitch_y = output_w_ * output_h_;
1081             int out_pitch_z = out_pitch_y * M_;
1082             int aligned_input_size = height_ * width_ * channels_ / group_;
1083             int slice_pitch = width_ * height_;
1084             kernel.set(argIdx++, (uint32_t)out_pitch_y);
1085             kernel.set(argIdx++, (uint32_t)out_pitch_z);
1086             kernel.set(argIdx++, (uint32_t)aligned_input_size);
1087             kernel.set(argIdx++, (uint32_t)slice_pitch);
1088 
1089             int blockM = config->workItem_output[0];
1090             int blockK = config->workItem_output[1];
1091             int blockN = config->workItem_output[2];
1092             int alignedFilterWidth = alignSize(M_, blockN);
1093             int alignedExpandHeight = alignSize(output_w_ * output_h_, blockM);
1094             int globalWorkSizeDX = blockN;
1095             int globalWorkSizeDY = blockM;
1096             size_t sgemm_m = alignedExpandHeight;
1097             size_t sgemm_n = alignedFilterWidth;
1098             size_t gx = divUp(sgemm_n, globalWorkSizeDX);
1099             size_t gy = divUp(sgemm_m, globalWorkSizeDY);
1100             gy = alignSize(gy, blockK);
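            // Illustrative numbers (assuming a typical blockM=1, blockK=8, blockN=32 config,
            // M_=64 and a 56x56 output): sgemm_n = alignSize(64, 32) = 64,
            // sgemm_m = alignSize(3136, 1) = 3136, gx = divUp(64, 32) = 2,
            // gy = alignSize(divUp(3136, 1), 8) = 3136.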
1101             size_t global_size[3] = { gx, gy, config->global_work_size[2] };
1102 
1103             if (!kernel.run(3, global_size, config->local_work_size, false))
1104             {
1105                 std::cout << "GEMM like kernel run failed." << std::endl;
1106                 return false;
1107             }
1108         }
1109     } else if (config->kernelType == KERNEL_TYPE_DWCONV) {
1110         ocl::Kernel kernel(config->kernelName.c_str(), program);
1111         if (kernel.empty())
1112             return false;
1113 
1114         cl_uint argIdx = 0;
1115         setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
1116         kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
1117         kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
1118         if (bias_term_)
1119             kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
1120         kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
1121         kernel.set(argIdx++, (int)(top.offset / element_size));
1122         kernel.set(argIdx++, (uint16_t)width_);
1123         kernel.set(argIdx++, (uint16_t)height_);
1124         kernel.set(argIdx++, (uint16_t)output_w_);
1125         kernel.set(argIdx++, (uint16_t)output_h_);
1126 
1127         size_t global_size[3];
1128         global_size[0] = output_w_;
1129         global_size[1] = output_h_;
1130         global_size[2] = num_output_ * num_;
1131 
1132         if (!kernel.run(3, global_size, NULL, false))
1133         {
1134             std::cout << "DWCONV kernel run failed." << std::endl;
1135             return false;
1136         }
1137     } else {
1138         for (int32_t n = 0; n < numImages; ++n) {
1139             for (int32_t g = 0; g < group_; ++g) {
1140                 bias_offset = M_ * g;
1141                 int32_t image_offset = n * bottom_dim_
1142                     + width_ * height_ * (channels_ / group_) * g;
1143                 int32_t output_image_offset = n * top_dim_
1144                     + output_w_ * output_h_ * M_ * g;
1145 
1146                 int32_t kernel_offset = kernel_h_ * kernel_w_ *
1147                                        (channels_ / group_) * M_
1148                                        * g;
1149 
1150                 ocl::Kernel kernel(config->kernelName.c_str(), program);
1151                 if (kernel.empty())
1152                     return false;
1153 
1154                 cl_uint argIdx = 0;
1155                 setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
1156                 kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
1157                 kernel.set(argIdx++, image_offset);
1158                 kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
1159                 kernel.set(argIdx++, kernel_offset);
1160                 if (bias_term_)
1161                     kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
1162                 else
1163                     kernel.set(argIdx++, (void *)NULL);
1164                 kernel.set(argIdx++, bias_offset);
1165                 kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
1166                 kernel.set(argIdx++, (int)(top.offset / element_size));
1167                 kernel.set(argIdx++, output_image_offset);
1168                 kernel.set(argIdx++, (uint16_t)width_);
1169                 kernel.set(argIdx++, (uint16_t)height_);
1170                 kernel.set(argIdx++, (uint16_t)output_w_);
1171                 kernel.set(argIdx++, (uint16_t)output_h_);
1172                 kernel.set(argIdx++, (uint16_t)pad_w_);
1173                 kernel.set(argIdx++, (uint16_t)pad_h_);
1174                 if (!kernel.run(3, config->global_work_size,
1175                                 (config->use_null_local) ? NULL : config->local_work_size,
1176                                 false))
1177                 {
1178                     std::cout << "Basic kernel run failed." << std::endl;
1179                     return false;
1180                 }
1181             }
1182         }
1183     }
1184 
1185     return true;
1186 }
1187 
1188 template<>
1189 float OCL4DNNConvSpatial<float>::timedConvolve(const UMat &bottom, UMat &top,
1190                                                const UMat &weight, const UMat &bias,
1191                                                int32_t numImages, kernelConfig* config)
1192 {
1193     cv::ocl::Queue queue;
1194     try
1195     {
1196         queue = cv::ocl::Queue::getDefault();
1197     }
1198     catch (const cv::Exception&)
1199     {
1200         static int warn_ = 0;
1201         if (!warn_)
1202         {
1203             std::cout << "OpenCV(ocl4dnn): Can't get OpenCL default queue for auto-tuning." << std::endl;
1204             warn_ = true;
1205         }
1206         return 1e6;
1207     }
1208 
1209     // warm up.
1210     bool saved_tuned = tuned_;
1211     tuned_ = false;
1212     convolve(bottom, top, weight, bias, numImages, config);
1213 
1214     cv::ocl::Timer timer(queue);
1215     timer.start();
1216     bool res = true;
1217     CV_LOG_INFO(NULL, "Benchmarking kernel: " << config->kernelName);
1218     tuned_ = true;
1219     int loop_cnt = 4;
1220     for (int i = 0; i < loop_cnt; i++) {
1221         res = convolve(bottom, top, weight, bias, numImages, config);
1222         if (!res)
1223             break;
1224     }
1225     tuned_ = saved_tuned;
1226     timer.stop();
1227     if (!res) {
1228         config->tested = true;
1229         config->verified = false;
1230         return 1e5;
1231     }
1232 
1233     float elapsedTime = timer.durationNS() * 1e-6 / loop_cnt;
1234     double out_w = output_w_;
1235     double out_h = output_h_;
1236     double out_z = M_;
1237     double k_w = kernel_w_;
1238     double k_h = kernel_h_;
1239     double k_z = channels_;
1240     double totalFlops = ((k_w*k_h*k_z -1)*2)*(out_w*out_h*out_z)*num_;
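    // Illustrative arithmetic (assuming a 3x3x64 kernel, a 112x112x128 output and num_ = 1):
    // totalFlops = (3*3*64 - 1)*2 * (112*112*128) ~= 1.85e9, i.e. about 1.85 GFLOPs per pass.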
1241     CV_LOG_INFO(NULL, "\tEstimated Gflops:" << (totalFlops * 1e-9));
1242     CV_LOG_INFO(NULL, "\tEstimated GFLOPS/S: " << ((totalFlops * 1e-9)*(1000.0/elapsedTime)));
1243     return elapsedTime;
1244 }
1245 
1246 template<>
1247 bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom,
1248                                              UMat &top,
1249                                              const UMat &weight,
1250                                              const UMat &bias,
1251                                              int32_t numImages,
1252                                              kernelConfig* config,
1253                                              UMat &verifyTop)
1254 {
1255     if (config->verified)
1256         return true;
1257     else if (config->tested)
1258         return false;
1259 
1260     int32_t sz[4] = {numImages, num_output_, output_h_, output_w_};
1261     top.zeros(4, sz, (use_half_) ? CV_16SC1 : CV_32FC1);
1262     bool saved_tuned = tuned_;
1263     tuned_ = false;
1264     convolve(bottom, top, weight, bias, numImages, config);
1265     tuned_ = saved_tuned;
1266 
1267     config->tested = true;
1268 
1269     UMat new_top, new_verify_top;
1270     Mat mat_top, mat_verify_top;
1271     if (use_half_)
1272     {
1273         convertFp16(top, new_top);
1274         convertFp16(verifyTop, new_verify_top);
1275 
1276         mat_top = new_top.getMat(ACCESS_READ);
1277         mat_verify_top = new_verify_top.getMat(ACCESS_READ);
1278     }
1279     else
1280     {
1281         mat_top = top.getMat(ACCESS_READ);
1282         mat_verify_top = verifyTop.getMat(ACCESS_READ);
1283     }
1284     const float* data = mat_top.ptr<float>();
1285     const float* verify_data = mat_verify_top.ptr<float>();
1286 
1287     int error_slice_offset = 0;
1288     int error_slice = 0;
1289     float relative_eps = use_half_ ? 0.1f : 0.01f;
1290 
1291     size_t errors = 0;
1292 
1293     double rel_err = norm(mat_top.reshape(1, 1), mat_verify_top.reshape(1, 1), NORM_L1 | NORM_RELATIVE);
1294     if (rel_err >= relative_eps)
1295     {
1296         for (int32_t n = 0; n < num_; ++n) {
1297             for (int32_t g = 0; g < group_; ++g) {
1298                 int32_t output_image_offset = n * top_dim_ + output_w_ * output_h_ * M_ * g;
1299                 for (int out_ch = 0; out_ch < M_; out_ch++)
1300                     for (int h = 0; h < output_h_; h++)
1301                         for (int w = 0; w < output_w_; w++) {
1302                             size_t offset = output_image_offset + out_ch * output_w_ * output_h_ + h * output_w_ + w;
1303 
1304                             bool has_error = !(data[offset] == data[offset]);  // is NaN
1305                             if (!has_error)
1306                             {
1307                                 float error_factor = std::fabs(data[offset] - verify_data[offset]);
1308                                 float base_value_abs = std::max(1e-3f, std::fabs(verify_data[offset]));
1309                                 has_error = error_factor > relative_eps * base_value_abs;
1310                             }
1311                             if (has_error)
1312                             {
1313                                 if (errors == 0)
1314                                 {
1315                                     error_slice = (int)(offset / (output_w_ * output_h_));
1316                                     error_slice_offset = (int)(offset % (output_w_ * output_h_));
1317                                     CV_LOG_ERROR(NULL, "Kernel: " << config->kernelName);
1318                                 }
1319                                 if (errors < 10)
1320                                     CV_LOG_ERROR(NULL, "test verification failed @ image " << n << " group " << g
1321                                             << " out_ch " << out_ch << " h " << h << " w " << w
1322                                             << " (offset: " << offset << ")"
1323                                             << " got " << data[offset] << " expected " << verify_data[offset]);
1324                                 errors++;
1325                             }
1326                         }
1327             }
1328         }
1329     }
1330 
1331     if (errors)
1332     {
1333         if (dumpFailedResult())
1334         {
1335             try
1336             {
1337                 int n_outputs = (int)(mat_top.size[0]*mat_top.size[1]);
1338                 int slice_size = (int)(mat_top.total() / n_outputs);
1339                 Rect roi(0, 0, slice_size, n_outputs);
1340                 roi.width = std::min(roi.width, 32);
1341                 roi.height = std::min(roi.height, 16);
1342                 roi.x = std::max(0, std::min(slice_size - roi.width, error_slice_offset - roi.width/2));
1343                 roi.y = std::max(0, std::min(n_outputs - roi.height, error_slice - roi.height/2));
1344                 std::cout << "roi = " << roi << " errors=" << errors << std::endl;
1345                 std::cout << "mat_top = " << shape(mat_top) << std::endl
1346                           << mat_top.reshape(1, 1).reshape(1, n_outputs)(roi) << std::endl;
1347                 std::cout << "verify_top = " << shape(mat_verify_top) << std::endl
1348                           << mat_verify_top.reshape(1, 1).reshape(1, n_outputs)(roi) << std::endl;
1349             }
1350             catch (const std::exception& e)
1351             {
1352                 CV_LOG_ERROR(NULL, "Results dump failed: " << e.what());
1353             }
1354             catch (...)
1355             {
1356                 CV_LOG_ERROR(NULL, "Results dump failed");
1357             }
1358         }
1359 
1360         if (raiseOnCheckError())
1361             CV_Error_(Error::StsError, ("ocl4dnn tuning verification failed: %s (errors %lld)", config->kernelName.c_str(), (long long int)errors));
1362         return false;
1363     }
1364     else
1365     {
1366         config->verified = true;
1367         return true;
1368     }
1369 }
1370 
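// Drop the compiled program for kernelName from the per-instance program cache
// (phash) and unload it from the default OpenCL context.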
1371 template<typename Dtype>
1372 void OCL4DNNConvSpatial<Dtype>::unloadProgram(const std::string& kernelName)
1373 {
1374     ocl::Program program;
1375     phash_t::iterator it = phash.find(kernelName);
1376     if (it != phash.end())
1377     {
1378         program = it->second;
1379         it->second = ocl::Program();
1380     }
1381     else
1382         return;
1383 
1384     ocl::Context ctx = ocl::Context::getDefault();
1385     ctx.unloadProg(program);
1386 }
1387 
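// Build the current kernel source (src_) with the accumulated build options
// (options_) and cache the result in phash by kernel name. Note that a failed
// build is cached as an empty program, so callers must check program.ptr().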
1388 template<typename Dtype>
1389 ocl::Program OCL4DNNConvSpatial<Dtype>::compileKernel()
1390 {
1391     phash_t::iterator it = phash.find(kernel_name_);
1392     if (it != phash.end())
1393     {
1394         return it->second;
1395     }
1396 
1397     String errmsg;
1398     ocl::Context ctx = ocl::Context::getDefault();
1399     std::string options = options_.str();
1400     CV_Assert(options.size() != 0);
1401     ocl::Program program = ctx.getProg(src_, options, errmsg);
1402 
1403     phash.insert(std::pair<std::string, ocl::Program>(kernel_name_, program));
1404     if (!program.ptr())
1405     {
1406         std::cout << "Failed to compile kernel: " << kernel_name_
1407                   << ", buildflags: " << options
1408                   << ", errmsg: " << errmsg << std::endl;
1409     }
1410     return program;
1411 }
1412 
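// Configure and compile a GEMM-like convolution kernel; blockK doubles as the
// SIMD width. The candidate is rejected when the compiler's preferred
// work-group size multiple differs from the required SIMD width.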
1413 template<>
1414 bool OCL4DNNConvSpatial<float>::createGEMMLikeConvKernel(int32_t blockM,
1415                                                          int32_t blockK,
1416                                                          int32_t blockN)
1417 {
1418     int32_t simd_size = blockK;
1419 
1420     int workItemOutput[3] = { blockM, blockK, blockN };
1421     size_t gx = (size_t)divUp(M_, blockN);
1422     size_t gy = (size_t)divUp(output_w_ * output_h_, blockM);
1423     gy = alignSize(gy, simd_size);
1424     size_t gz = num_;
1425     size_t global_size[3] = { gx, gy, gz };
1426     size_t local_size[3] = { 1, static_cast<size_t>(simd_size), 1 };
1427 
1428     kernelType_ = KERNEL_TYPE_GEMM_LIKE;
1429     blockM_ = blockM;
1430     blockK_ = blockK;
1431     blockN_ = blockN;
1432     setupKernel();
1433 
1434     ocl::Program program = compileKernel();
1435     if (program.ptr())
1436     {
1437         size_t workgroupSize_used;
1438         ocl::Kernel kernel(kernel_name_.c_str(), program);
1439         if (kernel.empty())
1440             return false;
1441 
1442         workgroupSize_used = kernel.preferedWorkGroupSizeMultiple();
1443         if (workgroupSize_used != simd_size)
1444         {
1445             std::cerr << "OpenCV(ocl4dnn): The OpenCL compiler chose a simd size (" << workgroupSize_used << ") that " << std::endl;
1446             std::cerr << "                 does not equal the size (" << simd_size << ") required by the kernel source." << std::endl;
1447             std::cerr << "                 Skip this kernel " << kernel_name_ << std::endl;
1448             unloadProgram(kernel_name_);
1449             return false;
1450         }
1451         else
1452         {
1453             kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &global_size[0], &local_size[0], &workItemOutput[0],
1454                                                         true, KERNEL_TYPE_GEMM_LIKE));
1455             return true;
1456         }
1457     }
1458     else
1459         return false;
1460 }
1461 
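// Configure and compile an IDLF kernel that produces a blockWidth x blockHeight
// output tile per sub-group of simd_size work-items. Known-problematic tile
// shapes are skipped when the IDLF workaround is enabled.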
1462 template<>
1463 bool OCL4DNNConvSpatial<float>::createIDLFKernel(int32_t blockWidth,
1464                                                  int32_t blockHeight,
1465                                                  int32_t simd_size)
1466 {
1467     int32_t workItemOutput[3] = { blockWidth, blockHeight, simd_size };
1468     const int32_t num_output_maps = M_;
1469     int32_t output_width = output_w_;
1470     int32_t output_height = output_h_;
1471     int32_t output_block_width = blockWidth;
1472     int32_t output_block_height = blockHeight;
1473     int32_t num_batches = num_;
1474 
1475     size_t global_size[3] = {
1476         (size_t)divUp(output_width, output_block_width),
1477         (size_t)divUp(output_height, output_block_height),
1478         (size_t)num_batches * alignSize(num_output_maps, simd_size) };
1479     size_t local_size[3] = { 1, 1, static_cast<size_t>(simd_size) };
1480 
1481     kernelType_ = KERNEL_TYPE_INTEL_IDLF;
1482     blockM_ = blockWidth;
1483     blockK_ = blockHeight;
1484     blockN_ = simd_size;
1485 
1486     setupKernel();
1487 
1488     if (enableWorkaroundIDLF() && ocl::Device::getDefault().intelSubgroupsSupport())
1489     {
1490         // Issues are observed with these kernels: 3x1 (covered by tests), 2x1, 4x1, 5x1, 3x2
1491         // kernels 1x3, 3x3, 2x3 are good
1492         if (pad_h_ != 0 && kernel_w_ <= simd_size && kernel_h_ <= 2)
1493         {
1494             CV_LOG_INFO(NULL, "DNN(workaround): skip IDLF kernel: " << kernel_name_);
1495             return false;
1496         }
1497     }
1498 
1499     ocl::Program program = compileKernel();
1500     if (program.ptr())
1501     {
1502         size_t workgroupSize_used;
1503         ocl::Kernel kernel(kernel_name_.c_str(), program);
1504         if (kernel.empty())
1505             return false;
1506 
1507         workgroupSize_used = kernel.preferedWorkGroupSizeMultiple();
1508         if (workgroupSize_used != simd_size)
1509         {
1510             std::cerr << "OpenCV(ocl4dnn): The OpenCL compiler chose a simd size (" << workgroupSize_used << ") that " << std::endl;
1511             std::cerr << "                 does not equal the size (" << simd_size << ") required by the kernel source." << std::endl;
1512             std::cerr << "                 Skip this kernel " << kernel_name_ << std::endl;
1513             unloadProgram(kernel_name_);
1514             return false;
1515         }
1516         else
1517         {
1518             kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &global_size[0], &local_size[0], &workItemOutput[0],
1519                                                         true, KERNEL_TYPE_INTEL_IDLF));
1520             return true;
1521         }
1522     }
1523     else
1524         return false;
1525 }
1526 
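// Configure and compile the depthwise convolution kernel (one output element
// per work-item). Applies only when the layer was detected as depthwise.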
1527 template<>
1528 bool OCL4DNNConvSpatial<float>::createDWConvKernel(int32_t blockWidth,
1529                                                    int32_t blockHeight,
1530                                                    int32_t blockDepth)
1531 {
1532     if (!dwconv_)
1533         return false;
1534 
1535     int workItemOutput[3] = { 1, 1, 1 };
1536     size_t local_size[3] = { 1, 1, 1 };
1537     size_t global_size[3];
1538     global_size[0] = divUp(output_w_, workItemOutput[0]);
1539     global_size[1] = divUp(output_h_, workItemOutput[1]);
1540     global_size[2] = divUp(M_ * num_, workItemOutput[2]);
1541 
1542     kernelType_ = KERNEL_TYPE_DWCONV;
1543     blockM_ = blockWidth;
1544     blockK_ = blockHeight;
1545     blockN_ = blockDepth;
1546 
1547     setupKernel();
1548 
1549     ocl::Program program = compileKernel();
1550     if (program.ptr())
1551     {
1552         kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &global_size[0], &local_size[0],
1553                               &workItemOutput[0], false, KERNEL_TYPE_DWCONV));
1554         return true;
1555     }
1556     else
1557         return false;
1558 }
1559 
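// Reset the build options and program source, then dispatch kernel creation
// according to the requested kernel type.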
1560 template<>
1561 bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
1562                                                         int32_t blockWidth,
1563                                                         int32_t blockHeight,
1564                                                         int32_t blockDepth)
1565 {
1566     kernelType_ = kernelType;
1567     options_.str(""); options_.clear(); // clear contents and state flags
1568     src_ = ocl::ProgramSource();
1569 
1570     if (kernelType == KERNEL_TYPE_INTEL_IDLF)
1571         return createIDLFKernel(blockWidth, blockHeight, blockDepth);
1572     else if (kernelType == KERNEL_TYPE_BASIC)
1573         return createBasicKernel(blockWidth, blockHeight, blockDepth);
1574     else if (kernelType == KERNEL_TYPE_GEMM_LIKE)
1575         return createGEMMLikeConvKernel(blockWidth, blockHeight, blockDepth);
1576     else if (kernelType == KERNEL_TYPE_DWCONV)
1577         return createDWConvKernel(blockWidth, blockHeight, blockDepth);
1578     else
1579         CV_Assert(0 && "Internal error");
1580     return false;
1581 }
1582 
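// Append a GEMM-like tuning candidate when the layer shape and the block
// parameters satisfy the kernel's constraints (single group, suitable M_ and
// block sizes).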
1583 template<>
1584 void OCL4DNNConvSpatial<float>::generate_gemmlike_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
1585                                                              int blockM, int blockK, int blockN)
1586 {
1587     if (group_ != 1 || ((M_ % 8 != 0) || (M_ % 32 == 24)))
1588         return;
1589 
1590     if (blockM != 1 && blockM != 2)
1591         return;
1592 
1593     if (blockN != 32)
1594         return;
1595 
1596     if (blockK != 8 && blockK != 16)
1597         return;
1598 
1599     if (blockK == 16)
1600     {
1601         if ((blockM == 1 && (kernel_w_ > 4)) || M_ % 32 != 0)
1602             return;
1603         if ((blockM == 2) || M_ % 32 != 0)
1604             return;
1605     }
1606 
1607     tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, blockM, blockK, blockN));
1608 }
1609 
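// Append an IDLF tuning candidate when the requested output tile and SIMD
// width fit the output shape and the device's sub-group constraints.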
1610 template<>
1611 void OCL4DNNConvSpatial<float>::generate_idlf_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
1612                                                          int blockM, int blockK, int simd_size)
1613 {
1614     int max_compute_units = ocl::Device::getDefault().maxComputeUnits();
1615 
1616     if (simd_size != 8 && simd_size != 16)
1617         return;
1618 
1619     if (simd_size == 8 && !((group_ == 1 || M_ % 8 == 0)))
1620         return;
1621 
1622     if (simd_size == 16 && !(group_ == 1 || M_ % 16 == 0))
1623         return;
1624 
1625     int width_max, height_max, block_size_max;
1626     width_max = 14;
1627     height_max = 14;
1628     block_size_max = 32;
1629 
1630     if (blockM > width_max)
1631         return;
1632     if (blockK > height_max)
1633         return;
1634 
1635     if (blockM > output_w_)
1636         return;
1637     if (blockK > output_h_)
1638         return;
1639 
1640     // Tune for SIMD 8 only when the estimated work-item count is below the
1641     // device capacity (max_compute_units * 7 * 16) or when M_ is less
1642     // than 16.
1643     if (simd_size == 8 &&  M_ >= 16 &&
1644         ((num_ * M_ * output_w_ * output_h_ / static_cast<float>(blockM * blockK)) >=
1645         max_compute_units * 7 * 16))
1646         return;
1647 
1648     int actual_tile_x = kernel_w_ * dilation_w_ + (blockM - 1) * stride_w_ ;
1649     int tile_x = alignSize(actual_tile_x, simd_size);
1650     if (tile_x > simd_size)
1651         return;
1652 
1653     if (blockM * blockK > block_size_max)
1654         return;
1655 
1656     tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_INTEL_IDLF, blockM, blockK, simd_size));
1657 }
1658 
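// Append the depthwise convolution candidate (depthwise layers only).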
1659 template<>
1660 void OCL4DNNConvSpatial<float>::generate_dwconv_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
1661                                                            int blockM, int blockK, int blockN)
1662 {
1663     if (!dwconv_)
1664         return;
1665 
1666     tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_DWCONV, blockM, blockK, blockN));
1667 }
1668 
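// Build the ordered list of kernel candidates to try: depthwise first, then the
// GEMM-like variants, then IDLF tiles from large to small. Requires Intel
// sub-group support; otherwise the list stays empty and callers fall back to
// the basic kernel.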
1669 template<>
1670 void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems)
1671 {
1672     if (ocl::Device::getDefault().intelSubgroupsSupport())
1673     {
1674         // depthwise kernel
1675         generate_dwconv_tuneritems(tunerItems, 1, 1, 1);
1676         if (tunerItems.size() > 0 && group_ > 8)
1677             return;
1678 
1679         // gemm like kernel
1680         generate_gemmlike_tuneritems(tunerItems, 1, 8, 32);
1681         generate_gemmlike_tuneritems(tunerItems, 2, 8, 32);
1682         generate_gemmlike_tuneritems(tunerItems, 1, 16, 32);
1683         generate_gemmlike_tuneritems(tunerItems, 2, 16, 32);
1684 
1685         // idlf kernel
1686         for (int simd_size = 8; simd_size <= 16; simd_size += 8)
1687         {
1688             int width_max, height_max;
1689             width_max = 14;
1690             height_max = 14;
1691             for (uint32_t width = width_max; width > 0; width--)
1692             {
1693                 for (uint32_t height = height_max; height > 0; height--)
1694                 {
1695                     generate_idlf_tuneritems(tunerItems, width, height, simd_size);
1696                 }
1697             }
1698         }
1699     }
1700 }
1701 
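// Non-tuning path: walk the candidate list (plus the basic kernel appended at
// the end) and select the first kernel that compiles and passes verification
// against the reference output in verifyTop.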
1702 template<>
1703 void OCL4DNNConvSpatial<float>::useFirstAvailable(const UMat &bottom,
1704                                                   UMat &top,
1705                                                   const UMat &weight,
1706                                                   const UMat &bias,
1707                                                   int32_t numImages,
1708                                                   UMat &verifyTop)
1709 {
1710     std::vector< cv::Ptr<tunerParam> > tunerItems;
1711     generateTunerItems(tunerItems);
1712     tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_BASIC, 1, 1, 1));
1713 
1714     for (int i = 0; i < tunerItems.size(); i++)
1715     {
1716         if (createConvolutionKernel(tunerItems[i]->kernelType,
1717                                     tunerItems[i]->blockWidth,
1718                                     tunerItems[i]->blockHeight,
1719                                     tunerItems[i]->blockDepth))
1720         {
1721             CV_Assert(!kernelQueue.empty());  // basic kernel must be available
1722             int kernelIdx = kernelQueue.size() - 1;
1723             kernelConfig* config = kernelQueue[kernelIdx].get();
1724             bool failed = false;
1725             const size_t testCount = testAllKernels();
1726             for(int t = 0; t < testCount; t++)
1727             {
1728                 try
1729                 {
1730                     config->tested = false;
1731                     config->verified = false;
1732                     if (!verifyResult(bottom, top, weight, bias, numImages, config, verifyTop))
1733                     {
1734                         CV_LOG_ERROR(NULL, "Failed on test iteration: " << t);
1735                         failed = true;
1736                         break;
1737                     }
1738                 }
1739                 catch (...)
1740                 {
1741                     CV_LOG_ERROR(NULL, "Failed on test iteration: " << t);
1742                     throw;
1743                 }
1744             }
1745             if (!failed && verifyResult(bottom, top, weight, bias, numImages, config, verifyTop))
1746             {
1747                 bestKernelConfig = kernelQueue[kernelIdx];
1748                 if (bestKernelConfig->kernelType != KERNEL_TYPE_INTEL_IDLF &&
1749                     bestKernelConfig->kernelType != KERNEL_TYPE_GEMM_LIKE)
1750                     if (!swizzled_weights_umat.empty())
1751                         swizzled_weights_umat.release();
1752 
1753                 for (int32_t j = 0; j < kernelIdx; j++) {
1754                     CV_Assert(phash.find(kernelQueue[j]->kernelName) != phash.end());
1755                     unloadProgram(kernelQueue[j]->kernelName);
1756                 }
1757                 kernelQueue.clear();
1758                 tuned_ = true;
1759                 break;
1760             }
1761         }
1762     }
1763 }
1764 
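// Store the winning configuration in the process-wide in-memory cache keyed by
// the layer/device signature (key_). The value layout matches saveTunedConfig:
// "x y z kernelType lx ly lz swizzle_weights use_null_local".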
1765 template<>
1766 void OCL4DNNConvSpatial<float>::cacheTunedConfig()
1767 {
1768     if (tuned_)
1769     {
1770         cv::AutoLock lock(kernelConfigMutex);
1771         std::stringstream outputKernel;
1772         outputKernel << bestKernelConfig->workItem_output[0] << " "
1773                      << bestKernelConfig->workItem_output[1] << " "
1774                      << bestKernelConfig->workItem_output[2] << " "
1775                      << bestKernelConfig->kernelType << " "
1776                      << bestKernelConfig->local_work_size[0] << " "
1777                      << bestKernelConfig->local_work_size[1] << " "
1778                      << bestKernelConfig->local_work_size[2] << " "
1779                      << bestKernelConfig->swizzle_weights << " "
1780                      << bestKernelConfig->use_null_local << " ";
1781         kernelConfigMap.insert(std::pair<std::string, std::string>(key_, outputKernel.str()));
1782     }
1783 }
1784 
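// Auto-tuning path: create every candidate kernel, time and verify each one,
// then pick the fastest verified kernel (falling back to the basic kernel if
// none verifies) and persist the selected configuration.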
1785 template<>
1786 void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
1787                                                  UMat &top,
1788                                                  const UMat &weight,
1789                                                  const UMat &bias,
1790                                                  int32_t numImages,
1791                                                  UMat &verifyTop)
1792 {
1793     std::vector< cv::Ptr<tunerParam> > tunerItems;
1794 
1795     generateTunerItems(tunerItems);
1796     for (int i = 0; i < tunerItems.size(); i++)
1797         createConvolutionKernel(tunerItems[i]->kernelType,
1798                                 tunerItems[i]->blockWidth,
1799                                 tunerItems[i]->blockHeight,
1800                                 tunerItems[i]->blockDepth);
1801 
1802     const size_t testCount = testAllKernels();
1803     for (int32_t x = 0; x < kernelQueue.size(); x++)
1804     {
1805         kernelConfig* config = kernelQueue[x];
1806         config->executionTime = timedConvolve(bottom, top, weight, bias, numImages, config);
1807         for(int t = 0; t < testCount; t++)
1808         {
1809             try
1810             {
1811                 config->tested = false;
1812                 config->verified = false;
1813                 bool verified = verifyResult(bottom, top, weight, bias, numImages, config, verifyTop);
1814                 if (verified == false)
1815                 {
1816                     CV_LOG_ERROR(NULL, "Kernel " << config->kernelName << " failed verification");
1817                     CV_LOG_ERROR(NULL, "workItem="
1818                          << config->workItem_output[0] << ","
1819                          << config->workItem_output[1] << ","
1820                          << config->workItem_output[2] << " "
1821                          << "kernelType: " << config->kernelType << " "
1822                          << "global_work_size="
1823                          << config->global_work_size[0] << ","
1824                          << config->global_work_size[1] << ","
1825                          << config->global_work_size[2] << " "
1826                          << "local_work_size="
1827                          << config->local_work_size[0] << ","
1828                          << config->local_work_size[1] << ","
1829                          << config->local_work_size[2] << " "
1830                          << config->swizzle_weights << " "
1831                          << config->use_null_local);
1832                 }
1833                 else
1834                 {
1835                     CV_LOG_VERBOSE(NULL, 0, "Kernel " << config->kernelName << " pass verification");
1836                 }
1837             }
1838             catch (...)
1839             {
1840                 CV_LOG_ERROR(NULL, "Failed on test iteration: " << t);
1841                 throw;
1842             }
1843         }
1844     }
1845 
1846     int32_t failures = 0;
1847     bool verification = false;
1848     if (kernelQueue.size()) {
1849         while (failures < kernelQueue.size()) {
1850             int32_t fastestKernel = -1;
1851             float fastestTime = std::numeric_limits<float>::infinity();
1852 
1853             for (int32_t x = 0; x < kernelQueue.size(); x++) {
1854                 if (kernelQueue[x]->executionTime < fastestTime &&
1855                     kernelQueue[x]->tested == false) {
1856                     fastestKernel = x;
1857                     fastestTime = kernelQueue[x]->executionTime;
1858                 }
1859             }
1860             if (fastestKernel < 0) break;
1861             // Test fastest kernel
1862             bool verified = verifyResult(bottom, top, weight, bias, numImages, kernelQueue[fastestKernel], verifyTop);
1863             if (verified == true) {
1864                 kernel_index_ = fastestKernel;
1865                 verification = true;
1866                 break;
1867             } else {
1868                 CV_LOG_ERROR(NULL, "Kernel " << kernelQueue[fastestKernel]->kernelName <<
1869                              " failed verification");
1870                 failures++;
1871             }
1872         }
1873     }
1874     if (verification) {
1875         CV_LOG_INFO(NULL, "Kernel <" << kernelQueue[kernel_index_]->kernelName <<
1876                     "> passed verification");
1877         CV_LOG_INFO(NULL, "Convolution Time:" << kernelQueue[kernel_index_]->executionTime);
1878         double out_w = output_w_;
1879         double out_h = output_h_;
1880         double out_z = M_;
1881         double k_w = kernel_w_;
1882         double k_h = kernel_h_;
1883         double k_z = channels_;
1884         float elapsedTime = kernelQueue[kernel_index_]->executionTime;
1885         double totalFlops = ((k_w*k_h*k_z -1)*2)*(out_w*out_h*out_z)*num_;
1886         CV_LOG_INFO(NULL, "\tEstimated Gflops:" << (totalFlops * 1e-9));
1887         CV_LOG_INFO(NULL, "\tEstimated GFLOPS/S: " << ((totalFlops * 1e-9)*(1000.0/elapsedTime)));
1888     } else {
1889         CV_LOG_INFO(NULL, "fallback to basic kernel");
1890         options_.str(""); options_.clear(); // clear contents and state flags
1891         createBasicKernel(1, 1, 1);
1892         CV_Assert(!kernelQueue.empty());  // basic kernel must be available
1893         kernel_index_ = kernelQueue.size() - 1;
1894     }
1895     this->bestKernelConfig = kernelQueue[kernel_index_];
1896 
1897 
1898     if (bestKernelConfig->kernelType != KERNEL_TYPE_INTEL_IDLF && bestKernelConfig->kernelType != KERNEL_TYPE_GEMM_LIKE)
1899         if (!swizzled_weights_umat.empty())
1900             swizzled_weights_umat.release();
1901 
1902     for (int32_t x = 0; x < kernelQueue.size(); x++) {
1903         if (x != kernel_index_) {
1904             CV_Assert(phash.find(kernelQueue[x]->kernelName) != phash.end());
1905             unloadProgram(kernelQueue[x]->kernelName);
1906         }
1907     }
1908     kernelQueue.clear();
1909     tuned_ = true;
1910     saveTunedConfig();
1911 }
1912 
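// Persist the best configuration to <cache_path_>/<key_sanitized_> so later
// runs can skip tuning; the file holds the same space-separated fields that
// loadTunedConfig reads back.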
1913 template<typename Dtype>
1914 void OCL4DNNConvSpatial<Dtype>::saveTunedConfig()
1915 {
1916     CV_Assert(tuned_);
1917     if (!use_cache_path_ || cache_path_.empty())
1918         return;
1919 
1920     std::string outputFile;
1921     outputFile = cache_path_ + "/" + key_sanitized_;
1922     std::ofstream outputKernel;
1923     outputKernel.open(outputFile.c_str());
1924     outputKernel << bestKernelConfig->workItem_output[0] << " "
1925                  << bestKernelConfig->workItem_output[1] << " "
1926                  << bestKernelConfig->workItem_output[2] << " "
1927                  << bestKernelConfig->kernelType << " "
1928                  << bestKernelConfig->local_work_size[0] << " "
1929                  << bestKernelConfig->local_work_size[1] << " "
1930                  << bestKernelConfig->local_work_size[2] << " "
1931                  << bestKernelConfig->swizzle_weights << " "
1932                  << bestKernelConfig->use_null_local << " ";
1933     outputKernel.close();
1934 }
1935 
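// Entry point called before running the convolution: regenerate the
// configuration key and, if it changed, obtain a kernel via (in order) the
// in-memory cache, the on-disk cache, and either auto-tuning or the
// first-available fallback.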
1936 template<typename Dtype>
1937 void OCL4DNNConvSpatial<Dtype>::prepareKernel(const UMat &bottom, UMat &top,
1938                                               const UMat &weight, const UMat &bias,
1939                                               int32_t numImages)
1940 {
1941     std::string previous_key = key_;
1942 
1943     generateKey();
1944     if (key_.compare(previous_key) == 0 && bestKernelConfig)
1945         return;
1946 
1947     if (bestKernelConfig)
1948     {
1949         prev_kernel_type_ = bestKernelConfig->kernelType;
1950         CV_Assert(phash.find(bestKernelConfig->kernelName) != phash.end());
1951         phash.erase(bestKernelConfig->kernelName);
1952         bestKernelConfig.release();
1953     }
1954 
1955     if (loadCachedConfig()) // check in-memory cache
1956         return;
1957 
1958     if (loadTunedConfig())  // check external storage
1959         return;
1960 
1961     UMat benchData(1, numImages * top_dim_, (use_half_) ? CV_16SC1 : CV_32FC1);
1962 
1963     calculateBenchmark(bottom, benchData, weight, bias, numImages);
1964 
1965     if (run_auto_tuning_ || force_auto_tuning_)
1966     {
1967         setupConvolution(bottom, top, weight, bias, numImages, benchData);
1968     }
1969     else
1970     {
1971         useFirstAvailable(bottom, top, weight, bias, numImages, benchData);
1972     }
1973     cacheTunedConfig();
1974 }
1975 
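// Look up the current key in the process-wide in-memory configuration map
// (populated from built-in defaults and earlier tuning runs) and rebuild the
// kernel from the cached parameters on a hit.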
1976 template<typename Dtype>
1977 bool OCL4DNNConvSpatial<Dtype>::loadCachedConfig()
1978 {
1979     cv::AutoLock lock(kernelConfigMutex);
1980     if (!defaultConfigLoaded && !force_auto_tuning_)
1981         initializeGlobalBuiltinConfigurations((use_cache_path_ && !cache_path_.empty()) ? (cache_path_ + '/') : std::string());
1982 
1983     kernel_hash_t::iterator it = kernelConfigMap.find(key_);
1984     if (it != kernelConfigMap.end())
1985     {
1986         int32_t x, y, z, type, lx, ly, lz;
1987         bool swizzle, nullLocal;
1988         std::stringstream cachedKernel(it->second);
1989         if (cachedKernel)
1990         {
1991             cachedKernel >> x;
1992             cachedKernel >> y;
1993             cachedKernel >> z;
1994             cachedKernel >> type;
1995             cachedKernel >> lx;
1996             cachedKernel >> ly;
1997             cachedKernel >> lz;
1998             cachedKernel >> swizzle;
1999             cachedKernel >> nullLocal;
2000             if (setupKernelByConfig(x, y, z, type, lx, ly, lz, swizzle, nullLocal)) {
2001                 tuned_ = true;
2002                 return true;
2003             }
2004         }
2005     }
2006     return false;
2007 }
2008 
2009 
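// Recreate a kernel from a cached parameter set and override its local work
// size, swizzling, and null-local flags with the cached values.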
2010 template<typename Dtype>
2011 bool OCL4DNNConvSpatial<Dtype>::setupKernelByConfig(int x, int y, int z, int type,
2012                                                     int lx, int ly, int lz,
2013                                                     bool swizzle, bool nullLocal)
2014 {
2015     if (type == KERNEL_TYPE_INTEL_IDLF)
2016     {
2017         if (z == 1)
2018             z = 16;
2019         CHECK_EQ(z == 16 || z == 8, true) << "invalid SIMD size" << std::endl;
2020     }
2021     kernelQueue.clear();
2022     createConvolutionKernel(type, x, y, z);
2023     if (kernelQueue.size() != 1) {
2024         std::cerr << "Failed setup kernel by config:"
2025             << " x = " << x
2026             << " y = " << y
2027             << " z = " << z
2028             << " type = " << type
2029             << std::endl;
2030         return false;
2031     }
2032     bestKernelConfig = kernelQueue[0];
2033     kernelQueue.clear();
2034     bestKernelConfig->local_work_size[0] = lx;
2035     bestKernelConfig->local_work_size[1] = ly;
2036     bestKernelConfig->local_work_size[2] = lz;
2037     bestKernelConfig->swizzle_weights = swizzle;
2038     bestKernelConfig->use_null_local = nullLocal;
2039     // If the kernel type changed to IDLF or GEMM-like, release the swizzled
2040     // weights buffer to invalidate the previously swizzled weight data.
2041     if (prev_kernel_type_ != bestKernelConfig->kernelType &&
2042         (bestKernelConfig->kernelType == KERNEL_TYPE_INTEL_IDLF ||
2043         bestKernelConfig->kernelType == KERNEL_TYPE_GEMM_LIKE))
2044     {
2045         if (!swizzled_weights_umat.empty())
2046             swizzled_weights_umat.release();
2047     }
2048     return true;
2049 }
2050 
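// Read a previously tuned configuration from the on-disk cache file
// (<cache_path_>/<key_sanitized_>) unless tuning is forced or caching is
// disabled.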
2051 template<typename Dtype>
2052 bool OCL4DNNConvSpatial<Dtype>::loadTunedConfig()
2053 {
2054     if (force_auto_tuning_)
2055         return false;  // don't load results from external storage
2056 
2057     if (!use_cache_path_)
2058     {
2059         if (cache_path_.empty())
2060         {
2061             static int warn_ = 0;
2062             if (!warn_)
2063             {
2064                 std::cout << "OpenCV(ocl4dnn): consider specifying a kernel configuration cache directory" << std::endl
2065                           << "                 via the OPENCV_OCL4DNN_CONFIG_PATH parameter." << std::endl;
2066                 warn_ = true;
2067             }
2068         }
2069         return false;
2070     }
2071 
2072     int32_t x, y, z, type, lx, ly, lz;
2073     bool swizzle, nullLocal;
2074 
2075     // Find cached kernel configuration from file
2076     std::string cacheFile = cache_path_ + "/" + key_sanitized_;
2077     std::ifstream cachedKernel(cacheFile.c_str());
2078     if (cachedKernel)
2079     {
2080         cachedKernel >> x;
2081         cachedKernel >> y;
2082         cachedKernel >> z;
2083         cachedKernel >> type;
2084         cachedKernel >> lx;
2085         cachedKernel >> ly;
2086         cachedKernel >> lz;
2087         cachedKernel >> swizzle;
2088         cachedKernel >> nullLocal;
2089         if (setupKernelByConfig(x, y, z, type, lx, ly, lz, swizzle, nullLocal)) {
2090             tuned_ = true;
2091             return true;
2092         }
2093     }
2094     return false;
2095 }
2096 
2097 template class OCL4DNNConvSpatial<float>;
2098 
2099 }}} // namespace cv::dnn::ocl4dnn
2100