1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2017, Intel Corporation, all rights reserved.
14 // Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
22 //
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
26 //
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42
43 #include "../../precomp.hpp"
44
45 #include <opencv2/core/utils/configuration.private.hpp>
46
47 #include <string>
48 #include <vector>
49 #include <fstream>
50 #include <sys/stat.h>
51 #include <assert.h>
52 #include "../include/common.hpp"
53 #include "../include/ocl4dnn.hpp"
54 #include "opencl_kernels_dnn.hpp"
55 #include "../include/math_functions.hpp"
56 #include "../include/default_kernel_config.hpp"
57 #include "opencv2/dnn/shape_utils.hpp"
58 #include "opencv2/core/utils/logger.hpp"
59
60 #if defined WIN32 || defined _WIN32
61 #include <windows.h>
62 #include <direct.h>
63 #undef min
64 #undef max
65 #endif
66
67 namespace cv { namespace dnn { namespace ocl4dnn {
static cv::Mutex kernelConfigMutex;  // presumably serializes access to kernelConfigMap — confirm against its users
typedef std::map<std::string, std::string> kernel_hash_t;  // tuning key -> kernel configuration string
static kernel_hash_t kernelConfigMap;  // best-known kernel configuration per device/shape key
static bool defaultConfigLoaded = false;  // set once the builtin tuning tables have been merged in
72
enableWorkaroundIDLF()73 static bool enableWorkaroundIDLF()
74 {
75 static bool param = utils::getConfigurationParameterSizeT("OPENCV_OCL4DNN_WORKAROUND_IDLF", true);
76 return param;
77 }
78
dumpFailedResult()79 static bool dumpFailedResult()
80 {
81 static bool param = utils::getConfigurationParameterSizeT("OPENCV_OCL4DNN_DUMP_FAILED_RESULT", false);
82 return param;
83 }
84
testAllKernels()85 static size_t testAllKernels()
86 {
87 static size_t param = utils::getConfigurationParameterSizeT("OPENCV_OCL4DNN_TEST_ALL_KERNELS", 0);
88 return param;
89 }
90
raiseOnCheckError()91 static bool raiseOnCheckError()
92 {
93 static bool param = utils::getConfigurationParameterBool("OPENCV_OCL4DNN_TUNING_RAISE_CHECK_ERROR", false);
94 return param;
95 }
96
// Replace every character outside [0-9a-zA-Z_] with '_' so the result can be
// used safely as a cache file name.
static std::string sanitize(const std::string& s)
{
    std::string result(s);
    for (size_t pos = 0; pos < result.size(); ++pos)
    {
        const char c = result[pos];
        const bool keep = (c >= '0' && c <= '9')
                       || (c >= 'a' && c <= 'z')
                       || (c >= 'A' && c <= 'Z')
                       || c == '_';
        if (!keep)
            result[pos] = '_';
    }
    // TODO add hash?
    // result = result + cv::format("_%08llx", crc64((uchar*)s.c_str(), s.size()));
    return result;
}
112
initializeGlobalBuiltinConfigurations(const std::string & cache_path)113 static void initializeGlobalBuiltinConfigurations(const std::string& cache_path)
114 {
115 CV_Assert(defaultConfigLoaded == false);
116 CV_Assert(kernelConfigMap.empty());
117
118 /* fp32 config */
119 size_t numConfigs = sizeof(default_kernel_config_intel_fp32) /
120 sizeof(default_kernel_config_intel_fp32[0]) / 2;
121 for (size_t i = 0; i < numConfigs; i++)
122 {
123 std::string key = std::string("Intel(R) Corporation_") + default_kernel_config_intel_fp32[2 * i];
124 if (!cache_path.empty())
125 {
126 std::string cacheFile = cache_path + sanitize(key);
127 std::ifstream cachedKernel(cacheFile.c_str());
128 if (cachedKernel)
129 continue; // external configuration found, skip builtin
130 }
131 std::pair<std::string, std::string> entry(
132 key,
133 default_kernel_config_intel_fp32[2 * i + 1]);
134 kernelConfigMap.insert(entry);
135 }
136
137 /* fp16 config */
138 numConfigs = sizeof(default_kernel_config_intel_fp16) /
139 sizeof(default_kernel_config_intel_fp16[0]) / 2;
140 for (size_t i = 0; i < numConfigs; i++)
141 {
142 std::string key = std::string("Intel(R) Corporation_") + default_kernel_config_intel_fp16[2 * i];
143 if (!cache_path.empty())
144 {
145 std::string cacheFile = cache_path + sanitize(key);
146 std::ifstream cachedKernel(cacheFile.c_str());
147 if (cachedKernel)
148 continue; // external configuration found, skip builtin
149 }
150 std::pair<std::string, std::string> entry(
151 key,
152 default_kernel_config_intel_fp16[2 * i + 1]);
153 kernelConfigMap.insert(entry);
154 }
155
156 defaultConfigLoaded = true;
157 }
158
159
// Construct the spatial convolution engine from a resolved layer configuration.
// Only 2D spatial convolution is supported (see "assumption" below).
template<typename Dtype>
OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
{
    bias_term_ = config.bias_term;
    int dims = config.in_shape.size();
    int spatial_dims = 2;

    // Channel counts come from the axis just before the two spatial axes (NCHW-style layout).
    channels_   = config.in_shape[dims - spatial_dims - 1];
    num_output_ = config.out_shape[dims - spatial_dims - 1];
    group_ = config.group;

    // No fusion until the setActiv*/Forward calls request it.
    fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
    fused_eltwise_ = false;
    power_ = 1.f;
    negative_slope_ = 0;
    min_value_ = 0;
    max_value_ = 0;
    prev_kernel_type_ = -1;
    tuned_ = false;
    use_half_ = config.use_half;

    // assumption: spatial dimension is 2.
    kernel_h_ = config.kernel.height;
    kernel_w_ = config.kernel.width;
    pad_h_ = config.pad.height;
    pad_w_ = config.pad.width;
    stride_h_ = config.stride.height;
    stride_w_ = config.stride.width;
    dilation_h_ = config.dilation.height;
    dilation_w_ = config.dilation.width;
    M_ = num_output_ / group_;  // outputs per group
    height_ = config.in_shape[dims - spatial_dims + 0];
    width_ = config.in_shape[dims - spatial_dims + 1];
    output_h_ = config.out_shape[dims - spatial_dims + 0];
    output_w_ = config.out_shape[dims - spatial_dims + 1];
    bottom_dim_ = channels_ * width_ * height_;      // elements per input image
    top_dim_ = num_output_ * output_w_ * output_h_;  // elements per output image
    // Total padding implied by the declared output size; the surplus is split
    // with the larger half assigned to the right/bottom edges.
    int Ph = (output_h_ - 1) * stride_h_ + (dilation_h_ * (kernel_h_ - 1) + 1) - height_;
    int Pw = (output_w_ - 1) * stride_w_ + (dilation_w_ * (kernel_w_ - 1) + 1) - width_;
    Ph = (Ph > 0) ? Ph : 0;
    Pw = (Pw > 0) ? Pw : 0;
    pad_right_ = (Pw + 1) / 2;
    pad_bottom_ = (Ph + 1) / 2;

    // Optional on-disk cache of tuned kernel configurations.
    cache_path_ = utils::getConfigurationParameterString("OPENCV_OCL4DNN_CONFIG_PATH", "");
    dwconv_ = (num_output_ == channels_ && channels_ == group_);  // depth-wise case

    use_cache_path_ = false;
    if (!cache_path_.empty())
    {
        // The cache path is only usable if it exists and is a directory.
#if defined _WIN32
        struct _stat file_stat;
        use_cache_path_ = _stat(cache_path_.c_str(), &file_stat) == 0 &&
                          ((_S_IFDIR & file_stat.st_mode) != 0);
#else
        struct stat file_stat;
        use_cache_path_ = stat(cache_path_.c_str(), &file_stat) == 0 &&
                          S_ISDIR(file_stat.st_mode);
#endif
        if (!use_cache_path_)
        {
            // Warn only once per process about a missing cache directory.
            static int warn_ = 0;
            if (!warn_)
            {
                std::cerr
                    << "OpenCV(ocl4dnn): Kernel configuration cache directory doesn't exist: " << cache_path_ << std::endl
                    << std::endl;
                warn_ = true;
            }
        }
    }

    // Auto-tuning needs a usable cache directory, unless tuning is forced explicitly.
    run_auto_tuning_ = use_cache_path_ && !utils::getConfigurationParameterBool("OPENCV_OCL4DNN_DISABLE_AUTO_TUNING", false);
    force_auto_tuning_ = utils::getConfigurationParameterBool("OPENCV_OCL4DNN_FORCE_AUTO_TUNING", false);
}
235
236 template<typename Dtype>
~OCL4DNNConvSpatial()237 OCL4DNNConvSpatial<Dtype>::~OCL4DNNConvSpatial()
238 {
239 if (!swizzled_weights_umat.empty()) {
240 swizzled_weights_umat.release();
241 }
242 }
243
244 template<typename Dtype>
setFusionDefine(ocl4dnnFusedActiv_t fused_activ,bool fused_eltwise)245 void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise)
246 {
247 if (fused_eltwise)
248 addDef("FUSED_CONV_ELTWISE", 1);
249
250 switch (fused_activ) {
251 case OCL4DNN_CONV_FUSED_ACTIV_RELU:
252 addDef("FUSED_CONV_RELU", 1);
253 break;
254 case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
255 addDef("FUSED_CONV_PRELU", 1);
256 break;
257 case OCL4DNN_CONV_FUSED_ACTIV_POWER:
258 addDef("FUSED_CONV_POWER", 1);
259 break;
260 case OCL4DNN_CONV_FUSED_ACTIV_TANH:
261 addDef("FUSED_CONV_TANH", 1);
262 break;
263 case OCL4DNN_CONV_FUSED_ACTIV_RELU6:
264 addDef("FUSED_CONV_RELU6", 1);
265 break;
266 default:
267 ;
268 }
269 return;
270 }
271
// Bind the runtime values for the fusion parameters declared via setFusionDefine().
// The argument order must match the kernel signature; argIdx is advanced in place.
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx)
{
    if (fused_eltwise)
        kernel.set(argIdx++, (cl_mem)bottom_data2_.handle(ACCESS_READ));  // second eltwise operand

    switch (fused_activ) {
    case OCL4DNN_CONV_FUSED_ACTIV_RELU:
        kernel.set(argIdx++, (float)negative_slope_);  // leaky-ReLU slope
        break;
    case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
        kernel.set(argIdx++, (cl_mem)negative_slope_umat_.handle(ACCESS_READ));  // per-channel slopes
        break;
    case OCL4DNN_CONV_FUSED_ACTIV_POWER:
        kernel.set(argIdx++, (float)power_);
        break;
    case OCL4DNN_CONV_FUSED_ACTIV_RELU6:
        kernel.set(argIdx++, (float)min_value_);
        kernel.set(argIdx++, (float)max_value_);
        break;
    default:
        ;  // TANH and NONE take no extra arguments
    }
    return;
}
297
// Element-type tag passed to the OpenCL source via the TYPE define
// (see collectCommonInformation()).
typedef enum {
    TYPE_FLOAT = 1,
    TYPE_HALF = 2
} ocl4dnnConvSpatialType_t;
302
303 template<typename Dtype>
collectCommonInformation()304 void OCL4DNNConvSpatial<Dtype>::collectCommonInformation()
305 {
306 if (use_half_)
307 {
308 addDef("TYPE", TYPE_HALF);
309 addDef("Dtype", "half");
310 addDef("Dtype2", "half2");
311 addDef("Dtype4", "half4");
312 addDef("Dtype8", "half8");
313 addDef("Dtype16", "half16");
314 addDef("as_Dtype", "as_half");
315 addDef("as_Dtype2", "as_half2");
316 addDef("as_Dtype4", "as_half4");
317 addDef("as_Dtype8", "as_half8");
318 }
319 else
320 {
321 addDef("TYPE", TYPE_FLOAT);
322 addDef("Dtype", "float");
323 addDef("Dtype2", "float2");
324 addDef("Dtype4", "float4");
325 addDef("Dtype8", "float8");
326 addDef("Dtype16", "float16");
327 addDef("as_Dtype", "as_float");
328 addDef("as_Dtype2", "as_float2");
329 addDef("as_Dtype4", "as_float4");
330 addDef("as_Dtype8", "as_float8");
331 }
332 }
333
// Kernel families the auto-tuner can choose between. The numeric values are
// embedded in tuning keys (see generateSpecificKey), so they should stay stable.
typedef enum {
    KERNEL_TYPE_INTEL_IDLF = 2,
    KERNEL_TYPE_BASIC = 4,
    KERNEL_TYPE_GEMM_LIKE = 5,
    KERNEL_TYPE_DWCONV = 6
} ocl4dnnConvSpatialKernelType_t;
340
// Populate kernel_name_, options_, the per-kernel defines and src_ for the
// selected kernel family. The meaning of blockM/blockK/blockN depends on
// kernelType (see the comments at each branch).
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
                                                   int32_t blockM,
                                                   int32_t blockK,
                                                   int32_t blockN)
{
    std::string kernelUKey;
    int32_t simd_size;

    if (kernelType == KERNEL_TYPE_INTEL_IDLF) {
        // IDLF: blockM = output block width, blockK = output block height,
        // blockN = SIMD width (8 or 16).
        simd_size = blockN;
        kernelUKey = generateSpecificKey(KERNEL_TYPE_INTEL_IDLF, blockM, blockK, 1);

        // kernel name
        kernel_name_ = "IDLF_";
        kernel_name_ += kernelUKey;
        if (simd_size == 16)
            kernel_name_ += "_SIMD16";
        else
            kernel_name_ += "_SIMD8";

        // options
        options_ << " -cl-fast-relaxed-math -D KERNEL_IDLF -D convolve_simd=" << kernel_name_;
        options_ << " -cl-mad-enable";
        if (clOptionSupport("-cl-no-subgroup-ifp"))
            options_ << " -cl-no-subgroup-ifp ";

        // defs
        int32_t output_block_width = blockM;
        int32_t output_block_height = blockK;
        // Size of the input tile needed to compute one output block.
        int tile_x = (output_block_width - 1) * stride_w_ + kernel_w_ * dilation_w_;
        int tile_y = (output_block_height - 1) * stride_h_ + kernel_h_ * dilation_h_;
        int invec_size = tile_y;

        addDef("SIMD_SIZE", simd_size);
        addDef("OUT_BLOCK_WIDTH", output_block_width);
        addDef("OUT_BLOCK_HEIGHT", output_block_height);
        addDef("INPUT_DEPTH", channels_ / group_);
        addDef("TOTAL_INPUT_DEPTH_SIZE", channels_);
        addDef("TOTAL_OUTPUT_DEPTH", num_output_);
        addDef("NUM_FILTERS", M_);
        addDef("TILE_X", tile_x);
        addDef("TILE_Y", tile_y);
        addDef("INVEC_SIZE", invec_size);
        addDef("ALIGNED_NUM_FILTERS", (int)alignSize(M_, simd_size));
        addDef("OUT_BLOCK_SIZE", (output_block_width*output_block_height));
        addDef("APPLY_BIAS", bias_term_);
        addDef("WEIGHT_PREF", ((kernel_w_ * kernel_h_) == 1) ? 1 : 8);
        addDef("INPUT_PITCH", (width_ * height_));
        addDef("OUTPUT_PITCH", (output_w_ * output_h_));
        addDef("LEFT_FILTERS", ((int)alignSize(M_, simd_size) - M_));
        addDef("INPUT_WIDTH", width_);
        addDef("INPUT_HEIGHT", height_);
        addDef("FILTERS_IN_GROUP", ((int)alignSize(M_, simd_size) / simd_size));

        setFusionDefine(fused_activ_, fused_eltwise_);

        src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
    }
    else if (kernelType == KERNEL_TYPE_BASIC)
    {
        // BASIC: naive fallback kernel; block parameters only feed the unique key.
        addDef("KERNEL_BASIC");

        kernelUKey = generateSpecificKey(KERNEL_TYPE_BASIC, blockM, blockK, blockN);
        kernel_name_ = "BASIC_";
        kernel_name_ += kernelUKey;

        // opts
        options_ << " -cl-fast-relaxed-math -D ConvolveBasic=" << kernel_name_;
        if (clOptionSupport("-cl-no-subgroup-ifp"))
            options_ << " -cl-no-subgroup-ifp ";

        // defs
        addDef("CHANNELS", channels_ / group_);
        addDef("APPLY_BIAS", bias_term_);
        addDef("OUTPUT_Z", M_);
        addDef("ZPAR", 1);
        setFusionDefine(fused_activ_, fused_eltwise_);

        src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
    }
    else if (kernelType == KERNEL_TYPE_GEMM_LIKE)
    {
        // GEMM-like: blockK = SIMD width; blockN/blockM select the tile variant.
        simd_size = blockK;
        kernelUKey = generateSpecificKey(KERNEL_TYPE_GEMM_LIKE, blockM, blockK, blockN);

        kernel_name_ = "U_GEMM_LIKE_CONV_";
        kernel_name_ += kernelUKey.c_str();
        kernel_name_ += (blockK == 8) ? "_SIMD8" : "_SIMD16";
        std::stringstream kernelDef;
        kernelDef << "GEMM_LIKE_CONV_" << blockN << "_" << blockM;
        if (blockK == 16)
            kernelDef << "_SIMD16";

        // Build list of options and defines
        options_ << " -cl-fast-relaxed-math " << " -D " << kernelDef.str()
                 << " -D Conv_Interleaved=" << kernel_name_.c_str();
        options_ << " -cl-mad-enable";
        if (clOptionSupport("-cl-no-subgroup-ifp"))
            options_ << " -cl-no-subgroup-ifp ";

        addDef("KERNEL_GEMM_LIKE");
        addDef("INPUT_DEPTH", channels_);
        addDef("WIDTH1", M_);
        addDef("OUT_PADDING_LEFT", 0);
        addDef("OUT_PADDING_HEIGHT", 0);
        addDef("OUT_DEPTH", M_);
        addDef("NUM_BATCHES", num_);
        addDef("DY", blockM);
        addDef("DX", blockN);
        addDef("KERNEL_WIDTH_DIV2", kernel_w_ / 2);
        addDef("KERNEL_SLICE_DIV2", (kernel_w_ * kernel_h_) / 2);
        addDef("TILE_N_LAST", M_ % 32);
        addDef("TILE_N_LAST_DIV8", (M_ % 32) / 8);
        addDef("APPLY_BIAS", bias_term_);
        setFusionDefine(fused_activ_, fused_eltwise_);
        src_ = ocl::dnn::conv_layer_spatial_oclsrc;
    }
    else if (kernelType == KERNEL_TYPE_DWCONV)
    {
        // DWCONV: depth-wise convolution; block parameters only feed the unique key.
        kernelUKey = generateSpecificKey(KERNEL_TYPE_DWCONV, blockM, blockK, blockN);
        kernel_name_ = "DWCONV_";
        kernel_name_ += kernelUKey.c_str();

        options_ << " -cl-fast-relaxed-math ";
        if (clOptionSupport("-cl-no-subgroup-ifp"))
            options_ << " -cl-no-subgroup-ifp ";

        addDef("KERNEL_DWCONV");
        addDef("KERNEL_SIZE", kernel_w_ * kernel_h_);
        addDef("KERNEL_W", kernel_w_);
        addDef("KERNEL_H", kernel_h_);
        addDef("APPLY_BIAS", bias_term_);
        addDef("OUTPUT_Z", num_output_ * num_);
        addDef("CHANNELS", num_output_);
        setFusionDefine(fused_activ_, fused_eltwise_);

        options_ << " -D DWCONV=" << kernel_name_;
        src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
    }
}
482
483 template<typename Dtype>
setupKernel()484 void OCL4DNNConvSpatial<Dtype>::setupKernel()
485 {
486 collectCommonInformation();
487
488 addDef("KERNEL_WIDTH", kernel_w_);
489 addDef("KERNEL_HEIGHT" , kernel_h_);
490 addDef("STRIDE_X", stride_w_);
491 addDef("STRIDE_Y", stride_h_);
492 addDef("DILATION_X", dilation_w_);
493 addDef("DILATION_Y", dilation_h_);
494 if (kernelType_ != KERNEL_TYPE_BASIC)
495 {
496 addDef("INPUT_PAD_W", pad_w_);
497 addDef("INPUT_PAD_H", pad_h_);
498 addDef("INPUT_PAD_RIGHT", pad_right_);
499 addDef("INPUT_PAD_BOTTOM", pad_bottom_);
500 }
501
502 setupKernelDetails(kernelType_, blockM_, blockK_, blockN_);
503 }
504
// Record whether a bias vector will be supplied to Forward().
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setBias(bool bias_term)
{
    bias_term_ = bias_term;
}
510
511 template<typename Dtype>
setActivReLU(bool fuse_activ,float slope)512 void OCL4DNNConvSpatial<Dtype>::setActivReLU(bool fuse_activ, float slope)
513 {
514 if ( fuse_activ )
515 {
516 fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_RELU;
517 negative_slope_ = slope;
518 }
519 else
520 fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
521 }
522
523 template<typename Dtype>
setActivReLU6(bool fuse_activ,float min,float max)524 void OCL4DNNConvSpatial<Dtype>::setActivReLU6(bool fuse_activ, float min, float max)
525 {
526 if ( fuse_activ )
527 {
528 fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_RELU6;
529 min_value_ = min;
530 max_value_ = max;
531 }
532 else
533 fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
534 }
535
536 template<typename Dtype>
setActivPReLU(bool fuse_activ,std::vector<float> & slope)537 void OCL4DNNConvSpatial<Dtype>::setActivPReLU(bool fuse_activ, std::vector<float> &slope)
538 {
539 if ( fuse_activ )
540 {
541 fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_PRELU;
542 Mat tmpMat = Mat(num_output_, 1, CV_32FC1, (uchar*)&slope[0]);
543 tmpMat.copyTo(negative_slope_umat_);
544 }
545 else
546 fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
547 }
548
549 template<typename Dtype>
setActivPower(bool fuse_activ,float power)550 void OCL4DNNConvSpatial<Dtype>::setActivPower(bool fuse_activ, float power)
551 {
552 if ( fuse_activ )
553 {
554 fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_POWER;
555 power_ = power;
556 }
557 else
558 fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
559 }
560
561 template<typename Dtype>
setActivTanh(bool fuse_activ)562 void OCL4DNNConvSpatial<Dtype>::setActivTanh(bool fuse_activ)
563 {
564 if ( fuse_activ )
565 {
566 fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_TANH;
567 }
568 else
569 fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
570 }
571
// Run one convolution over numImages inputs. 'bottom2', when non-empty, is the
// second operand of a fused elementwise addition. Returns false when no usable
// kernel configuration could be prepared or the launch failed.
template<typename Dtype>
bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
                                        const UMat& bottom2,
                                        const UMat& weight,
                                        const UMat& bias,
                                        UMat& top,
                                        int32_t numImages)
{
    num_ = numImages;
    if (!bottom2.empty())
    {
        fused_eltwise_ = true;
        bottom_data2_ = bottom2;
    }
    else
    {
        fused_eltwise_ = false;
    }

    // In half mode, weights/bias are expected as CV_16S-typed buffers.
    if (use_half_ && !bias.empty())
        CV_CheckTypeEQ(bias.type(), CV_16SC1, "");

    if (use_half_)
        CV_CheckTypeEQ(weight.type(), CV_16SC1, "");

    prepareKernel(bottom, top, weight, bias, numImages);
    if (bestKernelConfig.empty())
        return false;  // no kernel could be selected/tuned for this shape
    return convolve(bottom, top, weight, bias, numImages, bestKernelConfig);
}
602
// Compute verifyTop with the simple basic kernel; used as the reference output
// when validating tuned kernel candidates.
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::calculateBenchmark(const UMat &bottom, UMat &verifyTop,
                                                   const UMat &weight, const UMat &bias,
                                                   int32_t numImages)
{
    options_.str(""); options_.clear(); // clear contents and state flags
    createBasicKernel(1, 1, 1);
    CV_Assert(!kernelQueue.empty()); // basic kernel must be available
    kernel_index_ = kernelQueue.size() - 1;
    convolve(bottom, verifyTop, weight, bias, numImages, kernelQueue[kernel_index_]);
    CV_Assert(phash.find(kernelQueue[kernel_index_]->kernelName) != phash.end());
    //unloadProgram(kernelQueue[kernel_index_]->kernelName);
    // The basic kernel is not a tuning candidate: remove it from the queue again.
    kernelQueue.pop_back();
    return;
}
618
619 // For large enough input size, we do not need to tune kernels for different
620 // size. The reason is with large input size, there will be enough work items
621 // to feed al the EUs.
622 // FIXME for the gemm like convolution, switch back to exact image size.
623
624 #define TUNING_SIZE(x) ((x) > 256 ? 256 : (alignSize(x, 16)))
625
// Build the cache keys identifying this convolution configuration:
//  - short_key_: shape/parameter portion only,
//  - key_: device-qualified key (vendor + EU count + shape),
//  - key_sanitized_: file-name-safe form of key_.
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::generateKey()
{
    std::string precision = (use_half_) ? "FP16" : "FP32";
    std::stringstream keyBuilder;
    // FIXME: to support fuse?
    keyBuilder << "k" << kernel_w_ << "x" << kernel_h_ << "_"
               << "cn" << channels_ << "_"
               << "g" << group_ << "_"
               << "s" << stride_w_ << "x" << stride_h_ << "_"
               << "d" << dilation_w_ << "x" << dilation_h_ << "_"
               << "b" << bias_term_ << "_"
               << "in" << TUNING_SIZE(width_) << "x" << TUNING_SIZE(height_) << "_"  // quantized, see TUNING_SIZE
               << "p" << pad_w_ << "x" << pad_h_ << "_"
               << "num" << num_ << "_"
               << "M" << M_ << "_"
               << "activ" << (int)fused_activ_ << "_"
               << "eltwise" << fused_eltwise_ << "_"
               << precision;


    key_ = ocl::Device::getDefault().vendorName() + "_EU" + cv::format("%d", ocl::Device::getDefault().maxComputeUnits()) + "_" + keyBuilder.str();
    key_sanitized_ = sanitize(key_);
    short_key_ = keyBuilder.str();
}
651
652 template<typename Dtype>
generateSpecificKey(int32_t type,int32_t blockWidth,int32_t blockHeight,int32_t blockDepth)653 std::string OCL4DNNConvSpatial<Dtype>::generateSpecificKey(int32_t type, int32_t blockWidth,
654 int32_t blockHeight, int32_t blockDepth)
655 {
656 std::stringstream keyBuilder;
657 keyBuilder << short_key_
658 << "_" << type
659 << "_" << blockWidth
660 << "_" << blockHeight
661 << "_" << blockDepth;
662
663 return keyBuilder.str();
664 }
665
// Repack a row-major r x c matrix into the layout expected by the GEMM-like
// kernels: within each group, the first 'interleavedRows' rows are written as
// row pairs interleaved in blockWidth-wide chunks, followed by
// 'nonInterleavedRows' rows copied with a doubled row stride.
// mem_dst must hold at least dstSize bytes (computed below).
template<typename Dtype>
void interleaveMatrix(Dtype* mem_dst, const Dtype *mem,
                      int r, int c, int interleavedRows, int nonInterleavedRows,
                      int blockWidth, int rowAlignment )
{
    CHECK_EQ(interleavedRows % 2, 0) <<
        "interleaveMatrix only supports even values for interleavedRows.";

    // NOTE(review): sizes use sizeof(float), not sizeof(Dtype) — correct for the
    // float path used by swizzleWeight; confirm before instantiating other types.
    size_t memSize = r * c * sizeof(float);
    size_t dstSize = memSize *
        (interleavedRows + nonInterleavedRows * 2) /
        (interleavedRows + nonInterleavedRows);
    memset(mem_dst, 0, dstSize); // NOLINT

    const int xStride = blockWidth;
    const int yStride = c * 2;   // two source rows per interleave step
    const Dtype *pSrc = mem;
    Dtype* pDst = mem_dst;
    for (int y = 0; y < r;) {
        // Interleave pairs of rows: chunk k of row y, then chunk k of row y+1.
        for (int rows = 0; rows < interleavedRows; rows += 2) {
            if ( y >= r ) break;
            if ((c % xStride) == 0) {
                for (int x = 0; x < c / xStride; x++) {
                    memcpy(pDst + x * xStride * 2, // NOLINT
                           pSrc + x * xStride, xStride * sizeof(Dtype));
                    memcpy(pDst + x * xStride * 2 + xStride, // NOLINT
                           pSrc + x * xStride + c, xStride * sizeof(Dtype));
                }
            } else {
                // Ragged width: the final chunk copies only the first row of the pair.
                const int count = c / xStride;
                int x = 0;
                for (; x < count - 1; x++) {
                    memcpy(pDst + x * xStride * 2, // NOLINT
                           pSrc + x * xStride, xStride * sizeof(Dtype));
                    memcpy(pDst + x * xStride * 2 + xStride, // NOLINT
                           pSrc + x * xStride + c, xStride * sizeof(Dtype));
                }
                memcpy(pDst + x * xStride * 2, // NOLINT
                       pSrc + x * xStride, xStride * sizeof(Dtype));
            }
            pSrc += yStride;  // consumed two source rows
            pDst += yStride;
            y += 2;
        }

        // Copy the non-interleaved rows; destination positions are doubled.
        for (int rows = 0; rows < nonInterleavedRows; rows++) {
            if (y >= r) break;
            const int stride = rowAlignment;
            int remaining = c;
            for (int x = 0; x < c; x += stride) {
                if (remaining >= stride) {
                    memcpy(pDst + x * 2, pSrc + x, stride * sizeof(Dtype)); // NOLINT
                    remaining -=stride;
                } else {
                    memcpy(pDst + x * 2, pSrc + x, remaining * sizeof(Dtype)); // NOLINT
                }
            }
            pSrc += yStride / 2;  // consumed one source row
            pDst += yStride;
            y++;
        }
    }
}
729
// Rearrange the weights into the layout the optimized kernels expect and cache
// the result in swizzled_weights_umat. 'interleave' selects the CPU repack used
// by the GEMM-like kernels; otherwise a device-side helper kernel is used.
// Returns false on kernel/setup failure.
template<typename Dtype>
bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
                                              int32_t swizzled_factor,
                                              bool interleave)
{
    // Simply skip the weight swizzle if we already got a swizzled_weights_
    // in test phase and not in auto tuning
    // This requires we always call convolve again with the winner configuration
    // during the auto tuning stage.
    if (tuned_ && !swizzled_weights_umat.empty())
        return true;

    if (swizzled_weights_umat.empty())
        swizzled_weights_umat.create(1, (int)alignSize(num_output_, 16) * channels_ *
                                     kernel_h_ * (int)alignSize(kernel_w_, 2),
                                     (use_half_) ? CV_16SC1 : CV_32FC1);

    if (!interleave) {
        // GPU path: a helper kernel performs the swizzle on the device.
        int32_t channels = channels_ / group_;

        ocl::Kernel oclk_copy_weight(
            use_half_ ? "copyWeightsSwizzled_half" : "copyWeightsSwizzled_float",
            cv::ocl::dnn::conv_spatial_helper_oclsrc,
            use_half_ ? "-DHALF_SUPPORT=1 -DDtype=half" : "-DDtype=float"
        );
        if (oclk_copy_weight.empty())
            return false;

        oclk_copy_weight.args(
            ocl::KernelArg::PtrReadOnly(weight),
            ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat),
            kernel_w_,
            kernel_h_,
            channels,
            num_output_,
            swizzled_factor
        );

        size_t global_work_size_copy[3] = {
            (size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 };

        if (!oclk_copy_weight.run(3, global_work_size_copy, NULL, false))
        {
            std::cout << "Swizzle kernel run failed." << std::endl;
            return false;
        }
    } else {
        // CPU path (GEMM-like kernels): transpose on the host, then interleave.
        // assumption: kernel dimension is 2
        Mat weightMat;
        Mat swizzledWeightMat;
        UMat weight_tmp; // FP32 in half mode, TODO implement FP16 repack
        if (use_half_)
        {
            CV_CheckTypeEQ(weight.type(), CV_16SC1, "");
            convertFp16(weight, weight_tmp);
            weightMat = weight_tmp.getMat(ACCESS_READ);
            swizzledWeightMat.create(shape(swizzled_weights_umat), CV_32F);
        }
        else
        {
            weightMat = weight.getMat(ACCESS_READ);
            swizzledWeightMat = swizzled_weights_umat.getMat(ACCESS_WRITE);
        }

        CV_CheckTypeEQ(weightMat.type(), CV_32FC1, "");
        Dtype* cpu_weight = (Dtype *)weightMat.ptr<float>();
        Dtype* cpu_swizzled_weight = (Dtype *)swizzledWeightMat.ptr<float>();

        int interleavedRows = (kernel_w_ / 2) * 2;
        int nonInterleavedRows = kernel_w_ % 2;
        int blockWidth = swizzled_factor; // should equal to simd size.
        int rowAlignment = 32;
        // Transpose OIHW -> (I*H*W) x O so each output channel becomes a column.
        // NOTE(review): interleaved_filter_size is a byte count, but AutoBuffer
        // takes an element count — over-allocates by sizeof(Dtype); confirm intent.
        size_t interleaved_filter_size = M_ * kernel_w_ * kernel_h_ * channels_ * sizeof(Dtype);
        cv::AutoBuffer<Dtype, 0> tmpSwizzledWeight(interleaved_filter_size);
        for (int od = 0; od < M_; od++)
            for (int id = 0; id < channels_; id++)
                for (int r = 0; r < kernel_h_; r++)
                    for (int c = 0; c < kernel_w_; c++)
                        tmpSwizzledWeight[((id * kernel_h_ + r)* kernel_w_ + c) * M_ + od] =
                            cpu_weight[((od * channels_ + id) * kernel_h_ + r)*kernel_w_+c];

        interleaveMatrix(cpu_swizzled_weight,
                         tmpSwizzledWeight.data(),
                         kernel_w_ * kernel_h_ * channels_, M_,
                         interleavedRows,
                         nonInterleavedRows,
                         blockWidth,
                         rowAlignment);

        // unmap OpenCL buffers
        weightMat.release();

        if (use_half_)
            convertFp16(swizzledWeightMat, swizzled_weights_umat);
    }

    return true;
}
828
// Compile the straightforward fallback kernel (one output element per work item,
// per the {1,1,1} work-item output and output-sized global range below) and push
// its configuration onto kernelQueue. Returns false if compilation failed.
template<>
bool OCL4DNNConvSpatial<float>::createBasicKernel(int32_t blockWidth,
                                                  int32_t blockHeight, int32_t blockDepth)
{
    kernelType_ = KERNEL_TYPE_BASIC;
    blockM_ = blockWidth;
    blockK_ = blockHeight;
    blockN_ = blockDepth;
    setupKernel();

    ocl::Program program = compileKernel();
    if (program.ptr())
    {
        int32_t workItemOutput[3] = { 1, 1, 1 };
        size_t globalSize[3] = { (size_t)output_w_, (size_t)output_h_, (size_t)M_ };
        kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &globalSize[0], (const size_t*)NULL, &workItemOutput[0],
                                                    false, KERNEL_TYPE_BASIC));
        return true;
    }
    else
        return false;
}
851
852 template<>
CreateSubBuffer(const UMat & buffer,UMat & sub_buffer,int32_t offset,int32_t size,bool write_only)853 void OCL4DNNConvSpatial<float>::CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
854 int32_t offset, int32_t size, bool write_only)
855 {
856 cl_mem sub_mem;
857 cl_buffer_region region;
858 cl_int err;
859 size_t element_size = (use_half_) ? sizeof(short) : sizeof(float);
860
861 region.origin = offset * element_size + buffer.offset;
862 region.size = size * element_size;
863 sub_mem = clCreateSubBuffer((cl_mem)buffer.handle(ACCESS_READ),
864 write_only ? CL_MEM_WRITE_ONLY : CL_MEM_READ_ONLY,
865 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err);
866 if (err)
867 {
868 std::cout << "Failed to create sub buffer." << std::endl;
869 return;
870 }
871
872 int step = element_size, rows = size, cols = 1;
873 ocl::convertFromBuffer(sub_mem, step, rows, cols,
874 (use_half_) ? CV_16SC1 : CV_32FC1, sub_buffer);
875
876 //decrease ocl mem refcount
877 clReleaseMemObject(sub_mem);
878 }
879
template<>
bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                                         const UMat &weight, const UMat &bias,
                                         int32_t numImages, kernelConfig* config)
{
    // Runs one forward convolution using the pre-compiled OpenCL program cached
    // under config->kernelName. Dispatches on config->kernelType:
    //   - KERNEL_TYPE_INTEL_IDLF: per-group launch with sub-buffers for offsets
    //   - KERNEL_TYPE_GEMM_LIKE:  per-group launch, GEMM-style work sizes
    //   - KERNEL_TYPE_DWCONV:     single depthwise launch, no grouping loop
    //   - otherwise (basic):      per-image, per-group launch; offsets passed
    //                             as kernel arguments instead of sub-buffers
    // Returns false if the program/kernel is unavailable, a sub-buffer cannot
    // be created, or any kernel launch fails.
    ocl::Program program;
    phash_t::iterator it = phash.find(config->kernelName);
    if (it != phash.end())
        program = it->second;
    else
        return false;  // program was never compiled (or has been unloaded)

    int32_t bias_offset;
    // Offsets handed to kernels are in elements; element width depends on the
    // FP16 path (half stored as 16-bit).
    int32_t element_size = use_half_ ? sizeof(short) : sizeof(float);

    if (config->kernelType == KERNEL_TYPE_INTEL_IDLF) {
        // IDLF kernels consume swizzled weights; workItem_output[2] is the
        // simd size used for the swizzle layout.
        if (!swizzleWeight(weight, config->workItem_output[2], false))
            return false;
        size_t total_bottom_size = bottom_dim_ * numImages;
        size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_;
        size_t total_bias_size = M_ * group_;
        size_t total_top_size = top_dim_ * numImages;
        for (int32_t g = 0; g < group_; ++g) {
            // Per-group element offsets into the input, output, weight and
            // bias buffers.
            bias_offset = M_ * g;
            int32_t image_offset = width_ * height_ * (channels_ / group_) * g;
            int32_t output_image_offset = output_w_ * output_h_ * M_ * g;
            int32_t kernel_offset = kernel_h_ * kernel_w_ * (channels_ / group_) * M_ * g;

            ocl::Kernel kernel(config->kernelName.c_str(), program);
            if (kernel.empty())
                return false;

            cl_uint argIdx = 0;
            // Fused activation / eltwise arguments come first, if configured.
            setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);

            // When a group offset is non-zero, pass a sub-buffer starting at
            // the offset; otherwise pass the full buffer directly.
            UMat img_buffer;
            if (image_offset)
            {
                CreateSubBuffer(bottom, img_buffer, image_offset,
                                total_bottom_size - image_offset, false);
                if (img_buffer.empty())
                    return false;

                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer));
            }
            else
            {
                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
            }

            UMat kernel_buffer;
            if (kernel_offset)
            {
                CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset,
                                total_kernel_size - kernel_offset, false);
                if (kernel_buffer.empty())
                    return false;

                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer));
            }
            else
            {
                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
            }

            UMat bias_buffer;
            if (bias_term_)
            {
                if (bias_offset)
                {
                    CreateSubBuffer(bias, bias_buffer, bias_offset,
                                    total_bias_size - bias_offset, false);
                    if (bias_buffer.empty())
                        return false;

                    kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer));
                }
                else
                {
                    kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
                }
            }

            UMat out_buffer;
            if (output_image_offset)
            {
                // Output sub-buffer is writable (last argument true).
                CreateSubBuffer(top, out_buffer, output_image_offset,
                                total_top_size - output_image_offset, true);
                if (out_buffer.empty())
                    return false;

                kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
                // The kernel also takes the element offset within the buffer.
                kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
            }
            else
            {
                kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
                kernel.set(argIdx++, (int)(top.offset / element_size));
            }

            // Spatial dimensions are passed as 16-bit values (kernel ABI).
            kernel.set(argIdx++, (uint16_t)width_);
            kernel.set(argIdx++, (uint16_t)height_);
            kernel.set(argIdx++, (uint16_t)output_w_);
            kernel.set(argIdx++, (uint16_t)output_h_);
            if (!kernel.run(3, config->global_work_size, config->local_work_size, false))
            {
                std::cout << "IDLF kernel run failed." << std::endl;
                return false;
            }
        }
    } else if (config->kernelType == KERNEL_TYPE_GEMM_LIKE) {
        // GEMM-like kernels use a different swizzle layout keyed off
        // workItem_output[1] (blockK), with interleave enabled.
        if (!swizzleWeight(weight, config->workItem_output[1], true))
            return false;
        size_t total_bottom_size = bottom_dim_ * numImages;
        size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_;
        size_t total_bias_size = M_ * group_;
        size_t total_top_size = top_dim_ * numImages;
        for (int32_t g = 0; g < group_; ++g) {
            bias_offset = M_ * g;
            int32_t image_offset = width_ * height_ * (channels_ / group_) * g;
            int32_t output_image_offset = output_w_ * output_h_ * M_ * g;
            int32_t kernel_offset = kernel_h_ * kernel_w_ * (channels_ / group_) * M_ * g;

            ocl::Kernel kernel(config->kernelName.c_str(), program);
            if (kernel.empty())
                return false;

            cl_uint argIdx = 0;
            setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);

            // Same sub-buffer-vs-full-buffer argument scheme as the IDLF path.
            UMat img_buffer;
            if (image_offset)
            {
                CreateSubBuffer(bottom, img_buffer, image_offset,
                                total_bottom_size - image_offset, false);
                if (img_buffer.empty())
                    return false;

                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer));
            }
            else
            {
                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
            }

            UMat kernel_buffer;
            if (kernel_offset)
            {
                CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset,
                                total_kernel_size - kernel_offset, false);
                if (kernel_buffer.empty())
                    return false;

                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer));
            }
            else
            {
                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
            }

            UMat bias_buffer;
            if (bias_term_)
            {
                if (bias_offset)
                {
                    CreateSubBuffer(bias, bias_buffer, bias_offset,
                                    total_bias_size - bias_offset, false);
                    if (bias_buffer.empty())
                        return false;

                    kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer));
                }
                else
                {
                    kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
                }
            }

            UMat out_buffer;
            if (output_image_offset)
            {
                CreateSubBuffer(top, out_buffer, output_image_offset,
                                total_top_size - output_image_offset, true);
                if (out_buffer.empty())
                    return false;

                kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
                kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
            }
            else
            {
                kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
                kernel.set(argIdx++, (int)(top.offset / element_size));
            }

            kernel.set(argIdx++, (uint16_t)width_);
            kernel.set(argIdx++, (uint16_t)height_);
            kernel.set(argIdx++, (uint16_t)output_w_);
            kernel.set(argIdx++, (uint16_t)output_h_);

            // Pitch/stride arguments specific to the GEMM-like kernel layout.
            int out_pitch_y = output_w_ * output_h_;
            int out_pitch_z = out_pitch_y * M_;
            int aligned_input_size = height_ * width_ * channels_ / group_;
            int slice_pitch = width_ * height_;
            kernel.set(argIdx++, (uint32_t)out_pitch_y);
            kernel.set(argIdx++, (uint32_t)out_pitch_z);
            kernel.set(argIdx++, (uint32_t)aligned_input_size);
            kernel.set(argIdx++, (uint32_t)slice_pitch);

            // Derive the global work size from the tuner's block sizes:
            // the output plane is treated as an (M_ x out_w*out_h) GEMM,
            // padded up to block multiples; gy is additionally aligned to the
            // simd size (blockK).
            int blockM = config->workItem_output[0];
            int blockK = config->workItem_output[1];
            int blockN = config->workItem_output[2];
            int alignedFilterWidth = alignSize(M_, blockN);
            int alignedExpandHeight = alignSize(output_w_ * output_h_, blockM);
            int globalWorkSizeDX = blockN;
            int globalWorkSizeDY = blockM;
            size_t sgemm_m = alignedExpandHeight;
            size_t sgemm_n = alignedFilterWidth;
            size_t gx = divUp(sgemm_n, globalWorkSizeDX);
            size_t gy = divUp(sgemm_m, globalWorkSizeDY);
            gy = alignSize(gy, blockK);
            size_t global_size[3] = { gx, gy, config->global_work_size[2] };

            if (!kernel.run(3, global_size, config->local_work_size, false))
            {
                std::cout << "GEMM like kernel run failed." << std::endl;
                return false;
            }
        }
    } else if (config->kernelType == KERNEL_TYPE_DWCONV) {
        // Depthwise convolution: one launch covers all images and channels,
        // no group loop and no swizzled weights.
        ocl::Kernel kernel(config->kernelName.c_str(), program);
        if (kernel.empty())
            return false;

        cl_uint argIdx = 0;
        setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
        kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
        kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
        if (bias_term_)
            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
        kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
        kernel.set(argIdx++, (int)(top.offset / element_size));
        kernel.set(argIdx++, (uint16_t)width_);
        kernel.set(argIdx++, (uint16_t)height_);
        kernel.set(argIdx++, (uint16_t)output_w_);
        kernel.set(argIdx++, (uint16_t)output_h_);

        // One work item per output pixel per (channel, image) pair; the
        // local size is chosen by the runtime (NULL below).
        size_t global_size[3];
        global_size[0] = output_w_;
        global_size[1] = output_h_;
        global_size[2] = num_output_ * num_;

        if (!kernel.run(3, global_size, NULL, false))
        {
            std::cout << "DWCONV kernel run failed." << std::endl;
            return false;
        }
    } else {
        // Basic (fallback) kernel: launched once per (image, group). Unlike
        // the tuned paths, offsets are passed as plain kernel arguments.
        for (int32_t n = 0; n < numImages; ++n) {
            for (int32_t g = 0; g < group_; ++g) {
                bias_offset = M_ * g;
                int32_t image_offset = n * bottom_dim_
                    + width_ * height_ * (channels_ / group_) * g;
                int32_t output_image_offset = n * top_dim_
                    + output_w_ * output_h_ * M_ * g;

                int32_t kernel_offset = kernel_h_ * kernel_w_ *
                                       (channels_ / group_) * M_
                                       * g;

                ocl::Kernel kernel(config->kernelName.c_str(), program);
                if (kernel.empty())
                    return false;

                cl_uint argIdx = 0;
                setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
                kernel.set(argIdx++, image_offset);
                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
                kernel.set(argIdx++, kernel_offset);
                if (bias_term_)
                    kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
                else
                    kernel.set(argIdx++, (void *)NULL);  // no bias: pass null pointer
                kernel.set(argIdx++, bias_offset);
                kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
                kernel.set(argIdx++, (int)(top.offset / element_size));
                kernel.set(argIdx++, output_image_offset);
                kernel.set(argIdx++, (uint16_t)width_);
                kernel.set(argIdx++, (uint16_t)height_);
                kernel.set(argIdx++, (uint16_t)output_w_);
                kernel.set(argIdx++, (uint16_t)output_h_);
                kernel.set(argIdx++, (uint16_t)pad_w_);
                kernel.set(argIdx++, (uint16_t)pad_h_);
                if (!kernel.run(3, config->global_work_size,
                                (config->use_null_local) ? NULL : config->local_work_size,
                                false))
                {
                    std::cout << "Basic kernel run failed." << std::endl;
                    return false;
                }
            }
        }
    }

    return true;
}
1187
1188 template<>
timedConvolve(const UMat & bottom,UMat & top,const UMat & weight,const UMat & bias,int32_t numImages,kernelConfig * config)1189 float OCL4DNNConvSpatial<float>::timedConvolve(const UMat &bottom, UMat &top,
1190 const UMat &weight, const UMat &bias,
1191 int32_t numImages, kernelConfig* config)
1192 {
1193 cv::ocl::Queue queue;
1194 try
1195 {
1196 queue = cv::ocl::Queue::getDefault();
1197 }
1198 catch (const cv::Exception&)
1199 {
1200 static int warn_ = 0;
1201 if (!warn_)
1202 {
1203 std::cout << "OpenCV(ocl4dnn): Can't get OpenCL default queue for auto-tuning." << std::endl;
1204 warn_ = true;
1205 }
1206 return 1e6;
1207 }
1208
1209 // warm up.
1210 bool saved_tuned = tuned_;
1211 tuned_ = false;
1212 convolve(bottom, top, weight, bias, numImages, config);
1213
1214 cv::ocl::Timer timer(queue);
1215 timer.start();
1216 bool res = true;;
1217 CV_LOG_INFO(NULL, "Benchmarking kernel: " << config->kernelName);
1218 tuned_ = true;
1219 int loop_cnt = 4;
1220 for (int i = 0; i < loop_cnt; i++) {
1221 res = convolve(bottom, top, weight, bias, numImages, config);
1222 if (!res)
1223 break;
1224 }
1225 tuned_ = saved_tuned;
1226 timer.stop();
1227 if (!res) {
1228 config->tested = true;
1229 config->verified = false;
1230 return 1e5;
1231 }
1232
1233 float elapsedTime = timer.durationNS() * 1e-6 / loop_cnt;
1234 double out_w = output_w_;
1235 double out_h = output_h_;
1236 double out_z = M_;
1237 double k_w = kernel_w_;
1238 double k_h = kernel_h_;
1239 double k_z = channels_;
1240 double totalFlops = ((k_w*k_h*k_z -1)*2)*(out_w*out_h*out_z)*num_;
1241 CV_LOG_INFO(NULL, "\tEstimated Gflops:" << (totalFlops * 1e-9));
1242 CV_LOG_INFO(NULL, "\tEstimated GFLOPS/S: " << ((totalFlops * 1e-9)*(1000.0/elapsedTime)));
1243 return elapsedTime;
1244 }
1245
template<>
bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom,
                                             UMat &top,
                                             const UMat &weight,
                                             const UMat &bias,
                                             int32_t numImages,
                                             kernelConfig* config,
                                             UMat &verifyTop)
{
    // Checks a candidate kernel's output against the reference result in
    // verifyTop. Returns true (and sets config->verified) when the results
    // agree within a relative tolerance; false otherwise. Results are cached
    // via config->verified / config->tested so each config is checked once.
    if (config->verified)
        return true;
    else if (config->tested)
        return false;  // previously tested and failed

    int32_t sz[4] = {numImages, num_output_, output_h_, output_w_};
    // NOTE(review): UMat::zeros is static and its result is discarded here,
    // so this line does not appear to modify `top` — confirm intent.
    top.zeros(4, sz, (use_half_) ? CV_16SC1 : CV_32FC1);
    // Run the candidate kernel with tuning disabled for a clean comparison.
    bool saved_tuned = tuned_;
    tuned_ = false;
    convolve(bottom, top, weight, bias, numImages, config);
    tuned_ = saved_tuned;

    config->tested = true;

    // For the FP16 path, convert both buffers to FP32 before comparing.
    UMat new_top, new_verify_top;
    Mat mat_top, mat_verify_top;
    if (use_half_)
    {
        convertFp16(top, new_top);
        convertFp16(verifyTop, new_verify_top);

        mat_top = new_top.getMat(ACCESS_READ);
        mat_verify_top = new_verify_top.getMat(ACCESS_READ);
    }
    else
    {
        mat_top = top.getMat(ACCESS_READ);
        mat_verify_top = verifyTop.getMat(ACCESS_READ);
    }
    const float* data = mat_top.ptr<float>();
    const float* verify_data = mat_verify_top.ptr<float>();

    int error_slice_offset = 0;
    int error_slice = 0;
    // FP16 accumulations get a looser tolerance than FP32.
    float relative_eps = use_half_ ? 0.1f : 0.01f;

    size_t errors = 0;

    // Cheap whole-tensor check first; only do the expensive element-wise
    // scan (to locate and report individual mismatches) if it fails.
    double rel_err = norm(mat_top.reshape(1, 1), mat_verify_top.reshape(1, 1), NORM_L1 | NORM_RELATIVE);
    if (rel_err >= relative_eps)
    {
        for (int32_t n = 0; n < num_; ++n) {
            for (int32_t g = 0; g < group_; ++g) {
                int32_t output_image_offset = n * top_dim_ + output_w_ * output_h_ * M_ * g;
                for (int out_ch = 0; out_ch < M_; out_ch++)
                    for (int h = 0; h < output_h_; h++)
                        for (int w = 0; w < output_w_; w++) {
                            size_t offset = output_image_offset + out_ch * output_w_ * output_h_ + h * output_w_ + w;

                            bool has_error = !(data[offset] == data[offset]); // is NaN
                            if (!has_error)
                            {
                                // Relative error with a 1e-3 floor on the
                                // reference magnitude to avoid divide-by-tiny.
                                float error_factor = std::fabs(data[offset] - verify_data[offset]);
                                float base_value_abs = std::max(1e-3f, std::fabs(verify_data[offset]));
                                has_error = error_factor > relative_eps * base_value_abs;
                            }
                            if (has_error)
                            {
                                if (errors == 0)
                                {
                                    // Remember where the first error is so the
                                    // dump below can center its ROI on it.
                                    error_slice = (int)(offset / (output_w_ * output_h_));
                                    error_slice_offset = (int)(offset % (output_w_ * output_h_));
                                    CV_LOG_ERROR(NULL, "Kernel: " << config->kernelName);
                                }
                                if (errors < 10)  // cap per-element logging
                                    CV_LOG_ERROR(NULL, "test verification failed @ image " << n << " group " << g
                                                 << " out_ch " << out_ch << " h " << h << " w " << w
                                                 << " (offset: " << offset << ")"
                                                 << " got " << data[offset] << " expected " << verify_data[offset]);
                                errors++;
                            }
                        }
            }
        }
    }

    if (errors)
    {
        if (dumpFailedResult())
        {
            // Best-effort dump of a small window around the first error from
            // both buffers; any failure here is logged but non-fatal.
            try
            {
                int n_outputs = (int)(mat_top.size[0]*mat_top.size[1]);
                int slice_size = (int)(mat_top.total() / n_outputs);
                Rect roi(0, 0, slice_size, n_outputs);
                roi.width = std::min(roi.width, 32);
                roi.height = std::min(roi.height, 16);
                roi.x = std::max(0, std::min(slice_size - roi.width, error_slice_offset - roi.width/2));
                roi.y = std::max(0, std::min(n_outputs - roi.height, error_slice - roi.height/2));
                std::cout << "roi = " << roi << " errors=" << errors << std::endl;
                std::cout << "mat_top = " << shape(mat_top) << std::endl
                          << mat_top.reshape(1, 1).reshape(1, n_outputs)(roi) << std::endl;
                std::cout << "verify_top = " << shape(mat_verify_top) << std::endl
                          << mat_verify_top.reshape(1, 1).reshape(1, n_outputs)(roi) << std::endl;
            }
            catch (const std::exception& e)
            {
                CV_LOG_ERROR(NULL, "Results dump failed: " << e.what());
            }
            catch (...)
            {
                CV_LOG_ERROR(NULL, "Results dump failed")
            }
        }

        if (raiseOnCheckError())
            CV_Error_(Error::StsError, ("ocl4dnn tuning verification failed: %s (errors %lld)", config->kernelName.c_str(), (long long int)errors));
        return false;
    }
    else
    {
        config->verified = true;
        return true;
    }
}
1370
1371 template<typename Dtype>
unloadProgram(const std::string & kernelName)1372 void OCL4DNNConvSpatial<Dtype>::unloadProgram(const std::string& kernelName)
1373 {
1374 ocl::Program program;
1375 phash_t::iterator it = phash.find(kernelName);
1376 if (it != phash.end())
1377 {
1378 program = it->second;
1379 it->second = ocl::Program();
1380 }
1381 else
1382 return;
1383
1384 ocl::Context ctx = ocl::Context::getDefault();
1385 ctx.unloadProg(program);
1386 }
1387
1388 template<typename Dtype>
compileKernel()1389 ocl::Program OCL4DNNConvSpatial<Dtype>::compileKernel()
1390 {
1391 phash_t::iterator it = phash.find(kernel_name_);
1392 if (it != phash.end())
1393 {
1394 return it->second;
1395 }
1396
1397 String errmsg;
1398 ocl::Context ctx = ocl::Context::getDefault();
1399 std::string options = options_.str();
1400 CV_Assert(options.size() != 0);
1401 ocl::Program program = ctx.getProg(src_, options, errmsg);
1402
1403 phash.insert(std::pair<std::string, ocl::Program>(kernel_name_, program));
1404 if (!program.ptr())
1405 {
1406 std::cout << "Failed to compile kernel: " << kernel_name_
1407 << ", buildflags: " << options
1408 << ", errmsg: " << errmsg << std::endl;
1409 }
1410 return program;
1411 }
1412
1413 template<>
createGEMMLikeConvKernel(int32_t blockM,int32_t blockK,int32_t blockN)1414 bool OCL4DNNConvSpatial<float>::createGEMMLikeConvKernel(int32_t blockM,
1415 int32_t blockK,
1416 int32_t blockN)
1417 {
1418 int32_t simd_size = blockK;
1419
1420 int workItemOutput[3] = { blockM, blockK, blockN };
1421 size_t gx = (size_t)divUp(M_, blockN);
1422 size_t gy = (size_t)divUp(output_w_ * output_h_, blockM);
1423 gy = alignSize(gy, simd_size);
1424 size_t gz = num_;
1425 size_t global_size[3] = { gx, gy, gz };
1426 size_t local_size[3] = { 1, static_cast<size_t>(simd_size), 1 };
1427
1428 kernelType_ = KERNEL_TYPE_GEMM_LIKE;
1429 blockM_ = blockM;
1430 blockK_ = blockK;
1431 blockN_ = blockN;
1432 setupKernel();
1433
1434 ocl::Program program = compileKernel();
1435 if (program.ptr())
1436 {
1437 size_t workgroupSize_used;
1438 ocl::Kernel kernel(kernel_name_.c_str(), program);
1439 if (kernel.empty())
1440 return false;
1441
1442 workgroupSize_used = kernel.preferedWorkGroupSizeMultiple();
1443 if (workgroupSize_used != simd_size)
1444 {
1445 std::cerr << "OpenCV(ocl4dnn): The OpenCL compiler chose a simd size (" << workgroupSize_used << ") that " << std::endl;
1446 std::cerr << " does not equal the size (" << simd_size << ") kernel source required." << std::endl;
1447 std::cerr << " Skip this kernel " << kernel_name_ << std::endl;
1448 unloadProgram(kernel_name_);
1449 return false;
1450 }
1451 else
1452 {
1453 kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &global_size[0], &local_size[0], &workItemOutput[0],
1454 true, KERNEL_TYPE_GEMM_LIKE));
1455 return true;
1456 }
1457 }
1458 else
1459 return false;
1460 }
1461
template<>
bool OCL4DNNConvSpatial<float>::createIDLFKernel(int32_t blockWidth,
                                                 int32_t blockHeight,
                                                 int32_t simd_size)
{
    // Compiles an IDLF convolution kernel for the given output-block size and
    // simd width, and pushes it onto kernelQueue for tuning. Returns false if
    // the kernel is skipped (workaround), fails to compile, or the compiler
    // selects a different subgroup size than the source requires.
    int32_t workItemOutput[3] = { blockWidth, blockHeight, simd_size };
    const int32_t num_output_maps = M_;
    int32_t output_width = output_w_;
    int32_t output_height = output_h_;
    int32_t output_block_width = blockWidth;
    int32_t output_block_height = blockHeight;
    int32_t num_batches = num_;

    // One work group per output block; the z dimension covers all batches
    // with the output-map count padded up to the simd size.
    size_t global_size[3] = {
        (size_t)divUp(output_width, output_block_width),
        (size_t)divUp(output_height, output_block_height),
        (size_t)num_batches * alignSize(num_output_maps, simd_size) };
    size_t local_size[3] = { 1, 1, static_cast<size_t>(simd_size) };

    kernelType_ = KERNEL_TYPE_INTEL_IDLF;
    blockM_ = blockWidth;
    blockK_ = blockHeight;
    blockN_ = simd_size;

    setupKernel();

    if (enableWorkaroundIDLF() && ocl::Device::getDefault().intelSubgroupsSupport())
    {
        // Issues are observed with these kernels: 3x1 (covered by tests), 2x1, 4x1, 5x1, 3x2
        // kernels 1x3, 3x3, 2x3 are good
        if (pad_h_ != 0 && kernel_w_ <= simd_size && kernel_h_ <= 2)
        {
            CV_LOG_INFO(NULL, "DNN(workaround): skip IDLF kernel: " << kernel_name_);
            return false;
        }
    }

    ocl::Program program = compileKernel();
    if (program.ptr())
    {
        size_t workgroupSize_used;
        ocl::Kernel kernel(kernel_name_.c_str(), program);
        if (kernel.empty())
            return false;

        // The kernel source is generated for a fixed simd width; reject the
        // build if the compiler chose another, since results would be wrong.
        workgroupSize_used = kernel.preferedWorkGroupSizeMultiple();
        if (workgroupSize_used != simd_size)
        {
            std::cerr << "OpenCV(ocl4dnn): The OpenCL compiler chose a simd size (" << workgroupSize_used << ") that " << std::endl;
            std::cerr << "                 does not equal the size (" << simd_size << ") kernel source required." << std::endl;
            std::cerr << "                 Skip this kernel " << kernel_name_ << std::endl;
            unloadProgram(kernel_name_);
            return false;
        }
        else
        {
            kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &global_size[0], &local_size[0], &workItemOutput[0],
                                                        true, KERNEL_TYPE_INTEL_IDLF));
            return true;
        }
    }
    else
        return false;
}
1526
1527 template<>
createDWConvKernel(int32_t blockWidth,int32_t blockHeight,int32_t blockDepth)1528 bool OCL4DNNConvSpatial<float>::createDWConvKernel(int32_t blockWidth,
1529 int32_t blockHeight,
1530 int32_t blockDepth)
1531 {
1532 if (!dwconv_)
1533 return false;
1534
1535 int workItemOutput[3] = { 1, 1, 1 };
1536 size_t local_size[3] = { 1, 1, 1 };
1537 size_t global_size[3];
1538 global_size[0] = divUp(output_w_, workItemOutput[0]);
1539 global_size[1] = divUp(output_h_, workItemOutput[1]);
1540 global_size[2] = divUp(M_ * num_, workItemOutput[2]);
1541
1542 kernelType_ = KERNEL_TYPE_DWCONV;
1543 blockM_ = blockWidth;
1544 blockK_ = blockHeight;
1545 blockN_ = blockDepth;
1546
1547 setupKernel();
1548
1549 ocl::Program program = compileKernel();
1550 if (program.ptr())
1551 {
1552 kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &global_size[0], &local_size[0],
1553 &workItemOutput[0], false, KERNEL_TYPE_DWCONV));
1554 return true;
1555 }
1556 else
1557 return false;
1558 }
1559
1560 template<>
createConvolutionKernel(int32_t kernelType,int32_t blockWidth,int32_t blockHeight,int32_t blockDepth)1561 bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
1562 int32_t blockWidth,
1563 int32_t blockHeight,
1564 int32_t blockDepth)
1565 {
1566 kernelType_ = kernelType;
1567 options_.str(""); options_.clear(); // clear contents and state flags
1568 src_ = ocl::ProgramSource();
1569
1570 if (kernelType == KERNEL_TYPE_INTEL_IDLF)
1571 return createIDLFKernel(blockWidth, blockHeight, blockDepth);
1572 else if (kernelType == KERNEL_TYPE_BASIC)
1573 return createBasicKernel(blockWidth, blockHeight, blockDepth);
1574 else if (kernelType == KERNEL_TYPE_GEMM_LIKE)
1575 return createGEMMLikeConvKernel(blockWidth, blockHeight, blockDepth);
1576 else if (kernelType == KERNEL_TYPE_DWCONV)
1577 return createDWConvKernel(blockWidth, blockHeight, blockDepth);
1578 else
1579 CV_Assert(0 && "Internal error");
1580 return false;
1581 }
1582
template<>
void OCL4DNNConvSpatial<float>::generate_gemmlike_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
                                                             int blockM, int blockK, int blockN)
{
    // Appends a GEMM-like tuner candidate if (blockM, blockK, blockN) is a
    // supported combination for the current layer shape; otherwise returns
    // without adding anything.

    // GEMM-like kernels only support ungrouped convolution with an output
    // channel count that is a multiple of 8 (and not congruent to 24 mod 32).
    if (group_ != 1 || ((M_ % 8 != 0) || (M_ % 32 == 24)))
        return;

    if (blockM != 1 && blockM != 2)
        return;

    if (blockN != 32)
        return;

    if (blockK != 8 && blockK != 16)
        return;

    if (blockK == 16)
    {
        // simd16 variants additionally require M_ % 32 == 0, and blockM == 1
        // only works for kernel widths up to 4.
        if ((blockM == 1 && (kernel_w_ > 4)) || M_ % 32 != 0)
            return;
        // NOTE(review): `(blockM == 2) || ...` rejects every blockM==2 /
        // blockK==16 combination unconditionally — confirm this is intended
        // (i.e. simd16 with blockM==2 is deliberately disabled).
        if ((blockM == 2) || M_ % 32 != 0)
            return;
    }

    tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, blockM, blockK, blockN));
}
1609
template<>
void OCL4DNNConvSpatial<float>::generate_idlf_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
                                                         int blockM, int blockK, int simd_size)
{
    // Appends an IDLF tuner candidate for output block (blockM x blockK) at
    // the given simd width, if the combination is valid for the layer shape
    // and the device; otherwise returns without adding anything.
    int max_compute_units = ocl::Device::getDefault().maxComputeUnits();

    // Only simd widths 8 and 16 are generated.
    if (simd_size != 8 && simd_size != 16)
        return;

    if (simd_size == 8 && !((group_ == 1 || M_ % 8 == 0)))
        return;

    if (simd_size == 16 && !(group_ == 1 || M_ % 16 == 0))
        return;

    // Hard limits on the output block shape and area.
    int width_max, height_max, block_size_max;
    width_max = 14;
    height_max = 14;
    block_size_max = 32;

    if (blockM > width_max)
        return;
    if (blockK > height_max)
        return;

    // A block larger than the output itself is pointless.
    if (blockM > output_w_)
        return;
    if (blockK > output_h_)
        return;

    // Only when the work items count is less than the device
    // max work items or the M_ is less than 16, we will tune
    // for simd 8.
    if (simd_size == 8 && M_ >= 16 &&
        ((num_ * M_ * output_w_ * output_h_ / static_cast<float>(blockM * blockK)) >=
        max_compute_units * 7 * 16))
        return;

    // The horizontal input tile read per block must fit within one simd row.
    int actual_tile_x = kernel_w_ * dilation_w_ + (blockM - 1) * stride_w_ ;
    int tile_x = alignSize(actual_tile_x, simd_size);
    if (tile_x > simd_size)
        return;

    if (blockM * blockK > block_size_max)
        return;

    tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_INTEL_IDLF, blockM, blockK, simd_size));
}
1658
1659 template<>
generate_dwconv_tuneritems(std::vector<cv::Ptr<tunerParam>> & tunerItems,int blockM,int blockK,int blockN)1660 void OCL4DNNConvSpatial<float>::generate_dwconv_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
1661 int blockM, int blockK, int blockN)
1662 {
1663 if (!dwconv_)
1664 return;
1665
1666 tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_DWCONV, blockM, blockK, blockN));
1667 }
1668
1669 template<>
generateTunerItems(std::vector<cv::Ptr<tunerParam>> & tunerItems)1670 void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems)
1671 {
1672 if (ocl::Device::getDefault().intelSubgroupsSupport())
1673 {
1674 // depthwise kernel
1675 generate_dwconv_tuneritems(tunerItems, 1, 1, 1);
1676 if (tunerItems.size() > 0 && group_ > 8)
1677 return;
1678
1679 // gemm like kernel
1680 generate_gemmlike_tuneritems(tunerItems, 1, 8, 32);
1681 generate_gemmlike_tuneritems(tunerItems, 2, 8, 32);
1682 generate_gemmlike_tuneritems(tunerItems, 1, 16, 32);
1683 generate_gemmlike_tuneritems(tunerItems, 2, 16, 32);
1684
1685 // idlf kernel
1686 for (int simd_size = 8; simd_size <= 16; simd_size += 8)
1687 {
1688 int width_max, height_max;
1689 width_max = 14;
1690 height_max = 14;
1691 for (uint32_t width = width_max; width > 0; width--)
1692 {
1693 for (uint32_t height = height_max; height > 0; height--)
1694 {
1695 generate_idlf_tuneritems(tunerItems, width, height, simd_size);
1696 }
1697 }
1698 }
1699 }
1700 }
1701
template<>
void OCL4DNNConvSpatial<float>::useFirstAvailable(const UMat &bottom,
                                                  UMat &top,
                                                  const UMat &weight,
                                                  const UMat &bias,
                                                  int32_t numImages,
                                                  UMat &verifyTop)
{
    // Fast-path tuning: instead of benchmarking every candidate, pick the
    // first generated kernel that compiles and passes verification. The
    // basic kernel is appended last as a guaranteed fallback. On success,
    // bestKernelConfig is set, unused programs are unloaded, and tuned_ is
    // set to true.
    std::vector< cv::Ptr<tunerParam> > tunerItems;
    generateTunerItems(tunerItems);
    tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_BASIC, 1, 1, 1));

    for (int i = 0; i < tunerItems.size(); i++)
    {
        if (createConvolutionKernel(tunerItems[i]->kernelType,
                                    tunerItems[i]->blockWidth,
                                    tunerItems[i]->blockHeight,
                                    tunerItems[i]->blockDepth))
        {
            CV_Assert(!kernelQueue.empty()); // basic kernel must be available
            // The kernel just created is the last one in the queue.
            int kernelIdx = kernelQueue.size() - 1;
            kernelConfig* config = kernelQueue[kernelIdx].get();
            bool failed = false;
            // Optionally verify multiple times (testAllKernels) to catch
            // flaky kernels; flags are reset each iteration to force a rerun.
            const size_t testCount = testAllKernels();
            for(int t = 0; t < testCount; t++)
            {
                try
                {
                    config->tested = false;
                    config->verified = false;
                    if (!verifyResult(bottom, top, weight, bias, numImages, config, verifyTop))
                    {
                        CV_LOG_ERROR(NULL, "Failed on test iteration: " << t);
                        failed = true;
                        break;
                    }
                }
                catch (...)
                {
                    CV_LOG_ERROR(NULL, "Failed on test iteration: " << t);
                    throw;
                }
            }
            if (!failed && verifyResult(bottom, top, weight, bias, numImages, config, verifyTop))
            {
                bestKernelConfig = kernelQueue[kernelIdx];
                // Swizzled weights are only needed by IDLF / GEMM-like
                // kernels; release them otherwise.
                if (bestKernelConfig->kernelType != KERNEL_TYPE_INTEL_IDLF &&
                    bestKernelConfig->kernelType != KERNEL_TYPE_GEMM_LIKE)
                    if (!swizzled_weights_umat.empty())
                        swizzled_weights_umat.release();

                // Unload the programs of every earlier (rejected) candidate.
                for (int32_t j = 0; j < kernelIdx; j++) {
                    CV_Assert(phash.find(kernelQueue[j]->kernelName) != phash.end());
                    unloadProgram(kernelQueue[j]->kernelName);
                }
                kernelQueue.clear();
                tuned_ = true;
                break;
            }
        }
    }
}
1764
1765 template<>
cacheTunedConfig()1766 void OCL4DNNConvSpatial<float>::cacheTunedConfig()
1767 {
1768 if (tuned_)
1769 {
1770 cv::AutoLock lock(kernelConfigMutex);
1771 std::stringstream outputKernel;
1772 outputKernel << bestKernelConfig->workItem_output[0] << " "
1773 << bestKernelConfig->workItem_output[1] << " "
1774 << bestKernelConfig->workItem_output[2] << " "
1775 << bestKernelConfig->kernelType << " "
1776 << bestKernelConfig->local_work_size[0] << " "
1777 << bestKernelConfig->local_work_size[1] << " "
1778 << bestKernelConfig->local_work_size[2] << " "
1779 << bestKernelConfig->swizzle_weights << " "
1780 << bestKernelConfig->use_null_local << " ";
1781 kernelConfigMap.insert(std::pair<std::string, std::string>(key_, outputKernel.str()));
1782 }
1783 }
1784
template<>
void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
                                                 UMat &top,
                                                 const UMat &weight,
                                                 const UMat &bias,
                                                 int32_t numImages,
                                                 UMat &verifyTop)
{
    // Full auto-tuning: generate all candidate kernels, benchmark each one,
    // then pick the fastest configuration that passes verification against
    // verifyTop. Falls back to the basic kernel if nothing verifies.
    // On exit: bestKernelConfig is set, all other programs are unloaded,
    // tuned_ is true and the result is persisted via saveTunedConfig().
    std::vector< cv::Ptr<tunerParam> > tunerItems;

    generateTunerItems(tunerItems);
    for (int i = 0; i < tunerItems.size(); i++)
        createConvolutionKernel(tunerItems[i]->kernelType,
                                tunerItems[i]->blockWidth,
                                tunerItems[i]->blockHeight,
                                tunerItems[i]->blockDepth);

    // Benchmark every compiled candidate, then (optionally, per
    // testAllKernels) verify each of them up front for diagnostics.
    const size_t testCount = testAllKernels();
    for (int32_t x = 0; x < kernelQueue.size(); x++)
    {
        kernelConfig* config = kernelQueue[x];
        config->executionTime = timedConvolve(bottom, top, weight, bias, numImages, config);
        for(int t = 0; t < testCount; t++)
        {
            try
            {
                // Reset flags so verifyResult actually reruns the kernel.
                config->tested = false;
                config->verified = false;
                bool verified = verifyResult(bottom, top, weight, bias, numImages, config, verifyTop);
                if (verified == false)
                {
                    CV_LOG_ERROR(NULL, "Kernel " << config->kernelName << " failed verification");
                    CV_LOG_ERROR(NULL, "workItem="
                                 << config->workItem_output[0] << ","
                                 << config->workItem_output[1] << ","
                                 << config->workItem_output[2] << " "
                                 << "kernelType: " << config->kernelType << " "
                                 << "global_work_size="
                                 << config->global_work_size[0] << ","
                                 << config->global_work_size[1] << ","
                                 << config->global_work_size[2] << " "
                                 << "local_work_size="
                                 << config->local_work_size[0] << ","
                                 << config->local_work_size[1] << ","
                                 << config->local_work_size[2] << " "
                                 << config->swizzle_weights << " "
                                 << config->use_null_local);
                }
                else
                {
                    CV_LOG_VERBOSE(NULL, 0, "Kernel " << config->kernelName << " pass verification");
                }
            }
            catch (...)
            {
                CV_LOG_ERROR(NULL, "Failed on test iteration: " << t);
                throw;
            }
        }
    }

    // Selection loop: repeatedly take the fastest not-yet-tested kernel and
    // verify it; stop at the first one that passes.
    int32_t failures = 0;
    bool verification = false;
    if (kernelQueue.size()) {
        while (failures < kernelQueue.size()) {
            int32_t fastestKernel = -1;
            float fastestTime = std::numeric_limits<float>::infinity();

            for (int32_t x = 0; x < kernelQueue.size(); x++) {
                if (kernelQueue[x]->executionTime < fastestTime &&
                    kernelQueue[x]->tested == false) {
                    fastestKernel = x;
                    fastestTime = kernelQueue[x]->executionTime;
                }
            }
            if (fastestKernel < 0) break;  // every candidate already tested
            // Test fastest kernel
            bool verified = verifyResult(bottom, top, weight, bias, numImages, kernelQueue[fastestKernel], verifyTop);
            if (verified == true) {
                kernel_index_ = fastestKernel;
                verification = true;
                break;
            } else {
                CV_LOG_ERROR(NULL, "Kernel " << kernelQueue[fastestKernel]->kernelName <<
                             " failed verification");
                failures++;
            }
        }
    }
    if (verification) {
        CV_LOG_INFO(NULL, "Kernel <" << kernelQueue[kernel_index_]->kernelName <<
                    "> passed verification");
        CV_LOG_INFO(NULL, "Convolution Time:" << kernelQueue[kernel_index_]->executionTime);
        // Report estimated throughput for the winning kernel (2 ops per
        // multiply-accumulate over the full output volume).
        double out_w = output_w_;
        double out_h = output_h_;
        double out_z = M_;
        double k_w = kernel_w_;
        double k_h = kernel_h_;
        double k_z = channels_;
        float elapsedTime = kernelQueue[kernel_index_]->executionTime;
        double totalFlops = ((k_w*k_h*k_z -1)*2)*(out_w*out_h*out_z)*num_;
        CV_LOG_INFO(NULL, "\tEstimated Gflops:" << (totalFlops * 1e-9));
        CV_LOG_INFO(NULL, "\tEstimated GFLOPS/S: " << ((totalFlops * 1e-9)*(1000.0/elapsedTime)));
    } else {
        // No tuned kernel verified; recreate and select the basic kernel.
        CV_LOG_INFO(NULL, "fallback to basic kernel");
        options_.str(""); options_.clear(); // clear contents and state flags
        createBasicKernel(1, 1, 1);
        CV_Assert(!kernelQueue.empty()); // basic kernel must be available
        kernel_index_ = kernelQueue.size() - 1;
    }
    this->bestKernelConfig = kernelQueue[kernel_index_];


    // Swizzled weights are only needed by IDLF / GEMM-like kernels.
    if (bestKernelConfig->kernelType != KERNEL_TYPE_INTEL_IDLF && bestKernelConfig->kernelType != KERNEL_TYPE_GEMM_LIKE)
        if (!swizzled_weights_umat.empty())
            swizzled_weights_umat.release();

    // Unload every losing candidate's compiled program.
    for (int32_t x = 0; x < kernelQueue.size(); x++) {
        if (x != kernel_index_) {
            CV_Assert(phash.find(kernelQueue[x]->kernelName) != phash.end());
            unloadProgram(kernelQueue[x]->kernelName);
        }
    }
    kernelQueue.clear();
    tuned_ = true;
    saveTunedConfig();
}
1912
1913 template<typename Dtype>
saveTunedConfig()1914 void OCL4DNNConvSpatial<Dtype>::saveTunedConfig()
1915 {
1916 CV_Assert(tuned_);
1917 if (!use_cache_path_ || cache_path_.empty())
1918 return;
1919
1920 std::string outputFile;
1921 outputFile = cache_path_ + "/" + key_sanitized_;
1922 std::ofstream outputKernel;
1923 outputKernel.open(outputFile.c_str());
1924 outputKernel << bestKernelConfig->workItem_output[0] << " "
1925 << bestKernelConfig->workItem_output[1] << " "
1926 << bestKernelConfig->workItem_output[2] << " "
1927 << bestKernelConfig->kernelType << " "
1928 << bestKernelConfig->local_work_size[0] << " "
1929 << bestKernelConfig->local_work_size[1] << " "
1930 << bestKernelConfig->local_work_size[2] << " "
1931 << bestKernelConfig->swizzle_weights << " "
1932 << bestKernelConfig->use_null_local << " ";
1933 outputKernel.close();
1934 }
1935
1936 template<typename Dtype>
prepareKernel(const UMat & bottom,UMat & top,const UMat & weight,const UMat & bias,int32_t numImages)1937 void OCL4DNNConvSpatial<Dtype>::prepareKernel(const UMat &bottom, UMat &top,
1938 const UMat &weight, const UMat &bias,
1939 int32_t numImages)
1940 {
1941 std::string previous_key = key_;
1942
1943 generateKey();
1944 if (key_.compare(previous_key) == 0 && bestKernelConfig)
1945 return;
1946
1947 if (bestKernelConfig)
1948 {
1949 prev_kernel_type_ = bestKernelConfig->kernelType;
1950 CV_Assert(phash.find(bestKernelConfig->kernelName) != phash.end());
1951 phash.erase(bestKernelConfig->kernelName);
1952 bestKernelConfig.release();
1953 }
1954
1955 if (loadCachedConfig()) // check in-memory cache
1956 return;
1957
1958 if (loadTunedConfig()) // check external storage
1959 return;
1960
1961 UMat benchData(1, numImages * top_dim_, (use_half_) ? CV_16SC1 : CV_32FC1);
1962
1963 calculateBenchmark(bottom, benchData, weight, bias, numImages);
1964
1965 if (run_auto_tuning_ || force_auto_tuning_)
1966 {
1967 setupConvolution(bottom, top, weight, bias, numImages, benchData);
1968 }
1969 else
1970 {
1971 useFirstAvailable(bottom, top, weight, bias, numImages, benchData);
1972 }
1973 cacheTunedConfig();
1974 }
1975
1976 template<typename Dtype>
loadCachedConfig()1977 bool OCL4DNNConvSpatial<Dtype>::loadCachedConfig()
1978 {
1979 cv::AutoLock lock(kernelConfigMutex);
1980 if (!defaultConfigLoaded && !force_auto_tuning_)
1981 initializeGlobalBuiltinConfigurations((use_cache_path_ && !cache_path_.empty()) ? (cache_path_ + '/') : std::string());
1982
1983 kernel_hash_t::iterator it = kernelConfigMap.find(key_);
1984 if (it != kernelConfigMap.end())
1985 {
1986 int32_t x, y, z, type, lx, ly, lz;
1987 bool swizzle, nullLocal;
1988 std::stringstream cachedKernel(it->second);
1989 if (cachedKernel)
1990 {
1991 cachedKernel >> x;
1992 cachedKernel >> y;
1993 cachedKernel >> z;
1994 cachedKernel >> type;
1995 cachedKernel >> lx;
1996 cachedKernel >> ly;
1997 cachedKernel >> lz;
1998 cachedKernel >> swizzle;
1999 cachedKernel >> nullLocal;
2000 if (setupKernelByConfig(x, y, z, type, lx, ly, lz, swizzle, nullLocal)) {
2001 tuned_ = true;
2002 return true;
2003 }
2004 }
2005 }
2006 return false;
2007 }
2008
2009
2010 template<typename Dtype>
setupKernelByConfig(int x,int y,int z,int type,int lx,int ly,int lz,bool swizzle,bool nullLocal)2011 bool OCL4DNNConvSpatial<Dtype>::setupKernelByConfig(int x, int y, int z, int type,
2012 int lx, int ly, int lz,
2013 bool swizzle, bool nullLocal)
2014 {
2015 if (type == KERNEL_TYPE_INTEL_IDLF)
2016 {
2017 if (z == 1)
2018 z = 16;
2019 CHECK_EQ(z == 16 || z == 8, true) << "invalid SIMD size" << std::endl;
2020 }
2021 kernelQueue.clear();
2022 createConvolutionKernel(type, x, y, z);
2023 if (kernelQueue.size() != 1) {
2024 std::cerr << "Failed setup kernel by config:"
2025 << " x = " << x
2026 << " y = " << y
2027 << " z = " << z
2028 << " type = " << type
2029 << std::endl;
2030 return false;
2031 }
2032 bestKernelConfig = kernelQueue[0];
2033 kernelQueue.clear();
2034 bestKernelConfig->local_work_size[0] = lx;
2035 bestKernelConfig->local_work_size[1] = ly;
2036 bestKernelConfig->local_work_size[2] = lz;
2037 bestKernelConfig->swizzle_weights = swizzle;
2038 bestKernelConfig->use_null_local = nullLocal;
2039 // If kernel type changed to type 2 or 4, we need to reset the swizzled
2040 // weights pointer to invalidate the previous swizzled weights data.
2041 if (prev_kernel_type_ != bestKernelConfig->kernelType &&
2042 (bestKernelConfig->kernelType == KERNEL_TYPE_INTEL_IDLF ||
2043 bestKernelConfig->kernelType == KERNEL_TYPE_GEMM_LIKE))
2044 {
2045 if (!swizzled_weights_umat.empty())
2046 swizzled_weights_umat.release();
2047 }
2048 return true;
2049 }
2050
2051 template<typename Dtype>
loadTunedConfig()2052 bool OCL4DNNConvSpatial<Dtype>::loadTunedConfig()
2053 {
2054 if (force_auto_tuning_)
2055 return false; // don't load results from external storage
2056
2057 if (!use_cache_path_)
2058 {
2059 if (cache_path_.empty())
2060 {
2061 static int warn_ = 0;
2062 if (!warn_)
2063 {
2064 std::cout << "OpenCV(ocl4dnn): consider to specify kernel configuration cache directory " << std::endl
2065 << " via OPENCV_OCL4DNN_CONFIG_PATH parameter." << std::endl;
2066 warn_ = true;
2067 }
2068 }
2069 return false;
2070 }
2071
2072 int32_t x, y, z, type, lx, ly, lz;
2073 bool swizzle, nullLocal;
2074
2075 // Find cached kernel configuration from file
2076 std::string cacheFile = cache_path_ + "/" + key_sanitized_;
2077 std::ifstream cachedKernel(cacheFile.c_str());
2078 if (cachedKernel)
2079 {
2080 cachedKernel >> x;
2081 cachedKernel >> y;
2082 cachedKernel >> z;
2083 cachedKernel >> type;
2084 cachedKernel >> lx;
2085 cachedKernel >> ly;
2086 cachedKernel >> lz;
2087 cachedKernel >> swizzle;
2088 cachedKernel >> nullLocal;
2089 if (setupKernelByConfig(x, y, z, type, lx, ly, lz, swizzle, nullLocal)) {
2090 tuned_ = true;
2091 return true;
2092 }
2093 }
2094 return false;
2095 }
2096
2097 template class OCL4DNNConvSpatial<float>;
2098
2099 }}} // namespace cv::dnn::ocl4dnn
2100