1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Copyright (C) 2017, Intel Corporation, all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
22 //
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
26 //
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42
43 #include "../precomp.hpp"
44 #include "layers_common.hpp"
45 #include "../op_inf_engine.hpp"
46 #include "../ie_ngraph.hpp"
47 #include "../op_cuda.hpp"
48
49 #include <opencv2/dnn/shape_utils.hpp>
50
51 #ifdef HAVE_OPENCL
52 #include "../ocl4dnn/include/math_functions.hpp"
53 #include "opencl_kernels_dnn.hpp"
54 #endif
55
56 #ifdef HAVE_CUDA
57 #include "../cuda4dnn/primitives/mvn.hpp"
58 using namespace cv::dnn::cuda4dnn;
59 #endif
60
61 namespace cv
62 {
63 namespace dnn
64 {
65
// Implementation of the MVN (Mean-Variance Normalization) layer.
// For each "row" (a slice of the input defined by across_channels) the layer
// subtracts the mean and, if normalize_variance is set, divides by the
// standard deviation (stabilized by eps). A following scale/shift
// (batch-norm style) layer and/or a ReLU can be fused into this layer.
class MVNLayerImpl CV_FINAL : public MVNLayer
{
public:
    // Reads layer parameters:
    //   normalize_variance (default true)  - divide by stddev, not just de-mean
    //   across_channels    (default false) - normalize over C*H*W instead of H*W
    //   eps                (default 1e-9)  - variance stabilizer
    // Fusion-related state starts disabled; zeroDev is computed in finalize().
    MVNLayerImpl(const LayerParams& params)
    {
        setParamsFrom(params);
        normVariance = params.get<bool>("normalize_variance", true);
        acrossChannels = params.get<bool>("across_channels", false);
        eps = params.get<double>("eps", 1e-9);
        fuse_batch_norm = false;
        fuse_relu = false;
        relu_slope = 0.f;
        zeroDev = false;
    }

    // Per-row scale/shift fused from a following layer (see setActivation).
    Mat scale, shift;
#ifdef HAVE_OPENCL
    // Lazily-uploaded OpenCL copies of scale/shift; released in finalize().
    UMat umat_scale, umat_shift;
#endif
    bool fuse_batch_norm;   // true when scale/shift fusion is active

    Ptr<ReLULayer> activ_relu;  // fused ReLU (OpenCL target only)
    float relu_slope;           // negative slope of the fused ReLU
    bool fuse_relu;             // true when ReLU fusion is active
    bool zeroDev; // true when each normalization group has a single element
                  // (variance is zero). TODO: not taken into account in
                  // Intel's Inference Engine backend.

    // Tries to fuse the given follow-up layer into this one.
    // First a scale/shift-providing layer (batch norm etc.) is accepted;
    // after that, on the OpenCL target only, a ReLU may additionally be fused.
    // Returns true when the layer was fused (caller then drops it).
    bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
    {
        if (!layer.empty() && !fuse_relu && !fuse_batch_norm)
        {
            layer->getScaleShift(scale, shift);
            // A layer that exposes neither scale nor shift cannot be fused here.
            fuse_batch_norm = !scale.empty() || !shift.empty();
            return fuse_batch_norm;
        }

        if (!layer.empty() && preferableTarget == DNN_TARGET_OPENCL)
        {
            activ_relu = layer.dynamicCast<ReLULayer>();
            if( !activ_relu.empty() )
                relu_slope = activ_relu->negativeSlope;
        }
        fuse_relu = !activ_relu.empty();
        return fuse_relu;
    }

    // Detects the degenerate case where every normalization group holds a
    // single value (total == number of rows), so stddev is always zero.
    // Also drops cached OpenCL buffers so they are re-uploaded on next run.
    void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
    {
        std::vector<Mat> inputs;
        inputs_arr.getMatVector(inputs);
        int splitDim = (acrossChannels) ? 1 : 2;
        int i, newRows = 1;
        for( i = 0; i < splitDim; i++ )
            newRows *= inputs[0].size[i];
        zeroDev = inputs[0].total() == newRows;
#ifdef HAVE_OPENCL
        umat_scale.release();
        umat_shift.release();
#endif
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
#ifdef HAVE_DNN_IE_NN_BUILDER_2019
        if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
        {
            // Myriad/HDDL plugins reject the zero-variance case and need a
            // sufficiently small eps; see the zeroDev TODO above.
            bool isMyriad = preferableTarget == DNN_TARGET_MYRIAD || preferableTarget == DNN_TARGET_HDDL;
            return !zeroDev && (!isMyriad || eps <= 1e-7f);
        }
#endif
#ifdef HAVE_DNN_NGRAPH
        if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
            return true;
#endif
        {
            return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA;
        }
    }

#ifdef HAVE_OPENCL
    // Vectorized (x4) OpenCL path used when normVariance is on and both the
    // row count and plane size are multiples of 4. Two kernels per input:
    // "mean_fuse4" computes per-row means, "mvn_fuse4" applies the
    // normalization plus any fused batch-norm/ReLU.
    bool fast_forward_ocl(std::vector<UMat> &inputs, std::vector<UMat> &outputs)
    {
        // Upload fused scale/shift once; finalize() invalidates the cache.
        if (umat_scale.empty() && !scale.empty())
            scale.copyTo(umat_scale);
        if (umat_shift.empty() && !shift.empty())
            shift.copyTo(umat_shift);
        UMat& bnorm_weight = umat_scale;
        UMat& bnorm_bias = umat_shift;

        const unsigned LOCAL_SIZE = 128;
        // CV_16S is used by the DNN module to store FP16 data.
        bool use_half = (inputs[0].depth() == CV_16S);
        String opts = format(" -DT=%s -DT4=%s -Dconvert_T=%s -DLOCAL_SIZE=%u", use_half ? "half" : "float",
                             use_half ? "half4" : "float4", use_half ? "convert_half4" : "convert_float4",
                             LOCAL_SIZE
        );

        int splitDim = (acrossChannels) ? 1 : 2;
        for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
        {
            UMat &inpMat = inputs[inpIdx];
            UMat &outMat = outputs[inpIdx];
            int newRows = total(shape(inpMat), 0, splitDim);
            CV_Assert(newRows != 0);

            // View the input as (rows x plane) for per-row statistics.
            MatShape s = shape(newRows, inpMat.total() / newRows);
            UMat meanMat = UMat(s[0], 1, (use_half) ? CV_16S : CV_32F);
            UMat tmpMat = UMat(s[0], s[1], CV_32F);
            float alpha = 1.0f / s[1];  // 1/N for mean computation

            String buildopt = "-DNUM=4" + opts;
            ocl::Kernel k("mean_fuse4", ocl::dnn::mvn_oclsrc, buildopt + " -DKERNEL_MEAN_FUSE");
            size_t localsize[] = { LOCAL_SIZE };
            // One work-group per 4 rows (s[0] is a multiple of 4 here).
            size_t globalsize[] = { (size_t)s[0] / 4 * localsize[0] };

            // NOTE: argument order must match the kernel signature in
            // opencl/mvn.cl — do not reorder.
            int argId = 0;
            k.set(argId++, ocl::KernelArg::PtrReadOnly(inpMat));
            k.set(argId++, (int)s[1]);
            k.set(argId++, alpha);
            k.set(argId++, ocl::KernelArg::PtrWriteOnly(meanMat));
            k.set(argId++, ocl::KernelArg::PtrWriteOnly(tmpMat));
            bool ret = k.run(1, globalsize, localsize, false);
            if (!ret)
                return false;

            buildopt += format(" %s %s", (fuse_batch_norm) ? "-DFUSE_BATCH_NORM" : "",
                               (fuse_relu) ? "-DFUSE_RELU" : "");

            ocl::Kernel k1("mvn_fuse4", ocl::dnn::mvn_oclsrc, buildopt + " -DKERNEL_MVN_FUSE");
            argId = 0;
            k1.set(argId++, ocl::KernelArg::PtrReadOnly(tmpMat));
            k1.set(argId++, ocl::KernelArg::PtrReadOnly(inpMat));
            k1.set(argId++, ocl::KernelArg::PtrReadOnly(meanMat));
            k1.set(argId++, (int)s[1]);
            k1.set(argId++, (float)alpha);
            k1.set(argId++, (float)eps);
            k1.set(argId++, (float)relu_slope);
            k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_weight));
            k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_bias));
            k1.set(argId++, ocl::KernelArg::PtrWriteOnly(outMat));
            ret = k1.run(1, globalsize, localsize, false);
            if (!ret)
                return false;
        }
        return true;
    }

    // General OpenCL path (FP32 only). Uses GEMV against a ones-vector to
    // compute per-row means/variances, then a single "mvn" kernel to apply
    // normalization and any fused batch-norm/ReLU.
    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
    {
        if (umat_scale.empty() && !scale.empty())
            scale.copyTo(umat_scale);
        if (umat_shift.empty() && !shift.empty())
            shift.copyTo(umat_shift);
        UMat& bnorm_weight = umat_scale;
        UMat& bnorm_bias = umat_shift;

        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        inputs_.getUMatVector(inputs);
        outputs_.getUMatVector(outputs);

        int splitDim = (acrossChannels) ? 1 : 2;
        int row_size = total(shape(inputs[0]), 0, splitDim);
        int plane_size = total(shape(inputs[0]), splitDim);
        // Prefer the vectorized path when the geometry allows it.
        if (normVariance && (row_size % 4 == 0) && (plane_size % 4 == 0))
            return fast_forward_ocl(inputs, outputs);

        // FP16 (stored as CV_16S) is only handled by the fast path;
        // fall back to the CPU implementation otherwise.
        if (inputs[0].depth() == CV_16S)
            return false;

        String opts = format(" -DT=float -DT4=float4 -Dconvert_T=convert_float4");

        for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
        {
            UMat &inpMat = inputs[inpIdx];
            UMat &outMat = outputs[inpIdx];
            int newRows = total(shape(inpMat), 0, splitDim);
            CV_Assert(newRows != 0);

            MatShape s = shape(newRows, inpMat.total() / newRows);
            UMat oneMat = UMat::ones(s[1], 1, CV_32F);
            UMat meanMat = UMat(s[0], 1, CV_32F);
            UMat devMat = UMat(s[0], 1, CV_32F);
            UMat tmpMat = UMat(s[0], s[1], CV_32F);
            float alpha = 1.0f / s[1];

            // mean = (1/N) * inp * ones
            bool ret = ocl4dnn::ocl4dnnGEMV<float>(ocl4dnn::CblasNoTrans, s[0], s[1], alpha,
                                                   inpMat, 0, oneMat, 0, 0.0f, meanMat, 0);
            if (!ret)
                return false;

            // Pick the widest vectorization the plane size divides evenly.
            int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1);
            size_t global[] = { (size_t)s[0], (size_t)(s[1] / number) };
            String buildopt = format("-DNUM=%d", number) + opts;
            if (normVariance)
            {
                // tmp = (inp - mean)^2, then dev = (1/N) * tmp * ones
                String kname = format("calc_mean%d", number);
                ocl::Kernel kernel(kname.c_str(), ocl::dnn::mvn_oclsrc, buildopt + " -DKERNEL_MEAN");
                if (kernel.empty())
                    return false;

                kernel.set(0, ocl::KernelArg::PtrReadOnly(inpMat));
                kernel.set(1, (int)s[0]);
                kernel.set(2, (int)s[1]);
                kernel.set(3, ocl::KernelArg::PtrReadOnly(meanMat));
                kernel.set(4, ocl::KernelArg::PtrWriteOnly(tmpMat));
                ret = kernel.run(2, global, NULL, false);
                if (!ret)
                    return false;

                ret = ocl4dnn::ocl4dnnGEMV<float>(ocl4dnn::CblasNoTrans, s[0], s[1], alpha,
                                                  tmpMat, 0, oneMat, 0, 0.0f, devMat, 0);
                if (!ret)
                    return false;
            }

            String kname = format("mvn%d", number);
            buildopt += format("%s%s%s -DKERNEL_MVN", (normVariance) ? " -DNORM_VARIANCE" : "",
                               (fuse_batch_norm) ? " -DFUSE_BATCH_NORM" : "",
                               (fuse_relu) ? " -DFUSE_RELU" : "");
            ocl::Kernel kernel1(kname.c_str(), ocl::dnn::mvn_oclsrc, buildopt);
            if (kernel1.empty())
                return false;
            kernel1.set(0, ocl::KernelArg::PtrReadOnly(inpMat));
            kernel1.set(1, (int)s[0]);
            kernel1.set(2, (int)s[1]);
            kernel1.set(3, (float)eps);
            kernel1.set(4, ocl::KernelArg::PtrReadOnly(meanMat));
            kernel1.set(5, ocl::KernelArg::PtrReadOnly(devMat));
            kernel1.set(6, ocl::KernelArg::PtrReadOnly(bnorm_weight));
            kernel1.set(7, ocl::KernelArg::PtrReadOnly(bnorm_bias));
            kernel1.set(8, (int)inpMat.size[1]);
            kernel1.set(9, (float)relu_slope);
            kernel1.set(10, ocl::KernelArg::PtrWriteOnly(outMat));
            ret = kernel1.run(2, global, NULL, false);
            if (!ret)
                return false;
        }
        return true;
    }
#endif

    // CPU reference path: per-row meanStdDev followed by a fused
    // convertTo(scale, shift), optionally folding in batch-norm weights.
    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        // FP16 inputs are converted to FP32 by the generic fallback.
        if (inputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> inputs, outputs, internals;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);
        internals_arr.getMatVector(internals);

        for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
        {
            Mat &inpBlob = inputs[inpIdx];
            Mat &outBlob = outputs[inpIdx];

            int splitDim = (acrossChannels) ? 1 : 2;
            int i, newRows = 1;
            for( i = 0; i < splitDim; i++ )
                newRows *= inpBlob.size[i];

            Mat inpMat = inpBlob.reshape(1, newRows);
            Mat outMat = outBlob.reshape(1, newRows);

            if ( inpBlob.total() == newRows )
            {
                // Degenerate case (zeroDev): every row holds a single value,
                // so x - mean == 0 and the output is just the fused shift.
                if (shift.empty())
                {
                    outBlob.setTo(0);
                }
                else
                {
                    for ( i = 0; i < newRows; i++ )
                    {
                        outMat.row(i).setTo(((float*)shift.data)[i]);
                    }
                }
                // NOTE(review): this returns from the whole forward(), so any
                // remaining inputs (inpIdx+1..) are left unprocessed — looks
                // intentional only for single-input graphs; confirm upstream.
                return;
            }

            Scalar mean, dev;
            for ( i = 0; i < newRows; i++)
            {
                Mat inpRow = inpMat.row(i);
                Mat outRow = outMat.row(i);
                float weight = 1.f;
                float bias = 0.f;
                if (fuse_batch_norm)
                {
                    // scale/shift may be shorter than newRows; out-of-range
                    // rows keep the identity weight/bias.
                    weight = i < scale.cols ? ((float*)scale.data)[i] : weight;
                    bias = i < shift.cols ? ((float*)shift.data)[i] : bias;
                }
                cv::meanStdDev(inpRow, mean, (normVariance) ? dev : noArray());
                double alpha = 1;
                if (normVariance)
                {
                    alpha = 1 / std::sqrt(eps + dev[0]*dev[0]);
                }
                // out = alpha*weight * x + (-mean*alpha*weight + bias)
                double normalizationScale = 1.0;
                double normalizationShift = 0.0;
                if (fuse_batch_norm)
                {
                    normalizationScale = alpha * weight;
                    normalizationShift = -mean[0] * normalizationScale + bias;
                }
                else
                {
                    normalizationScale = alpha;
                    normalizationShift = -mean[0] * alpha;
                }
                inpRow.convertTo(outRow, outRow.type(), normalizationScale, normalizationShift);
            }
        }
    }

#ifdef HAVE_DNN_IE_NN_BUILDER_2019
    // Maps the layer onto the (legacy) Inference Engine builder API.
    // Fused scale/shift and the zeroDev case are NOT forwarded here.
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
        InferenceEngine::Builder::MVNLayer ieLayer(name);
        ieLayer.setAcrossChannels(acrossChannels);
        ieLayer.setNormalize(normVariance);
        ieLayer.setEpsilon(eps);
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
    }
#endif  // HAVE_DNN_IE_NN_BUILDER_2019

#ifdef HAVE_DNN_NGRAPH
    // Builds an nGraph MVN node. Newer IE releases use the v6 op, which
    // takes an explicit axes list instead of an across_channels flag.
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2021_2)
        auto mvn = std::make_shared<ngraph::op::MVN>(ieInpNode, acrossChannels, normVariance, eps);
#else
        // Normalize over all axes from start_axis to the end (matches the
        // splitDim convention used elsewhere in this class).
        int64_t start_axis = acrossChannels ? 1 : 2;
        std::vector<int64_t> axes_v(ieInpNode->get_shape().size() - start_axis);
        std::iota(axes_v.begin(), axes_v.end(), start_axis);
        auto axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{axes_v.size()}, axes_v.data());
        auto mvn = std::make_shared<ngraph::op::v6::MVN>(ieInpNode, axes, normVariance, eps, ngraph::op::MVNEpsMode::INSIDE_SQRT);
#endif
        return Ptr<BackendNode>(new InfEngineNgraphNode(mvn));
    }
#endif  // HAVE_DNN_NGRAPH

#ifdef HAVE_CUDA
    // Builds the CUDA backend node; only layer geometry and parameters are
    // passed — fused scale/shift/ReLU are not forwarded to this backend.
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);

        cuda4dnn::MVNConfiguration config;
        config.split_axis = acrossChannels ? 1 : 2;
        config.normalize_variance = normVariance;
        config.epsilon = eps;
        config.input_shapes.resize(inputs.size());
        for (int i = 0; i < inputs.size(); i++)
        {
            auto wrapper = inputs[i].dynamicCast<CUDABackendWrapper>();
            auto shape = wrapper->getShape();
            config.input_shapes[i].assign(std::begin(shape), std::end(shape));
        }

        return make_cuda_node<cuda4dnn::MVNOp>(preferableTarget, std::move(context->stream), config);
    }
#endif

    // Rough per-element operation count: 6 ops per value plus 3 per
    // normalization group for the statistics (heuristic, not exact).
    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
    {
        CV_UNUSED(outputs); // suppress unused variable warning
        long flops = 0;
        for(int i = 0; i < inputs.size(); i++)
        {
            flops += 6*total(inputs[i]) + 3*total(inputs[i], 0, normVariance ? 2 : 1);
        }
        return flops;
    }
};
456
create(const LayerParams & params)457 Ptr<MVNLayer> MVNLayer::create(const LayerParams& params)
458 {
459 return Ptr<MVNLayer>(new MVNLayerImpl(params));
460 }
461
462 }
463 }
464