/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include "../op_cuda.hpp"

#include <opencv2/dnn/shape_utils.hpp>

#ifdef HAVE_OPENCL
#include "../ocl4dnn/include/math_functions.hpp"
#include "opencl_kernels_dnn.hpp"
#endif

#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/mvn.hpp"
using namespace cv::dnn::cuda4dnn;
#endif

namespace cv
{
namespace dnn
{

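// MVN (Mean-Variance Normalization) layer: for every input blob, subtracts the
// per-slice mean and optionally divides by the per-slice standard deviation.
// A slice spans all dims from axis 1 (when "across_channels" is set) or from
// axis 2 (per-channel normalization), as selected by splitDim below.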
class MVNLayerImpl CV_FINAL : public MVNLayer
{
public:
    MVNLayerImpl(const LayerParams& params)
    {
        setParamsFrom(params);
        normVariance = params.get<bool>("normalize_variance", true);
        acrossChannels = params.get<bool>("across_channels", false);
        eps = params.get<double>("eps", 1e-9);
        fuse_batch_norm = false;
        fuse_relu = false;
        relu_slope = 0.f;
        zeroDev = false;
    }

    Mat scale, shift;
#ifdef HAVE_OPENCL
    UMat umat_scale, umat_shift;
#endif
    bool fuse_batch_norm;

    Ptr<ReLULayer> activ_relu;
    float relu_slope;
    bool fuse_relu;
    bool zeroDev;  // TODO: not taken into account by the Inference Engine backend.
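
    // Fusion hook: a subsequent scale/shift layer (e.g. batch norm) is folded
    // into this layer's scale/shift; on OpenCL targets a subsequent ReLU can be
    // fused into the kernels as well.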
    bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
    {
        if (!layer.empty() && !fuse_relu && !fuse_batch_norm)
        {
            layer->getScaleShift(scale, shift);
            fuse_batch_norm = !scale.empty() || !shift.empty();
            return fuse_batch_norm;
        }

        if (!layer.empty() && preferableTarget == DNN_TARGET_OPENCL)
        {
            activ_relu = layer.dynamicCast<ReLULayer>();
            if( !activ_relu.empty() )
                relu_slope = activ_relu->negativeSlope;
        }
        fuse_relu = !activ_relu.empty();
        return fuse_relu;
    }

    void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
    {
        std::vector<Mat> inputs;
        inputs_arr.getMatVector(inputs);
        int splitDim = (acrossChannels) ? 1 : 2;
        int i, newRows = 1;
        for( i = 0; i < splitDim; i++ )
            newRows *= inputs[0].size[i];
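        // If every normalized slice holds a single element, its deviation is
        // zero and normalization degenerates; forward() handles this case
        // separately.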
        zeroDev = inputs[0].total() == newRows;
#ifdef HAVE_OPENCL
        umat_scale.release();
        umat_shift.release();
#endif
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
#ifdef HAVE_DNN_IE_NN_BUILDER_2019
        if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
        {
            bool isMyriad = preferableTarget == DNN_TARGET_MYRIAD || preferableTarget == DNN_TARGET_HDDL;
            return !zeroDev && (!isMyriad || eps <= 1e-7f);
        }
#endif
#ifdef HAVE_DNN_NGRAPH
        if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
            return true;
#endif
        return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA;
    }

#ifdef HAVE_OPENCL
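    // Fast OpenCL path: two fused kernels (mean + normalization) processing the
    // data in 4-element vectors. The caller guarantees variance normalization is
    // on and that row and plane sizes are divisible by 4. CV_16S depth denotes
    // FP16 data in the DNN module.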
    bool fast_forward_ocl(std::vector<UMat> &inputs, std::vector<UMat> &outputs)
    {
        if (umat_scale.empty() && !scale.empty())
            scale.copyTo(umat_scale);
        if (umat_shift.empty() && !shift.empty())
            shift.copyTo(umat_shift);
        UMat& bnorm_weight = umat_scale;
        UMat& bnorm_bias = umat_shift;

        const unsigned LOCAL_SIZE = 128;
        bool use_half = (inputs[0].depth() == CV_16S);
        String opts = format(" -DT=%s -DT4=%s -Dconvert_T=%s -DLOCAL_SIZE=%u", use_half ? "half" : "float",
                             use_half ? "half4" : "float4", use_half ? "convert_half4" : "convert_float4",
                             LOCAL_SIZE
        );

        int splitDim = (acrossChannels) ? 1 : 2;
        for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
        {
            UMat &inpMat = inputs[inpIdx];
            UMat &outMat = outputs[inpIdx];
            int newRows = total(shape(inpMat), 0, splitDim);
            CV_Assert(newRows != 0);

            MatShape s = shape(newRows, inpMat.total() / newRows);
            UMat meanMat = UMat(s[0], 1, (use_half) ? CV_16S : CV_32F);
            UMat tmpMat  = UMat(s[0], s[1], CV_32F);
            float alpha = 1.0f / s[1];

            String buildopt = "-DNUM=4" + opts;
            ocl::Kernel k("mean_fuse4", ocl::dnn::mvn_oclsrc, buildopt + " -DKERNEL_MEAN_FUSE");
            size_t localsize[] = { LOCAL_SIZE };
            size_t globalsize[] = { (size_t)s[0] / 4 * localsize[0] };

            int argId = 0;
            k.set(argId++, ocl::KernelArg::PtrReadOnly(inpMat));
            k.set(argId++, (int)s[1]);
            k.set(argId++, alpha);
            k.set(argId++, ocl::KernelArg::PtrWriteOnly(meanMat));
            k.set(argId++, ocl::KernelArg::PtrWriteOnly(tmpMat));
            bool ret = k.run(1, globalsize, localsize, false);
            if (!ret)
                return false;

            buildopt += format(" %s %s", (fuse_batch_norm) ? "-DFUSE_BATCH_NORM" : "",
                               (fuse_relu) ? "-DFUSE_RELU" : "");

            ocl::Kernel k1("mvn_fuse4", ocl::dnn::mvn_oclsrc, buildopt + " -DKERNEL_MVN_FUSE");
            argId = 0;
            k1.set(argId++, ocl::KernelArg::PtrReadOnly(tmpMat));
            k1.set(argId++, ocl::KernelArg::PtrReadOnly(inpMat));
            k1.set(argId++, ocl::KernelArg::PtrReadOnly(meanMat));
            k1.set(argId++, (int)s[1]);
            k1.set(argId++, (float)alpha);
            k1.set(argId++, (float)eps);
            k1.set(argId++, (float)relu_slope);
            k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_weight));
            k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_bias));
            k1.set(argId++, ocl::KernelArg::PtrWriteOnly(outMat));
            ret = k1.run(1, globalsize, localsize, false);
            if (!ret)
                return false;
        }
        return true;
    }

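    // Generic OpenCL path: the per-row mean is computed as a GEMV against a ones
    // vector scaled by 1/cols; when variance normalization is on, a second GEMV
    // averages the squared deviations produced by the calc_mean kernel, and the
    // final mvn kernel applies the normalization (with optional fused
    // batch-norm scale/shift and ReLU).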
    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
    {
        if (umat_scale.empty() && !scale.empty())
            scale.copyTo(umat_scale);
        if (umat_shift.empty() && !shift.empty())
            shift.copyTo(umat_shift);
        UMat& bnorm_weight = umat_scale;
        UMat& bnorm_bias = umat_shift;

        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        inputs_.getUMatVector(inputs);
        outputs_.getUMatVector(outputs);

        int splitDim = (acrossChannels) ? 1 : 2;
        int row_size = total(shape(inputs[0]), 0, splitDim);
        int plane_size = total(shape(inputs[0]), splitDim);
        if (normVariance && (row_size % 4 == 0) && (plane_size % 4 == 0))
            return fast_forward_ocl(inputs, outputs);

        if (inputs[0].depth() == CV_16S)
            return false;

        String opts = format(" -DT=float -DT4=float4 -Dconvert_T=convert_float4");

        for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
        {
            UMat &inpMat = inputs[inpIdx];
            UMat &outMat = outputs[inpIdx];
            int newRows = total(shape(inpMat), 0, splitDim);
            CV_Assert(newRows != 0);

            MatShape s = shape(newRows, inpMat.total() / newRows);
            UMat oneMat = UMat::ones(s[1], 1, CV_32F);
            UMat meanMat = UMat(s[0], 1, CV_32F);
            UMat devMat  = UMat(s[0], 1, CV_32F);
            UMat tmpMat  = UMat(s[0], s[1], CV_32F);
            float alpha = 1.0f / s[1];

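            // Row means: inpMat (rows x cols) times a ones vector, scaled by 1/cols.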
            bool ret = ocl4dnn::ocl4dnnGEMV<float>(ocl4dnn::CblasNoTrans, s[0], s[1], alpha,
                                                   inpMat, 0, oneMat, 0, 0.0f, meanMat, 0);
            if (!ret)
                return false;

            int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1);
            size_t global[] = { (size_t)s[0], (size_t)(s[1] / number) };
            String buildopt = format("-DNUM=%d", number) + opts;
            if (normVariance)
            {
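                // calc_mean writes the squared deviations (x - mean)^2 into
                // tmpMat; averaging them with a second GEMV yields the per-row
                // variance in devMat.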
                String kname = format("calc_mean%d", number);
                ocl::Kernel kernel(kname.c_str(), ocl::dnn::mvn_oclsrc, buildopt + " -DKERNEL_MEAN");
                if (kernel.empty())
                    return false;

                kernel.set(0, ocl::KernelArg::PtrReadOnly(inpMat));
                kernel.set(1, (int)s[0]);
                kernel.set(2, (int)s[1]);
                kernel.set(3, ocl::KernelArg::PtrReadOnly(meanMat));
                kernel.set(4, ocl::KernelArg::PtrWriteOnly(tmpMat));
                ret = kernel.run(2, global, NULL, false);
                if (!ret)
                    return false;

                ret = ocl4dnn::ocl4dnnGEMV<float>(ocl4dnn::CblasNoTrans, s[0], s[1], alpha,
                                                  tmpMat, 0, oneMat, 0, 0.0f, devMat, 0);
                if (!ret)
                    return false;
            }

            String kname = format("mvn%d", number);
            buildopt += format("%s%s%s -DKERNEL_MVN", (normVariance) ? " -DNORM_VARIANCE" : "",
                               (fuse_batch_norm) ? " -DFUSE_BATCH_NORM" : "",
                               (fuse_relu) ? " -DFUSE_RELU" : "");
            ocl::Kernel kernel1(kname.c_str(), ocl::dnn::mvn_oclsrc, buildopt);
            if (kernel1.empty())
                return false;
            kernel1.set(0, ocl::KernelArg::PtrReadOnly(inpMat));
            kernel1.set(1, (int)s[0]);
            kernel1.set(2, (int)s[1]);
            kernel1.set(3, (float)eps);
            kernel1.set(4, ocl::KernelArg::PtrReadOnly(meanMat));
            kernel1.set(5, ocl::KernelArg::PtrReadOnly(devMat));
            kernel1.set(6, ocl::KernelArg::PtrReadOnly(bnorm_weight));
            kernel1.set(7, ocl::KernelArg::PtrReadOnly(bnorm_bias));
            kernel1.set(8, (int)inpMat.size[1]);
            kernel1.set(9, (float)relu_slope);
            kernel1.set(10, ocl::KernelArg::PtrWriteOnly(outMat));
            ret = kernel1.run(2, global, NULL, false);
            if (!ret)
                return false;
        }
        return true;
    }
#endif

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        if (inputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> inputs, outputs, internals;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);
        internals_arr.getMatVector(internals);

        for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
        {
            Mat &inpBlob = inputs[inpIdx];
            Mat &outBlob = outputs[inpIdx];

            int splitDim = (acrossChannels) ? 1 : 2;
            int i, newRows = 1;
            for( i = 0; i < splitDim; i++ )
                newRows *= inpBlob.size[i];

            Mat inpMat = inpBlob.reshape(1, newRows);
            Mat outMat = outBlob.reshape(1, newRows);

            if ( inpBlob.total() == newRows )
            {
                // Each row holds a single value, so normalization zeroes it;
                // the output reduces to the fused shift (or zero).
                if (shift.empty())
                {
                    outBlob.setTo(0);
                }
                else
                {
                    for ( i = 0; i < newRows; i++ )
                    {
                        outMat.row(i).setTo(((float*)shift.data)[i]);
                    }
                }
                return;
            }

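            // Normalize each row independently: out = (in - mean) * alpha, with
            // alpha = 1/sqrt(var + eps) when variance normalization is on; any
            // fused batch-norm weight/bias is folded into the final scale/shift
            // passed to convertTo.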
            Scalar mean, dev;
            for ( i = 0; i < newRows; i++)
            {
                Mat inpRow = inpMat.row(i);
                Mat outRow = outMat.row(i);
                float weight = 1.f;
                float bias = 0.f;
                if (fuse_batch_norm)
                {
                    weight = i < scale.cols ? ((float*)scale.data)[i] : weight;
                    bias = i < shift.cols ? ((float*)shift.data)[i] : bias;
                }
                cv::meanStdDev(inpRow, mean, (normVariance) ? dev : noArray());
                double alpha = 1;
                if (normVariance)
                {
                    alpha = 1 / std::sqrt(eps + dev[0]*dev[0]);
                }
                double normalizationScale = 1.0;
                double normalizationShift = 0.0;
                if (fuse_batch_norm)
                {
                    normalizationScale = alpha * weight;
                    normalizationShift = -mean[0] * normalizationScale + bias;
                }
                else
                {
                    normalizationScale = alpha;
                    normalizationShift = -mean[0] * alpha;
                }
                inpRow.convertTo(outRow, outRow.type(), normalizationScale, normalizationShift);
            }
        }
    }

#ifdef HAVE_DNN_IE_NN_BUILDER_2019
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
        InferenceEngine::Builder::MVNLayer ieLayer(name);
        ieLayer.setAcrossChannels(acrossChannels);
        ieLayer.setNormalize(normVariance);
        ieLayer.setEpsilon(eps);
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
    }
#endif  // HAVE_DNN_IE_NN_BUILDER_2019

#ifdef HAVE_DNN_NGRAPH
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2021_2)
        auto mvn = std::make_shared<ngraph::op::MVN>(ieInpNode, acrossChannels, normVariance, eps);
#else
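        // Newer releases use MVN-6, which takes the reduction axes explicitly:
        // every axis after the split axis.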
        int64_t start_axis = acrossChannels ? 1 : 2;
        std::vector<int64_t> axes_v(ieInpNode->get_shape().size() - start_axis);
        std::iota(axes_v.begin(), axes_v.end(), start_axis);
        auto axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{axes_v.size()}, axes_v.data());
        auto mvn = std::make_shared<ngraph::op::v6::MVN>(ieInpNode, axes, normVariance, eps, ngraph::op::MVNEpsMode::INSIDE_SQRT);
#endif
        return Ptr<BackendNode>(new InfEngineNgraphNode(mvn));
    }
#endif  // HAVE_DNN_NGRAPH

#ifdef HAVE_CUDA
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);

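        // Describe the reduction (split axis, variance normalization, epsilon)
        // and record each input's shape for the CUDA MVN primitive.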
        cuda4dnn::MVNConfiguration config;
        config.split_axis = acrossChannels ? 1 : 2;
        config.normalize_variance = normVariance;
        config.epsilon = eps;
        config.input_shapes.resize(inputs.size());
        for (int i = 0; i < inputs.size(); i++)
        {
            auto wrapper = inputs[i].dynamicCast<CUDABackendWrapper>();
            auto shape = wrapper->getShape();
            config.input_shapes[i].assign(std::begin(shape), std::end(shape));
        }

        return make_cuda_node<cuda4dnn::MVNOp>(preferableTarget, std::move(context->stream), config);
    }
#endif

    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
    {
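        // Rough estimate: ~6 ops per element for the normalization pass plus
        // ~3 ops per normalized row for the mean/variance reduction.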
        CV_UNUSED(outputs); // suppress unused variable warning
        long flops = 0;
        for(int i = 0; i < inputs.size(); i++)
        {
            flops += 6*total(inputs[i]) + 3*total(inputs[i], 0, normVariance ? 2 : 1);
        }
        return flops;
    }
};

Ptr<MVNLayer> MVNLayer::create(const LayerParams& params)
{
    return Ptr<MVNLayer>(new MVNLayerImpl(params));
}

}
}