1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 //   * Redistribution's of source code must retain the above copyright notice,
21 //     this list of conditions and the following disclaimer.
22 //
23 //   * Redistribution's in binary form must reproduce the above copyright notice,
24 //     this list of conditions and the following disclaimer in the documentation
25 //     and/or other materials provided with the distribution.
26 //
27 //   * The name of the copyright holders may not be used to endorse or promote products
28 //     derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42 
43 #include "precomp.hpp"
44 #include "opencv2/core/hal/intrin.hpp"
45 #include "opencl_kernels_video.hpp"
46 
47 using namespace std;
48 #define EPS 0.001F
49 #define INF 1E+10F
50 
51 namespace cv {
52 
/* Implementation of the DIS (Dense Inverse Search) optical flow algorithm:
 * a coarse-to-fine scheme where, on each pyramid level, sparse per-patch flow is found via
 * inverse-search gradient descent, densified, and then refined with variational refinement.
 */
class DISOpticalFlowImpl CV_FINAL : public DISOpticalFlow
{
  public:
    DISOpticalFlowImpl();

    void calc(InputArray I0, InputArray I1, InputOutputArray flow) CV_OVERRIDE;
    void collectGarbage() CV_OVERRIDE;

  protected: //!< algorithm parameters
    int finest_scale, coarsest_scale; //!< pyramid levels between which the flow is computed (0 == full resolution)
    int patch_size;                   //!< side length of the square patches used in inverse search
    int patch_stride;                 //!< step between patch centers on the sparse grid
    int grad_descent_iter;            //!< gradient-descent iterations per patch
    int variational_refinement_iter;  //!< fixed-point iterations of variational refinement per scale
    float variational_refinement_alpha; //!< smoothness term weight
    float variational_refinement_gamma; //!< gradient-constancy term weight
    float variational_refinement_delta; //!< color-constancy term weight
    bool use_mean_normalization;      //!< whether to use patch mean-normalization (robustness to lighting changes)
    bool use_spatial_propagation;     //!< whether to propagate flow hypotheses between neighboring patches

  protected: //!< some auxiliary variables
    int border_size; //!< size of the replicated border added around I1 on each scale
    int w, h;   //!< flow buffer width and height on the current scale
    int ws, hs; //!< sparse flow buffer width and height on the current scale

  public:
    int getFinestScale() const CV_OVERRIDE { return finest_scale; }
    void setFinestScale(int val) CV_OVERRIDE { finest_scale = val; }
    int getPatchSize() const CV_OVERRIDE { return patch_size; }
    void setPatchSize(int val) CV_OVERRIDE { patch_size = val; }
    int getPatchStride() const CV_OVERRIDE { return patch_stride; }
    void setPatchStride(int val) CV_OVERRIDE { patch_stride = val; }
    int getGradientDescentIterations() const CV_OVERRIDE { return grad_descent_iter; }
    void setGradientDescentIterations(int val) CV_OVERRIDE { grad_descent_iter = val; }
    int getVariationalRefinementIterations() const CV_OVERRIDE { return variational_refinement_iter; }
    void setVariationalRefinementIterations(int val) CV_OVERRIDE { variational_refinement_iter = val; }
    float getVariationalRefinementAlpha() const CV_OVERRIDE { return variational_refinement_alpha; }
    void setVariationalRefinementAlpha(float val) CV_OVERRIDE { variational_refinement_alpha = val; }
    float getVariationalRefinementDelta() const CV_OVERRIDE { return variational_refinement_delta; }
    void setVariationalRefinementDelta(float val) CV_OVERRIDE { variational_refinement_delta = val; }
    float getVariationalRefinementGamma() const CV_OVERRIDE { return variational_refinement_gamma; }
    void setVariationalRefinementGamma(float val) CV_OVERRIDE { variational_refinement_gamma = val; }

    bool getUseMeanNormalization() const CV_OVERRIDE { return use_mean_normalization; }
    void setUseMeanNormalization(bool val) CV_OVERRIDE { use_mean_normalization = val; }
    bool getUseSpatialPropagation() const CV_OVERRIDE { return use_spatial_propagation; }
    void setUseSpatialPropagation(bool val) CV_OVERRIDE { use_spatial_propagation = val; }

  protected:                      //!< internal buffers
    vector<Mat_<uchar> > I0s;     //!< Gaussian pyramid for the current frame
    vector<Mat_<uchar> > I1s;     //!< Gaussian pyramid for the next frame
    vector<Mat_<uchar> > I1s_ext; //!< I1s with borders

    vector<Mat_<short> > I0xs; //!< Gaussian pyramid for the x gradient of the current frame
    vector<Mat_<short> > I0ys; //!< Gaussian pyramid for the y gradient of the current frame

    vector<Mat_<float> > Ux; //!< x component of the flow vectors
    vector<Mat_<float> > Uy; //!< y component of the flow vectors

    vector<Mat_<float> > initial_Ux; //!< x component of the initial flow field, if one was passed as an input
    vector<Mat_<float> > initial_Uy; //!< y component of the initial flow field, if one was passed as an input

    Mat_<Vec2f> U; //!< a buffer for the merged flow

    Mat_<float> Sx; //!< intermediate sparse flow representation (x component)
    Mat_<float> Sy; //!< intermediate sparse flow representation (y component)

    /* Structure tensor components: */
    Mat_<float> I0xx_buf; //!< sum of squares of x gradient values
    Mat_<float> I0yy_buf; //!< sum of squares of y gradient values
    Mat_<float> I0xy_buf; //!< sum of x and y gradient products

    /* Extra buffers that are useful if patch mean-normalization is used: */
    Mat_<float> I0x_buf; //!< sum of x gradient values
    Mat_<float> I0y_buf; //!< sum of y gradient values

    /* Auxiliary buffers used in structure tensor computation (horizontal-pass results): */
    Mat_<float> I0xx_buf_aux;
    Mat_<float> I0yy_buf_aux;
    Mat_<float> I0xy_buf_aux;
    Mat_<float> I0x_buf_aux;
    Mat_<float> I0y_buf_aux;

    //!< one refinement instance per pyramid level, to avoid repeated allocations (see constructor)
    vector<Ptr<VariationalRefinement> > variational_refinement_processors;

  private: //!< private methods and parallel sections
    void prepareBuffers(Mat &I0, Mat &I1, Mat &flow, bool use_flow);
    void precomputeStructureTensor(Mat &dst_I0xx, Mat &dst_I0yy, Mat &dst_I0xy, Mat &dst_I0x, Mat &dst_I0y, Mat &I0x,
                                   Mat &I0y);
    int autoSelectCoarsestScale(int img_width);
    void autoSelectPatchSizeAndScales(int img_width);

    //!< parallel body: runs inverse-search gradient descent for patches of a horizontal stripe
    struct PatchInverseSearch_ParBody : public ParallelLoopBody
    {
        DISOpticalFlowImpl *dis;
        int nstripes, stripe_sz;
        int hs;
        Mat *Sx, *Sy, *Ux, *Uy, *I0, *I1, *I0x, *I0y;
        int num_iter, pyr_level;

        PatchInverseSearch_ParBody(DISOpticalFlowImpl &_dis, int _nstripes, int _hs, Mat &dst_Sx, Mat &dst_Sy,
                                   Mat &src_Ux, Mat &src_Uy, Mat &_I0, Mat &_I1, Mat &_I0x, Mat &_I0y, int _num_iter,
                                   int _pyr_level);
        void operator()(const Range &range) const CV_OVERRIDE;
    };

    //!< parallel body: densifies the sparse flow (Sx, Sy) into the per-pixel flow (Ux, Uy) for a stripe
    struct Densification_ParBody : public ParallelLoopBody
    {
        DISOpticalFlowImpl *dis;
        int nstripes, stripe_sz;
        int h;
        Mat *Ux, *Uy, *Sx, *Sy, *I0, *I1;

        Densification_ParBody(DISOpticalFlowImpl &_dis, int _nstripes, int _h, Mat &dst_Ux, Mat &dst_Uy, Mat &src_Sx,
                              Mat &src_Sy, Mat &_I0, Mat &_I1);
        void operator()(const Range &range) const CV_OVERRIDE;
    };

#ifdef HAVE_OPENCL
    /* OpenCL (UMat) counterparts of the CPU buffers above: */
    vector<UMat> u_I0s;     //!< Gaussian pyramid for the current frame
    vector<UMat> u_I1s;     //!< Gaussian pyramid for the next frame
    vector<UMat> u_I1s_ext; //!< I1s with borders

    vector<UMat> u_I0xs; //!< Gaussian pyramid for the x gradient of the current frame
    vector<UMat> u_I0ys; //!< Gaussian pyramid for the y gradient of the current frame

    vector<UMat> u_U; //!< (x,y) component of the flow vectors (CV_32FC2)
    vector<UMat> u_initial_U; //!< (x, y) components of the initial flow field, if one was passed as an input (CV_32FC2)

    UMat u_S; //!< intermediate sparse flow representation (x,y components - CV_32FC2)

    /* Structure tensor components: */
    UMat u_I0xx_buf; //!< sum of squares of x gradient values
    UMat u_I0yy_buf; //!< sum of squares of y gradient values
    UMat u_I0xy_buf; //!< sum of x and y gradient products

    /* Extra buffers that are useful if patch mean-normalization is used: */
    UMat u_I0x_buf; //!< sum of x gradient values
    UMat u_I0y_buf; //!< sum of y gradient values

    /* Auxiliary buffers used in structure tensor computation: */
    UMat u_I0xx_buf_aux;
    UMat u_I0yy_buf_aux;
    UMat u_I0xy_buf_aux;
    UMat u_I0x_buf_aux;
    UMat u_I0y_buf_aux;

    bool ocl_precomputeStructureTensor(UMat &dst_I0xx, UMat &dst_I0yy, UMat &dst_I0xy,
                                       UMat &dst_I0x, UMat &dst_I0y, UMat &I0x, UMat &I0y);
    void ocl_prepareBuffers(UMat &I0, UMat &I1, InputArray flow, bool use_flow);
    bool ocl_calc(InputArray I0, InputArray I1, InputOutputArray flow);
    bool ocl_Densification(UMat &dst_U, UMat &src_S, UMat &_I0, UMat &_I1);
    bool ocl_PatchInverseSearch(UMat &src_U,
                                UMat &I0, UMat &I1, UMat &I0x, UMat &I0y, int num_iter, int pyr_level);
#endif
};
209 
DISOpticalFlowImpl()210 DISOpticalFlowImpl::DISOpticalFlowImpl()
211 {
212     CV_INSTRUMENT_REGION();
213 
214     finest_scale = 2;
215     patch_size = 8;
216     patch_stride = 4;
217     grad_descent_iter = 16;
218     variational_refinement_iter = 5;
219     variational_refinement_alpha = 20.f;
220     variational_refinement_gamma = 10.f;
221     variational_refinement_delta = 5.f;
222 
223     border_size = 16;
224     use_mean_normalization = true;
225     use_spatial_propagation = true;
226     coarsest_scale = 10;
227 
228     /* Use separate variational refinement instances for different scales to avoid repeated memory allocation: */
229     int max_possible_scales = 10;
230     ws = hs = w = h = 0;
231     for (int i = 0; i < max_possible_scales; i++)
232         variational_refinement_processors.push_back(VariationalRefinement::create());
233 }
234 
/* Builds the image pyramids and (re)allocates all per-scale buffers for a new frame pair.
 * I0, I1   - current and next frame (single-channel; presumably 8-bit, matching the uchar pyramids)
 * flow     - an initial flow field (CV_32FC2), only read when use_flow is true
 * use_flow - whether an initial flow field was provided by the caller
 * Levels above finest_scale are skipped entirely since they are never processed.
 */
void DISOpticalFlowImpl::prepareBuffers(Mat &I0, Mat &I1, Mat &flow, bool use_flow)
{
    CV_INSTRUMENT_REGION();

    I0s.resize(coarsest_scale + 1);
    I1s.resize(coarsest_scale + 1);
    I1s_ext.resize(coarsest_scale + 1);
    I0xs.resize(coarsest_scale + 1);
    I0ys.resize(coarsest_scale + 1);
    Ux.resize(coarsest_scale + 1);
    Uy.resize(coarsest_scale + 1);

    Mat flow_uv[2];
    if (use_flow)
    {
        /* Keep x and y components of the initial flow separately, per scale: */
        split(flow, flow_uv);
        initial_Ux.resize(coarsest_scale + 1);
        initial_Uy.resize(coarsest_scale + 1);
    }

    int fraction = 1;              // downscaling factor of the current level (1, 2, 4, ...)
    int cur_rows = 0, cur_cols = 0;

    for (int i = 0; i <= coarsest_scale; i++)
    {
        /* Avoid initializing the pyramid levels above the finest scale, as they won't be used anyway */
        if (i == finest_scale)
        {
            /* The finest processed level is downscaled directly from the input images: */
            cur_rows = I0.rows / fraction;
            cur_cols = I0.cols / fraction;
            I0s[i].create(cur_rows, cur_cols);
            resize(I0, I0s[i], I0s[i].size(), 0.0, 0.0, INTER_AREA);
            I1s[i].create(cur_rows, cur_cols);
            resize(I1, I1s[i], I1s[i].size(), 0.0, 0.0, INTER_AREA);

            /* These buffers are reused in each scale so we initialize them once on the finest scale: */
            Sx.create(cur_rows / patch_stride, cur_cols / patch_stride);
            Sy.create(cur_rows / patch_stride, cur_cols / patch_stride);
            I0xx_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
            I0yy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
            I0xy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
            I0x_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
            I0y_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);

            /* Horizontal-pass buffers are sparse only along x (full height, sparse width): */
            I0xx_buf_aux.create(cur_rows, cur_cols / patch_stride);
            I0yy_buf_aux.create(cur_rows, cur_cols / patch_stride);
            I0xy_buf_aux.create(cur_rows, cur_cols / patch_stride);
            I0x_buf_aux.create(cur_rows, cur_cols / patch_stride);
            I0y_buf_aux.create(cur_rows, cur_cols / patch_stride);

            U.create(cur_rows, cur_cols);
        }
        else if (i > finest_scale)
        {
            /* Coarser levels are built by halving the previous level: */
            cur_rows = I0s[i - 1].rows / 2;
            cur_cols = I0s[i - 1].cols / 2;
            I0s[i].create(cur_rows, cur_cols);
            resize(I0s[i - 1], I0s[i], I0s[i].size(), 0.0, 0.0, INTER_AREA);
            I1s[i].create(cur_rows, cur_cols);
            resize(I1s[i - 1], I1s[i], I1s[i].size(), 0.0, 0.0, INTER_AREA);
        }

        if (i >= finest_scale)
        {
            /* Pad I1 with a replicated border so patch lookups slightly outside the image stay valid: */
            I1s_ext[i].create(cur_rows + 2 * border_size, cur_cols + 2 * border_size);
            copyMakeBorder(I1s[i], I1s_ext[i], border_size, border_size, border_size, border_size, BORDER_REPLICATE);
            I0xs[i].create(cur_rows, cur_cols);
            I0ys[i].create(cur_rows, cur_cols);
            spatialGradient(I0s[i], I0xs[i], I0ys[i]);
            Ux[i].create(cur_rows, cur_cols);
            Uy[i].create(cur_rows, cur_cols);
            variational_refinement_processors[i]->setAlpha(variational_refinement_alpha);
            variational_refinement_processors[i]->setDelta(variational_refinement_delta);
            variational_refinement_processors[i]->setGamma(variational_refinement_gamma);
            variational_refinement_processors[i]->setSorIterations(5);
            variational_refinement_processors[i]->setFixedPointIterations(variational_refinement_iter);

            if (use_flow)
            {
                /* Downscale the initial flow field to this level and rescale its magnitude accordingly: */
                resize(flow_uv[0], initial_Ux[i], Size(cur_cols, cur_rows));
                initial_Ux[i] /= fraction;
                resize(flow_uv[1], initial_Uy[i], Size(cur_cols, cur_rows));
                initial_Uy[i] /= fraction;
            }
        }

        fraction *= 2;
    }
}
324 
325 /* This function computes the structure tensor elements (local sums of I0x^2, I0x*I0y and I0y^2).
326  * A simple box filter is not used instead because we need to compute these sums on a sparse grid
327  * and store them densely in the output buffers.
328  */
/* Computes, for every patch on the sparse (ws x hs) grid, the sums of I0x^2, I0y^2, I0x*I0y,
 * I0x and I0y over the patch, using a separable sliding-window (box) filter:
 * first a horizontal pass over full-height rows (results stored in the *_buf_aux buffers),
 * then a vertical pass that produces the final dense outputs.
 * dst_* buffers are hs x ws; I0x, I0y are the w x h gradient images of the current scale.
 */
void DISOpticalFlowImpl::precomputeStructureTensor(Mat &dst_I0xx, Mat &dst_I0yy, Mat &dst_I0xy, Mat &dst_I0x,
                                                   Mat &dst_I0y, Mat &I0x, Mat &I0y)
{
    CV_INSTRUMENT_REGION();

    float *I0xx_ptr = dst_I0xx.ptr<float>();
    float *I0yy_ptr = dst_I0yy.ptr<float>();
    float *I0xy_ptr = dst_I0xy.ptr<float>();
    float *I0x_ptr = dst_I0x.ptr<float>();
    float *I0y_ptr = dst_I0y.ptr<float>();

    float *I0xx_aux_ptr = I0xx_buf_aux.ptr<float>();
    float *I0yy_aux_ptr = I0yy_buf_aux.ptr<float>();
    float *I0xy_aux_ptr = I0xy_buf_aux.ptr<float>();
    float *I0x_aux_ptr = I0x_buf_aux.ptr<float>();
    float *I0y_aux_ptr = I0y_buf_aux.ptr<float>();

    /* Separable box filter: horizontal pass */
    for (int i = 0; i < h; i++)
    {
        /* Seed the window with the first patch_size columns of this row: */
        float sum_xx = 0.0f, sum_yy = 0.0f, sum_xy = 0.0f, sum_x = 0.0f, sum_y = 0.0f;
        short *x_row = I0x.ptr<short>(i);
        short *y_row = I0y.ptr<short>(i);
        for (int j = 0; j < patch_size; j++)
        {
            sum_xx += x_row[j] * x_row[j];
            sum_yy += y_row[j] * y_row[j];
            sum_xy += x_row[j] * y_row[j];
            sum_x += x_row[j];
            sum_y += y_row[j];
        }
        I0xx_aux_ptr[i * ws] = sum_xx;
        I0yy_aux_ptr[i * ws] = sum_yy;
        I0xy_aux_ptr[i * ws] = sum_xy;
        I0x_aux_ptr[i * ws] = sum_x;
        I0y_aux_ptr[i * ws] = sum_y;
        int js = 1; // next sparse-grid column to fill
        /* Slide the window one column at a time, storing results only at patch_stride steps: */
        for (int j = patch_size; j < w; j++)
        {
            sum_xx += (x_row[j] * x_row[j] - x_row[j - patch_size] * x_row[j - patch_size]);
            sum_yy += (y_row[j] * y_row[j] - y_row[j - patch_size] * y_row[j - patch_size]);
            sum_xy += (x_row[j] * y_row[j] - x_row[j - patch_size] * y_row[j - patch_size]);
            sum_x += (x_row[j] - x_row[j - patch_size]);
            sum_y += (y_row[j] - y_row[j - patch_size]);
            if ((j - patch_size + 1) % patch_stride == 0)
            {
                I0xx_aux_ptr[i * ws + js] = sum_xx;
                I0yy_aux_ptr[i * ws + js] = sum_yy;
                I0xy_aux_ptr[i * ws + js] = sum_xy;
                I0x_aux_ptr[i * ws + js] = sum_x;
                I0y_aux_ptr[i * ws + js] = sum_y;
                js++;
            }
        }
    }

    /* Per-column accumulators for the vertical pass, one per sparse-grid column: */
    AutoBuffer<float> sum_xx(ws), sum_yy(ws), sum_xy(ws), sum_x(ws), sum_y(ws);
    for (int j = 0; j < ws; j++)
    {
        sum_xx[j] = 0.0f;
        sum_yy[j] = 0.0f;
        sum_xy[j] = 0.0f;
        sum_x[j] = 0.0f;
        sum_y[j] = 0.0f;
    }

    /* Separable box filter: vertical pass */
    /* Seed each column window with the first patch_size rows of the horizontal-pass output: */
    for (int i = 0; i < patch_size; i++)
        for (int j = 0; j < ws; j++)
        {
            sum_xx[j] += I0xx_aux_ptr[i * ws + j];
            sum_yy[j] += I0yy_aux_ptr[i * ws + j];
            sum_xy[j] += I0xy_aux_ptr[i * ws + j];
            sum_x[j] += I0x_aux_ptr[i * ws + j];
            sum_y[j] += I0y_aux_ptr[i * ws + j];
        }
    for (int j = 0; j < ws; j++)
    {
        I0xx_ptr[j] = sum_xx[j];
        I0yy_ptr[j] = sum_yy[j];
        I0xy_ptr[j] = sum_xy[j];
        I0x_ptr[j] = sum_x[j];
        I0y_ptr[j] = sum_y[j];
    }
    int is = 1; // next sparse-grid row to fill
    /* Slide the window one row at a time, storing results only at patch_stride steps: */
    for (int i = patch_size; i < h; i++)
    {
        for (int j = 0; j < ws; j++)
        {
            sum_xx[j] += (I0xx_aux_ptr[i * ws + j] - I0xx_aux_ptr[(i - patch_size) * ws + j]);
            sum_yy[j] += (I0yy_aux_ptr[i * ws + j] - I0yy_aux_ptr[(i - patch_size) * ws + j]);
            sum_xy[j] += (I0xy_aux_ptr[i * ws + j] - I0xy_aux_ptr[(i - patch_size) * ws + j]);
            sum_x[j] += (I0x_aux_ptr[i * ws + j] - I0x_aux_ptr[(i - patch_size) * ws + j]);
            sum_y[j] += (I0y_aux_ptr[i * ws + j] - I0y_aux_ptr[(i - patch_size) * ws + j]);
        }
        if ((i - patch_size + 1) % patch_stride == 0)
        {
            for (int j = 0; j < ws; j++)
            {
                I0xx_ptr[is * ws + j] = sum_xx[j];
                I0yy_ptr[is * ws + j] = sum_yy[j];
                I0xy_ptr[is * ws + j] = sum_xy[j];
                I0x_ptr[is * ws + j] = sum_x[j];
                I0y_ptr[is * ws + j] = sum_y[j];
            }
            is++;
        }
    }
}
438 
autoSelectCoarsestScale(int img_width)439 int DISOpticalFlowImpl::autoSelectCoarsestScale(int img_width)
440 {
441     const int fratio = 5;
442     return std::max(0, (int)std::floor(log2((2.0f*(float)img_width) / ((float)fratio * (float)patch_size))));
443 }
444 
autoSelectPatchSizeAndScales(int img_width)445 void DISOpticalFlowImpl::autoSelectPatchSizeAndScales(int img_width)
446 {
447     switch (finest_scale)
448     {
449     case 1:
450         patch_size = 8;
451         coarsest_scale = autoSelectCoarsestScale(img_width);
452         finest_scale = std::max(coarsest_scale-2, 0);
453         break;
454 
455     case 3:
456         patch_size = 12;
457         coarsest_scale = autoSelectCoarsestScale(img_width);
458         finest_scale = std::max(coarsest_scale-4, 0);
459         break;
460 
461     case 4:
462         patch_size = 12;
463         coarsest_scale = autoSelectCoarsestScale(img_width);
464         finest_scale = std::max(coarsest_scale-5, 0);
465         break;
466 
467     // default case, fall-through.
468     case 2:
469     default:
470         patch_size = 8;
471         coarsest_scale = autoSelectCoarsestScale(img_width);
472         finest_scale = std::max(coarsest_scale-2, 0);
473         break;
474     }
475 }
476 
/* Constructor: stores pointers to the input/output buffers of the patch inverse search stage
 * and precomputes the stripe height used to split the work across nstripes parallel workers.
 * dst_Sx/dst_Sy receive the sparse flow; src_Ux/src_Uy provide the initial flow estimate.
 */
DISOpticalFlowImpl::PatchInverseSearch_ParBody::PatchInverseSearch_ParBody(DISOpticalFlowImpl &_dis, int _nstripes,
                                                                           int _hs, Mat &dst_Sx, Mat &dst_Sy,
                                                                           Mat &src_Ux, Mat &src_Uy, Mat &_I0, Mat &_I1,
                                                                           Mat &_I0x, Mat &_I0y, int _num_iter,
                                                                           int _pyr_level)
    : dis(&_dis), nstripes(_nstripes), hs(_hs), Sx(&dst_Sx), Sy(&dst_Sy), Ux(&src_Ux), Uy(&src_Uy), I0(&_I0), I1(&_I1),
      I0x(&_I0x), I0y(&_I0y), num_iter(_num_iter), pyr_level(_pyr_level)
{
    /* Rows of the sparse grid per stripe, rounded up so all hs rows are covered: */
    stripe_sz = (int)ceil(hs / (double)nstripes);
}
487 
488 /////////////////////////////////////////////* Patch processing functions */////////////////////////////////////////////
489 
/* Some auxiliary macros shared by the SIMD (CV_SIMD128) patch-processing kernels below.
 * They declare and reuse common local variables (I1_row_*, I_diff_*), so they must be used
 * together inside one function, in order: INIT once before the row loop, then PROCESS and
 * NEXT_ROW inside it. All three assume an 8x8 patch and rows loadable as 8 unsigned bytes.
 */

/* Declares the accumulator/weight vectors and preloads (and expands to 32-bit) the first
 * row of I1 and its one-pixel-shifted copy, advancing I1_ptr to the next row: */
#define HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION                                                                         \
    v_float32x4 w00v = v_setall_f32(w00);                                                                              \
    v_float32x4 w01v = v_setall_f32(w01);                                                                              \
    v_float32x4 w10v = v_setall_f32(w10);                                                                              \
    v_float32x4 w11v = v_setall_f32(w11);                                                                              \
                                                                                                                       \
    v_uint16x8 I0_row_8, I1_row_8, I1_row_shifted_8, I1_row_next_8, I1_row_next_shifted_8, tmp;                        \
    v_uint32x4 I0_row_4_left, I1_row_4_left, I1_row_shifted_4_left, I1_row_next_4_left, I1_row_next_shifted_4_left;    \
    v_uint32x4 I0_row_4_right, I1_row_4_right, I1_row_shifted_4_right, I1_row_next_4_right,                            \
      I1_row_next_shifted_4_right;                                                                                     \
    v_float32x4 I_diff_left, I_diff_right;                                                                             \
                                                                                                                       \
    /* Preload and expand the first row of I1: */                                                                      \
    I1_row_8 = v_load_expand(I1_ptr);                                                                                  \
    I1_row_shifted_8 = v_load_expand(I1_ptr + 1);                                                                      \
    v_expand(I1_row_8, I1_row_4_left, I1_row_4_right);                                                                 \
    v_expand(I1_row_shifted_8, I1_row_shifted_4_left, I1_row_shifted_4_right);                                         \
    I1_ptr += I1_stride;

/* Computes I_diff_left/I_diff_right: the difference between the bilinearly interpolated
 * I1 row (weights w00..w11 over the current and next rows and their shifted copies)
 * and the current I0 row, in two 4-float halves: */
#define HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION                                                                      \
    /* Load the next row of I1: */                                                                                     \
    I1_row_next_8 = v_load_expand(I1_ptr);                                                                             \
    I1_row_next_shifted_8 = v_load_expand(I1_ptr + 1);                                                                 \
    /* Separate the left and right halves: */                                                                          \
    v_expand(I1_row_next_8, I1_row_next_4_left, I1_row_next_4_right);                                                  \
    v_expand(I1_row_next_shifted_8, I1_row_next_shifted_4_left, I1_row_next_shifted_4_right);                          \
                                                                                                                       \
    /* Load current row of I0: */                                                                                      \
    I0_row_8 = v_load_expand(I0_ptr);                                                                                  \
    v_expand(I0_row_8, I0_row_4_left, I0_row_4_right);                                                                 \
                                                                                                                       \
    /* Compute diffs between I0 and bilinearly interpolated I1: */                                                     \
    I_diff_left = w00v * v_cvt_f32(v_reinterpret_as_s32(I1_row_4_left)) +                                              \
                  w01v * v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_left)) +                                      \
                  w10v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_left)) +                                         \
                  w11v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_left)) -                                 \
                  v_cvt_f32(v_reinterpret_as_s32(I0_row_4_left));                                                      \
    I_diff_right = w00v * v_cvt_f32(v_reinterpret_as_s32(I1_row_4_right)) +                                            \
                   w01v * v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_right)) +                                    \
                   w10v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_right)) +                                       \
                   w11v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_right)) -                               \
                   v_cvt_f32(v_reinterpret_as_s32(I0_row_4_right));

/* Advances the row pointers and recycles the already-loaded "next" I1 row
 * as the "current" row of the following iteration: */
#define HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW                                                                     \
    I0_ptr += I0_stride;                                                                                               \
    I1_ptr += I1_stride;                                                                                               \
                                                                                                                       \
    I1_row_4_left = I1_row_next_4_left;                                                                                \
    I1_row_4_right = I1_row_next_4_right;                                                                              \
    I1_row_shifted_4_left = I1_row_next_shifted_4_left;                                                                \
    I1_row_shifted_4_right = I1_row_next_shifted_4_right;
542 
543 /* This function essentially performs one iteration of gradient descent when finding the most similar patch in I1 for a
544  * given one in I0. It assumes that I0_ptr and I1_ptr already point to the corresponding patches and w00, w01, w10, w11
545  * are precomputed bilinear interpolation weights. It returns the SSD (sum of squared differences) between these patches
546  * and computes the values (dst_dUx, dst_dUy) that are used in the flow vector update. HAL acceleration is implemented
547  * only for the default patch size (8x8). Everything is processed in floats as using fixed-point approximations harms
548  * the quality significantly.
549  */
inline float processPatch(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, uchar *I1_ptr, short *I0x_ptr, short *I0y_ptr,
                          int I0_stride, int I1_stride, float w00, float w01, float w10, float w11, int patch_sz)
{
    float SSD = 0.0f;
#if CV_SIMD128
    /* SIMD path, only for the default 8x8 patch size: */
    if (patch_sz == 8)
    {
        /* Variables to accumulate the sums */
        v_float32x4 Ux_vec = v_setall_f32(0);
        v_float32x4 Uy_vec = v_setall_f32(0);
        v_float32x4 SSD_vec = v_setall_f32(0);

        v_int16x8 I0x_row, I0y_row;
        v_int32x4 I0x_row_4_left, I0x_row_4_right, I0y_row_4_left, I0y_row_4_right;

        HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
        for (int row = 0; row < 8; row++)
        {
            /* I_diff_left/right now hold (interpolated I1 - I0) for this row: */
            HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
            I0x_row = v_load(I0x_ptr);
            v_expand(I0x_row, I0x_row_4_left, I0x_row_4_right);
            I0y_row = v_load(I0y_ptr);
            v_expand(I0y_row, I0y_row_4_left, I0y_row_4_right);

            /* Update the sums: */
            Ux_vec += I_diff_left * v_cvt_f32(I0x_row_4_left) + I_diff_right * v_cvt_f32(I0x_row_4_right);
            Uy_vec += I_diff_left * v_cvt_f32(I0y_row_4_left) + I_diff_right * v_cvt_f32(I0y_row_4_right);
            SSD_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;

            I0x_ptr += I0_stride;
            I0y_ptr += I0_stride;
            HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
        }

        /* Final reduce operations: */
        dst_dUx = v_reduce_sum(Ux_vec);
        dst_dUy = v_reduce_sum(Uy_vec);
        SSD = v_reduce_sum(SSD_vec);
    }
    else
#endif
    {
        /* Scalar fallback for arbitrary patch sizes.
         * Note: bilinear interpolation reads one extra row and column of I1 beyond the patch
         * (indices i + 1 and j + 1), so I1_ptr must point into a buffer with at least one
         * pixel of margin (the bordered I1s_ext images provide this). */
        dst_dUx = 0.0f;
        dst_dUy = 0.0f;
        float diff;
        for (int i = 0; i < patch_sz; i++)
            for (int j = 0; j < patch_sz; j++)
            {
                /* diff = bilinearly interpolated I1 sample minus the I0 pixel: */
                diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
                       w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
                       I0_ptr[i * I0_stride + j];

                SSD += diff * diff;
                /* Accumulate the gradient-weighted diffs used in the flow update: */
                dst_dUx += diff * I0x_ptr[i * I0_stride + j];
                dst_dUy += diff * I0y_ptr[i * I0_stride + j];
            }
    }
    return SSD;
}
609 
610 /* Same as processPatch, but with patch mean normalization, which improves robustness under changing
611  * lighting conditions
612  */
processPatchMeanNorm(float & dst_dUx,float & dst_dUy,uchar * I0_ptr,uchar * I1_ptr,short * I0x_ptr,short * I0y_ptr,int I0_stride,int I1_stride,float w00,float w01,float w10,float w11,int patch_sz,float x_grad_sum,float y_grad_sum)613 inline float processPatchMeanNorm(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, uchar *I1_ptr, short *I0x_ptr,
614                                   short *I0y_ptr, int I0_stride, int I1_stride, float w00, float w01, float w10,
615                                   float w11, int patch_sz, float x_grad_sum, float y_grad_sum)
616 {
617     float sum_diff = 0.0, sum_diff_sq = 0.0;
618     float sum_I0x_mul = 0.0, sum_I0y_mul = 0.0;
619     float n = (float)patch_sz * patch_sz;
620 
621 #if CV_SIMD128
622     if (patch_sz == 8)
623     {
624         /* Variables to accumulate the sums */
625         v_float32x4 sum_I0x_mul_vec = v_setall_f32(0);
626         v_float32x4 sum_I0y_mul_vec = v_setall_f32(0);
627         v_float32x4 sum_diff_vec = v_setall_f32(0);
628         v_float32x4 sum_diff_sq_vec = v_setall_f32(0);
629 
630         v_int16x8 I0x_row, I0y_row;
631         v_int32x4 I0x_row_4_left, I0x_row_4_right, I0y_row_4_left, I0y_row_4_right;
632 
633         HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
634         for (int row = 0; row < 8; row++)
635         {
636             HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
637             I0x_row = v_load(I0x_ptr);
638             v_expand(I0x_row, I0x_row_4_left, I0x_row_4_right);
639             I0y_row = v_load(I0y_ptr);
640             v_expand(I0y_row, I0y_row_4_left, I0y_row_4_right);
641 
642             /* Update the sums: */
643             sum_I0x_mul_vec += I_diff_left * v_cvt_f32(I0x_row_4_left) + I_diff_right * v_cvt_f32(I0x_row_4_right);
644             sum_I0y_mul_vec += I_diff_left * v_cvt_f32(I0y_row_4_left) + I_diff_right * v_cvt_f32(I0y_row_4_right);
645             sum_diff_sq_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
646             sum_diff_vec += I_diff_left + I_diff_right;
647 
648             I0x_ptr += I0_stride;
649             I0y_ptr += I0_stride;
650             HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
651         }
652 
653         /* Final reduce operations: */
654         sum_I0x_mul = v_reduce_sum(sum_I0x_mul_vec);
655         sum_I0y_mul = v_reduce_sum(sum_I0y_mul_vec);
656         sum_diff = v_reduce_sum(sum_diff_vec);
657         sum_diff_sq = v_reduce_sum(sum_diff_sq_vec);
658     }
659     else
660 #endif
661     {
662         float diff;
663         for (int i = 0; i < patch_sz; i++)
664             for (int j = 0; j < patch_sz; j++)
665             {
666                 diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
667                        w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
668                        I0_ptr[i * I0_stride + j];
669 
670                 sum_diff += diff;
671                 sum_diff_sq += diff * diff;
672 
673                 sum_I0x_mul += diff * I0x_ptr[i * I0_stride + j];
674                 sum_I0y_mul += diff * I0y_ptr[i * I0_stride + j];
675             }
676     }
677     dst_dUx = sum_I0x_mul - sum_diff * x_grad_sum / n;
678     dst_dUy = sum_I0y_mul - sum_diff * y_grad_sum / n;
679     return sum_diff_sq - sum_diff * sum_diff / n;
680 }
681 
682 /* Similar to processPatch, but compute only the sum of squared differences (SSD) between the patches */
computeSSD(uchar * I0_ptr,uchar * I1_ptr,int I0_stride,int I1_stride,float w00,float w01,float w10,float w11,int patch_sz)683 inline float computeSSD(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int I1_stride, float w00, float w01, float w10,
684                         float w11, int patch_sz)
685 {
686     float SSD = 0.0f;
687 #if CV_SIMD128
688     if (patch_sz == 8)
689     {
690         v_float32x4 SSD_vec = v_setall_f32(0);
691         HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
692         for (int row = 0; row < 8; row++)
693         {
694             HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
695             SSD_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
696             HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
697         }
698         SSD = v_reduce_sum(SSD_vec);
699     }
700     else
701 #endif
702     {
703         float diff;
704         for (int i = 0; i < patch_sz; i++)
705             for (int j = 0; j < patch_sz; j++)
706             {
707                 diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
708                        w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
709                        I0_ptr[i * I0_stride + j];
710                 SSD += diff * diff;
711             }
712     }
713     return SSD;
714 }
715 
716 /* Same as computeSSD, but with patch mean normalization */
computeSSDMeanNorm(uchar * I0_ptr,uchar * I1_ptr,int I0_stride,int I1_stride,float w00,float w01,float w10,float w11,int patch_sz)717 inline float computeSSDMeanNorm(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int I1_stride, float w00, float w01,
718                                 float w10, float w11, int patch_sz)
719 {
720     float sum_diff = 0.0f, sum_diff_sq = 0.0f;
721     float n = (float)patch_sz * patch_sz;
722 #if CV_SIMD128
723     if (patch_sz == 8)
724     {
725         v_float32x4 sum_diff_vec = v_setall_f32(0);
726         v_float32x4 sum_diff_sq_vec = v_setall_f32(0);
727         HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
728         for (int row = 0; row < 8; row++)
729         {
730             HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
731             sum_diff_sq_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
732             sum_diff_vec += I_diff_left + I_diff_right;
733             HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
734         }
735         sum_diff = v_reduce_sum(sum_diff_vec);
736         sum_diff_sq = v_reduce_sum(sum_diff_sq_vec);
737     }
738     else
739     {
740 #endif
741         float diff;
742         for (int i = 0; i < patch_sz; i++)
743             for (int j = 0; j < patch_sz; j++)
744             {
745                 diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
746                        w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
747                        I0_ptr[i * I0_stride + j];
748 
749                 sum_diff += diff;
750                 sum_diff_sq += diff * diff;
751             }
752 #if CV_SIMD128
753     }
754 #endif
755     return sum_diff_sq - sum_diff * sum_diff / n;
756 }
757 
758 #undef HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION
759 #undef HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION
760 #undef HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW
761 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
762 
operator ()(const Range & range) const763 void DISOpticalFlowImpl::PatchInverseSearch_ParBody::operator()(const Range &range) const
764 {
765     CV_INSTRUMENT_REGION();
766 
767     // force separate processing of stripes if we are using spatial propagation:
768     if (dis->use_spatial_propagation && range.end > range.start + 1)
769     {
770         for (int n = range.start; n < range.end; n++)
771             (*this)(Range(n, n + 1));
772         return;
773     }
774     int psz = dis->patch_size;
775     int psz2 = psz / 2;
776     int w_ext = dis->w + 2 * dis->border_size; //!< width of I1_ext
777     int bsz = dis->border_size;
778 
779     /* Input dense flow */
780     float *Ux_ptr = Ux->ptr<float>();
781     float *Uy_ptr = Uy->ptr<float>();
782 
783     /* Output sparse flow */
784     float *Sx_ptr = Sx->ptr<float>();
785     float *Sy_ptr = Sy->ptr<float>();
786 
787     uchar *I0_ptr = I0->ptr<uchar>();
788     uchar *I1_ptr = I1->ptr<uchar>();
789     short *I0x_ptr = I0x->ptr<short>();
790     short *I0y_ptr = I0y->ptr<short>();
791 
792     /* Precomputed structure tensor */
793     float *xx_ptr = dis->I0xx_buf.ptr<float>();
794     float *yy_ptr = dis->I0yy_buf.ptr<float>();
795     float *xy_ptr = dis->I0xy_buf.ptr<float>();
796     /* And extra buffers for mean-normalization: */
797     float *x_ptr = dis->I0x_buf.ptr<float>();
798     float *y_ptr = dis->I0y_buf.ptr<float>();
799 
800     bool use_temporal_candidates = false;
801     float *initial_Ux_ptr = NULL, *initial_Uy_ptr = NULL;
802     if (!dis->initial_Ux.empty())
803     {
804         initial_Ux_ptr = dis->initial_Ux[pyr_level].ptr<float>();
805         initial_Uy_ptr = dis->initial_Uy[pyr_level].ptr<float>();
806         use_temporal_candidates = true;
807     }
808 
809     int i, j, dir;
810     int start_is, end_is, start_js, end_js;
811     int start_i, start_j;
812     float i_lower_limit = bsz - psz + 1.0f;
813     float i_upper_limit = bsz + dis->h - 1.0f;
814     float j_lower_limit = bsz - psz + 1.0f;
815     float j_upper_limit = bsz + dis->w - 1.0f;
816     float dUx, dUy, i_I1, j_I1, w00, w01, w10, w11, dx, dy;
817 
818 #define INIT_BILINEAR_WEIGHTS(Ux, Uy) \
819     i_I1 = min(max(i + Uy + bsz, i_lower_limit), i_upper_limit); \
820     j_I1 = min(max(j + Ux + bsz, j_lower_limit), j_upper_limit); \
821     { \
822         float di = i_I1 - floor(i_I1); \
823         float dj = j_I1 - floor(j_I1); \
824         w11 = di       * dj; \
825         w10 = di       * (1 - dj); \
826         w01 = (1 - di) * dj; \
827         w00 = (1 - di) * (1 - dj); \
828     }
829 
830 #define COMPUTE_SSD(dst, Ux, Uy)                                                                                       \
831     INIT_BILINEAR_WEIGHTS(Ux, Uy);                                                                                     \
832     if (dis->use_mean_normalization)                                                                                   \
833         dst = computeSSDMeanNorm(I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1, dis->w, w_ext, w00,  \
834                                  w01, w10, w11, psz);                                                                  \
835     else                                                                                                               \
836         dst = computeSSD(I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1, dis->w, w_ext, w00, w01,     \
837                          w10, w11, psz);
838 
839     int num_inner_iter = (int)floor(dis->grad_descent_iter / (float)num_iter);
840     for (int iter = 0; iter < num_iter; iter++)
841     {
842         if (iter % 2 == 0)
843         {
844             dir = 1;
845             start_is = min(range.start * stripe_sz, hs);
846             end_is = min(range.end * stripe_sz, hs);
847             start_js = 0;
848             end_js = dis->ws;
849             start_i = start_is * dis->patch_stride;
850             start_j = 0;
851         }
852         else
853         {
854             dir = -1;
855             start_is = min(range.end * stripe_sz, hs) - 1;
856             end_is = min(range.start * stripe_sz, hs) - 1;
857             start_js = dis->ws - 1;
858             end_js = -1;
859             start_i = start_is * dis->patch_stride;
860             start_j = (dis->ws - 1) * dis->patch_stride;
861         }
862 
863         i = start_i;
864         for (int is = start_is; dir * is < dir * end_is; is += dir)
865         {
866             j = start_j;
867             for (int js = start_js; dir * js < dir * end_js; js += dir)
868             {
869                 if (iter == 0)
870                 {
871                     /* Using result form the previous pyramid level as the very first approximation: */
872                     Sx_ptr[is * dis->ws + js] = Ux_ptr[(i + psz2) * dis->w + j + psz2];
873                     Sy_ptr[is * dis->ws + js] = Uy_ptr[(i + psz2) * dis->w + j + psz2];
874                 }
875 
876                 float min_SSD = INF, cur_SSD;
877                 if (use_temporal_candidates || dis->use_spatial_propagation)
878                 {
879                     COMPUTE_SSD(min_SSD, Sx_ptr[is * dis->ws + js], Sy_ptr[is * dis->ws + js]);
880                 }
881 
882                 if (use_temporal_candidates)
883                 {
884                     /* Try temporal candidates (vectors from the initial flow field that was passed to the function) */
885                     COMPUTE_SSD(cur_SSD, initial_Ux_ptr[(i + psz2) * dis->w + j + psz2],
886                                 initial_Uy_ptr[(i + psz2) * dis->w + j + psz2]);
887                     if (cur_SSD < min_SSD)
888                     {
889                         min_SSD = cur_SSD;
890                         Sx_ptr[is * dis->ws + js] = initial_Ux_ptr[(i + psz2) * dis->w + j + psz2];
891                         Sy_ptr[is * dis->ws + js] = initial_Uy_ptr[(i + psz2) * dis->w + j + psz2];
892                     }
893                 }
894 
895                 if (dis->use_spatial_propagation)
896                 {
897                     /* Try spatial candidates: */
898                     if (dir * js > dir * start_js)
899                     {
900                         COMPUTE_SSD(cur_SSD, Sx_ptr[is * dis->ws + js - dir], Sy_ptr[is * dis->ws + js - dir]);
901                         if (cur_SSD < min_SSD)
902                         {
903                             min_SSD = cur_SSD;
904                             Sx_ptr[is * dis->ws + js] = Sx_ptr[is * dis->ws + js - dir];
905                             Sy_ptr[is * dis->ws + js] = Sy_ptr[is * dis->ws + js - dir];
906                         }
907                     }
908                     /* Flow vectors won't actually propagate across different stripes, which is the reason for keeping
909                      * the number of stripes constant. It works well enough in practice and doesn't introduce any
910                      * visible seams.
911                      */
912                     if (dir * is > dir * start_is)
913                     {
914                         COMPUTE_SSD(cur_SSD, Sx_ptr[(is - dir) * dis->ws + js], Sy_ptr[(is - dir) * dis->ws + js]);
915                         if (cur_SSD < min_SSD)
916                         {
917                             min_SSD = cur_SSD;
918                             Sx_ptr[is * dis->ws + js] = Sx_ptr[(is - dir) * dis->ws + js];
919                             Sy_ptr[is * dis->ws + js] = Sy_ptr[(is - dir) * dis->ws + js];
920                         }
921                     }
922                 }
923 
924                 /* Use the best candidate as a starting point for the gradient descent: */
925                 float cur_Ux = Sx_ptr[is * dis->ws + js];
926                 float cur_Uy = Sy_ptr[is * dis->ws + js];
927 
928                 /* Computing the inverse of the structure tensor: */
929                 float detH = xx_ptr[is * dis->ws + js] * yy_ptr[is * dis->ws + js] -
930                              xy_ptr[is * dis->ws + js] * xy_ptr[is * dis->ws + js];
931                 if (abs(detH) < EPS)
932                     detH = EPS;
933                 float invH11 = yy_ptr[is * dis->ws + js] / detH;
934                 float invH12 = -xy_ptr[is * dis->ws + js] / detH;
935                 float invH22 = xx_ptr[is * dis->ws + js] / detH;
936                 float prev_SSD = INF, SSD;
937                 float x_grad_sum = x_ptr[is * dis->ws + js];
938                 float y_grad_sum = y_ptr[is * dis->ws + js];
939 
940                 for (int t = 0; t < num_inner_iter; t++)
941                 {
942                     INIT_BILINEAR_WEIGHTS(cur_Ux, cur_Uy);
943                     if (dis->use_mean_normalization)
944                         SSD = processPatchMeanNorm(dUx, dUy,
945                                 I0_ptr  + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
946                                 I0x_ptr + i * dis->w + j, I0y_ptr + i * dis->w + j,
947                                 dis->w, w_ext, w00, w01, w10, w11, psz,
948                                 x_grad_sum, y_grad_sum);
949                     else
950                         SSD = processPatch(dUx, dUy,
951                                 I0_ptr  + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
952                                 I0x_ptr + i * dis->w + j, I0y_ptr + i * dis->w + j,
953                                 dis->w, w_ext, w00, w01, w10, w11, psz);
954 
955                     dx = invH11 * dUx + invH12 * dUy;
956                     dy = invH12 * dUx + invH22 * dUy;
957                     cur_Ux -= dx;
958                     cur_Uy -= dy;
959 
960                     /* Break when patch distance stops decreasing */
961                     if (SSD >= prev_SSD)
962                         break;
963                     prev_SSD = SSD;
964                 }
965 
966                 /* If gradient descent converged to a flow vector that is very far from the initial approximation
967                  * (more than patch size) then we don't use it. Noticeably improves the robustness.
968                  */
969                 if (norm(Vec2f(cur_Ux - Sx_ptr[is * dis->ws + js], cur_Uy - Sy_ptr[is * dis->ws + js])) <= psz)
970                 {
971                     Sx_ptr[is * dis->ws + js] = cur_Ux;
972                     Sy_ptr[is * dis->ws + js] = cur_Uy;
973                 }
974                 j += dir * dis->patch_stride;
975             }
976             i += dir * dis->patch_stride;
977         }
978     }
979 #undef INIT_BILINEAR_WEIGHTS
980 #undef COMPUTE_SSD
981 }
982 
Densification_ParBody(DISOpticalFlowImpl & _dis,int _nstripes,int _h,Mat & dst_Ux,Mat & dst_Uy,Mat & src_Sx,Mat & src_Sy,Mat & _I0,Mat & _I1)983 DISOpticalFlowImpl::Densification_ParBody::Densification_ParBody(DISOpticalFlowImpl &_dis, int _nstripes, int _h,
984                                                                  Mat &dst_Ux, Mat &dst_Uy, Mat &src_Sx, Mat &src_Sy,
985                                                                  Mat &_I0, Mat &_I1)
986     : dis(&_dis), nstripes(_nstripes), h(_h), Ux(&dst_Ux), Uy(&dst_Uy), Sx(&src_Sx), Sy(&src_Sy), I0(&_I0), I1(&_I1)
987 {
988     stripe_sz = (int)ceil(h / (double)nstripes);
989 }
990 
991 /* This function transforms a sparse optical flow field obtained by PatchInverseSearch (which computes flow values
992  * on a sparse grid defined by patch_stride) into a dense optical flow field by weighted averaging of values from the
993  * overlapping patches.
994  */
operator ()(const Range & range) const995 void DISOpticalFlowImpl::Densification_ParBody::operator()(const Range &range) const
996 {
997     CV_INSTRUMENT_REGION();
998 
999     int start_i = min(range.start * stripe_sz, h);
1000     int end_i = min(range.end * stripe_sz, h);
1001 
1002     /* Input sparse flow */
1003     float *Sx_ptr = Sx->ptr<float>();
1004     float *Sy_ptr = Sy->ptr<float>();
1005 
1006     /* Output dense flow */
1007     float *Ux_ptr = Ux->ptr<float>();
1008     float *Uy_ptr = Uy->ptr<float>();
1009 
1010     uchar *I0_ptr = I0->ptr<uchar>();
1011     uchar *I1_ptr = I1->ptr<uchar>();
1012 
1013     int psz = dis->patch_size;
1014     int pstr = dis->patch_stride;
1015     int i_l, i_u;
1016     int j_l, j_u;
1017     float i_m, j_m, diff;
1018 
1019     /* These values define the set of sparse grid locations that contain patches overlapping with the current dense flow
1020      * location */
1021     int start_is, end_is;
1022     int start_js, end_js;
1023 
1024 /* Some helper macros for updating this set of sparse grid locations */
1025 #define UPDATE_SPARSE_I_COORDINATES                                                                                    \
1026     if (i % pstr == 0 && i + psz <= h)                                                                                 \
1027         end_is++;                                                                                                      \
1028     if (i - psz >= 0 && (i - psz) % pstr == 0 && start_is < end_is)                                                    \
1029         start_is++;
1030 
1031 #define UPDATE_SPARSE_J_COORDINATES                                                                                    \
1032     if (j % pstr == 0 && j + psz <= dis->w)                                                                            \
1033         end_js++;                                                                                                      \
1034     if (j - psz >= 0 && (j - psz) % pstr == 0 && start_js < end_js)                                                    \
1035         start_js++;
1036 
1037     start_is = 0;
1038     end_is = -1;
1039     for (int i = 0; i < start_i; i++)
1040     {
1041         UPDATE_SPARSE_I_COORDINATES;
1042     }
1043     for (int i = start_i; i < end_i; i++)
1044     {
1045         UPDATE_SPARSE_I_COORDINATES;
1046         start_js = 0;
1047         end_js = -1;
1048         for (int j = 0; j < dis->w; j++)
1049         {
1050             UPDATE_SPARSE_J_COORDINATES;
1051             float coef, sum_coef = 0.0f;
1052             float sum_Ux = 0.0f;
1053             float sum_Uy = 0.0f;
1054 
1055             /* Iterate through all the patches that overlap the current location (i,j) */
1056             for (int is = start_is; is <= end_is; is++)
1057                 for (int js = start_js; js <= end_js; js++)
1058                 {
1059                     j_m = min(max(j + Sx_ptr[is * dis->ws + js], 0.0f), dis->w - 1.0f - EPS);
1060                     i_m = min(max(i + Sy_ptr[is * dis->ws + js], 0.0f), dis->h - 1.0f - EPS);
1061                     j_l = (int)j_m;
1062                     j_u = j_l + 1;
1063                     i_l = (int)i_m;
1064                     i_u = i_l + 1;
1065                     diff = (j_m - j_l) * (i_m - i_l) * I1_ptr[i_u * dis->w + j_u] +
1066                            (j_u - j_m) * (i_m - i_l) * I1_ptr[i_u * dis->w + j_l] +
1067                            (j_m - j_l) * (i_u - i_m) * I1_ptr[i_l * dis->w + j_u] +
1068                            (j_u - j_m) * (i_u - i_m) * I1_ptr[i_l * dis->w + j_l] - I0_ptr[i * dis->w + j];
1069                     coef = 1 / max(1.0f, abs(diff));
1070                     sum_Ux += coef * Sx_ptr[is * dis->ws + js];
1071                     sum_Uy += coef * Sy_ptr[is * dis->ws + js];
1072                     sum_coef += coef;
1073                 }
1074             CV_DbgAssert(sum_coef != 0);
1075             Ux_ptr[i * dis->w + j] = sum_Ux / sum_coef;
1076             Uy_ptr[i * dis->w + j] = sum_Uy / sum_coef;
1077         }
1078     }
1079 #undef UPDATE_SPARSE_I_COORDINATES
1080 #undef UPDATE_SPARSE_J_COORDINATES
1081 }
1082 
1083 #ifdef HAVE_OPENCL
ocl_PatchInverseSearch(UMat & src_U,UMat & I0,UMat & I1,UMat & I0x,UMat & I0y,int num_iter,int)1084 bool DISOpticalFlowImpl::ocl_PatchInverseSearch(UMat &src_U,
1085                                                 UMat &I0, UMat &I1, UMat &I0x, UMat &I0y, int num_iter, int /*pyr_level*/)
1086 {
1087     CV_INSTRUMENT_REGION();
1088     CV_INSTRUMENT_REGION_OPENCL();
1089 
1090     size_t globalSize[] = {(size_t)ws, (size_t)hs};
1091     size_t localSize[]  = {16, 16};
1092     int num_inner_iter = (int)floor(grad_descent_iter / (float)num_iter);
1093 
1094     String subgroups_build_options;
1095     if (ocl::Device::getDefault().isExtensionSupported("cl_khr_subgroups"))
1096         subgroups_build_options = " -DCV_USE_SUBGROUPS=1";
1097 
1098     String build_options = cv::format(
1099                 "-DDIS_BORDER_SIZE=%d -DDIS_PATCH_SIZE=%d -DDIS_PATCH_STRIDE=%d",
1100                 border_size, patch_size, patch_stride
1101             ) + subgroups_build_options;
1102 
1103 #if 0 // OpenCL debug
1104 u_Sx = Scalar::all(0);
1105 u_Sy = Scalar::all(0);
1106 #endif
1107 
1108     CV_Assert(num_iter == 2);
1109     for (int iter = 0; iter < num_iter; iter++)
1110     {
1111         if (iter == 0)
1112         {
1113             ocl::Kernel k1("dis_patch_inverse_search_fwd_1", ocl::video::dis_flow_oclsrc, build_options);
1114             size_t global_sz[] = {(size_t)hs * 8};
1115             size_t local_sz[]  = {8};
1116 
1117             k1.args(
1118                 ocl::KernelArg::PtrReadOnly(src_U),
1119                 ocl::KernelArg::PtrReadOnly(I0),
1120                 ocl::KernelArg::PtrReadOnly(I1),
1121                 (int)w, (int)h, (int)ws, (int)hs,
1122                 ocl::KernelArg::PtrWriteOnly(u_S)
1123             );
1124             if (!k1.run(1, global_sz, local_sz, false))
1125                 return false;
1126 
1127             ocl::Kernel k2("dis_patch_inverse_search_fwd_2", ocl::video::dis_flow_oclsrc, build_options);
1128 
1129             k2.args(
1130                 ocl::KernelArg::PtrReadOnly(src_U),
1131                 ocl::KernelArg::PtrReadOnly(I0),
1132                 ocl::KernelArg::PtrReadOnly(I1),
1133                 ocl::KernelArg::PtrReadOnly(I0x),
1134                 ocl::KernelArg::PtrReadOnly(I0y),
1135                 ocl::KernelArg::PtrReadOnly(u_I0xx_buf),
1136                 ocl::KernelArg::PtrReadOnly(u_I0yy_buf),
1137                 ocl::KernelArg::PtrReadOnly(u_I0xy_buf),
1138                 ocl::KernelArg::PtrReadOnly(u_I0x_buf),
1139                 ocl::KernelArg::PtrReadOnly(u_I0y_buf),
1140                 (int)w, (int)h, (int)ws, (int)hs,
1141                 (int)num_inner_iter,
1142                 ocl::KernelArg::PtrReadWrite(u_S)
1143             );
1144             if (!k2.run(2, globalSize, localSize, false))
1145                 return false;
1146         }
1147         else
1148         {
1149             ocl::Kernel k3("dis_patch_inverse_search_bwd_1", ocl::video::dis_flow_oclsrc, build_options);
1150             size_t global_sz[] = {(size_t)hs * 8};
1151             size_t local_sz[]  = {8};
1152 
1153             k3.args(
1154                 ocl::KernelArg::PtrReadOnly(I0),
1155                 ocl::KernelArg::PtrReadOnly(I1),
1156                 (int)w, (int)h, (int)ws, (int)hs,
1157                 ocl::KernelArg::PtrReadWrite(u_S)
1158             );
1159             if (!k3.run(1, global_sz, local_sz, false))
1160                 return false;
1161 
1162             ocl::Kernel k4("dis_patch_inverse_search_bwd_2", ocl::video::dis_flow_oclsrc, build_options);
1163 
1164             k4.args(
1165                 ocl::KernelArg::PtrReadOnly(I0),
1166                 ocl::KernelArg::PtrReadOnly(I1),
1167                 ocl::KernelArg::PtrReadOnly(I0x),
1168                 ocl::KernelArg::PtrReadOnly(I0y),
1169                 ocl::KernelArg::PtrReadOnly(u_I0xx_buf),
1170                 ocl::KernelArg::PtrReadOnly(u_I0yy_buf),
1171                 ocl::KernelArg::PtrReadOnly(u_I0xy_buf),
1172                 ocl::KernelArg::PtrReadOnly(u_I0x_buf),
1173                 ocl::KernelArg::PtrReadOnly(u_I0y_buf),
1174                 (int)w, (int)h,(int)ws, (int)hs,
1175                 (int)num_inner_iter,
1176                 ocl::KernelArg::PtrReadWrite(u_S)
1177             );
1178             if (!k4.run(2, globalSize, localSize, false))
1179                 return false;
1180         }
1181     }
1182     return true;
1183 }
1184 
ocl_Densification(UMat & dst_U,UMat & src_S,UMat & _I0,UMat & _I1)1185 bool DISOpticalFlowImpl::ocl_Densification(UMat &dst_U, UMat &src_S, UMat &_I0, UMat &_I1)
1186 {
1187     CV_INSTRUMENT_REGION();
1188     CV_INSTRUMENT_REGION_OPENCL();
1189 
1190     size_t globalSize[] = {(size_t)w, (size_t)h};
1191     size_t localSize[]  = {16, 16};
1192 
1193     String build_options = cv::format(
1194                 "-DDIS_PATCH_SIZE=%d -DDIS_PATCH_STRIDE=%d",
1195                 patch_size, patch_stride
1196             );
1197 
1198     ocl::Kernel kernel("dis_densification", ocl::video::dis_flow_oclsrc, build_options);
1199     kernel.args(
1200         ocl::KernelArg::PtrReadOnly(src_S),
1201         ocl::KernelArg::PtrReadOnly(_I0),
1202         ocl::KernelArg::PtrReadOnly(_I1),
1203         (int)w, (int)h, (int)ws,
1204         ocl::KernelArg::PtrWriteOnly(dst_U)
1205     );
1206     return kernel.run(2, globalSize, localSize, false);
1207 }
1208 
ocl_prepareBuffers(UMat & I0,UMat & I1,InputArray flow,bool use_flow)1209 void DISOpticalFlowImpl::ocl_prepareBuffers(UMat &I0, UMat &I1, InputArray flow, bool use_flow)
1210 {
1211     CV_INSTRUMENT_REGION();
1212     // not pure OpenCV code: CV_INSTRUMENT_REGION_OPENCL();
1213 
1214     u_I0s.resize(coarsest_scale + 1);
1215     u_I1s.resize(coarsest_scale + 1);
1216     u_I1s_ext.resize(coarsest_scale + 1);
1217     u_I0xs.resize(coarsest_scale + 1);
1218     u_I0ys.resize(coarsest_scale + 1);
1219     u_U.resize(coarsest_scale + 1);
1220 
1221     if (use_flow)
1222     {
1223         u_initial_U.resize(coarsest_scale + 1);
1224     }
1225 
1226     int fraction = 1;
1227     int cur_rows = 0, cur_cols = 0;
1228 
1229     for (int i = 0; i <= coarsest_scale; i++)
1230     {
1231         CV_TRACE_REGION("coarsest_scale_iteration");
1232         /* Avoid initializing the pyramid levels above the finest scale, as they won't be used anyway */
1233         if (i == finest_scale)
1234         {
1235             cur_rows = I0.rows / fraction;
1236             cur_cols = I0.cols / fraction;
1237             u_I0s[i].create(cur_rows, cur_cols, CV_8UC1);
1238             resize(I0, u_I0s[i], u_I0s[i].size(), 0.0, 0.0, INTER_AREA);
1239             u_I1s[i].create(cur_rows, cur_cols, CV_8UC1);
1240             resize(I1, u_I1s[i], u_I1s[i].size(), 0.0, 0.0, INTER_AREA);
1241 
1242             /* These buffers are reused in each scale so we initialize them once on the finest scale: */
1243             u_S.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC2);
1244             u_I0xx_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
1245             u_I0yy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
1246             u_I0xy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
1247             u_I0x_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
1248             u_I0y_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
1249 
1250             u_I0xx_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
1251             u_I0yy_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
1252             u_I0xy_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
1253             u_I0x_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
1254             u_I0y_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
1255         }
1256         else if (i > finest_scale)
1257         {
1258             cur_rows = u_I0s[i - 1].rows / 2;
1259             cur_cols = u_I0s[i - 1].cols / 2;
1260             u_I0s[i].create(cur_rows, cur_cols, CV_8UC1);
1261             resize(u_I0s[i - 1], u_I0s[i], u_I0s[i].size(), 0.0, 0.0, INTER_AREA);
1262             u_I1s[i].create(cur_rows, cur_cols, CV_8UC1);
1263             resize(u_I1s[i - 1], u_I1s[i], u_I1s[i].size(), 0.0, 0.0, INTER_AREA);
1264         }
1265 
1266         if (i >= finest_scale)
1267         {
1268             u_I1s_ext[i].create(cur_rows + 2 * border_size, cur_cols + 2 * border_size, CV_8UC1);
1269             copyMakeBorder(u_I1s[i], u_I1s_ext[i], border_size, border_size, border_size, border_size, BORDER_REPLICATE);
1270             u_I0xs[i].create(cur_rows, cur_cols, CV_16SC1);
1271             u_I0ys[i].create(cur_rows, cur_cols, CV_16SC1);
1272             spatialGradient(u_I0s[i], u_I0xs[i], u_I0ys[i]);
1273             u_U[i].create(cur_rows, cur_cols, CV_32FC2);
1274             variational_refinement_processors[i]->setAlpha(variational_refinement_alpha);
1275             variational_refinement_processors[i]->setDelta(variational_refinement_delta);
1276             variational_refinement_processors[i]->setGamma(variational_refinement_gamma);
1277             variational_refinement_processors[i]->setSorIterations(5);
1278             variational_refinement_processors[i]->setFixedPointIterations(variational_refinement_iter);
1279 
1280             if (use_flow)
1281             {
1282                 UMat resized_flow;
1283                 resize(flow, resized_flow, Size(cur_cols, cur_rows));
1284                 float scale = 1.0f / fraction;
1285                 resized_flow.convertTo(u_initial_U[i], CV_32FC2, scale, 0.0f);
1286             }
1287         }
1288 
1289         fraction *= 2;
1290     }
1291 }
1292 
ocl_precomputeStructureTensor(UMat & dst_I0xx,UMat & dst_I0yy,UMat & dst_I0xy,UMat & dst_I0x,UMat & dst_I0y,UMat & I0x,UMat & I0y)1293 bool DISOpticalFlowImpl::ocl_precomputeStructureTensor(UMat &dst_I0xx, UMat &dst_I0yy, UMat &dst_I0xy,
1294                                                        UMat &dst_I0x, UMat &dst_I0y, UMat &I0x, UMat &I0y)
1295 {
1296     CV_INSTRUMENT_REGION();
1297     CV_INSTRUMENT_REGION_OPENCL();
1298 
1299     size_t globalSizeX[] = {(size_t)h};
1300     size_t localSizeX[]  = {16};
1301 
1302 #if 0 // OpenCL debug
1303     u_I0xx_buf_aux = Scalar::all(0);
1304     u_I0yy_buf_aux = Scalar::all(0);
1305     u_I0xy_buf_aux = Scalar::all(0);
1306     u_I0x_buf_aux = Scalar::all(0);
1307     u_I0y_buf_aux = Scalar::all(0);
1308     dst_I0xx = Scalar::all(0);
1309     dst_I0yy = Scalar::all(0);
1310     dst_I0xy = Scalar::all(0);
1311     dst_I0x = Scalar::all(0);
1312     dst_I0y = Scalar::all(0);
1313 #endif
1314 
1315     String build_options = cv::format(
1316                 "-DDIS_PATCH_SIZE=%d -DDIS_PATCH_STRIDE=%d",
1317                 patch_size, patch_stride
1318             );
1319 
1320     ocl::Kernel kernelX("dis_precomputeStructureTensor_hor", ocl::video::dis_flow_oclsrc, build_options);
1321     kernelX.args(
1322         ocl::KernelArg::PtrReadOnly(I0x),
1323         ocl::KernelArg::PtrReadOnly(I0y),
1324         (int)w, (int)h, (int)ws,
1325         ocl::KernelArg::PtrWriteOnly(u_I0xx_buf_aux),
1326         ocl::KernelArg::PtrWriteOnly(u_I0yy_buf_aux),
1327         ocl::KernelArg::PtrWriteOnly(u_I0xy_buf_aux),
1328         ocl::KernelArg::PtrWriteOnly(u_I0x_buf_aux),
1329         ocl::KernelArg::PtrWriteOnly(u_I0y_buf_aux)
1330     );
1331     if (!kernelX.run(1, globalSizeX, localSizeX, false))
1332         return false;
1333 
1334     size_t globalSizeY[] = {(size_t)ws};
1335     size_t localSizeY[]  = {16};
1336 
1337     ocl::Kernel kernelY("dis_precomputeStructureTensor_ver", ocl::video::dis_flow_oclsrc, build_options);
1338     kernelY.args(
1339         ocl::KernelArg::PtrReadOnly(u_I0xx_buf_aux),
1340         ocl::KernelArg::PtrReadOnly(u_I0yy_buf_aux),
1341         ocl::KernelArg::PtrReadOnly(u_I0xy_buf_aux),
1342         ocl::KernelArg::PtrReadOnly(u_I0x_buf_aux),
1343         ocl::KernelArg::PtrReadOnly(u_I0y_buf_aux),
1344         (int)w, (int)h, (int)ws,
1345         ocl::KernelArg::PtrWriteOnly(dst_I0xx),
1346         ocl::KernelArg::PtrWriteOnly(dst_I0yy),
1347         ocl::KernelArg::PtrWriteOnly(dst_I0xy),
1348         ocl::KernelArg::PtrWriteOnly(dst_I0x),
1349         ocl::KernelArg::PtrWriteOnly(dst_I0y)
1350     );
1351     return kernelY.run(1, globalSizeY, localSizeY, false);
1352 }
1353 
ocl_calc(InputArray I0,InputArray I1,InputOutputArray flow)1354 bool DISOpticalFlowImpl::ocl_calc(InputArray I0, InputArray I1, InputOutputArray flow)
1355 {
1356     CV_INSTRUMENT_REGION();
1357     // not pure OpenCV code: CV_INSTRUMENT_REGION_OPENCL();
1358 
1359     UMat I0Mat = I0.getUMat();
1360     UMat I1Mat = I1.getUMat();
1361     bool use_input_flow = false;
1362     if (flow.sameSize(I0) && flow.depth() == CV_32F && flow.channels() == 2)
1363         use_input_flow = true;
1364     coarsest_scale = min((int)(log(max(I0Mat.cols, I0Mat.rows) / (4.0 * patch_size)) / log(2.0) + 0.5), /* Original code search for maximal movement of width/4 */
1365                          (int)(log(min(I0Mat.cols, I0Mat.rows) / patch_size) / log(2.0)));              /* Deepest pyramid level greater or equal than patch*/
1366 
1367     if (coarsest_scale<0)
1368         CV_Error(cv::Error::StsBadSize, "The input image must have either width or height >= 12");
1369 
1370     if (coarsest_scale<finest_scale)
1371     {
1372         // choose the finest level based on coarsest level.
1373         // Refs: https://github.com/tikroeger/OF_DIS/blob/2c9f2a674f3128d3a41c10e41cc9f3a35bb1b523/run_dense.cpp#L239
1374         int original_img_width = I0.size().width;
1375         autoSelectPatchSizeAndScales(original_img_width);
1376     }
1377 
1378     ocl_prepareBuffers(I0Mat, I1Mat, flow, use_input_flow);
1379     u_U[coarsest_scale].setTo(0.0f);
1380 
1381     for (int i = coarsest_scale; i >= finest_scale; i--)
1382     {
1383         CV_TRACE_REGION("coarsest_scale_iteration");
1384         w = u_I0s[i].cols;
1385         h = u_I0s[i].rows;
1386         ws = 1 + (w - patch_size) / patch_stride;
1387         hs = 1 + (h - patch_size) / patch_stride;
1388 
1389         if (!ocl_precomputeStructureTensor(u_I0xx_buf, u_I0yy_buf, u_I0xy_buf,
1390                                            u_I0x_buf, u_I0y_buf, u_I0xs[i], u_I0ys[i]))
1391             return false;
1392 
1393         if (!ocl_PatchInverseSearch(u_U[i], u_I0s[i], u_I1s_ext[i], u_I0xs[i], u_I0ys[i], 2, i))
1394             return false;
1395 
1396         if (!ocl_Densification(u_U[i], u_S, u_I0s[i], u_I1s[i]))
1397             return false;
1398 
1399         if (variational_refinement_iter > 0)
1400         {
1401             std::vector<Mat> U_channels;
1402             split(u_U[i], U_channels); CV_Assert(U_channels.size() == 2);
1403             variational_refinement_processors[i]->calcUV(u_I0s[i], u_I1s[i],
1404                     U_channels[0], U_channels[1]);
1405             merge(U_channels, u_U[i]);
1406         }
1407 
1408         if (i > finest_scale)
1409         {
1410             UMat resized;
1411             resize(u_U[i], resized, u_U[i - 1].size());
1412             multiply(resized, 2, u_U[i - 1]);
1413         }
1414     }
1415 
1416     UMat resized_flow;
1417     resize(u_U[finest_scale], resized_flow, I1Mat.size());
1418     multiply(resized_flow, 1 << finest_scale, flow);
1419 
1420     return true;
1421 }
1422 #endif
1423 
calc(InputArray I0,InputArray I1,InputOutputArray flow)1424 void DISOpticalFlowImpl::calc(InputArray I0, InputArray I1, InputOutputArray flow)
1425 {
1426     CV_INSTRUMENT_REGION();
1427 
1428     CV_Assert(!I0.empty() && I0.depth() == CV_8U && I0.channels() == 1);
1429     CV_Assert(!I1.empty() && I1.depth() == CV_8U && I1.channels() == 1);
1430     CV_Assert(I0.sameSize(I1));
1431     CV_Assert(I0.isContinuous());
1432     CV_Assert(I1.isContinuous());
1433 
1434     CV_OCL_RUN(flow.isUMat() &&
1435                (patch_size == 8) && (use_spatial_propagation == true),
1436                ocl_calc(I0, I1, flow));
1437 
1438     Mat I0Mat = I0.getMat();
1439     Mat I1Mat = I1.getMat();
1440     bool use_input_flow = false;
1441     if (flow.sameSize(I0) && flow.depth() == CV_32F && flow.channels() == 2)
1442         use_input_flow = true;
1443     else
1444         flow.create(I1Mat.size(), CV_32FC2);
1445     Mat flowMat = flow.getMat();
1446     coarsest_scale = min((int)(log(max(I0Mat.cols, I0Mat.rows) / (4.0 * patch_size)) / log(2.0) + 0.5), /* Original code search for maximal movement of width/4 */
1447                          (int)(log(min(I0Mat.cols, I0Mat.rows) / patch_size) / log(2.0)));              /* Deepest pyramid level greater or equal than patch*/
1448 
1449     if (coarsest_scale<0)
1450         CV_Error(cv::Error::StsBadSize, "The input image must have either width or height >= 12");
1451 
1452     if (coarsest_scale<finest_scale)
1453     {
1454         // choose the finest level based on coarsest level.
1455         // Refs: https://github.com/tikroeger/OF_DIS/blob/2c9f2a674f3128d3a41c10e41cc9f3a35bb1b523/run_dense.cpp#L239
1456         int original_img_width = I0.size().width;
1457         autoSelectPatchSizeAndScales(original_img_width);
1458     }
1459 
1460     int num_stripes = getNumThreads();
1461 
1462     prepareBuffers(I0Mat, I1Mat, flowMat, use_input_flow);
1463     Ux[coarsest_scale].setTo(0.0f);
1464     Uy[coarsest_scale].setTo(0.0f);
1465 
1466     for (int i = coarsest_scale; i >= finest_scale; i--)
1467     {
1468         CV_TRACE_REGION("coarsest_scale_iteration");
1469         w = I0s[i].cols;
1470         h = I0s[i].rows;
1471         ws = 1 + (w - patch_size) / patch_stride;
1472         hs = 1 + (h - patch_size) / patch_stride;
1473 
1474         precomputeStructureTensor(I0xx_buf, I0yy_buf, I0xy_buf, I0x_buf, I0y_buf, I0xs[i], I0ys[i]);
1475         if (use_spatial_propagation)
1476         {
1477             /* Use a fixed number of stripes regardless the number of threads to make inverse search
1478              * with spatial propagation reproducible
1479              */
1480             parallel_for_(Range(0, 8), PatchInverseSearch_ParBody(*this, 8, hs, Sx, Sy, Ux[i], Uy[i], I0s[i],
1481                                                                   I1s_ext[i], I0xs[i], I0ys[i], 2, i));
1482         }
1483         else
1484         {
1485             parallel_for_(Range(0, num_stripes),
1486                           PatchInverseSearch_ParBody(*this, num_stripes, hs, Sx, Sy, Ux[i], Uy[i], I0s[i], I1s_ext[i],
1487                                                      I0xs[i], I0ys[i], 1, i));
1488         }
1489 
1490         parallel_for_(Range(0, num_stripes),
1491                       Densification_ParBody(*this, num_stripes, I0s[i].rows, Ux[i], Uy[i], Sx, Sy, I0s[i], I1s[i]));
1492         if (variational_refinement_iter > 0)
1493             variational_refinement_processors[i]->calcUV(I0s[i], I1s[i], Ux[i], Uy[i]);
1494 
1495         if (i > finest_scale)
1496         {
1497             resize(Ux[i], Ux[i - 1], Ux[i - 1].size());
1498             resize(Uy[i], Uy[i - 1], Uy[i - 1].size());
1499             Ux[i - 1] *= 2;
1500             Uy[i - 1] *= 2;
1501         }
1502     }
1503     Mat uxy[] = {Ux[finest_scale], Uy[finest_scale]};
1504     merge(uxy, 2, U);
1505     resize(U, flowMat, flowMat.size());
1506     flowMat *= 1 << finest_scale;
1507 }
1508 
collectGarbage()1509 void DISOpticalFlowImpl::collectGarbage()
1510 {
1511     CV_INSTRUMENT_REGION();
1512 
1513     I0s.clear();
1514     I1s.clear();
1515     I1s_ext.clear();
1516     I0xs.clear();
1517     I0ys.clear();
1518     Ux.clear();
1519     Uy.clear();
1520     U.release();
1521     Sx.release();
1522     Sy.release();
1523     I0xx_buf.release();
1524     I0yy_buf.release();
1525     I0xy_buf.release();
1526     I0xx_buf_aux.release();
1527     I0yy_buf_aux.release();
1528     I0xy_buf_aux.release();
1529 
1530 #ifdef HAVE_OPENCL
1531     u_I0s.clear();
1532     u_I1s.clear();
1533     u_I1s_ext.clear();
1534     u_I0xs.clear();
1535     u_I0ys.clear();
1536     u_U.clear();
1537     u_S.release();
1538     u_I0xx_buf.release();
1539     u_I0yy_buf.release();
1540     u_I0xy_buf.release();
1541     u_I0xx_buf_aux.release();
1542     u_I0yy_buf_aux.release();
1543     u_I0xy_buf_aux.release();
1544 #endif
1545 
1546     for (int i = finest_scale; i <= coarsest_scale; i++)
1547         variational_refinement_processors[i]->collectGarbage();
1548     variational_refinement_processors.clear();
1549 }
1550 
create(int preset)1551 Ptr<DISOpticalFlow> DISOpticalFlow::create(int preset)
1552 {
1553     CV_INSTRUMENT_REGION();
1554 
1555     Ptr<DISOpticalFlow> dis = makePtr<DISOpticalFlowImpl>();
1556     dis->setPatchSize(8);
1557     if (preset == DISOpticalFlow::PRESET_ULTRAFAST)
1558     {
1559         dis->setFinestScale(2);
1560         dis->setPatchStride(4);
1561         dis->setGradientDescentIterations(12);
1562         dis->setVariationalRefinementIterations(0);
1563     }
1564     else if (preset == DISOpticalFlow::PRESET_FAST)
1565     {
1566         dis->setFinestScale(2);
1567         dis->setPatchStride(4);
1568         dis->setGradientDescentIterations(16);
1569         dis->setVariationalRefinementIterations(5);
1570     }
1571     else if (preset == DISOpticalFlow::PRESET_MEDIUM)
1572     {
1573         dis->setFinestScale(1);
1574         dis->setPatchStride(3);
1575         dis->setGradientDescentIterations(25);
1576         dis->setVariationalRefinementIterations(5);
1577     }
1578 
1579     return dis;
1580 }
1581 
1582 
1583 } // namespace
1584