1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html.
4 //
5 // Copyright (c) 2006-2010, Rob Hess <hess@eecs.oregonstate.edu>
6 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
7 // Copyright (C) 2020, Intel Corporation, all rights reserved.
8 
9 /**********************************************************************************************\
10  Implementation of SIFT is based on the code from http://blogs.oregonstate.edu/hess/code/sift/
11  Below is the original copyright.
12  Patent US6711293 expired in March 2020.
13 
14 //    Copyright (c) 2006-2010, Rob Hess <hess@eecs.oregonstate.edu>
15 //    All rights reserved.
16 
17 //    The following patent has been issued for methods embodied in this
18 //    software: "Method and apparatus for identifying scale invariant features
19 //    in an image and use of same for locating an object in an image," David
20 //    G. Lowe, US Patent 6,711,293 (March 23, 2004). Provisional application
21 //    filed March 8, 1999. Asignee: The University of British Columbia. For
22 //    further details, contact David Lowe (lowe@cs.ubc.ca) or the
23 //    University-Industry Liaison Office of the University of British
24 //    Columbia.
25 
26 //    Note that restrictions imposed by this patent (and possibly others)
27 //    exist independently of and may be in conflict with the freedoms granted
28 //    in this license, which refers to copyright of the program, not patents
29 //    for any methods that it implements.  Both copyright and patent law must
30 //    be obeyed to legally use and redistribute this program and it is not the
31 //    purpose of this license to induce you to infringe any patents or other
32 //    property right claims or to contest validity of any such claims.  If you
33 //    redistribute or use the program, then this license merely protects you
34 //    from committing copyright infringement.  It does not protect you from
35 //    committing patent infringement.  So, before you do anything with this
36 //    program, make sure that you have permission to do so not merely in terms
37 //    of copyright, but also in terms of patent law.
38 
39 //    Please note that this license is not to be understood as a guarantee
40 //    either.  If you use the program according to this license, but in
41 //    conflict with patent law, it does not mean that the licensor will refund
42 //    you for any losses that you incur if you are sued for your patent
43 //    infringement.
44 
45 //    Redistribution and use in source and binary forms, with or without
46 //    modification, are permitted provided that the following conditions are
47 //    met:
48 //        * Redistributions of source code must retain the above copyright and
49 //          patent notices, this list of conditions and the following
50 //          disclaimer.
51 //        * Redistributions in binary form must reproduce the above copyright
52 //          notice, this list of conditions and the following disclaimer in
53 //          the documentation and/or other materials provided with the
54 //          distribution.
55 //        * Neither the name of Oregon State University nor the names of its
56 //          contributors may be used to endorse or promote products derived
57 //          from this software without specific prior written permission.
58 
59 //    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
60 //    IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
61 //    TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
62 //    PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
63 //    HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
64 //    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
65 //    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
66 //    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
67 //    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
68 //    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
69 //    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
70 \**********************************************************************************************/
71 
72 #include "precomp.hpp"
73 
74 #include <opencv2/core/hal/hal.hpp>
75 #include "opencv2/core/hal/intrin.hpp"
76 #include <opencv2/core/utils/buffer_area.private.hpp>
77 
78 namespace cv {
79 
80 #if !defined(CV_CPU_DISPATCH_MODE) || !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY)
81 /******************************* Defs and macros *****************************/
82 
83 // default width of descriptor histogram array
84 static const int SIFT_DESCR_WIDTH = 4;
85 
86 // default number of bins per histogram in descriptor array
87 static const int SIFT_DESCR_HIST_BINS = 8;
88 
89 // assumed gaussian blur for input image
90 static const float SIFT_INIT_SIGMA = 0.5f;
91 
92 // width of border in which to ignore keypoints
93 static const int SIFT_IMG_BORDER = 5;
94 
95 // maximum steps of keypoint interpolation before failure
96 static const int SIFT_MAX_INTERP_STEPS = 5;
97 
98 // default number of bins in histogram for orientation assignment
99 static const int SIFT_ORI_HIST_BINS = 36;
100 
101 // determines gaussian sigma for orientation assignment
102 static const float SIFT_ORI_SIG_FCTR = 1.5f;
103 
104 // determines the radius of the region used in orientation assignment
105 static const float SIFT_ORI_RADIUS = 4.5f; // 3 * SIFT_ORI_SIG_FCTR;
106 
107 // orientation magnitude relative to max that results in new feature
108 static const float SIFT_ORI_PEAK_RATIO = 0.8f;
109 
110 // determines the size of a single descriptor orientation histogram
111 static const float SIFT_DESCR_SCL_FCTR = 3.f;
112 
113 // threshold on magnitude of elements of descriptor vector
114 static const float SIFT_DESCR_MAG_THR = 0.2f;
115 
116 // factor used to convert floating-point descriptor to unsigned char
117 static const float SIFT_INT_DESCR_FCTR = 512.f;
118 
119 #define DoG_TYPE_SHORT 0
120 #if DoG_TYPE_SHORT
121 // intermediate type used for DoG pyramids
122 typedef short sift_wt;
123 static const int SIFT_FIXPT_SCALE = 48;
124 #else
125 // intermediate type used for DoG pyramids
126 typedef float sift_wt;
127 static const int SIFT_FIXPT_SCALE = 1;
128 #endif
129 
130 #endif  // definitions and macros
131 
132 
133 CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
134 
135 void findScaleSpaceExtrema(
136     int octave,
137     int layer,
138     int threshold,
139     int idx,
140     int step,
141     int cols,
142     int nOctaveLayers,
143     double contrastThreshold,
144     double edgeThreshold,
145     double sigma,
146     const std::vector<Mat>& gauss_pyr,
147     const std::vector<Mat>& dog_pyr,
148     std::vector<KeyPoint>& kpts,
149     const cv::Range& range);
150 
151 void calcSIFTDescriptor(
152         const Mat& img, Point2f ptf, float ori, float scl,
153         int d, int n, Mat& dst, int row
154 );
155 
156 
157 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
158 
159 // Computes a gradient orientation histogram at a specified pixel
160 static
calcOrientationHist(const Mat & img,Point pt,int radius,float sigma,float * hist,int n)161 float calcOrientationHist(
162         const Mat& img, Point pt, int radius,
163         float sigma, float* hist, int n
164 )
165 {
166     CV_TRACE_FUNCTION();
167 
168     int i, j, k, len = (radius*2+1)*(radius*2+1);
169 
170     float expf_scale = -1.f/(2.f * sigma * sigma);
171 
172     cv::utils::BufferArea area;
173     float *X = 0, *Y = 0, *Mag, *Ori = 0, *W = 0, *temphist = 0;
174     area.allocate(X, len, CV_SIMD_WIDTH);
175     area.allocate(Y, len, CV_SIMD_WIDTH);
176     area.allocate(Ori, len, CV_SIMD_WIDTH);
177     area.allocate(W, len, CV_SIMD_WIDTH);
178     area.allocate(temphist, n+4, CV_SIMD_WIDTH);
179     area.commit();
180     temphist += 2;
181     Mag = X;
182 
183     for( i = 0; i < n; i++ )
184         temphist[i] = 0.f;
185 
186     for( i = -radius, k = 0; i <= radius; i++ )
187     {
188         int y = pt.y + i;
189         if( y <= 0 || y >= img.rows - 1 )
190             continue;
191         for( j = -radius; j <= radius; j++ )
192         {
193             int x = pt.x + j;
194             if( x <= 0 || x >= img.cols - 1 )
195                 continue;
196 
197             float dx = (float)(img.at<sift_wt>(y, x+1) - img.at<sift_wt>(y, x-1));
198             float dy = (float)(img.at<sift_wt>(y-1, x) - img.at<sift_wt>(y+1, x));
199 
200             X[k] = dx; Y[k] = dy; W[k] = (i*i + j*j)*expf_scale;
201             k++;
202         }
203     }
204 
205     len = k;
206 
207     // compute gradient values, orientations and the weights over the pixel neighborhood
208     cv::hal::exp32f(W, W, len);
209     cv::hal::fastAtan2(Y, X, Ori, len, true);
210     cv::hal::magnitude32f(X, Y, Mag, len);
211 
212     k = 0;
213 #if CV_SIMD
214     const int vecsize = v_float32::nlanes;
215     v_float32 nd360 = vx_setall_f32(n/360.f);
216     v_int32 __n = vx_setall_s32(n);
217     int CV_DECL_ALIGNED(CV_SIMD_WIDTH) bin_buf[vecsize];
218     float CV_DECL_ALIGNED(CV_SIMD_WIDTH) w_mul_mag_buf[vecsize];
219 
220     for( ; k <= len - vecsize; k += vecsize )
221     {
222         v_float32 w = vx_load_aligned( W + k );
223         v_float32 mag = vx_load_aligned( Mag + k );
224         v_float32 ori = vx_load_aligned( Ori + k );
225         v_int32 bin = v_round( nd360 * ori );
226 
227         bin = v_select(bin >= __n, bin - __n, bin);
228         bin = v_select(bin < vx_setzero_s32(), bin + __n, bin);
229 
230         w = w * mag;
231         v_store_aligned(bin_buf, bin);
232         v_store_aligned(w_mul_mag_buf, w);
233         for(int vi = 0; vi < vecsize; vi++)
234         {
235             temphist[bin_buf[vi]] += w_mul_mag_buf[vi];
236         }
237     }
238 #endif
239     for( ; k < len; k++ )
240     {
241         int bin = cvRound((n/360.f)*Ori[k]);
242         if( bin >= n )
243             bin -= n;
244         if( bin < 0 )
245             bin += n;
246         temphist[bin] += W[k]*Mag[k];
247     }
248 
249     // smooth the histogram
250     temphist[-1] = temphist[n-1];
251     temphist[-2] = temphist[n-2];
252     temphist[n] = temphist[0];
253     temphist[n+1] = temphist[1];
254 
255     i = 0;
256 #if CV_SIMD
257     v_float32 d_1_16 = vx_setall_f32(1.f/16.f);
258     v_float32 d_4_16 = vx_setall_f32(4.f/16.f);
259     v_float32 d_6_16 = vx_setall_f32(6.f/16.f);
260     for( ; i <= n - v_float32::nlanes; i += v_float32::nlanes )
261     {
262         v_float32 tn2 = vx_load_aligned(temphist + i-2);
263         v_float32 tn1 = vx_load(temphist + i-1);
264         v_float32 t0 = vx_load(temphist + i);
265         v_float32 t1 = vx_load(temphist + i+1);
266         v_float32 t2 = vx_load(temphist + i+2);
267         v_float32 _hist = v_fma(tn2 + t2, d_1_16,
268             v_fma(tn1 + t1, d_4_16, t0 * d_6_16));
269         v_store(hist + i, _hist);
270     }
271 #endif
272     for( ; i < n; i++ )
273     {
274         hist[i] = (temphist[i-2] + temphist[i+2])*(1.f/16.f) +
275             (temphist[i-1] + temphist[i+1])*(4.f/16.f) +
276             temphist[i]*(6.f/16.f);
277     }
278 
279     float maxval = hist[0];
280     for( i = 1; i < n; i++ )
281         maxval = std::max(maxval, hist[i]);
282 
283     return maxval;
284 }
285 
286 
287 //
288 // Interpolates a scale-space extremum's location and scale to subpixel
289 // accuracy to form an image feature. Rejects features with low contrast.
290 // Based on Section 4 of Lowe's paper.
291 static
adjustLocalExtrema(const std::vector<Mat> & dog_pyr,KeyPoint & kpt,int octv,int & layer,int & r,int & c,int nOctaveLayers,float contrastThreshold,float edgeThreshold,float sigma)292 bool adjustLocalExtrema(
293         const std::vector<Mat>& dog_pyr, KeyPoint& kpt, int octv,
294         int& layer, int& r, int& c, int nOctaveLayers,
295         float contrastThreshold, float edgeThreshold, float sigma
296 )
297 {
298     CV_TRACE_FUNCTION();
299 
300     const float img_scale = 1.f/(255*SIFT_FIXPT_SCALE);
301     const float deriv_scale = img_scale*0.5f;
302     const float second_deriv_scale = img_scale;
303     const float cross_deriv_scale = img_scale*0.25f;
304 
305     float xi=0, xr=0, xc=0, contr=0;
306     int i = 0;
307 
308     for( ; i < SIFT_MAX_INTERP_STEPS; i++ )
309     {
310         int idx = octv*(nOctaveLayers+2) + layer;
311         const Mat& img = dog_pyr[idx];
312         const Mat& prev = dog_pyr[idx-1];
313         const Mat& next = dog_pyr[idx+1];
314 
315         Vec3f dD((img.at<sift_wt>(r, c+1) - img.at<sift_wt>(r, c-1))*deriv_scale,
316                  (img.at<sift_wt>(r+1, c) - img.at<sift_wt>(r-1, c))*deriv_scale,
317                  (next.at<sift_wt>(r, c) - prev.at<sift_wt>(r, c))*deriv_scale);
318 
319         float v2 = (float)img.at<sift_wt>(r, c)*2;
320         float dxx = (img.at<sift_wt>(r, c+1) + img.at<sift_wt>(r, c-1) - v2)*second_deriv_scale;
321         float dyy = (img.at<sift_wt>(r+1, c) + img.at<sift_wt>(r-1, c) - v2)*second_deriv_scale;
322         float dss = (next.at<sift_wt>(r, c) + prev.at<sift_wt>(r, c) - v2)*second_deriv_scale;
323         float dxy = (img.at<sift_wt>(r+1, c+1) - img.at<sift_wt>(r+1, c-1) -
324                      img.at<sift_wt>(r-1, c+1) + img.at<sift_wt>(r-1, c-1))*cross_deriv_scale;
325         float dxs = (next.at<sift_wt>(r, c+1) - next.at<sift_wt>(r, c-1) -
326                      prev.at<sift_wt>(r, c+1) + prev.at<sift_wt>(r, c-1))*cross_deriv_scale;
327         float dys = (next.at<sift_wt>(r+1, c) - next.at<sift_wt>(r-1, c) -
328                      prev.at<sift_wt>(r+1, c) + prev.at<sift_wt>(r-1, c))*cross_deriv_scale;
329 
330         Matx33f H(dxx, dxy, dxs,
331                   dxy, dyy, dys,
332                   dxs, dys, dss);
333 
334         Vec3f X = H.solve(dD, DECOMP_LU);
335 
336         xi = -X[2];
337         xr = -X[1];
338         xc = -X[0];
339 
340         if( std::abs(xi) < 0.5f && std::abs(xr) < 0.5f && std::abs(xc) < 0.5f )
341             break;
342 
343         if( std::abs(xi) > (float)(INT_MAX/3) ||
344             std::abs(xr) > (float)(INT_MAX/3) ||
345             std::abs(xc) > (float)(INT_MAX/3) )
346             return false;
347 
348         c += cvRound(xc);
349         r += cvRound(xr);
350         layer += cvRound(xi);
351 
352         if( layer < 1 || layer > nOctaveLayers ||
353             c < SIFT_IMG_BORDER || c >= img.cols - SIFT_IMG_BORDER  ||
354             r < SIFT_IMG_BORDER || r >= img.rows - SIFT_IMG_BORDER )
355             return false;
356     }
357 
358     // ensure convergence of interpolation
359     if( i >= SIFT_MAX_INTERP_STEPS )
360         return false;
361 
362     {
363         int idx = octv*(nOctaveLayers+2) + layer;
364         const Mat& img = dog_pyr[idx];
365         const Mat& prev = dog_pyr[idx-1];
366         const Mat& next = dog_pyr[idx+1];
367         Matx31f dD((img.at<sift_wt>(r, c+1) - img.at<sift_wt>(r, c-1))*deriv_scale,
368                    (img.at<sift_wt>(r+1, c) - img.at<sift_wt>(r-1, c))*deriv_scale,
369                    (next.at<sift_wt>(r, c) - prev.at<sift_wt>(r, c))*deriv_scale);
370         float t = dD.dot(Matx31f(xc, xr, xi));
371 
372         contr = img.at<sift_wt>(r, c)*img_scale + t * 0.5f;
373         if( std::abs( contr ) * nOctaveLayers < contrastThreshold )
374             return false;
375 
376         // principal curvatures are computed using the trace and det of Hessian
377         float v2 = img.at<sift_wt>(r, c)*2.f;
378         float dxx = (img.at<sift_wt>(r, c+1) + img.at<sift_wt>(r, c-1) - v2)*second_deriv_scale;
379         float dyy = (img.at<sift_wt>(r+1, c) + img.at<sift_wt>(r-1, c) - v2)*second_deriv_scale;
380         float dxy = (img.at<sift_wt>(r+1, c+1) - img.at<sift_wt>(r+1, c-1) -
381                      img.at<sift_wt>(r-1, c+1) + img.at<sift_wt>(r-1, c-1)) * cross_deriv_scale;
382         float tr = dxx + dyy;
383         float det = dxx * dyy - dxy * dxy;
384 
385         if( det <= 0 || tr*tr*edgeThreshold >= (edgeThreshold + 1)*(edgeThreshold + 1)*det )
386             return false;
387     }
388 
389     kpt.pt.x = (c + xc) * (1 << octv);
390     kpt.pt.y = (r + xr) * (1 << octv);
391     kpt.octave = octv + (layer << 8) + (cvRound((xi + 0.5)*255) << 16);
392     kpt.size = sigma*powf(2.f, (layer + xi) / nOctaveLayers)*(1 << octv)*2;
393     kpt.response = std::abs(contr);
394 
395     return true;
396 }
397 
398 namespace {
399 
400 class findScaleSpaceExtremaT
401 {
402 public:
findScaleSpaceExtremaT(int _o,int _i,int _threshold,int _idx,int _step,int _cols,int _nOctaveLayers,double _contrastThreshold,double _edgeThreshold,double _sigma,const std::vector<Mat> & _gauss_pyr,const std::vector<Mat> & _dog_pyr,std::vector<KeyPoint> & kpts)403     findScaleSpaceExtremaT(
404         int _o,
405         int _i,
406         int _threshold,
407         int _idx,
408         int _step,
409         int _cols,
410         int _nOctaveLayers,
411         double _contrastThreshold,
412         double _edgeThreshold,
413         double _sigma,
414         const std::vector<Mat>& _gauss_pyr,
415         const std::vector<Mat>& _dog_pyr,
416         std::vector<KeyPoint>& kpts)
417 
418         : o(_o),
419           i(_i),
420           threshold(_threshold),
421           idx(_idx),
422           step(_step),
423           cols(_cols),
424           nOctaveLayers(_nOctaveLayers),
425           contrastThreshold(_contrastThreshold),
426           edgeThreshold(_edgeThreshold),
427           sigma(_sigma),
428           gauss_pyr(_gauss_pyr),
429           dog_pyr(_dog_pyr),
430           kpts_(kpts)
431     {
432         // nothing
433     }
process(const cv::Range & range)434     void process(const cv::Range& range)
435     {
436         CV_TRACE_FUNCTION();
437 
438         const int begin = range.start;
439         const int end = range.end;
440 
441         static const int n = SIFT_ORI_HIST_BINS;
442         float CV_DECL_ALIGNED(CV_SIMD_WIDTH) hist[n];
443 
444         const Mat& img = dog_pyr[idx];
445         const Mat& prev = dog_pyr[idx-1];
446         const Mat& next = dog_pyr[idx+1];
447 
448         for( int r = begin; r < end; r++)
449         {
450             const sift_wt* currptr = img.ptr<sift_wt>(r);
451             const sift_wt* prevptr = prev.ptr<sift_wt>(r);
452             const sift_wt* nextptr = next.ptr<sift_wt>(r);
453             int c = SIFT_IMG_BORDER;
454 
455 #if CV_SIMD && !(DoG_TYPE_SHORT)
456             const int vecsize = v_float32::nlanes;
457             for( ; c <= cols-SIFT_IMG_BORDER - vecsize; c += vecsize)
458             {
459                 v_float32 val = vx_load(&currptr[c]);
460                 v_float32 _00,_01,_02;
461                 v_float32 _10,    _12;
462                 v_float32 _20,_21,_22;
463 
464                 v_float32 vmin,vmax;
465 
466 
467                 v_float32 cond = v_abs(val) > vx_setall_f32((float)threshold);
468                 if (!v_check_any(cond))
469                 {
470                     continue;
471                 }
472 
473                 _00 = vx_load(&currptr[c-step-1]); _01 = vx_load(&currptr[c-step]); _02 = vx_load(&currptr[c-step+1]);
474                 _10 = vx_load(&currptr[c     -1]);                                  _12 = vx_load(&currptr[c     +1]);
475                 _20 = vx_load(&currptr[c+step-1]); _21 = vx_load(&currptr[c+step]); _22 = vx_load(&currptr[c+step+1]);
476 
477                 vmax = v_max(v_max(v_max(_00,_01),v_max(_02,_10)),v_max(v_max(_12,_20),v_max(_21,_22)));
478                 vmin = v_min(v_min(v_min(_00,_01),v_min(_02,_10)),v_min(v_min(_12,_20),v_min(_21,_22)));
479 
480                 v_float32 condp = cond & (val > vx_setall_f32(0)) & (val >= vmax);
481                 v_float32 condm = cond & (val < vx_setall_f32(0)) & (val <= vmin);
482 
483                 cond = condp | condm;
484                 if (!v_check_any(cond))
485                 {
486                     continue;
487                 }
488 
489                 _00 = vx_load(&prevptr[c-step-1]); _01 = vx_load(&prevptr[c-step]); _02 = vx_load(&prevptr[c-step+1]);
490                 _10 = vx_load(&prevptr[c     -1]);                                  _12 = vx_load(&prevptr[c     +1]);
491                 _20 = vx_load(&prevptr[c+step-1]); _21 = vx_load(&prevptr[c+step]); _22 = vx_load(&prevptr[c+step+1]);
492 
493                 vmax = v_max(v_max(v_max(_00,_01),v_max(_02,_10)),v_max(v_max(_12,_20),v_max(_21,_22)));
494                 vmin = v_min(v_min(v_min(_00,_01),v_min(_02,_10)),v_min(v_min(_12,_20),v_min(_21,_22)));
495 
496                 condp &= (val >= vmax);
497                 condm &= (val <= vmin);
498 
499                 cond = condp | condm;
500                 if (!v_check_any(cond))
501                 {
502                     continue;
503                 }
504 
505                 v_float32 _11p = vx_load(&prevptr[c]);
506                 v_float32 _11n = vx_load(&nextptr[c]);
507 
508                 v_float32 max_middle = v_max(_11n,_11p);
509                 v_float32 min_middle = v_min(_11n,_11p);
510 
511                 _00 = vx_load(&nextptr[c-step-1]); _01 = vx_load(&nextptr[c-step]); _02 = vx_load(&nextptr[c-step+1]);
512                 _10 = vx_load(&nextptr[c     -1]);                                  _12 = vx_load(&nextptr[c     +1]);
513                 _20 = vx_load(&nextptr[c+step-1]); _21 = vx_load(&nextptr[c+step]); _22 = vx_load(&nextptr[c+step+1]);
514 
515                 vmax = v_max(v_max(v_max(_00,_01),v_max(_02,_10)),v_max(v_max(_12,_20),v_max(_21,_22)));
516                 vmin = v_min(v_min(v_min(_00,_01),v_min(_02,_10)),v_min(v_min(_12,_20),v_min(_21,_22)));
517 
518                 condp &= (val >= v_max(vmax,max_middle));
519                 condm &= (val <= v_min(vmin,min_middle));
520 
521                 cond = condp | condm;
522                 if (!v_check_any(cond))
523                 {
524                     continue;
525                 }
526 
527                 int mask = v_signmask(cond);
528                 for (int k = 0; k<vecsize;k++)
529                 {
530                     if ((mask & (1<<k)) == 0)
531                         continue;
532 
533                     CV_TRACE_REGION("pixel_candidate_simd");
534 
535                     KeyPoint kpt;
536                     int r1 = r, c1 = c+k, layer = i;
537                     if( !adjustLocalExtrema(dog_pyr, kpt, o, layer, r1, c1,
538                                             nOctaveLayers, (float)contrastThreshold,
539                                             (float)edgeThreshold, (float)sigma) )
540                         continue;
541                     float scl_octv = kpt.size*0.5f/(1 << o);
542                     float omax = calcOrientationHist(gauss_pyr[o*(nOctaveLayers+3) + layer],
543                                                      Point(c1, r1),
544                                                      cvRound(SIFT_ORI_RADIUS * scl_octv),
545                                                      SIFT_ORI_SIG_FCTR * scl_octv,
546                                                      hist, n);
547                     float mag_thr = (float)(omax * SIFT_ORI_PEAK_RATIO);
548                     for( int j = 0; j < n; j++ )
549                     {
550                         int l = j > 0 ? j - 1 : n - 1;
551                         int r2 = j < n-1 ? j + 1 : 0;
552 
553                         if( hist[j] > hist[l]  &&  hist[j] > hist[r2]  &&  hist[j] >= mag_thr )
554                         {
555                             float bin = j + 0.5f * (hist[l]-hist[r2]) / (hist[l] - 2*hist[j] + hist[r2]);
556                             bin = bin < 0 ? n + bin : bin >= n ? bin - n : bin;
557                             kpt.angle = 360.f - (float)((360.f/n) * bin);
558                             if(std::abs(kpt.angle - 360.f) < FLT_EPSILON)
559                                 kpt.angle = 0.f;
560 
561                             kpts_.push_back(kpt);
562                         }
563                     }
564                 }
565             }
566 
567 #endif //CV_SIMD && !(DoG_TYPE_SHORT)
568 
569             // vector loop reminder, better predictibility and less branch density
570             for( ; c < cols-SIFT_IMG_BORDER; c++)
571             {
572                 sift_wt val = currptr[c];
573                 if (std::abs(val) <= threshold)
574                     continue;
575 
576                 sift_wt _00,_01,_02;
577                 sift_wt _10,    _12;
578                 sift_wt _20,_21,_22;
579                 _00 = currptr[c-step-1]; _01 = currptr[c-step]; _02 = currptr[c-step+1];
580                 _10 = currptr[c     -1];                        _12 = currptr[c     +1];
581                 _20 = currptr[c+step-1]; _21 = currptr[c+step]; _22 = currptr[c+step+1];
582 
583                 bool calculate = false;
584                 if (val > 0)
585                 {
586                     sift_wt vmax = std::max(std::max(std::max(_00,_01),std::max(_02,_10)),std::max(std::max(_12,_20),std::max(_21,_22)));
587                     if (val >= vmax)
588                     {
589                         _00 = prevptr[c-step-1]; _01 = prevptr[c-step]; _02 = prevptr[c-step+1];
590                         _10 = prevptr[c     -1];                        _12 = prevptr[c     +1];
591                         _20 = prevptr[c+step-1]; _21 = prevptr[c+step]; _22 = prevptr[c+step+1];
592                         vmax = std::max(std::max(std::max(_00,_01),std::max(_02,_10)),std::max(std::max(_12,_20),std::max(_21,_22)));
593                         if (val >= vmax)
594                         {
595                             _00 = nextptr[c-step-1]; _01 = nextptr[c-step]; _02 = nextptr[c-step+1];
596                             _10 = nextptr[c     -1];                        _12 = nextptr[c     +1];
597                             _20 = nextptr[c+step-1]; _21 = nextptr[c+step]; _22 = nextptr[c+step+1];
598                             vmax = std::max(std::max(std::max(_00,_01),std::max(_02,_10)),std::max(std::max(_12,_20),std::max(_21,_22)));
599                             if (val >= vmax)
600                             {
601                                 sift_wt _11p = prevptr[c], _11n = nextptr[c];
602                                 calculate = (val >= std::max(_11p,_11n));
603                             }
604                         }
605                     }
606 
607                 } else  { // val cant be zero here (first abs took care of zero), must be negative
608                     sift_wt vmin = std::min(std::min(std::min(_00,_01),std::min(_02,_10)),std::min(std::min(_12,_20),std::min(_21,_22)));
609                     if (val <= vmin)
610                     {
611                         _00 = prevptr[c-step-1]; _01 = prevptr[c-step]; _02 = prevptr[c-step+1];
612                         _10 = prevptr[c     -1];                        _12 = prevptr[c     +1];
613                         _20 = prevptr[c+step-1]; _21 = prevptr[c+step]; _22 = prevptr[c+step+1];
614                         vmin = std::min(std::min(std::min(_00,_01),std::min(_02,_10)),std::min(std::min(_12,_20),std::min(_21,_22)));
615                         if (val <= vmin)
616                         {
617                             _00 = nextptr[c-step-1]; _01 = nextptr[c-step]; _02 = nextptr[c-step+1];
618                             _10 = nextptr[c     -1];                        _12 = nextptr[c     +1];
619                             _20 = nextptr[c+step-1]; _21 = nextptr[c+step]; _22 = nextptr[c+step+1];
620                             vmin = std::min(std::min(std::min(_00,_01),std::min(_02,_10)),std::min(std::min(_12,_20),std::min(_21,_22)));
621                             if (val <= vmin)
622                             {
623                                 sift_wt _11p = prevptr[c], _11n = nextptr[c];
624                                 calculate = (val <= std::min(_11p,_11n));
625                             }
626                         }
627                     }
628                 }
629 
630                 if (calculate)
631                 {
632                     CV_TRACE_REGION("pixel_candidate");
633 
634                     KeyPoint kpt;
635                     int r1 = r, c1 = c, layer = i;
636                     if( !adjustLocalExtrema(dog_pyr, kpt, o, layer, r1, c1,
637                                             nOctaveLayers, (float)contrastThreshold,
638                                             (float)edgeThreshold, (float)sigma) )
639                         continue;
640                     float scl_octv = kpt.size*0.5f/(1 << o);
641                     float omax = calcOrientationHist(gauss_pyr[o*(nOctaveLayers+3) + layer],
642                                                      Point(c1, r1),
643                                                      cvRound(SIFT_ORI_RADIUS * scl_octv),
644                                                      SIFT_ORI_SIG_FCTR * scl_octv,
645                                                      hist, n);
646                     float mag_thr = (float)(omax * SIFT_ORI_PEAK_RATIO);
647                     for( int j = 0; j < n; j++ )
648                     {
649                         int l = j > 0 ? j - 1 : n - 1;
650                         int r2 = j < n-1 ? j + 1 : 0;
651 
652                         if( hist[j] > hist[l]  &&  hist[j] > hist[r2]  &&  hist[j] >= mag_thr )
653                         {
654                             float bin = j + 0.5f * (hist[l]-hist[r2]) / (hist[l] - 2*hist[j] + hist[r2]);
655                             bin = bin < 0 ? n + bin : bin >= n ? bin - n : bin;
656                             kpt.angle = 360.f - (float)((360.f/n) * bin);
657                             if(std::abs(kpt.angle - 360.f) < FLT_EPSILON)
658                                 kpt.angle = 0.f;
659 
660                             kpts_.push_back(kpt);
661                         }
662                     }
663                 }
664             }
665         }
666     }
667 private:
668     int o, i;
669     int threshold;
670     int idx, step, cols;
671     int nOctaveLayers;
672     double contrastThreshold;
673     double edgeThreshold;
674     double sigma;
675     const std::vector<Mat>& gauss_pyr;
676     const std::vector<Mat>& dog_pyr;
677     std::vector<KeyPoint>& kpts_;
678 };
679 
680 }  // namespace
681 
682 
findScaleSpaceExtrema(int octave,int layer,int threshold,int idx,int step,int cols,int nOctaveLayers,double contrastThreshold,double edgeThreshold,double sigma,const std::vector<Mat> & gauss_pyr,const std::vector<Mat> & dog_pyr,std::vector<KeyPoint> & kpts,const cv::Range & range)683 void findScaleSpaceExtrema(
684     int octave,
685     int layer,
686     int threshold,
687     int idx,
688     int step,
689     int cols,
690     int nOctaveLayers,
691     double contrastThreshold,
692     double edgeThreshold,
693     double sigma,
694     const std::vector<Mat>& gauss_pyr,
695     const std::vector<Mat>& dog_pyr,
696     std::vector<KeyPoint>& kpts,
697     const cv::Range& range)
698 {
699     CV_TRACE_FUNCTION();
700 
701     findScaleSpaceExtremaT(octave, layer, threshold, idx,
702             step, cols,
703             nOctaveLayers, contrastThreshold, edgeThreshold, sigma,
704             gauss_pyr, dog_pyr,
705             kpts)
706         .process(range);
707 }
708 
calcSIFTDescriptor(const Mat & img,Point2f ptf,float ori,float scl,int d,int n,Mat & dstMat,int row)709 void calcSIFTDescriptor(
710         const Mat& img, Point2f ptf, float ori, float scl,
711         int d, int n, Mat& dstMat, int row
712 )
713 {
714     CV_TRACE_FUNCTION();
715 
716     Point pt(cvRound(ptf.x), cvRound(ptf.y));
717     float cos_t = cosf(ori*(float)(CV_PI/180));
718     float sin_t = sinf(ori*(float)(CV_PI/180));
719     float bins_per_rad = n / 360.f;
720     float exp_scale = -1.f/(d * d * 0.5f);
721     float hist_width = SIFT_DESCR_SCL_FCTR * scl;
722     int radius = cvRound(hist_width * 1.4142135623730951f * (d + 1) * 0.5f);
723     // Clip the radius to the diagonal of the image to avoid autobuffer too large exception
724     radius = std::min(radius, (int)std::sqrt(((double) img.cols)*img.cols + ((double) img.rows)*img.rows));
725     cos_t /= hist_width;
726     sin_t /= hist_width;
727 
728     int i, j, k, len = (radius*2+1)*(radius*2+1), histlen = (d+2)*(d+2)*(n+2);
729     int rows = img.rows, cols = img.cols;
730 
731     cv::utils::BufferArea area;
732     float *X = 0, *Y = 0, *Mag, *Ori = 0, *W = 0, *RBin = 0, *CBin = 0, *hist = 0, *rawDst = 0;
733     area.allocate(X, len, CV_SIMD_WIDTH);
734     area.allocate(Y, len, CV_SIMD_WIDTH);
735     area.allocate(Ori, len, CV_SIMD_WIDTH);
736     area.allocate(W, len, CV_SIMD_WIDTH);
737     area.allocate(RBin, len, CV_SIMD_WIDTH);
738     area.allocate(CBin, len, CV_SIMD_WIDTH);
739     area.allocate(hist, histlen, CV_SIMD_WIDTH);
740     area.allocate(rawDst, len, CV_SIMD_WIDTH);
741     area.commit();
742     Mag = Y;
743 
744     for( i = 0; i < d+2; i++ )
745     {
746         for( j = 0; j < d+2; j++ )
747             for( k = 0; k < n+2; k++ )
748                 hist[(i*(d+2) + j)*(n+2) + k] = 0.;
749     }
750 
751     for( i = -radius, k = 0; i <= radius; i++ )
752         for( j = -radius; j <= radius; j++ )
753         {
754             // Calculate sample's histogram array coords rotated relative to ori.
755             // Subtract 0.5 so samples that fall e.g. in the center of row 1 (i.e.
756             // r_rot = 1.5) have full weight placed in row 1 after interpolation.
757             float c_rot = j * cos_t - i * sin_t;
758             float r_rot = j * sin_t + i * cos_t;
759             float rbin = r_rot + d/2 - 0.5f;
760             float cbin = c_rot + d/2 - 0.5f;
761             int r = pt.y + i, c = pt.x + j;
762 
763             if( rbin > -1 && rbin < d && cbin > -1 && cbin < d &&
764                 r > 0 && r < rows - 1 && c > 0 && c < cols - 1 )
765             {
766                 float dx = (float)(img.at<sift_wt>(r, c+1) - img.at<sift_wt>(r, c-1));
767                 float dy = (float)(img.at<sift_wt>(r-1, c) - img.at<sift_wt>(r+1, c));
768                 X[k] = dx; Y[k] = dy; RBin[k] = rbin; CBin[k] = cbin;
769                 W[k] = (c_rot * c_rot + r_rot * r_rot)*exp_scale;
770                 k++;
771             }
772         }
773 
774     len = k;
775     cv::hal::fastAtan2(Y, X, Ori, len, true);
776     cv::hal::magnitude32f(X, Y, Mag, len);
777     cv::hal::exp32f(W, W, len);
778 
779     k = 0;
780 #if CV_SIMD
781     {
782         const int vecsize = v_float32::nlanes;
783         int CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx_buf[vecsize];
784         float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rco_buf[8*vecsize];
785         const v_float32 __ori  = vx_setall_f32(ori);
786         const v_float32 __bins_per_rad = vx_setall_f32(bins_per_rad);
787         const v_int32 __n = vx_setall_s32(n);
788         const v_int32 __1 = vx_setall_s32(1);
789         const v_int32 __d_plus_2 = vx_setall_s32(d+2);
790         const v_int32 __n_plus_2 = vx_setall_s32(n+2);
791         for( ; k <= len - vecsize; k += vecsize )
792         {
793             v_float32 rbin = vx_load_aligned(RBin + k);
794             v_float32 cbin = vx_load_aligned(CBin + k);
795             v_float32 obin = (vx_load_aligned(Ori + k) - __ori) * __bins_per_rad;
796             v_float32 mag = vx_load_aligned(Mag + k) * vx_load_aligned(W + k);
797 
798             v_int32 r0 = v_floor(rbin);
799             v_int32 c0 = v_floor(cbin);
800             v_int32 o0 = v_floor(obin);
801             rbin -= v_cvt_f32(r0);
802             cbin -= v_cvt_f32(c0);
803             obin -= v_cvt_f32(o0);
804 
805             o0 = v_select(o0 < vx_setzero_s32(), o0 + __n, o0);
806             o0 = v_select(o0 >= __n, o0 - __n, o0);
807 
808             v_float32 v_r1 = mag*rbin, v_r0 = mag - v_r1;
809             v_float32 v_rc11 = v_r1*cbin, v_rc10 = v_r1 - v_rc11;
810             v_float32 v_rc01 = v_r0*cbin, v_rc00 = v_r0 - v_rc01;
811             v_float32 v_rco111 = v_rc11*obin, v_rco110 = v_rc11 - v_rco111;
812             v_float32 v_rco101 = v_rc10*obin, v_rco100 = v_rc10 - v_rco101;
813             v_float32 v_rco011 = v_rc01*obin, v_rco010 = v_rc01 - v_rco011;
814             v_float32 v_rco001 = v_rc00*obin, v_rco000 = v_rc00 - v_rco001;
815 
816             v_int32 idx = v_muladd(v_muladd(r0+__1, __d_plus_2, c0+__1), __n_plus_2, o0);
817             v_store_aligned(idx_buf, idx);
818 
819             v_store_aligned(rco_buf,           v_rco000);
820             v_store_aligned(rco_buf+vecsize,   v_rco001);
821             v_store_aligned(rco_buf+vecsize*2, v_rco010);
822             v_store_aligned(rco_buf+vecsize*3, v_rco011);
823             v_store_aligned(rco_buf+vecsize*4, v_rco100);
824             v_store_aligned(rco_buf+vecsize*5, v_rco101);
825             v_store_aligned(rco_buf+vecsize*6, v_rco110);
826             v_store_aligned(rco_buf+vecsize*7, v_rco111);
827 
828             for(int id = 0; id < vecsize; id++)
829             {
830                 hist[idx_buf[id]] += rco_buf[id];
831                 hist[idx_buf[id]+1] += rco_buf[vecsize + id];
832                 hist[idx_buf[id]+(n+2)] += rco_buf[2*vecsize + id];
833                 hist[idx_buf[id]+(n+3)] += rco_buf[3*vecsize + id];
834                 hist[idx_buf[id]+(d+2)*(n+2)] += rco_buf[4*vecsize + id];
835                 hist[idx_buf[id]+(d+2)*(n+2)+1] += rco_buf[5*vecsize + id];
836                 hist[idx_buf[id]+(d+3)*(n+2)] += rco_buf[6*vecsize + id];
837                 hist[idx_buf[id]+(d+3)*(n+2)+1] += rco_buf[7*vecsize + id];
838             }
839         }
840     }
841 #endif
842     for( ; k < len; k++ )
843     {
844         float rbin = RBin[k], cbin = CBin[k];
845         float obin = (Ori[k] - ori)*bins_per_rad;
846         float mag = Mag[k]*W[k];
847 
848         int r0 = cvFloor( rbin );
849         int c0 = cvFloor( cbin );
850         int o0 = cvFloor( obin );
851         rbin -= r0;
852         cbin -= c0;
853         obin -= o0;
854 
855         if( o0 < 0 )
856             o0 += n;
857         if( o0 >= n )
858             o0 -= n;
859 
860         // histogram update using tri-linear interpolation
861         float v_r1 = mag*rbin, v_r0 = mag - v_r1;
862         float v_rc11 = v_r1*cbin, v_rc10 = v_r1 - v_rc11;
863         float v_rc01 = v_r0*cbin, v_rc00 = v_r0 - v_rc01;
864         float v_rco111 = v_rc11*obin, v_rco110 = v_rc11 - v_rco111;
865         float v_rco101 = v_rc10*obin, v_rco100 = v_rc10 - v_rco101;
866         float v_rco011 = v_rc01*obin, v_rco010 = v_rc01 - v_rco011;
867         float v_rco001 = v_rc00*obin, v_rco000 = v_rc00 - v_rco001;
868 
869         int idx = ((r0+1)*(d+2) + c0+1)*(n+2) + o0;
870         hist[idx] += v_rco000;
871         hist[idx+1] += v_rco001;
872         hist[idx+(n+2)] += v_rco010;
873         hist[idx+(n+3)] += v_rco011;
874         hist[idx+(d+2)*(n+2)] += v_rco100;
875         hist[idx+(d+2)*(n+2)+1] += v_rco101;
876         hist[idx+(d+3)*(n+2)] += v_rco110;
877         hist[idx+(d+3)*(n+2)+1] += v_rco111;
878     }
879 
880     // finalize histogram, since the orientation histograms are circular
881     for( i = 0; i < d; i++ )
882         for( j = 0; j < d; j++ )
883         {
884             int idx = ((i+1)*(d+2) + (j+1))*(n+2);
885             hist[idx] += hist[idx+n];
886             hist[idx+1] += hist[idx+n+1];
887             for( k = 0; k < n; k++ )
888                 rawDst[(i*d + j)*n + k] = hist[idx+k];
889         }
890     // copy histogram to the descriptor,
891     // apply hysteresis thresholding
892     // and scale the result, so that it can be easily converted
893     // to byte array
894     float nrm2 = 0;
895     len = d*d*n;
896     k = 0;
897 #if CV_SIMD
898     {
899         v_float32 __nrm2 = vx_setzero_f32();
900         v_float32 __rawDst;
901         for( ; k <= len - v_float32::nlanes; k += v_float32::nlanes )
902         {
903             __rawDst = vx_load_aligned(rawDst + k);
904             __nrm2 = v_fma(__rawDst, __rawDst, __nrm2);
905         }
906         nrm2 = (float)v_reduce_sum(__nrm2);
907     }
908 #endif
909     for( ; k < len; k++ )
910         nrm2 += rawDst[k]*rawDst[k];
911 
912     float thr = std::sqrt(nrm2)*SIFT_DESCR_MAG_THR;
913 
914     i = 0, nrm2 = 0;
915 #if 0 //CV_AVX2
916     // This code cannot be enabled because it sums nrm2 in a different order,
917     // thus producing slightly different results
918     {
919         float CV_DECL_ALIGNED(CV_SIMD_WIDTH) nrm2_buf[8];
920         __m256 __dst;
921         __m256 __nrm2 = _mm256_setzero_ps();
922         __m256 __thr = _mm256_set1_ps(thr);
923         for( ; i <= len - 8; i += 8 )
924         {
925             __dst = _mm256_loadu_ps(&rawDst[i]);
926             __dst = _mm256_min_ps(__dst, __thr);
927             _mm256_storeu_ps(&rawDst[i], __dst);
928 #if CV_FMA3
929             __nrm2 = _mm256_fmadd_ps(__dst, __dst, __nrm2);
930 #else
931             __nrm2 = _mm256_add_ps(__nrm2, _mm256_mul_ps(__dst, __dst));
932 #endif
933         }
934         _mm256_store_ps(nrm2_buf, __nrm2);
935         nrm2 = nrm2_buf[0] + nrm2_buf[1] + nrm2_buf[2] + nrm2_buf[3] +
936                nrm2_buf[4] + nrm2_buf[5] + nrm2_buf[6] + nrm2_buf[7];
937     }
938 #endif
939     for( ; i < len; i++ )
940     {
941         float val = std::min(rawDst[i], thr);
942         rawDst[i] = val;
943         nrm2 += val*val;
944     }
945     nrm2 = SIFT_INT_DESCR_FCTR/std::max(std::sqrt(nrm2), FLT_EPSILON);
946 
947 #if 1
948     k = 0;
949 if( dstMat.type() == CV_32F )
950 {
951     float* dst = dstMat.ptr<float>(row);
952 #if CV_SIMD
953     v_float32 __dst;
954     v_float32 __min = vx_setzero_f32();
955     v_float32 __max = vx_setall_f32(255.0f); // max of uchar
956     v_float32 __nrm2 = vx_setall_f32(nrm2);
957     for( k = 0; k <= len - v_float32::nlanes; k += v_float32::nlanes )
958     {
959         __dst = vx_load_aligned(rawDst + k);
960         __dst = v_min(v_max(v_cvt_f32(v_round(__dst * __nrm2)), __min), __max);
961         v_store(dst + k, __dst);
962     }
963 #endif
964     for( ; k < len; k++ )
965     {
966         dst[k] = saturate_cast<uchar>(rawDst[k]*nrm2);
967     }
968 }
969 else // CV_8U
970 {
971     uint8_t* dst = dstMat.ptr<uint8_t>(row);
972 #if CV_SIMD
973     v_float32 __dst0, __dst1;
974     v_uint16 __pack01;
975     v_float32 __nrm2 = vx_setall_f32(nrm2);
976     for( k = 0; k <= len - v_float32::nlanes * 2; k += v_float32::nlanes * 2 )
977     {
978         __dst0 = vx_load_aligned(rawDst + k);
979         __dst1 = vx_load_aligned(rawDst + k + v_float32::nlanes);
980 
981         __pack01 = v_pack_u(v_round(__dst0 * __nrm2), v_round(__dst1 * __nrm2));
982         v_pack_store(dst + k, __pack01);
983     }
984 #endif
985     for( ; k < len; k++ )
986     {
987         dst[k] = saturate_cast<uchar>(rawDst[k]*nrm2);
988     }
989 }
990 #else
991     float* dst = dstMat.ptr<float>(row);
992     float nrm1 = 0;
993     for( k = 0; k < len; k++ )
994     {
995         rawDst[k] *= nrm2;
996         nrm1 += rawDst[k];
997     }
998     nrm1 = 1.f/std::max(nrm1, FLT_EPSILON);
999 if( dstMat.type() == CV_32F )
1000 {
1001     for( k = 0; k < len; k++ )
1002     {
1003         dst[k] = std::sqrt(rawDst[k] * nrm1);
1004     }
1005 }
1006 else // CV_8U
1007 {
1008     for( k = 0; k < len; k++ )
1009     {
1010         dst[k] = saturate_cast<uchar>(std::sqrt(rawDst[k] * nrm1)*SIFT_INT_DESCR_FCTR);
1011     }
1012 }
1013 #endif
1014 }
1015 
1016 #endif
1017 CV_CPU_OPTIMIZATION_NAMESPACE_END
1018 } // namespace
1019