1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
16 // Third party copyrights are property of their respective owners.
17 //
18 // Redistribution and use in source and binary forms, with or without modification,
19 // are permitted provided that the following conditions are met:
20 //
21 //   * Redistribution's of source code must retain the above copyright notice,
22 //     this list of conditions and the following disclaimer.
23 //
24 //   * Redistribution's in binary form must reproduce the above copyright notice,
25 //     this list of conditions and the following disclaimer in the documentation
26 //     and/or other materials provided with the distribution.
27 //
28 //   * The name of the copyright holders may not be used to endorse or promote products
29 //     derived from this software without specific prior written permission.
30 //
31 // This software is provided by the copyright holders and contributors "as is" and
32 // any express or implied warranties, including, but not limited to, the implied
33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
34 // In no event shall the Intel Corporation or contributors be liable for any direct,
35 // indirect, incidental, special, exemplary, or consequential damages
36 // (including, but not limited to, procurement of substitute goods or services;
37 // loss of use, data, or profits; or business interruption) however caused
38 // and on any theory of liability, whether in contract, strict liability,
39 // or tort (including negligence or otherwise) arising in any way out of
40 // the use of this software, even if advised of the possibility of such damage.
41 //
42 //M*/
43 
44 /* ////////////////////////////////////////////////////////////////////
45 //
46 //  Geometrical transforms on images and matrices: rotation, zoom etc.
47 //
48 // */
49 
50 #include "precomp.hpp"
51 #include "opencl_kernels_imgproc.hpp"
52 #include "hal_replacement.hpp"
53 #include <opencv2/core/utils/configuration.private.hpp>
54 #include "opencv2/core/hal/intrin.hpp"
55 #include "opencv2/core/openvx/ovx_defs.hpp"
56 #include "opencv2/core/softfloat.hpp"
57 #include "imgwarp.hpp"
58 
59 using namespace cv;
60 
61 namespace cv
62 {
63 
64 #if defined (HAVE_IPP) && (!IPP_DISABLE_WARPAFFINE || !IPP_DISABLE_WARPPERSPECTIVE || !IPP_DISABLE_REMAP)
65 typedef IppStatus (CV_STDCALL* ippiSetFunc)(const void*, void *, int, IppiSize);
66 
67 template <int channels, typename Type>
IPPSetSimple(cv::Scalar value,void * dataPointer,int step,IppiSize & size,ippiSetFunc func)68 bool IPPSetSimple(cv::Scalar value, void *dataPointer, int step, IppiSize &size, ippiSetFunc func)
69 {
70     CV_INSTRUMENT_REGION_IPP();
71 
72     Type values[channels];
73     for( int i = 0; i < channels; i++ )
74         values[i] = saturate_cast<Type>(value[i]);
75     return func(values, dataPointer, step, size) >= 0;
76 }
77 
IPPSet(const cv::Scalar & value,void * dataPointer,int step,IppiSize & size,int channels,int depth)78 static bool IPPSet(const cv::Scalar &value, void *dataPointer, int step, IppiSize &size, int channels, int depth)
79 {
80     CV_INSTRUMENT_REGION_IPP();
81 
82     if( channels == 1 )
83     {
84         switch( depth )
85         {
86         case CV_8U:
87             return CV_INSTRUMENT_FUN_IPP(ippiSet_8u_C1R, saturate_cast<Ipp8u>(value[0]), (Ipp8u *)dataPointer, step, size) >= 0;
88         case CV_16U:
89             return CV_INSTRUMENT_FUN_IPP(ippiSet_16u_C1R, saturate_cast<Ipp16u>(value[0]), (Ipp16u *)dataPointer, step, size) >= 0;
90         case CV_32F:
91             return CV_INSTRUMENT_FUN_IPP(ippiSet_32f_C1R, saturate_cast<Ipp32f>(value[0]), (Ipp32f *)dataPointer, step, size) >= 0;
92         }
93     }
94     else
95     {
96         if( channels == 3 )
97         {
98             switch( depth )
99             {
100             case CV_8U:
101                 return IPPSetSimple<3, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C3R);
102             case CV_16U:
103                 return IPPSetSimple<3, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C3R);
104             case CV_32F:
105                 return IPPSetSimple<3, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C3R);
106             }
107         }
108         else if( channels == 4 )
109         {
110             switch( depth )
111             {
112             case CV_8U:
113                 return IPPSetSimple<4, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C4R);
114             case CV_16U:
115                 return IPPSetSimple<4, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C4R);
116             case CV_32F:
117                 return IPPSetSimple<4, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C4R);
118             }
119         }
120     }
121     return false;
122 }
123 #endif
124 
125 /************** interpolation formulas and tables ***************/
126 
127 const int INTER_REMAP_COEF_BITS=15;
128 const int INTER_REMAP_COEF_SCALE=1 << INTER_REMAP_COEF_BITS;
129 
130 static uchar NNDeltaTab_i[INTER_TAB_SIZE2][2];
131 
132 static float BilinearTab_f[INTER_TAB_SIZE2][2][2];
133 static short BilinearTab_i[INTER_TAB_SIZE2][2][2];
134 
135 #if CV_SIMD128
136 static short BilinearTab_iC4_buf[INTER_TAB_SIZE2+2][2][8];
137 static short (*BilinearTab_iC4)[2][8] = (short (*)[2][8])alignPtr(BilinearTab_iC4_buf, 16);
138 #endif
139 
140 static float BicubicTab_f[INTER_TAB_SIZE2][4][4];
141 static short BicubicTab_i[INTER_TAB_SIZE2][4][4];
142 
143 static float Lanczos4Tab_f[INTER_TAB_SIZE2][8][8];
144 static short Lanczos4Tab_i[INTER_TAB_SIZE2][8][8];
145 
interpolateLinear(float x,float * coeffs)146 static inline void interpolateLinear( float x, float* coeffs )
147 {
148     coeffs[0] = 1.f - x;
149     coeffs[1] = x;
150 }
151 
interpolateCubic(float x,float * coeffs)152 static inline void interpolateCubic( float x, float* coeffs )
153 {
154     const float A = -0.75f;
155 
156     coeffs[0] = ((A*(x + 1) - 5*A)*(x + 1) + 8*A)*(x + 1) - 4*A;
157     coeffs[1] = ((A + 2)*x - (A + 3))*x*x + 1;
158     coeffs[2] = ((A + 2)*(1 - x) - (A + 3))*(1 - x)*(1 - x) + 1;
159     coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
160 }
161 
interpolateLanczos4(float x,float * coeffs)162 static inline void interpolateLanczos4( float x, float* coeffs )
163 {
164     static const double s45 = 0.70710678118654752440084436210485;
165     static const double cs[][2]=
166     {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}};
167 
168     if( x < FLT_EPSILON )
169     {
170         for( int i = 0; i < 8; i++ )
171             coeffs[i] = 0;
172         coeffs[3] = 1;
173         return;
174     }
175 
176     float sum = 0;
177     double y0=-(x+3)*CV_PI*0.25, s0 = std::sin(y0), c0= std::cos(y0);
178     for(int i = 0; i < 8; i++ )
179     {
180         double y = -(x+3-i)*CV_PI*0.25;
181         coeffs[i] = (float)((cs[i][0]*s0 + cs[i][1]*c0)/(y*y));
182         sum += coeffs[i];
183     }
184 
185     sum = 1.f/sum;
186     for(int i = 0; i < 8; i++ )
187         coeffs[i] *= sum;
188 }
189 
initInterTab1D(int method,float * tab,int tabsz)190 static void initInterTab1D(int method, float* tab, int tabsz)
191 {
192     float scale = 1.f/tabsz;
193     if( method == INTER_LINEAR )
194     {
195         for( int i = 0; i < tabsz; i++, tab += 2 )
196             interpolateLinear( i*scale, tab );
197     }
198     else if( method == INTER_CUBIC )
199     {
200         for( int i = 0; i < tabsz; i++, tab += 4 )
201             interpolateCubic( i*scale, tab );
202     }
203     else if( method == INTER_LANCZOS4 )
204     {
205         for( int i = 0; i < tabsz; i++, tab += 8 )
206             interpolateLanczos4( i*scale, tab );
207     }
208     else
209         CV_Error( CV_StsBadArg, "Unknown interpolation method" );
210 }
211 
212 
initInterTab2D(int method,bool fixpt)213 static const void* initInterTab2D( int method, bool fixpt )
214 {
215     static bool inittab[INTER_MAX+1] = {false};
216     float* tab = 0;
217     short* itab = 0;
218     int ksize = 0;
219     if( method == INTER_LINEAR )
220         tab = BilinearTab_f[0][0], itab = BilinearTab_i[0][0], ksize=2;
221     else if( method == INTER_CUBIC )
222         tab = BicubicTab_f[0][0], itab = BicubicTab_i[0][0], ksize=4;
223     else if( method == INTER_LANCZOS4 )
224         tab = Lanczos4Tab_f[0][0], itab = Lanczos4Tab_i[0][0], ksize=8;
225     else
226         CV_Error( CV_StsBadArg, "Unknown/unsupported interpolation type" );
227 
228     if( !inittab[method] )
229     {
230         AutoBuffer<float> _tab(8*INTER_TAB_SIZE);
231         int i, j, k1, k2;
232         initInterTab1D(method, _tab.data(), INTER_TAB_SIZE);
233         for( i = 0; i < INTER_TAB_SIZE; i++ )
234             for( j = 0; j < INTER_TAB_SIZE; j++, tab += ksize*ksize, itab += ksize*ksize )
235             {
236                 int isum = 0;
237                 NNDeltaTab_i[i*INTER_TAB_SIZE+j][0] = j < INTER_TAB_SIZE/2;
238                 NNDeltaTab_i[i*INTER_TAB_SIZE+j][1] = i < INTER_TAB_SIZE/2;
239 
240                 for( k1 = 0; k1 < ksize; k1++ )
241                 {
242                     float vy = _tab[i*ksize + k1];
243                     for( k2 = 0; k2 < ksize; k2++ )
244                     {
245                         float v = vy*_tab[j*ksize + k2];
246                         tab[k1*ksize + k2] = v;
247                         isum += itab[k1*ksize + k2] = saturate_cast<short>(v*INTER_REMAP_COEF_SCALE);
248                     }
249                 }
250 
251                 if( isum != INTER_REMAP_COEF_SCALE )
252                 {
253                     int diff = isum - INTER_REMAP_COEF_SCALE;
254                     int ksize2 = ksize/2, Mk1=ksize2, Mk2=ksize2, mk1=ksize2, mk2=ksize2;
255                     for( k1 = ksize2; k1 < ksize2+2; k1++ )
256                         for( k2 = ksize2; k2 < ksize2+2; k2++ )
257                         {
258                             if( itab[k1*ksize+k2] < itab[mk1*ksize+mk2] )
259                                 mk1 = k1, mk2 = k2;
260                             else if( itab[k1*ksize+k2] > itab[Mk1*ksize+Mk2] )
261                                 Mk1 = k1, Mk2 = k2;
262                         }
263                     if( diff < 0 )
264                         itab[Mk1*ksize + Mk2] = (short)(itab[Mk1*ksize + Mk2] - diff);
265                     else
266                         itab[mk1*ksize + mk2] = (short)(itab[mk1*ksize + mk2] - diff);
267                 }
268             }
269         tab -= INTER_TAB_SIZE2*ksize*ksize;
270         itab -= INTER_TAB_SIZE2*ksize*ksize;
271 #if CV_SIMD128
272         if( method == INTER_LINEAR )
273         {
274             for( i = 0; i < INTER_TAB_SIZE2; i++ )
275                 for( j = 0; j < 4; j++ )
276                 {
277                     BilinearTab_iC4[i][0][j*2] = BilinearTab_i[i][0][0];
278                     BilinearTab_iC4[i][0][j*2+1] = BilinearTab_i[i][0][1];
279                     BilinearTab_iC4[i][1][j*2] = BilinearTab_i[i][1][0];
280                     BilinearTab_iC4[i][1][j*2+1] = BilinearTab_i[i][1][1];
281                 }
282         }
283 #endif
284         inittab[method] = true;
285     }
286     return fixpt ? (const void*)itab : (const void*)tab;
287 }
288 
289 #ifndef __MINGW32__
initAllInterTab2D()290 static bool initAllInterTab2D()
291 {
292     return  initInterTab2D( INTER_LINEAR, false ) &&
293             initInterTab2D( INTER_LINEAR, true ) &&
294             initInterTab2D( INTER_CUBIC, false ) &&
295             initInterTab2D( INTER_CUBIC, true ) &&
296             initInterTab2D( INTER_LANCZOS4, false ) &&
297             initInterTab2D( INTER_LANCZOS4, true );
298 }
299 
300 static volatile bool doInitAllInterTab2D = initAllInterTab2D();
301 #endif
302 
303 template<typename ST, typename DT> struct Cast
304 {
305     typedef ST type1;
306     typedef DT rtype;
307 
operator ()cv::Cast308     DT operator()(ST val) const { return saturate_cast<DT>(val); }
309 };
310 
311 template<typename ST, typename DT, int bits> struct FixedPtCast
312 {
313     typedef ST type1;
314     typedef DT rtype;
315     enum { SHIFT = bits, DELTA = 1 << (bits-1) };
316 
operator ()cv::FixedPtCast317     DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
318 };
319 
clip(int x,int a,int b)320 static inline int clip(int x, int a, int b)
321 {
322     return x >= a ? (x < b ? x : b-1) : a;
323 }
324 
325 /****************************************************************************************\
326 *                       General warping (affine, perspective, remap)                     *
327 \****************************************************************************************/
328 
329 template<typename T>
remapNearest(const Mat & _src,Mat & _dst,const Mat & _xy,int borderType,const Scalar & _borderValue)330 static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
331                           int borderType, const Scalar& _borderValue )
332 {
333     Size ssize = _src.size(), dsize = _dst.size();
334     const int cn = _src.channels();
335     const T* S0 = _src.ptr<T>();
336     T cval[CV_CN_MAX];
337     size_t sstep = _src.step/sizeof(S0[0]);
338 
339     for(int k = 0; k < cn; k++ )
340         cval[k] = saturate_cast<T>(_borderValue[k & 3]);
341 
342     unsigned width1 = ssize.width, height1 = ssize.height;
343 
344     if( _dst.isContinuous() && _xy.isContinuous() )
345     {
346         dsize.width *= dsize.height;
347         dsize.height = 1;
348     }
349 
350     for(int dy = 0; dy < dsize.height; dy++ )
351     {
352         T* D = _dst.ptr<T>(dy);
353         const short* XY = _xy.ptr<short>(dy);
354 
355         if( cn == 1 )
356         {
357             for(int dx = 0; dx < dsize.width; dx++ )
358             {
359                 int sx = XY[dx*2], sy = XY[dx*2+1];
360                 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
361                     D[dx] = S0[sy*sstep + sx];
362                 else
363                 {
364                     if( borderType == BORDER_REPLICATE )
365                     {
366                         sx = clip(sx, 0, ssize.width);
367                         sy = clip(sy, 0, ssize.height);
368                         D[dx] = S0[sy*sstep + sx];
369                     }
370                     else if( borderType == BORDER_CONSTANT )
371                         D[dx] = cval[0];
372                     else if( borderType != BORDER_TRANSPARENT )
373                     {
374                         sx = borderInterpolate(sx, ssize.width, borderType);
375                         sy = borderInterpolate(sy, ssize.height, borderType);
376                         D[dx] = S0[sy*sstep + sx];
377                     }
378                 }
379             }
380         }
381         else
382         {
383             for(int dx = 0; dx < dsize.width; dx++, D += cn )
384             {
385                 int sx = XY[dx*2], sy = XY[dx*2+1];
386                 const T *S;
387                 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
388                 {
389                     if( cn == 3 )
390                     {
391                         S = S0 + sy*sstep + sx*3;
392                         D[0] = S[0], D[1] = S[1], D[2] = S[2];
393                     }
394                     else if( cn == 4 )
395                     {
396                         S = S0 + sy*sstep + sx*4;
397                         D[0] = S[0], D[1] = S[1], D[2] = S[2], D[3] = S[3];
398                     }
399                     else
400                     {
401                         S = S0 + sy*sstep + sx*cn;
402                         for(int k = 0; k < cn; k++ )
403                             D[k] = S[k];
404                     }
405                 }
406                 else if( borderType != BORDER_TRANSPARENT )
407                 {
408                     if( borderType == BORDER_REPLICATE )
409                     {
410                         sx = clip(sx, 0, ssize.width);
411                         sy = clip(sy, 0, ssize.height);
412                         S = S0 + sy*sstep + sx*cn;
413                     }
414                     else if( borderType == BORDER_CONSTANT )
415                         S = &cval[0];
416                     else
417                     {
418                         sx = borderInterpolate(sx, ssize.width, borderType);
419                         sy = borderInterpolate(sy, ssize.height, borderType);
420                         S = S0 + sy*sstep + sx*cn;
421                     }
422                     for(int k = 0; k < cn; k++ )
423                         D[k] = S[k];
424                 }
425             }
426         }
427     }
428 }
429 
430 
431 struct RemapNoVec
432 {
operator ()cv::RemapNoVec433     int operator()( const Mat&, void*, const short*, const ushort*,
434                     const void*, int ) const { return 0; }
435 };
436 
437 #if CV_SIMD128
438 
439 typedef unsigned short CV_DECL_ALIGNED(1) unaligned_ushort;
440 typedef int CV_DECL_ALIGNED(1) unaligned_int;
441 
442 struct RemapVec_8u
443 {
operator ()cv::RemapVec_8u444     int operator()( const Mat& _src, void* _dst, const short* XY,
445                     const ushort* FXY, const void* _wtab, int width ) const
446     {
447         int cn = _src.channels(), x = 0, sstep = (int)_src.step;
448 
449         if( (cn != 1 && cn != 3 && cn != 4) || sstep >= 0x8000 )
450             return 0;
451 
452         const uchar *S0 = _src.ptr(), *S1 = _src.ptr(1);
453         const short* wtab = cn == 1 ? (const short*)_wtab : &BilinearTab_iC4[0][0][0];
454         uchar* D = (uchar*)_dst;
455         v_int32x4 delta = v_setall_s32(INTER_REMAP_COEF_SCALE / 2);
456         v_int16x8 xy2ofs = v_reinterpret_as_s16(v_setall_s32(cn + (sstep << 16)));
457         int CV_DECL_ALIGNED(16) iofs0[4], iofs1[4];
458         const uchar* src_limit_8bytes = _src.datalimit - v_int16x8::nlanes;
459 #define CV_PICK_AND_PACK_RGB(ptr, offset, result)  \
460         {                                          \
461             const uchar* const p = ((const uchar*)ptr) + (offset); \
462             if (p <= src_limit_8bytes)             \
463             {                                      \
464                 v_uint8x16 rrggbb, dummy;          \
465                 v_uint16x8 rrggbb8, dummy8;        \
466                 v_uint8x16 rgb0 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p), 0, 0, 0)); \
467                 v_uint8x16 rgb1 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p + 3), 0, 0, 0)); \
468                 v_zip(rgb0, rgb1, rrggbb, dummy);  \
469                 v_expand(rrggbb, rrggbb8, dummy8); \
470                 result = v_reinterpret_as_s16(rrggbb8); \
471             }                                      \
472             else                                   \
473             {                                      \
474                 result = v_int16x8((short)p[0], (short)p[3], /* r0r1 */ \
475                                    (short)p[1], (short)p[4], /* g0g1 */ \
476                                    (short)p[2], (short)p[5], /* b0b1 */ 0, 0); \
477             }                                      \
478         }
479 #define CV_PICK_AND_PACK_RGBA(ptr, offset, result) \
480         {                                          \
481             const uchar* const p = ((const uchar*)ptr) + (offset); \
482             CV_DbgAssert(p <= src_limit_8bytes);   \
483             v_uint8x16 rrggbbaa, dummy;            \
484             v_uint16x8 rrggbbaa8, dummy8;          \
485             v_uint8x16 rgba0 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p), 0, 0, 0)); \
486             v_uint8x16 rgba1 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p + v_int32x4::nlanes), 0, 0, 0)); \
487             v_zip(rgba0, rgba1, rrggbbaa, dummy);  \
488             v_expand(rrggbbaa, rrggbbaa8, dummy8); \
489             result = v_reinterpret_as_s16(rrggbbaa8); \
490         }
491 #define CV_PICK_AND_PACK4(base,offset)             \
492             v_uint16x8(*(unaligned_ushort*)(base + offset[0]), *(unaligned_ushort*)(base + offset[1]), \
493                        *(unaligned_ushort*)(base + offset[2]), *(unaligned_ushort*)(base + offset[3]), \
494                        0, 0, 0, 0)
495 
496         if( cn == 1 )
497         {
498             for( ; x <= width - 8; x += 8 )
499             {
500                 v_int16x8 _xy0 = v_load(XY + x*2);
501                 v_int16x8 _xy1 = v_load(XY + x*2 + 8);
502                 v_int32x4 v0, v1, v2, v3, a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2;
503 
504                 v_int32x4 xy0 = v_dotprod( _xy0, xy2ofs );
505                 v_int32x4 xy1 = v_dotprod( _xy1, xy2ofs );
506                 v_store( iofs0, xy0 );
507                 v_store( iofs1, xy1 );
508 
509                 v_uint16x8 stub, dummy;
510                 v_uint16x8 vec16;
511                 vec16 = CV_PICK_AND_PACK4(S0, iofs0);
512                 v_expand(v_reinterpret_as_u8(vec16), stub, dummy);
513                 v0 = v_reinterpret_as_s32(stub);
514                 vec16 = CV_PICK_AND_PACK4(S1, iofs0);
515                 v_expand(v_reinterpret_as_u8(vec16), stub, dummy);
516                 v1 = v_reinterpret_as_s32(stub);
517 
518                 v_zip(v_load_low((int*)(wtab + FXY[x]     * 4)), v_load_low((int*)(wtab + FXY[x + 1] * 4)), a0, a1);
519                 v_zip(v_load_low((int*)(wtab + FXY[x + 2] * 4)), v_load_low((int*)(wtab + FXY[x + 3] * 4)), b0, b1);
520                 v_recombine(a0, b0, a2, b2);
521                 v1 = v_dotprod(v_reinterpret_as_s16(v1), v_reinterpret_as_s16(b2), delta);
522                 v0 = v_dotprod(v_reinterpret_as_s16(v0), v_reinterpret_as_s16(a2), v1);
523 
524                 vec16 = CV_PICK_AND_PACK4(S0, iofs1);
525                 v_expand(v_reinterpret_as_u8(vec16), stub, dummy);
526                 v2 = v_reinterpret_as_s32(stub);
527                 vec16 = CV_PICK_AND_PACK4(S1, iofs1);
528                 v_expand(v_reinterpret_as_u8(vec16), stub, dummy);
529                 v3 = v_reinterpret_as_s32(stub);
530 
531                 v_zip(v_load_low((int*)(wtab + FXY[x + 4] * 4)), v_load_low((int*)(wtab + FXY[x + 5] * 4)), c0, c1);
532                 v_zip(v_load_low((int*)(wtab + FXY[x + 6] * 4)), v_load_low((int*)(wtab + FXY[x + 7] * 4)), d0, d1);
533                 v_recombine(c0, d0, c2, d2);
534                 v3 = v_dotprod(v_reinterpret_as_s16(v3), v_reinterpret_as_s16(d2), delta);
535                 v2 = v_dotprod(v_reinterpret_as_s16(v2), v_reinterpret_as_s16(c2), v3);
536 
537                 v0 = v0 >> INTER_REMAP_COEF_BITS;
538                 v2 = v2 >> INTER_REMAP_COEF_BITS;
539                 v_pack_u_store(D + x, v_pack(v0, v2));
540             }
541         }
542         else if( cn == 3 )
543         {
544             for( ; x <= width - 5; x += 4, D += 12 )
545             {
546                 v_int16x8 u0, v0, u1, v1;
547                 v_int16x8 _xy0 = v_load(XY + x * 2);
548 
549                 v_int32x4 xy0 = v_dotprod(_xy0, xy2ofs);
550                 v_store(iofs0, xy0);
551 
552                 int offset0 = FXY[x] * 16;
553                 int offset1 = FXY[x + 1] * 16;
554                 int offset2 = FXY[x + 2] * 16;
555                 int offset3 = FXY[x + 3] * 16;
556                 v_int16x8 w00 = v_load(wtab + offset0);
557                 v_int16x8 w01 = v_load(wtab + offset0 + 8);
558                 v_int16x8 w10 = v_load(wtab + offset1);
559                 v_int16x8 w11 = v_load(wtab + offset1 + 8);
560 
561                 CV_PICK_AND_PACK_RGB(S0, iofs0[0], u0);
562                 CV_PICK_AND_PACK_RGB(S1, iofs0[0], v0);
563                 CV_PICK_AND_PACK_RGB(S0, iofs0[1], u1);
564                 CV_PICK_AND_PACK_RGB(S1, iofs0[1], v1);
565 
566                 v_int32x4 result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS;
567                 v_int32x4 result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS;
568 
569                 result0 = v_rotate_left<1>(result0);
570                 v_int16x8 result8 = v_pack(result0, result1);
571                 v_uint8x16 result16 = v_pack_u(result8, result8);
572                 v_store_low(D, v_rotate_right<1>(result16));
573 
574 
575                 w00 = v_load(wtab + offset2);
576                 w01 = v_load(wtab + offset2 + 8);
577                 w10 = v_load(wtab + offset3);
578                 w11 = v_load(wtab + offset3 + 8);
579                 CV_PICK_AND_PACK_RGB(S0, iofs0[2], u0);
580                 CV_PICK_AND_PACK_RGB(S1, iofs0[2], v0);
581                 CV_PICK_AND_PACK_RGB(S0, iofs0[3], u1);
582                 CV_PICK_AND_PACK_RGB(S1, iofs0[3], v1);
583 
584                 result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS;
585                 result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS;
586 
587                 result0 = v_rotate_left<1>(result0);
588                 result8 = v_pack(result0, result1);
589                 result16 = v_pack_u(result8, result8);
590                 v_store_low(D + 6, v_rotate_right<1>(result16));
591             }
592         }
593         else if( cn == 4 )
594         {
595             for( ; x <= width - 4; x += 4, D += 16 )
596             {
597                 v_int16x8 _xy0 = v_load(XY + x * 2);
598                 v_int16x8 u0, v0, u1, v1;
599 
600                 v_int32x4 xy0 = v_dotprod( _xy0, xy2ofs );
601                 v_store(iofs0, xy0);
602                 int offset0 = FXY[x] * 16;
603                 int offset1 = FXY[x + 1] * 16;
604                 int offset2 = FXY[x + 2] * 16;
605                 int offset3 = FXY[x + 3] * 16;
606 
607                 v_int16x8 w00 = v_load(wtab + offset0);
608                 v_int16x8 w01 = v_load(wtab + offset0 + 8);
609                 v_int16x8 w10 = v_load(wtab + offset1);
610                 v_int16x8 w11 = v_load(wtab + offset1 + 8);
611                 CV_PICK_AND_PACK_RGBA(S0, iofs0[0], u0);
612                 CV_PICK_AND_PACK_RGBA(S1, iofs0[0], v0);
613                 CV_PICK_AND_PACK_RGBA(S0, iofs0[1], u1);
614                 CV_PICK_AND_PACK_RGBA(S1, iofs0[1], v1);
615 
616                 v_int32x4 result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS;
617                 v_int32x4 result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS;
618                 v_int16x8 result8 = v_pack(result0, result1);
619                 v_pack_u_store(D, result8);
620 
621                 w00 = v_load(wtab + offset2);
622                 w01 = v_load(wtab + offset2 + 8);
623                 w10 = v_load(wtab + offset3);
624                 w11 = v_load(wtab + offset3 + 8);
625                 CV_PICK_AND_PACK_RGBA(S0, iofs0[2], u0);
626                 CV_PICK_AND_PACK_RGBA(S1, iofs0[2], v0);
627                 CV_PICK_AND_PACK_RGBA(S0, iofs0[3], u1);
628                 CV_PICK_AND_PACK_RGBA(S1, iofs0[3], v1);
629 
630                 result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS;
631                 result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS;
632                 result8 = v_pack(result0, result1);
633                 v_pack_u_store(D + 8, result8);
634             }
635         }
636 
637         return x;
638     }
639 };
640 
641 #else
642 
643 typedef RemapNoVec RemapVec_8u;
644 
645 #endif
646 
647 
648 template<class CastOp, class VecOp, typename AT>
remapBilinear(const Mat & _src,Mat & _dst,const Mat & _xy,const Mat & _fxy,const void * _wtab,int borderType,const Scalar & _borderValue)649 static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
650                            const Mat& _fxy, const void* _wtab,
651                            int borderType, const Scalar& _borderValue )
652 {
653     typedef typename CastOp::rtype T;
654     typedef typename CastOp::type1 WT;
655     Size ssize = _src.size(), dsize = _dst.size();
656     const int cn = _src.channels();
657     const AT* wtab = (const AT*)_wtab;
658     const T* S0 = _src.ptr<T>();
659     size_t sstep = _src.step/sizeof(S0[0]);
660     T cval[CV_CN_MAX];
661     CastOp castOp;
662     VecOp vecOp;
663 
664     for(int k = 0; k < cn; k++ )
665         cval[k] = saturate_cast<T>(_borderValue[k & 3]);
666 
667     unsigned width1 = std::max(ssize.width-1, 0), height1 = std::max(ssize.height-1, 0);
668     CV_Assert( !ssize.empty() );
669 #if CV_SIMD128
670     if( _src.type() == CV_8UC3 )
671         width1 = std::max(ssize.width-2, 0);
672 #endif
673 
674     for(int dy = 0; dy < dsize.height; dy++ )
675     {
676         T* D = _dst.ptr<T>(dy);
677         const short* XY = _xy.ptr<short>(dy);
678         const ushort* FXY = _fxy.ptr<ushort>(dy);
679         int X0 = 0;
680         bool prevInlier = false;
681 
682         for(int dx = 0; dx <= dsize.width; dx++ )
683         {
684             bool curInlier = dx < dsize.width ?
685                 (unsigned)XY[dx*2] < width1 &&
686                 (unsigned)XY[dx*2+1] < height1 : !prevInlier;
687             if( curInlier == prevInlier )
688                 continue;
689 
690             int X1 = dx;
691             dx = X0;
692             X0 = X1;
693             prevInlier = curInlier;
694 
695             if( !curInlier )
696             {
697                 int len = vecOp( _src, D, XY + dx*2, FXY + dx, wtab, X1 - dx );
698                 D += len*cn;
699                 dx += len;
700 
701                 if( cn == 1 )
702                 {
703                     for( ; dx < X1; dx++, D++ )
704                     {
705                         int sx = XY[dx*2], sy = XY[dx*2+1];
706                         const AT* w = wtab + FXY[dx]*4;
707                         const T* S = S0 + sy*sstep + sx;
708                         *D = castOp(WT(S[0]*w[0] + S[1]*w[1] + S[sstep]*w[2] + S[sstep+1]*w[3]));
709                     }
710                 }
711                 else if( cn == 2 )
712                     for( ; dx < X1; dx++, D += 2 )
713                     {
714                         int sx = XY[dx*2], sy = XY[dx*2+1];
715                         const AT* w = wtab + FXY[dx]*4;
716                         const T* S = S0 + sy*sstep + sx*2;
717                         WT t0 = S[0]*w[0] + S[2]*w[1] + S[sstep]*w[2] + S[sstep+2]*w[3];
718                         WT t1 = S[1]*w[0] + S[3]*w[1] + S[sstep+1]*w[2] + S[sstep+3]*w[3];
719                         D[0] = castOp(t0); D[1] = castOp(t1);
720                     }
721                 else if( cn == 3 )
722                     for( ; dx < X1; dx++, D += 3 )
723                     {
724                         int sx = XY[dx*2], sy = XY[dx*2+1];
725                         const AT* w = wtab + FXY[dx]*4;
726                         const T* S = S0 + sy*sstep + sx*3;
727                         WT t0 = S[0]*w[0] + S[3]*w[1] + S[sstep]*w[2] + S[sstep+3]*w[3];
728                         WT t1 = S[1]*w[0] + S[4]*w[1] + S[sstep+1]*w[2] + S[sstep+4]*w[3];
729                         WT t2 = S[2]*w[0] + S[5]*w[1] + S[sstep+2]*w[2] + S[sstep+5]*w[3];
730                         D[0] = castOp(t0); D[1] = castOp(t1); D[2] = castOp(t2);
731                     }
732                 else if( cn == 4 )
733                     for( ; dx < X1; dx++, D += 4 )
734                     {
735                         int sx = XY[dx*2], sy = XY[dx*2+1];
736                         const AT* w = wtab + FXY[dx]*4;
737                         const T* S = S0 + sy*sstep + sx*4;
738                         WT t0 = S[0]*w[0] + S[4]*w[1] + S[sstep]*w[2] + S[sstep+4]*w[3];
739                         WT t1 = S[1]*w[0] + S[5]*w[1] + S[sstep+1]*w[2] + S[sstep+5]*w[3];
740                         D[0] = castOp(t0); D[1] = castOp(t1);
741                         t0 = S[2]*w[0] + S[6]*w[1] + S[sstep+2]*w[2] + S[sstep+6]*w[3];
742                         t1 = S[3]*w[0] + S[7]*w[1] + S[sstep+3]*w[2] + S[sstep+7]*w[3];
743                         D[2] = castOp(t0); D[3] = castOp(t1);
744                     }
745                 else
746                     for( ; dx < X1; dx++, D += cn )
747                     {
748                         int sx = XY[dx*2], sy = XY[dx*2+1];
749                         const AT* w = wtab + FXY[dx]*4;
750                         const T* S = S0 + sy*sstep + sx*cn;
751                         for(int k = 0; k < cn; k++ )
752                         {
753                             WT t0 = S[k]*w[0] + S[k+cn]*w[1] + S[sstep+k]*w[2] + S[sstep+k+cn]*w[3];
754                             D[k] = castOp(t0);
755                         }
756                     }
757             }
758             else
759             {
760                 if( borderType == BORDER_TRANSPARENT && cn != 3 )
761                 {
762                     D += (X1 - dx)*cn;
763                     dx = X1;
764                     continue;
765                 }
766 
767                 if( cn == 1 )
768                     for( ; dx < X1; dx++, D++ )
769                     {
770                         int sx = XY[dx*2], sy = XY[dx*2+1];
771                         if( borderType == BORDER_CONSTANT &&
772                             (sx >= ssize.width || sx+1 < 0 ||
773                              sy >= ssize.height || sy+1 < 0) )
774                         {
775                             D[0] = cval[0];
776                         }
777                         else
778                         {
779                             int sx0, sx1, sy0, sy1;
780                             T v0, v1, v2, v3;
781                             const AT* w = wtab + FXY[dx]*4;
782                             if( borderType == BORDER_REPLICATE )
783                             {
784                                 sx0 = clip(sx, 0, ssize.width);
785                                 sx1 = clip(sx+1, 0, ssize.width);
786                                 sy0 = clip(sy, 0, ssize.height);
787                                 sy1 = clip(sy+1, 0, ssize.height);
788                                 v0 = S0[sy0*sstep + sx0];
789                                 v1 = S0[sy0*sstep + sx1];
790                                 v2 = S0[sy1*sstep + sx0];
791                                 v3 = S0[sy1*sstep + sx1];
792                             }
793                             else
794                             {
795                                 sx0 = borderInterpolate(sx, ssize.width, borderType);
796                                 sx1 = borderInterpolate(sx+1, ssize.width, borderType);
797                                 sy0 = borderInterpolate(sy, ssize.height, borderType);
798                                 sy1 = borderInterpolate(sy+1, ssize.height, borderType);
799                                 v0 = sx0 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx0] : cval[0];
800                                 v1 = sx1 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx1] : cval[0];
801                                 v2 = sx0 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx0] : cval[0];
802                                 v3 = sx1 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx1] : cval[0];
803                             }
804                             D[0] = castOp(WT(v0*w[0] + v1*w[1] + v2*w[2] + v3*w[3]));
805                         }
806                     }
807                 else
808                     for( ; dx < X1; dx++, D += cn )
809                     {
810                         int sx = XY[dx*2], sy = XY[dx*2+1];
811                         if( borderType == BORDER_CONSTANT &&
812                             (sx >= ssize.width || sx+1 < 0 ||
813                              sy >= ssize.height || sy+1 < 0) )
814                         {
815                             for(int k = 0; k < cn; k++ )
816                                 D[k] = cval[k];
817                         }
818                         else
819                         {
820                             int sx0, sx1, sy0, sy1;
821                             const T *v0, *v1, *v2, *v3;
822                             const AT* w = wtab + FXY[dx]*4;
823                             if( borderType == BORDER_REPLICATE )
824                             {
825                                 sx0 = clip(sx, 0, ssize.width);
826                                 sx1 = clip(sx+1, 0, ssize.width);
827                                 sy0 = clip(sy, 0, ssize.height);
828                                 sy1 = clip(sy+1, 0, ssize.height);
829                                 v0 = S0 + sy0*sstep + sx0*cn;
830                                 v1 = S0 + sy0*sstep + sx1*cn;
831                                 v2 = S0 + sy1*sstep + sx0*cn;
832                                 v3 = S0 + sy1*sstep + sx1*cn;
833                             }
834                             else if( borderType == BORDER_TRANSPARENT &&
835                                 ((unsigned)sx >= (unsigned)(ssize.width-1) ||
836                                 (unsigned)sy >= (unsigned)(ssize.height-1)))
837                                 continue;
838                             else
839                             {
840                                 sx0 = borderInterpolate(sx, ssize.width, borderType);
841                                 sx1 = borderInterpolate(sx+1, ssize.width, borderType);
842                                 sy0 = borderInterpolate(sy, ssize.height, borderType);
843                                 sy1 = borderInterpolate(sy+1, ssize.height, borderType);
844                                 v0 = sx0 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx0*cn : &cval[0];
845                                 v1 = sx1 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx1*cn : &cval[0];
846                                 v2 = sx0 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx0*cn : &cval[0];
847                                 v3 = sx1 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx1*cn : &cval[0];
848                             }
849                             for(int k = 0; k < cn; k++ )
850                                 D[k] = castOp(WT(v0[k]*w[0] + v1[k]*w[1] + v2[k]*w[2] + v3[k]*w[3]));
851                         }
852                     }
853             }
854         }
855     }
856 }
857 
858 
859 template<class CastOp, typename AT, int ONE>
remapBicubic(const Mat & _src,Mat & _dst,const Mat & _xy,const Mat & _fxy,const void * _wtab,int borderType,const Scalar & _borderValue)860 static void remapBicubic( const Mat& _src, Mat& _dst, const Mat& _xy,
861                           const Mat& _fxy, const void* _wtab,
862                           int borderType, const Scalar& _borderValue )
863 {
864     typedef typename CastOp::rtype T;
865     typedef typename CastOp::type1 WT;
866     Size ssize = _src.size(), dsize = _dst.size();
867     const int cn = _src.channels();
868     const AT* wtab = (const AT*)_wtab;
869     const T* S0 = _src.ptr<T>();
870     size_t sstep = _src.step/sizeof(S0[0]);
871     T cval[CV_CN_MAX];
872     CastOp castOp;
873 
874     for(int k = 0; k < cn; k++ )
875         cval[k] = saturate_cast<T>(_borderValue[k & 3]);
876 
877     int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;
878 
879     unsigned width1 = std::max(ssize.width-3, 0), height1 = std::max(ssize.height-3, 0);
880 
881     if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() )
882     {
883         dsize.width *= dsize.height;
884         dsize.height = 1;
885     }
886 
887     for(int dy = 0; dy < dsize.height; dy++ )
888     {
889         T* D = _dst.ptr<T>(dy);
890         const short* XY = _xy.ptr<short>(dy);
891         const ushort* FXY = _fxy.ptr<ushort>(dy);
892 
893         for(int dx = 0; dx < dsize.width; dx++, D += cn )
894         {
895             int sx = XY[dx*2]-1, sy = XY[dx*2+1]-1;
896             const AT* w = wtab + FXY[dx]*16;
897             if( (unsigned)sx < width1 && (unsigned)sy < height1 )
898             {
899                 const T* S = S0 + sy*sstep + sx*cn;
900                 for(int k = 0; k < cn; k++ )
901                 {
902                     WT sum = S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3];
903                     S += sstep;
904                     sum += S[0]*w[4] + S[cn]*w[5] + S[cn*2]*w[6] + S[cn*3]*w[7];
905                     S += sstep;
906                     sum += S[0]*w[8] + S[cn]*w[9] + S[cn*2]*w[10] + S[cn*3]*w[11];
907                     S += sstep;
908                     sum += S[0]*w[12] + S[cn]*w[13] + S[cn*2]*w[14] + S[cn*3]*w[15];
909                     S += 1 - sstep*3;
910                     D[k] = castOp(sum);
911                 }
912             }
913             else
914             {
915                 int x[4], y[4];
916                 if( borderType == BORDER_TRANSPARENT &&
917                     ((unsigned)(sx+1) >= (unsigned)ssize.width ||
918                     (unsigned)(sy+1) >= (unsigned)ssize.height) )
919                     continue;
920 
921                 if( borderType1 == BORDER_CONSTANT &&
922                     (sx >= ssize.width || sx+4 <= 0 ||
923                     sy >= ssize.height || sy+4 <= 0))
924                 {
925                     for(int k = 0; k < cn; k++ )
926                         D[k] = cval[k];
927                     continue;
928                 }
929 
930                 for(int i = 0; i < 4; i++ )
931                 {
932                     x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
933                     y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
934                 }
935 
936                 for(int k = 0; k < cn; k++, S0++, w -= 16 )
937                 {
938                     WT cv = cval[k], sum = cv*ONE;
939                     for(int i = 0; i < 4; i++, w += 4 )
940                     {
941                         int yi = y[i];
942                         const T* S = S0 + yi*sstep;
943                         if( yi < 0 )
944                             continue;
945                         if( x[0] >= 0 )
946                             sum += (S[x[0]] - cv)*w[0];
947                         if( x[1] >= 0 )
948                             sum += (S[x[1]] - cv)*w[1];
949                         if( x[2] >= 0 )
950                             sum += (S[x[2]] - cv)*w[2];
951                         if( x[3] >= 0 )
952                             sum += (S[x[3]] - cv)*w[3];
953                     }
954                     D[k] = castOp(sum);
955                 }
956                 S0 -= cn;
957             }
958         }
959     }
960 }
961 
962 
963 template<class CastOp, typename AT, int ONE>
remapLanczos4(const Mat & _src,Mat & _dst,const Mat & _xy,const Mat & _fxy,const void * _wtab,int borderType,const Scalar & _borderValue)964 static void remapLanczos4( const Mat& _src, Mat& _dst, const Mat& _xy,
965                            const Mat& _fxy, const void* _wtab,
966                            int borderType, const Scalar& _borderValue )
967 {
968     typedef typename CastOp::rtype T;
969     typedef typename CastOp::type1 WT;
970     Size ssize = _src.size(), dsize = _dst.size();
971     const int cn = _src.channels();
972     const AT* wtab = (const AT*)_wtab;
973     const T* S0 = _src.ptr<T>();
974     size_t sstep = _src.step/sizeof(S0[0]);
975     T cval[CV_CN_MAX];
976     CastOp castOp;
977 
978     for(int k = 0; k < cn; k++ )
979         cval[k] = saturate_cast<T>(_borderValue[k & 3]);
980 
981     int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;
982 
983     unsigned width1 = std::max(ssize.width-7, 0), height1 = std::max(ssize.height-7, 0);
984 
985     if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() )
986     {
987         dsize.width *= dsize.height;
988         dsize.height = 1;
989     }
990 
991     for(int dy = 0; dy < dsize.height; dy++ )
992     {
993         T* D = _dst.ptr<T>(dy);
994         const short* XY = _xy.ptr<short>(dy);
995         const ushort* FXY = _fxy.ptr<ushort>(dy);
996 
997         for(int dx = 0; dx < dsize.width; dx++, D += cn )
998         {
999             int sx = XY[dx*2]-3, sy = XY[dx*2+1]-3;
1000             const AT* w = wtab + FXY[dx]*64;
1001             const T* S = S0 + sy*sstep + sx*cn;
1002             if( (unsigned)sx < width1 && (unsigned)sy < height1 )
1003             {
1004                 for(int k = 0; k < cn; k++ )
1005                 {
1006                     WT sum = 0;
1007                     for( int r = 0; r < 8; r++, S += sstep, w += 8 )
1008                         sum += S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3] +
1009                             S[cn*4]*w[4] + S[cn*5]*w[5] + S[cn*6]*w[6] + S[cn*7]*w[7];
1010                     w -= 64;
1011                     S -= sstep*8 - 1;
1012                     D[k] = castOp(sum);
1013                 }
1014             }
1015             else
1016             {
1017                 int x[8], y[8];
1018                 if( borderType == BORDER_TRANSPARENT &&
1019                     ((unsigned)(sx+3) >= (unsigned)ssize.width ||
1020                     (unsigned)(sy+3) >= (unsigned)ssize.height) )
1021                     continue;
1022 
1023                 if( borderType1 == BORDER_CONSTANT &&
1024                     (sx >= ssize.width || sx+8 <= 0 ||
1025                     sy >= ssize.height || sy+8 <= 0))
1026                 {
1027                     for(int k = 0; k < cn; k++ )
1028                         D[k] = cval[k];
1029                     continue;
1030                 }
1031 
1032                 for(int i = 0; i < 8; i++ )
1033                 {
1034                     x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
1035                     y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
1036                 }
1037 
1038                 for(int k = 0; k < cn; k++, S0++, w -= 64 )
1039                 {
1040                     WT cv = cval[k], sum = cv*ONE;
1041                     for(int i = 0; i < 8; i++, w += 8 )
1042                     {
1043                         int yi = y[i];
1044                         const T* S1 = S0 + yi*sstep;
1045                         if( yi < 0 )
1046                             continue;
1047                         if( x[0] >= 0 )
1048                             sum += (S1[x[0]] - cv)*w[0];
1049                         if( x[1] >= 0 )
1050                             sum += (S1[x[1]] - cv)*w[1];
1051                         if( x[2] >= 0 )
1052                             sum += (S1[x[2]] - cv)*w[2];
1053                         if( x[3] >= 0 )
1054                             sum += (S1[x[3]] - cv)*w[3];
1055                         if( x[4] >= 0 )
1056                             sum += (S1[x[4]] - cv)*w[4];
1057                         if( x[5] >= 0 )
1058                             sum += (S1[x[5]] - cv)*w[5];
1059                         if( x[6] >= 0 )
1060                             sum += (S1[x[6]] - cv)*w[6];
1061                         if( x[7] >= 0 )
1062                             sum += (S1[x[7]] - cv)*w[7];
1063                     }
1064                     D[k] = castOp(sum);
1065                 }
1066                 S0 -= cn;
1067             }
1068         }
1069     }
1070 }
1071 
1072 
1073 typedef void (*RemapNNFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
1074                             int borderType, const Scalar& _borderValue );
1075 
1076 typedef void (*RemapFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
1077                           const Mat& _fxy, const void* _wtab,
1078                           int borderType, const Scalar& _borderValue);
1079 
1080 class RemapInvoker :
1081     public ParallelLoopBody
1082 {
1083 public:
RemapInvoker(const Mat & _src,Mat & _dst,const Mat * _m1,const Mat * _m2,int _borderType,const Scalar & _borderValue,int _planar_input,RemapNNFunc _nnfunc,RemapFunc _ifunc,const void * _ctab)1084     RemapInvoker(const Mat& _src, Mat& _dst, const Mat *_m1,
1085                  const Mat *_m2, int _borderType, const Scalar &_borderValue,
1086                  int _planar_input, RemapNNFunc _nnfunc, RemapFunc _ifunc, const void *_ctab) :
1087         ParallelLoopBody(), src(&_src), dst(&_dst), m1(_m1), m2(_m2),
1088         borderType(_borderType), borderValue(_borderValue),
1089         planar_input(_planar_input), nnfunc(_nnfunc), ifunc(_ifunc), ctab(_ctab)
1090     {
1091     }
1092 
operator ()(const Range & range) const1093     virtual void operator() (const Range& range) const CV_OVERRIDE
1094     {
1095         int x, y, x1, y1;
1096         const int buf_size = 1 << 14;
1097         int brows0 = std::min(128, dst->rows), map_depth = m1->depth();
1098         int bcols0 = std::min(buf_size/brows0, dst->cols);
1099         brows0 = std::min(buf_size/bcols0, dst->rows);
1100 
1101         Mat _bufxy(brows0, bcols0, CV_16SC2), _bufa;
1102         if( !nnfunc )
1103             _bufa.create(brows0, bcols0, CV_16UC1);
1104 
1105         for( y = range.start; y < range.end; y += brows0 )
1106         {
1107             for( x = 0; x < dst->cols; x += bcols0 )
1108             {
1109                 int brows = std::min(brows0, range.end - y);
1110                 int bcols = std::min(bcols0, dst->cols - x);
1111                 Mat dpart(*dst, Rect(x, y, bcols, brows));
1112                 Mat bufxy(_bufxy, Rect(0, 0, bcols, brows));
1113 
1114                 if( nnfunc )
1115                 {
1116                     if( m1->type() == CV_16SC2 && m2->empty() ) // the data is already in the right format
1117                         bufxy = (*m1)(Rect(x, y, bcols, brows));
1118                     else if( map_depth != CV_32F )
1119                     {
1120                         for( y1 = 0; y1 < brows; y1++ )
1121                         {
1122                             short* XY = bufxy.ptr<short>(y1);
1123                             const short* sXY = m1->ptr<short>(y+y1) + x*2;
1124                             const ushort* sA = m2->ptr<ushort>(y+y1) + x;
1125 
1126                             for( x1 = 0; x1 < bcols; x1++ )
1127                             {
1128                                 int a = sA[x1] & (INTER_TAB_SIZE2-1);
1129                                 XY[x1*2] = sXY[x1*2] + NNDeltaTab_i[a][0];
1130                                 XY[x1*2+1] = sXY[x1*2+1] + NNDeltaTab_i[a][1];
1131                             }
1132                         }
1133                     }
1134                     else if( !planar_input )
1135                         (*m1)(Rect(x, y, bcols, brows)).convertTo(bufxy, bufxy.depth());
1136                     else
1137                     {
1138                         for( y1 = 0; y1 < brows; y1++ )
1139                         {
1140                             short* XY = bufxy.ptr<short>(y1);
1141                             const float* sX = m1->ptr<float>(y+y1) + x;
1142                             const float* sY = m2->ptr<float>(y+y1) + x;
1143                             x1 = 0;
1144 
1145                             #if CV_SIMD128
1146                             {
1147                                 int span = v_float32x4::nlanes;
1148                                 for( ; x1 <= bcols - span * 2; x1 += span * 2 )
1149                                 {
1150                                     v_int32x4 ix0 = v_round(v_load(sX + x1));
1151                                     v_int32x4 iy0 = v_round(v_load(sY + x1));
1152                                     v_int32x4 ix1 = v_round(v_load(sX + x1 + span));
1153                                     v_int32x4 iy1 = v_round(v_load(sY + x1 + span));
1154 
1155                                     v_int16x8 dx, dy;
1156                                     dx = v_pack(ix0, ix1);
1157                                     dy = v_pack(iy0, iy1);
1158                                     v_store_interleave(XY + x1 * 2, dx, dy);
1159                                 }
1160                             }
1161                             #endif
1162                             for( ; x1 < bcols; x1++ )
1163                             {
1164                                 XY[x1*2] = saturate_cast<short>(sX[x1]);
1165                                 XY[x1*2+1] = saturate_cast<short>(sY[x1]);
1166                             }
1167                         }
1168                     }
1169                     nnfunc( *src, dpart, bufxy, borderType, borderValue );
1170                     continue;
1171                 }
1172 
1173                 Mat bufa(_bufa, Rect(0, 0, bcols, brows));
1174                 for( y1 = 0; y1 < brows; y1++ )
1175                 {
1176                     short* XY = bufxy.ptr<short>(y1);
1177                     ushort* A = bufa.ptr<ushort>(y1);
1178 
1179                     if( m1->type() == CV_16SC2 && (m2->type() == CV_16UC1 || m2->type() == CV_16SC1) )
1180                     {
1181                         bufxy = (*m1)(Rect(x, y, bcols, brows));
1182 
1183                         const ushort* sA = m2->ptr<ushort>(y+y1) + x;
1184                         x1 = 0;
1185 
1186                         #if CV_SIMD128
1187                         {
1188                             v_uint16x8 v_scale = v_setall_u16(INTER_TAB_SIZE2 - 1);
1189                             int span = v_uint16x8::nlanes;
1190                             for( ; x1 <= bcols - span; x1 += span )
1191                                 v_store((unsigned short*)(A + x1), v_load(sA + x1) & v_scale);
1192                         }
1193                         #endif
1194                         for( ; x1 < bcols; x1++ )
1195                             A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1));
1196                     }
1197                     else if( planar_input )
1198                     {
1199                         const float* sX = m1->ptr<float>(y+y1) + x;
1200                         const float* sY = m2->ptr<float>(y+y1) + x;
1201 
1202                         x1 = 0;
1203                         #if CV_SIMD128
1204                         {
1205                             v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE);
1206                             v_int32x4 v_scale2 = v_setall_s32(INTER_TAB_SIZE - 1);
1207                             int span = v_float32x4::nlanes;
1208                             for( ; x1 <= bcols - span * 2; x1 += span * 2 )
1209                             {
1210                                 v_int32x4 v_sx0 = v_round(v_scale * v_load(sX + x1));
1211                                 v_int32x4 v_sy0 = v_round(v_scale * v_load(sY + x1));
1212                                 v_int32x4 v_sx1 = v_round(v_scale * v_load(sX + x1 + span));
1213                                 v_int32x4 v_sy1 = v_round(v_scale * v_load(sY + x1 + span));
1214                                 v_uint16x8 v_sx8 = v_reinterpret_as_u16(v_pack(v_sx0 & v_scale2, v_sx1 & v_scale2));
1215                                 v_uint16x8 v_sy8 = v_reinterpret_as_u16(v_pack(v_sy0 & v_scale2, v_sy1 & v_scale2));
1216                                 v_uint16x8 v_v = v_shl<INTER_BITS>(v_sy8) | (v_sx8);
1217                                 v_store(A + x1, v_v);
1218 
1219                                 v_int16x8 v_d0 = v_pack(v_shr<INTER_BITS>(v_sx0), v_shr<INTER_BITS>(v_sx1));
1220                                 v_int16x8 v_d1 = v_pack(v_shr<INTER_BITS>(v_sy0), v_shr<INTER_BITS>(v_sy1));
1221                                 v_store_interleave(XY + (x1 << 1), v_d0, v_d1);
1222                             }
1223                         }
1224                         #endif
1225                         for( ; x1 < bcols; x1++ )
1226                         {
1227                             int sx = cvRound(sX[x1]*INTER_TAB_SIZE);
1228                             int sy = cvRound(sY[x1]*INTER_TAB_SIZE);
1229                             int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
1230                             XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
1231                             XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
1232                             A[x1] = (ushort)v;
1233                         }
1234                     }
1235                     else
1236                     {
1237                         const float* sXY = m1->ptr<float>(y+y1) + x*2;
1238                         x1 = 0;
1239 
1240                         #if CV_SIMD128
1241                         {
1242                             v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE);
1243                             v_int32x4 v_scale2 = v_setall_s32(INTER_TAB_SIZE - 1), v_scale3 = v_setall_s32(INTER_TAB_SIZE);
1244                             int span = v_float32x4::nlanes;
1245                             for( ; x1 <= bcols - span * 2; x1 += span * 2 )
1246                             {
1247                                 v_float32x4 v_fx, v_fy;
1248                                 v_load_deinterleave(sXY + (x1 << 1), v_fx, v_fy);
1249                                 v_int32x4 v_sx0 = v_round(v_fx * v_scale);
1250                                 v_int32x4 v_sy0 = v_round(v_fy * v_scale);
1251                                 v_load_deinterleave(sXY + ((x1 + span) << 1), v_fx, v_fy);
1252                                 v_int32x4 v_sx1 = v_round(v_fx * v_scale);
1253                                 v_int32x4 v_sy1 = v_round(v_fy * v_scale);
1254                                 v_int32x4 v_v0 = v_muladd(v_scale3, (v_sy0 & v_scale2), (v_sx0 & v_scale2));
1255                                 v_int32x4 v_v1 = v_muladd(v_scale3, (v_sy1 & v_scale2), (v_sx1 & v_scale2));
1256                                 v_uint16x8 v_v8 = v_reinterpret_as_u16(v_pack(v_v0, v_v1));
1257                                 v_store(A + x1, v_v8);
1258                                 v_int16x8 v_dx = v_pack(v_shr<INTER_BITS>(v_sx0), v_shr<INTER_BITS>(v_sx1));
1259                                 v_int16x8 v_dy = v_pack(v_shr<INTER_BITS>(v_sy0), v_shr<INTER_BITS>(v_sy1));
1260                                 v_store_interleave(XY + (x1 << 1), v_dx, v_dy);
1261                             }
1262                         }
1263                         #endif
1264 
1265                         for( ; x1 < bcols; x1++ )
1266                         {
1267                             int sx = cvRound(sXY[x1*2]*INTER_TAB_SIZE);
1268                             int sy = cvRound(sXY[x1*2+1]*INTER_TAB_SIZE);
1269                             int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
1270                             XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
1271                             XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
1272                             A[x1] = (ushort)v;
1273                         }
1274                     }
1275                 }
1276                 ifunc(*src, dpart, bufxy, bufa, ctab, borderType, borderValue);
1277             }
1278         }
1279     }
1280 
1281 private:
1282     const Mat* src;
1283     Mat* dst;
1284     const Mat *m1, *m2;
1285     int borderType;
1286     Scalar borderValue;
1287     int planar_input;
1288     RemapNNFunc nnfunc;
1289     RemapFunc ifunc;
1290     const void *ctab;
1291 };
1292 
1293 #ifdef HAVE_OPENCL
1294 
ocl_remap(InputArray _src,OutputArray _dst,InputArray _map1,InputArray _map2,int interpolation,int borderType,const Scalar & borderValue)1295 static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, InputArray _map2,
1296                       int interpolation, int borderType, const Scalar& borderValue)
1297 {
1298     const ocl::Device & dev = ocl::Device::getDefault();
1299     int cn = _src.channels(), type = _src.type(), depth = _src.depth(),
1300             rowsPerWI = dev.isIntel() ? 4 : 1;
1301 
1302     if (borderType == BORDER_TRANSPARENT || !(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST)
1303             || _map1.type() == CV_16SC1 || _map2.type() == CV_16SC1)
1304         return false;
1305 
1306     UMat src = _src.getUMat(), map1 = _map1.getUMat(), map2 = _map2.getUMat();
1307 
1308     if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.empty())) ||
1309         (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.empty())) )
1310     {
1311         if (map1.type() != CV_16SC2)
1312             std::swap(map1, map2);
1313     }
1314     else
1315         CV_Assert( map1.type() == CV_32FC2 || (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
1316 
1317     _dst.create(map1.size(), type);
1318     UMat dst = _dst.getUMat();
1319 
1320     String kernelName = "remap";
1321     if (map1.type() == CV_32FC2 && map2.empty())
1322         kernelName += "_32FC2";
1323     else if (map1.type() == CV_16SC2)
1324     {
1325         kernelName += "_16SC2";
1326         if (!map2.empty())
1327             kernelName += "_16UC1";
1328     }
1329     else if (map1.type() == CV_32FC1 && map2.type() == CV_32FC1)
1330         kernelName += "_2_32FC1";
1331     else
1332         CV_Error(Error::StsBadArg, "Unsupported map types");
1333 
1334     static const char * const interMap[] = { "INTER_NEAREST", "INTER_LINEAR", "INTER_CUBIC", "INTER_LINEAR", "INTER_LANCZOS" };
1335     static const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
1336                            "BORDER_REFLECT_101", "BORDER_TRANSPARENT" };
1337     String buildOptions = format("-D %s -D %s -D T=%s -D rowsPerWI=%d",
1338                                  interMap[interpolation], borderMap[borderType],
1339                                  ocl::typeToStr(type), rowsPerWI);
1340 
1341     if (interpolation != INTER_NEAREST)
1342     {
1343         char cvt[3][40];
1344         int wdepth = std::max(CV_32F, depth);
1345         buildOptions = buildOptions
1346                       + format(" -D WT=%s -D convertToT=%s -D convertToWT=%s"
1347                                " -D convertToWT2=%s -D WT2=%s",
1348                                ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
1349                                ocl::convertTypeStr(wdepth, depth, cn, cvt[0]),
1350                                ocl::convertTypeStr(depth, wdepth, cn, cvt[1]),
1351                                ocl::convertTypeStr(CV_32S, wdepth, 2, cvt[2]),
1352                                ocl::typeToStr(CV_MAKE_TYPE(wdepth, 2)));
1353     }
1354     int scalarcn = cn == 3 ? 4 : cn;
1355     int sctype = CV_MAKETYPE(depth, scalarcn);
1356     buildOptions += format(" -D T=%s -D T1=%s -D cn=%d -D ST=%s -D depth=%d",
1357                            ocl::typeToStr(type), ocl::typeToStr(depth),
1358                            cn, ocl::typeToStr(sctype), depth);
1359 
1360     ocl::Kernel k(kernelName.c_str(), ocl::imgproc::remap_oclsrc, buildOptions);
1361 
1362     Mat scalar(1, 1, sctype, borderValue);
1363     ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst),
1364             map1arg = ocl::KernelArg::ReadOnlyNoSize(map1),
1365             scalararg = ocl::KernelArg::Constant((void*)scalar.ptr(), scalar.elemSize());
1366 
1367     if (map2.empty())
1368         k.args(srcarg, dstarg, map1arg, scalararg);
1369     else
1370         k.args(srcarg, dstarg, map1arg, ocl::KernelArg::ReadOnlyNoSize(map2), scalararg);
1371 
1372     size_t globalThreads[2] = { (size_t)dst.cols, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
1373     return k.run(2, globalThreads, NULL, false);
1374 }
1375 
1376 #if 0
1377 /**
1378 @deprecated with old version of cv::linearPolar
1379 */
1380 static bool ocl_linearPolar(InputArray _src, OutputArray _dst,
1381     Point2f center, double maxRadius, int flags)
1382 {
1383     UMat src_with_border; // don't scope this variable (it holds image data)
1384 
1385     UMat mapx, mapy, r, cp_sp;
1386     UMat src = _src.getUMat();
1387     _dst.create(src.size(), src.type());
1388     Size dsize = src.size();
1389     r.create(Size(1, dsize.width), CV_32F);
1390     cp_sp.create(Size(1, dsize.height), CV_32FC2);
1391 
1392     mapx.create(dsize, CV_32F);
1393     mapy.create(dsize, CV_32F);
1394     size_t w = dsize.width;
1395     size_t h = dsize.height;
1396     String buildOptions;
1397     unsigned mem_size = 32;
1398     if (flags & CV_WARP_INVERSE_MAP)
1399     {
1400         buildOptions = "-D InverseMap";
1401     }
1402     else
1403     {
1404         buildOptions = format("-D ForwardMap  -D MEM_SIZE=%d", mem_size);
1405     }
1406     String retval;
1407     ocl::Program p(ocl::imgproc::linearPolar_oclsrc, buildOptions, retval);
1408     ocl::Kernel k("linearPolar", p);
1409     ocl::KernelArg ocl_mapx = ocl::KernelArg::PtrReadWrite(mapx), ocl_mapy = ocl::KernelArg::PtrReadWrite(mapy);
1410     ocl::KernelArg  ocl_cp_sp = ocl::KernelArg::PtrReadWrite(cp_sp);
1411     ocl::KernelArg ocl_r = ocl::KernelArg::PtrReadWrite(r);
1412 
1413     if (!(flags & CV_WARP_INVERSE_MAP))
1414     {
1415 
1416 
1417 
1418         ocl::Kernel computeAngleRadius_Kernel("computeAngleRadius", p);
1419         float PI2_height = (float) CV_2PI / dsize.height;
1420         float maxRadius_width = (float) maxRadius / dsize.width;
1421         computeAngleRadius_Kernel.args(ocl_cp_sp, ocl_r, maxRadius_width, PI2_height, (unsigned)dsize.width, (unsigned)dsize.height);
1422         size_t max_dim = max(h, w);
1423         computeAngleRadius_Kernel.run(1, &max_dim, NULL, false);
1424         k.args(ocl_mapx, ocl_mapy, ocl_cp_sp, ocl_r, center.x, center.y, (unsigned)dsize.width, (unsigned)dsize.height);
1425     }
1426     else
1427     {
1428         const int ANGLE_BORDER = 1;
1429 
1430         cv::copyMakeBorder(src, src_with_border, ANGLE_BORDER, ANGLE_BORDER, 0, 0, BORDER_WRAP);
1431         src = src_with_border;
1432         Size ssize = src_with_border.size();
1433         ssize.height -= 2 * ANGLE_BORDER;
1434         float ascale =  ssize.height / ((float)CV_2PI);
1435         float pscale =  ssize.width / ((float) maxRadius);
1436 
1437         k.args(ocl_mapx, ocl_mapy, ascale, pscale, center.x, center.y, ANGLE_BORDER, (unsigned)dsize.width, (unsigned)dsize.height);
1438 
1439 
1440     }
1441     size_t globalThreads[2] = { (size_t)dsize.width , (size_t)dsize.height };
1442     size_t localThreads[2] = { mem_size , mem_size };
1443     k.run(2, globalThreads, localThreads, false);
1444     remap(src, _dst, mapx, mapy, flags & cv::INTER_MAX, (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
1445     return true;
1446 }
1447 static bool ocl_logPolar(InputArray _src, OutputArray _dst,
1448     Point2f center, double M, int flags)
1449 {
1450     if (M <= 0)
1451         CV_Error(CV_StsOutOfRange, "M should be >0");
1452     UMat src_with_border; // don't scope this variable (it holds image data)
1453 
1454     UMat mapx, mapy, r, cp_sp;
1455     UMat src = _src.getUMat();
1456     _dst.create(src.size(), src.type());
1457     Size dsize = src.size();
1458     r.create(Size(1, dsize.width), CV_32F);
1459     cp_sp.create(Size(1, dsize.height), CV_32FC2);
1460 
1461     mapx.create(dsize, CV_32F);
1462     mapy.create(dsize, CV_32F);
1463     size_t w = dsize.width;
1464     size_t h = dsize.height;
1465     String buildOptions;
1466     unsigned mem_size = 32;
1467     if (flags & CV_WARP_INVERSE_MAP)
1468     {
1469         buildOptions = "-D InverseMap";
1470     }
1471     else
1472     {
1473         buildOptions = format("-D ForwardMap  -D MEM_SIZE=%d", mem_size);
1474     }
1475     String retval;
1476     ocl::Program p(ocl::imgproc::logPolar_oclsrc, buildOptions, retval);
1477     //ocl::Program p(ocl::imgproc::my_linearPolar_oclsrc, buildOptions, retval);
1478     //printf("%s\n", retval);
1479     ocl::Kernel k("logPolar", p);
1480     ocl::KernelArg ocl_mapx = ocl::KernelArg::PtrReadWrite(mapx), ocl_mapy = ocl::KernelArg::PtrReadWrite(mapy);
1481     ocl::KernelArg  ocl_cp_sp = ocl::KernelArg::PtrReadWrite(cp_sp);
1482     ocl::KernelArg ocl_r = ocl::KernelArg::PtrReadWrite(r);
1483 
1484     if (!(flags & CV_WARP_INVERSE_MAP))
1485     {
1486 
1487 
1488 
1489         ocl::Kernel computeAngleRadius_Kernel("computeAngleRadius", p);
1490         float PI2_height = (float) CV_2PI / dsize.height;
1491 
1492         computeAngleRadius_Kernel.args(ocl_cp_sp, ocl_r, (float)M, PI2_height, (unsigned)dsize.width, (unsigned)dsize.height);
1493         size_t max_dim = max(h, w);
1494         computeAngleRadius_Kernel.run(1, &max_dim, NULL, false);
1495         k.args(ocl_mapx, ocl_mapy, ocl_cp_sp, ocl_r, center.x, center.y, (unsigned)dsize.width, (unsigned)dsize.height);
1496     }
1497     else
1498     {
1499         const int ANGLE_BORDER = 1;
1500 
1501         cv::copyMakeBorder(src, src_with_border, ANGLE_BORDER, ANGLE_BORDER, 0, 0, BORDER_WRAP);
1502         src = src_with_border;
1503         Size ssize = src_with_border.size();
1504         ssize.height -= 2 * ANGLE_BORDER;
1505         float ascale =  ssize.height / ((float)CV_2PI);
1506 
1507 
1508         k.args(ocl_mapx, ocl_mapy, ascale, (float)M, center.x, center.y, ANGLE_BORDER, (unsigned)dsize.width, (unsigned)dsize.height);
1509 
1510 
1511     }
1512     size_t globalThreads[2] = { (size_t)dsize.width , (size_t)dsize.height };
1513     size_t localThreads[2] = { mem_size , mem_size };
1514     k.run(2, globalThreads, localThreads, false);
1515     remap(src, _dst, mapx, mapy, flags & cv::INTER_MAX, (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
1516     return true;
1517 }
1518 #endif
1519 
1520 #endif
1521 
1522 #ifdef HAVE_OPENVX
openvx_remap(Mat src,Mat dst,Mat map1,Mat map2,int interpolation,const Scalar & borderValue)1523 static bool openvx_remap(Mat src, Mat dst, Mat map1, Mat map2, int interpolation, const Scalar& borderValue)
1524 {
1525     vx_interpolation_type_e inter_type;
1526     switch (interpolation)
1527     {
1528     case INTER_LINEAR:
1529 #if VX_VERSION > VX_VERSION_1_0
1530         inter_type = VX_INTERPOLATION_BILINEAR;
1531 #else
1532         inter_type = VX_INTERPOLATION_TYPE_BILINEAR;
1533 #endif
1534         break;
1535     case INTER_NEAREST:
1536 /* NEAREST_NEIGHBOR mode disabled since OpenCV round half to even while OpenVX sample implementation round half up
1537 #if VX_VERSION > VX_VERSION_1_0
1538         inter_type = VX_INTERPOLATION_NEAREST_NEIGHBOR;
1539 #else
1540         inter_type = VX_INTERPOLATION_TYPE_NEAREST_NEIGHBOR;
1541 #endif
1542         if (!map1.empty())
1543             for (int y = 0; y < map1.rows; ++y)
1544             {
1545                 float* line = map1.ptr<float>(y);
1546                 for (int x = 0; x < map1.cols; ++x)
1547                     line[x] = cvRound(line[x]);
1548             }
1549         if (!map2.empty())
1550             for (int y = 0; y < map2.rows; ++y)
1551             {
1552                 float* line = map2.ptr<float>(y);
1553                 for (int x = 0; x < map2.cols; ++x)
1554                     line[x] = cvRound(line[x]);
1555             }
1556         break;
1557 */
1558     case INTER_AREA://AREA interpolation mode is unsupported
1559     default:
1560         return false;
1561     }
1562 
1563     try
1564     {
1565         ivx::Context ctx = ovx::getOpenVXContext();
1566 
1567         Mat a;
1568         if (dst.data != src.data)
1569             a = src;
1570         else
1571             src.copyTo(a);
1572 
1573         ivx::Image
1574             ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
1575                 ivx::Image::createAddressing(a.cols, a.rows, 1, (vx_int32)(a.step)), a.data),
1576             ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
1577                 ivx::Image::createAddressing(dst.cols, dst.rows, 1, (vx_int32)(dst.step)), dst.data);
1578 
1579         //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments
1580         //since OpenVX standard says nothing about thread-safety for now
1581         ivx::border_t prevBorder = ctx.immediateBorder();
1582         ctx.setImmediateBorder(VX_BORDER_CONSTANT, (vx_uint8)(borderValue[0]));
1583 
1584         ivx::Remap map = ivx::Remap::create(ctx, src.cols, src.rows, dst.cols, dst.rows);
1585         if (map1.empty()) map.setMappings(map2);
1586         else if (map2.empty()) map.setMappings(map1);
1587         else map.setMappings(map1, map2);
1588         ivx::IVX_CHECK_STATUS(vxuRemap(ctx, ia, map, inter_type, ib));
1589 #ifdef VX_VERSION_1_1
1590         ib.swapHandle();
1591         ia.swapHandle();
1592 #endif
1593 
1594         ctx.setImmediateBorder(prevBorder);
1595     }
1596     catch (const ivx::RuntimeError & e)
1597     {
1598         CV_Error(CV_StsInternal, e.what());
1599         return false;
1600     }
1601     catch (const ivx::WrapperError & e)
1602     {
1603         CV_Error(CV_StsInternal, e.what());
1604         return false;
1605     }
1606     return true;
1607 }
1608 #endif
1609 
1610 #if defined HAVE_IPP && !IPP_DISABLE_REMAP
1611 
1612 typedef IppStatus (CV_STDCALL * ippiRemap)(const void * pSrc, IppiSize srcSize, int srcStep, IppiRect srcRoi,
1613                                            const Ipp32f* pxMap, int xMapStep, const Ipp32f* pyMap, int yMapStep,
1614                                            void * pDst, int dstStep, IppiSize dstRoiSize, int interpolation);
1615 
1616 class IPPRemapInvoker :
1617         public ParallelLoopBody
1618 {
1619 public:
IPPRemapInvoker(Mat & _src,Mat & _dst,Mat & _xmap,Mat & _ymap,ippiRemap _ippFunc,int _ippInterpolation,int _borderType,const Scalar & _borderValue,bool * _ok)1620     IPPRemapInvoker(Mat & _src, Mat & _dst, Mat & _xmap, Mat & _ymap, ippiRemap _ippFunc,
1621                     int _ippInterpolation, int _borderType, const Scalar & _borderValue, bool * _ok) :
1622         ParallelLoopBody(), src(_src), dst(_dst), map1(_xmap), map2(_ymap), ippFunc(_ippFunc),
1623         ippInterpolation(_ippInterpolation), borderType(_borderType), borderValue(_borderValue), ok(_ok)
1624     {
1625         *ok = true;
1626     }
1627 
operator ()(const Range & range) const1628     virtual void operator() (const Range & range) const
1629     {
1630         IppiRect srcRoiRect = { 0, 0, src.cols, src.rows };
1631         Mat dstRoi = dst.rowRange(range);
1632         IppiSize dstRoiSize = ippiSize(dstRoi.size());
1633         int type = dst.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
1634 
1635         if (borderType == BORDER_CONSTANT &&
1636                 !IPPSet(borderValue, dstRoi.ptr(), (int)dstRoi.step, dstRoiSize, cn, depth))
1637         {
1638             *ok = false;
1639             return;
1640         }
1641 
1642         if (CV_INSTRUMENT_FUN_IPP(ippFunc, src.ptr(), ippiSize(src.size()), (int)src.step, srcRoiRect,
1643                     map1.ptr<Ipp32f>(), (int)map1.step, map2.ptr<Ipp32f>(), (int)map2.step,
1644                     dstRoi.ptr(), (int)dstRoi.step, dstRoiSize, ippInterpolation) < 0)
1645             *ok = false;
1646         else
1647         {
1648             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
1649         }
1650     }
1651 
1652 private:
1653     Mat & src, & dst, & map1, & map2;
1654     ippiRemap ippFunc;
1655     int ippInterpolation, borderType;
1656     Scalar borderValue;
1657     bool * ok;
1658 };
1659 
1660 #endif
1661 
1662 }
1663 
remap(InputArray _src,OutputArray _dst,InputArray _map1,InputArray _map2,int interpolation,int borderType,const Scalar & borderValue)1664 void cv::remap( InputArray _src, OutputArray _dst,
1665                 InputArray _map1, InputArray _map2,
1666                 int interpolation, int borderType, const Scalar& borderValue )
1667 {
1668     CV_INSTRUMENT_REGION();
1669 
1670     static RemapNNFunc nn_tab[] =
1671     {
1672         remapNearest<uchar>, remapNearest<schar>, remapNearest<ushort>, remapNearest<short>,
1673         remapNearest<int>, remapNearest<float>, remapNearest<double>, 0
1674     };
1675 
1676     static RemapFunc linear_tab[] =
1677     {
1678         remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u, short>, 0,
1679         remapBilinear<Cast<float, ushort>, RemapNoVec, float>,
1680         remapBilinear<Cast<float, short>, RemapNoVec, float>, 0,
1681         remapBilinear<Cast<float, float>, RemapNoVec, float>,
1682         remapBilinear<Cast<double, double>, RemapNoVec, float>, 0
1683     };
1684 
1685     static RemapFunc cubic_tab[] =
1686     {
1687         remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
1688         remapBicubic<Cast<float, ushort>, float, 1>,
1689         remapBicubic<Cast<float, short>, float, 1>, 0,
1690         remapBicubic<Cast<float, float>, float, 1>,
1691         remapBicubic<Cast<double, double>, float, 1>, 0
1692     };
1693 
1694     static RemapFunc lanczos4_tab[] =
1695     {
1696         remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
1697         remapLanczos4<Cast<float, ushort>, float, 1>,
1698         remapLanczos4<Cast<float, short>, float, 1>, 0,
1699         remapLanczos4<Cast<float, float>, float, 1>,
1700         remapLanczos4<Cast<double, double>, float, 1>, 0
1701     };
1702 
1703     CV_Assert( !_map1.empty() );
1704     CV_Assert( _map2.empty() || (_map2.size() == _map1.size()));
1705 
1706     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
1707                ocl_remap(_src, _dst, _map1, _map2, interpolation, borderType, borderValue))
1708 
1709     Mat src = _src.getMat(), map1 = _map1.getMat(), map2 = _map2.getMat();
1710     _dst.create( map1.size(), src.type() );
1711     Mat dst = _dst.getMat();
1712 
1713 
1714     CV_OVX_RUN(
1715         src.type() == CV_8UC1 && dst.type() == CV_8UC1 &&
1716         !ovx::skipSmallImages<VX_KERNEL_REMAP>(src.cols, src.rows) &&
1717         (borderType& ~BORDER_ISOLATED) == BORDER_CONSTANT &&
1718         ((map1.type() == CV_32FC2 && map2.empty() && map1.size == dst.size) ||
1719          (map1.type() == CV_32FC1 && map2.type() == CV_32FC1 && map1.size == dst.size && map2.size == dst.size) ||
1720          (map1.empty() && map2.type() == CV_32FC2 && map2.size == dst.size)) &&
1721         ((borderType & BORDER_ISOLATED) != 0 || !src.isSubmatrix()),
1722         openvx_remap(src, dst, map1, map2, interpolation, borderValue));
1723 
1724     CV_Assert( dst.cols < SHRT_MAX && dst.rows < SHRT_MAX && src.cols < SHRT_MAX && src.rows < SHRT_MAX );
1725 
1726     if( dst.data == src.data )
1727         src = src.clone();
1728 
1729     if( interpolation == INTER_AREA )
1730         interpolation = INTER_LINEAR;
1731 
1732     int type = src.type(), depth = CV_MAT_DEPTH(type);
1733 
1734 #if defined HAVE_IPP && !IPP_DISABLE_REMAP
1735     CV_IPP_CHECK()
1736     {
1737         if ((interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_NEAREST) &&
1738                 map1.type() == CV_32FC1 && map2.type() == CV_32FC1 &&
1739                 (borderType == BORDER_CONSTANT || borderType == BORDER_TRANSPARENT))
1740         {
1741             int ippInterpolation =
1742                 interpolation == INTER_NEAREST ? IPPI_INTER_NN :
1743                 interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR : IPPI_INTER_CUBIC;
1744 
1745             ippiRemap ippFunc =
1746                 type == CV_8UC1 ? (ippiRemap)ippiRemap_8u_C1R :
1747                 type == CV_8UC3 ? (ippiRemap)ippiRemap_8u_C3R :
1748                 type == CV_8UC4 ? (ippiRemap)ippiRemap_8u_C4R :
1749                 type == CV_16UC1 ? (ippiRemap)ippiRemap_16u_C1R :
1750                 type == CV_16UC3 ? (ippiRemap)ippiRemap_16u_C3R :
1751                 type == CV_16UC4 ? (ippiRemap)ippiRemap_16u_C4R :
1752                 type == CV_32FC1 ? (ippiRemap)ippiRemap_32f_C1R :
1753                 type == CV_32FC3 ? (ippiRemap)ippiRemap_32f_C3R :
1754                 type == CV_32FC4 ? (ippiRemap)ippiRemap_32f_C4R : 0;
1755 
1756             if (ippFunc)
1757             {
1758                 bool ok;
1759                 IPPRemapInvoker invoker(src, dst, map1, map2, ippFunc, ippInterpolation,
1760                                         borderType, borderValue, &ok);
1761                 Range range(0, dst.rows);
1762                 parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
1763 
1764                 if (ok)
1765                 {
1766                     CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
1767                     return;
1768                 }
1769                 setIppErrorStatus();
1770             }
1771         }
1772     }
1773 #endif
1774 
1775     RemapNNFunc nnfunc = 0;
1776     RemapFunc ifunc = 0;
1777     const void* ctab = 0;
1778     bool fixpt = depth == CV_8U;
1779     bool planar_input = false;
1780 
1781     if( interpolation == INTER_NEAREST )
1782     {
1783         nnfunc = nn_tab[depth];
1784         CV_Assert( nnfunc != 0 );
1785     }
1786     else
1787     {
1788         if( interpolation == INTER_LINEAR )
1789             ifunc = linear_tab[depth];
1790         else if( interpolation == INTER_CUBIC ){
1791             ifunc = cubic_tab[depth];
1792             CV_Assert( _src.channels() <= 4 );
1793         }
1794         else if( interpolation == INTER_LANCZOS4 ){
1795             ifunc = lanczos4_tab[depth];
1796             CV_Assert( _src.channels() <= 4 );
1797         }
1798         else
1799             CV_Error( CV_StsBadArg, "Unknown interpolation method" );
1800         CV_Assert( ifunc != 0 );
1801         ctab = initInterTab2D( interpolation, fixpt );
1802     }
1803 
1804     const Mat *m1 = &map1, *m2 = &map2;
1805 
1806     if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1 || map2.empty())) ||
1807         (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1 || map1.empty())) )
1808     {
1809         if( map1.type() != CV_16SC2 )
1810             std::swap(m1, m2);
1811     }
1812     else
1813     {
1814         CV_Assert( ((map1.type() == CV_32FC2 || map1.type() == CV_16SC2) && map2.empty()) ||
1815             (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
1816         planar_input = map1.channels() == 1;
1817     }
1818 
1819     RemapInvoker invoker(src, dst, m1, m2,
1820                          borderType, borderValue, planar_input, nnfunc, ifunc,
1821                          ctab);
1822     parallel_for_(Range(0, dst.rows), invoker, dst.total()/(double)(1<<16));
1823 }
1824 
1825 
convertMaps(InputArray _map1,InputArray _map2,OutputArray _dstmap1,OutputArray _dstmap2,int dstm1type,bool nninterpolate)1826 void cv::convertMaps( InputArray _map1, InputArray _map2,
1827                       OutputArray _dstmap1, OutputArray _dstmap2,
1828                       int dstm1type, bool nninterpolate )
1829 {
1830     CV_INSTRUMENT_REGION();
1831 
1832     Mat map1 = _map1.getMat(), map2 = _map2.getMat(), dstmap1, dstmap2;
1833     Size size = map1.size();
1834     const Mat *m1 = &map1, *m2 = &map2;
1835     int m1type = m1->type(), m2type = m2->type();
1836 
1837     CV_Assert( (m1type == CV_16SC2 && (nninterpolate || m2type == CV_16UC1 || m2type == CV_16SC1)) ||
1838                (m2type == CV_16SC2 && (nninterpolate || m1type == CV_16UC1 || m1type == CV_16SC1)) ||
1839                (m1type == CV_32FC1 && m2type == CV_32FC1) ||
1840                (m1type == CV_32FC2 && m2->empty()) );
1841 
1842     if( m2type == CV_16SC2 )
1843     {
1844         std::swap( m1, m2 );
1845         std::swap( m1type, m2type );
1846     }
1847 
1848     if( dstm1type <= 0 )
1849         dstm1type = m1type == CV_16SC2 ? CV_32FC2 : CV_16SC2;
1850     CV_Assert( dstm1type == CV_16SC2 || dstm1type == CV_32FC1 || dstm1type == CV_32FC2 );
1851     _dstmap1.create( size, dstm1type );
1852     dstmap1 = _dstmap1.getMat();
1853 
1854     if( !nninterpolate && dstm1type != CV_32FC2 )
1855     {
1856         _dstmap2.create( size, dstm1type == CV_16SC2 ? CV_16UC1 : CV_32FC1 );
1857         dstmap2 = _dstmap2.getMat();
1858     }
1859     else
1860         _dstmap2.release();
1861 
1862     if( m1type == dstm1type || (nninterpolate &&
1863         ((m1type == CV_16SC2 && dstm1type == CV_32FC2) ||
1864         (m1type == CV_32FC2 && dstm1type == CV_16SC2))) )
1865     {
1866         m1->convertTo( dstmap1, dstmap1.type() );
1867         if( !dstmap2.empty() && dstmap2.type() == m2->type() )
1868             m2->copyTo( dstmap2 );
1869         return;
1870     }
1871 
1872     if( m1type == CV_32FC1 && dstm1type == CV_32FC2 )
1873     {
1874         Mat vdata[] = { *m1, *m2 };
1875         merge( vdata, 2, dstmap1 );
1876         return;
1877     }
1878 
1879     if( m1type == CV_32FC2 && dstm1type == CV_32FC1 )
1880     {
1881         Mat mv[] = { dstmap1, dstmap2 };
1882         split( *m1, mv );
1883         return;
1884     }
1885 
1886     if( m1->isContinuous() && (m2->empty() || m2->isContinuous()) &&
1887         dstmap1.isContinuous() && (dstmap2.empty() || dstmap2.isContinuous()) )
1888     {
1889         size.width *= size.height;
1890         size.height = 1;
1891     }
1892 
1893 #if CV_TRY_SSE4_1
1894     bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
1895 #endif
1896 
1897     const float scale = 1.f/INTER_TAB_SIZE;
1898     int x, y;
1899     for( y = 0; y < size.height; y++ )
1900     {
1901         const float* src1f = m1->ptr<float>(y);
1902         const float* src2f = m2->ptr<float>(y);
1903         const short* src1 = (const short*)src1f;
1904         const ushort* src2 = (const ushort*)src2f;
1905 
1906         float* dst1f = dstmap1.ptr<float>(y);
1907         float* dst2f = dstmap2.ptr<float>(y);
1908         short* dst1 = (short*)dst1f;
1909         ushort* dst2 = (ushort*)dst2f;
1910         x = 0;
1911 
1912         if( m1type == CV_32FC1 && dstm1type == CV_16SC2 )
1913         {
1914             if( nninterpolate )
1915             {
1916                 #if CV_TRY_SSE4_1
1917                 if (useSSE4_1)
1918                     opt_SSE4_1::convertMaps_nninterpolate32f1c16s_SSE41(src1f, src2f, dst1, size.width);
1919                 else
1920                 #endif
1921                 {
1922                     #if CV_SIMD128
1923                     {
1924                         int span = v_int16x8::nlanes;
1925                         for( ; x <= size.width - span; x += span )
1926                         {
1927                             v_int16x8 v_dst[2];
1928                             #define CV_PACK_MAP(X) v_pack(v_round(v_load(X)), v_round(v_load((X)+4)))
1929                             v_dst[0] = CV_PACK_MAP(src1f + x);
1930                             v_dst[1] = CV_PACK_MAP(src2f + x);
1931                             #undef CV_PACK_MAP
1932                             v_store_interleave(dst1 + (x << 1), v_dst[0], v_dst[1]);
1933                         }
1934                     }
1935                     #endif
1936                     for( ; x < size.width; x++ )
1937                     {
1938                         dst1[x*2] = saturate_cast<short>(src1f[x]);
1939                         dst1[x*2+1] = saturate_cast<short>(src2f[x]);
1940                     }
1941                 }
1942             }
1943             else
1944             {
1945                 #if CV_TRY_SSE4_1
1946                 if (useSSE4_1)
1947                     opt_SSE4_1::convertMaps_32f1c16s_SSE41(src1f, src2f, dst1, dst2, size.width);
1948                 else
1949                 #endif
1950                 {
1951                     #if CV_SIMD128
1952                     {
1953                         v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE);
1954                         v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1);
1955                         v_int32x4 v_scale3 = v_setall_s32(INTER_TAB_SIZE);
1956                         int span = v_float32x4::nlanes;
1957                         for( ; x <= size.width - span * 2; x += span * 2 )
1958                         {
1959                             v_int32x4 v_ix0 = v_round(v_scale * (v_load(src1f + x)));
1960                             v_int32x4 v_ix1 = v_round(v_scale * (v_load(src1f + x + span)));
1961                             v_int32x4 v_iy0 = v_round(v_scale * (v_load(src2f + x)));
1962                             v_int32x4 v_iy1 = v_round(v_scale * (v_load(src2f + x + span)));
1963 
1964                             v_int16x8 v_dst[2];
1965                             v_dst[0] = v_pack(v_shr<INTER_BITS>(v_ix0), v_shr<INTER_BITS>(v_ix1));
1966                             v_dst[1] = v_pack(v_shr<INTER_BITS>(v_iy0), v_shr<INTER_BITS>(v_iy1));
1967                             v_store_interleave(dst1 + (x << 1), v_dst[0], v_dst[1]);
1968 
1969                             v_int32x4 v_dst0 = v_muladd(v_scale3, (v_iy0 & v_mask), (v_ix0 & v_mask));
1970                             v_int32x4 v_dst1 = v_muladd(v_scale3, (v_iy1 & v_mask), (v_ix1 & v_mask));
1971                             v_store(dst2 + x, v_pack_u(v_dst0, v_dst1));
1972                         }
1973                     }
1974                     #endif
1975                     for( ; x < size.width; x++ )
1976                     {
1977                         int ix = saturate_cast<int>(src1f[x]*INTER_TAB_SIZE);
1978                         int iy = saturate_cast<int>(src2f[x]*INTER_TAB_SIZE);
1979                         dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS);
1980                         dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);
1981                         dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
1982                     }
1983                 }
1984             }
1985         }
1986         else if( m1type == CV_32FC2 && dstm1type == CV_16SC2 )
1987         {
1988             if( nninterpolate )
1989             {
1990                 #if CV_SIMD128
1991                 int span = v_float32x4::nlanes;
1992                 {
1993                     for( ; x <= (size.width << 1) - span * 2; x += span * 2 )
1994                         v_store(dst1 + x, v_pack(v_round(v_load(src1f + x)),
1995                                                  v_round(v_load(src1f + x + span))));
1996                 }
1997                 #endif
1998                 for( ; x < size.width; x++ )
1999                 {
2000                     dst1[x*2] = saturate_cast<short>(src1f[x*2]);
2001                     dst1[x*2+1] = saturate_cast<short>(src1f[x*2+1]);
2002                 }
2003             }
2004             else
2005             {
2006                 #if CV_TRY_SSE4_1
2007                 if( useSSE4_1 )
2008                     opt_SSE4_1::convertMaps_32f2c16s_SSE41(src1f, dst1, dst2, size.width);
2009                 else
2010                 #endif
2011                 {
2012                     #if CV_SIMD128
2013                     {
2014                         v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE);
2015                         v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1);
2016                         v_int32x4 v_scale3 = v_setall_s32(INTER_TAB_SIZE);
2017                         int span = v_uint16x8::nlanes;
2018                         for (; x <= size.width - span; x += span )
2019                         {
2020                             v_float32x4 v_src0[2], v_src1[2];
2021                             v_load_deinterleave(src1f + (x << 1), v_src0[0], v_src0[1]);
2022                             v_load_deinterleave(src1f + (x << 1) + span, v_src1[0], v_src1[1]);
2023                             v_int32x4 v_ix0 = v_round(v_src0[0] * v_scale);
2024                             v_int32x4 v_ix1 = v_round(v_src1[0] * v_scale);
2025                             v_int32x4 v_iy0 = v_round(v_src0[1] * v_scale);
2026                             v_int32x4 v_iy1 = v_round(v_src1[1] * v_scale);
2027 
2028                             v_int16x8 v_dst[2];
2029                             v_dst[0] = v_pack(v_shr<INTER_BITS>(v_ix0), v_shr<INTER_BITS>(v_ix1));
2030                             v_dst[1] = v_pack(v_shr<INTER_BITS>(v_iy0), v_shr<INTER_BITS>(v_iy1));
2031                             v_store_interleave(dst1 + (x << 1), v_dst[0], v_dst[1]);
2032 
2033                             v_store(dst2 + x, v_pack_u(
2034                                 v_muladd(v_scale3, (v_iy0 & v_mask), (v_ix0 & v_mask)),
2035                                 v_muladd(v_scale3, (v_iy1 & v_mask), (v_ix1 & v_mask))));
2036                         }
2037                     }
2038                     #endif
2039                     for( ; x < size.width; x++ )
2040                     {
2041                         int ix = saturate_cast<int>(src1f[x*2]*INTER_TAB_SIZE);
2042                         int iy = saturate_cast<int>(src1f[x*2+1]*INTER_TAB_SIZE);
2043                         dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS);
2044                         dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);
2045                         dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
2046                     }
2047                 }
2048             }
2049         }
2050         else if( m1type == CV_16SC2 && dstm1type == CV_32FC1 )
2051         {
2052             #if CV_SIMD128
2053             {
2054                 v_uint16x8 v_mask2 =  v_setall_u16(INTER_TAB_SIZE2-1);
2055                 v_uint32x4 v_zero =   v_setzero_u32(), v_mask = v_setall_u32(INTER_TAB_SIZE-1);
2056                 v_float32x4 v_scale = v_setall_f32(scale);
2057                 int span = v_float32x4::nlanes;
2058                 for( ; x <= size.width - span * 2; x += span * 2 )
2059                 {
2060                     v_uint32x4 v_fxy1, v_fxy2;
2061                     if ( src2 )
2062                     {
2063                         v_uint16x8 v_src2 = v_load(src2 + x) & v_mask2;
2064                         v_expand(v_src2, v_fxy1, v_fxy2);
2065                     }
2066                     else
2067                         v_fxy1 = v_fxy2 = v_zero;
2068 
2069                     v_int16x8 v_src[2];
2070                     v_int32x4 v_src0[2], v_src1[2];
2071                     v_load_deinterleave(src1 + (x << 1), v_src[0], v_src[1]);
2072                     v_expand(v_src[0], v_src0[0], v_src0[1]);
2073                     v_expand(v_src[1], v_src1[0], v_src1[1]);
2074                     #define CV_COMPUTE_MAP_X(X, FXY)  v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32((FXY) & v_mask)),\
2075                                                                         v_cvt_f32(v_reinterpret_as_s32(X)))
2076                     #define CV_COMPUTE_MAP_Y(Y, FXY)  v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32((FXY) >> INTER_BITS)),\
2077                                                                         v_cvt_f32(v_reinterpret_as_s32(Y)))
2078                     v_float32x4 v_dst1 = CV_COMPUTE_MAP_X(v_src0[0], v_fxy1);
2079                     v_float32x4 v_dst2 = CV_COMPUTE_MAP_Y(v_src1[0], v_fxy1);
2080                     v_store(dst1f + x, v_dst1);
2081                     v_store(dst2f + x, v_dst2);
2082 
2083                     v_dst1 = CV_COMPUTE_MAP_X(v_src0[1], v_fxy2);
2084                     v_dst2 = CV_COMPUTE_MAP_Y(v_src1[1], v_fxy2);
2085                     v_store(dst1f + x + span, v_dst1);
2086                     v_store(dst2f + x + span, v_dst2);
2087                     #undef CV_COMPUTE_MAP_X
2088                     #undef CV_COMPUTE_MAP_Y
2089                 }
2090             }
2091             #endif
2092             for( ; x < size.width; x++ )
2093             {
2094                 int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1) : 0;
2095                 dst1f[x] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
2096                 dst2f[x] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
2097             }
2098         }
2099         else if( m1type == CV_16SC2 && dstm1type == CV_32FC2 )
2100         {
2101             #if CV_SIMD128
2102             {
2103                 v_int16x8 v_mask2 = v_setall_s16(INTER_TAB_SIZE2-1);
2104                 v_int32x4 v_zero = v_setzero_s32(), v_mask = v_setall_s32(INTER_TAB_SIZE-1);
2105                 v_float32x4 v_scale = v_setall_f32(scale);
2106                 int span = v_int16x8::nlanes;
2107                 for( ; x <= size.width - span; x += span )
2108                 {
2109                     v_int32x4 v_fxy1, v_fxy2;
2110                     if (src2)
2111                     {
2112                         v_int16x8 v_src2 = v_load((short *)src2 + x) & v_mask2;
2113                         v_expand(v_src2, v_fxy1, v_fxy2);
2114                     }
2115                     else
2116                         v_fxy1 = v_fxy2 = v_zero;
2117 
2118                     v_int16x8 v_src[2];
2119                     v_int32x4 v_src0[2], v_src1[2];
2120                     v_float32x4 v_dst[2];
2121                     v_load_deinterleave(src1 + (x << 1), v_src[0], v_src[1]);
2122                     v_expand(v_src[0], v_src0[0], v_src0[1]);
2123                     v_expand(v_src[1], v_src1[0], v_src1[1]);
2124 
2125                     #define CV_COMPUTE_MAP_X(X, FXY) v_muladd(v_scale, v_cvt_f32((FXY) & v_mask), v_cvt_f32(X))
2126                     #define CV_COMPUTE_MAP_Y(Y, FXY) v_muladd(v_scale, v_cvt_f32((FXY) >> INTER_BITS), v_cvt_f32(Y))
2127                     v_dst[0] = CV_COMPUTE_MAP_X(v_src0[0], v_fxy1);
2128                     v_dst[1] = CV_COMPUTE_MAP_Y(v_src1[0], v_fxy1);
2129                     v_store_interleave(dst1f + (x << 1), v_dst[0], v_dst[1]);
2130 
2131                     v_dst[0] = CV_COMPUTE_MAP_X(v_src0[1], v_fxy2);
2132                     v_dst[1] = CV_COMPUTE_MAP_Y(v_src1[1], v_fxy2);
2133                     v_store_interleave(dst1f + (x << 1) + span, v_dst[0], v_dst[1]);
2134                     #undef CV_COMPUTE_MAP_X
2135                     #undef CV_COMPUTE_MAP_Y
2136                 }
2137             }
2138             #endif
2139             for( ; x < size.width; x++ )
2140             {
2141                 int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1): 0;
2142                 dst1f[x*2] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
2143                 dst1f[x*2+1] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
2144             }
2145         }
2146         else
2147             CV_Error( CV_StsNotImplemented, "Unsupported combination of input/output matrices" );
2148     }
2149 }
2150 
2151 
2152 namespace cv
2153 {
2154 
2155 class WarpAffineInvoker :
2156     public ParallelLoopBody
2157 {
2158 public:
WarpAffineInvoker(const Mat & _src,Mat & _dst,int _interpolation,int _borderType,const Scalar & _borderValue,int * _adelta,int * _bdelta,const double * _M)2159     WarpAffineInvoker(const Mat &_src, Mat &_dst, int _interpolation, int _borderType,
2160                       const Scalar &_borderValue, int *_adelta, int *_bdelta, const double *_M) :
2161         ParallelLoopBody(), src(_src), dst(_dst), interpolation(_interpolation),
2162         borderType(_borderType), borderValue(_borderValue), adelta(_adelta), bdelta(_bdelta),
2163         M(_M)
2164     {
2165     }
2166 
operator ()(const Range & range) const2167     virtual void operator() (const Range& range) const CV_OVERRIDE
2168     {
2169         const int BLOCK_SZ = 64;
2170         AutoBuffer<short, 0> __XY(BLOCK_SZ * BLOCK_SZ * 2), __A(BLOCK_SZ * BLOCK_SZ);
2171         short *XY = __XY.data(), *A = __A.data();
2172         const int AB_BITS = MAX(10, (int)INTER_BITS);
2173         const int AB_SCALE = 1 << AB_BITS;
2174         int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1;
2175     #if CV_TRY_AVX2
2176         bool useAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
2177     #endif
2178     #if CV_TRY_SSE4_1
2179         bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
2180     #endif
2181 
2182         int bh0 = std::min(BLOCK_SZ/2, dst.rows);
2183         int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols);
2184         bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, dst.rows);
2185 
2186         for( y = range.start; y < range.end; y += bh0 )
2187         {
2188             for( x = 0; x < dst.cols; x += bw0 )
2189             {
2190                 int bw = std::min( bw0, dst.cols - x);
2191                 int bh = std::min( bh0, range.end - y);
2192 
2193                 Mat _XY(bh, bw, CV_16SC2, XY);
2194                 Mat dpart(dst, Rect(x, y, bw, bh));
2195 
2196                 for( y1 = 0; y1 < bh; y1++ )
2197                 {
2198                     short* xy = XY + y1*bw*2;
2199                     int X0 = saturate_cast<int>((M[1]*(y + y1) + M[2])*AB_SCALE) + round_delta;
2200                     int Y0 = saturate_cast<int>((M[4]*(y + y1) + M[5])*AB_SCALE) + round_delta;
2201 
2202                     if( interpolation == INTER_NEAREST )
2203                     {
2204                         x1 = 0;
2205                         #if CV_TRY_SSE4_1
2206                         if( useSSE4_1 )
2207                             opt_SSE4_1::WarpAffineInvoker_Blockline_SSE41(adelta + x, bdelta + x, xy, X0, Y0, bw);
2208                         else
2209                         #endif
2210                         {
2211                             #if CV_SIMD128
2212                             {
2213                                 v_int32x4 v_X0 = v_setall_s32(X0), v_Y0 = v_setall_s32(Y0);
2214                                 int span = v_uint16x8::nlanes;
2215                                 for( ; x1 <= bw - span; x1 += span )
2216                                 {
2217                                     v_int16x8 v_dst[2];
2218                                     #define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr<AB_BITS>(shift+v_load(ptr + offset)),\
2219                                                                                     v_shr<AB_BITS>(shift+v_load(ptr + offset + 4)))
2220                                     v_dst[0] = CV_CONVERT_MAP(adelta, x+x1, v_X0);
2221                                     v_dst[1] = CV_CONVERT_MAP(bdelta, x+x1, v_Y0);
2222                                     #undef CV_CONVERT_MAP
2223                                     v_store_interleave(xy + (x1 << 1), v_dst[0], v_dst[1]);
2224                                 }
2225                             }
2226                             #endif
2227                             for( ; x1 < bw; x1++ )
2228                             {
2229                                 int X = (X0 + adelta[x+x1]) >> AB_BITS;
2230                                 int Y = (Y0 + bdelta[x+x1]) >> AB_BITS;
2231                                 xy[x1*2] = saturate_cast<short>(X);
2232                                 xy[x1*2+1] = saturate_cast<short>(Y);
2233                             }
2234                         }
2235                     }
2236                     else
2237                     {
2238                         short* alpha = A + y1*bw;
2239                         x1 = 0;
2240                         #if CV_TRY_AVX2
2241                         if ( useAVX2 )
2242                             x1 = opt_AVX2::warpAffineBlockline(adelta + x, bdelta + x, xy, alpha, X0, Y0, bw);
2243                         #endif
2244                         #if CV_SIMD128
2245                         {
2246                             v_int32x4 v__X0 = v_setall_s32(X0), v__Y0 = v_setall_s32(Y0);
2247                             v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1);
2248                             int span = v_float32x4::nlanes;
2249                             for( ; x1 <= bw - span * 2; x1 += span * 2 )
2250                             {
2251                                 v_int32x4 v_X0 = v_shr<AB_BITS - INTER_BITS>(v__X0 + v_load(adelta + x + x1));
2252                                 v_int32x4 v_Y0 = v_shr<AB_BITS - INTER_BITS>(v__Y0 + v_load(bdelta + x + x1));
2253                                 v_int32x4 v_X1 = v_shr<AB_BITS - INTER_BITS>(v__X0 + v_load(adelta + x + x1 + span));
2254                                 v_int32x4 v_Y1 = v_shr<AB_BITS - INTER_BITS>(v__Y0 + v_load(bdelta + x + x1 + span));
2255 
2256                                 v_int16x8 v_xy[2];
2257                                 v_xy[0] = v_pack(v_shr<INTER_BITS>(v_X0), v_shr<INTER_BITS>(v_X1));
2258                                 v_xy[1] = v_pack(v_shr<INTER_BITS>(v_Y0), v_shr<INTER_BITS>(v_Y1));
2259                                 v_store_interleave(xy + (x1 << 1), v_xy[0], v_xy[1]);
2260 
2261                                 v_int32x4 v_alpha0 = v_shl<INTER_BITS>(v_Y0 & v_mask) | (v_X0 & v_mask);
2262                                 v_int32x4 v_alpha1 = v_shl<INTER_BITS>(v_Y1 & v_mask) | (v_X1 & v_mask);
2263                                 v_store(alpha + x1, v_pack(v_alpha0, v_alpha1));
2264                             }
2265                         }
2266                         #endif
2267                         for( ; x1 < bw; x1++ )
2268                         {
2269                             int X = (X0 + adelta[x+x1]) >> (AB_BITS - INTER_BITS);
2270                             int Y = (Y0 + bdelta[x+x1]) >> (AB_BITS - INTER_BITS);
2271                             xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
2272                             xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
2273                             alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
2274                                     (X & (INTER_TAB_SIZE-1)));
2275                         }
2276                     }
2277                 }
2278 
2279                 if( interpolation == INTER_NEAREST )
2280                     remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
2281                 else
2282                 {
2283                     Mat _matA(bh, bw, CV_16U, A);
2284                     remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
2285                 }
2286             }
2287         }
2288     }
2289 
2290 private:
2291     Mat src;
2292     Mat dst;
2293     int interpolation, borderType;
2294     Scalar borderValue;
2295     int *adelta, *bdelta;
2296     const double *M;
2297 };
2298 
2299 
2300 #if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && !IPP_DISABLE_WARPAFFINE
2301 typedef IppStatus (CV_STDCALL* ippiWarpAffineBackFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [2][3], int);
2302 
2303 class IPPWarpAffineInvoker :
2304     public ParallelLoopBody
2305 {
2306 public:
IPPWarpAffineInvoker(Mat & _src,Mat & _dst,double (& _coeffs)[2][3],int & _interpolation,int _borderType,const Scalar & _borderValue,ippiWarpAffineBackFunc _func,bool * _ok)2307     IPPWarpAffineInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[2][3], int &_interpolation, int _borderType,
2308                          const Scalar &_borderValue, ippiWarpAffineBackFunc _func, bool *_ok) :
2309         ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs),
2310         borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok)
2311     {
2312         *ok = true;
2313     }
2314 
operator ()(const Range & range) const2315     virtual void operator() (const Range& range) const CV_OVERRIDE
2316     {
2317         IppiSize srcsize = { src.cols, src.rows };
2318         IppiRect srcroi = { 0, 0, src.cols, src.rows };
2319         IppiRect dstroi = { 0, range.start, dst.cols, range.end - range.start };
2320         int cnn = src.channels();
2321         if( borderType == BORDER_CONSTANT )
2322         {
2323             IppiSize setSize = { dst.cols, range.end - range.start };
2324             void *dataPointer = dst.ptr(range.start);
2325             if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) )
2326             {
2327                 *ok = false;
2328                 return;
2329             }
2330         }
2331 
2332         // Aug 2013: problem in IPP 7.1, 8.0 : sometimes function return ippStsCoeffErr
2333         IppStatus status = CV_INSTRUMENT_FUN_IPP(func,( src.ptr(), srcsize, (int)src.step[0], srcroi, dst.ptr(),
2334                                 (int)dst.step[0], dstroi, coeffs, mode ));
2335         if( status < 0)
2336             *ok = false;
2337         else
2338         {
2339             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
2340         }
2341     }
2342 private:
2343     Mat &src;
2344     Mat &dst;
2345     int mode;
2346     double (&coeffs)[2][3];
2347     int borderType;
2348     Scalar borderValue;
2349     ippiWarpAffineBackFunc func;
2350     bool *ok;
2351     const IPPWarpAffineInvoker& operator= (const IPPWarpAffineInvoker&);
2352 };
2353 #endif
2354 
2355 #ifdef HAVE_OPENCL
2356 
2357 enum { OCL_OP_PERSPECTIVE = 1, OCL_OP_AFFINE = 0 };
2358 
ocl_warpTransform_cols4(InputArray _src,OutputArray _dst,InputArray _M0,Size dsize,int flags,int borderType,const Scalar & borderValue,int op_type)2359 static bool ocl_warpTransform_cols4(InputArray _src, OutputArray _dst, InputArray _M0,
2360                                     Size dsize, int flags, int borderType, const Scalar& borderValue,
2361                                     int op_type)
2362 {
2363     CV_Assert(op_type == OCL_OP_AFFINE || op_type == OCL_OP_PERSPECTIVE);
2364     const ocl::Device & dev = ocl::Device::getDefault();
2365     int type = _src.type(), dtype = _dst.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
2366 
2367     int interpolation = flags & INTER_MAX;
2368     if( interpolation == INTER_AREA )
2369         interpolation = INTER_LINEAR;
2370 
2371     if ( !dev.isIntel() || !(type == CV_8UC1) ||
2372          !(dtype == CV_8UC1) || !(_dst.cols() % 4 == 0) ||
2373          !(borderType == cv::BORDER_CONSTANT &&
2374           (interpolation == cv::INTER_NEAREST || interpolation == cv::INTER_LINEAR || interpolation == cv::INTER_CUBIC)))
2375         return false;
2376 
2377     const char * const warp_op[2] = { "Affine", "Perspective" };
2378     const char * const interpolationMap[3] = { "nearest", "linear", "cubic" };
2379     ocl::ProgramSource program = ocl::imgproc::warp_transform_oclsrc;
2380     String kernelName = format("warp%s_%s_8u", warp_op[op_type], interpolationMap[interpolation]);
2381 
2382     bool is32f = (interpolation == INTER_CUBIC || interpolation == INTER_LINEAR) && op_type == OCL_OP_AFFINE;
2383     int wdepth = interpolation == INTER_NEAREST ? depth : std::max(is32f ? CV_32F : CV_32S, depth);
2384     int sctype = CV_MAKETYPE(wdepth, cn);
2385 
2386     ocl::Kernel k;
2387     String opts = format("-D ST=%s", ocl::typeToStr(sctype));
2388 
2389     k.create(kernelName.c_str(), program, opts);
2390     if (k.empty())
2391         return false;
2392 
2393     float borderBuf[] = { 0, 0, 0, 0 };
2394     scalarToRawData(borderValue, borderBuf, sctype);
2395 
2396     UMat src = _src.getUMat(), M0;
2397     _dst.create( dsize.empty() ? src.size() : dsize, src.type() );
2398     UMat dst = _dst.getUMat();
2399 
2400     float M[9] = {0};
2401     int matRows = (op_type == OCL_OP_AFFINE ? 2 : 3);
2402     Mat matM(matRows, 3, CV_32F, M), M1 = _M0.getMat();
2403     CV_Assert( (M1.type() == CV_32F || M1.type() == CV_64F) && M1.rows == matRows && M1.cols == 3 );
2404     M1.convertTo(matM, matM.type());
2405 
2406     if( !(flags & WARP_INVERSE_MAP) )
2407     {
2408         if (op_type == OCL_OP_PERSPECTIVE)
2409             invert(matM, matM);
2410         else
2411         {
2412             float D = M[0]*M[4] - M[1]*M[3];
2413             D = D != 0 ? 1.f/D : 0;
2414             float A11 = M[4]*D, A22=M[0]*D;
2415             M[0] = A11; M[1] *= -D;
2416             M[3] *= -D; M[4] = A22;
2417             float b1 = -M[0]*M[2] - M[1]*M[5];
2418             float b2 = -M[3]*M[2] - M[4]*M[5];
2419             M[2] = b1; M[5] = b2;
2420         }
2421     }
2422     matM.convertTo(M0, CV_32F);
2423 
2424     k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(M0),
2425            ocl::KernelArg(ocl::KernelArg::CONSTANT, 0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype)));
2426 
2427     size_t globalThreads[2];
2428     globalThreads[0] = (size_t)(dst.cols / 4);
2429     globalThreads[1] = (size_t)dst.rows;
2430 
2431     return k.run(2, globalThreads, NULL, false);
2432 }
2433 
ocl_warpTransform(InputArray _src,OutputArray _dst,InputArray _M0,Size dsize,int flags,int borderType,const Scalar & borderValue,int op_type)2434 static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
2435                               Size dsize, int flags, int borderType, const Scalar& borderValue,
2436                               int op_type)
2437 {
2438     CV_Assert(op_type == OCL_OP_AFFINE || op_type == OCL_OP_PERSPECTIVE);
2439     const ocl::Device & dev = ocl::Device::getDefault();
2440 
2441     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
2442     const bool doubleSupport = dev.doubleFPConfig() > 0;
2443 
2444     int interpolation = flags & INTER_MAX;
2445     if( interpolation == INTER_AREA )
2446         interpolation = INTER_LINEAR;
2447     int rowsPerWI = dev.isIntel() && op_type == OCL_OP_AFFINE && interpolation <= INTER_LINEAR ? 4 : 1;
2448 
2449     if ( !(borderType == cv::BORDER_CONSTANT &&
2450            (interpolation == cv::INTER_NEAREST || interpolation == cv::INTER_LINEAR || interpolation == cv::INTER_CUBIC)) ||
2451          (!doubleSupport && depth == CV_64F) || cn > 4)
2452         return false;
2453 
2454     bool useDouble = depth == CV_64F;
2455 
2456     const char * const interpolationMap[3] = { "NEAREST", "LINEAR", "CUBIC" };
2457     ocl::ProgramSource program = op_type == OCL_OP_AFFINE ?
2458                 ocl::imgproc::warp_affine_oclsrc : ocl::imgproc::warp_perspective_oclsrc;
2459     const char * const kernelName = op_type == OCL_OP_AFFINE ? "warpAffine" : "warpPerspective";
2460 
2461     int scalarcn = cn == 3 ? 4 : cn;
2462     bool is32f = !dev.isAMD() && (interpolation == INTER_CUBIC || interpolation == INTER_LINEAR) && op_type == OCL_OP_AFFINE;
2463     int wdepth = interpolation == INTER_NEAREST ? depth : std::max(is32f ? CV_32F : CV_32S, depth);
2464     int sctype = CV_MAKETYPE(wdepth, scalarcn);
2465 
2466     ocl::Kernel k;
2467     String opts;
2468     if (interpolation == INTER_NEAREST)
2469     {
2470         opts = format("-D INTER_NEAREST -D T=%s%s -D CT=%s -D T1=%s -D ST=%s -D cn=%d -D rowsPerWI=%d",
2471                       ocl::typeToStr(type),
2472                       doubleSupport ? " -D DOUBLE_SUPPORT" : "",
2473                       useDouble ? "double" : "float",
2474                       ocl::typeToStr(CV_MAT_DEPTH(type)),
2475                       ocl::typeToStr(sctype), cn, rowsPerWI);
2476     }
2477     else
2478     {
2479         char cvt[2][50];
2480         opts = format("-D INTER_%s -D T=%s -D T1=%s -D ST=%s -D WT=%s -D depth=%d"
2481                       " -D convertToWT=%s -D convertToT=%s%s -D CT=%s -D cn=%d -D rowsPerWI=%d",
2482                       interpolationMap[interpolation], ocl::typeToStr(type),
2483                       ocl::typeToStr(CV_MAT_DEPTH(type)),
2484                       ocl::typeToStr(sctype),
2485                       ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), depth,
2486                       ocl::convertTypeStr(depth, wdepth, cn, cvt[0]),
2487                       ocl::convertTypeStr(wdepth, depth, cn, cvt[1]),
2488                       doubleSupport ? " -D DOUBLE_SUPPORT" : "",
2489                       useDouble ? "double" : "float",
2490                       cn, rowsPerWI);
2491     }
2492 
2493     k.create(kernelName, program, opts);
2494     if (k.empty())
2495         return false;
2496 
2497     double borderBuf[] = { 0, 0, 0, 0 };
2498     scalarToRawData(borderValue, borderBuf, sctype);
2499 
2500     UMat src = _src.getUMat(), M0;
2501     _dst.create( dsize.empty() ? src.size() : dsize, src.type() );
2502     UMat dst = _dst.getUMat();
2503 
2504     double M[9] = {0};
2505     int matRows = (op_type == OCL_OP_AFFINE ? 2 : 3);
2506     Mat matM(matRows, 3, CV_64F, M), M1 = _M0.getMat();
2507     CV_Assert( (M1.type() == CV_32F || M1.type() == CV_64F) &&
2508                M1.rows == matRows && M1.cols == 3 );
2509     M1.convertTo(matM, matM.type());
2510 
2511     if( !(flags & WARP_INVERSE_MAP) )
2512     {
2513         if (op_type == OCL_OP_PERSPECTIVE)
2514             invert(matM, matM);
2515         else
2516         {
2517             double D = M[0]*M[4] - M[1]*M[3];
2518             D = D != 0 ? 1./D : 0;
2519             double A11 = M[4]*D, A22=M[0]*D;
2520             M[0] = A11; M[1] *= -D;
2521             M[3] *= -D; M[4] = A22;
2522             double b1 = -M[0]*M[2] - M[1]*M[5];
2523             double b2 = -M[3]*M[2] - M[4]*M[5];
2524             M[2] = b1; M[5] = b2;
2525         }
2526     }
2527     matM.convertTo(M0, useDouble ? CV_64F : CV_32F);
2528 
2529     k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(M0),
2530            ocl::KernelArg(ocl::KernelArg::CONSTANT, 0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype)));
2531 
2532     size_t globalThreads[2] = { (size_t)dst.cols, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
2533     return k.run(2, globalThreads, NULL, false);
2534 }
2535 
2536 #endif
2537 
2538 namespace hal {
2539 
warpAffine(int src_type,const uchar * src_data,size_t src_step,int src_width,int src_height,uchar * dst_data,size_t dst_step,int dst_width,int dst_height,const double M[6],int interpolation,int borderType,const double borderValue[4])2540 void warpAffine(int src_type,
2541                 const uchar * src_data, size_t src_step, int src_width, int src_height,
2542                 uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
2543                 const double M[6], int interpolation, int borderType, const double borderValue[4])
2544 {
2545     CALL_HAL(warpAffine, cv_hal_warpAffine, src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, M, interpolation, borderType, borderValue);
2546 
2547     Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
2548     Mat dst(Size(dst_width, dst_height), src_type, dst_data, dst_step);
2549 
2550     int x;
2551     AutoBuffer<int> _abdelta(dst.cols*2);
2552     int* adelta = &_abdelta[0], *bdelta = adelta + dst.cols;
2553     const int AB_BITS = MAX(10, (int)INTER_BITS);
2554     const int AB_SCALE = 1 << AB_BITS;
2555 
2556     for( x = 0; x < dst.cols; x++ )
2557     {
2558         adelta[x] = saturate_cast<int>(M[0]*x*AB_SCALE);
2559         bdelta[x] = saturate_cast<int>(M[3]*x*AB_SCALE);
2560     }
2561 
2562     Range range(0, dst.rows);
2563     WarpAffineInvoker invoker(src, dst, interpolation, borderType,
2564                               Scalar(borderValue[0], borderValue[1], borderValue[2], borderValue[3]),
2565                               adelta, bdelta, M);
2566     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
2567 }
2568 
2569 } // hal::
2570 } // cv::
2571 
2572 
warpAffine(InputArray _src,OutputArray _dst,InputArray _M0,Size dsize,int flags,int borderType,const Scalar & borderValue)2573 void cv::warpAffine( InputArray _src, OutputArray _dst,
2574                      InputArray _M0, Size dsize,
2575                      int flags, int borderType, const Scalar& borderValue )
2576 {
2577     CV_INSTRUMENT_REGION();
2578 
2579     int interpolation = flags & INTER_MAX;
2580     CV_Assert( _src.channels() <= 4 || (interpolation != INTER_LANCZOS4 &&
2581                                         interpolation != INTER_CUBIC) );
2582 
2583     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() &&
2584                _src.cols() <= SHRT_MAX && _src.rows() <= SHRT_MAX,
2585                ocl_warpTransform_cols4(_src, _dst, _M0, dsize, flags, borderType,
2586                                        borderValue, OCL_OP_AFFINE))
2587 
2588     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
2589                ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType,
2590                                  borderValue, OCL_OP_AFFINE))
2591 
2592     Mat src = _src.getMat(), M0 = _M0.getMat();
2593     _dst.create( dsize.empty() ? src.size() : dsize, src.type() );
2594     Mat dst = _dst.getMat();
2595     CV_Assert( src.cols > 0 && src.rows > 0 );
2596     if( dst.data == src.data )
2597         src = src.clone();
2598 
2599     double M[6] = {0};
2600     Mat matM(2, 3, CV_64F, M);
2601     if( interpolation == INTER_AREA )
2602         interpolation = INTER_LINEAR;
2603 
2604     CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 2 && M0.cols == 3 );
2605     M0.convertTo(matM, matM.type());
2606 
2607     if( !(flags & WARP_INVERSE_MAP) )
2608     {
2609         double D = M[0]*M[4] - M[1]*M[3];
2610         D = D != 0 ? 1./D : 0;
2611         double A11 = M[4]*D, A22=M[0]*D;
2612         M[0] = A11; M[1] *= -D;
2613         M[3] *= -D; M[4] = A22;
2614         double b1 = -M[0]*M[2] - M[1]*M[5];
2615         double b2 = -M[3]*M[2] - M[4]*M[5];
2616         M[2] = b1; M[5] = b2;
2617     }
2618 
2619 #if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && !IPP_DISABLE_WARPAFFINE
2620     CV_IPP_CHECK()
2621     {
2622         int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
2623         if( ( depth == CV_8U || depth == CV_16U || depth == CV_32F ) &&
2624            ( cn == 1 || cn == 3 || cn == 4 ) &&
2625            ( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC) &&
2626            ( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT) )
2627         {
2628             ippiWarpAffineBackFunc ippFunc = 0;
2629             if ((flags & WARP_INVERSE_MAP) != 0)
2630             {
2631                 ippFunc =
2632                 type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C1R :
2633                 type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C3R :
2634                 type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C4R :
2635                 type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C1R :
2636                 type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C3R :
2637                 type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C4R :
2638                 type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C1R :
2639                 type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C3R :
2640                 type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C4R :
2641                 0;
2642             }
2643             else
2644             {
2645                 ippFunc =
2646                 type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C1R :
2647                 type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C3R :
2648                 type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C4R :
2649                 type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C1R :
2650                 type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C3R :
2651                 type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C4R :
2652                 type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C1R :
2653                 type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C3R :
2654                 type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C4R :
2655                 0;
2656             }
2657             int mode =
2658             interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR :
2659             interpolation == INTER_NEAREST ? IPPI_INTER_NN :
2660             interpolation == INTER_CUBIC ? IPPI_INTER_CUBIC :
2661             0;
2662             CV_Assert(mode && ippFunc);
2663 
2664             double coeffs[2][3];
2665             for( int i = 0; i < 2; i++ )
2666                 for( int j = 0; j < 3; j++ )
2667                     coeffs[i][j] = matM.at<double>(i, j);
2668 
2669             bool ok;
2670             Range range(0, dst.rows);
2671             IPPWarpAffineInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok);
2672             parallel_for_(range, invoker, dst.total()/(double)(1<<16));
2673             if( ok )
2674             {
2675                 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
2676                 return;
2677             }
2678             setIppErrorStatus();
2679         }
2680     }
2681 #endif
2682 
2683     hal::warpAffine(src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows,
2684                     M, interpolation, borderType, borderValue.val);
2685 }
2686 
2687 
2688 namespace cv
2689 {
2690 #if CV_SIMD128_64F
WarpPerspectiveLine_ProcessNN_CV_SIMD(const double * M,short * xy,double X0,double Y0,double W0,int bw)2691 void WarpPerspectiveLine_ProcessNN_CV_SIMD(const double *M, short* xy, double X0, double Y0, double W0, int bw)
2692 {
2693     const v_float64x2 v_M0 = v_setall_f64(M[0]);
2694     const v_float64x2 v_M3 = v_setall_f64(M[3]);
2695     const v_float64x2 v_M6 = v_setall_f64(M[6]);
2696     const v_float64x2 v_intmax = v_setall_f64((double)INT_MAX);
2697     const v_float64x2 v_intmin = v_setall_f64((double)INT_MIN);
2698     const v_float64x2 v_2 = v_setall_f64(2.0);
2699     const v_float64x2 v_zero = v_setzero_f64();
2700     const v_float64x2 v_1 = v_setall_f64(1.0);
2701 
2702     int x1 = 0;
2703     v_float64x2 v_X0d = v_setall_f64(X0);
2704     v_float64x2 v_Y0d = v_setall_f64(Y0);
2705     v_float64x2 v_W0 = v_setall_f64(W0);
2706     v_float64x2 v_x1(0.0, 1.0);
2707 
2708     for (; x1 <= bw - 16; x1 += 16)
2709     {
2710         // 0-3
2711         v_int32x4 v_X0, v_Y0;
2712         {
2713             v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
2714             v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
2715             v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
2716             v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
2717             v_x1 += v_2;
2718 
2719             v_W = v_muladd(v_M6, v_x1, v_W0);
2720             v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
2721             v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
2722             v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
2723             v_x1 += v_2;
2724 
2725             v_X0 = v_round(v_fX0, v_fX1);
2726             v_Y0 = v_round(v_fY0, v_fY1);
2727         }
2728 
2729         // 4-7
2730         v_int32x4 v_X1, v_Y1;
2731         {
2732             v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
2733             v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
2734             v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
2735             v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
2736             v_x1 += v_2;
2737 
2738             v_W = v_muladd(v_M6, v_x1, v_W0);
2739             v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
2740             v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
2741             v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
2742             v_x1 += v_2;
2743 
2744             v_X1 = v_round(v_fX0, v_fX1);
2745             v_Y1 = v_round(v_fY0, v_fY1);
2746         }
2747 
2748         // 8-11
2749         v_int32x4 v_X2, v_Y2;
2750         {
2751             v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
2752             v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
2753             v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
2754             v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
2755             v_x1 += v_2;
2756 
2757             v_W = v_muladd(v_M6, v_x1, v_W0);
2758             v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
2759             v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
2760             v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
2761             v_x1 += v_2;
2762 
2763             v_X2 = v_round(v_fX0, v_fX1);
2764             v_Y2 = v_round(v_fY0, v_fY1);
2765         }
2766 
2767         // 12-15
2768         v_int32x4 v_X3, v_Y3;
2769         {
2770             v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
2771             v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
2772             v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
2773             v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
2774             v_x1 += v_2;
2775 
2776             v_W = v_muladd(v_M6, v_x1, v_W0);
2777             v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
2778             v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
2779             v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
2780             v_x1 += v_2;
2781 
2782             v_X3 = v_round(v_fX0, v_fX1);
2783             v_Y3 = v_round(v_fY0, v_fY1);
2784         }
2785 
2786         // convert to 16s
2787         v_X0 = v_reinterpret_as_s32(v_pack(v_X0, v_X1));
2788         v_X1 = v_reinterpret_as_s32(v_pack(v_X2, v_X3));
2789         v_Y0 = v_reinterpret_as_s32(v_pack(v_Y0, v_Y1));
2790         v_Y1 = v_reinterpret_as_s32(v_pack(v_Y2, v_Y3));
2791 
2792         v_store_interleave(xy + x1 * 2, (v_reinterpret_as_s16)(v_X0), (v_reinterpret_as_s16)(v_Y0));
2793         v_store_interleave(xy + x1 * 2 + 16, (v_reinterpret_as_s16)(v_X1), (v_reinterpret_as_s16)(v_Y1));
2794     }
2795 
2796     for( ; x1 < bw; x1++ )
2797     {
2798         double W = W0 + M[6]*x1;
2799         W = W ? 1./W : 0;
2800         double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
2801         double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
2802         int X = saturate_cast<int>(fX);
2803         int Y = saturate_cast<int>(fY);
2804 
2805         xy[x1*2] = saturate_cast<short>(X);
2806         xy[x1*2+1] = saturate_cast<short>(Y);
2807     }
2808 }
2809 
WarpPerspectiveLine_Process_CV_SIMD(const double * M,short * xy,short * alpha,double X0,double Y0,double W0,int bw)2810 void WarpPerspectiveLine_Process_CV_SIMD(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw)
2811 {
2812     const v_float64x2 v_M0 = v_setall_f64(M[0]);
2813     const v_float64x2 v_M3 = v_setall_f64(M[3]);
2814     const v_float64x2 v_M6 = v_setall_f64(M[6]);
2815     const v_float64x2 v_intmax = v_setall_f64((double)INT_MAX);
2816     const v_float64x2 v_intmin = v_setall_f64((double)INT_MIN);
2817     const v_float64x2 v_2 = v_setall_f64(2.0);
2818     const v_float64x2 v_zero = v_setzero_f64();
2819     const v_float64x2 v_its = v_setall_f64((double)INTER_TAB_SIZE);
2820     const v_int32x4 v_itsi1 = v_setall_s32(INTER_TAB_SIZE - 1);
2821 
2822     int x1 = 0;
2823 
2824     v_float64x2 v_X0d = v_setall_f64(X0);
2825     v_float64x2 v_Y0d = v_setall_f64(Y0);
2826     v_float64x2 v_W0 = v_setall_f64(W0);
2827     v_float64x2 v_x1(0.0, 1.0);
2828 
2829     for (; x1 <= bw - 16; x1 += 16)
2830     {
2831         // 0-3
2832         v_int32x4 v_X0, v_Y0;
2833         {
2834             v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
2835             v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
2836             v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
2837             v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
2838             v_x1 += v_2;
2839 
2840             v_W = v_muladd(v_M6, v_x1, v_W0);
2841             v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
2842             v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
2843             v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
2844             v_x1 += v_2;
2845 
2846             v_X0 = v_round(v_fX0, v_fX1);
2847             v_Y0 = v_round(v_fY0, v_fY1);
2848         }
2849 
2850         // 4-7
2851         v_int32x4 v_X1, v_Y1;
2852         {
2853             v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
2854             v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
2855             v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
2856             v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
2857             v_x1 += v_2;
2858 
2859             v_W = v_muladd(v_M6, v_x1, v_W0);
2860             v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
2861             v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
2862             v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
2863             v_x1 += v_2;
2864 
2865             v_X1 = v_round(v_fX0, v_fX1);
2866             v_Y1 = v_round(v_fY0, v_fY1);
2867         }
2868 
2869         // 8-11
2870         v_int32x4 v_X2, v_Y2;
2871         {
2872             v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
2873             v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
2874             v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
2875             v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
2876             v_x1 += v_2;
2877 
2878             v_W = v_muladd(v_M6, v_x1, v_W0);
2879             v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
2880             v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
2881             v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
2882             v_x1 += v_2;
2883 
2884             v_X2 = v_round(v_fX0, v_fX1);
2885             v_Y2 = v_round(v_fY0, v_fY1);
2886         }
2887 
2888         // 12-15
2889         v_int32x4 v_X3, v_Y3;
2890         {
2891             v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
2892             v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
2893             v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
2894             v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
2895             v_x1 += v_2;
2896 
2897             v_W = v_muladd(v_M6, v_x1, v_W0);
2898             v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
2899             v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
2900             v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
2901             v_x1 += v_2;
2902 
2903             v_X3 = v_round(v_fX0, v_fX1);
2904             v_Y3 = v_round(v_fY0, v_fY1);
2905         }
2906 
2907         // store alpha
2908         v_int32x4 v_alpha0 = ((v_Y0 & v_itsi1) << INTER_BITS) + (v_X0 & v_itsi1);
2909         v_int32x4 v_alpha1 = ((v_Y1 & v_itsi1) << INTER_BITS) + (v_X1 & v_itsi1);
2910         v_store((alpha + x1), v_pack(v_alpha0, v_alpha1));
2911 
2912         v_alpha0 = ((v_Y2 & v_itsi1) << INTER_BITS) + (v_X2 & v_itsi1);
2913         v_alpha1 = ((v_Y3 & v_itsi1) << INTER_BITS) + (v_X3 & v_itsi1);
2914         v_store((alpha + x1 + 8), v_pack(v_alpha0, v_alpha1));
2915 
2916         // convert to 16s
2917         v_X0 = v_reinterpret_as_s32(v_pack(v_X0 >> INTER_BITS, v_X1 >> INTER_BITS));
2918         v_X1 = v_reinterpret_as_s32(v_pack(v_X2 >> INTER_BITS, v_X3 >> INTER_BITS));
2919         v_Y0 = v_reinterpret_as_s32(v_pack(v_Y0 >> INTER_BITS, v_Y1 >> INTER_BITS));
2920         v_Y1 = v_reinterpret_as_s32(v_pack(v_Y2 >> INTER_BITS, v_Y3 >> INTER_BITS));
2921 
2922         v_store_interleave(xy + x1 * 2, (v_reinterpret_as_s16)(v_X0), (v_reinterpret_as_s16)(v_Y0));
2923         v_store_interleave(xy + x1 * 2 + 16, (v_reinterpret_as_s16)(v_X1), (v_reinterpret_as_s16)(v_Y1));
2924     }
2925 
2926     for( ; x1 < bw; x1++ )
2927     {
2928         double W = W0 + M[6]*x1;
2929         W = W ? INTER_TAB_SIZE/W : 0;
2930         double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
2931         double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
2932         int X = saturate_cast<int>(fX);
2933         int Y = saturate_cast<int>(fY);
2934 
2935         xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
2936         xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
2937         alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
2938                             (X & (INTER_TAB_SIZE-1)));
2939     }
2940 }
2941 #endif
2942 
2943 class WarpPerspectiveInvoker :
2944     public ParallelLoopBody
2945 {
2946 public:
WarpPerspectiveInvoker(const Mat & _src,Mat & _dst,const double * _M,int _interpolation,int _borderType,const Scalar & _borderValue)2947     WarpPerspectiveInvoker(const Mat &_src, Mat &_dst, const double *_M, int _interpolation,
2948                            int _borderType, const Scalar &_borderValue) :
2949         ParallelLoopBody(), src(_src), dst(_dst), M(_M), interpolation(_interpolation),
2950         borderType(_borderType), borderValue(_borderValue)
2951     {
2952 #if defined(_MSC_VER) && _MSC_VER == 1800 /* MSVS 2013 */ && CV_AVX
2953         // details: https://github.com/opencv/opencv/issues/11026
2954         borderValue.val[2] = _borderValue.val[2];
2955         borderValue.val[3] = _borderValue.val[3];
2956 #endif
2957     }
2958 
operator ()(const Range & range) const2959     virtual void operator() (const Range& range) const CV_OVERRIDE
2960     {
2961         const int BLOCK_SZ = 32;
2962         short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
2963         int x, y, y1, width = dst.cols, height = dst.rows;
2964 
2965         int bh0 = std::min(BLOCK_SZ/2, height);
2966         int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
2967         bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
2968 
2969         #if CV_TRY_SSE4_1
2970         Ptr<opt_SSE4_1::WarpPerspectiveLine_SSE4> pwarp_impl_sse4;
2971         if(CV_CPU_HAS_SUPPORT_SSE4_1)
2972             pwarp_impl_sse4 = opt_SSE4_1::WarpPerspectiveLine_SSE4::getImpl(M);
2973         #endif
2974 
2975         for( y = range.start; y < range.end; y += bh0 )
2976         {
2977             for( x = 0; x < width; x += bw0 )
2978             {
2979                 int bw = std::min( bw0, width - x);
2980                 int bh = std::min( bh0, range.end - y); // height
2981 
2982                 Mat _XY(bh, bw, CV_16SC2, XY);
2983                 Mat dpart(dst, Rect(x, y, bw, bh));
2984 
2985                 for( y1 = 0; y1 < bh; y1++ )
2986                 {
2987                     short* xy = XY + y1*bw*2;
2988                     double X0 = M[0]*x + M[1]*(y + y1) + M[2];
2989                     double Y0 = M[3]*x + M[4]*(y + y1) + M[5];
2990                     double W0 = M[6]*x + M[7]*(y + y1) + M[8];
2991 
2992                     if( interpolation == INTER_NEAREST )
2993                     {
2994                         #if CV_TRY_SSE4_1
2995                         if (pwarp_impl_sse4)
2996                             pwarp_impl_sse4->processNN(M, xy, X0, Y0, W0, bw);
2997                         else
2998                         #endif
2999                         #if CV_SIMD128_64F
3000                         WarpPerspectiveLine_ProcessNN_CV_SIMD(M, xy, X0, Y0, W0, bw);
3001                         #else
3002                         for( int x1 = 0; x1 < bw; x1++ )
3003                         {
3004                             double W = W0 + M[6]*x1;
3005                             W = W ? 1./W : 0;
3006                             double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
3007                             double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
3008                             int X = saturate_cast<int>(fX);
3009                             int Y = saturate_cast<int>(fY);
3010 
3011                             xy[x1*2] = saturate_cast<short>(X);
3012                             xy[x1*2+1] = saturate_cast<short>(Y);
3013                         }
3014                         #endif
3015                     }
3016                     else
3017                     {
3018                         short* alpha = A + y1*bw;
3019 
3020                         #if CV_TRY_SSE4_1
3021                         if (pwarp_impl_sse4)
3022                             pwarp_impl_sse4->process(M, xy, alpha, X0, Y0, W0, bw);
3023                         else
3024                         #endif
3025                         #if CV_SIMD128_64F
3026                         WarpPerspectiveLine_Process_CV_SIMD(M, xy, alpha, X0, Y0, W0, bw);
3027                         #else
3028                         for( int x1 = 0; x1 < bw; x1++ )
3029                         {
3030                             double W = W0 + M[6]*x1;
3031                             W = W ? INTER_TAB_SIZE/W : 0;
3032                             double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
3033                             double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
3034                             int X = saturate_cast<int>(fX);
3035                             int Y = saturate_cast<int>(fY);
3036 
3037                             xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
3038                             xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
3039                             alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
3040                                                 (X & (INTER_TAB_SIZE-1)));
3041                         }
3042                         #endif
3043                     }
3044                 }
3045 
3046                 if( interpolation == INTER_NEAREST )
3047                     remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
3048                 else
3049                 {
3050                     Mat _matA(bh, bw, CV_16U, A);
3051                     remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
3052                 }
3053             }
3054         }
3055     }
3056 
3057 private:
3058     Mat src;
3059     Mat dst;
3060     const double* M;
3061     int interpolation, borderType;
3062     Scalar borderValue;
3063 };
3064 
3065 #if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && !IPP_DISABLE_WARPPERSPECTIVE
3066 typedef IppStatus (CV_STDCALL* ippiWarpPerspectiveFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [3][3], int);
3067 
3068 class IPPWarpPerspectiveInvoker :
3069     public ParallelLoopBody
3070 {
3071 public:
IPPWarpPerspectiveInvoker(Mat & _src,Mat & _dst,double (& _coeffs)[3][3],int & _interpolation,int & _borderType,const Scalar & _borderValue,ippiWarpPerspectiveFunc _func,bool * _ok)3072     IPPWarpPerspectiveInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[3][3], int &_interpolation,
3073                               int &_borderType, const Scalar &_borderValue, ippiWarpPerspectiveFunc _func, bool *_ok) :
3074         ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs),
3075         borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok)
3076     {
3077         *ok = true;
3078     }
3079 
operator ()(const Range & range) const3080     virtual void operator() (const Range& range) const CV_OVERRIDE
3081     {
3082         IppiSize srcsize = {src.cols, src.rows};
3083         IppiRect srcroi = {0, 0, src.cols, src.rows};
3084         IppiRect dstroi = {0, range.start, dst.cols, range.end - range.start};
3085         int cnn = src.channels();
3086 
3087         if( borderType == BORDER_CONSTANT )
3088         {
3089             IppiSize setSize = {dst.cols, range.end - range.start};
3090             void *dataPointer = dst.ptr(range.start);
3091             if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) )
3092             {
3093                 *ok = false;
3094                 return;
3095             }
3096         }
3097 
3098         IppStatus status = CV_INSTRUMENT_FUN_IPP(func,(src.ptr();, srcsize, (int)src.step[0], srcroi, dst.ptr(), (int)dst.step[0], dstroi, coeffs, mode));
3099         if (status != ippStsNoErr)
3100             *ok = false;
3101         else
3102         {
3103             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
3104         }
3105     }
3106 private:
3107     Mat &src;
3108     Mat &dst;
3109     int mode;
3110     double (&coeffs)[3][3];
3111     int borderType;
3112     const Scalar borderValue;
3113     ippiWarpPerspectiveFunc func;
3114     bool *ok;
3115 
3116     const IPPWarpPerspectiveInvoker& operator= (const IPPWarpPerspectiveInvoker&);
3117 };
3118 #endif
3119 
3120 namespace hal {
3121 
warpPerspective(int src_type,const uchar * src_data,size_t src_step,int src_width,int src_height,uchar * dst_data,size_t dst_step,int dst_width,int dst_height,const double M[9],int interpolation,int borderType,const double borderValue[4])3122 void warpPerspective(int src_type,
3123                     const uchar * src_data, size_t src_step, int src_width, int src_height,
3124                     uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
3125                     const double M[9], int interpolation, int borderType, const double borderValue[4])
3126 {
3127     CALL_HAL(warpPerspective, cv_hal_warpPerspective, src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, M, interpolation, borderType, borderValue);
3128     Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
3129     Mat dst(Size(dst_width, dst_height), src_type, dst_data, dst_step);
3130 
3131     Range range(0, dst.rows);
3132     WarpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, Scalar(borderValue[0], borderValue[1], borderValue[2], borderValue[3]));
3133     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
3134 }
3135 
3136 } // hal::
3137 } // cv::
3138 
warpPerspective(InputArray _src,OutputArray _dst,InputArray _M0,Size dsize,int flags,int borderType,const Scalar & borderValue)3139 void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
3140                           Size dsize, int flags, int borderType, const Scalar& borderValue )
3141 {
3142     CV_INSTRUMENT_REGION();
3143 
3144     CV_Assert( _src.total() > 0 );
3145 
3146     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() &&
3147                _src.cols() <= SHRT_MAX && _src.rows() <= SHRT_MAX,
3148                ocl_warpTransform_cols4(_src, _dst, _M0, dsize, flags, borderType, borderValue,
3149                                        OCL_OP_PERSPECTIVE))
3150 
3151     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
3152                ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType, borderValue,
3153                               OCL_OP_PERSPECTIVE))
3154 
3155     Mat src = _src.getMat(), M0 = _M0.getMat();
3156     _dst.create( dsize.empty() ? src.size() : dsize, src.type() );
3157     Mat dst = _dst.getMat();
3158 
3159     if( dst.data == src.data )
3160         src = src.clone();
3161 
3162     double M[9];
3163     Mat matM(3, 3, CV_64F, M);
3164     int interpolation = flags & INTER_MAX;
3165     if( interpolation == INTER_AREA )
3166         interpolation = INTER_LINEAR;
3167 
3168     CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 3 && M0.cols == 3 );
3169     M0.convertTo(matM, matM.type());
3170 
3171 #if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && !IPP_DISABLE_WARPPERSPECTIVE
3172     CV_IPP_CHECK()
3173     {
3174         int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
3175         if( (depth == CV_8U || depth == CV_16U || depth == CV_32F) &&
3176            (cn == 1 || cn == 3 || cn == 4) &&
3177            ( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT ) &&
3178            (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC))
3179         {
3180             ippiWarpPerspectiveFunc ippFunc = 0;
3181             if ((flags & WARP_INVERSE_MAP) != 0)
3182             {
3183                 ippFunc = type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C1R :
3184                 type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C3R :
3185                 type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C4R :
3186                 type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C1R :
3187                 type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C3R :
3188                 type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C4R :
3189                 type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C1R :
3190                 type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C3R :
3191                 type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C4R : 0;
3192             }
3193             else
3194             {
3195                 ippFunc = type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C1R :
3196                 type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C3R :
3197                 type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C4R :
3198                 type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C1R :
3199                 type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C3R :
3200                 type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C4R :
3201                 type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C1R :
3202                 type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C3R :
3203                 type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C4R : 0;
3204             }
3205             int mode =
3206             interpolation == INTER_NEAREST ? IPPI_INTER_NN :
3207             interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR :
3208             interpolation == INTER_CUBIC ? IPPI_INTER_CUBIC : 0;
3209             CV_Assert(mode && ippFunc);
3210 
3211             double coeffs[3][3];
3212             for( int i = 0; i < 3; i++ )
3213                 for( int j = 0; j < 3; j++ )
3214                     coeffs[i][j] = matM.at<double>(i, j);
3215 
3216             bool ok;
3217             Range range(0, dst.rows);
3218             IPPWarpPerspectiveInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok);
3219             parallel_for_(range, invoker, dst.total()/(double)(1<<16));
3220             if( ok )
3221             {
3222                 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
3223                 return;
3224             }
3225             setIppErrorStatus();
3226         }
3227     }
3228 #endif
3229 
3230     if( !(flags & WARP_INVERSE_MAP) )
3231         invert(matM, matM);
3232 
3233     hal::warpPerspective(src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows,
3234                         matM.ptr<double>(), interpolation, borderType, borderValue.val);
3235 }
3236 
3237 
getRotationMatrix2D_(Point2f center,double angle,double scale)3238 cv::Matx23d cv::getRotationMatrix2D_(Point2f center, double angle, double scale)
3239 {
3240     CV_INSTRUMENT_REGION();
3241 
3242     angle *= CV_PI/180;
3243     double alpha = std::cos(angle)*scale;
3244     double beta = std::sin(angle)*scale;
3245 
3246     Matx23d M(
3247         alpha, beta, (1-alpha)*center.x - beta*center.y,
3248         -beta, alpha, beta*center.x + (1-alpha)*center.y
3249     );
3250     return M;
3251 }
3252 
3253 /* Calculates coefficients of perspective transformation
3254  * which maps (xi,yi) to (ui,vi), (i=1,2,3,4):
3255  *
3256  *      c00*xi + c01*yi + c02
3257  * ui = ---------------------
3258  *      c20*xi + c21*yi + c22
3259  *
3260  *      c10*xi + c11*yi + c12
3261  * vi = ---------------------
3262  *      c20*xi + c21*yi + c22
3263  *
3264  * Coefficients are calculated by solving linear system:
3265  * / x0 y0  1  0  0  0 -x0*u0 -y0*u0 \ /c00\ /u0\
3266  * | x1 y1  1  0  0  0 -x1*u1 -y1*u1 | |c01| |u1|
3267  * | x2 y2  1  0  0  0 -x2*u2 -y2*u2 | |c02| |u2|
3268  * | x3 y3  1  0  0  0 -x3*u3 -y3*u3 |.|c10|=|u3|,
3269  * |  0  0  0 x0 y0  1 -x0*v0 -y0*v0 | |c11| |v0|
3270  * |  0  0  0 x1 y1  1 -x1*v1 -y1*v1 | |c12| |v1|
3271  * |  0  0  0 x2 y2  1 -x2*v2 -y2*v2 | |c20| |v2|
3272  * \  0  0  0 x3 y3  1 -x3*v3 -y3*v3 / \c21/ \v3/
3273  *
3274  * where:
3275  *   cij - matrix coefficients, c22 = 1
3276  */
getPerspectiveTransform(const Point2f src[],const Point2f dst[],int solveMethod)3277 cv::Mat cv::getPerspectiveTransform(const Point2f src[], const Point2f dst[], int solveMethod)
3278 {
3279     CV_INSTRUMENT_REGION();
3280 
3281     Mat M(3, 3, CV_64F), X(8, 1, CV_64F, M.ptr());
3282     double a[8][8], b[8];
3283     Mat A(8, 8, CV_64F, a), B(8, 1, CV_64F, b);
3284 
3285     for( int i = 0; i < 4; ++i )
3286     {
3287         a[i][0] = a[i+4][3] = src[i].x;
3288         a[i][1] = a[i+4][4] = src[i].y;
3289         a[i][2] = a[i+4][5] = 1;
3290         a[i][3] = a[i][4] = a[i][5] =
3291         a[i+4][0] = a[i+4][1] = a[i+4][2] = 0;
3292         a[i][6] = -src[i].x*dst[i].x;
3293         a[i][7] = -src[i].y*dst[i].x;
3294         a[i+4][6] = -src[i].x*dst[i].y;
3295         a[i+4][7] = -src[i].y*dst[i].y;
3296         b[i] = dst[i].x;
3297         b[i+4] = dst[i].y;
3298     }
3299 
3300     solve(A, B, X, solveMethod);
3301     M.ptr<double>()[8] = 1.;
3302 
3303     return M;
3304 }
3305 
3306 /* Calculates coefficients of affine transformation
3307  * which maps (xi,yi) to (ui,vi), (i=1,2,3):
3308  *
3309  * ui = c00*xi + c01*yi + c02
3310  *
3311  * vi = c10*xi + c11*yi + c12
3312  *
3313  * Coefficients are calculated by solving linear system:
3314  * / x0 y0  1  0  0  0 \ /c00\ /u0\
3315  * | x1 y1  1  0  0  0 | |c01| |u1|
3316  * | x2 y2  1  0  0  0 | |c02| |u2|
3317  * |  0  0  0 x0 y0  1 | |c10| |v0|
3318  * |  0  0  0 x1 y1  1 | |c11| |v1|
3319  * \  0  0  0 x2 y2  1 / |c12| |v2|
3320  *
3321  * where:
3322  *   cij - matrix coefficients
3323  */
3324 
getAffineTransform(const Point2f src[],const Point2f dst[])3325 cv::Mat cv::getAffineTransform( const Point2f src[], const Point2f dst[] )
3326 {
3327     Mat M(2, 3, CV_64F), X(6, 1, CV_64F, M.ptr());
3328     double a[6*6], b[6];
3329     Mat A(6, 6, CV_64F, a), B(6, 1, CV_64F, b);
3330 
3331     for( int i = 0; i < 3; i++ )
3332     {
3333         int j = i*12;
3334         int k = i*12+6;
3335         a[j] = a[k+3] = src[i].x;
3336         a[j+1] = a[k+4] = src[i].y;
3337         a[j+2] = a[k+5] = 1;
3338         a[j+3] = a[j+4] = a[j+5] = 0;
3339         a[k] = a[k+1] = a[k+2] = 0;
3340         b[i*2] = dst[i].x;
3341         b[i*2+1] = dst[i].y;
3342     }
3343 
3344     solve( A, B, X );
3345     return M;
3346 }
3347 
invertAffineTransform(InputArray _matM,OutputArray __iM)3348 void cv::invertAffineTransform(InputArray _matM, OutputArray __iM)
3349 {
3350     Mat matM = _matM.getMat();
3351     CV_Assert(matM.rows == 2 && matM.cols == 3);
3352     __iM.create(2, 3, matM.type());
3353     Mat _iM = __iM.getMat();
3354 
3355     if( matM.type() == CV_32F )
3356     {
3357         const softfloat* M = matM.ptr<softfloat>();
3358         softfloat* iM = _iM.ptr<softfloat>();
3359         int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0]));
3360 
3361         softdouble D = M[0]*M[step+1] - M[1]*M[step];
3362         D = D != 0. ? softdouble(1.)/D : softdouble(0.);
3363         softdouble A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D;
3364         softdouble b1 = -A11*M[2] - A12*M[step+2];
3365         softdouble b2 = -A21*M[2] - A22*M[step+2];
3366 
3367         iM[0] = A11; iM[1] = A12; iM[2] = b1;
3368         iM[istep] = A21; iM[istep+1] = A22; iM[istep+2] = b2;
3369     }
3370     else if( matM.type() == CV_64F )
3371     {
3372         const softdouble* M = matM.ptr<softdouble>();
3373         softdouble* iM = _iM.ptr<softdouble>();
3374         int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0]));
3375 
3376         softdouble D = M[0]*M[step+1] - M[1]*M[step];
3377         D = D != 0. ? softdouble(1.)/D : softdouble(0.);
3378         softdouble A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D;
3379         softdouble b1 = -A11*M[2] - A12*M[step+2];
3380         softdouble b2 = -A21*M[2] - A22*M[step+2];
3381 
3382         iM[0] = A11; iM[1] = A12; iM[2] = b1;
3383         iM[istep] = A21; iM[istep+1] = A22; iM[istep+2] = b2;
3384     }
3385     else
3386         CV_Error( CV_StsUnsupportedFormat, "" );
3387 }
3388 
getPerspectiveTransform(InputArray _src,InputArray _dst,int solveMethod)3389 cv::Mat cv::getPerspectiveTransform(InputArray _src, InputArray _dst, int solveMethod)
3390 {
3391     Mat src = _src.getMat(), dst = _dst.getMat();
3392     CV_Assert(src.checkVector(2, CV_32F) == 4 && dst.checkVector(2, CV_32F) == 4);
3393     return getPerspectiveTransform((const Point2f*)src.data, (const Point2f*)dst.data, solveMethod);
3394 }
3395 
getAffineTransform(InputArray _src,InputArray _dst)3396 cv::Mat cv::getAffineTransform(InputArray _src, InputArray _dst)
3397 {
3398     Mat src = _src.getMat(), dst = _dst.getMat();
3399     CV_Assert(src.checkVector(2, CV_32F) == 3 && dst.checkVector(2, CV_32F) == 3);
3400     return getAffineTransform((const Point2f*)src.data, (const Point2f*)dst.data);
3401 }
3402 
3403 CV_IMPL void
cvWarpAffine(const CvArr * srcarr,CvArr * dstarr,const CvMat * marr,int flags,CvScalar fillval)3404 cvWarpAffine( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr,
3405               int flags, CvScalar fillval )
3406 {
3407     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
3408     cv::Mat matrix = cv::cvarrToMat(marr);
3409     CV_Assert( src.type() == dst.type() );
3410     cv::warpAffine( src, dst, matrix, dst.size(), flags,
3411         (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
3412         fillval );
3413 }
3414 
3415 CV_IMPL void
cvWarpPerspective(const CvArr * srcarr,CvArr * dstarr,const CvMat * marr,int flags,CvScalar fillval)3416 cvWarpPerspective( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr,
3417                    int flags, CvScalar fillval )
3418 {
3419     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
3420     cv::Mat matrix = cv::cvarrToMat(marr);
3421     CV_Assert( src.type() == dst.type() );
3422     cv::warpPerspective( src, dst, matrix, dst.size(), flags,
3423         (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
3424         fillval );
3425 }
3426 
3427 CV_IMPL void
cvRemap(const CvArr * srcarr,CvArr * dstarr,const CvArr * _mapx,const CvArr * _mapy,int flags,CvScalar fillval)3428 cvRemap( const CvArr* srcarr, CvArr* dstarr,
3429          const CvArr* _mapx, const CvArr* _mapy,
3430          int flags, CvScalar fillval )
3431 {
3432     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), dst0 = dst;
3433     cv::Mat mapx = cv::cvarrToMat(_mapx), mapy = cv::cvarrToMat(_mapy);
3434     CV_Assert( src.type() == dst.type() && dst.size() == mapx.size() );
3435     cv::remap( src, dst, mapx, mapy, flags & cv::INTER_MAX,
3436         (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
3437         fillval );
3438     CV_Assert( dst0.data == dst.data );
3439 }
3440 
3441 
3442 CV_IMPL CvMat*
cv2DRotationMatrix(CvPoint2D32f center,double angle,double scale,CvMat * matrix)3443 cv2DRotationMatrix( CvPoint2D32f center, double angle,
3444                     double scale, CvMat* matrix )
3445 {
3446     cv::Mat M0 = cv::cvarrToMat(matrix), M = cv::getRotationMatrix2D(center, angle, scale);
3447     CV_Assert( M.size() == M0.size() );
3448     M.convertTo(M0, M0.type());
3449     return matrix;
3450 }
3451 
3452 
3453 CV_IMPL CvMat*
cvGetPerspectiveTransform(const CvPoint2D32f * src,const CvPoint2D32f * dst,CvMat * matrix)3454 cvGetPerspectiveTransform( const CvPoint2D32f* src,
3455                           const CvPoint2D32f* dst,
3456                           CvMat* matrix )
3457 {
3458     cv::Mat M0 = cv::cvarrToMat(matrix),
3459         M = cv::getPerspectiveTransform((const cv::Point2f*)src, (const cv::Point2f*)dst);
3460     CV_Assert( M.size() == M0.size() );
3461     M.convertTo(M0, M0.type());
3462     return matrix;
3463 }
3464 
3465 
3466 CV_IMPL CvMat*
cvGetAffineTransform(const CvPoint2D32f * src,const CvPoint2D32f * dst,CvMat * matrix)3467 cvGetAffineTransform( const CvPoint2D32f* src,
3468                           const CvPoint2D32f* dst,
3469                           CvMat* matrix )
3470 {
3471     cv::Mat M0 = cv::cvarrToMat(matrix),
3472         M = cv::getAffineTransform((const cv::Point2f*)src, (const cv::Point2f*)dst);
3473     CV_Assert( M.size() == M0.size() );
3474     M.convertTo(M0, M0.type());
3475     return matrix;
3476 }
3477 
3478 
3479 CV_IMPL void
cvConvertMaps(const CvArr * arr1,const CvArr * arr2,CvArr * dstarr1,CvArr * dstarr2)3480 cvConvertMaps( const CvArr* arr1, const CvArr* arr2, CvArr* dstarr1, CvArr* dstarr2 )
3481 {
3482     cv::Mat map1 = cv::cvarrToMat(arr1), map2;
3483     cv::Mat dstmap1 = cv::cvarrToMat(dstarr1), dstmap2;
3484 
3485     if( arr2 )
3486         map2 = cv::cvarrToMat(arr2);
3487     if( dstarr2 )
3488     {
3489         dstmap2 = cv::cvarrToMat(dstarr2);
3490         if( dstmap2.type() == CV_16SC1 )
3491             dstmap2 = cv::Mat(dstmap2.size(), CV_16UC1, dstmap2.ptr(), dstmap2.step);
3492     }
3493 
3494     cv::convertMaps( map1, map2, dstmap1, dstmap2, dstmap1.type(), false );
3495 }
3496 
3497 /****************************************************************************************
3498 PkLab.net 2018 based on cv::linearPolar from OpenCV by J.L. Blanco, Apr 2009
3499 ****************************************************************************************/
warpPolar(InputArray _src,OutputArray _dst,Size dsize,Point2f center,double maxRadius,int flags)3500 void cv::warpPolar(InputArray _src, OutputArray _dst, Size dsize,
3501                    Point2f center, double maxRadius, int flags)
3502 {
3503     // if dest size is empty given than calculate using proportional setting
3504     // thus we calculate needed angles to keep same area as bounding circle
3505     if ((dsize.width <= 0) && (dsize.height <= 0))
3506     {
3507         dsize.width = cvRound(maxRadius);
3508         dsize.height = cvRound(maxRadius * CV_PI);
3509     }
3510     else if (dsize.height <= 0)
3511     {
3512         dsize.height = cvRound(dsize.width * CV_PI);
3513     }
3514 
3515     Mat mapx, mapy;
3516     mapx.create(dsize, CV_32F);
3517     mapy.create(dsize, CV_32F);
3518     bool semiLog = (flags & WARP_POLAR_LOG) != 0;
3519 
3520     if (!(flags & CV_WARP_INVERSE_MAP))
3521     {
3522         CV_Assert(!dsize.empty());
3523         double Kangle = CV_2PI / dsize.height;
3524         int phi, rho;
3525 
3526         // precalculate scaled rho
3527         Mat rhos = Mat(1, dsize.width, CV_32F);
3528         float* bufRhos = (float*)(rhos.data);
3529         if (semiLog)
3530         {
3531             double Kmag = std::log(maxRadius) / dsize.width;
3532             for (rho = 0; rho < dsize.width; rho++)
3533                 bufRhos[rho] = (float)(std::exp(rho * Kmag) - 1.0);
3534 
3535         }
3536         else
3537         {
3538             double Kmag = maxRadius / dsize.width;
3539             for (rho = 0; rho < dsize.width; rho++)
3540                 bufRhos[rho] = (float)(rho * Kmag);
3541         }
3542 
3543         for (phi = 0; phi < dsize.height; phi++)
3544         {
3545             double KKy = Kangle * phi;
3546             double cp = std::cos(KKy);
3547             double sp = std::sin(KKy);
3548             float* mx = (float*)(mapx.data + phi*mapx.step);
3549             float* my = (float*)(mapy.data + phi*mapy.step);
3550 
3551             for (rho = 0; rho < dsize.width; rho++)
3552             {
3553                 double x = bufRhos[rho] * cp + center.x;
3554                 double y = bufRhos[rho] * sp + center.y;
3555 
3556                 mx[rho] = (float)x;
3557                 my[rho] = (float)y;
3558             }
3559         }
3560         remap(_src, _dst, mapx, mapy, flags & cv::INTER_MAX, (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
3561     }
3562     else
3563     {
3564         const int ANGLE_BORDER = 1;
3565         cv::copyMakeBorder(_src, _dst, ANGLE_BORDER, ANGLE_BORDER, 0, 0, BORDER_WRAP);
3566         Mat src = _dst.getMat();
3567         Size ssize = _dst.size();
3568         ssize.height -= 2 * ANGLE_BORDER;
3569         CV_Assert(!ssize.empty());
3570         const double Kangle = CV_2PI / ssize.height;
3571         double Kmag;
3572         if (semiLog)
3573             Kmag = std::log(maxRadius) / ssize.width;
3574         else
3575             Kmag = maxRadius / ssize.width;
3576 
3577         int x, y;
3578         Mat bufx, bufy, bufp, bufa;
3579 
3580         bufx = Mat(1, dsize.width, CV_32F);
3581         bufy = Mat(1, dsize.width, CV_32F);
3582         bufp = Mat(1, dsize.width, CV_32F);
3583         bufa = Mat(1, dsize.width, CV_32F);
3584 
3585         for (x = 0; x < dsize.width; x++)
3586             bufx.at<float>(0, x) = (float)x - center.x;
3587 
3588         for (y = 0; y < dsize.height; y++)
3589         {
3590             float* mx = (float*)(mapx.data + y*mapx.step);
3591             float* my = (float*)(mapy.data + y*mapy.step);
3592 
3593             for (x = 0; x < dsize.width; x++)
3594                 bufy.at<float>(0, x) = (float)y - center.y;
3595 
3596             cartToPolar(bufx, bufy, bufp, bufa, 0);
3597 
3598             if (semiLog)
3599             {
3600                 bufp += 1.f;
3601                 log(bufp, bufp);
3602             }
3603 
3604             for (x = 0; x < dsize.width; x++)
3605             {
3606                 double rho = bufp.at<float>(0, x) / Kmag;
3607                 double phi = bufa.at<float>(0, x) / Kangle;
3608                 mx[x] = (float)rho;
3609                 my[x] = (float)phi + ANGLE_BORDER;
3610             }
3611         }
3612         remap(src, _dst, mapx, mapy, flags & cv::INTER_MAX,
3613               (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
3614     }
3615 }
3616 
linearPolar(InputArray _src,OutputArray _dst,Point2f center,double maxRadius,int flags)3617 void cv::linearPolar( InputArray _src, OutputArray _dst,
3618                       Point2f center, double maxRadius, int flags )
3619 {
3620     warpPolar(_src, _dst, _src.size(), center, maxRadius, flags & ~WARP_POLAR_LOG);
3621 }
3622 
logPolar(InputArray _src,OutputArray _dst,Point2f center,double maxRadius,int flags)3623 void cv::logPolar( InputArray _src, OutputArray _dst,
3624                    Point2f center, double maxRadius, int flags )
3625 {
3626     Size ssize = _src.size();
3627     double M = maxRadius > 0 ? std::exp(ssize.width / maxRadius) : 1;
3628     warpPolar(_src, _dst, ssize, center, M, flags | WARP_POLAR_LOG);
3629 }
3630 
3631 CV_IMPL
cvLinearPolar(const CvArr * srcarr,CvArr * dstarr,CvPoint2D32f center,double maxRadius,int flags)3632 void cvLinearPolar( const CvArr* srcarr, CvArr* dstarr,
3633                     CvPoint2D32f center, double maxRadius, int flags )
3634 {
3635     Mat src = cvarrToMat(srcarr);
3636     Mat dst = cvarrToMat(dstarr);
3637 
3638     CV_Assert(src.size == dst.size);
3639     CV_Assert(src.type() == dst.type());
3640 
3641     cv::linearPolar(src, dst, center, maxRadius, flags);
3642 }
3643 
3644 CV_IMPL
cvLogPolar(const CvArr * srcarr,CvArr * dstarr,CvPoint2D32f center,double M,int flags)3645 void cvLogPolar( const CvArr* srcarr, CvArr* dstarr,
3646                  CvPoint2D32f center, double M, int flags )
3647 {
3648     Mat src = cvarrToMat(srcarr);
3649     Mat dst = cvarrToMat(dstarr);
3650 
3651     CV_Assert(src.size == dst.size);
3652     CV_Assert(src.type() == dst.type());
3653 
3654     cv::logPolar(src, dst, center, M, flags);
3655 }
3656 
3657 /* End of file. */
3658