/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* ////////////////////////////////////////////////////////////////////
//
//  Geometrical transforms on images and matrices: rotation, zoom etc.
//
// */

#include "precomp.hpp"
#include "imgwarp.hpp"

namespace cv
{
namespace opt_SSE4_1
{

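// Converts a pair of single-channel float maps (x coordinates in src1f, y coordinates
// in src2f) into one interleaved 16-bit x/y map for nearest-neighbour remapping:
// 16 pixels per iteration via SSE packing and interleaving, with a scalar tail loop.
// (Presumably dispatched from cv::convertMaps when SSE4.1 is available.)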
void convertMaps_nninterpolate32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, int width)
{
    int x = 0;
    for (; x <= width - 16; x += 16)
    {
        __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)),
            _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4)));
        __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 8)),
            _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 12)));

        __m128i v_dst2 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x)),
            _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 4)));
        __m128i v_dst3 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 8)),
            _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 12)));

        _mm_interleave_epi16(v_dst0, v_dst1, v_dst2, v_dst3);

        _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst0);
        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst1);
        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst2);
        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst3);
    }

    for (; x < width; x++)
    {
        dst1[x * 2] = saturate_cast<short>(src1f[x]);
        dst1[x * 2 + 1] = saturate_cast<short>(src2f[x]);
    }
}

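// Converts separate float x/y maps into the fixed-point representation used for
// interpolated remapping: dst1 receives the interleaved integer pixel coordinates
// (coordinate * INTER_TAB_SIZE shifted right by INTER_BITS), and dst2 receives the
// packed fractional part, (iy & (INTER_TAB_SIZE-1)) * INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)).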
void convertMaps_32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, ushort* dst2, int width)
{
    int x = 0;
    __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
    __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);

    for (; x <= width - 16; x += 16)
    {
        __m128i v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x), v_its));
        __m128i v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 4), v_its));
        __m128i v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x), v_its));
        __m128i v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 4), v_its));

        __m128i v_dst10 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
            _mm_srai_epi32(v_ix1, INTER_BITS));
        __m128i v_dst12 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
            _mm_srai_epi32(v_iy1, INTER_BITS));
        __m128i v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
            _mm_and_si128(v_ix0, v_its1));
        __m128i v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
            _mm_and_si128(v_ix1, v_its1));
        _mm_storeu_si128((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst20, v_dst21));

        v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 8), v_its));
        v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 12), v_its));
        v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 8), v_its));
        v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 12), v_its));

        __m128i v_dst11 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
            _mm_srai_epi32(v_ix1, INTER_BITS));
        __m128i v_dst13 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
            _mm_srai_epi32(v_iy1, INTER_BITS));
        v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
            _mm_and_si128(v_ix0, v_its1));
        v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
            _mm_and_si128(v_ix1, v_its1));
        _mm_storeu_si128((__m128i *)(dst2 + x + 8), _mm_packus_epi32(v_dst20, v_dst21));

        _mm_interleave_epi16(v_dst10, v_dst11, v_dst12, v_dst13);

        _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst10);
        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst11);
        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst12);
        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst13);
    }
    for (; x < width; x++)
    {
        int ix = saturate_cast<int>(src1f[x] * INTER_TAB_SIZE);
        int iy = saturate_cast<int>(src2f[x] * INTER_TAB_SIZE);
        dst1[x * 2] = saturate_cast<short>(ix >> INTER_BITS);
        dst1[x * 2 + 1] = saturate_cast<short>(iy >> INTER_BITS);
        dst2[x] = (ushort)((iy & (INTER_TAB_SIZE - 1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE - 1)));
    }
}

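// Same conversion as above, but the input map is already interleaved two-channel float
// (x0 y0 x1 y1 ...): integer coordinates go to dst1, packed fractional offsets to dst2,
// 4 pixels per vector iteration.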
void convertMaps_32f2c16s_SSE41(const float* src1f, short* dst1, ushort* dst2, int width)
{
    int x = 0;
    __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
    __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);
    __m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE - 1) << 16);

    for (; x <= width - 4; x += 4)
    {
        __m128i v_src0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2), v_its));
        __m128i v_src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2 + 4), v_its));

        __m128i v_dst1 = _mm_packs_epi32(_mm_srai_epi32(v_src0, INTER_BITS),
            _mm_srai_epi32(v_src1, INTER_BITS));
        _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst1);

        // x0 y0 x1 y1 . . .
        v_src0 = _mm_packs_epi32(_mm_and_si128(v_src0, v_its1),
            _mm_and_si128(v_src1, v_its1));
        __m128i v_dst2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(v_src0, v_y_mask), 16 - INTER_BITS), // y0 0 y1 0 . . .
            _mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . .
        _mm_storel_epi64((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2));
    }
    for (; x < width; x++)
    {
        int ix = saturate_cast<int>(src1f[x * 2] * INTER_TAB_SIZE);
        int iy = saturate_cast<int>(src1f[x * 2 + 1] * INTER_TAB_SIZE);
        dst1[x * 2] = saturate_cast<short>(ix >> INTER_BITS);
        dst1[x * 2 + 1] = saturate_cast<short>(iy >> INTER_BITS);
        dst2[x] = (ushort)((iy & (INTER_TAB_SIZE - 1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE - 1)));
    }
}

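// Computes one block line of an affine warp: for each offset x1 it forms
// (X0 + adelta[x1]) >> AB_BITS and (Y0 + bdelta[x1]) >> AB_BITS and stores them as
// interleaved 16-bit x/y coordinates in xy, 16 pixels per iteration plus a scalar tail.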
void WarpAffineInvoker_Blockline_SSE41(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw)
{
    const int AB_BITS = MAX(10, (int)INTER_BITS);
    int x1 = 0;

    __m128i v_X0 = _mm_set1_epi32(X0);
    __m128i v_Y0 = _mm_set1_epi32(Y0);
    for (; x1 <= bw - 16; x1 += 16)
    {
        __m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1))), AB_BITS),
            _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 4))), AB_BITS));
        __m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 8))), AB_BITS),
            _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 12))), AB_BITS));

        __m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1))), AB_BITS),
            _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 4))), AB_BITS));
        __m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 8))), AB_BITS),
            _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 12))), AB_BITS));

        _mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1);

        _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0);
        _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1);
        _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0);
        _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1);
    }
    for (; x1 < bw; x1++)
    {
        int X = (X0 + adelta[x1]) >> AB_BITS;
        int Y = (Y0 + bdelta[x1]) >> AB_BITS;
        xy[x1 * 2] = saturate_cast<short>(X);
        xy[x1 * 2 + 1] = saturate_cast<short>(Y);
    }
}


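// SSE4.1 implementation of the per-line perspective-warp coordinate computation.
// The matrix is passed again to every process call, so the constructor only
// silences the unused-parameter warning.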
class WarpPerspectiveLine_SSE4_Impl CV_FINAL : public WarpPerspectiveLine_SSE4
{
public:
    WarpPerspectiveLine_SSE4_Impl(const double *M)
    {
        CV_UNUSED(M);
    }
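    // Nearest-neighbour variant: for each x1 computes W = W0 + M[6]*x1,
    // X = (X0 + M[0]*x1)/W and Y = (Y0 + M[3]*x1)/W (W == 0 maps to 0), clamps to
    // [INT_MIN, INT_MAX] and stores the results as interleaved 16-bit coordinates in xy.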
    virtual void processNN(const double *M, short* xy, double X0, double Y0, double W0, int bw) CV_OVERRIDE
    {
        const __m128d v_M0 = _mm_set1_pd(M[0]);
        const __m128d v_M3 = _mm_set1_pd(M[3]);
        const __m128d v_M6 = _mm_set1_pd(M[6]);
        const __m128d v_intmax = _mm_set1_pd((double)INT_MAX);
        const __m128d v_intmin = _mm_set1_pd((double)INT_MIN);
        const __m128d v_2 = _mm_set1_pd(2);
        const __m128d v_zero = _mm_setzero_pd();
        const __m128d v_1 = _mm_set1_pd(1);

        int x1 = 0;
        __m128d v_X0d = _mm_set1_pd(X0);
        __m128d v_Y0d = _mm_set1_pd(Y0);
        __m128d v_W0 = _mm_set1_pd(W0);
        __m128d v_x1 = _mm_set_pd(1, 0);

        for (; x1 <= bw - 16; x1 += 16)
        {
            // 0-3
            __m128i v_X0, v_Y0;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                    _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                    _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 4-7
            __m128i v_X1, v_Y1;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                    _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                    _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 8-11
            __m128i v_X2, v_Y2;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                    _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                    _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 12-15
            __m128i v_X3, v_Y3;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                    _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                    _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // convert to 16s
            v_X0 = _mm_packs_epi32(v_X0, v_X1);
            v_X1 = _mm_packs_epi32(v_X2, v_X3);
            v_Y0 = _mm_packs_epi32(v_Y0, v_Y1);
            v_Y1 = _mm_packs_epi32(v_Y2, v_Y3);

            _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);

            _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
            _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
            _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
            _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
        }

        for (; x1 < bw; x1++)
        {
            double W = W0 + M[6] * x1;
            W = W ? 1. / W : 0;
            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1)*W));
            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1)*W));
            int X = saturate_cast<int>(fX);
            int Y = saturate_cast<int>(fY);

            xy[x1 * 2] = saturate_cast<short>(X);
            xy[x1 * 2 + 1] = saturate_cast<short>(Y);
        }
    }
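    // Bilinear variant: like processNN, but scales by INTER_TAB_SIZE/W, stores the
    // coordinates shifted right by INTER_BITS in xy, and writes the packed fractional
    // part (the interpolation table index) to alpha.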
    virtual void process(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw) CV_OVERRIDE
    {
        const __m128d v_M0 = _mm_set1_pd(M[0]);
        const __m128d v_M3 = _mm_set1_pd(M[3]);
        const __m128d v_M6 = _mm_set1_pd(M[6]);
        const __m128d v_intmax = _mm_set1_pd((double)INT_MAX);
        const __m128d v_intmin = _mm_set1_pd((double)INT_MIN);
        const __m128d v_2 = _mm_set1_pd(2);
        const __m128d v_zero = _mm_setzero_pd();
        const __m128d v_its = _mm_set1_pd(INTER_TAB_SIZE);
        const __m128i v_itsi1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);

        int x1 = 0;

        __m128d v_X0d = _mm_set1_pd(X0);
        __m128d v_Y0d = _mm_set1_pd(Y0);
        __m128d v_W0 = _mm_set1_pd(W0);
        __m128d v_x1 = _mm_set_pd(1, 0);

        for (; x1 <= bw - 16; x1 += 16)
        {
            // 0-3
            __m128i v_X0, v_Y0;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                    _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                    _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 4-7
            __m128i v_X1, v_Y1;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                    _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                    _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 8-11
            __m128i v_X2, v_Y2;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                    _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                    _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 12-15
            __m128i v_X3, v_Y3;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                    _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                    _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // store alpha
            __m128i v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y0, v_itsi1), INTER_BITS),
                _mm_and_si128(v_X0, v_itsi1));
            __m128i v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y1, v_itsi1), INTER_BITS),
                _mm_and_si128(v_X1, v_itsi1));
            _mm_storeu_si128((__m128i *)(alpha + x1), _mm_packs_epi32(v_alpha0, v_alpha1));

            v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y2, v_itsi1), INTER_BITS),
                _mm_and_si128(v_X2, v_itsi1));
            v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y3, v_itsi1), INTER_BITS),
                _mm_and_si128(v_X3, v_itsi1));
            _mm_storeu_si128((__m128i *)(alpha + x1 + 8), _mm_packs_epi32(v_alpha0, v_alpha1));

            // convert to 16s
            v_X0 = _mm_packs_epi32(_mm_srai_epi32(v_X0, INTER_BITS), _mm_srai_epi32(v_X1, INTER_BITS));
            v_X1 = _mm_packs_epi32(_mm_srai_epi32(v_X2, INTER_BITS), _mm_srai_epi32(v_X3, INTER_BITS));
            v_Y0 = _mm_packs_epi32(_mm_srai_epi32(v_Y0, INTER_BITS), _mm_srai_epi32(v_Y1, INTER_BITS));
            v_Y1 = _mm_packs_epi32(_mm_srai_epi32(v_Y2, INTER_BITS), _mm_srai_epi32(v_Y3, INTER_BITS));

            _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);

            _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
            _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
            _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
            _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
        }
        for (; x1 < bw; x1++)
        {
            double W = W0 + M[6] * x1;
            W = W ? INTER_TAB_SIZE / W : 0;
            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1)*W));
            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1)*W));
            int X = saturate_cast<int>(fX);
            int Y = saturate_cast<int>(fY);

            xy[x1 * 2] = saturate_cast<short>(X >> INTER_BITS);
            xy[x1 * 2 + 1] = saturate_cast<short>(Y >> INTER_BITS);
            alpha[x1] = (short)((Y & (INTER_TAB_SIZE - 1))*INTER_TAB_SIZE +
                (X & (INTER_TAB_SIZE - 1)));
        }
    }
    virtual ~WarpPerspectiveLine_SSE4_Impl() CV_OVERRIDE {}
};

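// Factory used to obtain the SSE4.1 line processor. A minimal usage sketch
// (hypothetical call site, not part of this file):
//   Ptr<WarpPerspectiveLine_SSE4> impl = WarpPerspectiveLine_SSE4::getImpl(M);
//   impl->process(M, xy, alpha, X0, Y0, W0, bw);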
Ptr<WarpPerspectiveLine_SSE4> WarpPerspectiveLine_SSE4::getImpl(const double *M)
{
    return Ptr<WarpPerspectiveLine_SSE4>(new WarpPerspectiveLine_SSE4_Impl(M));
}

}  // namespace opt_SSE4_1
}  // namespace cv
/* End of file. */