/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* ////////////////////////////////////////////////////////////////////
//
// Geometrical transforms on images and matrices: rotation, zoom etc.
//
// */

#include "precomp.hpp"
#include "imgwarp.hpp"

namespace cv
{
namespace opt_SSE4_1
{

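// Convert a pair of single-channel float maps (x coordinates in src1f, y
// coordinates in src2f) to one interleaved 16-bit integer map for
// nearest-neighbour remap. Handles 16 pixels per iteration with SSE4.1,
// then finishes the remainder in scalar code.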
void convertMaps_nninterpolate32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, int width)
{
    int x = 0;
    for (; x <= width - 16; x += 16)
    {
        __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)),
                                         _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4)));
        __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 8)),
                                         _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 12)));

        __m128i v_dst2 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x)),
                                         _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 4)));
        __m128i v_dst3 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 8)),
                                         _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 12)));

        _mm_interleave_epi16(v_dst0, v_dst1, v_dst2, v_dst3);

        _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst0);
        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst1);
        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst2);
        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst3);
    }

    for (; x < width; x++)
    {
        dst1[x * 2] = saturate_cast<short>(src1f[x]);
        dst1[x * 2 + 1] = saturate_cast<short>(src2f[x]);
    }
}

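// Convert two single-channel float maps to the fixed-point remap format:
// dst1 receives the interleaved integer parts of the coordinates, dst2 the
// packed fractional parts, i.e. indices into the
// INTER_TAB_SIZE x INTER_TAB_SIZE interpolation table.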
void convertMaps_32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, ushort* dst2, int width)
{
    int x = 0;
    __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
    __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);

    for (; x <= width - 16; x += 16)
    {
        __m128i v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x), v_its));
        __m128i v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 4), v_its));
        __m128i v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x), v_its));
        __m128i v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 4), v_its));

        __m128i v_dst10 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
                                          _mm_srai_epi32(v_ix1, INTER_BITS));
        __m128i v_dst12 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
                                          _mm_srai_epi32(v_iy1, INTER_BITS));
        __m128i v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
                                        _mm_and_si128(v_ix0, v_its1));
        __m128i v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
                                        _mm_and_si128(v_ix1, v_its1));
        _mm_storeu_si128((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst20, v_dst21));

        v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 8), v_its));
        v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 12), v_its));
        v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 8), v_its));
        v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 12), v_its));

        __m128i v_dst11 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
                                          _mm_srai_epi32(v_ix1, INTER_BITS));
        __m128i v_dst13 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
                                          _mm_srai_epi32(v_iy1, INTER_BITS));
        v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
                                _mm_and_si128(v_ix0, v_its1));
        v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
                                _mm_and_si128(v_ix1, v_its1));
        _mm_storeu_si128((__m128i *)(dst2 + x + 8), _mm_packus_epi32(v_dst20, v_dst21));

        _mm_interleave_epi16(v_dst10, v_dst11, v_dst12, v_dst13);

        _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst10);
        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst11);
        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst12);
        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst13);
    }
    for (; x < width; x++)
    {
        int ix = saturate_cast<int>(src1f[x] * INTER_TAB_SIZE);
        int iy = saturate_cast<int>(src2f[x] * INTER_TAB_SIZE);
        dst1[x * 2] = saturate_cast<short>(ix >> INTER_BITS);
        dst1[x * 2 + 1] = saturate_cast<short>(iy >> INTER_BITS);
        dst2[x] = (ushort)((iy & (INTER_TAB_SIZE - 1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE - 1)));
    }
}

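// Same conversion as above, but for a two-channel float map whose x and y
// coordinates are already interleaved in src1f. Handles 4 pixels per
// iteration.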
void convertMaps_32f2c16s_SSE41(const float* src1f, short* dst1, ushort* dst2, int width)
{
    int x = 0;
    __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
    __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);
    __m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE - 1) << 16);

    for (; x <= width - 4; x += 4)
    {
        __m128i v_src0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2), v_its));
        __m128i v_src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2 + 4), v_its));

        __m128i v_dst1 = _mm_packs_epi32(_mm_srai_epi32(v_src0, INTER_BITS),
                                         _mm_srai_epi32(v_src1, INTER_BITS));
        _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst1);

        // x0 y0 x1 y1 . . .
        v_src0 = _mm_packs_epi32(_mm_and_si128(v_src0, v_its1),
                                 _mm_and_si128(v_src1, v_its1));
        __m128i v_dst2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(v_src0, v_y_mask), 16 - INTER_BITS), // y0 0 y1 0 . . .
                                      _mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . .
        _mm_storel_epi64((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2));
    }
    for (; x < width; x++)
    {
        int ix = saturate_cast<int>(src1f[x * 2] * INTER_TAB_SIZE);
        int iy = saturate_cast<int>(src1f[x * 2 + 1] * INTER_TAB_SIZE);
        dst1[x * 2] = saturate_cast<short>(ix >> INTER_BITS);
        dst1[x * 2 + 1] = saturate_cast<short>(iy >> INTER_BITS);
        dst2[x] = (ushort)((iy & (INTER_TAB_SIZE - 1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE - 1)));
    }
}

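// Compute one block row of integer source coordinates for warpAffine:
// adelta and bdelta hold precomputed fixed-point increments with AB_BITS
// fractional bits; (X0 + adelta[x]) >> AB_BITS and (Y0 + bdelta[x]) >> AB_BITS
// are stored interleaved in xy as saturated 16-bit shorts.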
void WarpAffineInvoker_Blockline_SSE41(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw)
{
    const int AB_BITS = MAX(10, (int)INTER_BITS);
    int x1 = 0;

    __m128i v_X0 = _mm_set1_epi32(X0);
    __m128i v_Y0 = _mm_set1_epi32(Y0);
    for (; x1 <= bw - 16; x1 += 16)
    {
        __m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1))), AB_BITS),
                                       _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 4))), AB_BITS));
        __m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 8))), AB_BITS),
                                       _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 12))), AB_BITS));

        __m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1))), AB_BITS),
                                       _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 4))), AB_BITS));
        __m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 8))), AB_BITS),
                                       _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 12))), AB_BITS));

        _mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1);

        _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0);
        _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1);
        _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0);
        _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1);
    }
    for (; x1 < bw; x1++)
    {
        int X = (X0 + adelta[x1]) >> AB_BITS;
        int Y = (Y0 + bdelta[x1]) >> AB_BITS;
        xy[x1 * 2] = saturate_cast<short>(X);
        xy[x1 * 2 + 1] = saturate_cast<short>(Y);
    }
}


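// SSE4.1 implementation of the per-row coordinate computation used by
// warpPerspective. The matrix is passed to every call, so the constructor
// has nothing to cache.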
class WarpPerspectiveLine_SSE4_Impl CV_FINAL : public WarpPerspectiveLine_SSE4
{
public:
    WarpPerspectiveLine_SSE4_Impl(const double *M)
    {
        CV_UNUSED(M);
    }
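    // Compute one row of nearest-neighbour source coordinates for
    // warpPerspective: for each x in [0, bw), W = W0 + M[6]*x and the mapped
    // point is ((X0 + M[0]*x)/W, (Y0 + M[3]*x)/W); a zero W yields (0, 0).
    // Results are clamped to the int range and stored interleaved in xy as
    // saturated 16-bit shorts.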
    virtual void processNN(const double *M, short* xy, double X0, double Y0, double W0, int bw) CV_OVERRIDE
    {
        const __m128d v_M0 = _mm_set1_pd(M[0]);
        const __m128d v_M3 = _mm_set1_pd(M[3]);
        const __m128d v_M6 = _mm_set1_pd(M[6]);
        const __m128d v_intmax = _mm_set1_pd((double)INT_MAX);
        const __m128d v_intmin = _mm_set1_pd((double)INT_MIN);
        const __m128d v_2 = _mm_set1_pd(2);
        const __m128d v_zero = _mm_setzero_pd();
        const __m128d v_1 = _mm_set1_pd(1);

        int x1 = 0;
        __m128d v_X0d = _mm_set1_pd(X0);
        __m128d v_Y0d = _mm_set1_pd(Y0);
        __m128d v_W0 = _mm_set1_pd(W0);
        __m128d v_x1 = _mm_set_pd(1, 0);

        for (; x1 <= bw - 16; x1 += 16)
        {
            // 0-3
            __m128i v_X0, v_Y0;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 4-7
            __m128i v_X1, v_Y1;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 8-11
            __m128i v_X2, v_Y2;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 12-15
            __m128i v_X3, v_Y3;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // convert to 16s
            v_X0 = _mm_packs_epi32(v_X0, v_X1);
            v_X1 = _mm_packs_epi32(v_X2, v_X3);
            v_Y0 = _mm_packs_epi32(v_Y0, v_Y1);
            v_Y1 = _mm_packs_epi32(v_Y2, v_Y3);

            _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);

            _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
            _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
            _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
            _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
        }

        for (; x1 < bw; x1++)
        {
            double W = W0 + M[6] * x1;
            W = W ? 1. / W : 0;
            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1)*W));
            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1)*W));
            int X = saturate_cast<int>(fX);
            int Y = saturate_cast<int>(fY);

            xy[x1 * 2] = saturate_cast<short>(X);
            xy[x1 * 2 + 1] = saturate_cast<short>(Y);
        }
    }
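    // Same mapping as processNN, but scaled by INTER_TAB_SIZE for
    // interpolated remapping: xy receives the integer parts of the
    // coordinates and alpha the packed fractional parts (indices into the
    // interpolation table).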
    virtual void process(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw) CV_OVERRIDE
    {
        const __m128d v_M0 = _mm_set1_pd(M[0]);
        const __m128d v_M3 = _mm_set1_pd(M[3]);
        const __m128d v_M6 = _mm_set1_pd(M[6]);
        const __m128d v_intmax = _mm_set1_pd((double)INT_MAX);
        const __m128d v_intmin = _mm_set1_pd((double)INT_MIN);
        const __m128d v_2 = _mm_set1_pd(2);
        const __m128d v_zero = _mm_setzero_pd();
        const __m128d v_its = _mm_set1_pd(INTER_TAB_SIZE);
        const __m128i v_itsi1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);

        int x1 = 0;

        __m128d v_X0d = _mm_set1_pd(X0);
        __m128d v_Y0d = _mm_set1_pd(Y0);
        __m128d v_W0 = _mm_set1_pd(W0);
        __m128d v_x1 = _mm_set_pd(1, 0);

        for (; x1 <= bw - 16; x1 += 16)
        {
            // 0-3
            __m128i v_X0, v_Y0;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 4-7
            __m128i v_X1, v_Y1;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 8-11
            __m128i v_X2, v_Y2;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 12-15
            __m128i v_X3, v_Y3;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // store alpha
            __m128i v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y0, v_itsi1), INTER_BITS),
                                             _mm_and_si128(v_X0, v_itsi1));
            __m128i v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y1, v_itsi1), INTER_BITS),
                                             _mm_and_si128(v_X1, v_itsi1));
            _mm_storeu_si128((__m128i *)(alpha + x1), _mm_packs_epi32(v_alpha0, v_alpha1));

            v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y2, v_itsi1), INTER_BITS),
                                     _mm_and_si128(v_X2, v_itsi1));
            v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y3, v_itsi1), INTER_BITS),
                                     _mm_and_si128(v_X3, v_itsi1));
            _mm_storeu_si128((__m128i *)(alpha + x1 + 8), _mm_packs_epi32(v_alpha0, v_alpha1));

            // convert to 16s
            v_X0 = _mm_packs_epi32(_mm_srai_epi32(v_X0, INTER_BITS), _mm_srai_epi32(v_X1, INTER_BITS));
            v_X1 = _mm_packs_epi32(_mm_srai_epi32(v_X2, INTER_BITS), _mm_srai_epi32(v_X3, INTER_BITS));
            v_Y0 = _mm_packs_epi32(_mm_srai_epi32(v_Y0, INTER_BITS), _mm_srai_epi32(v_Y1, INTER_BITS));
            v_Y1 = _mm_packs_epi32(_mm_srai_epi32(v_Y2, INTER_BITS), _mm_srai_epi32(v_Y3, INTER_BITS));

            _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);

            _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
            _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
            _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
            _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
        }
        for (; x1 < bw; x1++)
        {
            double W = W0 + M[6] * x1;
            W = W ? INTER_TAB_SIZE / W : 0;
            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1)*W));
            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1)*W));
            int X = saturate_cast<int>(fX);
            int Y = saturate_cast<int>(fY);

            xy[x1 * 2] = saturate_cast<short>(X >> INTER_BITS);
            xy[x1 * 2 + 1] = saturate_cast<short>(Y >> INTER_BITS);
            alpha[x1] = (short)((Y & (INTER_TAB_SIZE - 1))*INTER_TAB_SIZE +
                                (X & (INTER_TAB_SIZE - 1)));
        }
    }
    virtual ~WarpPerspectiveLine_SSE4_Impl() CV_OVERRIDE {}
};

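// Factory used by the CPU dispatching code. A minimal usage sketch (assuming
// the caller has already verified SSE4.1 support and owns a 3x3 perspective
// matrix M in row-major order):
//
//     Ptr<WarpPerspectiveLine_SSE4> impl = WarpPerspectiveLine_SSE4::getImpl(M);
//     impl->processNN(M, xy, X0, Y0, W0, bw);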
Ptr<WarpPerspectiveLine_SSE4> WarpPerspectiveLine_SSE4::getImpl(const double *M)
{
    return Ptr<WarpPerspectiveLine_SSE4>(new WarpPerspectiveLine_SSE4_Impl(M));
}

} // namespace opt_SSE4_1
} // namespace cv
/* End of file. */