1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, 2017, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
16 // Third party copyrights are property of their respective owners.
17 //
18 // Redistribution and use in source and binary forms, with or without modification,
19 // are permitted provided that the following conditions are met:
20 //
21 // * Redistribution's of source code must retain the above copyright notice,
22 // this list of conditions and the following disclaimer.
23 //
24 // * Redistribution's in binary form must reproduce the above copyright notice,
25 // this list of conditions and the following disclaimer in the documentation
26 // and/or other materials provided with the distribution.
27 //
28 // * The name of the copyright holders may not be used to endorse or promote products
29 // derived from this software without specific prior written permission.
30 //
31 // This software is provided by the copyright holders and contributors "as is" and
32 // any express or implied warranties, including, but not limited to, the implied
33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
34 // In no event shall the Intel Corporation or contributors be liable for any direct,
35 // indirect, incidental, special, exemplary, or consequential damages
36 // (including, but not limited to, procurement of substitute goods or services;
37 // loss of use, data, or profits; or business interruption) however caused
38 // and on any theory of liability, whether in contract, strict liability,
39 // or tort (including negligence or otherwise) arising in any way out of
40 // the use of this software, even if advised of the possibility of such damage.
41 //
42 //M*/
43
44 /* ////////////////////////////////////////////////////////////////////
45 //
46 // Geometrical transforms on images and matrices: rotation, zoom etc.
47 //
48 // */
49
50 #include "precomp.hpp"
51 #include "opencl_kernels_imgproc.hpp"
52 #include "hal_replacement.hpp"
53 #include "opencv2/core/hal/intrin.hpp"
54 #include "opencv2/core/utils/buffer_area.private.hpp"
55
56 #include "opencv2/core/openvx/ovx_defs.hpp"
57 #include "resize.hpp"
58
59 #include "opencv2/core/softfloat.hpp"
60 #include "fixedpoint.inl.hpp"
61
62 using namespace cv;
63
64 namespace
65 {
66
// Maps an element type ET (plus a "signed coefficients needed" flag) to the
// fixed-point accumulator type used by the bit-exact resize kernels.
// Narrow element types get narrower fixed-point types; anything not listed
// below (e.g. 32-bit signed input) falls back to 64-bit fixed point.
template <typename ET, bool needsign> struct fixedtype { typedef fixedpoint64 type; };
template <> struct fixedtype<uint32_t, false> { typedef ufixedpoint64 type; };
template <bool needsign> struct fixedtype<int16_t, needsign> { typedef fixedpoint32 type; };
template <> struct fixedtype<uint16_t, false> { typedef ufixedpoint32 type; };
template <bool needsign> struct fixedtype<int8_t, needsign> { typedef fixedpoint32 type; };
template <> struct fixedtype<uint8_t, false> { typedef ufixedpoint16 type; };
73
//FT is fixedtype<ET, needsign>::type
// Generic horizontal resampling of one image row with an n-tap filter.
//  src       - source row (cn interleaved channels)
//  ofst      - per-destination-pixel index of the leftmost source sample
//  m         - n fixed-point coefficients per destination pixel
//  dst       - output row of fixed-point values (cn values per pixel)
//  dst_min   - destination pixels below this replicate the leftmost source pixel
//  dst_max   - destination pixels at/above this replicate the rightmost source pixel
//  mulall    - when false, taps with zero coefficients are skipped (FT::zero())
template <typename ET, typename FT, int n, bool mulall>
static void hlineResize(ET* src, int cn, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
{
    int i = 0;
    // Left border: points that fall left of the source image replicate the
    // leftmost source pixel. Note that m still advances per destination pixel.
    while (i < dst_min)
    {
        for (int j = 0; j < cn; j++)
            *(dst++) = src[j];
        i++; m += n;
    }
    // Interior: full n-tap weighted sum per channel.
    while (i < dst_max)
    {
        ET* sp = src + cn*ofst[i];
        for (int j = 0; j < cn; j++)
        {
            FT acc = (mulall || !m[0].isZero()) ? m[0] * sp[j] : FT::zero();
            for (int k = 1; k < n; k++)
                acc = acc + ((mulall || !m[k].isZero()) ? m[k] * sp[j + k*cn] : FT::zero());
            *(dst++) = acc;
        }
        i++; m += n;
    }
    // Right border: replicate the rightmost contributing source pixel.
    ET* tail = src + cn*ofst[dst_width - 1];
    while (i < dst_width)
    {
        for (int j = 0; j < cn; j++)
            *(dst++) = tail[j];
        i++;
    }
}
// Channel-count dispatcher for the horizontal pass. The primary template
// falls back to the fully generic hlineResize above; the partial
// specializations that follow unroll the channel loop for fixed cn (cncnt)
// and fixed tap counts.
template <typename ET, typename FT, int n, bool mulall, int cncnt> struct hline
{
    static void ResizeCn(ET* src, int cn, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
    {
        hlineResize<ET, FT, n, mulall>(src, cn, ofst, m, dst, dst_min, dst_max, dst_width);
    }
};
114 template <typename ET, typename FT> struct hline<ET, FT, 2, true, 1>
115 {
ResizeCn__anon873c3d650111::hline116 static void ResizeCn(ET* src, int, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
117 {
118 int i = 0;
119 FT src0(src[0]);
120 for (; i < dst_min; i++, m += 2) // Points that fall left from src image so became equal to leftmost src point
121 {
122 *(dst++) = src0;
123 }
124 for (; i < dst_max; i++, m += 2)
125 {
126 ET* px = src + ofst[i];
127 *(dst++) = m[0] * px[0] + m[1] * px[1];
128 }
129 src0 = (src + ofst[dst_width - 1])[0];
130 for (; i < dst_width; i++) // Points that fall right from src image so became equal to rightmost src point
131 {
132 *(dst++) = src0;
133 }
134 }
135 };
136 template <typename ET, typename FT> struct hline<ET, FT, 2, true, 2>
137 {
ResizeCn__anon873c3d650111::hline138 static void ResizeCn(ET* src, int, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
139 {
140 int i = 0;
141 FT src0(src[0]), src1(src[1]);
142 for (; i < dst_min; i++, m += 2) // Points that fall left from src image so became equal to leftmost src point
143 {
144 *(dst++) = src0;
145 *(dst++) = src1;
146 }
147 for (; i < dst_max; i++, m += 2)
148 {
149 ET* px = src + 2*ofst[i];
150 *(dst++) = m[0] * px[0] + m[1] * px[2];
151 *(dst++) = m[0] * px[1] + m[1] * px[3];
152 }
153 src0 = (src + 2*ofst[dst_width - 1])[0];
154 src1 = (src + 2*ofst[dst_width - 1])[1];
155 for (; i < dst_width; i++) // Points that fall right from src image so became equal to rightmost src point
156 {
157 *(dst++) = src0;
158 *(dst++) = src1;
159 }
160 }
161 };
162 template <typename ET, typename FT> struct hline<ET, FT, 2, true, 3>
163 {
ResizeCn__anon873c3d650111::hline164 static void ResizeCn(ET* src, int, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
165 {
166 int i = 0;
167 FT src0(src[0]), src1(src[1]), src2(src[2]);
168 for (; i < dst_min; i++, m += 2) // Points that fall left from src image so became equal to leftmost src point
169 {
170 *(dst++) = src0;
171 *(dst++) = src1;
172 *(dst++) = src2;
173 }
174 for (; i < dst_max; i++, m += 2)
175 {
176 ET* px = src + 3*ofst[i];
177 *(dst++) = m[0] * px[0] + m[1] * px[3];
178 *(dst++) = m[0] * px[1] + m[1] * px[4];
179 *(dst++) = m[0] * px[2] + m[1] * px[5];
180 }
181 src0 = (src + 3*ofst[dst_width - 1])[0];
182 src1 = (src + 3*ofst[dst_width - 1])[1];
183 src2 = (src + 3*ofst[dst_width - 1])[2];
184 for (; i < dst_width; i++) // Points that fall right from src image so became equal to rightmost src point
185 {
186 *(dst++) = src0;
187 *(dst++) = src1;
188 *(dst++) = src2;
189 }
190 }
191 };
192 template <typename ET, typename FT> struct hline<ET, FT, 2, true, 4>
193 {
ResizeCn__anon873c3d650111::hline194 static void ResizeCn(ET* src, int, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
195 {
196 int i = 0;
197 FT src0(src[0]), src1(src[1]), src2(src[2]), src3(src[3]);
198 for (; i < dst_min; i++, m += 2) // Points that fall left from src image so became equal to leftmost src point
199 {
200 *(dst++) = src0;
201 *(dst++) = src1;
202 *(dst++) = src2;
203 *(dst++) = src3;
204 }
205 for (; i < dst_max; i++, m += 2)
206 {
207 ET* px = src + 4*ofst[i];
208 *(dst++) = m[0] * px[0] + m[1] * px[4];
209 *(dst++) = m[0] * px[1] + m[1] * px[5];
210 *(dst++) = m[0] * px[2] + m[1] * px[6];
211 *(dst++) = m[0] * px[3] + m[1] * px[7];
212 }
213 src0 = (src + 4*ofst[dst_width - 1])[0];
214 src1 = (src + 4*ofst[dst_width - 1])[1];
215 src2 = (src + 4*ofst[dst_width - 1])[2];
216 src3 = (src + 4*ofst[dst_width - 1])[3];
217 for (; i < dst_width; i++) // Points that fall right from src image so became equal to rightmost src point
218 {
219 *(dst++) = src0;
220 *(dst++) = src1;
221 *(dst++) = src2;
222 *(dst++) = src3;
223 }
224 }
225 };
// Redeclaration of the dispatcher template (defined above) so this partial
// specialization is self-contained.
template <typename ET, typename FT, int n, bool mulall, int cncnt> struct hline;
// 4-tap (cubic) horizontal resize, 1 channel.
template <typename ET, typename FT> struct hline<ET, FT, 4, true, 1>
{
    static void ResizeCn(ET* src, int, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
    {
        int i = 0;
        FT src0(src[0]);
        for (; i < dst_min; i++, m += 4) // Points that fall left from src image so became equal to leftmost src point
        {
            *(dst++) = src0;
        }
        for (; i < dst_max; i++, m += 4)
        {
            ET* px = src + ofst[i];
            // BUGFIX: read the per-pixel window px[0..3]; previously this read
            // src[0..3] (always the row start), leaving px computed but unused.
            *(dst++) = m[0] * px[0] + m[1] * px[1] + m[2] * px[2] + m[3] * px[3];
        }
        src0 = (src + ofst[dst_width - 1])[0];
        for (; i < dst_width; i++) // Points that fall right from src image so became equal to rightmost src point
        {
            *(dst++) = src0;
        }
    }
};
// Redeclaration of the dispatcher template (defined above) so this partial
// specialization is self-contained.
template <typename ET, typename FT, int n, bool mulall, int cncnt> struct hline;
// 4-tap (cubic) horizontal resize, 2 channels.
template <typename ET, typename FT> struct hline<ET, FT, 4, true, 2>
{
    static void ResizeCn(ET* src, int, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
    {
        int i = 0;
        FT src0(src[0]), src1(src[1]);
        for (; i < dst_min; i++, m += 4) // Points that fall left from src image so became equal to leftmost src point
        {
            *(dst++) = src0;
            *(dst++) = src1;
        }
        for (; i < dst_max; i++, m += 4)
        {
            ET* px = src + 2*ofst[i];
            // BUGFIX: read the per-pixel window through px; previously these
            // read src[...] (always the row start), leaving px unused.
            *(dst++) = m[0] * px[0] + m[1] * px[2] + m[2] * px[4] + m[3] * px[6];
            *(dst++) = m[0] * px[1] + m[1] * px[3] + m[2] * px[5] + m[3] * px[7];
        }
        src0 = (src + 2*ofst[dst_width - 1])[0];
        src1 = (src + 2*ofst[dst_width - 1])[1];
        for (; i < dst_width; i++) // Points that fall right from src image so became equal to rightmost src point
        {
            *(dst++) = src0;
            *(dst++) = src1;
        }
    }
};
// Redeclaration of the dispatcher template (defined above) so this partial
// specialization is self-contained.
template <typename ET, typename FT, int n, bool mulall, int cncnt> struct hline;
// 4-tap (cubic) horizontal resize, 3 channels.
template <typename ET, typename FT> struct hline<ET, FT, 4, true, 3>
{
    static void ResizeCn(ET* src, int, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
    {
        int i = 0;
        FT src0(src[0]), src1(src[1]), src2(src[2]);
        for (; i < dst_min; i++, m += 4) // Points that fall left from src image so became equal to leftmost src point
        {
            *(dst++) = src0;
            *(dst++) = src1;
            *(dst++) = src2;
        }
        for (; i < dst_max; i++, m += 4)
        {
            ET* px = src + 3*ofst[i];
            // BUGFIX: read the per-pixel window through px; previously these
            // read src[...] (always the row start), leaving px unused.
            *(dst++) = m[0] * px[0] + m[1] * px[3] + m[2] * px[6] + m[3] * px[ 9];
            *(dst++) = m[0] * px[1] + m[1] * px[4] + m[2] * px[7] + m[3] * px[10];
            *(dst++) = m[0] * px[2] + m[1] * px[5] + m[2] * px[8] + m[3] * px[11];
        }
        src0 = (src + 3*ofst[dst_width - 1])[0];
        src1 = (src + 3*ofst[dst_width - 1])[1];
        src2 = (src + 3*ofst[dst_width - 1])[2];
        for (; i < dst_width; i++) // Points that fall right from src image so became equal to rightmost src point
        {
            *(dst++) = src0;
            *(dst++) = src1;
            *(dst++) = src2;
        }
    }
};
// Redeclaration of the dispatcher template (defined above) so this partial
// specialization is self-contained.
template <typename ET, typename FT, int n, bool mulall, int cncnt> struct hline;
// 4-tap (cubic) horizontal resize, 4 channels.
template <typename ET, typename FT> struct hline<ET, FT, 4, true, 4>
{
    static void ResizeCn(ET* src, int, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
    {
        int i = 0;
        FT src0(src[0]), src1(src[1]), src2(src[2]), src3(src[3]);
        for (; i < dst_min; i++, m += 4) // Points that fall left from src image so became equal to leftmost src point
        {
            *(dst++) = src0;
            *(dst++) = src1;
            *(dst++) = src2;
            *(dst++) = src3;
        }
        for (; i < dst_max; i++, m += 4)
        {
            ET* px = src + 4*ofst[i];
            // BUGFIX: read the per-pixel window through px; previously these
            // read src[...] (always the row start), leaving px unused.
            *(dst++) = m[0] * px[0] + m[1] * px[4] + m[2] * px[ 8] + m[3] * px[12];
            *(dst++) = m[0] * px[1] + m[1] * px[5] + m[2] * px[ 9] + m[3] * px[13];
            *(dst++) = m[0] * px[2] + m[1] * px[6] + m[2] * px[10] + m[3] * px[14];
            *(dst++) = m[0] * px[3] + m[1] * px[7] + m[2] * px[11] + m[3] * px[15];
        }
        src0 = (src + 4*ofst[dst_width - 1])[0];
        src1 = (src + 4*ofst[dst_width - 1])[1];
        src2 = (src + 4*ofst[dst_width - 1])[2];
        src3 = (src + 4*ofst[dst_width - 1])[3];
        for (; i < dst_width; i++) // Points that fall right from src image so became equal to rightmost src point
        {
            *(dst++) = src0;
            *(dst++) = src1;
            *(dst++) = src2;
            *(dst++) = src3;
        }
    }
};
// Entry point of the horizontal pass: forwards to the channel-count
// specialized hline implementation selected by cncnt.
template <typename ET, typename FT, int n, bool mulall, int cncnt>
static void hlineResizeCn(ET* src, int cn, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
{
    hline<ET, FT, n, mulall, cncnt>::ResizeCn(src, cn, ofst, m, dst, dst_min, dst_max, dst_width);
};
343
// Vectorized specialization: 8-bit input, 1 channel, 2-tap (linear) filter.
// m holds two ufixedpoint16 coefficients per destination pixel.
template <>
void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 1>(uint8_t* src, int, int *ofst, ufixedpoint16* m, ufixedpoint16* dst, int dst_min, int dst_max, int dst_width)
{
    int i = 0;
    ufixedpoint16 src_0(src[0]);
#if CV_SIMD
    const int VECSZ = v_uint16::nlanes;
    // Border value broadcast through its 16-bit fixed-point representation.
    v_uint16 v_src_0 = vx_setall_u16(*((uint16_t*)&src_0));
    for (; i <= dst_min - VECSZ; i += VECSZ, m += 2*VECSZ, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point
    {
        v_store((uint16_t*)dst, v_src_0);
    }
#endif
    for (; i < dst_min; i++, m += 2) // scalar tail of the left-border fill
    {
        *(dst++) = src_0;
    }
#if CV_SIMD
    // Interior, two output vectors per iteration: vx_lut_pairs gathers the
    // (px[0], px[1]) byte pair of each destination pixel; after widening,
    // v_dotprod multiplies each pair by its coefficient pair and sums.
    for (; i <= dst_max - 2*VECSZ; i += 2*VECSZ, m += 4*VECSZ, dst += 2*VECSZ)
    {
        v_uint16 v_src0, v_src1;
        v_expand(vx_lut_pairs(src, ofst + i), v_src0, v_src1);
        v_store((uint16_t*)dst , v_pack(v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), vx_load((int16_t*)m))),
                                        v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), vx_load((int16_t*)m + VECSZ)))));
        v_expand(vx_lut_pairs(src, ofst + i + VECSZ), v_src0, v_src1);
        v_store((uint16_t*)dst+VECSZ, v_pack(v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), vx_load((int16_t*)m + 2*VECSZ))),
                                             v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), vx_load((int16_t*)m + 3*VECSZ)))));
    }
    if (i <= dst_max - VECSZ) // single-vector remainder of the interior
    {
        v_uint16 v_src0, v_src1;
        v_expand(vx_lut_pairs(src, ofst + i), v_src0, v_src1);
        v_store((uint16_t*)dst, v_pack(v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), vx_load((int16_t*)m))),
                                       v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), vx_load((int16_t*)m + VECSZ)))));
        i += VECSZ; m += 2*VECSZ; dst += VECSZ;
    }
#endif
    for (; i < dst_max; i += 1, m += 2) // scalar remainder of the interior
    {
        uint8_t* px = src + ofst[i];
        *(dst++) = m[0] * px[0] + m[1] * px[1];
    }
    src_0 = (src + ofst[dst_width - 1])[0];
#if CV_SIMD
    v_src_0 = vx_setall_u16(*((uint16_t*)&src_0));
    for (; i <= dst_width - VECSZ; i += VECSZ, dst += VECSZ) // Points that fall right from src image so became equal to rightmost src point
    {
        v_store((uint16_t*)dst, v_src_0);
    }
#endif
    for (; i < dst_width; i++) // scalar tail of the right-border fill
    {
        *(dst++) = src_0;
    }
}
// Vectorized specialization: 8-bit input, 2 channels, 2-tap (linear) filter.
template <>
void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 2>(uint8_t* src, int, int *ofst, ufixedpoint16* m, ufixedpoint16* dst, int dst_min, int dst_max, int dst_width)
{
    int i = 0;
    // Both channels of the border pixel packed into one 32-bit word so a
    // single vx_setall_u32 broadcast fills whole vectors for border fill.
    union {
        uint32_t d;
        uint16_t w[2];
    } srccn;
    ((ufixedpoint16*)(srccn.w))[0] = src[0];
    ((ufixedpoint16*)(srccn.w))[1] = src[1];
#if CV_SIMD
    const int VECSZ = v_uint16::nlanes;
    v_uint16 v_srccn = v_reinterpret_as_u16(vx_setall_u32(srccn.d));
    for (; i <= dst_min - VECSZ/2; i += VECSZ/2, m += VECSZ, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point
    {
        v_store((uint16_t*)dst, v_srccn);
    }
#endif
    for (; i < dst_min; i++, m += 2) // scalar tail of the left-border fill
    {
        *(dst++) = ((ufixedpoint16*)(srccn.w))[0];
        *(dst++) = ((ufixedpoint16*)(srccn.w))[1];
    }
#if CV_SIMD
    for (; i <= dst_max - VECSZ/2; i += VECSZ/2, m += VECSZ, dst += VECSZ)
    {
        v_uint16 v_src0, v_src1;
        // Gather both 2-byte source pixels of each window, then regroup
        // bytes so each 32-bit lane holds one channel's (left, right) pair.
        v_expand(v_interleave_pairs(v_reinterpret_as_u8(vx_lut_pairs((uint16_t*)src, ofst + i))), v_src0, v_src1);

        // Duplicate each coefficient pair once so it covers both channels.
        v_uint32 v_mul = vx_load((uint32_t*)m);//AaBbCcDd
        v_uint32 v_zip0, v_zip1;
        v_zip(v_mul, v_mul, v_zip0, v_zip1);//AaAaBbBb CcCcDdDd
        v_uint32 v_res0 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), v_reinterpret_as_s16(v_zip0)));
        v_uint32 v_res1 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), v_reinterpret_as_s16(v_zip1)));
        v_store((uint16_t*)dst, v_pack(v_res0, v_res1));//AB1AB2CD1CD2
    }
#endif
    for (; i < dst_max; i += 1, m += 2) // scalar remainder of the interior
    {
        uint8_t* px = src + 2 * ofst[i];
        *(dst++) = m[0] * px[0] + m[1] * px[2];
        *(dst++) = m[0] * px[1] + m[1] * px[3];
    }
    ((ufixedpoint16*)(srccn.w))[0] = (src + 2 * ofst[dst_width - 1])[0]; ((ufixedpoint16*)(srccn.w))[1] = (src + 2 * ofst[dst_width - 1])[1];
#if CV_SIMD
    v_srccn = v_reinterpret_as_u16(vx_setall_u32(srccn.d));
    for (; i <= dst_width - VECSZ/2; i += VECSZ/2, dst += VECSZ) // Points that fall right from src image so became equal to rightmost src point
    {
        v_store((uint16_t*)dst, v_srccn);
    }
#endif
    for (; i < dst_width; i++) // scalar tail of the right-border fill
    {
        *(dst++) = ((ufixedpoint16*)(srccn.w))[0];
        *(dst++) = ((ufixedpoint16*)(srccn.w))[1];
    }
}
// Vectorized specialization: 8-bit input, 3 channels, 2-tap (linear) filter.
template <>
void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 3>(uint8_t* src, int, int *ofst, ufixedpoint16* m, ufixedpoint16* dst, int dst_min, int dst_max, int dst_width)
{
    int i = 0;
    // Three channels of the border pixel in the low words of a 64-bit value;
    // the 4th word stays zero and is dropped by v_pack_triplets below.
    union {
        uint64_t q;
        uint16_t w[4];
    } srccn;
    ((ufixedpoint16*)(srccn.w))[0] = src[0];
    ((ufixedpoint16*)(srccn.w))[1] = src[1];
    ((ufixedpoint16*)(srccn.w))[2] = src[2];
    ((ufixedpoint16*)(srccn.w))[3] = 0;
#if CV_SIMD
    const int VECSZ = v_uint16::nlanes;
    v_uint16 v_srccn = v_pack_triplets(v_reinterpret_as_u16(vx_setall_u64(srccn.q)));
    for (; i <= dst_min - (VECSZ+2)/3; i += VECSZ/4, m += VECSZ/2, dst += 3*VECSZ/4) // Points that fall left from src image so became equal to leftmost src point
    {
        v_store((uint16_t*)dst, v_srccn);
    }
#endif
    for (; i < dst_min; i++, m += 2) // scalar tail of the left-border fill
    {
        *(dst++) = ((ufixedpoint16*)(srccn.w))[0];
        *(dst++) = ((ufixedpoint16*)(srccn.w))[1];
        *(dst++) = ((ufixedpoint16*)(srccn.w))[2];
    }
#if CV_SIMD
    // Element offsets of each pixel's first channel (3 bytes per pixel).
    CV_DECL_ALIGNED(CV_SIMD_WIDTH) int ofst3[VECSZ/2];
    for (; i <= dst_max - (3*VECSZ/4 + (VECSZ+2)/3); i += VECSZ/2, m += VECSZ, dst += 3*VECSZ/2)
    {
        v_store(ofst3, vx_load(ofst + i) * vx_setall_s32(3));
        v_uint8 v_src01, v_src23;
        v_uint16 v_src0, v_src1, v_src2, v_src3;
        // First gather yields bytes [p0 p1 p2 p3] per pixel; second gather
        // (from src+2, shifted right by 8) yields [p3 p4 p5 0]. Byte-zipping
        // them produces the 2-tap pairs (p0,p3)(p1,p4)(p2,p5) plus one junk
        // pair per pixel, removed later by v_pack_triplets.
        v_zip(vx_lut_quads(src, ofst3), v_reinterpret_as_u8(v_reinterpret_as_u32(vx_lut_quads(src+2, ofst3)) >> 8), v_src01, v_src23);
        v_expand(v_src01, v_src0, v_src1);
        v_expand(v_src23, v_src2, v_src3);

        // Replicate each coefficient pair 4x so it lines up with the three
        // channel pairs (+1 junk pair) of its pixel.
        v_uint32 v_mul0, v_mul1, v_mul2, v_mul3, v_tmp;
        v_mul0 = vx_load((uint32_t*)m);//AaBbCcDd
        v_zip(v_mul0, v_mul0, v_mul3, v_tmp );//AaAaBbBb CcCcDdDd
        v_zip(v_mul3, v_mul3, v_mul0, v_mul1);//AaAaAaAa BbBbBbBb
        v_zip(v_tmp , v_tmp , v_mul2, v_mul3);//CcCcCcCc DdDdDdDd

        v_uint32 v_res0 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), v_reinterpret_as_s16(v_mul0)));
        v_uint32 v_res1 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), v_reinterpret_as_s16(v_mul1)));
        v_uint32 v_res2 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src2), v_reinterpret_as_s16(v_mul2)));
        v_uint32 v_res3 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src3), v_reinterpret_as_s16(v_mul3)));
        v_store((uint16_t*)dst            , v_pack_triplets(v_pack(v_res0, v_res1)));
        v_store((uint16_t*)dst + 3*VECSZ/4, v_pack_triplets(v_pack(v_res2, v_res3)));
    }
#endif
    for (; i < dst_max; i += 1, m += 2) // scalar remainder of the interior
    {
        uint8_t* px = src + 3 * ofst[i];
        *(dst++) = m[0] * px[0] + m[1] * px[3];
        *(dst++) = m[0] * px[1] + m[1] * px[4];
        *(dst++) = m[0] * px[2] + m[1] * px[5];
    }
    ((ufixedpoint16*)(srccn.w))[0] = (src + 3*ofst[dst_width - 1])[0];
    ((ufixedpoint16*)(srccn.w))[1] = (src + 3*ofst[dst_width - 1])[1];
    ((ufixedpoint16*)(srccn.w))[2] = (src + 3*ofst[dst_width - 1])[2];
#if CV_SIMD
    v_srccn = v_pack_triplets(v_reinterpret_as_u16(vx_setall_u64(srccn.q)));
    for (; i <= dst_width - (VECSZ+2)/3; i += VECSZ/4, dst += 3*VECSZ/4) // Points that fall right from src image so became equal to rightmost src point
    {
        v_store((uint16_t*)dst, v_srccn);
    }
#endif
    for (; i < dst_width; i++) // scalar tail of the right-border fill
    {
        *(dst++) = ((ufixedpoint16*)(srccn.w))[0];
        *(dst++) = ((ufixedpoint16*)(srccn.w))[1];
        *(dst++) = ((ufixedpoint16*)(srccn.w))[2];
    }
}
// Vectorized specialization: 8-bit input, 4 channels, 2-tap (linear) filter.
template <>
void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 4>(uint8_t* src, int, int *ofst, ufixedpoint16* m, ufixedpoint16* dst, int dst_min, int dst_max, int dst_width)
{
    int i = 0;
    // All four channels of the border pixel packed into one 64-bit broadcast.
    union {
        uint64_t q;
        uint16_t w[4];
    } srccn;
    ((ufixedpoint16*)(srccn.w))[0] = src[0];
    ((ufixedpoint16*)(srccn.w))[1] = src[1];
    ((ufixedpoint16*)(srccn.w))[2] = src[2];
    ((ufixedpoint16*)(srccn.w))[3] = src[3];
#if CV_SIMD
    const int VECSZ = v_uint16::nlanes;
    v_uint16 v_srccn = v_reinterpret_as_u16(vx_setall_u64(srccn.q));
    for (; i <= dst_min - VECSZ/4; i += VECSZ/4, m += VECSZ/2, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point
    {
        v_store((uint16_t*)dst, v_srccn);
    }
#endif
    for (; i < dst_min; i++, m += 2) // scalar tail of the left-border fill
    {
        *(dst++) = ((ufixedpoint16*)(srccn.w))[0];
        *(dst++) = ((ufixedpoint16*)(srccn.w))[1];
        *(dst++) = ((ufixedpoint16*)(srccn.w))[2];
        *(dst++) = ((ufixedpoint16*)(srccn.w))[3];
    }
#if CV_SIMD
    for (; i <= dst_max - VECSZ/2; i += VECSZ/2, m += VECSZ, dst += 2*VECSZ)
    {
        v_uint16 v_src0, v_src1, v_src2, v_src3;
        // Gather both 4-byte pixels of each 2-tap window and regroup bytes so
        // every 32-bit lane holds one channel's (left, right) sample pair.
        v_expand(v_interleave_quads(v_reinterpret_as_u8(vx_lut_pairs((uint32_t*)src, ofst + i))), v_src0, v_src1);
        v_expand(v_interleave_quads(v_reinterpret_as_u8(vx_lut_pairs((uint32_t*)src, ofst + i + VECSZ/4))), v_src2, v_src3);

        // Replicate each coefficient pair 4x (once per channel).
        v_uint32 v_mul0, v_mul1, v_mul2, v_mul3, v_tmp;
        v_mul0 = vx_load((uint32_t*)m);//AaBbCcDd
        v_zip(v_mul0, v_mul0, v_mul3, v_tmp );//AaAaBbBb CcCcDdDd
        v_zip(v_mul3, v_mul3, v_mul0, v_mul1);//AaAaAaAa BbBbBbBb
        v_zip(v_tmp , v_tmp , v_mul2, v_mul3);//CcCcCcCc DdDdDdDd

        v_uint32 v_res0 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), v_reinterpret_as_s16(v_mul0)));
        v_uint32 v_res1 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), v_reinterpret_as_s16(v_mul1)));
        v_uint32 v_res2 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src2), v_reinterpret_as_s16(v_mul2)));
        v_uint32 v_res3 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src3), v_reinterpret_as_s16(v_mul3)));
        v_store((uint16_t*)dst        , v_pack(v_res0, v_res1));
        v_store((uint16_t*)dst + VECSZ, v_pack(v_res2, v_res3));
    }
#endif
    for (; i < dst_max; i += 1, m += 2) // scalar remainder of the interior
    {
        uint8_t* px = src + 4 * ofst[i];
        *(dst++) = m[0] * px[0] + m[1] * px[4];
        *(dst++) = m[0] * px[1] + m[1] * px[5];
        *(dst++) = m[0] * px[2] + m[1] * px[6];
        *(dst++) = m[0] * px[3] + m[1] * px[7];
    }
    ((ufixedpoint16*)(srccn.w))[0] = (src + 4 * ofst[dst_width - 1])[0]; ((ufixedpoint16*)(srccn.w))[1] = (src + 4 * ofst[dst_width - 1])[1];
    ((ufixedpoint16*)(srccn.w))[2] = (src + 4 * ofst[dst_width - 1])[2]; ((ufixedpoint16*)(srccn.w))[3] = (src + 4 * ofst[dst_width - 1])[3];
#if CV_SIMD
    v_srccn = v_reinterpret_as_u16(vx_setall_u64(srccn.q));
    for (; i <= dst_width - VECSZ/4; i += VECSZ/4, dst += VECSZ) // Points that fall right from src image so became equal to rightmost src point
    {
        v_store((uint16_t*)dst, v_srccn);
    }
#endif
    for (; i < dst_width; i++) // scalar tail of the right-border fill
    {
        *(dst++) = ((ufixedpoint16*)(srccn.w))[0];
        *(dst++) = ((ufixedpoint16*)(srccn.w))[1];
        *(dst++) = ((ufixedpoint16*)(srccn.w))[2];
        *(dst++) = ((ufixedpoint16*)(srccn.w))[3];
    }
}
// Vectorized specialization: 16-bit input, 1 channel, 2-tap (linear) filter,
// with 32-bit fixed-point coefficients and accumulators.
template <>
void hlineResizeCn<uint16_t, ufixedpoint32, 2, true, 1>(uint16_t* src, int, int *ofst, ufixedpoint32* m, ufixedpoint32* dst, int dst_min, int dst_max, int dst_width)
{
    int i = 0;
    ufixedpoint32 src_0(src[0]);
#if CV_SIMD
    const int VECSZ = v_uint32::nlanes;
    v_uint32 v_src_0 = vx_setall_u32(*((uint32_t*)&src_0));
    for (; i <= dst_min - VECSZ; i += VECSZ, m += 2*VECSZ, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point
    {
        v_store((uint32_t*)dst, v_src_0);
    }
#endif
    for (; i < dst_min; i++, m += 2) // scalar tail of the left-border fill
    {
        *(dst++) = src_0;
    }
#if CV_SIMD
    for (; i <= dst_max - VECSZ; i += VECSZ, m += 2*VECSZ, dst += VECSZ)
    {
        v_uint32 v_src0, v_src1;
        v_expand(vx_lut_pairs(src, ofst + i), v_src0, v_src1);

        // After the elementwise 32-bit multiply, each 64-bit lane holds the
        // two tap products of one destination pixel; folding the low and
        // high 32-bit halves sums them (wrap-around arithmetic).
        v_uint64 v_res0 = v_reinterpret_as_u64(v_src0 * vx_load((uint32_t*)m));
        v_uint64 v_res1 = v_reinterpret_as_u64(v_src1 * vx_load((uint32_t*)m + VECSZ));
        v_store((uint32_t*)dst, v_pack((v_res0 & vx_setall_u64(0xFFFFFFFF)) + (v_res0 >> 32),
                                       (v_res1 & vx_setall_u64(0xFFFFFFFF)) + (v_res1 >> 32)));
    }
#endif
    for (; i < dst_max; i += 1, m += 2) // scalar remainder of the interior
    {
        uint16_t* px = src + ofst[i];
        *(dst++) = m[0] * px[0] + m[1] * px[1];
    }
    src_0 = (src + ofst[dst_width - 1])[0];
#if CV_SIMD
    v_src_0 = vx_setall_u32(*((uint32_t*)&src_0));
    for (; i <= dst_width - VECSZ; i += VECSZ, dst += VECSZ) // Points that fall right from src image so became equal to rightmost src point
    {
        v_store((uint32_t*)dst, v_src_0);
    }
#endif
    for (; i < dst_width; i++) // scalar tail of the right-border fill
    {
        *(dst++) = src_0;
    }
}
651
// Vertical pass, copy-only case: converts one already-resampled row of
// fixed-point values (FT) into the destination element type (ET).
template <typename ET, typename FT>
void vlineSet(FT* src, ET* dst, int dst_width)
{
    ET* const stop = dst + dst_width;
    while (dst != stop)
        *dst++ = *src++;
}
// Vectorized copy-only vertical pass for 8-bit output: rounds each 16-bit
// fixed-point value (8 fractional bits) back to an 8-bit pixel.
template <>
void vlineSet<uint8_t, ufixedpoint16>(ufixedpoint16* src, uint8_t* dst, int dst_width)
{
    int i = 0;
#if CV_SIMD
    const int VECSZ = v_uint8::nlanes;
    // +0.5 in 8.8 fixed point, so the >>8 below rounds to nearest.
    static const v_uint16 v_fixedRound = vx_setall_u16((uint16_t)((1U << 8) >> 1));
    for (; i <= dst_width - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
    {
        v_uint16 v_src0 = vx_load((uint16_t*)src);
        v_uint16 v_src1 = vx_load((uint16_t*)src + VECSZ/2);

        // Add the rounding constant, then drop the 8 fractional bits.
        v_uint16 v_res0 = (v_src0 + v_fixedRound) >> 8;
        v_uint16 v_res1 = (v_src1 + v_fixedRound) >> 8;

        v_store(dst, v_pack(v_res0, v_res1));
    }
#endif
    // Scalar tail: ufixedpoint16 -> uint8_t conversion (rounding behavior
    // defined by the fixed-point class in fixedpoint.inl.hpp).
    for (; i < dst_width; i++)
        *(dst++) = *(src++);
}
679
// Vertical pass, generic n-tap case: each output pixel is the weighted sum of
// n source rows. src points at the first row; rows are src_step apart.
// Accumulation happens in the wider type FT::WT, then narrows to ET on store.
template <typename ET, typename FT, int n>
void vlineResize(FT* src, size_t src_step, FT* m, ET* dst, int dst_width)
{
    for (int i = 0; i < dst_width; i++)
    {
        FT* col = src + i;
        typename FT::WT acc = col[0] * m[0];
        for (int k = 1; k < n; k++)
            acc = acc + col[k * src_step] * m[k];
        dst[i] = acc;
    }
}
// Vectorized vertical 2-tap pass for 8-bit output:
// dst[i] = src[i]*m[0] + src1[i]*m[1], rounded to 8 bits.
template <>
void vlineResize<uint8_t, ufixedpoint16, 2>(ufixedpoint16* src, size_t src_step, ufixedpoint16* m, uint8_t* dst, int dst_width)
{
    int i = 0;
    ufixedpoint16* src1 = src + src_step; // second contributing row
#if CV_SIMD
    const int VECSZ = v_uint8::nlanes;
    static const v_int32 v_fixedRound = vx_setall_s32((int32_t)((1 << 16) >> 1)); // +0.5 for the >>16 below
    static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1<<15));
    static const v_int8 v_128_16 = v_reinterpret_as_s8 (vx_setall_u8 ((uint8_t) 1<<7));

    // Both row coefficients broadcast as one 32-bit pair for v_dotprod.
    v_int16 v_mul = v_reinterpret_as_s16(vx_setall_u32(((uint32_t*)m)[0]));
    for (; i <= dst_width - VECSZ; i += VECSZ, src += VECSZ, src1 += VECSZ, dst += VECSZ)
    {
        v_int16 v_src00 = vx_load((int16_t*)src);
        v_int16 v_src10 = vx_load((int16_t*)src1);
        v_int16 v_tmp0, v_tmp1;
        // Bias the unsigned fixed-point inputs by 0x8000 (wrap-around add) so
        // the signed v_dotprod can be used; v_zip pairs the two rows per pixel.
        v_zip(v_add_wrap(v_src00,v_128), v_add_wrap(v_src10,v_128), v_tmp0, v_tmp1);

        v_int32 v_res0 = v_dotprod(v_tmp0, v_mul);
        v_int32 v_res1 = v_dotprod(v_tmp1, v_mul);

        v_int16 v_src01 = vx_load((int16_t*)src + VECSZ/2);
        v_int16 v_src11 = vx_load((int16_t*)src1 + VECSZ/2);
        v_zip(v_add_wrap(v_src01,v_128), v_add_wrap(v_src11,v_128), v_tmp0, v_tmp1);
        v_int32 v_res2 = v_dotprod(v_tmp0, v_mul);
        v_int32 v_res3 = v_dotprod(v_tmp1, v_mul);

        // Round, drop 16 fractional bits, pack to bytes. The input bias ends
        // up as a constant 128 in the packed result (the two interpolation
        // coefficients sum to one — see interpolationLinear::getCoeffs),
        // removed with the wrap-around subtraction below.
        v_int8 v_res = v_pack(v_pack((v_res0 + v_fixedRound) >> 16,
                                     (v_res1 + v_fixedRound) >> 16),
                              v_pack((v_res2 + v_fixedRound) >> 16,
                                     (v_res3 + v_fixedRound) >> 16));

        v_store(dst, v_reinterpret_as_u8(v_sub_wrap(v_res, v_128_16)));
    }
#endif
    // Scalar tail: plain fixed-point weighted sum per pixel.
    for (; i < dst_width; i++)
    {
        *(dst++) = (uint8_t)(*(src++) * m[0] + *(src1++) * m[1]);
    }
}
732
// Per-axis coefficient/offset generator for bit-exact linear interpolation.
// For each destination index it produces the left source sample offset and a
// pair of fixed-point weights, and tracks the [minofst, maxofst) range of
// destination indices whose 2-tap windows lie fully inside the source image
// (outside that range the resize kernels replicate the border pixel).
template <typename ET> class interpolationLinear
{
public:
    static const int len = 2;           // taps per destination pixel
    static const bool needsign = false; // linear weights are non-negative
    // inv_scale = dst_size / src_size; stored as its reciprocal, i.e. the
    // source step per destination pixel, in softdouble for bit-exactness.
    interpolationLinear(double inv_scale, int srcsize, int dstsize) : scale(softdouble::one() / softdouble(inv_scale)), maxsize(srcsize), minofst(0), maxofst(dstsize) {}
    // Computes *offset and coeffs[0..1] for destination index val, shrinking
    // the interior range when the window would cross an image border.
    void getCoeffs(int val, int* offset, typename fixedtype<ET, needsign>::type* coeffs)
    {
        typedef typename fixedtype<ET, needsign>::type fixedpoint;
        // Pixel-center convention: src_x = (dst_x + 0.5) * scale - 0.5.
        softdouble fval = scale*(softdouble(val)+softdouble(0.5))-softdouble(0.5);
        int ival = cvFloor(fval);
        if (ival >= 0 && maxsize > 1)
        {
            if (ival < maxsize - 1)
            {
                *offset = ival;
                coeffs[1] = fval - softdouble(ival);
                coeffs[0] = fixedpoint::one() - coeffs[1]; // weights sum to one
            }
            else
            {
                // Window reaches past the right border: clamp to last pixel
                // and mark this destination index as right-border territory.
                *offset = maxsize - 1;
                maxofst = min(maxofst, val);
            }
        }
        else
        {
            // Source position left of the image: left-border territory.
            minofst = max(minofst, val + 1);
        }
    }
    // Returns the [min, max) destination range with fully-interior windows.
    void getMinMax(int &min, int &max)
    {
        min = minofst;
        max = maxofst;
    }
protected:
    softdouble scale;     // source pixels advanced per destination pixel
    int maxsize;          // source extent along this axis
    int minofst, maxofst; // interior destination range, updated by getCoeffs
};
773
// Parallel body for the vertical pass of bit-exact resize.  For each
// destination row it ensures the `interp_y_len` horizontally-resized source
// rows it needs are present in a ring buffer (`linebuf`), then blends them
// with per-row fixed-point coefficients.  Destination rows before min_y /
// after max_y are border rows: verbatim copies of the first/last
// horizontally-resized source row.
template <typename ET, typename FT, int interp_y_len>
class resize_bitExactInvoker :
    public ParallelLoopBody
{
public:
    typedef FT fixedpoint;
    // Horizontal resize kernel selected by resize_bitExact() for the channel count.
    typedef void(*hResizeFunc)(ET* src, int cn, int *ofst, fixedpoint* m, fixedpoint* dst, int dst_min, int dst_max, int dst_width);
    resize_bitExactInvoker(const uchar* _src, size_t _src_step, int _src_width, int _src_height,
                           uchar* _dst, size_t _dst_step, int _dst_width, int _dst_height,
                           int _cn, int *_xoffsets, int *_yoffsets, fixedpoint *_xcoeffs, fixedpoint *_ycoeffs,
                           int _min_x, int _max_x, int _min_y, int _max_y, hResizeFunc _hResize) : ParallelLoopBody(),
        src(_src), src_step(_src_step), src_width(_src_width), src_height(_src_height),
        dst(_dst), dst_step(_dst_step), dst_width(_dst_width), dst_height(_dst_height),
        cn(_cn), xoffsets(_xoffsets), yoffsets(_yoffsets), xcoeffs(_xcoeffs), ycoeffs(_ycoeffs),
        min_x(_min_x), max_x(_max_x), min_y(_min_y), max_y(_max_y), hResize(_hResize) {}

    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        // Ring buffer holding interp_y_len horizontally-resized rows.
        AutoBuffer<fixedpoint> linebuf(interp_y_len * dst_width * cn);
        int last_eval = - interp_y_len; // source index of the newest buffered row
        int evalbuf_start = 0;          // ring slot holding the oldest buffered row
        int rmin_y = max(min_y, range.start);
        int rmax_y = min(max_y, range.end);
        if (range.start < min_y)
        {
            // Top border: pre-evaluate source row 0; destination rows
            // [range.start, rmin_y) are plain copies of it.
            last_eval = 1 - interp_y_len;
            evalbuf_start = 1;
            hResize((ET*)src, cn, xoffsets, xcoeffs, linebuf.data(), min_x, max_x, dst_width);
        }
        int dy = range.start;
        for (; dy < rmin_y; dy++)
            vlineSet<ET, FT>(linebuf.data(), (ET*)(dst + dst_step * dy), dst_width*cn);
        for (; dy < rmax_y; dy++)
        {
            int &iy = yoffsets[dy];

            // Horizontally resize only the source rows [iy, iy+interp_y_len)
            // not already buffered; rows kept from the previous dy are reused.
            int i;
            for (i = max(iy, last_eval + interp_y_len); i < min(iy + interp_y_len, src_height); i++, evalbuf_start = (evalbuf_start + 1) % interp_y_len)
                hResize((ET*)(src + i * src_step), cn, xoffsets, xcoeffs, linebuf.data() + evalbuf_start*(dst_width * cn), min_x, max_x, dst_width);
            // Re-anchor the ring start for rows clamped at the bottom border.
            evalbuf_start = (evalbuf_start + max(iy, src_height - interp_y_len) - max(last_eval, src_height - interp_y_len)) % interp_y_len;
            last_eval = iy;

            // Rotate this row's coefficients so they line up with the ring layout.
            fixedpoint curcoeffs[interp_y_len];
            for (i = 0; i < evalbuf_start; i++)
                curcoeffs[i] = ycoeffs[ dy*interp_y_len - evalbuf_start + interp_y_len + i];
            for (; i < interp_y_len; i++)
                curcoeffs[i] = ycoeffs[ dy*interp_y_len - evalbuf_start + i];

            vlineResize<ET, FT, interp_y_len>(linebuf.data(), dst_width*cn, curcoeffs, (ET*)(dst + dst_step * dy), dst_width*cn);
        }
        // Bottom border: rows [rmax_y, range.end) copy the last source row,
        // either already in the ring buffer or evaluated here.
        fixedpoint *endline = linebuf.data();
        if (last_eval + interp_y_len > src_height)
            endline += dst_width*cn*((evalbuf_start + src_height - 1 - last_eval) % interp_y_len);
        else
            hResize((ET*)(src + (src_height - 1) * src_step), cn, xoffsets, xcoeffs, endline, min_x, max_x, dst_width);
        for (; dy < range.end; dy++)
            vlineSet<ET, FT>(endline, (ET*)(dst + dst_step * dy), dst_width*cn);
#if CV_SIMD
        vx_cleanup();
#endif
    }

private:
    const uchar* src;
    size_t src_step;
    int src_width, src_height;
    uchar* dst;
    size_t dst_step;
    int dst_width, dst_height, cn;
    int *xoffsets, *yoffsets;
    fixedpoint *xcoeffs, *ycoeffs;
    int min_x, max_x, min_y, max_y;
    hResizeFunc hResize;

    // Non-copyable: shares raw pointers into caller-owned buffers.
    resize_bitExactInvoker(const resize_bitExactInvoker&);
    resize_bitExactInvoker& operator=(const resize_bitExactInvoker&);
};
851
// Bit-exact resize driver.  Precomputes per-column / per-row source offsets
// and fixed-point coefficients using the `interpolation` policy, picks a
// horizontal kernel specialized for the channel count, and runs the vertical
// pass in parallel over destination rows.
template <typename ET, typename interpolation>
void resize_bitExact(const uchar* src, size_t src_step, int src_width, int src_height,
                     uchar* dst, size_t dst_step, int dst_width, int dst_height,
                     int cn, double inv_scale_x, double inv_scale_y)
{
    typedef typename fixedtype<ET, interpolation::needsign>::type fixedpoint;
    void(*hResize)(ET* src, int cn, int *ofst, fixedpoint* m, fixedpoint* dst, int dst_min, int dst_max, int dst_width);
    // The <..., true, N> variants assume src_width > taps; the false variants
    // are the guarded path for tiny sources.
    switch (cn)
    {
    case 1: hResize = src_width > interpolation::len ? hlineResizeCn<ET, fixedpoint, interpolation::len, true, 1> : hlineResizeCn<ET, fixedpoint, interpolation::len, false, 1>; break;
    case 2: hResize = src_width > interpolation::len ? hlineResizeCn<ET, fixedpoint, interpolation::len, true, 2> : hlineResizeCn<ET, fixedpoint, interpolation::len, false, 2>; break;
    case 3: hResize = src_width > interpolation::len ? hlineResizeCn<ET, fixedpoint, interpolation::len, true, 3> : hlineResizeCn<ET, fixedpoint, interpolation::len, false, 3>; break;
    case 4: hResize = src_width > interpolation::len ? hlineResizeCn<ET, fixedpoint, interpolation::len, true, 4> : hlineResizeCn<ET, fixedpoint, interpolation::len, false, 4>; break;
    default: hResize = src_width > interpolation::len ? hlineResize<ET, fixedpoint, interpolation::len, true> : hlineResize<ET, fixedpoint, interpolation::len, false> ; break;
    }

    interpolation interp_x(inv_scale_x, src_width, dst_width);
    interpolation interp_y(inv_scale_y, src_height, dst_height);

    // One allocation carved into four arrays: x offsets, y offsets,
    // x coefficients, y coefficients.
    AutoBuffer<uchar> buf( dst_width * sizeof(int) +
                           dst_height * sizeof(int) +
                           dst_width * interp_x.len*sizeof(fixedpoint) +
                           dst_height * interp_y.len * sizeof(fixedpoint) );
    int* xoffsets = (int*)buf.data();
    int* yoffsets = xoffsets + dst_width;
    fixedpoint* xcoeffs = (fixedpoint*)(yoffsets + dst_height);
    fixedpoint* ycoeffs = xcoeffs + dst_width * interp_x.len;

    int min_x, max_x, min_y, max_y;
    for (int dx = 0; dx < dst_width; dx++)
        interp_x.getCoeffs(dx, xoffsets+dx, xcoeffs+dx*interp_x.len);
    interp_x.getMinMax(min_x, max_x);
    for (int dy = 0; dy < dst_height; dy++)
        interp_y.getCoeffs(dy, yoffsets+dy, ycoeffs+dy*interp_y.len);
    interp_y.getMinMax(min_y, max_y);

    resize_bitExactInvoker<ET, fixedpoint, interpolation::len> invoker(src, src_step, src_width, src_height, dst, dst_step, dst_width, dst_height, cn,
                                                                       xoffsets, yoffsets, xcoeffs, ycoeffs, min_x, max_x, min_y, max_y, hResize);
    Range range(0, dst_height);
    parallel_for_(range, invoker, dst_width * dst_height / (double)(1 << 16));
}
893
894 typedef void(*be_resize_func)(const uchar* src, size_t src_step, int src_width, int src_height,
895 uchar* dst, size_t dst_step, int dst_width, int dst_height,
896 int cn, double inv_scale_x, double inv_scale_y);
897
898 }
899
900 namespace cv
901 {
902
903 /************** interpolation formulas and tables ***************/
904
905 const int INTER_RESIZE_COEF_BITS=11;
906 const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;
907
// Fill coeffs[0..3] with the 4-tap cubic-convolution kernel weights (A = -0.75,
// the Keys variant) for a sampling position x in [0,1) between the two middle
// taps.  The last weight is derived from the others so all four sum to 1.
static inline void interpolateCubic( float x, float* coeffs )
{
    const float A = -0.75f;   // "a" parameter of the cubic convolution kernel

    const float xp1 = x + 1;  // distance from the leftmost tap
    const float xm1 = 1 - x;  // distance from the third tap
    coeffs[0] = ((A*xp1 - 5*A)*xp1 + 8*A)*xp1 - 4*A;
    coeffs[1] = ((A + 2)*x - (A + 3))*x*x + 1;
    coeffs[2] = ((A + 2)*xm1 - (A + 3))*xm1*xm1 + 1;
    coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
}
917
interpolateLanczos4(float x,float * coeffs)918 static inline void interpolateLanczos4( float x, float* coeffs )
919 {
920 static const double s45 = 0.70710678118654752440084436210485;
921 static const double cs[][2]=
922 {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}};
923
924 float sum = 0;
925 double y0=-(x+3)*CV_PI*0.25, s0 = std::sin(y0), c0= std::cos(y0);
926 for(int i = 0; i < 8; i++ )
927 {
928 float y0_ = (x+3-i);
929 if (fabs(y0_) >= 1e-6f)
930 {
931 double y = -y0_*CV_PI*0.25;
932 coeffs[i] = (float)((cs[i][0]*s0 + cs[i][1]*c0)/(y*y));
933 }
934 else
935 {
936 // special handling for 'x' values:
937 // - ~0.0: 0 0 0 1 0 0 0 0
938 // - ~1.0: 0 0 0 0 1 0 0 0
939 coeffs[i] = 1e30f;
940 }
941 sum += coeffs[i];
942 }
943
944 sum = 1.f/sum;
945 for(int i = 0; i < 8; i++ )
946 coeffs[i] *= sum;
947 }
948
// Final cast stage of the resize kernels: converts the intermediate
// (accumulator) type ST to the destination type DT with saturation.
template<typename ST, typename DT> struct Cast
{
    typedef ST type1;
    typedef DT rtype;

    DT operator()(ST val) const { return saturate_cast<DT>(val); }
};
956
// Fixed-point variant of Cast: `val` carries `bits` fractional bits.  Adding
// DELTA (0.5 in fixed-point) rounds to nearest before the fraction is shifted
// away and the result is saturated into DT.
template<typename ST, typename DT, int bits> struct FixedPtCast
{
    typedef ST type1;
    typedef DT rtype;
    enum { SHIFT = bits, DELTA = 1 << (bits-1) };

    DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
};
965
966 /****************************************************************************************\
967 * Resize *
968 \****************************************************************************************/
969
// Parallel body of nearest-neighbour resize.  x_ofs holds the precomputed
// source BYTE offset for every destination column; each row just gathers
// pixels at those offsets from the nearest source row.  The switch provides
// fast copy paths for common pixel sizes (1..4, 6, 8, 12 bytes).
class resizeNNInvoker :
    public ParallelLoopBody
{
public:
    resizeNNInvoker(const Mat& _src, Mat &_dst, int *_x_ofs, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs),
        ify(_ify)
    {
    }

    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        Size ssize = src.size(), dsize = dst.size();
        int y, x, pix_size = (int)src.elemSize();

        for( y = range.start; y < range.end; y++ )
        {
            uchar* D = dst.data + dst.step*y;
            // Nearest source row, clamped to the last row.
            int sy = std::min(cvFloor(y*ify), ssize.height-1);
            const uchar* S = src.ptr(sy);

            switch( pix_size )
            {
            case 1:
                // 2x unrolled single-byte copy.
                for( x = 0; x <= dsize.width - 2; x += 2 )
                {
                    uchar t0 = S[x_ofs[x]];
                    uchar t1 = S[x_ofs[x+1]];
                    D[x] = t0;
                    D[x+1] = t1;
                }

                for( ; x < dsize.width; x++ )
                    D[x] = S[x_ofs[x]];
                break;
            case 2:
                for( x = 0; x < dsize.width; x++ )
                    *(ushort*)(D + x*2) = *(ushort*)(S + x_ofs[x]);
                break;
            case 3:
                for( x = 0; x < dsize.width; x++, D += 3 )
                {
                    const uchar* _tS = S + x_ofs[x];
                    D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2];
                }
                break;
            case 4:
                for( x = 0; x < dsize.width; x++ )
                    *(int*)(D + x*4) = *(int*)(S + x_ofs[x]);
                break;
            case 6:
                for( x = 0; x < dsize.width; x++, D += 6 )
                {
                    const ushort* _tS = (const ushort*)(S + x_ofs[x]);
                    ushort* _tD = (ushort*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                }
                break;
            case 8:
                for( x = 0; x < dsize.width; x++, D += 8 )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1];
                }
                break;
            case 12:
                for( x = 0; x < dsize.width; x++, D += 12 )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                }
                break;
            default:
                // Generic byte-by-byte copy for any other pixel size.
                for( x = 0; x < dsize.width; x++, D += pix_size )
                {
                    const uchar* _tS = S + x_ofs[x];
                    for (int k = 0; k < pix_size; k++)
                        D[k] = _tS[k];
                }
            }
        }
    }

private:
    const Mat& src;
    Mat& dst;
    int* x_ofs;    // per-column source byte offsets (owned by caller)
    double ify;    // inverse vertical scale: src_y = floor(dst_y * ify)

    resizeNNInvoker(const resizeNNInvoker&);
    resizeNNInvoker& operator=(const resizeNNInvoker&);
};
1064
// Nearest-neighbour resize (non bit-exact path).  Precomputes the source byte
// offset of every destination column, then processes rows in parallel;
// AVX2 / SSE4.1 specializations are dispatched for 2- and 4-byte pixels.
static void
resizeNN( const Mat& src, Mat& dst, double fx, double fy )
{
    Size ssize = src.size(), dsize = dst.size();
    AutoBuffer<int> _x_ofs(dsize.width);
    int* x_ofs = _x_ofs.data();
    int pix_size = (int)src.elemSize();
    double ifx = 1./fx, ify = 1./fy;
    int x;

    // x_ofs[x] = byte offset of the source pixel used by destination column x,
    // clamped to the last column.
    for( x = 0; x < dsize.width; x++ )
    {
        int sx = cvFloor(x*ifx);
        x_ofs[x] = std::min(sx, ssize.width-1)*pix_size;
    }

    Range range(0, dsize.height);
#if CV_TRY_AVX2
    if(CV_CPU_HAS_SUPPORT_AVX2 && ((pix_size == 2) || (pix_size == 4)))
    {
        if(pix_size == 2)
            opt_AVX2::resizeNN2_AVX2(range, src, dst, x_ofs, ify);
        else
            opt_AVX2::resizeNN4_AVX2(range, src, dst, x_ofs, ify);
    }
    else
#endif
#if CV_TRY_SSE4_1
    if(CV_CPU_HAS_SUPPORT_SSE4_1 && ((pix_size == 2) || (pix_size == 4)))
    {
        if(pix_size == 2)
            opt_SSE4_1::resizeNN2_SSE4_1(range, src, dst, x_ofs, ify);
        else
            opt_SSE4_1::resizeNN4_SSE4_1(range, src, dst, x_ofs, ify);
    }
    else
#endif
    {
        resizeNNInvoker invoker(src, dst, x_ofs, ify);
        parallel_for_(range, invoker, dst.total()/(double)(1<<16));
    }
}
1107
// Parallel body of bit-exact nearest-neighbour resize.  Row mapping uses
// 16.16 fixed-point (sy = (ify*y + ify0) >> 16).  Unlike resizeNNInvoker,
// x_ofse holds ELEMENT offsets (not bytes), which enables SIMD gather
// (vx_lut) for 1/2/4/8-byte pixels; the 3/6/12-byte cases multiply by the
// pixel size inline.
class resizeNN_bitexactInvoker : public ParallelLoopBody
{
public:
    resizeNN_bitexactInvoker(const Mat& _src, Mat& _dst, int* _x_ofse, int _ify, int _ify0)
        : src(_src), dst(_dst), x_ofse(_x_ofse), ify(_ify), ify0(_ify0) {}

    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        Size ssize = src.size(), dsize = dst.size();
        int pix_size = (int)src.elemSize();
        for( int y = range.start; y < range.end; y++ )
        {
            uchar* D = dst.ptr(y);
            int _sy = (ify * y + ify0) >> 16;   // 16.16 fixed-point row map
            int sy = std::min(_sy, ssize.height-1);
            const uchar* S = src.ptr(sy);

            int x = 0;
            switch( pix_size )
            {
            case 1:
#if CV_SIMD
                for( ; x <= dsize.width - v_uint8::nlanes; x += v_uint8::nlanes )
                    v_store(D + x, vx_lut(S, x_ofse + x));
#endif
                for( ; x < dsize.width; x++ )
                    D[x] = S[x_ofse[x]];
                break;
            case 2:
#if CV_SIMD
                for( ; x <= dsize.width - v_uint16::nlanes; x += v_uint16::nlanes )
                    v_store((ushort*)D + x, vx_lut((ushort*)S, x_ofse + x));
#endif
                for( ; x < dsize.width; x++ )
                    *((ushort*)D + x) = *((ushort*)S + x_ofse[x]);
                break;
            case 3:
                for( ; x < dsize.width; x++, D += 3 )
                {
                    const uchar* _tS = S + x_ofse[x] * 3;
                    D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2];
                }
                break;
            case 4:
#if CV_SIMD
                for( ; x <= dsize.width - v_uint32::nlanes; x += v_uint32::nlanes )
                    v_store((uint32_t*)D + x, vx_lut((uint32_t*)S, x_ofse + x));
#endif
                for( ; x < dsize.width; x++ )
                    *((uint32_t*)D + x) = *((uint32_t*)S + x_ofse[x]);
                break;
            case 6:
                for( ; x < dsize.width; x++, D += 6 )
                {
                    const ushort* _tS = (const ushort*)(S + x_ofse[x]*6);
                    ushort* _tD = (ushort*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                }
                break;
            case 8:
#if CV_SIMD
                for( ; x <= dsize.width - v_uint64::nlanes; x += v_uint64::nlanes )
                    v_store((uint64_t*)D + x, vx_lut((uint64_t*)S, x_ofse + x));
#endif
                for( ; x < dsize.width; x++ )
                    *((uint64_t*)D + x) = *((uint64_t*)S + x_ofse[x]);
                break;
            case 12:
                for( ; x < dsize.width; x++, D += 12 )
                {
                    const int* _tS = (const int*)(S + x_ofse[x]*12);
                    int* _tD = (int*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                }
                break;
            default:
                // Generic byte-by-byte copy for any other pixel size.
                for( x = 0; x < dsize.width; x++, D += pix_size )
                {
                    const uchar* _tS = S + x_ofse[x] * pix_size;
                    for (int k = 0; k < pix_size; k++)
                        D[k] = _tS[k];
                }
            }
        }
    }
private:
    const Mat& src;
    Mat& dst;
    int* x_ofse;     // per-column source offsets in ELEMENTS (caller-owned)
    const int ify;   // 16.16 fixed-point inverse vertical scale
    const int ify0;  // 16.16 rounding offset (pixel-center convention)
};
1200
// Bit-exact nearest-neighbour resize.  Scales are derived from the actual
// src/dst sizes (the fx/fy arguments are ignored) in 16.16 fixed point, with
// the half-step offset placing samples at pixel centers, matching Pillow and
// scikit-image.
// NOTE(review): (size << 16) is evaluated in int, so this path assumes each
// dimension fits in 15 bits (< 32768) — confirm callers guarantee this.
static void resizeNN_bitexact( const Mat& src, Mat& dst, double /*fx*/, double /*fy*/ )
{
    Size ssize = src.size(), dsize = dst.size();
    int ifx = ((ssize.width << 16) + dsize.width / 2) / dsize.width;    // 16bit fixed-point arithmetic
    int ifx0 = ifx / 2 - 1;                                             // This method uses center pixel coordinate as Pillow and scikit-images do.
    int ify = ((ssize.height << 16) + dsize.height / 2) / dsize.height;
    int ify0 = ify / 2 - 1;

    // SIMD-aligned buffer of per-column source element offsets.
    cv::utils::BufferArea area;
    int* x_ofse = 0;
    area.allocate(x_ofse, dsize.width, CV_SIMD_WIDTH);
    area.commit();

    for( int x = 0; x < dsize.width; x++ )
    {
        int sx = (ifx * x + ifx0) >> 16;
        x_ofse[x] = std::min(sx, ssize.width-1);    // offset in element (not byte)
    }
    Range range(0, dsize.height);
    resizeNN_bitexactInvoker invoker(src, dst, x_ofse, ify, ify0);
    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
}
1223
// Fallback vertical-resize "vector op": handles no elements and reports 0
// pixels processed so the scalar loop covers the entire row.
struct VResizeNoVec
{
    template<typename WT, typename T, typename BT>
    int operator()(const WT** /*src*/, T* /*dst*/, const BT* /*beta*/, int /*width*/) const
    {
        return 0;
    }
};
1232
// Fallback horizontal-resize "vector op": handles no elements and reports 0
// pixels processed so the scalar loop covers the entire row.
struct HResizeNoVec
{
    template<typename T, typename WT, typename AT> inline
    int operator()(const T** /*src*/, WT** /*dst*/, int /*count*/, const int* /*xofs*/,
                   const AT* /*alpha*/, int /*smax*/, int /*dmax*/, int /*cn*/, int /*xmin*/, int /*xmax*/) const
    {
        return 0;
    }
};
1242
1243 #if CV_SIMD
1244
// Vertical linear pass: blends two rows of 32-bit fixed-point intermediates
// with 16-bit weights beta[0..1] and packs to uchar with rounding.  Values
// are pre-shifted by 4 so they fit 16-bit lanes for v_mul_hi.  Returns the
// number of elements processed; the scalar caller finishes the tail.
struct VResizeLinearVec_32s8u
{
    int operator()(const int** src, uchar* dst, const short* beta, int width) const
    {
        const int *S0 = src[0], *S1 = src[1];
        int x = 0;
        v_int16 b0 = vx_setall_s16(beta[0]), b1 = vx_setall_s16(beta[1]);

        // Aligned and unaligned variants of the same computation.
        if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 )
            for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes)
                v_store(dst + x, v_rshr_pack_u<2>(v_mul_hi(v_pack(vx_load_aligned(S0 + x                    ) >> 4, vx_load_aligned(S0 + x +     v_int32::nlanes) >> 4), b0) +
                                                  v_mul_hi(v_pack(vx_load_aligned(S1 + x                    ) >> 4, vx_load_aligned(S1 + x +     v_int32::nlanes) >> 4), b1),
                                                  v_mul_hi(v_pack(vx_load_aligned(S0 + x + 2 * v_int32::nlanes) >> 4, vx_load_aligned(S0 + x + 3 * v_int32::nlanes) >> 4), b0) +
                                                  v_mul_hi(v_pack(vx_load_aligned(S1 + x + 2 * v_int32::nlanes) >> 4, vx_load_aligned(S1 + x + 3 * v_int32::nlanes) >> 4), b1)));
        else
            for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes)
                v_store(dst + x, v_rshr_pack_u<2>(v_mul_hi(v_pack(vx_load(S0 + x                    ) >> 4, vx_load(S0 + x +     v_int32::nlanes) >> 4), b0) +
                                                  v_mul_hi(v_pack(vx_load(S1 + x                    ) >> 4, vx_load(S1 + x +     v_int32::nlanes) >> 4), b1),
                                                  v_mul_hi(v_pack(vx_load(S0 + x + 2 * v_int32::nlanes) >> 4, vx_load(S0 + x + 3 * v_int32::nlanes) >> 4), b0) +
                                                  v_mul_hi(v_pack(vx_load(S1 + x + 2 * v_int32::nlanes) >> 4, vx_load(S1 + x + 3 * v_int32::nlanes) >> 4), b1)));

        // Half-width tail.
        for( ; x < width - v_int16::nlanes; x += v_int16::nlanes)
            v_rshr_pack_u_store<2>(dst + x, v_mul_hi(v_pack(vx_load(S0 + x) >> 4, vx_load(S0 + x + v_int32::nlanes) >> 4), b0) +
                                            v_mul_hi(v_pack(vx_load(S1 + x) >> 4, vx_load(S1 + x + v_int32::nlanes) >> 4), b1));

        return x;
    }
};
1273
// Vertical linear pass: blends two float rows with weights beta[0..1] and
// stores rounded, saturated ushort results.  Returns elements processed.
struct VResizeLinearVec_32f16u
{
    int operator()(const float** src, ushort* dst, const float* beta, int width) const
    {
        const float *S0 = src[0], *S1 = src[1];
        int x = 0;

        v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]);

        if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 )
            for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes)
                v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load_aligned(S0 + x                    ), b0, vx_load_aligned(S1 + x                    ) * b1)),
                                          v_round(v_muladd(vx_load_aligned(S0 + x + v_float32::nlanes), b0, vx_load_aligned(S1 + x + v_float32::nlanes) * b1))));
        else
            for (; x <= width - v_uint16::nlanes; x += v_uint16::nlanes)
                v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x                    ), b0, vx_load(S1 + x                    ) * b1)),
                                          v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, vx_load(S1 + x + v_float32::nlanes) * b1))));
        // Half-width tail: one float vector, low half stored.
        for( ; x < width - v_float32::nlanes; x += v_float32::nlanes)
        {
            v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1));
            v_store_low(dst + x, v_pack_u(t0, t0));
        }

        return x;
    }
};
1300
// Vertical linear pass: blends two float rows with weights beta[0..1] and
// stores rounded, saturated short results.  Returns elements processed.
struct VResizeLinearVec_32f16s
{
    int operator()(const float** src, short* dst, const float* beta, int width) const
    {
        const float *S0 = src[0], *S1 = src[1];
        int x = 0;

        v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]);

        if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 )
            for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes)
                v_store(dst + x, v_pack(v_round(v_muladd(vx_load_aligned(S0 + x                    ), b0, vx_load_aligned(S1 + x                    ) * b1)),
                                        v_round(v_muladd(vx_load_aligned(S0 + x + v_float32::nlanes), b0, vx_load_aligned(S1 + x + v_float32::nlanes) * b1))));
        else
            for (; x <= width - v_int16::nlanes; x += v_int16::nlanes)
                v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x                    ), b0, vx_load(S1 + x                    ) * b1)),
                                        v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, vx_load(S1 + x + v_float32::nlanes) * b1))));
        // Half-width tail: one float vector, low half stored.
        for( ; x < width - v_float32::nlanes; x += v_float32::nlanes)
        {
            v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1));
            v_store_low(dst + x, v_pack(t0, t0));
        }

        return x;
    }
};
1327
// Vertical linear pass: blends two float rows with weights beta[0..1] and
// stores float results.  Returns elements processed.
struct VResizeLinearVec_32f
{
    int operator()(const float** src, float* dst, const float* beta, int width) const
    {
        const float *S0 = src[0], *S1 = src[1];
        int x = 0;

        v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]);

        if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 )
            for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes)
                v_store(dst + x, v_muladd(vx_load_aligned(S0 + x), b0, vx_load_aligned(S1 + x) * b1));
        else
            for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes)
                v_store(dst + x, v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1));

        return x;
    }
};
1347
1348
// Vertical cubic pass: blends four rows of 32-bit fixed-point intermediates
// with weights beta[0..3].  The fixed-point inputs and weights are folded into
// float via `scale`, then rounded and packed to uchar.  Returns elements processed.
struct VResizeCubicVec_32s8u
{
    int operator()(const int** src, uchar* dst, const short* beta, int width) const
    {
        const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        int x = 0;
        // Undo the two fixed-point scalings (intermediate values and weights).
        float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE);

        v_float32 b0 = vx_setall_f32(beta[0] * scale), b1 = vx_setall_f32(beta[1] * scale),
                  b2 = vx_setall_f32(beta[2] * scale), b3 = vx_setall_f32(beta[3] * scale);

        if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&(CV_SIMD_WIDTH - 1)) == 0 )
            for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes)
                v_pack_u_store(dst + x, v_pack(v_round(v_muladd(v_cvt_f32(vx_load_aligned(S0 + x                    )), b0,
                                                       v_muladd(v_cvt_f32(vx_load_aligned(S1 + x                    )), b1,
                                                       v_muladd(v_cvt_f32(vx_load_aligned(S2 + x                    )), b2,
                                                                v_cvt_f32(vx_load_aligned(S3 + x                    )) * b3)))),
                                               v_round(v_muladd(v_cvt_f32(vx_load_aligned(S0 + x + v_float32::nlanes)), b0,
                                                       v_muladd(v_cvt_f32(vx_load_aligned(S1 + x + v_float32::nlanes)), b1,
                                                       v_muladd(v_cvt_f32(vx_load_aligned(S2 + x + v_float32::nlanes)), b2,
                                                                v_cvt_f32(vx_load_aligned(S3 + x + v_float32::nlanes)) * b3))))));
        else
            for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes)
                v_pack_u_store(dst + x, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + x                    )), b0,
                                                       v_muladd(v_cvt_f32(vx_load(S1 + x                    )), b1,
                                                       v_muladd(v_cvt_f32(vx_load(S2 + x                    )), b2,
                                                                v_cvt_f32(vx_load(S3 + x                    )) * b3)))),
                                               v_round(v_muladd(v_cvt_f32(vx_load(S0 + x + v_float32::nlanes)), b0,
                                                       v_muladd(v_cvt_f32(vx_load(S1 + x + v_float32::nlanes)), b1,
                                                       v_muladd(v_cvt_f32(vx_load(S2 + x + v_float32::nlanes)), b2,
                                                                v_cvt_f32(vx_load(S3 + x + v_float32::nlanes)) * b3))))));
        return x;
    }
};
1383
// Vertical cubic pass: blends four float rows with weights beta[0..3] and
// stores rounded, saturated ushort results.  Returns elements processed.
struct VResizeCubicVec_32f16u
{
    int operator()(const float** src, ushort* dst, const float* beta, int width) const
    {
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        int x = 0;
        v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]),
                  b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]);

        for (; x <= width - v_uint16::nlanes; x += v_uint16::nlanes)
            v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x                    ), b0,
                                              v_muladd(vx_load(S1 + x                    ), b1,
                                              v_muladd(vx_load(S2 + x                    ), b2,
                                                       vx_load(S3 + x                    ) * b3)))),
                                      v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0,
                                              v_muladd(vx_load(S1 + x + v_float32::nlanes), b1,
                                              v_muladd(vx_load(S2 + x + v_float32::nlanes), b2,
                                                       vx_load(S3 + x + v_float32::nlanes) * b3))))));

        return x;
    }
};
1406
// Vertical cubic pass: blends four float rows with weights beta[0..3] and
// stores rounded, saturated short results.  Returns elements processed.
struct VResizeCubicVec_32f16s
{
    int operator()(const float** src, short* dst, const float* beta, int width) const
    {
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        int x = 0;
        v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]),
                  b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]);

        for (; x <= width - v_int16::nlanes; x += v_int16::nlanes)
            v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x                    ), b0,
                                            v_muladd(vx_load(S1 + x                    ), b1,
                                            v_muladd(vx_load(S2 + x                    ), b2,
                                                     vx_load(S3 + x                    ) * b3)))),
                                    v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0,
                                            v_muladd(vx_load(S1 + x + v_float32::nlanes), b1,
                                            v_muladd(vx_load(S2 + x + v_float32::nlanes), b2,
                                                     vx_load(S3 + x + v_float32::nlanes) * b3))))));

        return x;
    }
};
1429
// Vertical cubic pass: blends four float rows with weights beta[0..3] and
// stores float results.  Returns elements processed.
struct VResizeCubicVec_32f
{
    int operator()(const float** src, float* dst, const float* beta, int width) const
    {
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        int x = 0;
        v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]),
                  b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]);

        for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes)
            v_store(dst + x, v_muladd(vx_load(S0 + x), b0,
                             v_muladd(vx_load(S1 + x), b1,
                             v_muladd(vx_load(S2 + x), b2,
                                      vx_load(S3 + x) * b3))));

        return x;
    }
};
1448
1449
1450 #if CV_TRY_SSE4_1
1451
// Lanczos4 vertical pass (float -> ushort): dispatches to the SSE4.1
// implementation at runtime; reports 0 processed when SSE4.1 is unavailable
// so the scalar code runs instead.
struct VResizeLanczos4Vec_32f16u
{
    int operator()(const float** src, ushort* dst, const float* beta, int width) const
    {
        if (CV_CPU_HAS_SUPPORT_SSE4_1)
            return opt_SSE4_1::VResizeLanczos4Vec_32f16u_SSE41(src, dst, beta, width);
        else
            return 0;
    }
};
1462
1463 #else
1464
// Lanczos4 vertical pass: blends eight float rows with weights beta[0..7] and
// stores rounded, saturated ushort results.  Returns elements processed.
struct VResizeLanczos4Vec_32f16u
{
    int operator()(const float** src, ushort* dst, const float* beta, int width ) const
    {
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        int x = 0;
        v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]),
                  b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]),
                  b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]),
                  b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]);

        for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes)
            v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x                    ), b0,
                                              v_muladd(vx_load(S1 + x                    ), b1,
                                              v_muladd(vx_load(S2 + x                    ), b2,
                                              v_muladd(vx_load(S3 + x                    ), b3,
                                              v_muladd(vx_load(S4 + x                    ), b4,
                                              v_muladd(vx_load(S5 + x                    ), b5,
                                              v_muladd(vx_load(S6 + x                    ), b6,
                                                       vx_load(S7 + x                    ) * b7)))))))),
                                      v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0,
                                              v_muladd(vx_load(S1 + x + v_float32::nlanes), b1,
                                              v_muladd(vx_load(S2 + x + v_float32::nlanes), b2,
                                              v_muladd(vx_load(S3 + x + v_float32::nlanes), b3,
                                              v_muladd(vx_load(S4 + x + v_float32::nlanes), b4,
                                              v_muladd(vx_load(S5 + x + v_float32::nlanes), b5,
                                              v_muladd(vx_load(S6 + x + v_float32::nlanes), b6,
                                                       vx_load(S7 + x + v_float32::nlanes) * b7))))))))));

        return x;
    }
};
1498
1499 #endif
1500
// Lanczos4 vertical pass: blends eight float rows with weights beta[0..7] and
// stores rounded, saturated short results.  Returns elements processed.
struct VResizeLanczos4Vec_32f16s
{
    int operator()(const float** src, short* dst, const float* beta, int width ) const
    {
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        int x = 0;
        v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]),
                  b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]),
                  b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]),
                  b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]);

        for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes)
            v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x                    ), b0,
                                            v_muladd(vx_load(S1 + x                    ), b1,
                                            v_muladd(vx_load(S2 + x                    ), b2,
                                            v_muladd(vx_load(S3 + x                    ), b3,
                                            v_muladd(vx_load(S4 + x                    ), b4,
                                            v_muladd(vx_load(S5 + x                    ), b5,
                                            v_muladd(vx_load(S6 + x                    ), b6,
                                                     vx_load(S7 + x                    ) * b7)))))))),
                                    v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0,
                                            v_muladd(vx_load(S1 + x + v_float32::nlanes), b1,
                                            v_muladd(vx_load(S2 + x + v_float32::nlanes), b2,
                                            v_muladd(vx_load(S3 + x + v_float32::nlanes), b3,
                                            v_muladd(vx_load(S4 + x + v_float32::nlanes), b4,
                                            v_muladd(vx_load(S5 + x + v_float32::nlanes), b5,
                                            v_muladd(vx_load(S6 + x + v_float32::nlanes), b6,
                                                     vx_load(S7 + x + v_float32::nlanes) * b7))))))))));

        return x;
    }
};
1534
// Lanczos4 vertical pass: blends eight float rows with weights beta[0..7] and
// stores float results.  Returns elements processed.
struct VResizeLanczos4Vec_32f
{
    int operator()(const float** src, float* dst, const float* beta, int width ) const
    {
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        int x = 0;

        v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]),
                  b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]),
                  b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]),
                  b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]);

        for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes)
            v_store(dst + x, v_muladd(vx_load(S0 + x), b0,
                             v_muladd(vx_load(S1 + x), b1,
                             v_muladd(vx_load(S2 + x), b2,
                             v_muladd(vx_load(S3 + x), b3,
                             v_muladd(vx_load(S4 + x), b4,
                             v_muladd(vx_load(S5 + x), b5,
                             v_muladd(vx_load(S6 + x), b6,
                                      vx_load(S7 + x) * b7))))))));

        return x;
    }
};
1561
1562 #else
1563
1564 typedef VResizeNoVec VResizeLinearVec_32s8u;
1565 typedef VResizeNoVec VResizeLinearVec_32f16u;
1566 typedef VResizeNoVec VResizeLinearVec_32f16s;
1567 typedef VResizeNoVec VResizeLinearVec_32f;
1568
1569 typedef VResizeNoVec VResizeCubicVec_32s8u;
1570 typedef VResizeNoVec VResizeCubicVec_32f16u;
1571 typedef VResizeNoVec VResizeCubicVec_32f16s;
1572 typedef VResizeNoVec VResizeCubicVec_32f;
1573
1574 typedef VResizeNoVec VResizeLanczos4Vec_32f16u;
1575 typedef VResizeNoVec VResizeLanczos4Vec_32f16s;
1576 typedef VResizeNoVec VResizeLanczos4Vec_32f;
1577
1578 #endif
1579
1580 #if CV_SIMD128
1581
// Horizontal linear pass, 4 output pixels per iteration.  For each output
// pixel it gathers the two source taps at xofs[dx] and xofs[dx]+cn, and blends
// them with the deinterleaved alpha weights.  Processes up to `count` rows,
// two at a time; returns the number of output pixels handled per row (the
// scalar caller finishes [len0, dmax)).
template<typename ST, typename DT, typename AT, typename DVT>
struct HResizeLinearVec_X4
{
    int operator()(const ST** src, DT** dst, int count, const int* xofs,
                   const AT* alpha, int, int, int cn, int, int xmax) const
    {
        const int nlanes = 4;
        const int len0 = xmax & -nlanes;   // largest multiple of 4 within xmax
        int dx = 0, k = 0;

        // Two rows per iteration.
        for( ; k <= (count - 2); k+=2 )
        {
            const ST *S0 = src[k];
            DT *D0 = dst[k];
            const ST *S1 = src[k+1];
            DT *D1 = dst[k+1];

            for( dx = 0; dx < len0; dx += nlanes )
            {
                int sx0 = xofs[dx+0];
                int sx1 = xofs[dx+1];
                int sx2 = xofs[dx+2];
                int sx3 = xofs[dx+3];
                DVT a_even;
                DVT a_odd;

                // alpha stores (w0, w1) interleaved per output pixel.
                v_load_deinterleave(&alpha[dx*2], a_even, a_odd);
                DVT s0(S0[sx0], S0[sx1], S0[sx2], S0[sx3]);
                DVT s1(S0[sx0+cn], S0[sx1+cn], S0[sx2+cn], S0[sx3+cn]);
                DVT s0_u(S1[sx0], S1[sx1], S1[sx2], S1[sx3]);
                DVT s1_u(S1[sx0+cn], S1[sx1+cn], S1[sx2+cn], S1[sx3+cn]);
                v_store(&D1[dx], s0_u * a_even + s1_u * a_odd);
                v_store(&D0[dx], s0 * a_even + s1 * a_odd);
            }
        }
        // Remaining odd row, if any.
        for( ; k < count; k++ )
        {
            const ST *S = src[k];
            DT *D = dst[k];
            for( dx = 0; dx < len0; dx += nlanes )
            {
                int sx0 = xofs[dx+0];
                int sx1 = xofs[dx+1];
                int sx2 = xofs[dx+2];
                int sx3 = xofs[dx+3];
                DVT a_even;
                DVT a_odd;

                v_load_deinterleave(&alpha[dx*2], a_even, a_odd);
                DVT s0(S[sx0], S[sx1], S[sx2], S[sx3]);
                DVT s1(S[sx0+cn], S[sx1+cn], S[sx2+cn], S[sx3+cn]);
                v_store(&D[dx], s0 * a_even + s1 * a_odd);
            }
        }
        return dx;
    }
};
1639
// Horizontal linear interpolation for 8-bit input producing fixed-point
// 32-bit intermediate rows, with a separate gather strategy per channel
// count (1..4). Coefficients are interleaved 16-bit fixed-point pairs, so
// each output is one v_dotprod of (left,right) samples with (a,b) weights.
// Returns the number of leading destination elements produced; channel
// counts above 4 are left entirely to the scalar code (returns 0).
struct HResizeLinearVecU8_X4
{
    int operator()(const uchar** src, int** dst, int count, const int* xofs,
        const short* alpha/*[xmax]*/, int /*smax*/, int dmax, int cn, int /*xmin*/, int xmax) const
    {
        int dx = 0, k = 0;

        if(cn == 1)
        {
            const int step = 8;
            const int len0 = xmax & -step;
            // Row pairs first, then the odd leftover row.
            for( ; k <= (count - 2); k+=2 )
            {
                const uchar *S0 = src[k];
                int *D0 = dst[k];
                const uchar *S1 = src[k+1];
                int *D1 = dst[k+1];

                for( dx = 0; dx < len0; dx += step )
                {
                    v_int16x8 al = v_load(alpha+dx*2);
                    v_int16x8 ah = v_load(alpha+dx*2+8);
                    v_uint16x8 sl, sh;
                    // Gather 8 (left,right) byte pairs and widen to 16 bits.
                    v_expand(v_lut_pairs(S0, xofs+dx), sl, sh);
                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
                    v_store(&D0[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
                    v_expand(v_lut_pairs(S1, xofs+dx), sl, sh);
                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
                    v_store(&D1[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
                }
            }
            for( ; k < count; k++ )
            {
                const uchar *S = src[k];
                int *D = dst[k];
                for( dx = 0; dx < len0; dx += step )
                {
                    v_int16x8 al = v_load(alpha+dx*2);
                    v_int16x8 ah = v_load(alpha+dx*2+8);
                    v_uint16x8 sl, sh;
                    v_expand(v_lut_pairs(S, xofs+dx), sl, sh);
                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
                    v_store(&D[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
                }
            }
        }
        else if(cn == 2)
        {
            const int step = 8;
            const int len0 = xmax & -step;
            for( ; k <= (count - 2); k+=2 )
            {
                const uchar *S0 = src[k];
                int *D0 = dst[k];
                const uchar *S1 = src[k+1];
                int *D1 = dst[k+1];

                for( dx = 0; dx < len0; dx += step )
                {
                    // For 2 channels, one 4-byte quad holds a whole
                    // (left,right) pixel pair; gather 4 quads, then reshuffle
                    // so the dot product pairs up matching channels.
                    int ofs[4] = { xofs[dx], xofs[dx + 2], xofs[dx + 4], xofs[dx + 6] };
                    v_int16x8 al = v_load(alpha+dx*2);
                    v_int16x8 ah = v_load(alpha+dx*2+8);
                    v_uint16x8 sl, sh;
                    v_expand(v_interleave_pairs(v_lut_quads(S0, ofs)), sl, sh);
                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
                    v_store(&D0[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
                    v_expand(v_interleave_pairs(v_lut_quads(S1, ofs)), sl, sh);
                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
                    v_store(&D1[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
                }
            }
            for( ; k < count; k++ )
            {
                const uchar *S = src[k];
                int *D = dst[k];
                for( dx = 0; dx < len0; dx += step )
                {
                    int ofs[4] = { xofs[dx], xofs[dx + 2], xofs[dx + 4], xofs[dx + 6] };
                    v_int16x8 al = v_load(alpha+dx*2);
                    v_int16x8 ah = v_load(alpha+dx*2+8);
                    v_uint16x8 sl, sh;
                    v_expand(v_interleave_pairs(v_lut_quads(S, ofs)), sl, sh);
                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
                    v_store(&D[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
                }
            }
        }
        else if(cn == 3)
        {
            /* Peek at the last x offset to find the maximal s offset. We know the loop
               will terminate prior to value which may be 1 or more elements prior to the
               final valid offset. xofs[] is constucted to be an array of increasingly
               large offsets (i.e xofs[x] <= xofs[x+1] for x < xmax). */
            int smax = xofs[dmax-cn];

            for( ; k <= (count - 2); k+=2 )
            {
                const uchar *S0 = src[k];
                int *D0 = dst[k];
                const uchar *S1 = src[k+1];
                int *D1 = dst[k+1];

                // The loop bound is on the source offset (not dx) because the
                // unaligned 4-byte expand reads past the pixel; see smax above.
                for( dx = 0; (xofs[dx] + cn) < smax; dx += cn )
                {
                    v_int16x8 a = v_load(alpha+dx*2);
                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S0+xofs[dx]) | (v_load_expand_q(S0+xofs[dx]+cn)<<16)), a));
                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S1+xofs[dx]) | (v_load_expand_q(S1+xofs[dx]+cn)<<16)), a));
                }
            }
            for( ; k < count; k++ )
            {
                const uchar *S = src[k];
                int *D = dst[k];
                for( dx = 0; (xofs[dx] + cn) < smax; dx += cn )
                {
                    v_int16x8 a = v_load(alpha+dx*2);
                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S+xofs[dx]) | (v_load_expand_q(S+xofs[dx]+cn)<<16)), a));
                }
            }
            /* Debug check to ensure truthiness that we never vector the final value. */
            CV_DbgAssert(dx < dmax);
        }
        else if(cn == 4)
        {
            const int step = 4;
            const int len0 = xmax & -step;
            for( ; k <= (count - 2); k+=2 )
            {
                const uchar *S0 = src[k];
                int *D0 = dst[k];
                const uchar *S1 = src[k+1];
                int *D1 = dst[k+1];

                for( dx = 0; dx < len0; dx += step )
                {
                    // 8 contiguous bytes = one left pixel + one right pixel;
                    // interleave quads so channels line up for the dot product.
                    v_int16x8 a = v_load(alpha+dx*2);
                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(v_interleave_quads(v_load_expand(S0+xofs[dx]))), a));
                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(v_interleave_quads(v_load_expand(S1+xofs[dx]))), a));
                }
            }
            for( ; k < count; k++ )
            {
                const uchar *S = src[k];
                int *D = dst[k];
                for( dx = 0; dx < len0; dx += step )
                {
                    v_int16x8 a = v_load(alpha+dx*2);
                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(v_interleave_quads(v_load_expand(S+xofs[dx]))), a));
                }
            }
        }
        else
        {
            return 0; // images with channels >4 are out of optimization scope
        }
        return dx;
    }
};
1798
1799 typedef HResizeLinearVec_X4<float,float,float,v_float32x4> HResizeLinearVec_32f;
1800 typedef HResizeLinearVec_X4<ushort,float,float,v_float32x4> HResizeLinearVec_16u32f;
1801 typedef HResizeLinearVec_X4<short,float,float,v_float32x4> HResizeLinearVec_16s32f;
1802 typedef HResizeLinearVecU8_X4 HResizeLinearVec_8u32s;
1803
1804 #else
1805
1806 typedef HResizeNoVec HResizeLinearVec_8u32s;
1807 typedef HResizeNoVec HResizeLinearVec_16u32f;
1808 typedef HResizeNoVec HResizeLinearVec_16s32f;
1809 typedef HResizeNoVec HResizeLinearVec_32f;
1810
1811 #endif
1812
1813 typedef HResizeNoVec HResizeLinearVec_64f;
1814
1815
// Horizontal pass of linear (bilinear) resize. For each destination element
// x, blends the two source taps xofs[x] and xofs[x]+cn with the weight pair
// alpha[2x], alpha[2x+1]. Past xmax the right tap would fall outside the row,
// so the single left sample is replicated, scaled by ONE (the fixed-point
// unit for integer pipelines, 1 for float). A SIMD kernel (VecOp) handles a
// prefix of each row; this scalar code finishes from its reported position.
template<typename T, typename WT, typename AT, int ONE, class VecOp>
struct HResizeLinear
{
    typedef T value_type;
    typedef WT buf_type;
    typedef AT alpha_type;

    void operator()(const T** src, WT** dst, int count,
                    const int* xofs, const AT* alpha,
                    int swidth, int dwidth, int cn, int xmin, int xmax ) const
    {
        VecOp vecOp;

        // The vector kernel reports how many leading elements it produced.
        const int x0 = vecOp(src, dst, count,
            xofs, alpha, swidth, dwidth, cn, xmin, xmax );

        int row = 0;
        // Two rows at a time: the offsets and weights are shared.
        for( ; row + 1 < count; row += 2 )
        {
            const T *Sa = src[row], *Sb = src[row+1];
            WT *Da = dst[row], *Db = dst[row+1];
            int x;

            for( x = x0; x < xmax; x++ )
            {
                const int sx = xofs[x];
                const WT w0 = alpha[x*2], w1 = alpha[x*2+1];
                const WT ta = Sa[sx]*w0 + Sa[sx + cn]*w1;
                const WT tb = Sb[sx]*w0 + Sb[sx + cn]*w1;
                Da[x] = ta; Db[x] = tb;
            }

            // Tail: replicate the last valid sample (border handling).
            for( ; x < dwidth; x++ )
            {
                const int sx = xofs[x];
                Da[x] = WT(Sa[sx]*ONE);
                Db[x] = WT(Sb[sx]*ONE);
            }
        }

        // Leftover single row when count is odd.
        for( ; row < count; row++ )
        {
            const T* S = src[row];
            WT* D = dst[row];
            int x;

            for( x = x0; x < xmax; x++ )
            {
                const int sx = xofs[x];
                D[x] = S[sx]*alpha[x*2] + S[sx+cn]*alpha[x*2+1];
            }

            for( ; x < dwidth; x++ )
                D[x] = WT(S[xofs[x]]*ONE);
        }
    }
};
1868
1869
// Vertical pass of linear (bilinear) resize: blends two horizontally-resized
// rows with the weights beta[0] and beta[1], then casts each result back to
// the destination type via CastOp. A SIMD kernel (VecOp) covers a prefix of
// the row; the scalar loop finishes the tail.
template<typename T, typename WT, typename AT, class CastOp, class VecOp>
struct VResizeLinear
{
    typedef T value_type;
    typedef WT buf_type;
    typedef AT alpha_type;

    void operator()(const WT** src, T* dst, const AT* beta, int width ) const
    {
        CastOp castOp;
        VecOp vecOp;
        const WT w0 = beta[0], w1 = beta[1];
        const WT* row0 = src[0];
        const WT* row1 = src[1];

        int x = vecOp(src, dst, beta, width);
#if CV_ENABLE_UNROLLED
        // 4x manual unroll of the blend-and-cast loop.
        for( ; x <= width - 4; x += 4 )
        {
            dst[x]   = castOp(row0[x]*w0   + row1[x]*w1);
            dst[x+1] = castOp(row0[x+1]*w0 + row1[x+1]*w1);
            dst[x+2] = castOp(row0[x+2]*w0 + row1[x+2]*w1);
            dst[x+3] = castOp(row0[x+3]*w0 + row1[x+3]*w1);
        }
#endif
        for( ; x < width; x++ )
            dst[x] = castOp(row0[x]*w0 + row1[x]*w1);
    }
};
1901
// Vertical linear pass specialized for the 8-bit fixed-point pipeline
// (int intermediate rows, short coefficients). The expression
//     (((b * (S >> 4)) >> 16) + 2) >> 2
// evaluates (b * S) >> 22, i.e. >> (INTER_RESIZE_COEF_BITS*2), with rounding
// (+2 before the final >> 2) while keeping every intermediate product within
// 32 bits — shifting S down by 4 first prevents the multiply from overflowing.
template<>
struct VResizeLinear<uchar, int, short, FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>, VResizeLinearVec_32s8u>
{
    typedef uchar value_type;
    typedef int buf_type;
    typedef short alpha_type;

    void operator()(const buf_type** src, value_type* dst, const alpha_type* beta, int width ) const
    {
        alpha_type b0 = beta[0], b1 = beta[1];
        const buf_type *S0 = src[0], *S1 = src[1];
        VResizeLinearVec_32s8u vecOp;

        // SIMD kernel handles the prefix; scalar code finishes the tail.
        int x = vecOp(src, dst, beta, width);
#if CV_ENABLE_UNROLLED
        for( ; x <= width - 4; x += 4 )
        {
            dst[x+0] = uchar(( ((b0 * (S0[x+0] >> 4)) >> 16) + ((b1 * (S1[x+0] >> 4)) >> 16) + 2)>>2);
            dst[x+1] = uchar(( ((b0 * (S0[x+1] >> 4)) >> 16) + ((b1 * (S1[x+1] >> 4)) >> 16) + 2)>>2);
            dst[x+2] = uchar(( ((b0 * (S0[x+2] >> 4)) >> 16) + ((b1 * (S1[x+2] >> 4)) >> 16) + 2)>>2);
            dst[x+3] = uchar(( ((b0 * (S0[x+3] >> 4)) >> 16) + ((b1 * (S1[x+3] >> 4)) >> 16) + 2)>>2);
        }
#endif
        for( ; x < width; x++ )
            dst[x] = uchar(( ((b0 * (S0[x] >> 4)) >> 16) + ((b1 * (S1[x] >> 4)) >> 16) + 2)>>2);
    }
};
1929
1930
// Horizontal pass of bicubic resize: each destination element is a 4-tap
// weighted sum of source samples centered at xofs[x]. Rows split into a
// border region (taps clamped back into the row in channel-sized steps) and
// an interior region [xmin, xmax) where all four taps are guaranteed in
// range. alpha supplies 4 coefficients per destination element.
template<typename T, typename WT, typename AT>
struct HResizeCubic
{
    typedef T value_type;
    typedef WT buf_type;
    typedef AT alpha_type;

    void operator()(const T** src, WT** dst, int count,
                    const int* xofs, const AT* alpha,
                    int swidth, int dwidth, int cn, int xmin, int xmax ) const
    {
        for( int row = 0; row < count; row++ )
        {
            const T* S = src[row];
            WT* D = dst[row];
            const AT* coef = alpha;        // 4 coefficients per output element
            int x = 0, stop = xmin;
            for(;;)
            {
                // Border region: clamp out-of-range taps (border replicate).
                for( ; x < stop; x++, coef += 4 )
                {
                    int sx = xofs[x] - cn;
                    WT sum = 0;
                    for( int t = 0; t < 4; t++ )
                    {
                        int s = sx + t*cn;
                        if( (unsigned)s >= (unsigned)swidth )
                        {
                            while( s < 0 )
                                s += cn;
                            while( s >= swidth )
                                s -= cn;
                        }
                        sum += S[s]*coef[t];
                    }
                    D[x] = sum;
                }
                if( stop == dwidth )
                    break;
                // Interior region: all four taps are inside the row.
                for( ; x < xmax; x++, coef += 4 )
                {
                    int sx = xofs[x];
                    D[x] = S[sx-cn]*coef[0] + S[sx]*coef[1] +
                        S[sx+cn]*coef[2] + S[sx+cn*2]*coef[3];
                }
                stop = dwidth;             // second pass handles the right border
            }
        }
    }
};
1981
1982
// Vertical pass of bicubic resize: blends four horizontally-resized rows
// with beta[0..3] and casts the sums back to the destination type. The SIMD
// kernel (VecOp) covers a prefix; the scalar loop finishes the rest.
template<typename T, typename WT, typename AT, class CastOp, class VecOp>
struct VResizeCubic
{
    typedef T value_type;
    typedef WT buf_type;
    typedef AT alpha_type;

    void operator()(const WT** src, T* dst, const AT* beta, int width ) const
    {
        CastOp castOp;
        VecOp vecOp;
        const WT w0 = beta[0], w1 = beta[1], w2 = beta[2], w3 = beta[3];
        const WT *r0 = src[0], *r1 = src[1], *r2 = src[2], *r3 = src[3];

        int x = vecOp(src, dst, beta, width);
        for( ; x < width; x++ )
            dst[x] = castOp(r0[x]*w0 + r1[x]*w1 + r2[x]*w2 + r3[x]*w3);
    }
};
2002
2003
// Horizontal pass of Lanczos4 resize: each destination element is an 8-tap
// weighted sum of source samples centered at xofs[x] (taps from -3*cn to
// +4*cn). Like HResizeCubic, each row splits into a border region with
// clamped taps and the interior [xmin, xmax) where all taps are in range.
// alpha supplies 8 coefficients per destination element.
template<typename T, typename WT, typename AT>
struct HResizeLanczos4
{
    typedef T value_type;
    typedef WT buf_type;
    typedef AT alpha_type;

    void operator()(const T** src, WT** dst, int count,
                    const int* xofs, const AT* alpha,
                    int swidth, int dwidth, int cn, int xmin, int xmax ) const
    {
        for( int row = 0; row < count; row++ )
        {
            const T* S = src[row];
            WT* D = dst[row];
            const AT* coef = alpha;        // 8 coefficients per output element
            int x = 0, stop = xmin;
            for(;;)
            {
                // Border region: clamp out-of-range taps back into the row
                // in channel-sized steps (border replicate).
                for( ; x < stop; x++, coef += 8 )
                {
                    int sx = xofs[x] - cn*3;
                    WT sum = 0;
                    for( int t = 0; t < 8; t++ )
                    {
                        int s = sx + t*cn;
                        if( (unsigned)s >= (unsigned)swidth )
                        {
                            while( s < 0 )
                                s += cn;
                            while( s >= swidth )
                                s -= cn;
                        }
                        sum += S[s]*coef[t];
                    }
                    D[x] = sum;
                }
                if( stop == dwidth )
                    break;
                // Interior region: all eight taps are inside the row.
                for( ; x < xmax; x++, coef += 8 )
                {
                    int sx = xofs[x];
                    D[x] = S[sx-cn*3]*coef[0] + S[sx-cn*2]*coef[1] +
                        S[sx-cn]*coef[2] + S[sx]*coef[3] +
                        S[sx+cn]*coef[4] + S[sx+cn*2]*coef[5] +
                        S[sx+cn*3]*coef[6] + S[sx+cn*4]*coef[7];
                }
                stop = dwidth;             // second pass handles the right border
            }
        }
    }
};
2056
2057
// Vertical pass of Lanczos4 resize: blends eight horizontally-resized rows
// with beta[0..7] and casts the sums to the destination type. The SIMD
// kernel (VecOp) covers a prefix; scalar code finishes the tail.
template<typename T, typename WT, typename AT, class CastOp, class VecOp>
struct VResizeLanczos4
{
    typedef T value_type;
    typedef WT buf_type;
    typedef AT alpha_type;

    void operator()(const WT** src, T* dst, const AT* beta, int width ) const
    {
        CastOp castOp;
        VecOp vecOp;
        int x = vecOp(src, dst, beta, width);
#if CV_ENABLE_UNROLLED
        // 4 destination elements per iteration, accumulating across 8 rows.
        for( ; x <= width - 4; x += 4 )
        {
            WT w = beta[0];
            const WT* row = src[0];
            WT acc0 = row[x]*w, acc1 = row[x+1]*w, acc2 = row[x+2]*w, acc3 = row[x+3]*w;

            for( int t = 1; t < 8; t++ )
            {
                w = beta[t]; row = src[t];
                acc0 += row[x]*w;   acc1 += row[x+1]*w;
                acc2 += row[x+2]*w; acc3 += row[x+3]*w;
            }

            dst[x] = castOp(acc0);   dst[x+1] = castOp(acc1);
            dst[x+2] = castOp(acc2); dst[x+3] = castOp(acc3);
        }
#endif
        for( ; x < width; x++ )
        {
            dst[x] = castOp(src[0][x]*beta[0] + src[1][x]*beta[1] +
                src[2][x]*beta[2] + src[3][x]*beta[3] + src[4][x]*beta[4] +
                src[5][x]*beta[5] + src[6][x]*beta[6] + src[7][x]*beta[7]);
        }
    }
};
2096
2097
// Clamp x into the half-open range [a, b): values below a map to a,
// values at or above b map to b-1.
static inline int clip(int x, int a, int b)
{
    if( x < a )
        return a;
    return x < b ? x : b - 1;
}
2102
2103 static const int MAX_ESIZE=16;
2104
// Parallel body of the generic separable resize. For every destination row
// in its range it gathers the ksize source rows the vertical filter needs,
// horizontally resizes only the rows not already present in its per-call
// row cache, then blends the ksize buffered rows vertically into the output.
template <typename HResize, typename VResize>
class resizeGeneric_Invoker :
    public ParallelLoopBody
{
public:
    typedef typename HResize::value_type T;
    typedef typename HResize::buf_type WT;
    typedef typename HResize::alpha_type AT;

    resizeGeneric_Invoker(const Mat& _src, Mat &_dst, const int *_xofs, const int *_yofs,
        const AT* _alpha, const AT* __beta, const Size& _ssize, const Size &_dsize,
        int _ksize, int _xmin, int _xmax) :
        ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs), yofs(_yofs),
        alpha(_alpha), _beta(__beta), ssize(_ssize), dsize(_dsize),
        ksize(_ksize), xmin(_xmin), xmax(_xmax)
    {
        // The fixed-size row cache below can hold at most MAX_ESIZE rows.
        CV_Assert(ksize <= MAX_ESIZE);
    }

    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        int dy, cn = src.channels();
        HResize hresize;
        VResize vresize;

        // One scratch buffer row per vertical filter tap, padded to a
        // multiple of 16 elements.
        int bufstep = (int)alignSize(dsize.width, 16);
        AutoBuffer<WT> _buffer(bufstep*ksize);
        const T* srows[MAX_ESIZE]={0};
        WT* rows[MAX_ESIZE]={0};
        // Source row index currently cached in rows[k] (-1 = nothing yet).
        int prev_sy[MAX_ESIZE];

        for(int k = 0; k < ksize; k++ )
        {
            prev_sy[k] = -1;
            rows[k] = _buffer.data() + bufstep*k;
        }

        // beta stores ksize vertical coefficients per destination row.
        const AT* beta = _beta + ksize * range.start;

        for( dy = range.start; dy < range.end; dy++, beta += ksize )
        {
            int sy0 = yofs[dy], k0=ksize, k1=0, ksize2 = ksize/2;

            // Decide, for each of the ksize taps, whether the needed source
            // row was already resized on the previous iteration.
            for(int k = 0; k < ksize; k++ )
            {
                int sy = clip(sy0 - ksize2 + 1 + k, 0, ssize.height);
                for( k1 = std::max(k1, k); k1 < ksize; k1++ )
                {
                    if( k1 < MAX_ESIZE && sy == prev_sy[k1] ) // if the sy-th row has been computed already, reuse it.
                    {
                        if( k1 > k )
                            memcpy( rows[k], rows[k1], bufstep*sizeof(rows[0][0]) );
                        break;
                    }
                }
                if( k1 == ksize )
                    k0 = std::min(k0, k); // remember the first row that needs to be computed
                srows[k] = src.template ptr<T>(sy);
                prev_sy[k] = sy;
            }

            // Horizontally resize only the rows from k0 onward (the rest
            // were reused from the cache), then blend vertically.
            if( k0 < ksize )
                hresize( (const T**)(srows + k0), (WT**)(rows + k0), ksize - k0, xofs, (const AT*)(alpha),
                        ssize.width, dsize.width, cn, xmin, xmax );
            vresize( (const WT**)rows, (T*)(dst.data + dst.step*dy), beta, dsize.width );
        }
    }

private:
    Mat src;
    Mat dst;
    const int* xofs, *yofs;
    const AT* alpha, *_beta;
    Size ssize, dsize;
    const int ksize, xmin, xmax;

    // Non-copyable: declared but not defined.
    resizeGeneric_Invoker& operator = (const resizeGeneric_Invoker&);
};
2183
// Entry point for the generic separable resize: rescales the geometry to
// "flattened" (channel-interleaved) coordinates and runs the row-parallel
// invoker over all destination rows.
template<class HResize, class VResize>
static void resizeGeneric_( const Mat& src, Mat& dst,
                            const int* xofs, const void* _alpha,
                            const int* yofs, const void* _beta,
                            int xmin, int xmax, int ksize )
{
    typedef typename HResize::alpha_type AT;

    const AT* beta = (const AT*)_beta;
    Size ssize = src.size(), dsize = dst.size();
    int cn = src.channels();
    // Work in flattened coordinates: interleaved channels are treated as
    // extra columns, so the widths and x-limits are scaled by channel count.
    ssize.width *= cn;
    dsize.width *= cn;
    xmin *= cn;
    xmax *= cn;
    // Image resize is a separable operation: each destination row is built by
    // a horizontal pass (HResize) followed by a vertical pass (VResize),
    // which the invoker below performs row by row in parallel.

    Range range(0, dsize.height);
    resizeGeneric_Invoker<HResize, VResize> invoker(src, dst, xofs, yofs, (const AT*)_alpha, beta,
        ssize, dsize, ksize, xmin, xmax);
    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
}
2206
// Fallback "vector op" for the fast AREA-resize path: performs no work and
// reports 0 processed elements, so the caller runs its scalar loop for the
// whole row. Both constructor shapes exist to match the SIMD counterparts.
template <typename T, typename WT>
struct ResizeAreaFastNoVec
{
    ResizeAreaFastNoVec(int /*cn*/, int /*step*/) {}
    ResizeAreaFastNoVec(int /*scale_x*/, int /*scale_y*/, int /*cn*/, int /*step*/) {}
    int operator()(const T*, T*, int) const { return 0; }
};
2215
2216 #if CV_NEON
2217
// NEON kernel for the fast 2x2 AREA downscale of 8-bit data: each output is
// the average of a 2x2 source block, computed as (sum + 2) >> 2. Only cn==1
// and cn==4 are vectorized; returns the number of destination elements
// produced so scalar code can finish the tail.
class ResizeAreaFastVec_SIMD_8u
{
public:
    ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
        cn(_cn), step(_step)
    {
    }

    int operator() (const uchar* S, uchar* D, int w) const
    {
        int dx = 0;
        // S0/S1: the two consecutive source rows being averaged.
        const uchar* S0 = S, * S1 = S0 + step;

        uint16x8_t v_2 = vdupq_n_u16(2); // rounding term for the /4

        if (cn == 1)
        {
            for ( ; dx <= w - 16; dx += 16, S0 += 32, S1 += 32, D += 16)
            {
                // vld2q deinterleaves even/odd columns, i.e. the two
                // horizontal neighbours of each 2x2 block.
                uint8x16x2_t v_row0 = vld2q_u8(S0), v_row1 = vld2q_u8(S1);

                uint16x8_t v_dst0 = vaddl_u8(vget_low_u8(v_row0.val[0]), vget_low_u8(v_row0.val[1]));
                v_dst0 = vaddq_u16(v_dst0, vaddl_u8(vget_low_u8(v_row1.val[0]), vget_low_u8(v_row1.val[1])));
                v_dst0 = vshrq_n_u16(vaddq_u16(v_dst0, v_2), 2);

                uint16x8_t v_dst1 = vaddl_u8(vget_high_u8(v_row0.val[0]), vget_high_u8(v_row0.val[1]));
                v_dst1 = vaddq_u16(v_dst1, vaddl_u8(vget_high_u8(v_row1.val[0]), vget_high_u8(v_row1.val[1])));
                v_dst1 = vshrq_n_u16(vaddq_u16(v_dst1, v_2), 2);

                vst1q_u8(D, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1)));
            }
        }
        else if (cn == 4)
        {
            // For 4 channels, horizontally adjacent pixels are 4 bytes apart;
            // pairwise-add adjacent 4-lane groups instead of deinterleaving.
            for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
            {
                uint8x16_t v_row0 = vld1q_u8(S0), v_row1 = vld1q_u8(S1);

                uint16x8_t v_row00 = vmovl_u8(vget_low_u8(v_row0));
                uint16x8_t v_row01 = vmovl_u8(vget_high_u8(v_row0));
                uint16x8_t v_row10 = vmovl_u8(vget_low_u8(v_row1));
                uint16x8_t v_row11 = vmovl_u8(vget_high_u8(v_row1));

                uint16x4_t v_p0 = vadd_u16(vadd_u16(vget_low_u16(v_row00), vget_high_u16(v_row00)),
                                           vadd_u16(vget_low_u16(v_row10), vget_high_u16(v_row10)));
                uint16x4_t v_p1 = vadd_u16(vadd_u16(vget_low_u16(v_row01), vget_high_u16(v_row01)),
                                           vadd_u16(vget_low_u16(v_row11), vget_high_u16(v_row11)));
                uint16x8_t v_dst = vshrq_n_u16(vaddq_u16(vcombine_u16(v_p0, v_p1), v_2), 2);

                vst1_u8(D, vmovn_u16(v_dst));
            }
        }

        return dx;
    }

private:
    int cn, step;
};
2277
// NEON kernel for the fast 2x2 AREA downscale of 16-bit unsigned data:
// averages each 2x2 block with rounding ((sum + 2) >> 2), accumulating in
// 32 bits. Only cn==1 and cn==4 are vectorized; returns the count of
// destination elements produced.
class ResizeAreaFastVec_SIMD_16u
{
public:
    ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
        cn(_cn), step(_step)
    {
    }

    int operator() (const ushort * S, ushort * D, int w) const
    {
        int dx = 0;
        // step is in bytes, hence the uchar* detour for the second row.
        const ushort * S0 = S, * S1 = (const ushort *)((const uchar *)(S0) + step);

        uint32x4_t v_2 = vdupq_n_u32(2); // rounding term for the /4

        if (cn == 1)
        {
            for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
            {
                // vld2q deinterleaves even/odd columns (horizontal pairs).
                uint16x8x2_t v_row0 = vld2q_u16(S0), v_row1 = vld2q_u16(S1);

                uint32x4_t v_dst0 = vaddl_u16(vget_low_u16(v_row0.val[0]), vget_low_u16(v_row0.val[1]));
                v_dst0 = vaddq_u32(v_dst0, vaddl_u16(vget_low_u16(v_row1.val[0]), vget_low_u16(v_row1.val[1])));
                v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_2), 2);

                uint32x4_t v_dst1 = vaddl_u16(vget_high_u16(v_row0.val[0]), vget_high_u16(v_row0.val[1]));
                v_dst1 = vaddq_u32(v_dst1, vaddl_u16(vget_high_u16(v_row1.val[0]), vget_high_u16(v_row1.val[1])));
                v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_2), 2);

                vst1q_u16(D, vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1)));
            }
        }
        else if (cn == 4)
        {
            // Horizontally adjacent 4-channel pixels occupy adjacent
            // 4-lane halves of one 8-lane register.
            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
            {
                uint16x8_t v_row0 = vld1q_u16(S0), v_row1 = vld1q_u16(S1);
                uint32x4_t v_dst = vaddq_u32(vaddl_u16(vget_low_u16(v_row0), vget_high_u16(v_row0)),
                                             vaddl_u16(vget_low_u16(v_row1), vget_high_u16(v_row1)));
                vst1_u16(D, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_2), 2)));
            }
        }

        return dx;
    }

private:
    int cn, step;
};
2327
// NEON kernel for the fast 2x2 AREA downscale of 16-bit signed data:
// same structure as the 16u variant but with signed widening/shift
// intrinsics. Only cn==1 and cn==4 are vectorized; returns the count of
// destination elements produced.
class ResizeAreaFastVec_SIMD_16s
{
public:
    ResizeAreaFastVec_SIMD_16s(int _cn, int _step) :
        cn(_cn), step(_step)
    {
    }

    int operator() (const short * S, short * D, int w) const
    {
        int dx = 0;
        // step is in bytes, hence the uchar* detour for the second row.
        const short * S0 = S, * S1 = (const short *)((const uchar *)(S0) + step);

        int32x4_t v_2 = vdupq_n_s32(2); // rounding term for the /4

        if (cn == 1)
        {
            for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
            {
                // vld2q deinterleaves even/odd columns (horizontal pairs).
                int16x8x2_t v_row0 = vld2q_s16(S0), v_row1 = vld2q_s16(S1);

                int32x4_t v_dst0 = vaddl_s16(vget_low_s16(v_row0.val[0]), vget_low_s16(v_row0.val[1]));
                v_dst0 = vaddq_s32(v_dst0, vaddl_s16(vget_low_s16(v_row1.val[0]), vget_low_s16(v_row1.val[1])));
                v_dst0 = vshrq_n_s32(vaddq_s32(v_dst0, v_2), 2);

                int32x4_t v_dst1 = vaddl_s16(vget_high_s16(v_row0.val[0]), vget_high_s16(v_row0.val[1]));
                v_dst1 = vaddq_s32(v_dst1, vaddl_s16(vget_high_s16(v_row1.val[0]), vget_high_s16(v_row1.val[1])));
                v_dst1 = vshrq_n_s32(vaddq_s32(v_dst1, v_2), 2);

                vst1q_s16(D, vcombine_s16(vmovn_s32(v_dst0), vmovn_s32(v_dst1)));
            }
        }
        else if (cn == 4)
        {
            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
            {
                int16x8_t v_row0 = vld1q_s16(S0), v_row1 = vld1q_s16(S1);
                int32x4_t v_dst = vaddq_s32(vaddl_s16(vget_low_s16(v_row0), vget_high_s16(v_row0)),
                                            vaddl_s16(vget_low_s16(v_row1), vget_high_s16(v_row1)));
                vst1_s16(D, vmovn_s32(vshrq_n_s32(vaddq_s32(v_dst, v_2), 2)));
            }
        }

        return dx;
    }

private:
    int cn, step;
};
2377
// NEON kernel for the fast AREA downscale of float data. Unlike the integer
// variants, only the exact 2x2 case is supported (checked once in the
// constructor); otherwise operator() declines by returning 0 so the scalar
// path handles the row. Averages each 2x2 block by multiplying the sum
// with 0.25.
struct ResizeAreaFastVec_SIMD_32f
{
    ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
        cn(_cn), step(_step)
    {
        // Vector path only for a 2x2 downscale of 1- or 4-channel data.
        fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
    }

    int operator() (const float * S, float * D, int w) const
    {
        if (!fast_mode)
            return 0;

        // step is in bytes, hence the uchar* detour for the second row.
        const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step);
        int dx = 0;

        float32x4_t v_025 = vdupq_n_f32(0.25f); // 1/4 for the 2x2 average

        if (cn == 1)
        {
            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
            {
                // vld2q deinterleaves even/odd columns (horizontal pairs).
                float32x4x2_t v_row0 = vld2q_f32(S0), v_row1 = vld2q_f32(S1);

                float32x4_t v_dst0 = vaddq_f32(v_row0.val[0], v_row0.val[1]);
                float32x4_t v_dst1 = vaddq_f32(v_row1.val[0], v_row1.val[1]);

                vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025));
            }
        }
        else if (cn == 4)
        {
            // Adjacent 4-channel pixels are 4 floats apart.
            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
            {
                float32x4_t v_dst0 = vaddq_f32(vld1q_f32(S0), vld1q_f32(S0 + 4));
                float32x4_t v_dst1 = vaddq_f32(vld1q_f32(S1), vld1q_f32(S1 + 4));

                vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025));
            }
        }

        return dx;
    }

private:
    int cn;
    bool fast_mode;
    int step;
};
2427
2428 #elif CV_SIMD
2429
// Universal-intrinsics kernel for the fast 2x2 AREA downscale of 8-bit data
// (cn == 1, 3 or 4): each output is the rounded average of a 2x2 source
// block ((sum + 2) >> 2, via v_rshr_pack<2>). Width-generic across SIMD128/
// 256/512 except the cn==3 shuffle network, which is specialized per
// CV_SIMD_WIDTH. Returns the number of destination elements produced.
class ResizeAreaFastVec_SIMD_8u
{
public:
    ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
        cn(_cn), step(_step) {}

    int operator() (const uchar* S, uchar* D, int w) const
    {
        int dx = 0;
        // S0/S1: the two consecutive source rows being averaged.
        const uchar* S0 = S;
        const uchar* S1 = S0 + step;

        if (cn == 1)
        {
            v_uint16 masklow = vx_setall_u16(0x00ff);
            for ( ; dx <= w - v_uint16::nlanes; dx += v_uint16::nlanes, S0 += v_uint8::nlanes, S1 += v_uint8::nlanes, D += v_uint16::nlanes)
            {
                // Reinterpret byte pairs as u16 lanes: the low byte (mask)
                // and high byte (shift) are the two horizontal neighbours.
                v_uint16 r0 = v_reinterpret_as_u16(vx_load(S0));
                v_uint16 r1 = v_reinterpret_as_u16(vx_load(S1));
                v_rshr_pack_store<2>(D, (r0 >> 8) + (r0 & masklow) + (r1 >> 8) + (r1 & masklow));
            }
        }
        else if (cn == 3)
        {
            // The zip network below is only written for widths up to 512 bit.
            if (CV_SIMD_WIDTH > 64)
                return 0;
            for ( ; dx <= w - 3*v_uint8::nlanes; dx += 3*v_uint8::nlanes, S0 += 6*v_uint8::nlanes, S1 += 6*v_uint8::nlanes, D += 3*v_uint8::nlanes)
            {
                // Sum the two rows in u16, then transpose the interleaved
                // BGR layout with log2(width)-many zip stages so each of
                // b/g/r holds the horizontal-pair sums for one channel.
                v_uint16 t0, t1, t2, t3, t4, t5;
                v_uint16 s0, s1, s2, s3, s4, s5;
                s0 = vx_load_expand(S0                     ) + vx_load_expand(S1                     );
                s1 = vx_load_expand(S0 +   v_uint16::nlanes) + vx_load_expand(S1 +   v_uint16::nlanes);
                s2 = vx_load_expand(S0 + 2*v_uint16::nlanes) + vx_load_expand(S1 + 2*v_uint16::nlanes);
                s3 = vx_load_expand(S0 + 3*v_uint16::nlanes) + vx_load_expand(S1 + 3*v_uint16::nlanes);
                s4 = vx_load_expand(S0 + 4*v_uint16::nlanes) + vx_load_expand(S1 + 4*v_uint16::nlanes);
                s5 = vx_load_expand(S0 + 5*v_uint16::nlanes) + vx_load_expand(S1 + 5*v_uint16::nlanes);
                v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
                v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
                v_uint16 bl, gl, rl;
#if CV_SIMD_WIDTH == 16
                bl = t0 + t3; gl = t1 + t4; rl = t2 + t5;
#elif CV_SIMD_WIDTH == 32
                v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                bl = s0 + s3; gl = s1 + s4; rl = s2 + s5;
#elif CV_SIMD_WIDTH == 64
                v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
                bl = t0 + t3; gl = t1 + t4; rl = t2 + t5;
#endif
                // Second half of the output: the same network on the next
                // 6 half-registers of source data.
                s0 = vx_load_expand(S0 + 6*v_uint16::nlanes) + vx_load_expand(S1 + 6*v_uint16::nlanes);
                s1 = vx_load_expand(S0 + 7*v_uint16::nlanes) + vx_load_expand(S1 + 7*v_uint16::nlanes);
                s2 = vx_load_expand(S0 + 8*v_uint16::nlanes) + vx_load_expand(S1 + 8*v_uint16::nlanes);
                s3 = vx_load_expand(S0 + 9*v_uint16::nlanes) + vx_load_expand(S1 + 9*v_uint16::nlanes);
                s4 = vx_load_expand(S0 +10*v_uint16::nlanes) + vx_load_expand(S1 +10*v_uint16::nlanes);
                s5 = vx_load_expand(S0 +11*v_uint16::nlanes) + vx_load_expand(S1 +11*v_uint16::nlanes);
                v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
                v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
                v_uint16 bh, gh, rh;
#if CV_SIMD_WIDTH == 16
                bh = t0 + t3; gh = t1 + t4; rh = t2 + t5;
#elif CV_SIMD_WIDTH == 32
                v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                bh = s0 + s3; gh = s1 + s4; rh = s2 + s5;
#elif CV_SIMD_WIDTH == 64
                v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
                bh = t0 + t3; gh = t1 + t4; rh = t2 + t5;
#endif
                v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh));
            }
        }
        else
        {
            CV_Assert(cn == 4);
            for ( ; dx <= w - v_uint8::nlanes; dx += v_uint8::nlanes, S0 += 2*v_uint8::nlanes, S1 += 2*v_uint8::nlanes, D += v_uint8::nlanes)
            {
                // Deinterleave u32 lanes so r*0/r*1 hold the left and right
                // 4-channel pixels of each horizontal pair.
                v_uint32 r00, r01, r10, r11;
                v_load_deinterleave((uint32_t*)S0, r00, r01);
                v_load_deinterleave((uint32_t*)S1, r10, r11);

                v_uint16 r00l, r01l, r10l, r11l, r00h, r01h, r10h, r11h;
                v_expand(v_reinterpret_as_u8(r00), r00l, r00h);
                v_expand(v_reinterpret_as_u8(r01), r01l, r01h);
                v_expand(v_reinterpret_as_u8(r10), r10l, r10h);
                v_expand(v_reinterpret_as_u8(r11), r11l, r11h);
                v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h));
            }
        }

        return dx;
    }

private:
    int cn;
    int step;
};
2528
2529 class ResizeAreaFastVec_SIMD_16u
2530 {
2531 public:
ResizeAreaFastVec_SIMD_16u(int _cn,int _step)2532 ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
2533 cn(_cn), step(_step) {}
2534
operator ()(const ushort * S,ushort * D,int w) const2535 int operator() (const ushort* S, ushort* D, int w) const
2536 {
2537 int dx = 0;
2538 const ushort* S0 = (const ushort*)S;
2539 const ushort* S1 = (const ushort*)((const uchar*)(S) + step);
2540
2541 if (cn == 1)
2542 {
2543 v_uint32 masklow = vx_setall_u32(0x0000ffff);
2544 for (; dx <= w - v_uint32::nlanes; dx += v_uint32::nlanes, S0 += v_uint16::nlanes, S1 += v_uint16::nlanes, D += v_uint32::nlanes)
2545 {
2546 v_uint32 r0 = v_reinterpret_as_u32(vx_load(S0));
2547 v_uint32 r1 = v_reinterpret_as_u32(vx_load(S1));
2548 v_rshr_pack_store<2>(D, (r0 >> 16) + (r0 & masklow) + (r1 >> 16) + (r1 & masklow));
2549 }
2550 }
2551 else if (cn == 3)
2552 {
2553 #if CV_SIMD_WIDTH == 16
2554 for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
2555 #if CV_SSE4_1
2556 {
2557 v_uint32 r0, r1, r2, r3;
2558 v_expand(vx_load(S0), r0, r1);
2559 v_expand(vx_load(S1), r2, r3);
2560 r0 += r2; r1 += r3;
2561 v_rshr_pack_store<2>(D, r0 + v_rotate_left<1>(r1, r0));
2562 }
2563 #else
2564 v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3));
2565 #endif
2566 #elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64
2567 for ( ; dx <= w - 3*v_uint16::nlanes; dx += 3*v_uint16::nlanes, S0 += 6*v_uint16::nlanes, S1 += 6*v_uint16::nlanes, D += 3*v_uint16::nlanes)
2568 {
2569 v_uint32 t0, t1, t2, t3, t4, t5;
2570 v_uint32 s0, s1, s2, s3, s4, s5;
2571 s0 = vx_load_expand(S0 ) + vx_load_expand(S1 );
2572 s1 = vx_load_expand(S0 + v_uint32::nlanes) + vx_load_expand(S1 + v_uint32::nlanes);
2573 s2 = vx_load_expand(S0 + 2*v_uint32::nlanes) + vx_load_expand(S1 + 2*v_uint32::nlanes);
2574 s3 = vx_load_expand(S0 + 3*v_uint32::nlanes) + vx_load_expand(S1 + 3*v_uint32::nlanes);
2575 s4 = vx_load_expand(S0 + 4*v_uint32::nlanes) + vx_load_expand(S1 + 4*v_uint32::nlanes);
2576 s5 = vx_load_expand(S0 + 5*v_uint32::nlanes) + vx_load_expand(S1 + 5*v_uint32::nlanes);
2577 v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
2578 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
2579 v_uint32 bl, gl, rl;
2580 v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
2581 #if CV_SIMD_WIDTH == 32
2582 bl = t0 + t3; gl = t1 + t4; rl = t2 + t5;
2583 #else //CV_SIMD_WIDTH == 64
2584 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
2585 bl = s0 + s3; gl = s1 + s4; rl = s2 + s5;
2586 #endif
2587 s0 = vx_load_expand(S0 + 6*v_uint32::nlanes) + vx_load_expand(S1 + 6*v_uint32::nlanes);
2588 s1 = vx_load_expand(S0 + 7*v_uint32::nlanes) + vx_load_expand(S1 + 7*v_uint32::nlanes);
2589 s2 = vx_load_expand(S0 + 8*v_uint32::nlanes) + vx_load_expand(S1 + 8*v_uint32::nlanes);
2590 s3 = vx_load_expand(S0 + 9*v_uint32::nlanes) + vx_load_expand(S1 + 9*v_uint32::nlanes);
2591 s4 = vx_load_expand(S0 +10*v_uint32::nlanes) + vx_load_expand(S1 +10*v_uint32::nlanes);
2592 s5 = vx_load_expand(S0 +11*v_uint32::nlanes) + vx_load_expand(S1 +11*v_uint32::nlanes);
2593 v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
2594 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
2595 v_uint32 bh, gh, rh;
2596 v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
2597 #if CV_SIMD_WIDTH == 32
2598 bh = t0 + t3; gh = t1 + t4; rh = t2 + t5;
2599 #else //CV_SIMD_WIDTH == 64
2600 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
2601 bh = s0 + s3; gh = s1 + s4; rh = s2 + s5;
2602 #endif
2603 v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh));
2604 }
2605 #elif CV_SIMD_WIDTH >= 64
2606 v_uint32 masklow = vx_setall_u32(0x0000ffff);
2607 for ( ; dx <= w - 3*v_uint16::nlanes; dx += 3*v_uint16::nlanes, S0 += 6*v_uint16::nlanes, S1 += 6*v_uint16::nlanes, D += 3*v_uint16::nlanes)
2608 {
2609 v_uint16 b0, g0, r0, b1, g1, r1;
2610 v_load_deinterleave(S0, b0, g0, r0);
2611 v_load_deinterleave(S1, b1, g1, r1);
2612 v_uint32 bl = (v_reinterpret_as_u32(b0) >> 16) + (v_reinterpret_as_u32(b0) & masklow) + (v_reinterpret_as_u32(b1) >> 16) + (v_reinterpret_as_u32(b1) & masklow);
2613 v_uint32 gl = (v_reinterpret_as_u32(g0) >> 16) + (v_reinterpret_as_u32(g0) & masklow) + (v_reinterpret_as_u32(g1) >> 16) + (v_reinterpret_as_u32(g1) & masklow);
2614 v_uint32 rl = (v_reinterpret_as_u32(r0) >> 16) + (v_reinterpret_as_u32(r0) & masklow) + (v_reinterpret_as_u32(r1) >> 16) + (v_reinterpret_as_u32(r1) & masklow);
2615 v_load_deinterleave(S0 + 3*v_uint16::nlanes, b0, g0, r0);
2616 v_load_deinterleave(S1 + 3*v_uint16::nlanes, b1, g1, r1);
2617 v_uint32 bh = (v_reinterpret_as_u32(b0) >> 16) + (v_reinterpret_as_u32(b0) & masklow) + (v_reinterpret_as_u32(b1) >> 16) + (v_reinterpret_as_u32(b1) & masklow);
2618 v_uint32 gh = (v_reinterpret_as_u32(g0) >> 16) + (v_reinterpret_as_u32(g0) & masklow) + (v_reinterpret_as_u32(g1) >> 16) + (v_reinterpret_as_u32(g1) & masklow);
2619 v_uint32 rh = (v_reinterpret_as_u32(r0) >> 16) + (v_reinterpret_as_u32(r0) & masklow) + (v_reinterpret_as_u32(r1) >> 16) + (v_reinterpret_as_u32(r1) & masklow);
2620 v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh));
2621 }
2622 #endif
2623 }
2624 else
2625 {
2626 CV_Assert(cn == 4);
2627 #if CV_SIMD_WIDTH >= 64
2628 for ( ; dx <= w - v_uint16::nlanes; dx += v_uint16::nlanes, S0 += 2*v_uint16::nlanes, S1 += 2*v_uint16::nlanes, D += v_uint16::nlanes)
2629 {
2630 v_uint64 r00, r01, r10, r11;
2631 v_load_deinterleave((uint64_t*)S0, r00, r01);
2632 v_load_deinterleave((uint64_t*)S1, r10, r11);
2633
2634 v_uint32 r00l, r01l, r10l, r11l, r00h, r01h, r10h, r11h;
2635 v_expand(v_reinterpret_as_u16(r00), r00l, r00h);
2636 v_expand(v_reinterpret_as_u16(r01), r01l, r01h);
2637 v_expand(v_reinterpret_as_u16(r10), r10l, r10h);
2638 v_expand(v_reinterpret_as_u16(r11), r11l, r11h);
2639 v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h));
2640 }
2641 #else
2642 for ( ; dx <= w - v_uint32::nlanes; dx += v_uint32::nlanes, S0 += v_uint16::nlanes, S1 += v_uint16::nlanes, D += v_uint32::nlanes)
2643 {
2644 v_uint32 r0, r1, r2, r3;
2645 v_expand(vx_load(S0), r0, r1);
2646 v_expand(vx_load(S1), r2, r3);
2647 r0 += r2; r1 += r3;
2648 v_uint32 v_d;
2649 #if CV_SIMD_WIDTH == 16
2650 v_d = r0 + r1;
2651 #elif CV_SIMD_WIDTH == 32
2652 v_uint32 t0, t1;
2653 v_recombine(r0, r1, t0, t1);
2654 v_d = t0 + t1;
2655 #endif
2656 v_rshr_pack_store<2>(D, v_d);
2657 }
2658 #endif
2659 }
2660
2661 return dx;
2662 }
2663
2664 private:
2665 int cn;
2666 int step;
2667 };
2668
// Vectorized 2x2 INTER_AREA ("fast" path) for 16-bit signed data.
// Each destination element is the rounded average of a 2x2 block of source
// elements: D = (a + b + c + d + 2) >> 2, computed in 32-bit lanes and packed
// back with v_rshr_pack*<2>. operator() returns how many destination elements
// were produced so the scalar tail loop (ResizeAreaFastVec) can finish the row.
class ResizeAreaFastVec_SIMD_16s
{
public:
    ResizeAreaFastVec_SIMD_16s(int _cn, int _step) :
        cn(_cn), step(_step) {}

    // S: first source row; D: destination row; w: destination row width in
    // elements (width * cn). step is the source row stride in BYTES.
    int operator() (const short* S, short* D, int w) const
    {
        int dx = 0;
        const short* S0 = (const short*)S;
        // second source row of the 2x2 block
        const short* S1 = (const short*)((const uchar*)(S) + step);

        if (cn == 1)
        {
            v_int32 masklow = vx_setall_s32(0x0000ffff);
            for (; dx <= w - v_int32::nlanes; dx += v_int32::nlanes, S0 += v_int16::nlanes, S1 += v_int16::nlanes, D += v_int32::nlanes)
            {
                // Each 32-bit lane holds two horizontally adjacent s16 values;
                // arithmetic >>16 extracts the high one sign-extended, and
                // (<<16)>>16 sign-extends the low one.
                v_int32 r0 = v_reinterpret_as_s32(vx_load(S0));
                v_int32 r1 = v_reinterpret_as_s32(vx_load(S1));
                v_rshr_pack_store<2>(D, (r0 >> 16) + (((r0 & masklow)<<16)>>16) + (r1 >> 16) + (((r1 & masklow)<<16)>>16));
            }
        }
        else if (cn == 3)
        {
#if CV_SIMD_WIDTH == 16
            // One 3-channel pixel per iteration; the 4th expanded lane is
            // garbage, hence the (w - 4) bound so the store stays in range.
            for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
                v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3));
#elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64
            for ( ; dx <= w - 3*v_int16::nlanes; dx += 3*v_int16::nlanes, S0 += 6*v_int16::nlanes, S1 += 6*v_int16::nlanes, D += 3*v_int16::nlanes)
            {
                v_int32 t0, t1, t2, t3, t4, t5;
                v_int32 s0, s1, s2, s3, s4, s5;
                // Vertical sums of the two source rows, low half of the output.
                s0 = vx_load_expand(S0 ) + vx_load_expand(S1 );
                s1 = vx_load_expand(S0 + v_int32::nlanes) + vx_load_expand(S1 + v_int32::nlanes);
                s2 = vx_load_expand(S0 + 2*v_int32::nlanes) + vx_load_expand(S1 + 2*v_int32::nlanes);
                s3 = vx_load_expand(S0 + 3*v_int32::nlanes) + vx_load_expand(S1 + 3*v_int32::nlanes);
                s4 = vx_load_expand(S0 + 4*v_int32::nlanes) + vx_load_expand(S1 + 4*v_int32::nlanes);
                s5 = vx_load_expand(S0 + 5*v_int32::nlanes) + vx_load_expand(S1 + 5*v_int32::nlanes);
                // Repeated zips implement an in-register transpose so that
                // same-channel values of horizontally adjacent pixels end up
                // in vectors that can simply be added.
                v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
                v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                v_int32 bl, gl, rl;
                v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
#if CV_SIMD_WIDTH == 32
                bl = t0 + t3; gl = t1 + t4; rl = t2 + t5;
#else //CV_SIMD_WIDTH == 64
                // Wider registers need one extra zip round to finish the transpose.
                v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                bl = s0 + s3; gl = s1 + s4; rl = s2 + s5;
#endif
                // Same procedure for the high half of the output.
                s0 = vx_load_expand(S0 + 6*v_int32::nlanes) + vx_load_expand(S1 + 6*v_int32::nlanes);
                s1 = vx_load_expand(S0 + 7*v_int32::nlanes) + vx_load_expand(S1 + 7*v_int32::nlanes);
                s2 = vx_load_expand(S0 + 8*v_int32::nlanes) + vx_load_expand(S1 + 8*v_int32::nlanes);
                s3 = vx_load_expand(S0 + 9*v_int32::nlanes) + vx_load_expand(S1 + 9*v_int32::nlanes);
                s4 = vx_load_expand(S0 +10*v_int32::nlanes) + vx_load_expand(S1 +10*v_int32::nlanes);
                s5 = vx_load_expand(S0 +11*v_int32::nlanes) + vx_load_expand(S1 +11*v_int32::nlanes);
                v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
                v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                v_int32 bh, gh, rh;
                v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
#if CV_SIMD_WIDTH == 32
                bh = t0 + t3; gh = t1 + t4; rh = t2 + t5;
#else //CV_SIMD_WIDTH == 64
                v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                bh = s0 + s3; gh = s1 + s4; rh = s2 + s5;
#endif
                v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh));
            }
#elif CV_SIMD_WIDTH >= 64
            // Very wide registers: deinterleave channels up front, then fold
            // horizontal pairs via the 32-bit shift/sign-extension trick.
            for ( ; dx <= w - 3*v_int16::nlanes; dx += 3*v_int16::nlanes, S0 += 6*v_int16::nlanes, S1 += 6*v_int16::nlanes, D += 3*v_int16::nlanes)
            {
                v_int16 b0, g0, r0, b1, g1, r1;
                v_load_deinterleave(S0, b0, g0, r0);
                v_load_deinterleave(S1, b1, g1, r1);
                v_int32 bl = (v_reinterpret_as_s32(b0) >> 16) + ((v_reinterpret_as_s32(b0) << 16) >> 16) + (v_reinterpret_as_s32(b1) >> 16) + ((v_reinterpret_as_s32(b1) << 16) >> 16);
                v_int32 gl = (v_reinterpret_as_s32(g0) >> 16) + ((v_reinterpret_as_s32(g0) << 16) >> 16) + (v_reinterpret_as_s32(g1) >> 16) + ((v_reinterpret_as_s32(g1) << 16) >> 16);
                v_int32 rl = (v_reinterpret_as_s32(r0) >> 16) + ((v_reinterpret_as_s32(r0) << 16) >> 16) + (v_reinterpret_as_s32(r1) >> 16) + ((v_reinterpret_as_s32(r1) << 16) >> 16);
                v_load_deinterleave(S0 + 3*v_int16::nlanes, b0, g0, r0);
                v_load_deinterleave(S1 + 3*v_int16::nlanes, b1, g1, r1);
                v_int32 bh = (v_reinterpret_as_s32(b0) >> 16) + ((v_reinterpret_as_s32(b0) << 16) >> 16) + (v_reinterpret_as_s32(b1) >> 16) + ((v_reinterpret_as_s32(b1) << 16) >> 16);
                v_int32 gh = (v_reinterpret_as_s32(g0) >> 16) + ((v_reinterpret_as_s32(g0) << 16) >> 16) + (v_reinterpret_as_s32(g1) >> 16) + ((v_reinterpret_as_s32(g1) << 16) >> 16);
                v_int32 rh = (v_reinterpret_as_s32(r0) >> 16) + ((v_reinterpret_as_s32(r0) << 16) >> 16) + (v_reinterpret_as_s32(r1) >> 16) + ((v_reinterpret_as_s32(r1) << 16) >> 16);
                v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh));
            }
#endif
        }
        else
        {
            CV_Assert(cn == 4);
            for (; dx <= w - v_int16::nlanes; dx += v_int16::nlanes, S0 += 2 * v_int16::nlanes, S1 += 2 * v_int16::nlanes, D += v_int16::nlanes)
            {
#if CV_SIMD_WIDTH >= 64
                // A whole 4-channel pixel fits into one 64-bit lane; pair
                // deinterleave separates even/odd pixels directly.
                v_int64 r00, r01, r10, r11;
                v_load_deinterleave((int64_t*)S0, r00, r01);
                v_load_deinterleave((int64_t*)S1, r10, r11);

                v_int32 r00l, r01l, r10l, r11l, r00h, r01h, r10h, r11h;
                v_expand(v_reinterpret_as_s16(r00), r00l, r00h);
                v_expand(v_reinterpret_as_s16(r01), r01l, r01h);
                v_expand(v_reinterpret_as_s16(r10), r10l, r10h);
                v_expand(v_reinterpret_as_s16(r11), r11l, r11h);
                v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h));
#else
                // Vertical sums first; horizontally adjacent 4-element pixels
                // are then combined per register width.
                v_int32 r0, r1, r2, r3;
                r0 = vx_load_expand(S0 ) + vx_load_expand(S1 );
                r1 = vx_load_expand(S0 + v_int32::nlanes) + vx_load_expand(S1 + v_int32::nlanes);
                r2 = vx_load_expand(S0 + 2*v_int32::nlanes) + vx_load_expand(S1 + 2*v_int32::nlanes);
                r3 = vx_load_expand(S0 + 3*v_int32::nlanes) + vx_load_expand(S1 + 3*v_int32::nlanes);
                v_int32 dl, dh;
#if CV_SIMD_WIDTH == 16
                dl = r0 + r1; dh = r2 + r3;
#elif CV_SIMD_WIDTH == 32
                v_int32 t0, t1, t2, t3;
                v_recombine(r0, r1, t0, t1); v_recombine(r2, r3, t2, t3);
                dl = t0 + t1; dh = t2 + t3;
#endif
                v_store(D, v_rshr_pack<2>(dl, dh));
#endif
            }
        }

        return dx;
    }

private:
    int cn;   // number of interleaved channels (1, 3 or 4)
    int step; // source row stride in bytes
};
2795
// Vectorized 2x2 INTER_AREA ("fast" path) for 32-bit float data.
// Only 2x2 down-scaling of 1- or 4-channel images is vectorized; every
// destination element is the mean of a 2x2 source block (sum * 0.25).
// operator() returns the number of destination elements produced so the
// scalar fallback can finish the rest of the row.
struct ResizeAreaFastVec_SIMD_32f
{
    ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
        cn(_cn), step(_step)
    {
        // note: unlike the integer variants, cn == 3 is NOT handled here
        fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
    }

    // S: first source row; D: destination row; w: destination width in
    // elements (width * cn). step is the source row stride in BYTES.
    int operator() (const float * S, float * D, int w) const
    {
        if (!fast_mode)
            return 0;

        const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step);
        int dx = 0;

        if (cn == 1)
        {
            v_float32 v_025 = vx_setall_f32(0.25f);
            for ( ; dx <= w - v_float32::nlanes; dx += v_float32::nlanes, S0 += 2*v_float32::nlanes, S1 += 2*v_float32::nlanes, D += v_float32::nlanes)
            {
                // deinterleave separates even/odd columns of each row;
                // summing all four vectors gives the 2x2 block sums
                v_float32 v_row00, v_row01, v_row10, v_row11;
                v_load_deinterleave(S0, v_row00, v_row01);
                v_load_deinterleave(S1, v_row10, v_row11);
                v_store(D, ((v_row00 + v_row01) + (v_row10 + v_row11)) * v_025);
            }
        }
        else if (cn == 4)
        {
#if CV_SIMD_WIDTH == 16
            // One 128-bit register = exactly one 4-channel pixel, so adjacent
            // loads are already channel-aligned.
            v_float32 v_025 = vx_setall_f32(0.25f);
            for (; dx <= w - v_float32::nlanes; dx += v_float32::nlanes, S0 += 2*v_float32::nlanes, S1 += 2*v_float32::nlanes, D += v_float32::nlanes)
                v_store(D, ((vx_load(S0) + vx_load(S0 + v_float32::nlanes)) + (vx_load(S1) + vx_load(S1 + v_float32::nlanes))) * v_025);
#elif CV_SIMD256
            // 256-bit registers hold two pixels; v_recombine re-pairs the
            // halves so corresponding pixels line up before the final add.
            v_float32x8 v_025 = v256_setall_f32(0.25f);
            for (; dx <= w - v_float32x8::nlanes; dx += v_float32x8::nlanes, S0 += 2*v_float32x8::nlanes, S1 += 2*v_float32x8::nlanes, D += v_float32x8::nlanes)
            {
                v_float32x8 dst0, dst1;
                v_recombine(v256_load(S0) + v256_load(S1), v256_load(S0 + v_float32x8::nlanes) + v256_load(S1 + v_float32x8::nlanes), dst0, dst1);
                v_store(D, (dst0 + dst1) * v_025);
            }
#endif
        }

        return dx;
    }

private:
    int cn;         // number of interleaved channels (1 or 4 vectorized)
    bool fast_mode; // true only for 2x2 scaling of 1/4-channel data
    int step;       // source row stride in bytes
};
2848
2849 #else
2850
// No-SIMD builds: ResizeAreaFastNoVec::operator() returns 0, so the generic
// scalar loop in ResizeAreaFastVec processes the entire row.
typedef ResizeAreaFastNoVec<uchar, uchar> ResizeAreaFastVec_SIMD_8u;
typedef ResizeAreaFastNoVec<ushort, ushort> ResizeAreaFastVec_SIMD_16u;
typedef ResizeAreaFastNoVec<short, short> ResizeAreaFastVec_SIMD_16s;
typedef ResizeAreaFastNoVec<float, float> ResizeAreaFastVec_SIMD_32f;
2855
2856 #endif
2857
2858 template<typename T, typename SIMDVecOp>
2859 struct ResizeAreaFastVec
2860 {
ResizeAreaFastVeccv::ResizeAreaFastVec2861 ResizeAreaFastVec(int _scale_x, int _scale_y, int _cn, int _step) :
2862 scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step), vecOp(_cn, _step)
2863 {
2864 fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
2865 }
2866
operator ()cv::ResizeAreaFastVec2867 int operator() (const T* S, T* D, int w) const
2868 {
2869 if (!fast_mode)
2870 return 0;
2871
2872 const T* nextS = (const T*)((const uchar*)S + step);
2873 int dx = vecOp(S, D, w);
2874
2875 if (cn == 1)
2876 for( ; dx < w; ++dx )
2877 {
2878 int index = dx*2;
2879 D[dx] = (T)((S[index] + S[index+1] + nextS[index] + nextS[index+1] + 2) >> 2);
2880 }
2881 else if (cn == 3)
2882 for( ; dx < w; dx += 3 )
2883 {
2884 int index = dx*2;
2885 D[dx] = (T)((S[index] + S[index+3] + nextS[index] + nextS[index+3] + 2) >> 2);
2886 D[dx+1] = (T)((S[index+1] + S[index+4] + nextS[index+1] + nextS[index+4] + 2) >> 2);
2887 D[dx+2] = (T)((S[index+2] + S[index+5] + nextS[index+2] + nextS[index+5] + 2) >> 2);
2888 }
2889 else
2890 {
2891 CV_Assert(cn == 4);
2892 for( ; dx < w; dx += 4 )
2893 {
2894 int index = dx*2;
2895 D[dx] = (T)((S[index] + S[index+4] + nextS[index] + nextS[index+4] + 2) >> 2);
2896 D[dx+1] = (T)((S[index+1] + S[index+5] + nextS[index+1] + nextS[index+5] + 2) >> 2);
2897 D[dx+2] = (T)((S[index+2] + S[index+6] + nextS[index+2] + nextS[index+6] + 2) >> 2);
2898 D[dx+3] = (T)((S[index+3] + S[index+7] + nextS[index+3] + nextS[index+7] + 2) >> 2);
2899 }
2900 }
2901
2902 return dx;
2903 }
2904
2905 private:
2906 int scale_x, scale_y;
2907 int cn;
2908 bool fast_mode;
2909 int step;
2910 SIMDVecOp vecOp;
2911 };
2912
2913 template <typename T, typename WT, typename VecOp>
2914 class resizeAreaFast_Invoker :
2915 public ParallelLoopBody
2916 {
2917 public:
resizeAreaFast_Invoker(const Mat & _src,Mat & _dst,int _scale_x,int _scale_y,const int * _ofs,const int * _xofs)2918 resizeAreaFast_Invoker(const Mat &_src, Mat &_dst,
2919 int _scale_x, int _scale_y, const int* _ofs, const int* _xofs) :
2920 ParallelLoopBody(), src(_src), dst(_dst), scale_x(_scale_x),
2921 scale_y(_scale_y), ofs(_ofs), xofs(_xofs)
2922 {
2923 }
2924
operator ()(const Range & range) const2925 virtual void operator() (const Range& range) const CV_OVERRIDE
2926 {
2927 Size ssize = src.size(), dsize = dst.size();
2928 int cn = src.channels();
2929 int area = scale_x*scale_y;
2930 float scale = 1.f/(area);
2931 int dwidth1 = (ssize.width/scale_x)*cn;
2932 dsize.width *= cn;
2933 ssize.width *= cn;
2934 int dy, dx, k = 0;
2935
2936 VecOp vop(scale_x, scale_y, src.channels(), (int)src.step/*, area_ofs*/);
2937
2938 for( dy = range.start; dy < range.end; dy++ )
2939 {
2940 T* D = (T*)(dst.data + dst.step*dy);
2941 int sy0 = dy*scale_y;
2942 int w = sy0 + scale_y <= ssize.height ? dwidth1 : 0;
2943
2944 if( sy0 >= ssize.height )
2945 {
2946 for( dx = 0; dx < dsize.width; dx++ )
2947 D[dx] = 0;
2948 continue;
2949 }
2950
2951 dx = vop(src.template ptr<T>(sy0), D, w);
2952 for( ; dx < w; dx++ )
2953 {
2954 const T* S = src.template ptr<T>(sy0) + xofs[dx];
2955 WT sum = 0;
2956 k = 0;
2957 #if CV_ENABLE_UNROLLED
2958 for( ; k <= area - 4; k += 4 )
2959 sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]];
2960 #endif
2961 for( ; k < area; k++ )
2962 sum += S[ofs[k]];
2963
2964 D[dx] = saturate_cast<T>(sum * scale);
2965 }
2966
2967 for( ; dx < dsize.width; dx++ )
2968 {
2969 WT sum = 0;
2970 int count = 0, sx0 = xofs[dx];
2971 if( sx0 >= ssize.width )
2972 D[dx] = 0;
2973
2974 for( int sy = 0; sy < scale_y; sy++ )
2975 {
2976 if( sy0 + sy >= ssize.height )
2977 break;
2978 const T* S = src.template ptr<T>(sy0 + sy) + sx0;
2979 for( int sx = 0; sx < scale_x*cn; sx += cn )
2980 {
2981 if( sx0 + sx >= ssize.width )
2982 break;
2983 sum += S[sx];
2984 count++;
2985 }
2986 }
2987
2988 D[dx] = saturate_cast<T>((float)sum/count);
2989 }
2990 }
2991 }
2992
2993 private:
2994 Mat src;
2995 Mat dst;
2996 int scale_x, scale_y;
2997 const int *ofs, *xofs;
2998 };
2999
3000 template<typename T, typename WT, typename VecOp>
resizeAreaFast_(const Mat & src,Mat & dst,const int * ofs,const int * xofs,int scale_x,int scale_y)3001 static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int* xofs,
3002 int scale_x, int scale_y )
3003 {
3004 Range range(0, dst.rows);
3005 resizeAreaFast_Invoker<T, WT, VecOp> invoker(src, dst, scale_x,
3006 scale_y, ofs, xofs);
3007 parallel_for_(range, invoker, dst.total()/(double)(1<<16));
3008 }
3009
// One term of the separable INTER_AREA weighted sum: destination element
// `di` accumulates source element `si` scaled by weight `alpha`.
// Indices are already multiplied by the channel count where applicable.
struct DecimateAlpha
{
    int si, di;
    float alpha;
};
3015
3016
// Parallel body for the generic (fractional-scale) INTER_AREA resize.
// The x-table (xtab) and y-table (ytab) hold the separable weights built by
// computeResizeAreaTab. For each y-table entry, one source row is reduced
// horizontally into `buf`; `sum` accumulates beta-weighted rows of the same
// destination row and is flushed to dst whenever the destination row index
// changes. tabofs maps destination-row ranges to y-table index ranges, so
// each parallel stripe owns complete destination rows.
template<typename T, typename WT> class ResizeArea_Invoker :
    public ParallelLoopBody
{
public:
    ResizeArea_Invoker( const Mat& _src, Mat& _dst,
                        const DecimateAlpha* _xtab, int _xtab_size,
                        const DecimateAlpha* _ytab, int _ytab_size,
                        const int* _tabofs )
    {
        src = &_src;
        dst = &_dst;
        xtab0 = _xtab;
        xtab_size0 = _xtab_size;
        ytab = _ytab;
        ytab_size = _ytab_size;
        tabofs = _tabofs;
    }

    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        Size dsize = dst->size();
        int cn = dst->channels();
        dsize.width *= cn;
        // two scratch rows: buf (current source row, reduced) and sum (accumulator)
        AutoBuffer<WT> _buffer(dsize.width*2);
        const DecimateAlpha* xtab = xtab0;
        int xtab_size = xtab_size0;
        WT *buf = _buffer.data(), *sum = buf + dsize.width;
        // j iterates over the slice of the y-table assigned to this stripe
        int j_start = tabofs[range.start], j_end = tabofs[range.end], j, k, dx, prev_dy = ytab[j_start].di;

        for( dx = 0; dx < dsize.width; dx++ )
            sum[dx] = (WT)0;

        for( j = j_start; j < j_end; j++ )
        {
            WT beta = ytab[j].alpha; // vertical weight of this source row
            int dy = ytab[j].di;     // destination row it contributes to
            int sy = ytab[j].si;     // source row

            {
                // --- horizontal pass: reduce source row sy into buf ---
                const T* S = src->template ptr<T>(sy);
                for( dx = 0; dx < dsize.width; dx++ )
                    buf[dx] = (WT)0;

                // per-channel-count specializations of the same accumulation
                if( cn == 1 )
                    for( k = 0; k < xtab_size; k++ )
                    {
                        int dxn = xtab[k].di;
                        WT alpha = xtab[k].alpha;
                        buf[dxn] += S[xtab[k].si]*alpha;
                    }
                else if( cn == 2 )
                    for( k = 0; k < xtab_size; k++ )
                    {
                        int sxn = xtab[k].si;
                        int dxn = xtab[k].di;
                        WT alpha = xtab[k].alpha;
                        WT t0 = buf[dxn] + S[sxn]*alpha;
                        WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
                        buf[dxn] = t0; buf[dxn+1] = t1;
                    }
                else if( cn == 3 )
                    for( k = 0; k < xtab_size; k++ )
                    {
                        int sxn = xtab[k].si;
                        int dxn = xtab[k].di;
                        WT alpha = xtab[k].alpha;
                        WT t0 = buf[dxn] + S[sxn]*alpha;
                        WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
                        WT t2 = buf[dxn+2] + S[sxn+2]*alpha;
                        buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2;
                    }
                else if( cn == 4 )
                {
                    for( k = 0; k < xtab_size; k++ )
                    {
                        int sxn = xtab[k].si;
                        int dxn = xtab[k].di;
                        WT alpha = xtab[k].alpha;
                        WT t0 = buf[dxn] + S[sxn]*alpha;
                        WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
                        buf[dxn] = t0; buf[dxn+1] = t1;
                        t0 = buf[dxn+2] + S[sxn+2]*alpha;
                        t1 = buf[dxn+3] + S[sxn+3]*alpha;
                        buf[dxn+2] = t0; buf[dxn+3] = t1;
                    }
                }
                else
                {
                    // generic channel count
                    for( k = 0; k < xtab_size; k++ )
                    {
                        int sxn = xtab[k].si;
                        int dxn = xtab[k].di;
                        WT alpha = xtab[k].alpha;
                        for( int c = 0; c < cn; c++ )
                            buf[dxn + c] += S[sxn + c]*alpha;
                    }
                }
            }

            // --- vertical pass ---
            if( dy != prev_dy )
            {
                // destination row changed: flush the finished row and start
                // the new accumulation with the current contribution
                T* D = dst->template ptr<T>(prev_dy);

                for( dx = 0; dx < dsize.width; dx++ )
                {
                    D[dx] = saturate_cast<T>(sum[dx]);
                    sum[dx] = beta*buf[dx];
                }
                prev_dy = dy;
            }
            else
            {
                for( dx = 0; dx < dsize.width; dx++ )
                    sum[dx] += beta*buf[dx];
            }
        }

        {
            // flush the last accumulated destination row of this stripe
            T* D = dst->template ptr<T>(prev_dy);
            for( dx = 0; dx < dsize.width; dx++ )
                D[dx] = saturate_cast<T>(sum[dx]);
        }
    }

private:
    const Mat* src;
    Mat* dst;
    const DecimateAlpha* xtab0; // horizontal weight table
    const DecimateAlpha* ytab;  // vertical weight table
    int xtab_size0, ytab_size;
    const int* tabofs;          // dst-row -> ytab index offsets
};
3149
3150
3151 template <typename T, typename WT>
resizeArea_(const Mat & src,Mat & dst,const DecimateAlpha * xtab,int xtab_size,const DecimateAlpha * ytab,int ytab_size,const int * tabofs)3152 static void resizeArea_( const Mat& src, Mat& dst,
3153 const DecimateAlpha* xtab, int xtab_size,
3154 const DecimateAlpha* ytab, int ytab_size,
3155 const int* tabofs )
3156 {
3157 parallel_for_(Range(0, dst.rows),
3158 ResizeArea_Invoker<T, WT>(src, dst, xtab, xtab_size, ytab, ytab_size, tabofs),
3159 dst.total()/((double)(1 << 16)));
3160 }
3161
3162
// Type-dispatched kernel for the separable interpolation paths
// (nearest/linear/cubic/lanczos); alpha/beta are the per-axis weight tables.
typedef void (*ResizeFunc)( const Mat& src, Mat& dst,
                            const int* xofs, const void* alpha,
                            const int* yofs, const void* beta,
                            int xmin, int xmax, int ksize );

// Type-dispatched kernel for the fast INTER_AREA path (integer scales).
typedef void (*ResizeAreaFastFunc)( const Mat& src, Mat& dst,
                                    const int* ofs, const int *xofs,
                                    int scale_x, int scale_y );

// Type-dispatched kernel for the generic INTER_AREA path (fractional scales).
typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst,
                                const DecimateAlpha* xtab, int xtab_size,
                                const DecimateAlpha* ytab, int ytab_size,
                                const int* yofs);
3176
3177
computeResizeAreaTab(int ssize,int dsize,int cn,double scale,DecimateAlpha * tab)3178 static int computeResizeAreaTab( int ssize, int dsize, int cn, double scale, DecimateAlpha* tab )
3179 {
3180 int k = 0;
3181 for(int dx = 0; dx < dsize; dx++ )
3182 {
3183 double fsx1 = dx * scale;
3184 double fsx2 = fsx1 + scale;
3185 double cellWidth = std::min(scale, ssize - fsx1);
3186
3187 int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
3188
3189 sx2 = std::min(sx2, ssize - 1);
3190 sx1 = std::min(sx1, sx2);
3191
3192 if( sx1 - fsx1 > 1e-3 )
3193 {
3194 assert( k < ssize*2 );
3195 tab[k].di = dx * cn;
3196 tab[k].si = (sx1 - 1) * cn;
3197 tab[k++].alpha = (float)((sx1 - fsx1) / cellWidth);
3198 }
3199
3200 for(int sx = sx1; sx < sx2; sx++ )
3201 {
3202 assert( k < ssize*2 );
3203 tab[k].di = dx * cn;
3204 tab[k].si = sx * cn;
3205 tab[k++].alpha = float(1.0 / cellWidth);
3206 }
3207
3208 if( fsx2 - sx2 > 1e-3 )
3209 {
3210 assert( k < ssize*2 );
3211 tab[k].di = dx * cn;
3212 tab[k].si = sx2 * cn;
3213 tab[k++].alpha = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
3214 }
3215 }
3216 return k;
3217 }
3218
3219 #ifdef HAVE_OPENCL
ocl_computeResizeAreaTabs(int ssize,int dsize,double scale,int * const map_tab,float * const alpha_tab,int * const ofs_tab)3220 static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int * const map_tab,
3221 float * const alpha_tab, int * const ofs_tab)
3222 {
3223 int k = 0, dx = 0;
3224 for ( ; dx < dsize; dx++)
3225 {
3226 ofs_tab[dx] = k;
3227
3228 double fsx1 = dx * scale;
3229 double fsx2 = fsx1 + scale;
3230 double cellWidth = std::min(scale, ssize - fsx1);
3231
3232 int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
3233
3234 sx2 = std::min(sx2, ssize - 1);
3235 sx1 = std::min(sx1, sx2);
3236
3237 if (sx1 - fsx1 > 1e-3)
3238 {
3239 map_tab[k] = sx1 - 1;
3240 alpha_tab[k++] = (float)((sx1 - fsx1) / cellWidth);
3241 }
3242
3243 for (int sx = sx1; sx < sx2; sx++)
3244 {
3245 map_tab[k] = sx;
3246 alpha_tab[k++] = float(1.0 / cellWidth);
3247 }
3248
3249 if (fsx2 - sx2 > 1e-3)
3250 {
3251 map_tab[k] = sx2;
3252 alpha_tab[k++] = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
3253 }
3254 }
3255 ofs_tab[dx] = k;
3256 }
3257
ocl_resize(InputArray _src,OutputArray _dst,Size dsize,double fx,double fy,int interpolation)3258 static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
3259 double fx, double fy, int interpolation)
3260 {
3261 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
3262
3263 double inv_fx = 1.0 / fx, inv_fy = 1.0 / fy;
3264 float inv_fxf = (float)inv_fx, inv_fyf = (float)inv_fy;
3265 int iscale_x = saturate_cast<int>(inv_fx), iscale_y = saturate_cast<int>(inv_fx);
3266 bool is_area_fast = std::abs(inv_fx - iscale_x) < DBL_EPSILON &&
3267 std::abs(inv_fy - iscale_y) < DBL_EPSILON;
3268
3269 // in case of scale_x && scale_y is equal to 2
3270 // INTER_AREA (fast) also is equal to INTER_LINEAR
3271 if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
3272 /*interpolation = INTER_AREA*/CV_UNUSED(0); // INTER_AREA is slower
3273
3274 if( !(cn <= 4 &&
3275 (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR ||
3276 (interpolation == INTER_AREA && inv_fx >= 1 && inv_fy >= 1) )) )
3277 return false;
3278
3279 UMat src = _src.getUMat();
3280 _dst.create(dsize, type);
3281 UMat dst = _dst.getUMat();
3282
3283 Size ssize = src.size();
3284 ocl::Kernel k;
3285 size_t globalsize[] = { (size_t)dst.cols, (size_t)dst.rows };
3286
3287 ocl::Image2D srcImage;
3288
3289 // See if this could be done with a sampler. We stick with integer
3290 // datatypes because the observed error is low.
3291 bool useSampler = (interpolation == INTER_LINEAR && ocl::Device::getDefault().imageSupport() &&
3292 ocl::Image2D::canCreateAlias(src) && depth <= 4 &&
3293 ocl::Image2D::isFormatSupported(depth, cn, true) &&
3294 src.offset==0);
3295 if (useSampler)
3296 {
3297 int wdepth = std::max(depth, CV_32S);
3298 char buf[2][32];
3299 cv::String compileOpts = format("-D USE_SAMPLER -D depth=%d -D T=%s -D T1=%s "
3300 "-D convertToDT=%s -D cn=%d",
3301 depth, ocl::typeToStr(type), ocl::typeToStr(depth),
3302 ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
3303 cn);
3304 k.create("resizeSampler", ocl::imgproc::resize_oclsrc, compileOpts);
3305
3306 if (k.empty())
3307 useSampler = false;
3308 else
3309 {
3310 // Convert the input into an OpenCL image type, using normalized channel data types
3311 // and aliasing the UMat.
3312 srcImage = ocl::Image2D(src, true, true);
3313 k.args(srcImage, ocl::KernelArg::WriteOnly(dst),
3314 (float)inv_fx, (float)inv_fy);
3315 }
3316 }
3317
3318 if (interpolation == INTER_LINEAR && !useSampler)
3319 {
3320 char buf[2][32];
3321
3322 // integer path is slower because of CPU part, so it's disabled
3323 if (depth == CV_8U && ((void)0, 0))
3324 {
3325 AutoBuffer<uchar> _buffer((dsize.width + dsize.height)*(sizeof(int) + sizeof(short)*2));
3326 int* xofs = (int*)_buffer.data(), * yofs = xofs + dsize.width;
3327 short* ialpha = (short*)(yofs + dsize.height), * ibeta = ialpha + dsize.width*2;
3328 float fxx, fyy;
3329 int sx, sy;
3330
3331 for (int dx = 0; dx < dsize.width; dx++)
3332 {
3333 fxx = (float)((dx+0.5)*inv_fx - 0.5);
3334 sx = cvFloor(fxx);
3335 fxx -= sx;
3336
3337 if (sx < 0)
3338 fxx = 0, sx = 0;
3339
3340 if (sx >= ssize.width-1)
3341 fxx = 0, sx = ssize.width-1;
3342
3343 xofs[dx] = sx;
3344 ialpha[dx*2 + 0] = saturate_cast<short>((1.f - fxx) * INTER_RESIZE_COEF_SCALE);
3345 ialpha[dx*2 + 1] = saturate_cast<short>(fxx * INTER_RESIZE_COEF_SCALE);
3346 }
3347
3348 for (int dy = 0; dy < dsize.height; dy++)
3349 {
3350 fyy = (float)((dy+0.5)*inv_fy - 0.5);
3351 sy = cvFloor(fyy);
3352 fyy -= sy;
3353
3354 yofs[dy] = sy;
3355 ibeta[dy*2 + 0] = saturate_cast<short>((1.f - fyy) * INTER_RESIZE_COEF_SCALE);
3356 ibeta[dy*2 + 1] = saturate_cast<short>(fyy * INTER_RESIZE_COEF_SCALE);
3357 }
3358
3359 int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
3360 UMat coeffs;
3361 Mat(1, static_cast<int>(_buffer.size()), CV_8UC1, _buffer.data()).copyTo(coeffs);
3362
3363 k.create("resizeLN", ocl::imgproc::resize_oclsrc,
3364 format("-D INTER_LINEAR_INTEGER -D depth=%d -D T=%s -D T1=%s "
3365 "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
3366 "-D INTER_RESIZE_COEF_BITS=%d",
3367 depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
3368 ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
3369 ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
3370 cn, INTER_RESIZE_COEF_BITS));
3371 if (k.empty())
3372 return false;
3373
3374 k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
3375 ocl::KernelArg::PtrReadOnly(coeffs));
3376 }
3377 else
3378 {
3379 int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
3380 k.create("resizeLN", ocl::imgproc::resize_oclsrc,
3381 format("-D INTER_LINEAR -D depth=%d -D T=%s -D T1=%s "
3382 "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
3383 "-D INTER_RESIZE_COEF_BITS=%d",
3384 depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
3385 ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
3386 ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
3387 cn, INTER_RESIZE_COEF_BITS));
3388 if (k.empty())
3389 return false;
3390
3391 k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
3392 (float)inv_fx, (float)inv_fy);
3393 }
3394 }
3395 else if (interpolation == INTER_NEAREST)
3396 {
3397 k.create("resizeNN", ocl::imgproc::resize_oclsrc,
3398 format("-D INTER_NEAREST -D T=%s -D T1=%s -D cn=%d",
3399 ocl::vecopTypeToStr(type), ocl::vecopTypeToStr(depth), cn));
3400 if (k.empty())
3401 return false;
3402
3403 k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
3404 (float)inv_fx, (float)inv_fy);
3405 }
3406 else if (interpolation == INTER_AREA)
3407 {
3408 int wdepth = std::max(depth, is_area_fast ? CV_32S : CV_32F);
3409 int wtype = CV_MAKE_TYPE(wdepth, cn);
3410
3411 char cvt[2][40];
3412 String buildOption = format("-D INTER_AREA -D T=%s -D T1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d",
3413 ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
3414 ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), cn);
3415
3416 UMat alphaOcl, tabofsOcl, mapOcl;
3417 UMat dmap, smap;
3418
3419 if (is_area_fast)
3420 {
3421 int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn);
3422 buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
3423 " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff",
3424 ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]),
3425 ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]),
3426 iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y));
3427
3428 k.create("resizeAREA_FAST", ocl::imgproc::resize_oclsrc, buildOption);
3429 if (k.empty())
3430 return false;
3431 }
3432 else
3433 {
3434 buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0]));
3435 k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption);
3436 if (k.empty())
3437 return false;
3438
3439 int xytab_size = (ssize.width + ssize.height) << 1;
3440 int tabofs_size = dsize.height + dsize.width + 2;
3441
3442 AutoBuffer<int> _xymap_tab(xytab_size), _xyofs_tab(tabofs_size);
3443 AutoBuffer<float> _xyalpha_tab(xytab_size);
3444 int * xmap_tab = _xymap_tab.data(), * ymap_tab = _xymap_tab.data() + (ssize.width << 1);
3445 float * xalpha_tab = _xyalpha_tab.data(), * yalpha_tab = _xyalpha_tab.data() + (ssize.width << 1);
3446 int * xofs_tab = _xyofs_tab.data(), * yofs_tab = _xyofs_tab.data() + dsize.width + 1;
3447
3448 ocl_computeResizeAreaTabs(ssize.width, dsize.width, inv_fx, xmap_tab, xalpha_tab, xofs_tab);
3449 ocl_computeResizeAreaTabs(ssize.height, dsize.height, inv_fy, ymap_tab, yalpha_tab, yofs_tab);
3450
3451 // loading precomputed arrays to GPU
3452 Mat(1, xytab_size, CV_32FC1, _xyalpha_tab.data()).copyTo(alphaOcl);
3453 Mat(1, xytab_size, CV_32SC1, _xymap_tab.data()).copyTo(mapOcl);
3454 Mat(1, tabofs_size, CV_32SC1, _xyofs_tab.data()).copyTo(tabofsOcl);
3455 }
3456
3457 ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst);
3458
3459 if (is_area_fast)
3460 k.args(srcarg, dstarg);
3461 else
3462 k.args(srcarg, dstarg, inv_fxf, inv_fyf, ocl::KernelArg::PtrReadOnly(tabofsOcl),
3463 ocl::KernelArg::PtrReadOnly(mapOcl), ocl::KernelArg::PtrReadOnly(alphaOcl));
3464
3465 return k.run(2, globalsize, NULL, false);
3466 }
3467
3468 return k.run(2, globalsize, 0, false);
3469 }
3470
3471 #endif
3472
3473 #ifdef HAVE_IPP
3474 #define IPP_RESIZE_PARALLEL 1
3475
3476 #ifdef HAVE_IPP_IW
// Parallel body that performs IPP-IW tiled resize: each invocation processes a
// horizontal strip of destination rows given by the Range.
class ipp_resizeParallel: public ParallelLoopBody
{
public:
    // Stores references only; actual IPP state is created in Init().
    // 'ok' is shared with the caller and reports whether all strips succeeded.
    ipp_resizeParallel(::ipp::IwiImage &src, ::ipp::IwiImage &dst, bool &ok):
        m_src(src), m_dst(dst), m_ok(ok) {}
    ~ipp_resizeParallel()
    {
    }

    // Allocates/initializes the IPP resize spec for the given interpolation.
    // Must be called once before operator() runs; sets m_ok on success.
    // May throw ::ipp::IwException (handled by the caller's try block).
    void Init(IppiInterpolationType inter)
    {
        iwiResize.InitAlloc(m_src.m_size, m_dst.m_size, m_src.m_dataType, m_src.m_channels, inter, ::ipp::IwiResizeParams(0, 0, 0.75, 4), ippBorderRepl);

        m_ok = true;
    }

    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        CV_INSTRUMENT_REGION_IPP();

        // Bail out early if Init() failed or another strip already reported an error.
        if(!m_ok)
            return;

        try
        {
            // Tile covers full destination width and the rows [range.start, range.end).
            ::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, m_dst.m_size.width, range.end - range.start);
            CV_INSTRUMENT_FUN_IPP(iwiResize, m_src, m_dst, ippBorderRepl, tile);
        }
        catch(const ::ipp::IwException &)
        {
            // Record the failure; remaining strips become no-ops via the m_ok check.
            m_ok = false;
            return;
        }
    }
private:
    ::ipp::IwiImage &m_src;
    ::ipp::IwiImage &m_dst;

    // mutable: operator() is const but the IPP spec keeps internal working state.
    mutable ::ipp::IwiResize iwiResize;

    // volatile: written from worker threads, read by others as an early-out flag.
    volatile bool &m_ok;
    const ipp_resizeParallel& operator= (const ipp_resizeParallel&);
};
3520
// Parallel body that emulates resize via an IPP-IW affine warp; used when the
// requested scale does not match the integer dst/src size ratio exactly
// (sub-pixel accuracy). Each invocation warps a strip of destination rows.
class ipp_resizeAffineParallel: public ParallelLoopBody
{
public:
    // Stores references only; actual IPP state is created in Init().
    // 'ok' is shared with the caller and reports whether all strips succeeded.
    ipp_resizeAffineParallel(::ipp::IwiImage &src, ::ipp::IwiImage &dst, bool &ok):
        m_src(src), m_dst(dst), m_ok(ok) {}
    ~ipp_resizeAffineParallel()
    {
    }

    // Builds a pure scale+shift affine matrix equivalent to the resize and
    // allocates the warp spec. The half-pixel shift aligns sampling centers;
    // for nearest-neighbor a tiny negative epsilon is used instead to avoid
    // rounding-boundary artifacts. Sets m_ok on success; may throw IwException.
    void Init(IppiInterpolationType inter, double scaleX, double scaleY)
    {
        double shift = (inter == ippNearest)?-1e-10:-0.5;
        double coeffs[2][3] = {
            {scaleX, 0, shift+0.5*scaleX},
            {0, scaleY, shift+0.5*scaleY}
        };

        iwiWarpAffine.InitAlloc(m_src.m_size, m_dst.m_size, m_src.m_dataType, m_src.m_channels, coeffs, iwTransForward, inter, ::ipp::IwiWarpAffineParams(0, 0, 0.75), ippBorderRepl);

        m_ok = true;
    }

    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        CV_INSTRUMENT_REGION_IPP();

        // Bail out early if Init() failed or another strip already reported an error.
        if(!m_ok)
            return;

        try
        {
            // Tile covers full destination width and the rows [range.start, range.end).
            ::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, m_dst.m_size.width, range.end - range.start);
            CV_INSTRUMENT_FUN_IPP(iwiWarpAffine, m_src, m_dst, tile);
        }
        catch(const ::ipp::IwException &)
        {
            // Record the failure; remaining strips become no-ops via the m_ok check.
            m_ok = false;
            return;
        }
    }
private:
    ::ipp::IwiImage &m_src;
    ::ipp::IwiImage &m_dst;

    // mutable: operator() is const but the IPP spec keeps internal working state.
    mutable ::ipp::IwiWarpAffine iwiWarpAffine;

    // volatile: written from worker threads, read by others as an early-out flag.
    volatile bool &m_ok;
    const ipp_resizeAffineParallel& operator= (const ipp_resizeAffineParallel&);
};
3570 #endif
3571
ipp_resize(const uchar * src_data,size_t src_step,int src_width,int src_height,uchar * dst_data,size_t dst_step,int dst_width,int dst_height,double inv_scale_x,double inv_scale_y,int depth,int channels,int interpolation)3572 static bool ipp_resize(const uchar * src_data, size_t src_step, int src_width, int src_height,
3573 uchar * dst_data, size_t dst_step, int dst_width, int dst_height, double inv_scale_x, double inv_scale_y,
3574 int depth, int channels, int interpolation)
3575 {
3576 #ifdef HAVE_IPP_IW
3577 CV_INSTRUMENT_REGION_IPP();
3578
3579 IppDataType ippDataType = ippiGetDataType(depth);
3580 IppiInterpolationType ippInter = ippiGetInterpolation(interpolation);
3581 if((int)ippInter < 0)
3582 return false;
3583
3584 // Resize which doesn't match OpenCV exactly
3585 if (!cv::ipp::useIPP_NotExact())
3586 {
3587 if (ippInter == ippNearest || ippInter == ippSuper || (ippDataType == ipp8u && ippInter == ippLinear))
3588 return false;
3589 }
3590
3591 if(ippInter != ippLinear && ippDataType == ipp64f)
3592 return false;
3593
3594 #if IPP_VERSION_X100 < 201801
3595 // Degradations on int^2 linear downscale
3596 if (ippDataType != ipp64f && ippInter == ippLinear && inv_scale_x < 1 && inv_scale_y < 1) // if downscale
3597 {
3598 int scale_x = (int)(1 / inv_scale_x);
3599 int scale_y = (int)(1 / inv_scale_y);
3600 if (1 / inv_scale_x - scale_x < DBL_EPSILON && 1 / inv_scale_y - scale_y < DBL_EPSILON) // if integer
3601 {
3602 if (!(scale_x&(scale_x - 1)) && !(scale_y&(scale_y - 1))) // if power of 2
3603 return false;
3604 }
3605 }
3606 #endif
3607
3608 bool affine = false;
3609 const double IPP_RESIZE_EPS = (depth == CV_64F)?0:1e-10;
3610 double ex = fabs((double)dst_width / src_width - inv_scale_x) / inv_scale_x;
3611 double ey = fabs((double)dst_height / src_height - inv_scale_y) / inv_scale_y;
3612
3613 // Use affine transform resize to allow sub-pixel accuracy
3614 if(ex > IPP_RESIZE_EPS || ey > IPP_RESIZE_EPS)
3615 affine = true;
3616
3617 // Affine doesn't support Lanczos and Super interpolations
3618 if(affine && (ippInter == ippLanczos || ippInter == ippSuper))
3619 return false;
3620
3621 try
3622 {
3623 ::ipp::IwiImage iwSrc(::ipp::IwiSize(src_width, src_height), ippDataType, channels, 0, (void*)src_data, src_step);
3624 ::ipp::IwiImage iwDst(::ipp::IwiSize(dst_width, dst_height), ippDataType, channels, 0, (void*)dst_data, dst_step);
3625
3626 bool ok;
3627 int threads = ippiSuggestThreadsNum(iwDst, 1+((double)(src_width*src_height)/(dst_width*dst_height)));
3628 Range range(0, dst_height);
3629 ipp_resizeParallel invokerGeneral(iwSrc, iwDst, ok);
3630 ipp_resizeAffineParallel invokerAffine(iwSrc, iwDst, ok);
3631 ParallelLoopBody *pInvoker = NULL;
3632 if(affine)
3633 {
3634 pInvoker = &invokerAffine;
3635 invokerAffine.Init(ippInter, inv_scale_x, inv_scale_y);
3636 }
3637 else
3638 {
3639 pInvoker = &invokerGeneral;
3640 invokerGeneral.Init(ippInter);
3641 }
3642
3643 if(IPP_RESIZE_PARALLEL && threads > 1)
3644 parallel_for_(range, *pInvoker, threads*4);
3645 else
3646 pInvoker->operator()(range);
3647
3648 if(!ok)
3649 return false;
3650 }
3651 catch(const ::ipp::IwException &)
3652 {
3653 return false;
3654 }
3655 return true;
3656 #else
3657 CV_UNUSED(src_data); CV_UNUSED(src_step); CV_UNUSED(src_width); CV_UNUSED(src_height); CV_UNUSED(dst_data); CV_UNUSED(dst_step);
3658 CV_UNUSED(dst_width); CV_UNUSED(dst_height); CV_UNUSED(inv_scale_x); CV_UNUSED(inv_scale_y); CV_UNUSED(depth);
3659 CV_UNUSED(channels); CV_UNUSED(interpolation);
3660 return false;
3661 #endif
3662 }
3663 #endif
3664
3665 //==================================================================================================
3666
3667 namespace hal {
3668
resize(int src_type,const uchar * src_data,size_t src_step,int src_width,int src_height,uchar * dst_data,size_t dst_step,int dst_width,int dst_height,double inv_scale_x,double inv_scale_y,int interpolation)3669 void resize(int src_type,
3670 const uchar * src_data, size_t src_step, int src_width, int src_height,
3671 uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
3672 double inv_scale_x, double inv_scale_y, int interpolation)
3673 {
3674 CV_INSTRUMENT_REGION();
3675
3676 CV_Assert((dst_width > 0 && dst_height > 0) || (inv_scale_x > 0 && inv_scale_y > 0));
3677 if (inv_scale_x < DBL_EPSILON || inv_scale_y < DBL_EPSILON)
3678 {
3679 inv_scale_x = static_cast<double>(dst_width) / src_width;
3680 inv_scale_y = static_cast<double>(dst_height) / src_height;
3681 }
3682
3683 CALL_HAL(resize, cv_hal_resize, src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, inv_scale_x, inv_scale_y, interpolation);
3684
3685 int depth = CV_MAT_DEPTH(src_type), cn = CV_MAT_CN(src_type);
3686 Size dsize = Size(saturate_cast<int>(src_width*inv_scale_x),
3687 saturate_cast<int>(src_height*inv_scale_y));
3688 CV_Assert( !dsize.empty() );
3689
3690 CV_IPP_RUN_FAST(ipp_resize(src_data, src_step, src_width, src_height, dst_data, dst_step, dsize.width, dsize.height, inv_scale_x, inv_scale_y, depth, cn, interpolation))
3691
3692 static ResizeFunc linear_tab[] =
3693 {
3694 resizeGeneric_<
3695 HResizeLinear<uchar, int, short,
3696 INTER_RESIZE_COEF_SCALE,
3697 HResizeLinearVec_8u32s>,
3698 VResizeLinear<uchar, int, short,
3699 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
3700 VResizeLinearVec_32s8u> >,
3701 0,
3702 resizeGeneric_<
3703 HResizeLinear<ushort, float, float, 1,
3704 HResizeLinearVec_16u32f>,
3705 VResizeLinear<ushort, float, float, Cast<float, ushort>,
3706 VResizeLinearVec_32f16u> >,
3707 resizeGeneric_<
3708 HResizeLinear<short, float, float, 1,
3709 HResizeLinearVec_16s32f>,
3710 VResizeLinear<short, float, float, Cast<float, short>,
3711 VResizeLinearVec_32f16s> >,
3712 0,
3713 resizeGeneric_<
3714 HResizeLinear<float, float, float, 1,
3715 HResizeLinearVec_32f>,
3716 VResizeLinear<float, float, float, Cast<float, float>,
3717 VResizeLinearVec_32f> >,
3718 resizeGeneric_<
3719 HResizeLinear<double, double, float, 1,
3720 HResizeNoVec>,
3721 VResizeLinear<double, double, float, Cast<double, double>,
3722 VResizeNoVec> >,
3723 0
3724 };
3725
3726 static ResizeFunc cubic_tab[] =
3727 {
3728 resizeGeneric_<
3729 HResizeCubic<uchar, int, short>,
3730 VResizeCubic<uchar, int, short,
3731 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
3732 VResizeCubicVec_32s8u> >,
3733 0,
3734 resizeGeneric_<
3735 HResizeCubic<ushort, float, float>,
3736 VResizeCubic<ushort, float, float, Cast<float, ushort>,
3737 VResizeCubicVec_32f16u> >,
3738 resizeGeneric_<
3739 HResizeCubic<short, float, float>,
3740 VResizeCubic<short, float, float, Cast<float, short>,
3741 VResizeCubicVec_32f16s> >,
3742 0,
3743 resizeGeneric_<
3744 HResizeCubic<float, float, float>,
3745 VResizeCubic<float, float, float, Cast<float, float>,
3746 VResizeCubicVec_32f> >,
3747 resizeGeneric_<
3748 HResizeCubic<double, double, float>,
3749 VResizeCubic<double, double, float, Cast<double, double>,
3750 VResizeNoVec> >,
3751 0
3752 };
3753
3754 static ResizeFunc lanczos4_tab[] =
3755 {
3756 resizeGeneric_<HResizeLanczos4<uchar, int, short>,
3757 VResizeLanczos4<uchar, int, short,
3758 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
3759 VResizeNoVec> >,
3760 0,
3761 resizeGeneric_<HResizeLanczos4<ushort, float, float>,
3762 VResizeLanczos4<ushort, float, float, Cast<float, ushort>,
3763 VResizeLanczos4Vec_32f16u> >,
3764 resizeGeneric_<HResizeLanczos4<short, float, float>,
3765 VResizeLanczos4<short, float, float, Cast<float, short>,
3766 VResizeLanczos4Vec_32f16s> >,
3767 0,
3768 resizeGeneric_<HResizeLanczos4<float, float, float>,
3769 VResizeLanczos4<float, float, float, Cast<float, float>,
3770 VResizeLanczos4Vec_32f> >,
3771 resizeGeneric_<HResizeLanczos4<double, double, float>,
3772 VResizeLanczos4<double, double, float, Cast<double, double>,
3773 VResizeNoVec> >,
3774 0
3775 };
3776
3777 static ResizeAreaFastFunc areafast_tab[] =
3778 {
3779 resizeAreaFast_<uchar, int, ResizeAreaFastVec<uchar, ResizeAreaFastVec_SIMD_8u> >,
3780 0,
3781 resizeAreaFast_<ushort, float, ResizeAreaFastVec<ushort, ResizeAreaFastVec_SIMD_16u> >,
3782 resizeAreaFast_<short, float, ResizeAreaFastVec<short, ResizeAreaFastVec_SIMD_16s> >,
3783 0,
3784 resizeAreaFast_<float, float, ResizeAreaFastVec_SIMD_32f>,
3785 resizeAreaFast_<double, double, ResizeAreaFastNoVec<double, double> >,
3786 0
3787 };
3788
3789 static ResizeAreaFunc area_tab[] =
3790 {
3791 resizeArea_<uchar, float>, 0, resizeArea_<ushort, float>,
3792 resizeArea_<short, float>, 0, resizeArea_<float, float>,
3793 resizeArea_<double, double>, 0
3794 };
3795
3796 static be_resize_func linear_exact_tab[] =
3797 {
3798 resize_bitExact<uchar, interpolationLinear<uchar> >,
3799 resize_bitExact<schar, interpolationLinear<schar> >,
3800 resize_bitExact<ushort, interpolationLinear<ushort> >,
3801 resize_bitExact<short, interpolationLinear<short> >,
3802 resize_bitExact<int, interpolationLinear<int> >,
3803 0,
3804 0,
3805 0
3806 };
3807
3808 double scale_x = 1./inv_scale_x, scale_y = 1./inv_scale_y;
3809
3810 int iscale_x = saturate_cast<int>(scale_x);
3811 int iscale_y = saturate_cast<int>(scale_y);
3812
3813 bool is_area_fast = std::abs(scale_x - iscale_x) < DBL_EPSILON &&
3814 std::abs(scale_y - iscale_y) < DBL_EPSILON;
3815
3816 Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
3817 Mat dst(dsize, src_type, dst_data, dst_step);
3818
3819 if (interpolation == INTER_LINEAR_EXACT)
3820 {
3821 // in case of inv_scale_x && inv_scale_y is equal to 0.5
3822 // INTER_AREA (fast) is equal to bit exact INTER_LINEAR
3823 if (is_area_fast && iscale_x == 2 && iscale_y == 2 && cn != 2)//Area resize implementation for 2-channel images isn't bit-exact
3824 interpolation = INTER_AREA;
3825 else
3826 {
3827 be_resize_func func = linear_exact_tab[depth];
3828 CV_Assert(func != 0);
3829 func(src_data, src_step, src_width, src_height,
3830 dst_data, dst_step, dst_width, dst_height,
3831 cn, inv_scale_x, inv_scale_y);
3832 return;
3833 }
3834 }
3835
3836 if( interpolation == INTER_NEAREST )
3837 {
3838 resizeNN( src, dst, inv_scale_x, inv_scale_y );
3839 return;
3840 }
3841
3842 if( interpolation == INTER_NEAREST_EXACT )
3843 {
3844 resizeNN_bitexact( src, dst, inv_scale_x, inv_scale_y );
3845 return;
3846 }
3847
3848 int k, sx, sy, dx, dy;
3849
3850
3851 {
3852 // in case of scale_x && scale_y is equal to 2
3853 // INTER_AREA (fast) also is equal to INTER_LINEAR
3854 if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
3855 interpolation = INTER_AREA;
3856
3857 // true "area" interpolation is only implemented for the case (scale_x >= 1 && scale_y >= 1).
3858 // In other cases it is emulated using some variant of bilinear interpolation
3859 if( interpolation == INTER_AREA && scale_x >= 1 && scale_y >= 1 )
3860 {
3861 if( is_area_fast )
3862 {
3863 int area = iscale_x*iscale_y;
3864 size_t srcstep = src_step / src.elemSize1();
3865 AutoBuffer<int> _ofs(area + dsize.width*cn);
3866 int* ofs = _ofs.data();
3867 int* xofs = ofs + area;
3868 ResizeAreaFastFunc func = areafast_tab[depth];
3869 CV_Assert( func != 0 );
3870
3871 for( sy = 0, k = 0; sy < iscale_y; sy++ )
3872 for( sx = 0; sx < iscale_x; sx++ )
3873 ofs[k++] = (int)(sy*srcstep + sx*cn);
3874
3875 for( dx = 0; dx < dsize.width; dx++ )
3876 {
3877 int j = dx * cn;
3878 sx = iscale_x * j;
3879 for( k = 0; k < cn; k++ )
3880 xofs[j + k] = sx + k;
3881 }
3882
3883 func( src, dst, ofs, xofs, iscale_x, iscale_y );
3884 return;
3885 }
3886
3887 ResizeAreaFunc func = area_tab[depth];
3888 CV_Assert( func != 0 && cn <= 4 );
3889
3890 AutoBuffer<DecimateAlpha> _xytab((src_width + src_height)*2);
3891 DecimateAlpha* xtab = _xytab.data(), *ytab = xtab + src_width*2;
3892
3893 int xtab_size = computeResizeAreaTab(src_width, dsize.width, cn, scale_x, xtab);
3894 int ytab_size = computeResizeAreaTab(src_height, dsize.height, 1, scale_y, ytab);
3895
3896 AutoBuffer<int> _tabofs(dsize.height + 1);
3897 int* tabofs = _tabofs.data();
3898 for( k = 0, dy = 0; k < ytab_size; k++ )
3899 {
3900 if( k == 0 || ytab[k].di != ytab[k-1].di )
3901 {
3902 assert( ytab[k].di == dy );
3903 tabofs[dy++] = k;
3904 }
3905 }
3906 tabofs[dy] = ytab_size;
3907
3908 func( src, dst, xtab, xtab_size, ytab, ytab_size, tabofs );
3909 return;
3910 }
3911 }
3912
3913 int xmin = 0, xmax = dsize.width, width = dsize.width*cn;
3914 bool area_mode = interpolation == INTER_AREA;
3915 bool fixpt = depth == CV_8U;
3916 float fx, fy;
3917 ResizeFunc func=0;
3918 int ksize=0, ksize2;
3919 if( interpolation == INTER_CUBIC )
3920 ksize = 4, func = cubic_tab[depth];
3921 else if( interpolation == INTER_LANCZOS4 )
3922 ksize = 8, func = lanczos4_tab[depth];
3923 else if( interpolation == INTER_LINEAR || interpolation == INTER_AREA )
3924 ksize = 2, func = linear_tab[depth];
3925 else
3926 CV_Error( CV_StsBadArg, "Unknown interpolation method" );
3927 ksize2 = ksize/2;
3928
3929 CV_Assert( func != 0 );
3930
3931 AutoBuffer<uchar> _buffer((width + dsize.height)*(sizeof(int) + sizeof(float)*ksize));
3932 int* xofs = (int*)_buffer.data();
3933 int* yofs = xofs + width;
3934 float* alpha = (float*)(yofs + dsize.height);
3935 short* ialpha = (short*)alpha;
3936 float* beta = alpha + width*ksize;
3937 short* ibeta = ialpha + width*ksize;
3938 float cbuf[MAX_ESIZE] = {0};
3939
3940 for( dx = 0; dx < dsize.width; dx++ )
3941 {
3942 if( !area_mode )
3943 {
3944 fx = (float)((dx+0.5)*scale_x - 0.5);
3945 sx = cvFloor(fx);
3946 fx -= sx;
3947 }
3948 else
3949 {
3950 sx = cvFloor(dx*scale_x);
3951 fx = (float)((dx+1) - (sx+1)*inv_scale_x);
3952 fx = fx <= 0 ? 0.f : fx - cvFloor(fx);
3953 }
3954
3955 if( sx < ksize2-1 )
3956 {
3957 xmin = dx+1;
3958 if( sx < 0 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))
3959 fx = 0, sx = 0;
3960 }
3961
3962 if( sx + ksize2 >= src_width )
3963 {
3964 xmax = std::min( xmax, dx );
3965 if( sx >= src_width-1 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))
3966 fx = 0, sx = src_width-1;
3967 }
3968
3969 for( k = 0, sx *= cn; k < cn; k++ )
3970 xofs[dx*cn + k] = sx + k;
3971
3972 if( interpolation == INTER_CUBIC )
3973 interpolateCubic( fx, cbuf );
3974 else if( interpolation == INTER_LANCZOS4 )
3975 interpolateLanczos4( fx, cbuf );
3976 else
3977 {
3978 cbuf[0] = 1.f - fx;
3979 cbuf[1] = fx;
3980 }
3981 if( fixpt )
3982 {
3983 for( k = 0; k < ksize; k++ )
3984 ialpha[dx*cn*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE);
3985 for( ; k < cn*ksize; k++ )
3986 ialpha[dx*cn*ksize + k] = ialpha[dx*cn*ksize + k - ksize];
3987 }
3988 else
3989 {
3990 for( k = 0; k < ksize; k++ )
3991 alpha[dx*cn*ksize + k] = cbuf[k];
3992 for( ; k < cn*ksize; k++ )
3993 alpha[dx*cn*ksize + k] = alpha[dx*cn*ksize + k - ksize];
3994 }
3995 }
3996
3997 for( dy = 0; dy < dsize.height; dy++ )
3998 {
3999 if( !area_mode )
4000 {
4001 fy = (float)((dy+0.5)*scale_y - 0.5);
4002 sy = cvFloor(fy);
4003 fy -= sy;
4004 }
4005 else
4006 {
4007 sy = cvFloor(dy*scale_y);
4008 fy = (float)((dy+1) - (sy+1)*inv_scale_y);
4009 fy = fy <= 0 ? 0.f : fy - cvFloor(fy);
4010 }
4011
4012 yofs[dy] = sy;
4013 if( interpolation == INTER_CUBIC )
4014 interpolateCubic( fy, cbuf );
4015 else if( interpolation == INTER_LANCZOS4 )
4016 interpolateLanczos4( fy, cbuf );
4017 else
4018 {
4019 cbuf[0] = 1.f - fy;
4020 cbuf[1] = fy;
4021 }
4022
4023 if( fixpt )
4024 {
4025 for( k = 0; k < ksize; k++ )
4026 ibeta[dy*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE);
4027 }
4028 else
4029 {
4030 for( k = 0; k < ksize; k++ )
4031 beta[dy*ksize + k] = cbuf[k];
4032 }
4033 }
4034
4035 func( src, dst, xofs, fixpt ? (void*)ialpha : (void*)alpha, yofs,
4036 fixpt ? (void*)ibeta : (void*)beta, xmin, xmax, ksize );
4037 }
4038
4039 } // cv::hal::
4040 } // cv::
4041
4042 //==================================================================================================
4043
resize(InputArray _src,OutputArray _dst,Size dsize,double inv_scale_x,double inv_scale_y,int interpolation)4044 void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
4045 double inv_scale_x, double inv_scale_y, int interpolation )
4046 {
4047 CV_INSTRUMENT_REGION();
4048
4049 Size ssize = _src.size();
4050
4051 CV_Assert( !ssize.empty() );
4052 if( dsize.empty() )
4053 {
4054 CV_Assert(inv_scale_x > 0); CV_Assert(inv_scale_y > 0);
4055 dsize = Size(saturate_cast<int>(ssize.width*inv_scale_x),
4056 saturate_cast<int>(ssize.height*inv_scale_y));
4057 CV_Assert( !dsize.empty() );
4058 }
4059 else
4060 {
4061 inv_scale_x = (double)dsize.width/ssize.width;
4062 inv_scale_y = (double)dsize.height/ssize.height;
4063 CV_Assert(inv_scale_x > 0); CV_Assert(inv_scale_y > 0);
4064 }
4065
4066 if (interpolation == INTER_LINEAR_EXACT && (_src.depth() == CV_32F || _src.depth() == CV_64F))
4067 interpolation = INTER_LINEAR; // If depth isn't supported fallback to generic resize
4068
4069 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() && _src.cols() > 10 && _src.rows() > 10,
4070 ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation))
4071
4072 // Fake reference to source. Resolves issue 13577 in case of src == dst.
4073 UMat srcUMat;
4074 if (_src.isUMat())
4075 srcUMat = _src.getUMat();
4076
4077 Mat src = _src.getMat();
4078 _dst.create(dsize, src.type());
4079 Mat dst = _dst.getMat();
4080
4081 if (dsize == ssize)
4082 {
4083 // Source and destination are of same size. Use simple copy.
4084 src.copyTo(dst);
4085 return;
4086 }
4087
4088 hal::resize(src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows, inv_scale_x, inv_scale_y, interpolation);
4089 }
4090
4091
4092 CV_IMPL void
cvResize(const CvArr * srcarr,CvArr * dstarr,int method)4093 cvResize( const CvArr* srcarr, CvArr* dstarr, int method )
4094 {
4095 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
4096 CV_Assert( src.type() == dst.type() );
4097 cv::resize( src, dst, dst.size(), (double)dst.cols/src.cols,
4098 (double)dst.rows/src.rows, method );
4099 }
4100
4101 /* End of file. */
4102