/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, 2017, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* ////////////////////////////////////////////////////////////////////
//
//  Geometrical transforms on images and matrices: rotation, zoom etc.
//
// */

#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"
#include "hal_replacement.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "opencv2/core/utils/buffer_area.private.hpp"

#include "opencv2/core/openvx/ovx_defs.hpp"
#include "resize.hpp"

#include "opencv2/core/softfloat.hpp"
#include "fixedpoint.inl.hpp"

using namespace cv;

namespace
{

template <typename ET, bool needsign> struct fixedtype { typedef fixedpoint64 type; };
template <> struct fixedtype<uint32_t, false> { typedef ufixedpoint64 type; };
template <bool needsign> struct fixedtype<int16_t, needsign> { typedef fixedpoint32 type; };
template <> struct fixedtype<uint16_t, false> { typedef ufixedpoint32 type; };
template <bool needsign> struct fixedtype<int8_t, needsign> { typedef fixedpoint32 type; };
template <> struct fixedtype<uint8_t, false> { typedef ufixedpoint16 type; };

//FT is fixedtype<ET, needsign>::type
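// Horizontal pass: each output row is produced in three zones. Destination
// columns in [0, dst_min) map to source points left of the image and are
// filled with the leftmost source pixel; columns in [dst_min, dst_max) are
// interpolated from n source taps using the per-column coefficients in m;
// columns in [dst_max, dst_width) replicate the rightmost source pixel.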
template <typename ET, typename FT, int n, bool mulall>
static void hlineResize(ET* src, int cn, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
{
    int i = 0;
    for (; i < dst_min; i++, m += n) // Points that fall to the left of the src image become equal to the leftmost src point
    {
        for (int j = 0; j < cn; j++, dst++)
        {
            *dst = src[j];
        }
    }
    for (; i < dst_max; i++, m += n)
    {
        ET* src_ofst = src + cn*ofst[i];
        for (int j = 0; j < cn; j++, dst++)
        {
            *dst = (mulall || !m[0].isZero()) ? m[0] * src_ofst[j] : FT::zero();
            for (int k = 1; k < n; k++)
            {
                *dst = *dst + ((mulall || !m[k].isZero()) ? m[k] * src_ofst[j+k*cn] : FT::zero());
            }
        }
    }
    ET* src_last = src + cn*ofst[dst_width - 1];
    for (; i < dst_width; i++) // Points that fall to the right of the src image become equal to the rightmost src point
    {
        for (int j = 0; j < cn; j++, dst++)
        {
            *dst = src_last[j];
        }
    }
}
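// hline routes the generic loop above through per-channel-count
// specializations; fully unrolled two-tap and four-tap versions for
// 1 to 4 channels follow.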
template <typename ET, typename FT, int n, bool mulall, int cncnt> struct hline
{
    static void ResizeCn(ET* src, int cn, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
    {
        hlineResize<ET, FT, n, mulall>(src, cn, ofst, m, dst, dst_min, dst_max, dst_width);
    }
};
template <typename ET, typename FT> struct hline<ET, FT, 2, true, 1>
{
    static void ResizeCn(ET* src, int, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
    {
        int i = 0;
        FT src0(src[0]);
        for (; i < dst_min; i++, m += 2) // Points that fall to the left of the src image become equal to the leftmost src point
        {
            *(dst++) = src0;
        }
        for (; i < dst_max; i++, m += 2)
        {
            ET* px = src + ofst[i];
            *(dst++) = m[0] * px[0] + m[1] * px[1];
        }
        src0 = (src + ofst[dst_width - 1])[0];
        for (; i < dst_width; i++) // Points that fall to the right of the src image become equal to the rightmost src point
        {
            *(dst++) = src0;
        }
    }
};
template <typename ET, typename FT> struct hline<ET, FT, 2, true, 2>
{
    static void ResizeCn(ET* src, int, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
    {
        int i = 0;
        FT src0(src[0]), src1(src[1]);
        for (; i < dst_min; i++, m += 2) // Points that fall to the left of the src image become equal to the leftmost src point
        {
            *(dst++) = src0;
            *(dst++) = src1;
        }
        for (; i < dst_max; i++, m += 2)
        {
            ET* px = src + 2*ofst[i];
            *(dst++) = m[0] * px[0] + m[1] * px[2];
            *(dst++) = m[0] * px[1] + m[1] * px[3];
        }
        src0 = (src + 2*ofst[dst_width - 1])[0];
        src1 = (src + 2*ofst[dst_width - 1])[1];
        for (; i < dst_width; i++) // Points that fall to the right of the src image become equal to the rightmost src point
        {
            *(dst++) = src0;
            *(dst++) = src1;
        }
    }
};
template <typename ET, typename FT> struct hline<ET, FT, 2, true, 3>
{
    static void ResizeCn(ET* src, int, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
    {
        int i = 0;
        FT src0(src[0]), src1(src[1]), src2(src[2]);
        for (; i < dst_min; i++, m += 2) // Points that fall to the left of the src image become equal to the leftmost src point
        {
            *(dst++) = src0;
            *(dst++) = src1;
            *(dst++) = src2;
        }
        for (; i < dst_max; i++, m += 2)
        {
            ET* px = src + 3*ofst[i];
            *(dst++) = m[0] * px[0] + m[1] * px[3];
            *(dst++) = m[0] * px[1] + m[1] * px[4];
            *(dst++) = m[0] * px[2] + m[1] * px[5];
        }
        src0 = (src + 3*ofst[dst_width - 1])[0];
        src1 = (src + 3*ofst[dst_width - 1])[1];
        src2 = (src + 3*ofst[dst_width - 1])[2];
        for (; i < dst_width; i++) // Points that fall to the right of the src image become equal to the rightmost src point
        {
            *(dst++) = src0;
            *(dst++) = src1;
            *(dst++) = src2;
        }
    }
};
template <typename ET, typename FT> struct hline<ET, FT, 2, true, 4>
{
    static void ResizeCn(ET* src, int, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
    {
        int i = 0;
        FT src0(src[0]), src1(src[1]), src2(src[2]), src3(src[3]);
        for (; i < dst_min; i++, m += 2) // Points that fall to the left of the src image become equal to the leftmost src point
        {
            *(dst++) = src0;
            *(dst++) = src1;
            *(dst++) = src2;
            *(dst++) = src3;
        }
        for (; i < dst_max; i++, m += 2)
        {
            ET* px = src + 4*ofst[i];
            *(dst++) = m[0] * px[0] + m[1] * px[4];
            *(dst++) = m[0] * px[1] + m[1] * px[5];
            *(dst++) = m[0] * px[2] + m[1] * px[6];
            *(dst++) = m[0] * px[3] + m[1] * px[7];
        }
        src0 = (src + 4*ofst[dst_width - 1])[0];
        src1 = (src + 4*ofst[dst_width - 1])[1];
        src2 = (src + 4*ofst[dst_width - 1])[2];
        src3 = (src + 4*ofst[dst_width - 1])[3];
        for (; i < dst_width; i++) // Points that fall to the right of the src image become equal to the rightmost src point
        {
            *(dst++) = src0;
            *(dst++) = src1;
            *(dst++) = src2;
            *(dst++) = src3;
        }
    }
};
template <typename ET, typename FT> struct hline<ET, FT, 4, true, 1>
{
    static void ResizeCn(ET* src, int, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
    {
        int i = 0;
        FT src0(src[0]);
        for (; i < dst_min; i++, m += 4) // Points that fall to the left of the src image become equal to the leftmost src point
        {
            *(dst++) = src0;
        }
        for (; i < dst_max; i++, m += 4)
        {
            ET* px = src + ofst[i];
            *(dst++) = m[0] * px[0] + m[1] * px[1] + m[2] * px[2] + m[3] * px[3];
        }
        src0 = (src + ofst[dst_width - 1])[0];
        for (; i < dst_width; i++) // Points that fall to the right of the src image become equal to the rightmost src point
        {
            *(dst++) = src0;
        }
    }
};
template <typename ET, typename FT> struct hline<ET, FT, 4, true, 2>
{
    static void ResizeCn(ET* src, int, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
    {
        int i = 0;
        FT src0(src[0]), src1(src[1]);
        for (; i < dst_min; i++, m += 4) // Points that fall to the left of the src image become equal to the leftmost src point
        {
            *(dst++) = src0;
            *(dst++) = src1;
        }
        for (; i < dst_max; i++, m += 4)
        {
            ET* px = src + 2*ofst[i];
            *(dst++) = m[0] * px[0] + m[1] * px[2] + m[2] * px[4] + m[3] * px[6];
            *(dst++) = m[0] * px[1] + m[1] * px[3] + m[2] * px[5] + m[3] * px[7];
        }
        src0 = (src + 2*ofst[dst_width - 1])[0];
        src1 = (src + 2*ofst[dst_width - 1])[1];
        for (; i < dst_width; i++) // Points that fall to the right of the src image become equal to the rightmost src point
        {
            *(dst++) = src0;
            *(dst++) = src1;
        }
    }
};
template <typename ET, typename FT> struct hline<ET, FT, 4, true, 3>
{
    static void ResizeCn(ET* src, int, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
    {
        int i = 0;
        FT src0(src[0]), src1(src[1]), src2(src[2]);
        for (; i < dst_min; i++, m += 4) // Points that fall to the left of the src image become equal to the leftmost src point
        {
            *(dst++) = src0;
            *(dst++) = src1;
            *(dst++) = src2;
        }
        for (; i < dst_max; i++, m += 4)
        {
            ET* px = src + 3*ofst[i];
            *(dst++) = m[0] * px[0] + m[1] * px[3] + m[2] * px[6] + m[3] * px[ 9];
            *(dst++) = m[0] * px[1] + m[1] * px[4] + m[2] * px[7] + m[3] * px[10];
            *(dst++) = m[0] * px[2] + m[1] * px[5] + m[2] * px[8] + m[3] * px[11];
        }
        src0 = (src + 3*ofst[dst_width - 1])[0];
        src1 = (src + 3*ofst[dst_width - 1])[1];
        src2 = (src + 3*ofst[dst_width - 1])[2];
        for (; i < dst_width; i++) // Points that fall to the right of the src image become equal to the rightmost src point
        {
            *(dst++) = src0;
            *(dst++) = src1;
            *(dst++) = src2;
        }
    }
};
template <typename ET, typename FT> struct hline<ET, FT, 4, true, 4>
{
    static void ResizeCn(ET* src, int, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
    {
        int i = 0;
        FT src0(src[0]), src1(src[1]), src2(src[2]), src3(src[3]);
        for (; i < dst_min; i++, m += 4) // Points that fall to the left of the src image become equal to the leftmost src point
        {
            *(dst++) = src0;
            *(dst++) = src1;
            *(dst++) = src2;
            *(dst++) = src3;
        }
        for (; i < dst_max; i++, m += 4)
        {
            ET* px = src + 4*ofst[i];
            *(dst++) = m[0] * px[0] + m[1] * px[4] + m[2] * px[ 8] + m[3] * px[12];
            *(dst++) = m[0] * px[1] + m[1] * px[5] + m[2] * px[ 9] + m[3] * px[13];
            *(dst++) = m[0] * px[2] + m[1] * px[6] + m[2] * px[10] + m[3] * px[14];
            *(dst++) = m[0] * px[3] + m[1] * px[7] + m[2] * px[11] + m[3] * px[15];
        }
        src0 = (src + 4*ofst[dst_width - 1])[0];
        src1 = (src + 4*ofst[dst_width - 1])[1];
        src2 = (src + 4*ofst[dst_width - 1])[2];
        src3 = (src + 4*ofst[dst_width - 1])[3];
        for (; i < dst_width; i++) // Points that fall to the right of the src image become equal to the rightmost src point
        {
            *(dst++) = src0;
            *(dst++) = src1;
            *(dst++) = src2;
            *(dst++) = src3;
        }
    }
};
template <typename ET, typename FT, int n, bool mulall, int cncnt>
static void hlineResizeCn(ET* src, int cn, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
{
    hline<ET, FT, n, mulall, cncnt>::ResizeCn(src, cn, ofst, m, dst, dst_min, dst_max, dst_width);
}

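// The specializations below are SIMD (universal intrinsics) versions of the
// two-tap horizontal pass. For 8-bit input the two ufixedpoint16 coefficients
// of each output sample are applied with one 16x16->32 dot product; for
// 16-bit input the ufixedpoint32 products are accumulated in 64 bits.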
template <>
void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 1>(uint8_t* src, int, int *ofst, ufixedpoint16* m, ufixedpoint16* dst, int dst_min, int dst_max, int dst_width)
{
    int i = 0;
    ufixedpoint16 src_0(src[0]);
#if CV_SIMD
    const int VECSZ = v_uint16::nlanes;
    v_uint16 v_src_0 = vx_setall_u16(*((uint16_t*)&src_0));
    for (; i <= dst_min - VECSZ; i += VECSZ, m += 2*VECSZ, dst += VECSZ) // Points that fall to the left of the src image become equal to the leftmost src point
    {
        v_store((uint16_t*)dst, v_src_0);
    }
#endif
    for (; i < dst_min; i++, m += 2)
    {
        *(dst++) = src_0;
    }
#if CV_SIMD
    for (; i <= dst_max - 2*VECSZ; i += 2*VECSZ, m += 4*VECSZ, dst += 2*VECSZ)
    {
        v_uint16 v_src0, v_src1;
        v_expand(vx_lut_pairs(src, ofst + i), v_src0, v_src1);
        v_store((uint16_t*)dst      , v_pack(v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), vx_load((int16_t*)m))),
                                             v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), vx_load((int16_t*)m + VECSZ)))));
        v_expand(vx_lut_pairs(src, ofst + i + VECSZ), v_src0, v_src1);
        v_store((uint16_t*)dst+VECSZ, v_pack(v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), vx_load((int16_t*)m + 2*VECSZ))),
                                             v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), vx_load((int16_t*)m + 3*VECSZ)))));
    }
    if (i <= dst_max - VECSZ)
    {
        v_uint16 v_src0, v_src1;
        v_expand(vx_lut_pairs(src, ofst + i), v_src0, v_src1);
        v_store((uint16_t*)dst, v_pack(v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), vx_load((int16_t*)m))),
                                       v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), vx_load((int16_t*)m + VECSZ)))));
        i += VECSZ; m += 2*VECSZ; dst += VECSZ;
    }
#endif
    for (; i < dst_max; i += 1, m += 2)
    {
        uint8_t* px = src + ofst[i];
        *(dst++) = m[0] * px[0] + m[1] * px[1];
    }
    src_0 = (src + ofst[dst_width - 1])[0];
#if CV_SIMD
    v_src_0 = vx_setall_u16(*((uint16_t*)&src_0));
    for (; i <= dst_width - VECSZ; i += VECSZ, dst += VECSZ) // Points that fall to the right of the src image become equal to the rightmost src point
    {
        v_store((uint16_t*)dst, v_src_0);
    }
#endif
    for (; i < dst_width; i++)
    {
        *(dst++) = src_0;
    }
}
template <>
void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 2>(uint8_t* src, int, int *ofst, ufixedpoint16* m, ufixedpoint16* dst, int dst_min, int dst_max, int dst_width)
{
    int i = 0;
    union {
        uint32_t d;
        uint16_t w[2];
    } srccn;
    ((ufixedpoint16*)(srccn.w))[0] = src[0];
    ((ufixedpoint16*)(srccn.w))[1] = src[1];
#if CV_SIMD
    const int VECSZ = v_uint16::nlanes;
    v_uint16 v_srccn = v_reinterpret_as_u16(vx_setall_u32(srccn.d));
    for (; i <= dst_min - VECSZ/2; i += VECSZ/2, m += VECSZ, dst += VECSZ) // Points that fall to the left of the src image become equal to the leftmost src point
    {
        v_store((uint16_t*)dst, v_srccn);
    }
#endif
    for (; i < dst_min; i++, m += 2)
    {
        *(dst++) = ((ufixedpoint16*)(srccn.w))[0];
        *(dst++) = ((ufixedpoint16*)(srccn.w))[1];
    }
#if CV_SIMD
    for (; i <= dst_max - VECSZ/2; i += VECSZ/2, m += VECSZ, dst += VECSZ)
    {
        v_uint16 v_src0, v_src1;
        v_expand(v_interleave_pairs(v_reinterpret_as_u8(vx_lut_pairs((uint16_t*)src, ofst + i))), v_src0, v_src1);

        v_uint32 v_mul = vx_load((uint32_t*)m);//AaBbCcDd
        v_uint32 v_zip0, v_zip1;
        v_zip(v_mul, v_mul, v_zip0, v_zip1);//AaAaBbBb CcCcDdDd
        v_uint32 v_res0 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), v_reinterpret_as_s16(v_zip0)));
        v_uint32 v_res1 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), v_reinterpret_as_s16(v_zip1)));
        v_store((uint16_t*)dst, v_pack(v_res0, v_res1));//AB1AB2CD1CD2
    }
#endif
    for (; i < dst_max; i += 1, m += 2)
    {
        uint8_t* px = src + 2 * ofst[i];
        *(dst++) = m[0] * px[0] + m[1] * px[2];
        *(dst++) = m[0] * px[1] + m[1] * px[3];
    }
    ((ufixedpoint16*)(srccn.w))[0] = (src + 2 * ofst[dst_width - 1])[0]; ((ufixedpoint16*)(srccn.w))[1] = (src + 2 * ofst[dst_width - 1])[1];
#if CV_SIMD
    v_srccn = v_reinterpret_as_u16(vx_setall_u32(srccn.d));
    for (; i <= dst_width - VECSZ/2; i += VECSZ/2, dst += VECSZ) // Points that fall to the right of the src image become equal to the rightmost src point
    {
        v_store((uint16_t*)dst, v_srccn);
    }
#endif
    for (; i < dst_width; i++)
    {
        *(dst++) = ((ufixedpoint16*)(srccn.w))[0];
        *(dst++) = ((ufixedpoint16*)(srccn.w))[1];
    }
}
template <>
void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 3>(uint8_t* src, int, int *ofst, ufixedpoint16* m, ufixedpoint16* dst, int dst_min, int dst_max, int dst_width)
{
    int i = 0;
    union {
        uint64_t q;
        uint16_t w[4];
    } srccn;
    ((ufixedpoint16*)(srccn.w))[0] = src[0];
    ((ufixedpoint16*)(srccn.w))[1] = src[1];
    ((ufixedpoint16*)(srccn.w))[2] = src[2];
    ((ufixedpoint16*)(srccn.w))[3] = 0;
#if CV_SIMD
    const int VECSZ = v_uint16::nlanes;
    v_uint16 v_srccn = v_pack_triplets(v_reinterpret_as_u16(vx_setall_u64(srccn.q)));
    for (; i <= dst_min - (VECSZ+2)/3; i += VECSZ/4, m += VECSZ/2, dst += 3*VECSZ/4) // Points that fall to the left of the src image become equal to the leftmost src point
    {
        v_store((uint16_t*)dst, v_srccn);
    }
#endif
    for (; i < dst_min; i++, m += 2)
    {
        *(dst++) = ((ufixedpoint16*)(srccn.w))[0];
        *(dst++) = ((ufixedpoint16*)(srccn.w))[1];
        *(dst++) = ((ufixedpoint16*)(srccn.w))[2];
    }
#if CV_SIMD
    CV_DECL_ALIGNED(CV_SIMD_WIDTH) int ofst3[VECSZ/2];
    for (; i <= dst_max - (3*VECSZ/4 + (VECSZ+2)/3); i += VECSZ/2, m += VECSZ, dst += 3*VECSZ/2)
    {
        v_store(ofst3, vx_load(ofst + i) * vx_setall_s32(3));
        v_uint8 v_src01, v_src23;
        v_uint16 v_src0, v_src1, v_src2, v_src3;
        v_zip(vx_lut_quads(src, ofst3), v_reinterpret_as_u8(v_reinterpret_as_u32(vx_lut_quads(src+2, ofst3)) >> 8), v_src01, v_src23);
        v_expand(v_src01, v_src0, v_src1);
        v_expand(v_src23, v_src2, v_src3);

        v_uint32 v_mul0, v_mul1, v_mul2, v_mul3, v_tmp;
        v_mul0 = vx_load((uint32_t*)m);//AaBbCcDd
        v_zip(v_mul0, v_mul0, v_mul3, v_tmp );//AaAaBbBb CcCcDdDd
        v_zip(v_mul3, v_mul3, v_mul0, v_mul1);//AaAaAaAa BbBbBbBb
        v_zip(v_tmp , v_tmp , v_mul2, v_mul3);//CcCcCcCc DdDdDdDd

        v_uint32 v_res0 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), v_reinterpret_as_s16(v_mul0)));
        v_uint32 v_res1 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), v_reinterpret_as_s16(v_mul1)));
        v_uint32 v_res2 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src2), v_reinterpret_as_s16(v_mul2)));
        v_uint32 v_res3 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src3), v_reinterpret_as_s16(v_mul3)));
        v_store((uint16_t*)dst            , v_pack_triplets(v_pack(v_res0, v_res1)));
        v_store((uint16_t*)dst + 3*VECSZ/4, v_pack_triplets(v_pack(v_res2, v_res3)));
    }
#endif
    for (; i < dst_max; i += 1, m += 2)
    {
        uint8_t* px = src + 3 * ofst[i];
        *(dst++) = m[0] * px[0] + m[1] * px[3];
        *(dst++) = m[0] * px[1] + m[1] * px[4];
        *(dst++) = m[0] * px[2] + m[1] * px[5];
    }
    ((ufixedpoint16*)(srccn.w))[0] = (src + 3*ofst[dst_width - 1])[0];
    ((ufixedpoint16*)(srccn.w))[1] = (src + 3*ofst[dst_width - 1])[1];
    ((ufixedpoint16*)(srccn.w))[2] = (src + 3*ofst[dst_width - 1])[2];
#if CV_SIMD
    v_srccn = v_pack_triplets(v_reinterpret_as_u16(vx_setall_u64(srccn.q)));
    for (; i <= dst_width - (VECSZ+2)/3; i += VECSZ/4, dst += 3*VECSZ/4) // Points that fall to the right of the src image become equal to the rightmost src point
    {
        v_store((uint16_t*)dst, v_srccn);
    }
#endif
    for (; i < dst_width; i++)
    {
        *(dst++) = ((ufixedpoint16*)(srccn.w))[0];
        *(dst++) = ((ufixedpoint16*)(srccn.w))[1];
        *(dst++) = ((ufixedpoint16*)(srccn.w))[2];
    }
}
template <>
void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 4>(uint8_t* src, int, int *ofst, ufixedpoint16* m, ufixedpoint16* dst, int dst_min, int dst_max, int dst_width)
{
    int i = 0;
    union {
        uint64_t q;
        uint16_t w[4];
    } srccn;
    ((ufixedpoint16*)(srccn.w))[0] = src[0];
    ((ufixedpoint16*)(srccn.w))[1] = src[1];
    ((ufixedpoint16*)(srccn.w))[2] = src[2];
    ((ufixedpoint16*)(srccn.w))[3] = src[3];
#if CV_SIMD
    const int VECSZ = v_uint16::nlanes;
    v_uint16 v_srccn = v_reinterpret_as_u16(vx_setall_u64(srccn.q));
    for (; i <= dst_min - VECSZ/4; i += VECSZ/4, m += VECSZ/2, dst += VECSZ) // Points that fall to the left of the src image become equal to the leftmost src point
    {
        v_store((uint16_t*)dst, v_srccn);
    }
#endif
    for (; i < dst_min; i++, m += 2)
    {
        *(dst++) = ((ufixedpoint16*)(srccn.w))[0];
        *(dst++) = ((ufixedpoint16*)(srccn.w))[1];
        *(dst++) = ((ufixedpoint16*)(srccn.w))[2];
        *(dst++) = ((ufixedpoint16*)(srccn.w))[3];
    }
#if CV_SIMD
    for (; i <= dst_max - VECSZ/2; i += VECSZ/2, m += VECSZ, dst += 2*VECSZ)
    {
        v_uint16 v_src0, v_src1, v_src2, v_src3;
        v_expand(v_interleave_quads(v_reinterpret_as_u8(vx_lut_pairs((uint32_t*)src, ofst + i))), v_src0, v_src1);
        v_expand(v_interleave_quads(v_reinterpret_as_u8(vx_lut_pairs((uint32_t*)src, ofst + i + VECSZ/4))), v_src2, v_src3);

        v_uint32 v_mul0, v_mul1, v_mul2, v_mul3, v_tmp;
        v_mul0 = vx_load((uint32_t*)m);//AaBbCcDd
        v_zip(v_mul0, v_mul0, v_mul3, v_tmp );//AaAaBbBb CcCcDdDd
        v_zip(v_mul3, v_mul3, v_mul0, v_mul1);//AaAaAaAa BbBbBbBb
        v_zip(v_tmp , v_tmp , v_mul2, v_mul3);//CcCcCcCc DdDdDdDd

        v_uint32 v_res0 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), v_reinterpret_as_s16(v_mul0)));
        v_uint32 v_res1 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), v_reinterpret_as_s16(v_mul1)));
        v_uint32 v_res2 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src2), v_reinterpret_as_s16(v_mul2)));
        v_uint32 v_res3 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src3), v_reinterpret_as_s16(v_mul3)));
        v_store((uint16_t*)dst        , v_pack(v_res0, v_res1));
        v_store((uint16_t*)dst + VECSZ, v_pack(v_res2, v_res3));
    }
#endif
    for (; i < dst_max; i += 1, m += 2)
    {
        uint8_t* px = src + 4 * ofst[i];
        *(dst++) = m[0] * px[0] + m[1] * px[4];
        *(dst++) = m[0] * px[1] + m[1] * px[5];
        *(dst++) = m[0] * px[2] + m[1] * px[6];
        *(dst++) = m[0] * px[3] + m[1] * px[7];
    }
    ((ufixedpoint16*)(srccn.w))[0] = (src + 4 * ofst[dst_width - 1])[0]; ((ufixedpoint16*)(srccn.w))[1] = (src + 4 * ofst[dst_width - 1])[1];
    ((ufixedpoint16*)(srccn.w))[2] = (src + 4 * ofst[dst_width - 1])[2]; ((ufixedpoint16*)(srccn.w))[3] = (src + 4 * ofst[dst_width - 1])[3];
#if CV_SIMD
    v_srccn = v_reinterpret_as_u16(vx_setall_u64(srccn.q));
    for (; i <= dst_width - VECSZ/4; i += VECSZ/4, dst += VECSZ) // Points that fall to the right of the src image become equal to the rightmost src point
    {
        v_store((uint16_t*)dst, v_srccn);
    }
#endif
    for (; i < dst_width; i++)
    {
        *(dst++) = ((ufixedpoint16*)(srccn.w))[0];
        *(dst++) = ((ufixedpoint16*)(srccn.w))[1];
        *(dst++) = ((ufixedpoint16*)(srccn.w))[2];
        *(dst++) = ((ufixedpoint16*)(srccn.w))[3];
    }
}
template <>
void hlineResizeCn<uint16_t, ufixedpoint32, 2, true, 1>(uint16_t* src, int, int *ofst, ufixedpoint32* m, ufixedpoint32* dst, int dst_min, int dst_max, int dst_width)
{
    int i = 0;
    ufixedpoint32 src_0(src[0]);
#if CV_SIMD
    const int VECSZ = v_uint32::nlanes;
    v_uint32 v_src_0 = vx_setall_u32(*((uint32_t*)&src_0));
    for (; i <= dst_min - VECSZ; i += VECSZ, m += 2*VECSZ, dst += VECSZ) // Points that fall to the left of the src image become equal to the leftmost src point
    {
        v_store((uint32_t*)dst, v_src_0);
    }
#endif
    for (; i < dst_min; i++, m += 2)
    {
        *(dst++) = src_0;
    }
#if CV_SIMD
    for (; i <= dst_max - VECSZ; i += VECSZ, m += 2*VECSZ, dst += VECSZ)
    {
        v_uint32 v_src0, v_src1;
        v_expand(vx_lut_pairs(src, ofst + i), v_src0, v_src1);

        v_uint64 v_res0 = v_reinterpret_as_u64(v_src0 * vx_load((uint32_t*)m));
        v_uint64 v_res1 = v_reinterpret_as_u64(v_src1 * vx_load((uint32_t*)m + VECSZ));
        v_store((uint32_t*)dst, v_pack((v_res0 & vx_setall_u64(0xFFFFFFFF)) + (v_res0 >> 32),
                                       (v_res1 & vx_setall_u64(0xFFFFFFFF)) + (v_res1 >> 32)));
    }
#endif
    for (; i < dst_max; i += 1, m += 2)
    {
        uint16_t* px = src + ofst[i];
        *(dst++) = m[0] * px[0] + m[1] * px[1];
    }
    src_0 = (src + ofst[dst_width - 1])[0];
#if CV_SIMD
    v_src_0 = vx_setall_u32(*((uint32_t*)&src_0));
    for (; i <= dst_width - VECSZ; i += VECSZ, dst += VECSZ)
    {
        v_store((uint32_t*)dst, v_src_0);
    }
#endif
    for (; i < dst_width; i++)
    {
        *(dst++) = src_0;
    }
}

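// Vertical pass: vlineSet copies/rounds a single prepared row to the
// destination (used where a border source row is replicated), while
// vlineResize blends n prepared rows using the per-row coefficients in m.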
template <typename ET, typename FT>
void vlineSet(FT* src, ET* dst, int dst_width)
{
    for (int i = 0; i < dst_width; i++)
        dst[i] = src[i];
}
template <>
void vlineSet<uint8_t, ufixedpoint16>(ufixedpoint16* src, uint8_t* dst, int dst_width)
{
    int i = 0;
#if CV_SIMD
    const int VECSZ = v_uint8::nlanes;
    static const v_uint16 v_fixedRound = vx_setall_u16((uint16_t)((1U << 8) >> 1));
    for (; i <= dst_width - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
    {
        v_uint16 v_src0 = vx_load((uint16_t*)src);
        v_uint16 v_src1 = vx_load((uint16_t*)src + VECSZ/2);

        v_uint16 v_res0 = (v_src0 + v_fixedRound) >> 8;
        v_uint16 v_res1 = (v_src1 + v_fixedRound) >> 8;

        v_store(dst, v_pack(v_res0, v_res1));
    }
#endif
    for (; i < dst_width; i++)
        *(dst++) = *(src++);
}

template <typename ET, typename FT, int n>
void vlineResize(FT* src, size_t src_step, FT* m, ET* dst, int dst_width)
{
    for (int i = 0; i < dst_width; i++)
    {
        typename FT::WT res = src[i] * m[0];
        for (int k = 1; k < n; k++)
            res = res + src[i + k*src_step] * m[k];
        dst[i] = res;
    }
}
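// In the 8-bit specialization below the unsigned fixed-point row samples are
// biased by 0x8000 (v_add_wrap with v_128) so they can go through the signed
// v_dotprod. Because the two vertical coefficients sum to one, the bias
// contributes a constant -128 after the rounding shift; together with the
// closing v_sub_wrap of another 128 the value is off by exactly 256, which
// vanishes when the bytes are reinterpreted as unsigned.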
template <>
void vlineResize<uint8_t, ufixedpoint16, 2>(ufixedpoint16* src, size_t src_step, ufixedpoint16* m, uint8_t* dst, int dst_width)
{
    int i = 0;
    ufixedpoint16* src1 = src + src_step;
#if CV_SIMD
    const int VECSZ = v_uint8::nlanes;
    static const v_int32 v_fixedRound = vx_setall_s32((int32_t)((1 << 16) >> 1));
    static const v_int16 v_128    = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1<<15));
    static const v_int8  v_128_16 = v_reinterpret_as_s8 (vx_setall_u8 ((uint8_t) 1<<7));

    v_int16 v_mul = v_reinterpret_as_s16(vx_setall_u32(((uint32_t*)m)[0]));
    for (; i <= dst_width - VECSZ; i += VECSZ, src += VECSZ, src1 += VECSZ, dst += VECSZ)
    {
        v_int16 v_src00 = vx_load((int16_t*)src);
        v_int16 v_src10 = vx_load((int16_t*)src1);
        v_int16 v_tmp0, v_tmp1;
        v_zip(v_add_wrap(v_src00,v_128), v_add_wrap(v_src10,v_128), v_tmp0, v_tmp1);

        v_int32 v_res0 = v_dotprod(v_tmp0, v_mul);
        v_int32 v_res1 = v_dotprod(v_tmp1, v_mul);

        v_int16 v_src01 = vx_load((int16_t*)src + VECSZ/2);
        v_int16 v_src11 = vx_load((int16_t*)src1 + VECSZ/2);
        v_zip(v_add_wrap(v_src01,v_128), v_add_wrap(v_src11,v_128), v_tmp0, v_tmp1);
        v_int32 v_res2 = v_dotprod(v_tmp0, v_mul);
        v_int32 v_res3 = v_dotprod(v_tmp1, v_mul);

        v_int8 v_res = v_pack(v_pack((v_res0 + v_fixedRound) >> 16,
                                     (v_res1 + v_fixedRound) >> 16),
                              v_pack((v_res2 + v_fixedRound) >> 16,
                                     (v_res3 + v_fixedRound) >> 16));

        v_store(dst, v_reinterpret_as_u8(v_sub_wrap(v_res, v_128_16)));
    }
#endif
    for (; i < dst_width; i++)
    {
        *(dst++) = (uint8_t)(*(src++) * m[0] + *(src1++) * m[1]);
    }
}

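// interpolationLinear maps a destination index to a source coordinate using
// the pixel-center convention src_x = (dst_x + 0.5)/inv_scale - 0.5, computed
// in softdouble for platform-independent, bit-exact results. floor(src_x)
// selects the left tap and the fractional part becomes the right coefficient;
// destination columns whose taps fall outside the image only shrink the
// [minofst, maxofst) interpolation zone, later queried through getMinMax.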
template <typename ET> class interpolationLinear
{
public:
    static const int len = 2;
    static const bool needsign = false;
    interpolationLinear(double inv_scale, int srcsize, int dstsize) : scale(softdouble::one() / softdouble(inv_scale)), maxsize(srcsize), minofst(0), maxofst(dstsize) {}
    void getCoeffs(int val, int* offset, typename fixedtype<ET, needsign>::type* coeffs)
    {
        typedef typename fixedtype<ET, needsign>::type fixedpoint;
        softdouble fval = scale*(softdouble(val)+softdouble(0.5))-softdouble(0.5);
        int ival = cvFloor(fval);
        if (ival >= 0 && maxsize > 1)
        {
            if (ival < maxsize - 1)
            {
                *offset = ival;
                coeffs[1] = fval - softdouble(ival);
                coeffs[0] = fixedpoint::one() - coeffs[1];
            }
            else
            {
                *offset = maxsize - 1;
                maxofst = min(maxofst, val);
            }
        }
        else
        {
            minofst = max(minofst, val + 1);
        }
    }
    void getMinMax(int &min, int &max)
    {
        min = minofst;
        max = maxofst;
    }
protected:
    softdouble scale;
    int maxsize;
    int minofst, maxofst;
};

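// resize_bitExactInvoker processes a horizontal stripe of destination rows.
// linebuf keeps the last interp_y_len horizontally resized source rows as a
// ring buffer (evalbuf_start is the ring's logical origin), so each source
// row is resized at most once even when consecutive destination rows share
// source rows; destination rows above min_y or below max_y are written with
// vlineSet from a single replicated border row.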
template <typename ET, typename FT, int interp_y_len>
class resize_bitExactInvoker :
    public ParallelLoopBody
{
public:
    typedef FT fixedpoint;
    typedef void(*hResizeFunc)(ET* src, int cn, int *ofst, fixedpoint* m, fixedpoint* dst, int dst_min, int dst_max, int dst_width);
    resize_bitExactInvoker(const uchar* _src, size_t _src_step, int _src_width, int _src_height,
                           uchar* _dst, size_t _dst_step, int _dst_width, int _dst_height,
                           int _cn, int *_xoffsets, int *_yoffsets, fixedpoint *_xcoeffs, fixedpoint *_ycoeffs,
                           int _min_x, int _max_x, int _min_y, int _max_y, hResizeFunc _hResize) : ParallelLoopBody(),
                           src(_src), src_step(_src_step), src_width(_src_width), src_height(_src_height),
                           dst(_dst), dst_step(_dst_step), dst_width(_dst_width), dst_height(_dst_height),
                           cn(_cn), xoffsets(_xoffsets), yoffsets(_yoffsets), xcoeffs(_xcoeffs), ycoeffs(_ycoeffs),
                           min_x(_min_x), max_x(_max_x), min_y(_min_y), max_y(_max_y), hResize(_hResize) {}

    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        AutoBuffer<fixedpoint> linebuf(interp_y_len * dst_width * cn);
        int last_eval = - interp_y_len;
        int evalbuf_start = 0;
        int rmin_y = max(min_y, range.start);
        int rmax_y = min(max_y, range.end);
        if (range.start < min_y)
        {
            last_eval = 1 - interp_y_len;
            evalbuf_start = 1;
            hResize((ET*)src, cn, xoffsets, xcoeffs, linebuf.data(), min_x, max_x, dst_width);
        }
        int dy = range.start;
        for (; dy < rmin_y; dy++)
            vlineSet<ET, FT>(linebuf.data(), (ET*)(dst + dst_step * dy), dst_width*cn);
        for (; dy < rmax_y; dy++)
        {
            int &iy = yoffsets[dy];

            int i;
            for (i = max(iy, last_eval + interp_y_len); i < min(iy + interp_y_len, src_height); i++, evalbuf_start = (evalbuf_start + 1) % interp_y_len)
                hResize((ET*)(src + i * src_step), cn, xoffsets, xcoeffs, linebuf.data() + evalbuf_start*(dst_width * cn), min_x, max_x, dst_width);
            evalbuf_start = (evalbuf_start + max(iy, src_height - interp_y_len) - max(last_eval, src_height - interp_y_len)) % interp_y_len;
            last_eval = iy;

            fixedpoint curcoeffs[interp_y_len];
            for (i = 0; i < evalbuf_start; i++)
                curcoeffs[i] = ycoeffs[ dy*interp_y_len - evalbuf_start + interp_y_len + i];
            for (; i < interp_y_len; i++)
                curcoeffs[i] = ycoeffs[ dy*interp_y_len - evalbuf_start + i];

            vlineResize<ET, FT, interp_y_len>(linebuf.data(), dst_width*cn, curcoeffs, (ET*)(dst + dst_step * dy), dst_width*cn);
        }
        fixedpoint *endline = linebuf.data();
        if (last_eval + interp_y_len > src_height)
            endline += dst_width*cn*((evalbuf_start + src_height - 1 - last_eval) % interp_y_len);
        else
            hResize((ET*)(src + (src_height - 1) * src_step), cn, xoffsets, xcoeffs, endline, min_x, max_x, dst_width);
        for (; dy < range.end; dy++)
            vlineSet<ET, FT>(endline, (ET*)(dst + dst_step * dy), dst_width*cn);
#if CV_SIMD
        vx_cleanup();
#endif
    }

private:
    const uchar* src;
    size_t src_step;
    int src_width, src_height;
    uchar* dst;
    size_t dst_step;
    int dst_width, dst_height, cn;
    int *xoffsets, *yoffsets;
    fixedpoint *xcoeffs, *ycoeffs;
    int min_x, max_x, min_y, max_y;
    hResizeFunc hResize;

    resize_bitExactInvoker(const resize_bitExactInvoker&);
    resize_bitExactInvoker& operator=(const resize_bitExactInvoker&);
};
template <typename ET, typename interpolation>
void resize_bitExact(const uchar* src, size_t src_step, int src_width, int src_height,
                           uchar* dst, size_t dst_step, int dst_width, int dst_height,
                     int cn, double inv_scale_x, double inv_scale_y)
{
    typedef typename fixedtype<ET, interpolation::needsign>::type fixedpoint;
    void(*hResize)(ET* src, int cn, int *ofst, fixedpoint* m, fixedpoint* dst, int dst_min, int dst_max, int dst_width);
    switch (cn)
    {
    case  1: hResize = src_width > interpolation::len ? hlineResizeCn<ET, fixedpoint, interpolation::len, true, 1> : hlineResizeCn<ET, fixedpoint, interpolation::len, false, 1>; break;
    case  2: hResize = src_width > interpolation::len ? hlineResizeCn<ET, fixedpoint, interpolation::len, true, 2> : hlineResizeCn<ET, fixedpoint, interpolation::len, false, 2>; break;
    case  3: hResize = src_width > interpolation::len ? hlineResizeCn<ET, fixedpoint, interpolation::len, true, 3> : hlineResizeCn<ET, fixedpoint, interpolation::len, false, 3>; break;
    case  4: hResize = src_width > interpolation::len ? hlineResizeCn<ET, fixedpoint, interpolation::len, true, 4> : hlineResizeCn<ET, fixedpoint, interpolation::len, false, 4>; break;
    default: hResize = src_width > interpolation::len ? hlineResize<ET, fixedpoint, interpolation::len, true>      : hlineResize<ET, fixedpoint, interpolation::len, false>     ; break;
    }

    interpolation interp_x(inv_scale_x, src_width, dst_width);
    interpolation interp_y(inv_scale_y, src_height, dst_height);

    AutoBuffer<uchar> buf( dst_width * sizeof(int) +
                           dst_height * sizeof(int) +
                           dst_width * interp_x.len*sizeof(fixedpoint) +
                           dst_height * interp_y.len * sizeof(fixedpoint) );
    int* xoffsets = (int*)buf.data();
    int* yoffsets = xoffsets + dst_width;
    fixedpoint* xcoeffs = (fixedpoint*)(yoffsets + dst_height);
    fixedpoint* ycoeffs = xcoeffs + dst_width * interp_x.len;

    int min_x, max_x, min_y, max_y;
    for (int dx = 0; dx < dst_width; dx++)
        interp_x.getCoeffs(dx, xoffsets+dx, xcoeffs+dx*interp_x.len);
    interp_x.getMinMax(min_x, max_x);
    for (int dy = 0; dy < dst_height; dy++)
        interp_y.getCoeffs(dy, yoffsets+dy, ycoeffs+dy*interp_y.len);
    interp_y.getMinMax(min_y, max_y);

    resize_bitExactInvoker<ET, fixedpoint, interpolation::len> invoker(src, src_step, src_width, src_height, dst, dst_step, dst_width, dst_height, cn,
                                                                       xoffsets, yoffsets, xcoeffs, ycoeffs, min_x, max_x, min_y, max_y, hResize);
    Range range(0, dst_height);
    parallel_for_(range, invoker, dst_width * dst_height / (double)(1 << 16));
}

typedef void(*be_resize_func)(const uchar* src, size_t src_step, int src_width, int src_height,
                                    uchar* dst, size_t dst_step, int dst_width, int dst_height,
                              int cn, double inv_scale_x, double inv_scale_y);

}

namespace cv
{

/************** interpolation formulas and tables ***************/

const int INTER_RESIZE_COEF_BITS=11;
const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;

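// Cubic convolution (Keys-type) kernel with A = -0.75: the four weights for a
// subpixel phase x in [0, 1) are the kernel evaluated at x+1, x, 1-x and 2-x,
// with the last weight derived from the partition-of-unity property so that
// the row of coefficients always sums to exactly 1.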
static inline void interpolateCubic( float x, float* coeffs )
{
    const float A = -0.75f;

    coeffs[0] = ((A*(x + 1) - 5*A)*(x + 1) + 8*A)*(x + 1) - 4*A;
    coeffs[1] = ((A + 2)*x - (A + 3))*x*x + 1;
    coeffs[2] = ((A + 2)*(1 - x) - (A + 3))*(1 - x)*(1 - x) + 1;
    coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
}

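// Lanczos (a = 4) interpolation over 8 taps. Rather than evaluating sin() per
// tap, the identity sin(y0 + i*pi/4) = sin(y0)cos(i*pi/4) + cos(y0)sin(i*pi/4)
// is used: cs[i] holds (-1)^i * (cos(i*pi/4), sin(i*pi/4)), so a single
// sin/cos pair per call yields all eight unnormalized weights, which are then
// normalized to sum to 1.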
static inline void interpolateLanczos4( float x, float* coeffs )
{
    static const double s45 = 0.70710678118654752440084436210485;
    static const double cs[][2]=
    {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}};

    float sum = 0;
    double y0=-(x+3)*CV_PI*0.25, s0 = std::sin(y0), c0= std::cos(y0);
    for(int i = 0; i < 8; i++ )
    {
        float y0_ = (x+3-i);
        if (fabs(y0_) >= 1e-6f)
        {
            double y = -y0_*CV_PI*0.25;
            coeffs[i] = (float)((cs[i][0]*s0 + cs[i][1]*c0)/(y*y));
        }
        else
        {
            // special handling for 'x' values:
            // - ~0.0: 0 0 0 1 0 0 0 0
            // - ~1.0: 0 0 0 0 1 0 0 0
            coeffs[i] = 1e30f;
        }
        sum += coeffs[i];
    }

    sum = 1.f/sum;
    for(int i = 0; i < 8; i++ )
        coeffs[i] *= sum;
}

template<typename ST, typename DT> struct Cast
{
    typedef ST type1;
    typedef DT rtype;

    DT operator()(ST val) const { return saturate_cast<DT>(val); }
};

template<typename ST, typename DT, int bits> struct FixedPtCast
{
    typedef ST type1;
    typedef DT rtype;
    enum { SHIFT = bits, DELTA = 1 << (bits-1) };

    DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
};

/****************************************************************************************\
*                                         Resize                                         *
\****************************************************************************************/

class resizeNNInvoker :
    public ParallelLoopBody
{
public:
    resizeNNInvoker(const Mat& _src, Mat &_dst, int *_x_ofs, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs),
        ify(_ify)
    {
    }

    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        Size ssize = src.size(), dsize = dst.size();
        int y, x, pix_size = (int)src.elemSize();

        for( y = range.start; y < range.end; y++ )
        {
            uchar* D = dst.data + dst.step*y;
            int sy = std::min(cvFloor(y*ify), ssize.height-1);
            const uchar* S = src.ptr(sy);

            switch( pix_size )
            {
            case 1:
                for( x = 0; x <= dsize.width - 2; x += 2 )
                {
                    uchar t0 = S[x_ofs[x]];
                    uchar t1 = S[x_ofs[x+1]];
                    D[x] = t0;
                    D[x+1] = t1;
                }

                for( ; x < dsize.width; x++ )
                    D[x] = S[x_ofs[x]];
                break;
            case 2:
                for( x = 0; x < dsize.width; x++ )
                    *(ushort*)(D + x*2) = *(ushort*)(S + x_ofs[x]);
                break;
            case 3:
                for( x = 0; x < dsize.width; x++, D += 3 )
                {
                    const uchar* _tS = S + x_ofs[x];
                    D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2];
                }
                break;
            case 4:
                for( x = 0; x < dsize.width; x++ )
                    *(int*)(D + x*4) = *(int*)(S + x_ofs[x]);
                break;
            case 6:
                for( x = 0; x < dsize.width; x++, D += 6 )
                {
                    const ushort* _tS = (const ushort*)(S + x_ofs[x]);
                    ushort* _tD = (ushort*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                }
                break;
            case 8:
                for( x = 0; x < dsize.width; x++, D += 8 )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1];
                }
                break;
            case 12:
                for( x = 0; x < dsize.width; x++, D += 12 )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                }
                break;
            default:
                for( x = 0; x < dsize.width; x++, D += pix_size )
                {
                    const uchar* _tS = S + x_ofs[x];
                    for (int k = 0; k < pix_size; k++)
                        D[k] = _tS[k];
                }
            }
        }
    }

private:
    const Mat& src;
    Mat& dst;
    int* x_ofs;
    double ify;

    resizeNNInvoker(const resizeNNInvoker&);
    resizeNNInvoker& operator=(const resizeNNInvoker&);
};

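// Nearest-neighbour resize: the horizontal source offset of every destination
// column is precomputed once (in bytes, x_ofs) and rows are processed in
// parallel; AVX2/SSE4.1 variants are dispatched at runtime for 2- and 4-byte
// pixels when available.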
static void
resizeNN( const Mat& src, Mat& dst, double fx, double fy )
{
    Size ssize = src.size(), dsize = dst.size();
    AutoBuffer<int> _x_ofs(dsize.width);
    int* x_ofs = _x_ofs.data();
    int pix_size = (int)src.elemSize();
    double ifx = 1./fx, ify = 1./fy;
    int x;

    for( x = 0; x < dsize.width; x++ )
    {
        int sx = cvFloor(x*ifx);
        x_ofs[x] = std::min(sx, ssize.width-1)*pix_size;
    }

    Range range(0, dsize.height);
#if CV_TRY_AVX2
    if(CV_CPU_HAS_SUPPORT_AVX2 && ((pix_size == 2) || (pix_size == 4)))
    {
        if(pix_size == 2)
            opt_AVX2::resizeNN2_AVX2(range, src, dst, x_ofs, ify);
        else
            opt_AVX2::resizeNN4_AVX2(range, src, dst, x_ofs, ify);
    }
    else
#endif
#if CV_TRY_SSE4_1
    if(CV_CPU_HAS_SUPPORT_SSE4_1 && ((pix_size == 2) || (pix_size == 4)))
    {
        if(pix_size == 2)
            opt_SSE4_1::resizeNN2_SSE4_1(range, src, dst, x_ofs, ify);
        else
            opt_SSE4_1::resizeNN4_SSE4_1(range, src, dst, x_ofs, ify);
    }
    else
#endif
    {
        resizeNNInvoker invoker(src, dst, x_ofs, ify);
        parallel_for_(range, invoker, dst.total()/(double)(1<<16));
    }
}

1107 
1108 class resizeNN_bitexactInvoker : public ParallelLoopBody
1109 {
1110 public:
resizeNN_bitexactInvoker(const Mat & _src,Mat & _dst,int * _x_ofse,int _ify,int _ify0)1111     resizeNN_bitexactInvoker(const Mat& _src, Mat& _dst, int* _x_ofse, int _ify, int _ify0)
1112         : src(_src), dst(_dst), x_ofse(_x_ofse), ify(_ify), ify0(_ify0) {}
1113 
operator ()(const Range & range) const1114     virtual void operator() (const Range& range) const CV_OVERRIDE
1115     {
1116         Size ssize = src.size(), dsize = dst.size();
1117         int pix_size = (int)src.elemSize();
1118         for( int y = range.start; y < range.end; y++ )
1119         {
1120             uchar* D = dst.ptr(y);
1121             int _sy = (ify * y + ify0) >> 16;
1122             int sy = std::min(_sy, ssize.height-1);
1123             const uchar* S = src.ptr(sy);
1124 
1125             int x = 0;
1126             switch( pix_size )
1127             {
1128             case 1:
1129 #if CV_SIMD
1130                 for( ; x <= dsize.width - v_uint8::nlanes; x += v_uint8::nlanes )
1131                     v_store(D + x, vx_lut(S, x_ofse + x));
1132 #endif
1133                 for( ; x < dsize.width; x++ )
1134                     D[x] = S[x_ofse[x]];
1135                 break;
1136             case 2:
1137 #if CV_SIMD
1138                 for( ; x <= dsize.width - v_uint16::nlanes; x += v_uint16::nlanes )
1139                     v_store((ushort*)D + x, vx_lut((ushort*)S, x_ofse + x));
1140 #endif
1141                 for( ; x < dsize.width; x++ )
1142                     *((ushort*)D + x) = *((ushort*)S + x_ofse[x]);
1143                 break;
1144             case 3:
1145                 for( ; x < dsize.width; x++, D += 3 )
1146                 {
1147                     const uchar* _tS = S + x_ofse[x] * 3;
1148                     D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2];
1149                 }
1150                 break;
1151             case 4:
1152 #if CV_SIMD
1153                 for( ; x <= dsize.width - v_uint32::nlanes; x += v_uint32::nlanes )
1154                     v_store((uint32_t*)D + x, vx_lut((uint32_t*)S, x_ofse + x));
1155 #endif
1156                 for( ; x < dsize.width; x++ )
1157                     *((uint32_t*)D + x) = *((uint32_t*)S + x_ofse[x]);
1158                 break;
1159             case 6:
1160                 for( ; x < dsize.width; x++, D += 6 )
1161                 {
1162                     const ushort* _tS = (const ushort*)(S + x_ofse[x]*6);
1163                     ushort* _tD = (ushort*)D;
1164                     _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
1165                 }
1166                 break;
1167             case 8:
1168 #if CV_SIMD
1169                 for( ; x <= dsize.width - v_uint64::nlanes; x += v_uint64::nlanes )
1170                     v_store((uint64_t*)D + x, vx_lut((uint64_t*)S, x_ofse + x));
1171 #endif
1172                 for( ; x < dsize.width; x++ )
1173                     *((uint64_t*)D + x) = *((uint64_t*)S + x_ofse[x]);
1174                 break;
1175             case 12:
1176                 for( ; x < dsize.width; x++, D += 12 )
1177                 {
1178                     const int* _tS = (const int*)(S + x_ofse[x]*12);
1179                     int* _tD = (int*)D;
1180                     _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
1181                 }
1182                 break;
1183             default:
1184                 for( x = 0; x < dsize.width; x++, D += pix_size )
1185                 {
1186                     const uchar* _tS = S + x_ofse[x] * pix_size;
1187                     for (int k = 0; k < pix_size; k++)
1188                         D[k] = _tS[k];
1189                 }
1190             }
1191         }
1192     }
1193 private:
1194     const Mat& src;
1195     Mat& dst;
1196     int* x_ofse;
1197     const int ify;
1198     const int ify0;
1199 };
1200 
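// Bit-exact nearest-neighbour resize. Source indices are derived in 16.16
// fixed point from the pixel-center mapping sx = floor((x + 0.5) * ssize/dsize),
// i.e. sx = (ifx*x + ifx0) >> 16. For example, upscaling width 2 -> 4 gives
// ifx = 32768 and ifx0 = 16383, so destination columns 0..3 map to source
// columns 0, 0, 1, 1.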
static void resizeNN_bitexact( const Mat& src, Mat& dst, double /*fx*/, double /*fy*/ )
{
    Size ssize = src.size(), dsize = dst.size();
    int ifx = ((ssize.width << 16) + dsize.width / 2) / dsize.width; // 16.16 fixed-point arithmetic
    int ifx0 = ifx / 2 - 1;                                     // This method uses the center pixel coordinate, as Pillow and scikit-image do.
    int ify = ((ssize.height << 16) + dsize.height / 2) / dsize.height;
    int ify0 = ify / 2 - 1;

    cv::utils::BufferArea area;
    int* x_ofse = 0;
    area.allocate(x_ofse, dsize.width, CV_SIMD_WIDTH);
    area.commit();

    for( int x = 0; x < dsize.width; x++ )
    {
        int sx = (ifx * x + ifx0) >> 16;
        x_ofse[x] = std::min(sx, ssize.width-1);    // offset in elements (not bytes)
    }
    Range range(0, dsize.height);
    resizeNN_bitexactInvoker invoker(src, dst, x_ofse, ify, ify0);
    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
}

struct VResizeNoVec
{
    template<typename WT, typename T, typename BT>
    int operator()(const WT**, T*, const BT*, int ) const
    {
        return 0;
    }
};

struct HResizeNoVec
{
    template<typename T, typename WT, typename AT> inline
    int operator()(const T**, WT**, int, const int*,
        const AT*, int, int, int, int, int) const
    {
        return 0;
    }
};

#if CV_SIMD

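// Vertical pass of the classic (non-bit-exact) linear resize for 8-bit data.
// The rows arrive from the horizontal pass scaled by INTER_RESIZE_COEF_SCALE
// and the beta weights are shorts with the same scale, so the result needs a
// total shift of 2*INTER_RESIZE_COEF_BITS = 22. The vector code reaches that
// shift in three 16-bit-friendly steps: rows are pre-shifted >>4, v_mul_hi
// drops 16 more bits, and v_rshr_pack_u<2> does the final rounding shift by 2.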
1245 struct VResizeLinearVec_32s8u
1246 {
operator ()cv::VResizeLinearVec_32s8u1247     int operator()(const int** src, uchar* dst, const short* beta, int width) const
1248     {
1249         const int *S0 = src[0], *S1 = src[1];
1250         int x = 0;
1251         v_int16 b0 = vx_setall_s16(beta[0]), b1 = vx_setall_s16(beta[1]);
1252 
1253         if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 )
1254             for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes)
1255                 v_store(dst + x, v_rshr_pack_u<2>(v_mul_hi(v_pack(vx_load_aligned(S0 + x                      ) >> 4, vx_load_aligned(S0 + x +     v_int32::nlanes) >> 4), b0) +
1256                                                   v_mul_hi(v_pack(vx_load_aligned(S1 + x                      ) >> 4, vx_load_aligned(S1 + x +     v_int32::nlanes) >> 4), b1),
1257                                                   v_mul_hi(v_pack(vx_load_aligned(S0 + x + 2 * v_int32::nlanes) >> 4, vx_load_aligned(S0 + x + 3 * v_int32::nlanes) >> 4), b0) +
1258                                                   v_mul_hi(v_pack(vx_load_aligned(S1 + x + 2 * v_int32::nlanes) >> 4, vx_load_aligned(S1 + x + 3 * v_int32::nlanes) >> 4), b1)));
1259         else
1260             for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes)
1261                 v_store(dst + x, v_rshr_pack_u<2>(v_mul_hi(v_pack(vx_load(S0 + x                      ) >> 4, vx_load(S0 + x +     v_int32::nlanes) >> 4), b0) +
1262                                                   v_mul_hi(v_pack(vx_load(S1 + x                      ) >> 4, vx_load(S1 + x +     v_int32::nlanes) >> 4), b1),
1263                                                   v_mul_hi(v_pack(vx_load(S0 + x + 2 * v_int32::nlanes) >> 4, vx_load(S0 + x + 3 * v_int32::nlanes) >> 4), b0) +
1264                                                   v_mul_hi(v_pack(vx_load(S1 + x + 2 * v_int32::nlanes) >> 4, vx_load(S1 + x + 3 * v_int32::nlanes) >> 4), b1)));
1265 
1266         for( ; x < width - v_int16::nlanes; x += v_int16::nlanes)
1267             v_rshr_pack_u_store<2>(dst + x, v_mul_hi(v_pack(vx_load(S0 + x) >> 4, vx_load(S0 + x + v_int32::nlanes) >> 4), b0) +
1268                                             v_mul_hi(v_pack(vx_load(S1 + x) >> 4, vx_load(S1 + x + v_int32::nlanes) >> 4), b1));
1269 
1270         return x;
1271     }
1272 };
1273 
1274 struct VResizeLinearVec_32f16u
1275 {
1276     int operator()(const float** src, ushort* dst, const float* beta, int width) const
1277     {
1278         const float *S0 = src[0], *S1 = src[1];
1279         int x = 0;
1280 
1281         v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]);
1282 
1283         if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 )
1284             for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes)
1285                 v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load_aligned(S0 + x                    ), b0, vx_load_aligned(S1 + x                    ) * b1)),
1286                                           v_round(v_muladd(vx_load_aligned(S0 + x + v_float32::nlanes), b0, vx_load_aligned(S1 + x + v_float32::nlanes) * b1))));
1287         else
1288             for (; x <= width - v_uint16::nlanes; x += v_uint16::nlanes)
1289                 v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x                    ), b0, vx_load(S1 + x                    ) * b1)),
1290                                           v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, vx_load(S1 + x + v_float32::nlanes) * b1))));
1291         for( ; x < width - v_float32::nlanes; x += v_float32::nlanes)
1292         {
1293             v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1));
1294             v_store_low(dst + x, v_pack_u(t0, t0));
1295         }
1296 
1297         return x;
1298     }
1299 };
1300 
1301 struct VResizeLinearVec_32f16s
1302 {
1303     int operator()(const float** src, short* dst, const float* beta, int width) const
1304     {
1305         const float *S0 = src[0], *S1 = src[1];
1306         int x = 0;
1307 
1308         v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]);
1309 
1310         if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 )
1311             for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes)
1312                 v_store(dst + x, v_pack(v_round(v_muladd(vx_load_aligned(S0 + x                    ), b0, vx_load_aligned(S1 + x                    ) * b1)),
1313                                         v_round(v_muladd(vx_load_aligned(S0 + x + v_float32::nlanes), b0, vx_load_aligned(S1 + x + v_float32::nlanes) * b1))));
1314         else
1315             for (; x <= width - v_int16::nlanes; x += v_int16::nlanes)
1316                 v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x                    ), b0, vx_load(S1 + x                    ) * b1)),
1317                                         v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, vx_load(S1 + x + v_float32::nlanes) * b1))));
1318         for( ; x < width - v_float32::nlanes; x += v_float32::nlanes)
1319         {
1320             v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1));
1321             v_store_low(dst + x, v_pack(t0, t0));
1322         }
1323 
1324         return x;
1325     }
1326 };
1327 
1328 struct VResizeLinearVec_32f
1329 {
1330     int operator()(const float** src, float* dst, const float* beta, int width) const
1331     {
1332         const float *S0 = src[0], *S1 = src[1];
1333         int x = 0;
1334 
1335         v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]);
1336 
1337         if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 )
1338             for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes)
1339                 v_store(dst + x, v_muladd(vx_load_aligned(S0 + x), b0, vx_load_aligned(S1 + x) * b1));
1340         else
1341             for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes)
1342                 v_store(dst + x, v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1));
1343 
1344         return x;
1345     }
1346 };
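
/* The test ((size_t)S0|(size_t)S1) & (CV_SIMD_WIDTH-1) used by the functors
   above OR-s all source-row addresses so that a single comparison tells
   whether every row is aligned to the vector width; only then are the faster
   vx_load_aligned loads selected, otherwise the unaligned path is taken. */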
1347 
1348 
1349 struct VResizeCubicVec_32s8u
1350 {
1351     int operator()(const int** src, uchar* dst, const short* beta, int width) const
1352     {
1353         const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
1354         int x = 0;
1355         float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE);
1356 
1357         v_float32 b0 = vx_setall_f32(beta[0] * scale), b1 = vx_setall_f32(beta[1] * scale),
1358                   b2 = vx_setall_f32(beta[2] * scale), b3 = vx_setall_f32(beta[3] * scale);
1359 
1360         if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&(CV_SIMD_WIDTH - 1)) == 0 )
1361             for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes)
1362                 v_pack_u_store(dst + x, v_pack(v_round(v_muladd(v_cvt_f32(vx_load_aligned(S0 + x                    )),  b0,
1363                                                        v_muladd(v_cvt_f32(vx_load_aligned(S1 + x                    )),  b1,
1364                                                        v_muladd(v_cvt_f32(vx_load_aligned(S2 + x                    )),  b2,
1365                                                                 v_cvt_f32(vx_load_aligned(S3 + x                    )) * b3)))),
1366                                                v_round(v_muladd(v_cvt_f32(vx_load_aligned(S0 + x + v_float32::nlanes)),  b0,
1367                                                        v_muladd(v_cvt_f32(vx_load_aligned(S1 + x + v_float32::nlanes)),  b1,
1368                                                        v_muladd(v_cvt_f32(vx_load_aligned(S2 + x + v_float32::nlanes)),  b2,
1369                                                                 v_cvt_f32(vx_load_aligned(S3 + x + v_float32::nlanes)) * b3))))));
1370         else
1371             for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes)
1372                 v_pack_u_store(dst + x, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + x                    )),  b0,
1373                                                        v_muladd(v_cvt_f32(vx_load(S1 + x                    )),  b1,
1374                                                        v_muladd(v_cvt_f32(vx_load(S2 + x                    )),  b2,
1375                                                                 v_cvt_f32(vx_load(S3 + x                    )) * b3)))),
1376                                                v_round(v_muladd(v_cvt_f32(vx_load(S0 + x + v_float32::nlanes)),  b0,
1377                                                        v_muladd(v_cvt_f32(vx_load(S1 + x + v_float32::nlanes)),  b1,
1378                                                        v_muladd(v_cvt_f32(vx_load(S2 + x + v_float32::nlanes)),  b2,
1379                                                                 v_cvt_f32(vx_load(S3 + x + v_float32::nlanes)) * b3))))));
1380         return x;
1381     }
1382 };
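
/* Note on 'scale' above: the int sources are horizontal sums scaled by
   INTER_RESIZE_COEF_SCALE, and beta is fixed-point with the same scale, so a
   plain weighted sum would be too large by INTER_RESIZE_COEF_SCALE^2.  Folding
   scale = 1/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE) into the float
   coefficients b0..b3 yields the result directly in pixel units, ready for
   v_round and the saturating pack to 8 bits. */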
1383 
1384 struct VResizeCubicVec_32f16u
1385 {
1386     int operator()(const float** src, ushort* dst, const float* beta, int width) const
1387     {
1388         const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
1389         int x = 0;
1390         v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]),
1391                   b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]);
1392 
1393         for (; x <= width - v_uint16::nlanes; x += v_uint16::nlanes)
1394             v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x                    ),  b0,
1395                                               v_muladd(vx_load(S1 + x                    ),  b1,
1396                                               v_muladd(vx_load(S2 + x                    ),  b2,
1397                                                        vx_load(S3 + x                    ) * b3)))),
1398                                       v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes),  b0,
1399                                               v_muladd(vx_load(S1 + x + v_float32::nlanes),  b1,
1400                                               v_muladd(vx_load(S2 + x + v_float32::nlanes),  b2,
1401                                                        vx_load(S3 + x + v_float32::nlanes) * b3))))));
1402 
1403         return x;
1404     }
1405 };
1406 
1407 struct VResizeCubicVec_32f16s
1408 {
1409     int operator()(const float** src, short* dst, const float* beta, int width) const
1410     {
1411         const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
1412         int x = 0;
1413         v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]),
1414                   b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]);
1415 
1416         for (; x <= width - v_int16::nlanes; x += v_int16::nlanes)
1417             v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x                    ),  b0,
1418                                             v_muladd(vx_load(S1 + x                    ),  b1,
1419                                             v_muladd(vx_load(S2 + x                    ),  b2,
1420                                                      vx_load(S3 + x                    ) * b3)))),
1421                                     v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes),  b0,
1422                                             v_muladd(vx_load(S1 + x + v_float32::nlanes),  b1,
1423                                             v_muladd(vx_load(S2 + x + v_float32::nlanes),  b2,
1424                                                      vx_load(S3 + x + v_float32::nlanes) * b3))))));
1425 
1426         return x;
1427     }
1428 };
1429 
1430 struct VResizeCubicVec_32f
1431 {
1432     int operator()(const float** src, float* dst, const float* beta, int width) const
1433     {
1434         const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
1435         int x = 0;
1436         v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]),
1437                   b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]);
1438 
1439         for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes)
1440             v_store(dst + x, v_muladd(vx_load(S0 + x),  b0,
1441                              v_muladd(vx_load(S1 + x),  b1,
1442                              v_muladd(vx_load(S2 + x),  b2,
1443                                       vx_load(S3 + x) * b3))));
1444 
1445         return x;
1446     }
1447 };
1448 
1449 
1450 #if CV_TRY_SSE4_1
1451 
1452 struct VResizeLanczos4Vec_32f16u
1453 {
1454     int operator()(const float** src, ushort* dst, const float* beta, int width) const
1455     {
1456         if (CV_CPU_HAS_SUPPORT_SSE4_1)
1457             return opt_SSE4_1::VResizeLanczos4Vec_32f16u_SSE41(src, dst, beta, width);
1458         else
1459             return 0;
1460     }
1461 };
1462 
1463 #else
1464 
1465 struct VResizeLanczos4Vec_32f16u
1466 {
1467     int operator()(const float** src, ushort* dst, const float* beta, int width ) const
1468     {
1469         const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
1470                     *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
1471         int x = 0;
1472         v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]),
1473                   b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]),
1474                   b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]),
1475                   b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]);
1476 
1477         for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes)
1478             v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x                    ),  b0,
1479                                               v_muladd(vx_load(S1 + x                    ),  b1,
1480                                               v_muladd(vx_load(S2 + x                    ),  b2,
1481                                               v_muladd(vx_load(S3 + x                    ),  b3,
1482                                               v_muladd(vx_load(S4 + x                    ),  b4,
1483                                               v_muladd(vx_load(S5 + x                    ),  b5,
1484                                               v_muladd(vx_load(S6 + x                    ),  b6,
1485                                                        vx_load(S7 + x                    ) * b7)))))))),
1486                                       v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes),  b0,
1487                                               v_muladd(vx_load(S1 + x + v_float32::nlanes),  b1,
1488                                               v_muladd(vx_load(S2 + x + v_float32::nlanes),  b2,
1489                                               v_muladd(vx_load(S3 + x + v_float32::nlanes),  b3,
1490                                               v_muladd(vx_load(S4 + x + v_float32::nlanes),  b4,
1491                                               v_muladd(vx_load(S5 + x + v_float32::nlanes),  b5,
1492                                               v_muladd(vx_load(S6 + x + v_float32::nlanes),  b6,
1493                                                        vx_load(S7 + x + v_float32::nlanes) * b7))))))))));
1494 
1495         return x;
1496     }
1497 };
1498 
1499 #endif
1500 
1501 struct VResizeLanczos4Vec_32f16s
1502 {
1503     int operator()(const float** src, short* dst, const float* beta, int width ) const
1504     {
1505         const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
1506                     *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
1507         int x = 0;
1508         v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]),
1509                   b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]),
1510                   b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]),
1511                   b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]);
1512 
1513         for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes)
1514             v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x                    ),  b0,
1515                                             v_muladd(vx_load(S1 + x                    ),  b1,
1516                                             v_muladd(vx_load(S2 + x                    ),  b2,
1517                                             v_muladd(vx_load(S3 + x                    ),  b3,
1518                                             v_muladd(vx_load(S4 + x                    ),  b4,
1519                                             v_muladd(vx_load(S5 + x                    ),  b5,
1520                                             v_muladd(vx_load(S6 + x                    ),  b6,
1521                                                      vx_load(S7 + x                    ) * b7)))))))),
1522                                     v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes),  b0,
1523                                             v_muladd(vx_load(S1 + x + v_float32::nlanes),  b1,
1524                                             v_muladd(vx_load(S2 + x + v_float32::nlanes),  b2,
1525                                             v_muladd(vx_load(S3 + x + v_float32::nlanes),  b3,
1526                                             v_muladd(vx_load(S4 + x + v_float32::nlanes),  b4,
1527                                             v_muladd(vx_load(S5 + x + v_float32::nlanes),  b5,
1528                                             v_muladd(vx_load(S6 + x + v_float32::nlanes),  b6,
1529                                                      vx_load(S7 + x + v_float32::nlanes) * b7))))))))));
1530 
1531         return x;
1532     }
1533 };
1534 
1535 struct VResizeLanczos4Vec_32f
1536 {
1537     int operator()(const float** src, float* dst, const float* beta, int width ) const
1538     {
1539         const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
1540                     *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
1541         int x = 0;
1542 
1543         v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]),
1544                   b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]),
1545                   b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]),
1546                   b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]);
1547 
1548         for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes)
1549             v_store(dst + x, v_muladd(vx_load(S0 + x),  b0,
1550                              v_muladd(vx_load(S1 + x),  b1,
1551                              v_muladd(vx_load(S2 + x),  b2,
1552                              v_muladd(vx_load(S3 + x),  b3,
1553                              v_muladd(vx_load(S4 + x),  b4,
1554                              v_muladd(vx_load(S5 + x),  b5,
1555                              v_muladd(vx_load(S6 + x),  b6,
1556                                       vx_load(S7 + x) * b7))))))));
1557 
1558         return x;
1559     }
1560 };
1561 
1562 #else
1563 
1564 typedef VResizeNoVec VResizeLinearVec_32s8u;
1565 typedef VResizeNoVec VResizeLinearVec_32f16u;
1566 typedef VResizeNoVec VResizeLinearVec_32f16s;
1567 typedef VResizeNoVec VResizeLinearVec_32f;
1568 
1569 typedef VResizeNoVec VResizeCubicVec_32s8u;
1570 typedef VResizeNoVec VResizeCubicVec_32f16u;
1571 typedef VResizeNoVec VResizeCubicVec_32f16s;
1572 typedef VResizeNoVec VResizeCubicVec_32f;
1573 
1574 typedef VResizeNoVec VResizeLanczos4Vec_32f16u;
1575 typedef VResizeNoVec VResizeLanczos4Vec_32f16s;
1576 typedef VResizeNoVec VResizeLanczos4Vec_32f;
1577 
1578 #endif
1579 
1580 #if CV_SIMD128
1581 
1582 template<typename ST, typename DT, typename AT, typename DVT>
1583 struct HResizeLinearVec_X4
1584 {
1585     int operator()(const ST** src, DT** dst, int count, const int* xofs,
1586         const AT* alpha, int, int, int cn, int, int xmax) const
1587     {
1588         const int nlanes = 4;
1589         const int len0 = xmax & -nlanes;
1590         int dx = 0, k = 0;
1591 
1592         for( ; k <= (count - 2); k+=2 )
1593         {
1594             const ST *S0 = src[k];
1595             DT *D0 = dst[k];
1596             const ST *S1 = src[k+1];
1597             DT *D1 = dst[k+1];
1598 
1599             for( dx = 0; dx < len0; dx += nlanes )
1600             {
1601                 int sx0 = xofs[dx+0];
1602                 int sx1 = xofs[dx+1];
1603                 int sx2 = xofs[dx+2];
1604                 int sx3 = xofs[dx+3];
1605                 DVT a_even;
1606                 DVT a_odd;
1607 
1608                 v_load_deinterleave(&alpha[dx*2], a_even, a_odd);
1609                 DVT s0(S0[sx0], S0[sx1], S0[sx2], S0[sx3]);
1610                 DVT s1(S0[sx0+cn], S0[sx1+cn], S0[sx2+cn], S0[sx3+cn]);
1611                 DVT s0_u(S1[sx0], S1[sx1], S1[sx2], S1[sx3]);
1612                 DVT s1_u(S1[sx0+cn], S1[sx1+cn], S1[sx2+cn], S1[sx3+cn]);
1613                 v_store(&D1[dx], s0_u * a_even + s1_u * a_odd);
1614                 v_store(&D0[dx], s0 * a_even + s1 * a_odd);
1615             }
1616         }
1617         for( ; k < count; k++ )
1618         {
1619             const ST *S = src[k];
1620             DT *D = dst[k];
1621             for( dx = 0; dx < len0; dx += nlanes )
1622             {
1623                 int sx0 = xofs[dx+0];
1624                 int sx1 = xofs[dx+1];
1625                 int sx2 = xofs[dx+2];
1626                 int sx3 = xofs[dx+3];
1627                 DVT a_even;
1628                 DVT a_odd;
1629 
1630                 v_load_deinterleave(&alpha[dx*2], a_even, a_odd);
1631                 DVT s0(S[sx0], S[sx1], S[sx2], S[sx3]);
1632                 DVT s1(S[sx0+cn], S[sx1+cn], S[sx2+cn], S[sx3+cn]);
1633                 v_store(&D[dx], s0 * a_even + s1 * a_odd);
1634             }
1635         }
1636         return dx;
1637     }
1638 };
1639 
1640 struct HResizeLinearVecU8_X4
1641 {
1642     int operator()(const uchar** src, int** dst, int count, const int* xofs,
1643         const short* alpha/*[xmax]*/, int /*smax*/, int dmax, int cn, int /*xmin*/, int xmax) const
1644     {
1645         int dx = 0, k = 0;
1646 
1647         if(cn == 1)
1648         {
1649             const int step = 8;
1650             const int len0 = xmax & -step;
1651             for( ; k <= (count - 2); k+=2 )
1652             {
1653                 const uchar *S0 = src[k];
1654                 int *D0 = dst[k];
1655                 const uchar *S1 = src[k+1];
1656                 int *D1 = dst[k+1];
1657 
1658                 for( dx = 0; dx < len0; dx += step )
1659                 {
1660                     v_int16x8 al = v_load(alpha+dx*2);
1661                     v_int16x8 ah = v_load(alpha+dx*2+8);
1662                     v_uint16x8 sl, sh;
1663                     v_expand(v_lut_pairs(S0, xofs+dx), sl, sh);
1664                     v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
1665                     v_store(&D0[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
1666                     v_expand(v_lut_pairs(S1, xofs+dx), sl, sh);
1667                     v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
1668                     v_store(&D1[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
1669                 }
1670             }
1671             for( ; k < count; k++ )
1672             {
1673                 const uchar *S = src[k];
1674                 int *D = dst[k];
1675                 for( dx = 0; dx < len0; dx += step )
1676                 {
1677                     v_int16x8 al = v_load(alpha+dx*2);
1678                     v_int16x8 ah = v_load(alpha+dx*2+8);
1679                     v_uint16x8 sl, sh;
1680                     v_expand(v_lut_pairs(S, xofs+dx), sl, sh);
1681                     v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
1682                     v_store(&D[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
1683                 }
1684             }
1685         }
1686         else if(cn == 2)
1687         {
1688             const int step = 8;
1689             const int len0 = xmax & -step;
1690             for( ; k <= (count - 2); k+=2 )
1691             {
1692                 const uchar *S0 = src[k];
1693                 int *D0 = dst[k];
1694                 const uchar *S1 = src[k+1];
1695                 int *D1 = dst[k+1];
1696 
1697                 for( dx = 0; dx < len0; dx += step )
1698                 {
1699                     int ofs[4] = { xofs[dx], xofs[dx + 2], xofs[dx + 4], xofs[dx + 6] };
1700                     v_int16x8 al = v_load(alpha+dx*2);
1701                     v_int16x8 ah = v_load(alpha+dx*2+8);
1702                     v_uint16x8 sl, sh;
1703                     v_expand(v_interleave_pairs(v_lut_quads(S0, ofs)), sl, sh);
1704                     v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
1705                     v_store(&D0[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
1706                     v_expand(v_interleave_pairs(v_lut_quads(S1, ofs)), sl, sh);
1707                     v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
1708                     v_store(&D1[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
1709                 }
1710             }
1711             for( ; k < count; k++ )
1712             {
1713                 const uchar *S = src[k];
1714                 int *D = dst[k];
1715                 for( dx = 0; dx < len0; dx += step )
1716                 {
1717                     int ofs[4] = { xofs[dx], xofs[dx + 2], xofs[dx + 4], xofs[dx + 6] };
1718                     v_int16x8 al = v_load(alpha+dx*2);
1719                     v_int16x8 ah = v_load(alpha+dx*2+8);
1720                     v_uint16x8 sl, sh;
1721                     v_expand(v_interleave_pairs(v_lut_quads(S, ofs)), sl, sh);
1722                     v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
1723                     v_store(&D[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
1724                 }
1725             }
1726         }
1727         else if(cn == 3)
1728         {
1729             /* Peek at the last x offset to find the maximal s offset.  We know the loop
1730                will terminate prior to this value, which may be one or more elements before
1731                the final valid offset. xofs[] is constructed as an array of non-decreasing
1732                offsets (i.e. xofs[x] <= xofs[x+1] for x < xmax). */
1733             int smax = xofs[dmax-cn];
1734 
1735             for( ; k <= (count - 2); k+=2 )
1736             {
1737                 const uchar *S0 = src[k];
1738                 int *D0 = dst[k];
1739                 const uchar *S1 = src[k+1];
1740                 int *D1 = dst[k+1];
1741 
1742                 for( dx = 0; (xofs[dx] + cn) < smax; dx += cn )
1743                 {
1744                     v_int16x8 a = v_load(alpha+dx*2);
1745                     v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S0+xofs[dx]) | (v_load_expand_q(S0+xofs[dx]+cn)<<16)), a));
1746                     v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S1+xofs[dx]) | (v_load_expand_q(S1+xofs[dx]+cn)<<16)), a));
1747                 }
1748             }
1749             for( ; k < count; k++ )
1750             {
1751                 const uchar *S = src[k];
1752                 int *D = dst[k];
1753                 for( dx = 0; (xofs[dx] + cn) < smax; dx += cn )
1754                 {
1755                     v_int16x8 a = v_load(alpha+dx*2);
1756                     v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S+xofs[dx]) | (v_load_expand_q(S+xofs[dx]+cn)<<16)), a));
1757                 }
1758             }
1759             /* Debug check to ensure that the final value is never processed by the vector loop. */
1760             CV_DbgAssert(dx < dmax);
1761         }
1762         else if(cn == 4)
1763         {
1764             const int step = 4;
1765             const int len0 = xmax & -step;
1766             for( ; k <= (count - 2); k+=2 )
1767             {
1768                 const uchar *S0 = src[k];
1769                 int *D0 = dst[k];
1770                 const uchar *S1 = src[k+1];
1771                 int *D1 = dst[k+1];
1772 
1773                 for( dx = 0; dx < len0; dx += step )
1774                 {
1775                     v_int16x8 a = v_load(alpha+dx*2);
1776                     v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(v_interleave_quads(v_load_expand(S0+xofs[dx]))), a));
1777                     v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(v_interleave_quads(v_load_expand(S1+xofs[dx]))), a));
1778                 }
1779             }
1780             for( ; k < count; k++ )
1781             {
1782                 const uchar *S = src[k];
1783                 int *D = dst[k];
1784                 for( dx = 0; dx < len0; dx += step )
1785                 {
1786                     v_int16x8 a = v_load(alpha+dx*2);
1787                     v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(v_interleave_quads(v_load_expand(S+xofs[dx]))), a));
1788                 }
1789             }
1790         }
1791         else
1792         {
1793             return 0;  // images with channels >4 are out of optimization scope
1794         }
1795         return dx;
1796     }
1797 };
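
/* The u8 kernels above lean on v_dotprod: each pair of source taps
   (S[xofs[dx]], S[xofs[dx]+cn]) is arranged into adjacent 16-bit lanes, the
   interleaved coefficients (a0, a1) occupy the matching lanes of alpha, and a
   single v_dotprod then yields a0*S[sx] + a1*S[sx+cn] for four outputs at once. */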
1798 
1799 typedef HResizeLinearVec_X4<float,float,float,v_float32x4> HResizeLinearVec_32f;
1800 typedef HResizeLinearVec_X4<ushort,float,float,v_float32x4> HResizeLinearVec_16u32f;
1801 typedef HResizeLinearVec_X4<short,float,float,v_float32x4> HResizeLinearVec_16s32f;
1802 typedef HResizeLinearVecU8_X4 HResizeLinearVec_8u32s;
1803 
1804 #else
1805 
1806 typedef HResizeNoVec HResizeLinearVec_8u32s;
1807 typedef HResizeNoVec HResizeLinearVec_16u32f;
1808 typedef HResizeNoVec HResizeLinearVec_16s32f;
1809 typedef HResizeNoVec HResizeLinearVec_32f;
1810 
1811 #endif
1812 
1813 typedef HResizeNoVec HResizeLinearVec_64f;
1814 
1815 
1816 template<typename T, typename WT, typename AT, int ONE, class VecOp>
1817 struct HResizeLinear
1818 {
1819     typedef T value_type;
1820     typedef WT buf_type;
1821     typedef AT alpha_type;
1822 
1823     void operator()(const T** src, WT** dst, int count,
1824                     const int* xofs, const AT* alpha,
1825                     int swidth, int dwidth, int cn, int xmin, int xmax ) const
1826     {
1827         int dx, k;
1828         VecOp vecOp;
1829 
1830         int dx0 = vecOp(src, dst, count,
1831             xofs, alpha, swidth, dwidth, cn, xmin, xmax );
1832 
1833         for( k = 0; k <= count - 2; k+=2 )
1834         {
1835             const T *S0 = src[k], *S1 = src[k+1];
1836             WT *D0 = dst[k], *D1 = dst[k+1];
1837             for( dx = dx0; dx < xmax; dx++ )
1838             {
1839                 int sx = xofs[dx];
1840                 WT a0 = alpha[dx*2], a1 = alpha[dx*2+1];
1841                 WT t0 = S0[sx]*a0 + S0[sx + cn]*a1;
1842                 WT t1 = S1[sx]*a0 + S1[sx + cn]*a1;
1843                 D0[dx] = t0; D1[dx] = t1;
1844             }
1845 
1846             for( ; dx < dwidth; dx++ )
1847             {
1848                 int sx = xofs[dx];
1849                 D0[dx] = WT(S0[sx]*ONE); D1[dx] = WT(S1[sx]*ONE);
1850             }
1851         }
1852 
1853         for( ; k < count; k++ )
1854         {
1855             const T *S = src[k];
1856             WT *D = dst[k];
1857             for( dx = dx0; dx < xmax; dx++ )
1858             {
1859                 int sx = xofs[dx];
1860                 D[dx] = S[sx]*alpha[dx*2] + S[sx+cn]*alpha[dx*2+1];
1861             }
1862 
1863             for( ; dx < dwidth; dx++ )
1864                 D[dx] = WT(S[xofs[dx]]*ONE);
1865         }
1866     }
1867 };
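
/* Worked example for the two-tap scheme above, assuming T=uchar, WT=int,
   AT=short and the library's ONE = INTER_RESIZE_COEF_SCALE = 2048: a
   single-channel destination pixel whose source position is 10.25 gets
   xofs[dx] = 10 and alpha = {1536, 512} (0.75 and 0.25 scaled by 2048), so
   D[dx] = S[10]*1536 + S[11]*512, an int kept at 2^11 scale for the vertical
   pass.  Past xmax, where the right tap would fall outside the row, the last
   pixel is simply replicated via S[sx]*ONE. */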
1868 
1869 
1870 template<typename T, typename WT, typename AT, class CastOp, class VecOp>
1871 struct VResizeLinear
1872 {
1873     typedef T value_type;
1874     typedef WT buf_type;
1875     typedef AT alpha_type;
1876 
1877     void operator()(const WT** src, T* dst, const AT* beta, int width ) const
1878     {
1879         WT b0 = beta[0], b1 = beta[1];
1880         const WT *S0 = src[0], *S1 = src[1];
1881         CastOp castOp;
1882         VecOp vecOp;
1883 
1884         int x = vecOp(src, dst, beta, width);
1885         #if CV_ENABLE_UNROLLED
1886         for( ; x <= width - 4; x += 4 )
1887         {
1888             WT t0, t1;
1889             t0 = S0[x]*b0 + S1[x]*b1;
1890             t1 = S0[x+1]*b0 + S1[x+1]*b1;
1891             dst[x] = castOp(t0); dst[x+1] = castOp(t1);
1892             t0 = S0[x+2]*b0 + S1[x+2]*b1;
1893             t1 = S0[x+3]*b0 + S1[x+3]*b1;
1894             dst[x+2] = castOp(t0); dst[x+3] = castOp(t1);
1895         }
1896         #endif
1897         for( ; x < width; x++ )
1898             dst[x] = castOp(S0[x]*b0 + S1[x]*b1);
1899     }
1900 };
1901 
1902 template<>
1903 struct VResizeLinear<uchar, int, short, FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>, VResizeLinearVec_32s8u>
1904 {
1905     typedef uchar value_type;
1906     typedef int buf_type;
1907     typedef short alpha_type;
1908 
1909     void operator()(const buf_type** src, value_type* dst, const alpha_type* beta, int width ) const
1910     {
1911         alpha_type b0 = beta[0], b1 = beta[1];
1912         const buf_type *S0 = src[0], *S1 = src[1];
1913         VResizeLinearVec_32s8u vecOp;
1914 
1915         int x = vecOp(src, dst, beta, width);
1916         #if CV_ENABLE_UNROLLED
1917         for( ; x <= width - 4; x += 4 )
1918         {
1919             dst[x+0] = uchar(( ((b0 * (S0[x+0] >> 4)) >> 16) + ((b1 * (S1[x+0] >> 4)) >> 16) + 2)>>2);
1920             dst[x+1] = uchar(( ((b0 * (S0[x+1] >> 4)) >> 16) + ((b1 * (S1[x+1] >> 4)) >> 16) + 2)>>2);
1921             dst[x+2] = uchar(( ((b0 * (S0[x+2] >> 4)) >> 16) + ((b1 * (S1[x+2] >> 4)) >> 16) + 2)>>2);
1922             dst[x+3] = uchar(( ((b0 * (S0[x+3] >> 4)) >> 16) + ((b1 * (S1[x+3] >> 4)) >> 16) + 2)>>2);
1923         }
1924         #endif
1925         for( ; x < width; x++ )
1926             dst[x] = uchar(( ((b0 * (S0[x] >> 4)) >> 16) + ((b1 * (S1[x] >> 4)) >> 16) + 2)>>2);
1927     }
1928 };
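
/* A sketch of the arithmetic above, with INTER_RESIZE_COEF_BITS == 11: S0/S1
   are scaled by 2^11 from the horizontal pass and b0/b1 by 2^11 as well, so
   the exact result would be (b0*S0 + b1*S1) >> 22 with rounding.  Because b*S
   can overflow 32 bits, S is pre-shifted by 4 (255*2^11/2^4 < 2^15 still fits
   16 bits), the products keep only their high halves (>> 16), and the
   remaining 22-16-4 = 2 bits are removed with a rounded shift, (sum + 2) >> 2.
   The SIMD functor VResizeLinearVec_32s8u performs the same sequence with
   v_mul_hi and v_rshr_pack_u<2>. */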
1929 
1930 
1931 template<typename T, typename WT, typename AT>
1932 struct HResizeCubic
1933 {
1934     typedef T value_type;
1935     typedef WT buf_type;
1936     typedef AT alpha_type;
1937 
1938     void operator()(const T** src, WT** dst, int count,
1939                     const int* xofs, const AT* alpha,
1940                     int swidth, int dwidth, int cn, int xmin, int xmax ) const
1941     {
1942         for( int k = 0; k < count; k++ )
1943         {
1944             const T *S = src[k];
1945             WT *D = dst[k];
1946             int dx = 0, limit = xmin;
1947             for(;;)
1948             {
1949                 for( ; dx < limit; dx++, alpha += 4 )
1950                 {
1951                     int j, sx = xofs[dx] - cn;
1952                     WT v = 0;
1953                     for( j = 0; j < 4; j++ )
1954                     {
1955                         int sxj = sx + j*cn;
1956                         if( (unsigned)sxj >= (unsigned)swidth )
1957                         {
1958                             while( sxj < 0 )
1959                                 sxj += cn;
1960                             while( sxj >= swidth )
1961                                 sxj -= cn;
1962                         }
1963                         v += S[sxj]*alpha[j];
1964                     }
1965                     D[dx] = v;
1966                 }
1967                 if( limit == dwidth )
1968                     break;
1969                 for( ; dx < xmax; dx++, alpha += 4 )
1970                 {
1971                     int sx = xofs[dx];
1972                     D[dx] = S[sx-cn]*alpha[0] + S[sx]*alpha[1] +
1973                         S[sx+cn]*alpha[2] + S[sx+cn*2]*alpha[3];
1974                 }
1975                 limit = dwidth;
1976             }
1977             alpha -= dwidth*4;
1978         }
1979     }
1980 };
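
/* In the border branch above, an out-of-range tap index sxj is pulled back
   into [0, swidth) one whole pixel (cn elements) at a time, so the edge pixel
   of the same channel is replicated instead of bleeding into a neighboring
   channel or wrapping around the row. */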
1981 
1982 
1983 template<typename T, typename WT, typename AT, class CastOp, class VecOp>
1984 struct VResizeCubic
1985 {
1986     typedef T value_type;
1987     typedef WT buf_type;
1988     typedef AT alpha_type;
1989 
1990     void operator()(const WT** src, T* dst, const AT* beta, int width ) const
1991     {
1992         WT b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3];
1993         const WT *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
1994         CastOp castOp;
1995         VecOp vecOp;
1996 
1997         int x = vecOp(src, dst, beta, width);
1998         for( ; x < width; x++ )
1999             dst[x] = castOp(S0[x]*b0 + S1[x]*b1 + S2[x]*b2 + S3[x]*b3);
2000     }
2001 };
2002 
2003 
2004 template<typename T, typename WT, typename AT>
2005 struct HResizeLanczos4
2006 {
2007     typedef T value_type;
2008     typedef WT buf_type;
2009     typedef AT alpha_type;
2010 
2011     void operator()(const T** src, WT** dst, int count,
2012                     const int* xofs, const AT* alpha,
2013                     int swidth, int dwidth, int cn, int xmin, int xmax ) const
2014     {
2015         for( int k = 0; k < count; k++ )
2016         {
2017             const T *S = src[k];
2018             WT *D = dst[k];
2019             int dx = 0, limit = xmin;
2020             for(;;)
2021             {
2022                 for( ; dx < limit; dx++, alpha += 8 )
2023                 {
2024                     int j, sx = xofs[dx] - cn*3;
2025                     WT v = 0;
2026                     for( j = 0; j < 8; j++ )
2027                     {
2028                         int sxj = sx + j*cn;
2029                         if( (unsigned)sxj >= (unsigned)swidth )
2030                         {
2031                             while( sxj < 0 )
2032                                 sxj += cn;
2033                             while( sxj >= swidth )
2034                                 sxj -= cn;
2035                         }
2036                         v += S[sxj]*alpha[j];
2037                     }
2038                     D[dx] = v;
2039                 }
2040                 if( limit == dwidth )
2041                     break;
2042                 for( ; dx < xmax; dx++, alpha += 8 )
2043                 {
2044                     int sx = xofs[dx];
2045                     D[dx] = S[sx-cn*3]*alpha[0] + S[sx-cn*2]*alpha[1] +
2046                         S[sx-cn]*alpha[2] + S[sx]*alpha[3] +
2047                         S[sx+cn]*alpha[4] + S[sx+cn*2]*alpha[5] +
2048                         S[sx+cn*3]*alpha[6] + S[sx+cn*4]*alpha[7];
2049                 }
2050                 limit = dwidth;
2051             }
2052             alpha -= dwidth*8;
2053         }
2054     }
2055 };
2056 
2057 
2058 template<typename T, typename WT, typename AT, class CastOp, class VecOp>
2059 struct VResizeLanczos4
2060 {
2061     typedef T value_type;
2062     typedef WT buf_type;
2063     typedef AT alpha_type;
2064 
2065     void operator()(const WT** src, T* dst, const AT* beta, int width ) const
2066     {
2067         CastOp castOp;
2068         VecOp vecOp;
2069         int x = vecOp(src, dst, beta, width);
2070         #if CV_ENABLE_UNROLLED
2071         for( ; x <= width - 4; x += 4 )
2072         {
2073             WT b = beta[0];
2074             const WT* S = src[0];
2075             WT s0 = S[x]*b, s1 = S[x+1]*b, s2 = S[x+2]*b, s3 = S[x+3]*b;
2076 
2077             for( int k = 1; k < 8; k++ )
2078             {
2079                 b = beta[k]; S = src[k];
2080                 s0 += S[x]*b; s1 += S[x+1]*b;
2081                 s2 += S[x+2]*b; s3 += S[x+3]*b;
2082             }
2083 
2084             dst[x] = castOp(s0); dst[x+1] = castOp(s1);
2085             dst[x+2] = castOp(s2); dst[x+3] = castOp(s3);
2086         }
2087         #endif
2088         for( ; x < width; x++ )
2089         {
2090             dst[x] = castOp(src[0][x]*beta[0] + src[1][x]*beta[1] +
2091                 src[2][x]*beta[2] + src[3][x]*beta[3] + src[4][x]*beta[4] +
2092                 src[5][x]*beta[5] + src[6][x]*beta[6] + src[7][x]*beta[7]);
2093         }
2094     }
2095 };
2096 
2097 
2098 static inline int clip(int x, int a, int b)
2099 {
2100     return x >= a ? (x < b ? x : b-1) : a;
2101 }
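
/* Note: clip() clamps x to the half-open range [a, b): anything below a maps
   to a and anything at or above b maps to b-1, so callers can pass a row or
   column count directly as the upper bound. */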
2102 
2103 static const int MAX_ESIZE=16;
2104 
2105 template <typename HResize, typename VResize>
2106 class resizeGeneric_Invoker :
2107     public ParallelLoopBody
2108 {
2109 public:
2110     typedef typename HResize::value_type T;
2111     typedef typename HResize::buf_type WT;
2112     typedef typename HResize::alpha_type AT;
2113 
2114     resizeGeneric_Invoker(const Mat& _src, Mat &_dst, const int *_xofs, const int *_yofs,
2115         const AT* _alpha, const AT* __beta, const Size& _ssize, const Size &_dsize,
2116         int _ksize, int _xmin, int _xmax) :
2117         ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs), yofs(_yofs),
2118         alpha(_alpha), _beta(__beta), ssize(_ssize), dsize(_dsize),
2119         ksize(_ksize), xmin(_xmin), xmax(_xmax)
2120     {
2121         CV_Assert(ksize <= MAX_ESIZE);
2122     }
2123 
2124     virtual void operator() (const Range& range) const CV_OVERRIDE
2125     {
2126         int dy, cn = src.channels();
2127         HResize hresize;
2128         VResize vresize;
2129 
2130         int bufstep = (int)alignSize(dsize.width, 16);
2131         AutoBuffer<WT> _buffer(bufstep*ksize);
2132         const T* srows[MAX_ESIZE]={0};
2133         WT* rows[MAX_ESIZE]={0};
2134         int prev_sy[MAX_ESIZE];
2135 
2136         for(int k = 0; k < ksize; k++ )
2137         {
2138             prev_sy[k] = -1;
2139             rows[k] = _buffer.data() + bufstep*k;
2140         }
2141 
2142         const AT* beta = _beta + ksize * range.start;
2143 
2144         for( dy = range.start; dy < range.end; dy++, beta += ksize )
2145         {
2146             int sy0 = yofs[dy], k0=ksize, k1=0, ksize2 = ksize/2;
2147 
2148             for(int k = 0; k < ksize; k++ )
2149             {
2150                 int sy = clip(sy0 - ksize2 + 1 + k, 0, ssize.height);
2151                 for( k1 = std::max(k1, k); k1 < ksize; k1++ )
2152                 {
2153                     if( k1 < MAX_ESIZE && sy == prev_sy[k1] ) // if the sy-th row has been computed already, reuse it.
2154                     {
2155                         if( k1 > k )
2156                             memcpy( rows[k], rows[k1], bufstep*sizeof(rows[0][0]) );
2157                         break;
2158                     }
2159                 }
2160                 if( k1 == ksize )
2161                     k0 = std::min(k0, k); // remember the first row that needs to be computed
2162                 srows[k] = src.template ptr<T>(sy);
2163                 prev_sy[k] = sy;
2164             }
2165 
2166             if( k0 < ksize )
2167                 hresize( (const T**)(srows + k0), (WT**)(rows + k0), ksize - k0, xofs, (const AT*)(alpha),
2168                         ssize.width, dsize.width, cn, xmin, xmax );
2169             vresize( (const WT**)rows, (T*)(dst.data + dst.step*dy), beta, dsize.width );
2170         }
2171     }
2172 
2173 private:
2174     Mat src;
2175     Mat dst;
2176     const int* xofs, *yofs;
2177     const AT* alpha, *_beta;
2178     Size ssize, dsize;
2179     const int ksize, xmin, xmax;
2180 
2181     resizeGeneric_Invoker& operator = (const resizeGeneric_Invoker&);
2182 };
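
/* The invoker above caches the last 'ksize' horizontally resized rows in
   rows[]/prev_sy[].  As the vertical interpolation window slides down one
   destination row at a time, source rows still inside the window are reused
   (memcpy'd to their new slot when necessary) and hresize is invoked only for
   the newly exposed rows, so when upscaling each source row is resized at
   most once per stripe. */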
2183 
2184 template<class HResize, class VResize>
2185 static void resizeGeneric_( const Mat& src, Mat& dst,
2186                             const int* xofs, const void* _alpha,
2187                             const int* yofs, const void* _beta,
2188                             int xmin, int xmax, int ksize )
2189 {
2190     typedef typename HResize::alpha_type AT;
2191 
2192     const AT* beta = (const AT*)_beta;
2193     Size ssize = src.size(), dsize = dst.size();
2194     int cn = src.channels();
2195     ssize.width *= cn;
2196     dsize.width *= cn;
2197     xmin *= cn;
2198     xmax *= cn;
2199     // Image resize is a separable operation: rows are interpolated horizontally first, then the results are interpolated vertically.
2200 
2201     Range range(0, dsize.height);
2202     resizeGeneric_Invoker<HResize, VResize> invoker(src, dst, xofs, yofs, (const AT*)_alpha, beta,
2203         ssize, dsize, ksize, xmin, xmax);
2204     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
2205 }
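
/* Widths, xmin and xmax are multiplied by the channel count above so the
   per-element kernels can treat an interleaved multi-channel row as one flat
   array; the xofs[] table is pre-scaled the same way by the caller (note that
   the kernels index the neighboring tap as S[sx + cn]). */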
2206 
2207 template <typename T, typename WT>
2208 struct ResizeAreaFastNoVec
2209 {
2210     ResizeAreaFastNoVec(int, int) { }
2211     ResizeAreaFastNoVec(int, int, int, int) { }
2212     int operator() (const T*, T*, int) const
2213     { return 0; }
2214 };
2215 
2216 #if CV_NEON
2217 
2218 class ResizeAreaFastVec_SIMD_8u
2219 {
2220 public:
2221     ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
2222         cn(_cn), step(_step)
2223     {
2224     }
2225 
2226     int operator() (const uchar* S, uchar* D, int w) const
2227     {
2228         int dx = 0;
2229         const uchar* S0 = S, * S1 = S0 + step;
2230 
2231         uint16x8_t v_2 = vdupq_n_u16(2);
2232 
2233         if (cn == 1)
2234         {
2235             for ( ; dx <= w - 16; dx += 16, S0 += 32, S1 += 32, D += 16)
2236             {
2237                 uint8x16x2_t v_row0 = vld2q_u8(S0), v_row1 = vld2q_u8(S1);
2238 
2239                 uint16x8_t v_dst0 = vaddl_u8(vget_low_u8(v_row0.val[0]), vget_low_u8(v_row0.val[1]));
2240                 v_dst0 = vaddq_u16(v_dst0, vaddl_u8(vget_low_u8(v_row1.val[0]), vget_low_u8(v_row1.val[1])));
2241                 v_dst0 = vshrq_n_u16(vaddq_u16(v_dst0, v_2), 2);
2242 
2243                 uint16x8_t v_dst1 = vaddl_u8(vget_high_u8(v_row0.val[0]), vget_high_u8(v_row0.val[1]));
2244                 v_dst1 = vaddq_u16(v_dst1, vaddl_u8(vget_high_u8(v_row1.val[0]), vget_high_u8(v_row1.val[1])));
2245                 v_dst1 = vshrq_n_u16(vaddq_u16(v_dst1, v_2), 2);
2246 
2247                 vst1q_u8(D, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1)));
2248             }
2249         }
2250         else if (cn == 4)
2251         {
2252             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
2253             {
2254                 uint8x16_t v_row0 = vld1q_u8(S0), v_row1 = vld1q_u8(S1);
2255 
2256                 uint16x8_t v_row00 = vmovl_u8(vget_low_u8(v_row0));
2257                 uint16x8_t v_row01 = vmovl_u8(vget_high_u8(v_row0));
2258                 uint16x8_t v_row10 = vmovl_u8(vget_low_u8(v_row1));
2259                 uint16x8_t v_row11 = vmovl_u8(vget_high_u8(v_row1));
2260 
2261                 uint16x4_t v_p0 = vadd_u16(vadd_u16(vget_low_u16(v_row00), vget_high_u16(v_row00)),
2262                                            vadd_u16(vget_low_u16(v_row10), vget_high_u16(v_row10)));
2263                 uint16x4_t v_p1 = vadd_u16(vadd_u16(vget_low_u16(v_row01), vget_high_u16(v_row01)),
2264                                            vadd_u16(vget_low_u16(v_row11), vget_high_u16(v_row11)));
2265                 uint16x8_t v_dst = vshrq_n_u16(vaddq_u16(vcombine_u16(v_p0, v_p1), v_2), 2);
2266 
2267                 vst1_u8(D, vmovn_u16(v_dst));
2268             }
2269         }
2270 
2271         return dx;
2272     }
2273 
2274 private:
2275     int cn, step;
2276 };
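
/* This class and the ResizeAreaFastVec_SIMD_* variants that follow implement
   the fast INTER_AREA 2x downscale: every output pixel is the rounded average
   (p00 + p01 + p10 + p11 + 2) >> 2 of a 2x2 source block (a 0.25f multiply in
   the float case).  The cn==1 paths de-interleave even/odd columns with the
   two-way interleaved loads, while the cn==4 paths sum whole 4-channel pixels. */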
2277 
2278 class ResizeAreaFastVec_SIMD_16u
2279 {
2280 public:
2281     ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
2282         cn(_cn), step(_step)
2283     {
2284     }
2285 
2286     int operator() (const ushort * S, ushort * D, int w) const
2287     {
2288         int dx = 0;
2289         const ushort * S0 = S, * S1 = (const ushort *)((const uchar *)(S0) + step);
2290 
2291         uint32x4_t v_2 = vdupq_n_u32(2);
2292 
2293         if (cn == 1)
2294         {
2295             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
2296             {
2297                 uint16x8x2_t v_row0 = vld2q_u16(S0), v_row1 = vld2q_u16(S1);
2298 
2299                 uint32x4_t v_dst0 = vaddl_u16(vget_low_u16(v_row0.val[0]), vget_low_u16(v_row0.val[1]));
2300                 v_dst0 = vaddq_u32(v_dst0, vaddl_u16(vget_low_u16(v_row1.val[0]), vget_low_u16(v_row1.val[1])));
2301                 v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_2), 2);
2302 
2303                 uint32x4_t v_dst1 = vaddl_u16(vget_high_u16(v_row0.val[0]), vget_high_u16(v_row0.val[1]));
2304                 v_dst1 = vaddq_u32(v_dst1, vaddl_u16(vget_high_u16(v_row1.val[0]), vget_high_u16(v_row1.val[1])));
2305                 v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_2), 2);
2306 
2307                 vst1q_u16(D, vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1)));
2308             }
2309         }
2310         else if (cn == 4)
2311         {
2312             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
2313             {
2314                 uint16x8_t v_row0 = vld1q_u16(S0), v_row1 = vld1q_u16(S1);
2315                 uint32x4_t v_dst = vaddq_u32(vaddl_u16(vget_low_u16(v_row0), vget_high_u16(v_row0)),
2316                                              vaddl_u16(vget_low_u16(v_row1), vget_high_u16(v_row1)));
2317                 vst1_u16(D, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_2), 2)));
2318             }
2319         }
2320 
2321         return dx;
2322     }
2323 
2324 private:
2325     int cn, step;
2326 };
2327 
2328 class ResizeAreaFastVec_SIMD_16s
2329 {
2330 public:
2331     ResizeAreaFastVec_SIMD_16s(int _cn, int _step) :
2332         cn(_cn), step(_step)
2333     {
2334     }
2335 
2336     int operator() (const short * S, short * D, int w) const
2337     {
2338         int dx = 0;
2339         const short * S0 = S, * S1 = (const short *)((const uchar *)(S0) + step);
2340 
2341         int32x4_t v_2 = vdupq_n_s32(2);
2342 
2343         if (cn == 1)
2344         {
2345             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
2346             {
2347                 int16x8x2_t v_row0 = vld2q_s16(S0), v_row1 = vld2q_s16(S1);
2348 
2349                 int32x4_t v_dst0 = vaddl_s16(vget_low_s16(v_row0.val[0]), vget_low_s16(v_row0.val[1]));
2350                 v_dst0 = vaddq_s32(v_dst0, vaddl_s16(vget_low_s16(v_row1.val[0]), vget_low_s16(v_row1.val[1])));
2351                 v_dst0 = vshrq_n_s32(vaddq_s32(v_dst0, v_2), 2);
2352 
2353                 int32x4_t v_dst1 = vaddl_s16(vget_high_s16(v_row0.val[0]), vget_high_s16(v_row0.val[1]));
2354                 v_dst1 = vaddq_s32(v_dst1, vaddl_s16(vget_high_s16(v_row1.val[0]), vget_high_s16(v_row1.val[1])));
2355                 v_dst1 = vshrq_n_s32(vaddq_s32(v_dst1, v_2), 2);
2356 
2357                 vst1q_s16(D, vcombine_s16(vmovn_s32(v_dst0), vmovn_s32(v_dst1)));
2358             }
2359         }
2360         else if (cn == 4)
2361         {
2362             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
2363             {
2364                 int16x8_t v_row0 = vld1q_s16(S0), v_row1 = vld1q_s16(S1);
2365                 int32x4_t v_dst = vaddq_s32(vaddl_s16(vget_low_s16(v_row0), vget_high_s16(v_row0)),
2366                                             vaddl_s16(vget_low_s16(v_row1), vget_high_s16(v_row1)));
2367                 vst1_s16(D, vmovn_s32(vshrq_n_s32(vaddq_s32(v_dst, v_2), 2)));
2368             }
2369         }
2370 
2371         return dx;
2372     }
2373 
2374 private:
2375     int cn, step;
2376 };
2377 
2378 struct ResizeAreaFastVec_SIMD_32f
2379 {
2380     ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
2381         cn(_cn), step(_step)
2382     {
2383         fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
2384     }
2385 
2386     int operator() (const float * S, float * D, int w) const
2387     {
2388         if (!fast_mode)
2389             return 0;
2390 
2391         const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step);
2392         int dx = 0;
2393 
2394         float32x4_t v_025 = vdupq_n_f32(0.25f);
2395 
2396         if (cn == 1)
2397         {
2398             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
2399             {
2400                 float32x4x2_t v_row0 = vld2q_f32(S0), v_row1 = vld2q_f32(S1);
2401 
2402                 float32x4_t v_dst0 = vaddq_f32(v_row0.val[0], v_row0.val[1]);
2403                 float32x4_t v_dst1 = vaddq_f32(v_row1.val[0], v_row1.val[1]);
2404 
2405                 vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025));
2406             }
2407         }
2408         else if (cn == 4)
2409         {
2410             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
2411             {
2412                 float32x4_t v_dst0 = vaddq_f32(vld1q_f32(S0), vld1q_f32(S0 + 4));
2413                 float32x4_t v_dst1 = vaddq_f32(vld1q_f32(S1), vld1q_f32(S1 + 4));
2414 
2415                 vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025));
2416             }
2417         }
2418 
2419         return dx;
2420     }
2421 
2422 private:
2423     int cn;
2424     bool fast_mode;
2425     int step;
2426 };
2427 
2428 #elif CV_SIMD
2429 
2430 class ResizeAreaFastVec_SIMD_8u
2431 {
2432 public:
2433     ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
2434         cn(_cn), step(_step) {}
2435 
2436     int operator() (const uchar* S, uchar* D, int w) const
2437     {
2438         int dx = 0;
2439         const uchar* S0 = S;
2440         const uchar* S1 = S0 + step;
2441 
2442         if (cn == 1)
2443         {
2444             v_uint16 masklow = vx_setall_u16(0x00ff);
2445             for ( ; dx <= w - v_uint16::nlanes; dx += v_uint16::nlanes, S0 += v_uint8::nlanes, S1 += v_uint8::nlanes, D += v_uint16::nlanes)
2446             {
2447                 v_uint16 r0 = v_reinterpret_as_u16(vx_load(S0));
2448                 v_uint16 r1 = v_reinterpret_as_u16(vx_load(S1));
2449                 v_rshr_pack_store<2>(D, (r0 >> 8) + (r0 & masklow) + (r1 >> 8) + (r1 & masklow));
2450             }
2451         }
2452         else if (cn == 3)
2453         {
2454             if (CV_SIMD_WIDTH > 64)
2455                 return 0;
2456             for ( ; dx <= w - 3*v_uint8::nlanes; dx += 3*v_uint8::nlanes, S0 += 6*v_uint8::nlanes, S1 += 6*v_uint8::nlanes, D += 3*v_uint8::nlanes)
2457             {
2458                 v_uint16 t0, t1, t2, t3, t4, t5;
2459                 v_uint16 s0, s1, s2, s3, s4, s5;
2460                 s0 = vx_load_expand(S0                     ) + vx_load_expand(S1                     );
2461                 s1 = vx_load_expand(S0 +   v_uint16::nlanes) + vx_load_expand(S1 +   v_uint16::nlanes);
2462                 s2 = vx_load_expand(S0 + 2*v_uint16::nlanes) + vx_load_expand(S1 + 2*v_uint16::nlanes);
2463                 s3 = vx_load_expand(S0 + 3*v_uint16::nlanes) + vx_load_expand(S1 + 3*v_uint16::nlanes);
2464                 s4 = vx_load_expand(S0 + 4*v_uint16::nlanes) + vx_load_expand(S1 + 4*v_uint16::nlanes);
2465                 s5 = vx_load_expand(S0 + 5*v_uint16::nlanes) + vx_load_expand(S1 + 5*v_uint16::nlanes);
2466                 v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
2467                 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
2468                 v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
2469                 v_uint16 bl, gl, rl;
2470 #if CV_SIMD_WIDTH == 16
2471                 bl = t0 + t3; gl = t1 + t4; rl = t2 + t5;
2472 #elif CV_SIMD_WIDTH == 32
2473                 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
2474                 bl = s0 + s3; gl = s1 + s4; rl = s2 + s5;
2475 #elif CV_SIMD_WIDTH == 64
2476                 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
2477                 v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
2478                 bl = t0 + t3; gl = t1 + t4; rl = t2 + t5;
2479 #endif
2480                 s0 = vx_load_expand(S0 + 6*v_uint16::nlanes) + vx_load_expand(S1 + 6*v_uint16::nlanes);
2481                 s1 = vx_load_expand(S0 + 7*v_uint16::nlanes) + vx_load_expand(S1 + 7*v_uint16::nlanes);
2482                 s2 = vx_load_expand(S0 + 8*v_uint16::nlanes) + vx_load_expand(S1 + 8*v_uint16::nlanes);
2483                 s3 = vx_load_expand(S0 + 9*v_uint16::nlanes) + vx_load_expand(S1 + 9*v_uint16::nlanes);
2484                 s4 = vx_load_expand(S0 +10*v_uint16::nlanes) + vx_load_expand(S1 +10*v_uint16::nlanes);
2485                 s5 = vx_load_expand(S0 +11*v_uint16::nlanes) + vx_load_expand(S1 +11*v_uint16::nlanes);
2486                 v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
2487                 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
2488                 v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
2489                 v_uint16 bh, gh, rh;
2490 #if CV_SIMD_WIDTH == 16
2491                 bh = t0 + t3; gh = t1 + t4; rh = t2 + t5;
2492 #elif CV_SIMD_WIDTH == 32
2493                 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
2494                 bh = s0 + s3; gh = s1 + s4; rh = s2 + s5;
2495 #elif CV_SIMD_WIDTH == 64
2496                 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
2497                 v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
2498                 bh = t0 + t3; gh = t1 + t4; rh = t2 + t5;
2499 #endif
2500                 v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh));
2501             }
2502         }
2503         else
2504         {
2505             CV_Assert(cn == 4);
2506             for ( ; dx <= w - v_uint8::nlanes; dx += v_uint8::nlanes, S0 += 2*v_uint8::nlanes, S1 += 2*v_uint8::nlanes, D += v_uint8::nlanes)
2507             {
2508                 v_uint32 r00, r01, r10, r11;
2509                 v_load_deinterleave((uint32_t*)S0, r00, r01);
2510                 v_load_deinterleave((uint32_t*)S1, r10, r11);
2511 
2512                 v_uint16 r00l, r01l, r10l, r11l, r00h, r01h, r10h, r11h;
2513                 v_expand(v_reinterpret_as_u8(r00), r00l, r00h);
2514                 v_expand(v_reinterpret_as_u8(r01), r01l, r01h);
2515                 v_expand(v_reinterpret_as_u8(r10), r10l, r10h);
2516                 v_expand(v_reinterpret_as_u8(r11), r11l, r11h);
2517                 v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h));
2518             }
2519         }
2520 
2521         return dx;
2522     }
2523 
2524 private:
2525     int cn;
2526     int step;
2527 };
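
// A scalar sketch (not part of the build) of what the cn == 1 branch above
// computes: reinterpreting pairs of 8-bit pixels as 16-bit lanes makes
// (r >> 8) the odd pixels and (r & 0x00ff) the even pixels of a row, so each
// lane accumulates a full 2x2 window and v_rshr_pack_store<2> applies the
// rounded average (sum + 2) >> 2:
//
//   for (int x = 0; x < w; x++)
//       D[x] = (uchar)((S0[2*x] + S0[2*x+1] + S1[2*x] + S1[2*x+1] + 2) >> 2);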

class ResizeAreaFastVec_SIMD_16u
{
public:
    ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
        cn(_cn), step(_step) {}

    int operator() (const ushort* S, ushort* D, int w) const
    {
        int dx = 0;
        const ushort* S0 = (const ushort*)S;
        const ushort* S1 = (const ushort*)((const uchar*)(S) + step);

        if (cn == 1)
        {
            v_uint32 masklow = vx_setall_u32(0x0000ffff);
            for (; dx <= w - v_uint32::nlanes; dx += v_uint32::nlanes, S0 += v_uint16::nlanes, S1 += v_uint16::nlanes, D += v_uint32::nlanes)
            {
                v_uint32 r0 = v_reinterpret_as_u32(vx_load(S0));
                v_uint32 r1 = v_reinterpret_as_u32(vx_load(S1));
                v_rshr_pack_store<2>(D, (r0 >> 16) + (r0 & masklow) + (r1 >> 16) + (r1 & masklow));
            }
        }
        else if (cn == 3)
        {
#if CV_SIMD_WIDTH == 16
            for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
#if CV_SSE4_1
            {
                v_uint32 r0, r1, r2, r3;
                v_expand(vx_load(S0), r0, r1);
                v_expand(vx_load(S1), r2, r3);
                r0 += r2; r1 += r3;
                v_rshr_pack_store<2>(D, r0 + v_rotate_left<1>(r1, r0));
            }
#else
                v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3));
#endif
#elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64
            for ( ; dx <= w - 3*v_uint16::nlanes; dx += 3*v_uint16::nlanes, S0 += 6*v_uint16::nlanes, S1 += 6*v_uint16::nlanes, D += 3*v_uint16::nlanes)
            {
                v_uint32 t0, t1, t2, t3, t4, t5;
                v_uint32 s0, s1, s2, s3, s4, s5;
                s0 = vx_load_expand(S0                     ) + vx_load_expand(S1                     );
                s1 = vx_load_expand(S0 +   v_uint32::nlanes) + vx_load_expand(S1 +   v_uint32::nlanes);
                s2 = vx_load_expand(S0 + 2*v_uint32::nlanes) + vx_load_expand(S1 + 2*v_uint32::nlanes);
                s3 = vx_load_expand(S0 + 3*v_uint32::nlanes) + vx_load_expand(S1 + 3*v_uint32::nlanes);
                s4 = vx_load_expand(S0 + 4*v_uint32::nlanes) + vx_load_expand(S1 + 4*v_uint32::nlanes);
                s5 = vx_load_expand(S0 + 5*v_uint32::nlanes) + vx_load_expand(S1 + 5*v_uint32::nlanes);
                v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
                v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                v_uint32 bl, gl, rl;
                v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
#if CV_SIMD_WIDTH == 32
                bl = t0 + t3; gl = t1 + t4; rl = t2 + t5;
#else //CV_SIMD_WIDTH == 64
                v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                bl = s0 + s3; gl = s1 + s4; rl = s2 + s5;
#endif
                s0 = vx_load_expand(S0 + 6*v_uint32::nlanes) + vx_load_expand(S1 + 6*v_uint32::nlanes);
                s1 = vx_load_expand(S0 + 7*v_uint32::nlanes) + vx_load_expand(S1 + 7*v_uint32::nlanes);
                s2 = vx_load_expand(S0 + 8*v_uint32::nlanes) + vx_load_expand(S1 + 8*v_uint32::nlanes);
                s3 = vx_load_expand(S0 + 9*v_uint32::nlanes) + vx_load_expand(S1 + 9*v_uint32::nlanes);
                s4 = vx_load_expand(S0 +10*v_uint32::nlanes) + vx_load_expand(S1 +10*v_uint32::nlanes);
                s5 = vx_load_expand(S0 +11*v_uint32::nlanes) + vx_load_expand(S1 +11*v_uint32::nlanes);
                v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
                v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                v_uint32 bh, gh, rh;
                v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
#if CV_SIMD_WIDTH == 32
                bh = t0 + t3; gh = t1 + t4; rh = t2 + t5;
#else //CV_SIMD_WIDTH == 64
                v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                bh = s0 + s3; gh = s1 + s4; rh = s2 + s5;
#endif
                v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh));
            }
#elif CV_SIMD_WIDTH >= 64
            v_uint32 masklow = vx_setall_u32(0x0000ffff);
            for ( ; dx <= w - 3*v_uint16::nlanes; dx += 3*v_uint16::nlanes, S0 += 6*v_uint16::nlanes, S1 += 6*v_uint16::nlanes, D += 3*v_uint16::nlanes)
            {
                v_uint16 b0, g0, r0, b1, g1, r1;
                v_load_deinterleave(S0, b0, g0, r0);
                v_load_deinterleave(S1, b1, g1, r1);
                v_uint32 bl = (v_reinterpret_as_u32(b0) >> 16) + (v_reinterpret_as_u32(b0) & masklow) + (v_reinterpret_as_u32(b1) >> 16) + (v_reinterpret_as_u32(b1) & masklow);
                v_uint32 gl = (v_reinterpret_as_u32(g0) >> 16) + (v_reinterpret_as_u32(g0) & masklow) + (v_reinterpret_as_u32(g1) >> 16) + (v_reinterpret_as_u32(g1) & masklow);
                v_uint32 rl = (v_reinterpret_as_u32(r0) >> 16) + (v_reinterpret_as_u32(r0) & masklow) + (v_reinterpret_as_u32(r1) >> 16) + (v_reinterpret_as_u32(r1) & masklow);
                v_load_deinterleave(S0 + 3*v_uint16::nlanes, b0, g0, r0);
                v_load_deinterleave(S1 + 3*v_uint16::nlanes, b1, g1, r1);
                v_uint32 bh = (v_reinterpret_as_u32(b0) >> 16) + (v_reinterpret_as_u32(b0) & masklow) + (v_reinterpret_as_u32(b1) >> 16) + (v_reinterpret_as_u32(b1) & masklow);
                v_uint32 gh = (v_reinterpret_as_u32(g0) >> 16) + (v_reinterpret_as_u32(g0) & masklow) + (v_reinterpret_as_u32(g1) >> 16) + (v_reinterpret_as_u32(g1) & masklow);
                v_uint32 rh = (v_reinterpret_as_u32(r0) >> 16) + (v_reinterpret_as_u32(r0) & masklow) + (v_reinterpret_as_u32(r1) >> 16) + (v_reinterpret_as_u32(r1) & masklow);
                v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh));
            }
#endif
        }
        else
        {
            CV_Assert(cn == 4);
#if CV_SIMD_WIDTH >= 64
            for ( ; dx <= w - v_uint16::nlanes; dx += v_uint16::nlanes, S0 += 2*v_uint16::nlanes, S1 += 2*v_uint16::nlanes, D += v_uint16::nlanes)
            {
                v_uint64 r00, r01, r10, r11;
                v_load_deinterleave((uint64_t*)S0, r00, r01);
                v_load_deinterleave((uint64_t*)S1, r10, r11);

                v_uint32 r00l, r01l, r10l, r11l, r00h, r01h, r10h, r11h;
                v_expand(v_reinterpret_as_u16(r00), r00l, r00h);
                v_expand(v_reinterpret_as_u16(r01), r01l, r01h);
                v_expand(v_reinterpret_as_u16(r10), r10l, r10h);
                v_expand(v_reinterpret_as_u16(r11), r11l, r11h);
                v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h));
            }
#else
            for ( ; dx <= w - v_uint32::nlanes; dx += v_uint32::nlanes, S0 += v_uint16::nlanes, S1 += v_uint16::nlanes, D += v_uint32::nlanes)
            {
                v_uint32 r0, r1, r2, r3;
                v_expand(vx_load(S0), r0, r1);
                v_expand(vx_load(S1), r2, r3);
                r0 += r2; r1 += r3;
                v_uint32 v_d;
#if CV_SIMD_WIDTH == 16
                v_d = r0 + r1;
#elif CV_SIMD_WIDTH == 32
                v_uint32 t0, t1;
                v_recombine(r0, r1, t0, t1);
                v_d = t0 + t1;
#endif
                v_rshr_pack_store<2>(D, v_d);
            }
#endif
        }

        return dx;
    }

private:
    int cn;
    int step;
};
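
// Note on the cn == 3 branches above: the interleaved B,G,R samples must be
// regrouped so that same-channel neighbors land in the same lane before the
// horizontal add. Depending on CV_SIMD_WIDTH this is done either with a
// v_zip shuffle network over the six partial-sum vectors or, for very wide
// registers, with v_load_deinterleave plus the same shift/mask trick as the
// single-channel case; the arithmetic is identical in all variants.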

class ResizeAreaFastVec_SIMD_16s
{
public:
    ResizeAreaFastVec_SIMD_16s(int _cn, int _step) :
        cn(_cn), step(_step) {}

    int operator() (const short* S, short* D, int w) const
    {
        int dx = 0;
        const short* S0 = (const short*)S;
        const short* S1 = (const short*)((const uchar*)(S) + step);

        if (cn == 1)
        {
            v_int32 masklow = vx_setall_s32(0x0000ffff);
            for (; dx <= w - v_int32::nlanes; dx += v_int32::nlanes, S0 += v_int16::nlanes, S1 += v_int16::nlanes, D += v_int32::nlanes)
            {
                v_int32 r0 = v_reinterpret_as_s32(vx_load(S0));
                v_int32 r1 = v_reinterpret_as_s32(vx_load(S1));
                v_rshr_pack_store<2>(D, (r0 >> 16) + (((r0 & masklow)<<16)>>16) + (r1 >> 16) + (((r1 & masklow)<<16)>>16));
            }
        }
        else if (cn == 3)
        {
#if CV_SIMD_WIDTH == 16
            for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
                v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3));
#elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64
            for ( ; dx <= w - 3*v_int16::nlanes; dx += 3*v_int16::nlanes, S0 += 6*v_int16::nlanes, S1 += 6*v_int16::nlanes, D += 3*v_int16::nlanes)
            {
                v_int32 t0, t1, t2, t3, t4, t5;
                v_int32 s0, s1, s2, s3, s4, s5;
                s0 = vx_load_expand(S0                    ) + vx_load_expand(S1                    );
                s1 = vx_load_expand(S0 +   v_int32::nlanes) + vx_load_expand(S1 +   v_int32::nlanes);
                s2 = vx_load_expand(S0 + 2*v_int32::nlanes) + vx_load_expand(S1 + 2*v_int32::nlanes);
                s3 = vx_load_expand(S0 + 3*v_int32::nlanes) + vx_load_expand(S1 + 3*v_int32::nlanes);
                s4 = vx_load_expand(S0 + 4*v_int32::nlanes) + vx_load_expand(S1 + 4*v_int32::nlanes);
                s5 = vx_load_expand(S0 + 5*v_int32::nlanes) + vx_load_expand(S1 + 5*v_int32::nlanes);
                v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
                v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                v_int32 bl, gl, rl;
                v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
#if CV_SIMD_WIDTH == 32
                bl = t0 + t3; gl = t1 + t4; rl = t2 + t5;
#else //CV_SIMD_WIDTH == 64
                v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                bl = s0 + s3; gl = s1 + s4; rl = s2 + s5;
#endif
                s0 = vx_load_expand(S0 + 6*v_int32::nlanes) + vx_load_expand(S1 + 6*v_int32::nlanes);
                s1 = vx_load_expand(S0 + 7*v_int32::nlanes) + vx_load_expand(S1 + 7*v_int32::nlanes);
                s2 = vx_load_expand(S0 + 8*v_int32::nlanes) + vx_load_expand(S1 + 8*v_int32::nlanes);
                s3 = vx_load_expand(S0 + 9*v_int32::nlanes) + vx_load_expand(S1 + 9*v_int32::nlanes);
                s4 = vx_load_expand(S0 +10*v_int32::nlanes) + vx_load_expand(S1 +10*v_int32::nlanes);
                s5 = vx_load_expand(S0 +11*v_int32::nlanes) + vx_load_expand(S1 +11*v_int32::nlanes);
                v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
                v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                v_int32 bh, gh, rh;
                v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
#if CV_SIMD_WIDTH == 32
                bh = t0 + t3; gh = t1 + t4; rh = t2 + t5;
#else //CV_SIMD_WIDTH == 64
                v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                bh = s0 + s3; gh = s1 + s4; rh = s2 + s5;
#endif
                v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh));
            }
#elif CV_SIMD_WIDTH >= 64
            for ( ; dx <= w - 3*v_int16::nlanes; dx += 3*v_int16::nlanes, S0 += 6*v_int16::nlanes, S1 += 6*v_int16::nlanes, D += 3*v_int16::nlanes)
            {
                v_int16 b0, g0, r0, b1, g1, r1;
                v_load_deinterleave(S0, b0, g0, r0);
                v_load_deinterleave(S1, b1, g1, r1);
                v_int32 bl = (v_reinterpret_as_s32(b0) >> 16) + ((v_reinterpret_as_s32(b0) << 16) >> 16) + (v_reinterpret_as_s32(b1) >> 16) + ((v_reinterpret_as_s32(b1) << 16) >> 16);
                v_int32 gl = (v_reinterpret_as_s32(g0) >> 16) + ((v_reinterpret_as_s32(g0) << 16) >> 16) + (v_reinterpret_as_s32(g1) >> 16) + ((v_reinterpret_as_s32(g1) << 16) >> 16);
                v_int32 rl = (v_reinterpret_as_s32(r0) >> 16) + ((v_reinterpret_as_s32(r0) << 16) >> 16) + (v_reinterpret_as_s32(r1) >> 16) + ((v_reinterpret_as_s32(r1) << 16) >> 16);
                v_load_deinterleave(S0 + 3*v_int16::nlanes, b0, g0, r0);
                v_load_deinterleave(S1 + 3*v_int16::nlanes, b1, g1, r1);
                v_int32 bh = (v_reinterpret_as_s32(b0) >> 16) + ((v_reinterpret_as_s32(b0) << 16) >> 16) + (v_reinterpret_as_s32(b1) >> 16) + ((v_reinterpret_as_s32(b1) << 16) >> 16);
                v_int32 gh = (v_reinterpret_as_s32(g0) >> 16) + ((v_reinterpret_as_s32(g0) << 16) >> 16) + (v_reinterpret_as_s32(g1) >> 16) + ((v_reinterpret_as_s32(g1) << 16) >> 16);
                v_int32 rh = (v_reinterpret_as_s32(r0) >> 16) + ((v_reinterpret_as_s32(r0) << 16) >> 16) + (v_reinterpret_as_s32(r1) >> 16) + ((v_reinterpret_as_s32(r1) << 16) >> 16);
                v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh));
            }
#endif
        }
        else
        {
            CV_Assert(cn == 4);
            for (; dx <= w - v_int16::nlanes; dx += v_int16::nlanes, S0 += 2 * v_int16::nlanes, S1 += 2 * v_int16::nlanes, D += v_int16::nlanes)
            {
#if CV_SIMD_WIDTH >= 64
                v_int64 r00, r01, r10, r11;
                v_load_deinterleave((int64_t*)S0, r00, r01);
                v_load_deinterleave((int64_t*)S1, r10, r11);

                v_int32 r00l, r01l, r10l, r11l, r00h, r01h, r10h, r11h;
                v_expand(v_reinterpret_as_s16(r00), r00l, r00h);
                v_expand(v_reinterpret_as_s16(r01), r01l, r01h);
                v_expand(v_reinterpret_as_s16(r10), r10l, r10h);
                v_expand(v_reinterpret_as_s16(r11), r11l, r11h);
                v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h));
#else
                v_int32 r0, r1, r2, r3;
                r0 = vx_load_expand(S0                    ) + vx_load_expand(S1                    );
                r1 = vx_load_expand(S0 +   v_int32::nlanes) + vx_load_expand(S1 +   v_int32::nlanes);
                r2 = vx_load_expand(S0 + 2*v_int32::nlanes) + vx_load_expand(S1 + 2*v_int32::nlanes);
                r3 = vx_load_expand(S0 + 3*v_int32::nlanes) + vx_load_expand(S1 + 3*v_int32::nlanes);
                v_int32 dl, dh;
#if CV_SIMD_WIDTH == 16
                dl = r0 + r1; dh = r2 + r3;
#elif CV_SIMD_WIDTH == 32
                v_int32 t0, t1, t2, t3;
                v_recombine(r0, r1, t0, t1); v_recombine(r2, r3, t2, t3);
                dl = t0 + t1; dh = t2 + t3;
#endif
                v_store(D, v_rshr_pack<2>(dl, dh));
#endif
            }
        }

        return dx;
    }

private:
    int cn;
    int step;
};
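
// Note on the signed 16-bit case above: the even pixels cannot be extracted
// with a plain mask because the partial sums must keep their sign. With an
// arithmetic shift, ((r << 16) >> 16) sign-extends the low int16 of each
// 32-bit lane; e.g. a lane holding the int16 pair (-3, 5) contributes
// (r >> 16) = 5 plus ((r << 16) >> 16) = -3, as intended.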

struct ResizeAreaFastVec_SIMD_32f
{
    ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
        cn(_cn), step(_step)
    {
        fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
    }

    int operator() (const float * S, float * D, int w) const
    {
        if (!fast_mode)
            return 0;

        const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step);
        int dx = 0;

        if (cn == 1)
        {
            v_float32 v_025 = vx_setall_f32(0.25f);
            for ( ; dx <= w - v_float32::nlanes; dx += v_float32::nlanes, S0 += 2*v_float32::nlanes, S1 += 2*v_float32::nlanes, D += v_float32::nlanes)
            {
                v_float32 v_row00, v_row01, v_row10, v_row11;
                v_load_deinterleave(S0, v_row00, v_row01);
                v_load_deinterleave(S1, v_row10, v_row11);
                v_store(D, ((v_row00 + v_row01) + (v_row10 + v_row11)) * v_025);
            }
        }
        else if (cn == 4)
        {
#if CV_SIMD_WIDTH == 16
            v_float32 v_025 = vx_setall_f32(0.25f);
            for (; dx <= w - v_float32::nlanes; dx += v_float32::nlanes, S0 += 2*v_float32::nlanes, S1 += 2*v_float32::nlanes, D += v_float32::nlanes)
                v_store(D, ((vx_load(S0) + vx_load(S0 + v_float32::nlanes)) + (vx_load(S1) + vx_load(S1 + v_float32::nlanes))) * v_025);
#elif CV_SIMD256
            v_float32x8 v_025 = v256_setall_f32(0.25f);
            for (; dx <= w - v_float32x8::nlanes; dx += v_float32x8::nlanes, S0 += 2*v_float32x8::nlanes, S1 += 2*v_float32x8::nlanes, D += v_float32x8::nlanes)
            {
                v_float32x8 dst0, dst1;
                v_recombine(v256_load(S0) + v256_load(S1), v256_load(S0 + v_float32x8::nlanes) + v256_load(S1 + v_float32x8::nlanes), dst0, dst1);
                v_store(D, (dst0 + dst1) * v_025);
            }
#endif
        }

        return dx;
    }

private:
    int cn;
    bool fast_mode;
    int step;
};
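
// Illustration (not part of the build): the float path needs no rounding
// tricks; after v_load_deinterleave splits each row into even and odd pixels,
// every destination value is
//
//   D[x] = 0.25f * (S0[2*x] + S0[2*x+1] + S1[2*x] + S1[2*x+1]);
//
// matching the scalar INTER_AREA result for a 2x downscale up to
// floating-point summation order.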

#else

typedef ResizeAreaFastNoVec<uchar, uchar> ResizeAreaFastVec_SIMD_8u;
typedef ResizeAreaFastNoVec<ushort, ushort> ResizeAreaFastVec_SIMD_16u;
typedef ResizeAreaFastNoVec<short, short> ResizeAreaFastVec_SIMD_16s;
typedef ResizeAreaFastNoVec<float, float> ResizeAreaFastVec_SIMD_32f;

#endif

template<typename T, typename SIMDVecOp>
struct ResizeAreaFastVec
{
    ResizeAreaFastVec(int _scale_x, int _scale_y, int _cn, int _step) :
        scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step), vecOp(_cn, _step)
    {
        fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
    }

    int operator() (const T* S, T* D, int w) const
    {
        if (!fast_mode)
            return 0;

        const T* nextS = (const T*)((const uchar*)S + step);
        int dx = vecOp(S, D, w);

        if (cn == 1)
            for( ; dx < w; ++dx )
            {
                int index = dx*2;
                D[dx] = (T)((S[index] + S[index+1] + nextS[index] + nextS[index+1] + 2) >> 2);
            }
        else if (cn == 3)
            for( ; dx < w; dx += 3 )
            {
                int index = dx*2;
                D[dx] = (T)((S[index] + S[index+3] + nextS[index] + nextS[index+3] + 2) >> 2);
                D[dx+1] = (T)((S[index+1] + S[index+4] + nextS[index+1] + nextS[index+4] + 2) >> 2);
                D[dx+2] = (T)((S[index+2] + S[index+5] + nextS[index+2] + nextS[index+5] + 2) >> 2);
            }
        else
            {
                CV_Assert(cn == 4);
                for( ; dx < w; dx += 4 )
                {
                    int index = dx*2;
                    D[dx] = (T)((S[index] + S[index+4] + nextS[index] + nextS[index+4] + 2) >> 2);
                    D[dx+1] = (T)((S[index+1] + S[index+5] + nextS[index+1] + nextS[index+5] + 2) >> 2);
                    D[dx+2] = (T)((S[index+2] + S[index+6] + nextS[index+2] + nextS[index+6] + 2) >> 2);
                    D[dx+3] = (T)((S[index+3] + S[index+7] + nextS[index+3] + nextS[index+7] + 2) >> 2);
                }
            }

        return dx;
    }

private:
    int scale_x, scale_y;
    int cn;
    bool fast_mode;
    int step;
    SIMDVecOp vecOp;
};
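
// Illustration (not part of the build): ResizeAreaFastVec lets the SIMD
// functor consume as much of the row as it can, then finishes with the scalar
// rounded average (a + b + c + d + 2) >> 2, where the "+ 2" turns the
// truncating shift into round-to-nearest. A sketch of how the 8-bit variant
// could be instantiated and applied to one destination row (names of the
// local variables are hypothetical):
//
//   ResizeAreaFastVec<uchar, ResizeAreaFastVec_SIMD_8u> vop(2, 2, cn, (int)src.step);
//   int done = vop(srcRow, dstRow, dstWidthInElements);  // elements written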

template <typename T, typename WT, typename VecOp>
class resizeAreaFast_Invoker :
    public ParallelLoopBody
{
public:
    resizeAreaFast_Invoker(const Mat &_src, Mat &_dst,
        int _scale_x, int _scale_y, const int* _ofs, const int* _xofs) :
        ParallelLoopBody(), src(_src), dst(_dst), scale_x(_scale_x),
        scale_y(_scale_y), ofs(_ofs), xofs(_xofs)
    {
    }

    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        Size ssize = src.size(), dsize = dst.size();
        int cn = src.channels();
        int area = scale_x*scale_y;
        float scale = 1.f/(area);
        int dwidth1 = (ssize.width/scale_x)*cn;
        dsize.width *= cn;
        ssize.width *= cn;
        int dy, dx, k = 0;

        VecOp vop(scale_x, scale_y, src.channels(), (int)src.step/*, area_ofs*/);

        for( dy = range.start; dy < range.end; dy++ )
        {
            T* D = (T*)(dst.data + dst.step*dy);
            int sy0 = dy*scale_y;
            int w = sy0 + scale_y <= ssize.height ? dwidth1 : 0;

            if( sy0 >= ssize.height )
            {
                for( dx = 0; dx < dsize.width; dx++ )
                    D[dx] = 0;
                continue;
            }

            dx = vop(src.template ptr<T>(sy0), D, w);
            for( ; dx < w; dx++ )
            {
                const T* S = src.template ptr<T>(sy0) + xofs[dx];
                WT sum = 0;
                k = 0;
                #if CV_ENABLE_UNROLLED
                for( ; k <= area - 4; k += 4 )
                    sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]];
                #endif
                for( ; k < area; k++ )
                    sum += S[ofs[k]];

                D[dx] = saturate_cast<T>(sum * scale);
            }

            for( ; dx < dsize.width; dx++ )
            {
                WT sum = 0;
                int count = 0, sx0 = xofs[dx];
                if( sx0 >= ssize.width )
                {
                    D[dx] = 0;
                    continue;  // whole averaging window is outside the source row; avoid count == 0 below
                }

                for( int sy = 0; sy < scale_y; sy++ )
                {
                    if( sy0 + sy >= ssize.height )
                        break;
                    const T* S = src.template ptr<T>(sy0 + sy) + sx0;
                    for( int sx = 0; sx < scale_x*cn; sx += cn )
                    {
                        if( sx0 + sx >= ssize.width )
                            break;
                        sum += S[sx];
                        count++;
                    }
                }

                D[dx] = saturate_cast<T>((float)sum/count);
            }
        }
    }

private:
    Mat src;
    Mat dst;
    int scale_x, scale_y;
    const int *ofs, *xofs;
};
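
// The invoker above gives each parallel_for_ stripe a disjoint range of
// destination rows. Per row it tries the vectorized 2x2 fast path first,
// then the generic ofs[]-driven accumulation, and finally the border columns,
// where only the 'count' source pixels that actually exist are averaged.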

template<typename T, typename WT, typename VecOp>
static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int* xofs,
                             int scale_x, int scale_y )
{
    Range range(0, dst.rows);
    resizeAreaFast_Invoker<T, WT, VecOp> invoker(src, dst, scale_x,
        scale_y, ofs, xofs);
    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
}
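
// The last argument of parallel_for_ is the nstripes hint: roughly one stripe
// per 2^16 destination pixels, so small images stay single-threaded while
// large ones are split across the thread pool.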

struct DecimateAlpha
{
    int si, di;
    float alpha;
};
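
// Each DecimateAlpha entry is one term of the area-averaging sum: destination
// element di accumulates source element si with weight alpha. Worked 1-D
// example (not part of the build): resizing 5 pixels to 2 (scale = 2.5),
// destination 0 is dst[0] = (src[0] + src[1] + 0.5*src[2]) / 2.5, which
// computeResizeAreaTab below encodes as the entries
//   {si=0, di=0, alpha=0.4}, {si=1, di=0, alpha=0.4}, {si=2, di=0, alpha=0.2}.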


template<typename T, typename WT> class ResizeArea_Invoker :
    public ParallelLoopBody
{
public:
    ResizeArea_Invoker( const Mat& _src, Mat& _dst,
                        const DecimateAlpha* _xtab, int _xtab_size,
                        const DecimateAlpha* _ytab, int _ytab_size,
                        const int* _tabofs )
    {
        src = &_src;
        dst = &_dst;
        xtab0 = _xtab;
        xtab_size0 = _xtab_size;
        ytab = _ytab;
        ytab_size = _ytab_size;
        tabofs = _tabofs;
    }

    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        Size dsize = dst->size();
        int cn = dst->channels();
        dsize.width *= cn;
        AutoBuffer<WT> _buffer(dsize.width*2);
        const DecimateAlpha* xtab = xtab0;
        int xtab_size = xtab_size0;
        WT *buf = _buffer.data(), *sum = buf + dsize.width;
        int j_start = tabofs[range.start], j_end = tabofs[range.end], j, k, dx, prev_dy = ytab[j_start].di;

        for( dx = 0; dx < dsize.width; dx++ )
            sum[dx] = (WT)0;

        for( j = j_start; j < j_end; j++ )
        {
            WT beta = ytab[j].alpha;
            int dy = ytab[j].di;
            int sy = ytab[j].si;

            {
                const T* S = src->template ptr<T>(sy);
                for( dx = 0; dx < dsize.width; dx++ )
                    buf[dx] = (WT)0;

                if( cn == 1 )
                    for( k = 0; k < xtab_size; k++ )
                    {
                        int dxn = xtab[k].di;
                        WT alpha = xtab[k].alpha;
                        buf[dxn] += S[xtab[k].si]*alpha;
                    }
                else if( cn == 2 )
                    for( k = 0; k < xtab_size; k++ )
                    {
                        int sxn = xtab[k].si;
                        int dxn = xtab[k].di;
                        WT alpha = xtab[k].alpha;
                        WT t0 = buf[dxn] + S[sxn]*alpha;
                        WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
                        buf[dxn] = t0; buf[dxn+1] = t1;
                    }
                else if( cn == 3 )
                    for( k = 0; k < xtab_size; k++ )
                    {
                        int sxn = xtab[k].si;
                        int dxn = xtab[k].di;
                        WT alpha = xtab[k].alpha;
                        WT t0 = buf[dxn] + S[sxn]*alpha;
                        WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
                        WT t2 = buf[dxn+2] + S[sxn+2]*alpha;
                        buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2;
                    }
                else if( cn == 4 )
                {
                    for( k = 0; k < xtab_size; k++ )
                    {
                        int sxn = xtab[k].si;
                        int dxn = xtab[k].di;
                        WT alpha = xtab[k].alpha;
                        WT t0 = buf[dxn] + S[sxn]*alpha;
                        WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
                        buf[dxn] = t0; buf[dxn+1] = t1;
                        t0 = buf[dxn+2] + S[sxn+2]*alpha;
                        t1 = buf[dxn+3] + S[sxn+3]*alpha;
                        buf[dxn+2] = t0; buf[dxn+3] = t1;
                    }
                }
                else
                {
                    for( k = 0; k < xtab_size; k++ )
                    {
                        int sxn = xtab[k].si;
                        int dxn = xtab[k].di;
                        WT alpha = xtab[k].alpha;
                        for( int c = 0; c < cn; c++ )
                            buf[dxn + c] += S[sxn + c]*alpha;
                    }
                }
            }

            if( dy != prev_dy )
            {
                T* D = dst->template ptr<T>(prev_dy);

                for( dx = 0; dx < dsize.width; dx++ )
                {
                    D[dx] = saturate_cast<T>(sum[dx]);
                    sum[dx] = beta*buf[dx];
                }
                prev_dy = dy;
            }
            else
            {
                for( dx = 0; dx < dsize.width; dx++ )
                    sum[dx] += beta*buf[dx];
            }
        }

        {
        T* D = dst->template ptr<T>(prev_dy);
        for( dx = 0; dx < dsize.width; dx++ )
            D[dx] = saturate_cast<T>(sum[dx]);
        }
    }

private:
    const Mat* src;
    Mat* dst;
    const DecimateAlpha* xtab0;
    const DecimateAlpha* ytab;
    int xtab_size0, ytab_size;
    const int* tabofs;
};


template <typename T, typename WT>
static void resizeArea_( const Mat& src, Mat& dst,
                         const DecimateAlpha* xtab, int xtab_size,
                         const DecimateAlpha* ytab, int ytab_size,
                         const int* tabofs )
{
    parallel_for_(Range(0, dst.rows),
                 ResizeArea_Invoker<T, WT>(src, dst, xtab, xtab_size, ytab, ytab_size, tabofs),
                 dst.total()/((double)(1 << 16)));
}
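
// ResizeArea_Invoker walks the y-table entries between tabofs[range.start]
// and tabofs[range.end], so each parallel stripe owns a disjoint set of
// destination rows: buf holds the horizontally resized current source row,
// and sum accumulates the beta-weighted rows until the destination row index
// changes, at which point the finished row is flushed.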


typedef void (*ResizeFunc)( const Mat& src, Mat& dst,
                            const int* xofs, const void* alpha,
                            const int* yofs, const void* beta,
                            int xmin, int xmax, int ksize );

typedef void (*ResizeAreaFastFunc)( const Mat& src, Mat& dst,
                                    const int* ofs, const int *xofs,
                                    int scale_x, int scale_y );

typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst,
                                const DecimateAlpha* xtab, int xtab_size,
                                const DecimateAlpha* ytab, int ytab_size,
                                const int* yofs);


static int computeResizeAreaTab( int ssize, int dsize, int cn, double scale, DecimateAlpha* tab )
{
    int k = 0;
    for(int dx = 0; dx < dsize; dx++ )
    {
        double fsx1 = dx * scale;
        double fsx2 = fsx1 + scale;
        double cellWidth = std::min(scale, ssize - fsx1);

        int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);

        sx2 = std::min(sx2, ssize - 1);
        sx1 = std::min(sx1, sx2);

        if( sx1 - fsx1 > 1e-3 )
        {
            assert( k < ssize*2 );
            tab[k].di = dx * cn;
            tab[k].si = (sx1 - 1) * cn;
            tab[k++].alpha = (float)((sx1 - fsx1) / cellWidth);
        }

        for(int sx = sx1; sx < sx2; sx++ )
        {
            assert( k < ssize*2 );
            tab[k].di = dx * cn;
            tab[k].si = sx * cn;
            tab[k++].alpha = float(1.0 / cellWidth);
        }

        if( fsx2 - sx2 > 1e-3 )
        {
            assert( k < ssize*2 );
            tab[k].di = dx * cn;
            tab[k].si = sx2 * cn;
            tab[k++].alpha = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
        }
    }
    return k;
}
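
// Worked example (not part of the build) for the three emission sites above:
// with ssize = 4, dsize = 3 (scale = 4/3), dx = 1 covers [4/3, 8/3). The
// partial left pixel src[1] gets alpha = (2 - 4/3)/(4/3) = 0.5, no pixel is
// fully covered, and the partial right pixel src[2] gets
// alpha = (8/3 - 2)/(4/3) = 0.5, i.e. dst[1] = 0.5*src[1] + 0.5*src[2].
// The returned k is the number of entries written, bounded by ssize*2 as the
// asserts require.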

#ifdef HAVE_OPENCL
static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int * const map_tab,
                                      float * const alpha_tab, int * const ofs_tab)
{
    int k = 0, dx = 0;
    for ( ; dx < dsize; dx++)
    {
        ofs_tab[dx] = k;

        double fsx1 = dx * scale;
        double fsx2 = fsx1 + scale;
        double cellWidth = std::min(scale, ssize - fsx1);

        int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);

        sx2 = std::min(sx2, ssize - 1);
        sx1 = std::min(sx1, sx2);

        if (sx1 - fsx1 > 1e-3)
        {
            map_tab[k] = sx1 - 1;
            alpha_tab[k++] = (float)((sx1 - fsx1) / cellWidth);
        }

        for (int sx = sx1; sx < sx2; sx++)
        {
            map_tab[k] = sx;
            alpha_tab[k++] = float(1.0 / cellWidth);
        }

        if (fsx2 - sx2 > 1e-3)
        {
            map_tab[k] = sx2;
            alpha_tab[k++] = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
        }
    }
    ofs_tab[dx] = k;
}
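
// Same decomposition as computeResizeAreaTab, but laid out for the OpenCL
// kernel as flat arrays: map_tab holds source indices, alpha_tab the weights,
// and [ofs_tab[dx], ofs_tab[dx+1]) delimits the entries of destination
// element dx, hence the sentinel ofs_tab[dsize] = k written after the loop.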

static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
                        double fx, double fy, int interpolation)
{
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);

    double inv_fx = 1.0 / fx, inv_fy = 1.0 / fy;
    float inv_fxf = (float)inv_fx, inv_fyf = (float)inv_fy;
    int iscale_x = saturate_cast<int>(inv_fx), iscale_y = saturate_cast<int>(inv_fy);
    bool is_area_fast = std::abs(inv_fx - iscale_x) < DBL_EPSILON &&
        std::abs(inv_fy - iscale_y) < DBL_EPSILON;

    // When both scale_x and scale_y equal 2, INTER_AREA (fast) produces the
    // same result as INTER_LINEAR.
    if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
        /*interpolation = INTER_AREA*/CV_UNUSED(0); // INTER_AREA is slower, so keep INTER_LINEAR

    if( !(cn <= 4 &&
           (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR ||
            (interpolation == INTER_AREA && inv_fx >= 1 && inv_fy >= 1) )) )
        return false;

    UMat src = _src.getUMat();
    _dst.create(dsize, type);
    UMat dst = _dst.getUMat();

    Size ssize = src.size();
    ocl::Kernel k;
    size_t globalsize[] = { (size_t)dst.cols, (size_t)dst.rows };

    ocl::Image2D srcImage;

    // See if this could be done with a sampler.  We stick with integer
    // datatypes because the observed error is low.
    bool useSampler = (interpolation == INTER_LINEAR && ocl::Device::getDefault().imageSupport() &&
                       ocl::Image2D::canCreateAlias(src) && depth <= 4 &&
                       ocl::Image2D::isFormatSupported(depth, cn, true) &&
                       src.offset==0);
    if (useSampler)
    {
        int wdepth = std::max(depth, CV_32S);
        char buf[2][32];
        cv::String compileOpts = format("-D USE_SAMPLER -D depth=%d -D T=%s -D T1=%s "
                        "-D convertToDT=%s -D cn=%d",
                        depth, ocl::typeToStr(type), ocl::typeToStr(depth),
                        ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
                        cn);
        k.create("resizeSampler", ocl::imgproc::resize_oclsrc, compileOpts);

        if (k.empty())
            useSampler = false;
        else
        {
            // Convert the input into an OpenCL image type, using normalized channel data types
            // and aliasing the UMat.
            srcImage = ocl::Image2D(src, true, true);
            k.args(srcImage, ocl::KernelArg::WriteOnly(dst),
                   (float)inv_fx, (float)inv_fy);
        }
    }

    if (interpolation == INTER_LINEAR && !useSampler)
    {
        char buf[2][32];

        // The integer path is slower because of its CPU-side table setup, so it is disabled.
        if (depth == CV_8U && ((void)0, 0))
        {
            AutoBuffer<uchar> _buffer((dsize.width + dsize.height)*(sizeof(int) + sizeof(short)*2));
            int* xofs = (int*)_buffer.data(), * yofs = xofs + dsize.width;
            short* ialpha = (short*)(yofs + dsize.height), * ibeta = ialpha + dsize.width*2;
            float fxx, fyy;
            int sx, sy;

            for (int dx = 0; dx < dsize.width; dx++)
            {
                fxx = (float)((dx+0.5)*inv_fx - 0.5);
                sx = cvFloor(fxx);
                fxx -= sx;

                if (sx < 0)
                    fxx = 0, sx = 0;

                if (sx >= ssize.width-1)
                    fxx = 0, sx = ssize.width-1;

                xofs[dx] = sx;
                ialpha[dx*2 + 0] = saturate_cast<short>((1.f - fxx) * INTER_RESIZE_COEF_SCALE);
                ialpha[dx*2 + 1] = saturate_cast<short>(fxx         * INTER_RESIZE_COEF_SCALE);
            }

            for (int dy = 0; dy < dsize.height; dy++)
            {
                fyy = (float)((dy+0.5)*inv_fy - 0.5);
                sy = cvFloor(fyy);
                fyy -= sy;

                yofs[dy] = sy;
                ibeta[dy*2 + 0] = saturate_cast<short>((1.f - fyy) * INTER_RESIZE_COEF_SCALE);
                ibeta[dy*2 + 1] = saturate_cast<short>(fyy         * INTER_RESIZE_COEF_SCALE);
            }

            int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
            UMat coeffs;
            Mat(1, static_cast<int>(_buffer.size()), CV_8UC1, _buffer.data()).copyTo(coeffs);

            k.create("resizeLN", ocl::imgproc::resize_oclsrc,
                     format("-D INTER_LINEAR_INTEGER -D depth=%d -D T=%s -D T1=%s "
                            "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
                            "-D INTER_RESIZE_COEF_BITS=%d",
                            depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
                            ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
                            ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
                            cn, INTER_RESIZE_COEF_BITS));
            if (k.empty())
                return false;

            k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
                   ocl::KernelArg::PtrReadOnly(coeffs));
        }
        else
        {
            int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
            k.create("resizeLN", ocl::imgproc::resize_oclsrc,
                     format("-D INTER_LINEAR -D depth=%d -D T=%s -D T1=%s "
                            "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
                            "-D INTER_RESIZE_COEF_BITS=%d",
                            depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
                            ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
                            ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
                            cn, INTER_RESIZE_COEF_BITS));
            if (k.empty())
                return false;

            k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
                   (float)inv_fx, (float)inv_fy);
        }
    }
    else if (interpolation == INTER_NEAREST)
    {
        k.create("resizeNN", ocl::imgproc::resize_oclsrc,
                 format("-D INTER_NEAREST -D T=%s -D T1=%s -D cn=%d",
                        ocl::vecopTypeToStr(type), ocl::vecopTypeToStr(depth), cn));
        if (k.empty())
            return false;

        k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
               (float)inv_fx, (float)inv_fy);
    }
    else if (interpolation == INTER_AREA)
    {
        int wdepth = std::max(depth, is_area_fast ? CV_32S : CV_32F);
        int wtype = CV_MAKE_TYPE(wdepth, cn);

        char cvt[2][40];
        String buildOption = format("-D INTER_AREA -D T=%s -D T1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d",
                                    ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
                                    ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), cn);

        UMat alphaOcl, tabofsOcl, mapOcl;
        UMat dmap, smap;

        if (is_area_fast)
        {
            int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn);
            buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
                                                " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff",
                                                ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]),
                                                ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]),
                                    iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y));

            k.create("resizeAREA_FAST", ocl::imgproc::resize_oclsrc, buildOption);
            if (k.empty())
                return false;
        }
        else
        {
            buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0]));
            k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption);
            if (k.empty())
                return false;

            int xytab_size = (ssize.width + ssize.height) << 1;
            int tabofs_size = dsize.height + dsize.width + 2;

            AutoBuffer<int> _xymap_tab(xytab_size), _xyofs_tab(tabofs_size);
            AutoBuffer<float> _xyalpha_tab(xytab_size);
            int * xmap_tab = _xymap_tab.data(), * ymap_tab = _xymap_tab.data() + (ssize.width << 1);
            float * xalpha_tab = _xyalpha_tab.data(), * yalpha_tab = _xyalpha_tab.data() + (ssize.width << 1);
            int * xofs_tab = _xyofs_tab.data(), * yofs_tab = _xyofs_tab.data() + dsize.width + 1;

            ocl_computeResizeAreaTabs(ssize.width, dsize.width, inv_fx, xmap_tab, xalpha_tab, xofs_tab);
            ocl_computeResizeAreaTabs(ssize.height, dsize.height, inv_fy, ymap_tab, yalpha_tab, yofs_tab);

            // loading precomputed arrays to GPU
            Mat(1, xytab_size, CV_32FC1, _xyalpha_tab.data()).copyTo(alphaOcl);
            Mat(1, xytab_size, CV_32SC1, _xymap_tab.data()).copyTo(mapOcl);
            Mat(1, tabofs_size, CV_32SC1, _xyofs_tab.data()).copyTo(tabofsOcl);
        }

        ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst);

        if (is_area_fast)
            k.args(srcarg, dstarg);
        else
            k.args(srcarg, dstarg, inv_fxf, inv_fyf, ocl::KernelArg::PtrReadOnly(tabofsOcl),
                   ocl::KernelArg::PtrReadOnly(mapOcl), ocl::KernelArg::PtrReadOnly(alphaOcl));

        return k.run(2, globalsize, NULL, false);
    }

    return k.run(2, globalsize, 0, false);
}
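
// ocl_resize compiles one of several kernel variants (sampler-based linear,
// buffer-based linear, nearest, and the two INTER_AREA flavors), sets its
// arguments, and enqueues it over a dst.cols x dst.rows global range; any
// false return lets the caller fall back to the CPU implementation.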

#endif

#ifdef HAVE_IPP
#define IPP_RESIZE_PARALLEL 1

#ifdef HAVE_IPP_IW
class ipp_resizeParallel: public ParallelLoopBody
{
public:
    ipp_resizeParallel(::ipp::IwiImage &src, ::ipp::IwiImage &dst, bool &ok):
        m_src(src), m_dst(dst), m_ok(ok) {}
    ~ipp_resizeParallel()
    {
    }

    void Init(IppiInterpolationType inter)
    {
        iwiResize.InitAlloc(m_src.m_size, m_dst.m_size, m_src.m_dataType, m_src.m_channels, inter, ::ipp::IwiResizeParams(0, 0, 0.75, 4), ippBorderRepl);

        m_ok = true;
    }

    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        CV_INSTRUMENT_REGION_IPP();

        if(!m_ok)
            return;

        try
        {
            ::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, m_dst.m_size.width, range.end - range.start);
            CV_INSTRUMENT_FUN_IPP(iwiResize, m_src, m_dst, ippBorderRepl, tile);
        }
        catch(const ::ipp::IwException &)
        {
            m_ok = false;
            return;
        }
    }
private:
    ::ipp::IwiImage &m_src;
    ::ipp::IwiImage &m_dst;

    mutable ::ipp::IwiResize iwiResize;

    volatile bool &m_ok;
    const ipp_resizeParallel& operator= (const ipp_resizeParallel&);
};

class ipp_resizeAffineParallel: public ParallelLoopBody
{
public:
    ipp_resizeAffineParallel(::ipp::IwiImage &src, ::ipp::IwiImage &dst, bool &ok):
        m_src(src), m_dst(dst), m_ok(ok) {}
    ~ipp_resizeAffineParallel()
    {
    }

    void Init(IppiInterpolationType inter, double scaleX, double scaleY)
    {
        double shift = (inter == ippNearest)?-1e-10:-0.5;
        double coeffs[2][3] = {
            {scaleX, 0,      shift+0.5*scaleX},
            {0,      scaleY, shift+0.5*scaleY}
        };

        iwiWarpAffine.InitAlloc(m_src.m_size, m_dst.m_size, m_src.m_dataType, m_src.m_channels, coeffs, iwTransForward, inter, ::ipp::IwiWarpAffineParams(0, 0, 0.75), ippBorderRepl);

        m_ok = true;
    }

    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        CV_INSTRUMENT_REGION_IPP();

        if(!m_ok)
            return;

        try
        {
            ::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, m_dst.m_size.width, range.end - range.start);
            CV_INSTRUMENT_FUN_IPP(iwiWarpAffine, m_src, m_dst, tile);
        }
        catch(const ::ipp::IwException &)
        {
            m_ok = false;
            return;
        }
    }
private:
    ::ipp::IwiImage &m_src;
    ::ipp::IwiImage &m_dst;

    mutable ::ipp::IwiWarpAffine iwiWarpAffine;

    volatile bool &m_ok;
    const ipp_resizeAffineParallel& operator= (const ipp_resizeAffineParallel&);
};
#endif

static bool ipp_resize(const uchar * src_data, size_t src_step, int src_width, int src_height,
            uchar * dst_data, size_t dst_step, int dst_width, int dst_height, double inv_scale_x, double inv_scale_y,
            int depth, int channels, int interpolation)
{
#ifdef HAVE_IPP_IW
    CV_INSTRUMENT_REGION_IPP();

    IppDataType           ippDataType = ippiGetDataType(depth);
    IppiInterpolationType ippInter    = ippiGetInterpolation(interpolation);
    if((int)ippInter < 0)
        return false;

    // Skip IPP resize variants whose results don't match OpenCV exactly
    if (!cv::ipp::useIPP_NotExact())
    {
        if (ippInter == ippNearest || ippInter == ippSuper || (ippDataType == ipp8u && ippInter == ippLinear))
            return false;
    }

    if(ippInter != ippLinear && ippDataType == ipp64f)
        return false;

#if IPP_VERSION_X100 < 201801
    // Quality degrades on linear downscale by an integer power-of-two factor
    if (ippDataType != ipp64f && ippInter == ippLinear && inv_scale_x < 1 && inv_scale_y < 1) // if downscale
    {
        int scale_x = (int)(1 / inv_scale_x);
        int scale_y = (int)(1 / inv_scale_y);
        if (1 / inv_scale_x - scale_x < DBL_EPSILON && 1 / inv_scale_y - scale_y < DBL_EPSILON) // if integer
        {
            if (!(scale_x&(scale_x - 1)) && !(scale_y&(scale_y - 1))) // if power of 2
                return false;
        }
    }
#endif

    bool  affine = false;
    const double IPP_RESIZE_EPS = (depth == CV_64F)?0:1e-10;
    double ex = fabs((double)dst_width / src_width  - inv_scale_x) / inv_scale_x;
    double ey = fabs((double)dst_height / src_height - inv_scale_y) / inv_scale_y;

    // Use affine transform resize to allow sub-pixel accuracy
    if(ex > IPP_RESIZE_EPS || ey > IPP_RESIZE_EPS)
        affine = true;

    // Affine doesn't support Lanczos and Super interpolations
    if(affine && (ippInter == ippLanczos || ippInter == ippSuper))
        return false;

    try
    {
        ::ipp::IwiImage iwSrc(::ipp::IwiSize(src_width, src_height), ippDataType, channels, 0, (void*)src_data, src_step);
        ::ipp::IwiImage iwDst(::ipp::IwiSize(dst_width, dst_height), ippDataType, channels, 0, (void*)dst_data, dst_step);

        bool  ok;
        int   threads = ippiSuggestThreadsNum(iwDst, 1+((double)(src_width*src_height)/(dst_width*dst_height)));
        Range range(0, dst_height);
        ipp_resizeParallel       invokerGeneral(iwSrc, iwDst, ok);
        ipp_resizeAffineParallel invokerAffine(iwSrc, iwDst, ok);
        ParallelLoopBody        *pInvoker = NULL;
        if(affine)
        {
            pInvoker = &invokerAffine;
            invokerAffine.Init(ippInter, inv_scale_x, inv_scale_y);
        }
        else
        {
            pInvoker = &invokerGeneral;
            invokerGeneral.Init(ippInter);
        }

        if(IPP_RESIZE_PARALLEL && threads > 1)
            parallel_for_(range, *pInvoker, threads*4);
        else
            pInvoker->operator()(range);

        if(!ok)
            return false;
    }
    catch(const ::ipp::IwException &)
    {
        return false;
    }
    return true;
#else
    CV_UNUSED(src_data); CV_UNUSED(src_step); CV_UNUSED(src_width); CV_UNUSED(src_height); CV_UNUSED(dst_data); CV_UNUSED(dst_step);
    CV_UNUSED(dst_width); CV_UNUSED(dst_height); CV_UNUSED(inv_scale_x); CV_UNUSED(inv_scale_y); CV_UNUSED(depth);
    CV_UNUSED(channels); CV_UNUSED(interpolation);
    return false;
#endif
}
#endif

//==================================================================================================

namespace hal {

void resize(int src_type,
            const uchar * src_data, size_t src_step, int src_width, int src_height,
            uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
            double inv_scale_x, double inv_scale_y, int interpolation)
{
    CV_INSTRUMENT_REGION();

    CV_Assert((dst_width > 0 && dst_height > 0) || (inv_scale_x > 0 && inv_scale_y > 0));
    if (inv_scale_x < DBL_EPSILON || inv_scale_y < DBL_EPSILON)
    {
        inv_scale_x = static_cast<double>(dst_width) / src_width;
        inv_scale_y = static_cast<double>(dst_height) / src_height;
    }
3682 
3683     CALL_HAL(resize, cv_hal_resize, src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, inv_scale_x, inv_scale_y, interpolation);
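    // CALL_HAL above returns early if an external HAL implementation handled the
    // call; otherwise execution falls through to the built-in paths below.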

    int  depth = CV_MAT_DEPTH(src_type), cn = CV_MAT_CN(src_type);
    Size dsize = Size(saturate_cast<int>(src_width*inv_scale_x),
                        saturate_cast<int>(src_height*inv_scale_y));
    CV_Assert( !dsize.empty() );

    CV_IPP_RUN_FAST(ipp_resize(src_data, src_step, src_width, src_height, dst_data, dst_step, dsize.width, dsize.height, inv_scale_x, inv_scale_y, depth, cn, interpolation))

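    // The dispatch tables below are indexed by CV_MAT_DEPTH (CV_8U=0 .. CV_64F=6);
    // zero entries mark depths without an implementation (e.g. CV_8S).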
    static ResizeFunc linear_tab[] =
    {
        resizeGeneric_<
            HResizeLinear<uchar, int, short,
                INTER_RESIZE_COEF_SCALE,
                HResizeLinearVec_8u32s>,
            VResizeLinear<uchar, int, short,
                FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
                VResizeLinearVec_32s8u> >,
        0,
        resizeGeneric_<
            HResizeLinear<ushort, float, float, 1,
                HResizeLinearVec_16u32f>,
            VResizeLinear<ushort, float, float, Cast<float, ushort>,
                VResizeLinearVec_32f16u> >,
        resizeGeneric_<
            HResizeLinear<short, float, float, 1,
                HResizeLinearVec_16s32f>,
            VResizeLinear<short, float, float, Cast<float, short>,
                VResizeLinearVec_32f16s> >,
        0,
        resizeGeneric_<
            HResizeLinear<float, float, float, 1,
                HResizeLinearVec_32f>,
            VResizeLinear<float, float, float, Cast<float, float>,
                VResizeLinearVec_32f> >,
        resizeGeneric_<
            HResizeLinear<double, double, float, 1,
                HResizeNoVec>,
            VResizeLinear<double, double, float, Cast<double, double>,
                VResizeNoVec> >,
        0
    };

    static ResizeFunc cubic_tab[] =
    {
        resizeGeneric_<
            HResizeCubic<uchar, int, short>,
            VResizeCubic<uchar, int, short,
                FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
                VResizeCubicVec_32s8u> >,
        0,
        resizeGeneric_<
            HResizeCubic<ushort, float, float>,
            VResizeCubic<ushort, float, float, Cast<float, ushort>,
            VResizeCubicVec_32f16u> >,
        resizeGeneric_<
            HResizeCubic<short, float, float>,
            VResizeCubic<short, float, float, Cast<float, short>,
            VResizeCubicVec_32f16s> >,
        0,
        resizeGeneric_<
            HResizeCubic<float, float, float>,
            VResizeCubic<float, float, float, Cast<float, float>,
            VResizeCubicVec_32f> >,
        resizeGeneric_<
            HResizeCubic<double, double, float>,
            VResizeCubic<double, double, float, Cast<double, double>,
            VResizeNoVec> >,
        0
    };

    static ResizeFunc lanczos4_tab[] =
    {
        resizeGeneric_<HResizeLanczos4<uchar, int, short>,
            VResizeLanczos4<uchar, int, short,
            FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
            VResizeNoVec> >,
        0,
        resizeGeneric_<HResizeLanczos4<ushort, float, float>,
            VResizeLanczos4<ushort, float, float, Cast<float, ushort>,
            VResizeLanczos4Vec_32f16u> >,
        resizeGeneric_<HResizeLanczos4<short, float, float>,
            VResizeLanczos4<short, float, float, Cast<float, short>,
            VResizeLanczos4Vec_32f16s> >,
        0,
        resizeGeneric_<HResizeLanczos4<float, float, float>,
            VResizeLanczos4<float, float, float, Cast<float, float>,
            VResizeLanczos4Vec_32f> >,
        resizeGeneric_<HResizeLanczos4<double, double, float>,
            VResizeLanczos4<double, double, float, Cast<double, double>,
            VResizeNoVec> >,
        0
    };

    static ResizeAreaFastFunc areafast_tab[] =
    {
        resizeAreaFast_<uchar, int, ResizeAreaFastVec<uchar, ResizeAreaFastVec_SIMD_8u> >,
        0,
        resizeAreaFast_<ushort, float, ResizeAreaFastVec<ushort, ResizeAreaFastVec_SIMD_16u> >,
        resizeAreaFast_<short, float, ResizeAreaFastVec<short, ResizeAreaFastVec_SIMD_16s> >,
        0,
        resizeAreaFast_<float, float, ResizeAreaFastVec_SIMD_32f>,
        resizeAreaFast_<double, double, ResizeAreaFastNoVec<double, double> >,
        0
    };

    static ResizeAreaFunc area_tab[] =
    {
        resizeArea_<uchar, float>, 0, resizeArea_<ushort, float>,
        resizeArea_<short, float>, 0, resizeArea_<float, float>,
        resizeArea_<double, double>, 0
    };

    static be_resize_func linear_exact_tab[] =
    {
        resize_bitExact<uchar, interpolationLinear<uchar> >,
        resize_bitExact<schar, interpolationLinear<schar> >,
        resize_bitExact<ushort, interpolationLinear<ushort> >,
        resize_bitExact<short, interpolationLinear<short> >,
        resize_bitExact<int, interpolationLinear<int> >,
        0,
        0,
        0
    };

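    // scale_x/scale_y map destination coordinates back to source coordinates;
    // the "area fast" path applies when both factors are (near-)integers.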
    double scale_x = 1./inv_scale_x, scale_y = 1./inv_scale_y;

    int iscale_x = saturate_cast<int>(scale_x);
    int iscale_y = saturate_cast<int>(scale_y);

    bool is_area_fast = std::abs(scale_x - iscale_x) < DBL_EPSILON &&
            std::abs(scale_y - iscale_y) < DBL_EPSILON;

    Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
    Mat dst(dsize, src_type, dst_data, dst_step);

    if (interpolation == INTER_LINEAR_EXACT)
    {
        // when inv_scale_x and inv_scale_y are both 0.5, fast INTER_AREA
        // is equivalent to bit-exact INTER_LINEAR
        if (is_area_fast && iscale_x == 2 && iscale_y == 2 && cn != 2) // the 2-channel area resize implementation isn't bit-exact
            interpolation = INTER_AREA;
        else
        {
            be_resize_func func = linear_exact_tab[depth];
            CV_Assert(func != 0);
            func(src_data, src_step, src_width, src_height,
                 dst_data, dst_step, dst_width, dst_height,
                 cn, inv_scale_x, inv_scale_y);
            return;
        }
    }

    if( interpolation == INTER_NEAREST )
    {
        resizeNN( src, dst, inv_scale_x, inv_scale_y );
        return;
    }

    if( interpolation == INTER_NEAREST_EXACT )
    {
        resizeNN_bitexact( src, dst, inv_scale_x, inv_scale_y );
        return;
    }

    int k, sx, sy, dx, dy;

    {
        // when scale_x and scale_y are both 2, fast INTER_AREA is
        // likewise equivalent to INTER_LINEAR
        if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
            interpolation = INTER_AREA;

        // true "area" interpolation is only implemented for the case (scale_x >= 1 && scale_y >= 1),
        // i.e. for downscaling; otherwise it is emulated with a variant of bilinear interpolation
        if( interpolation == INTER_AREA && scale_x >= 1 && scale_y >= 1 )
        {
            if( is_area_fast )
            {
                int area = iscale_x*iscale_y;
                size_t srcstep = src_step / src.elemSize1();
                AutoBuffer<int> _ofs(area + dsize.width*cn);
                int* ofs = _ofs.data();
                int* xofs = ofs + area;
                ResizeAreaFastFunc func = areafast_tab[depth];
                CV_Assert( func != 0 );
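                // ofs enumerates the iscale_y x iscale_x source-pixel offsets inside one
                // averaging block; xofs maps each destination column (per channel) to the
                // first source column of its block.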

                for( sy = 0, k = 0; sy < iscale_y; sy++ )
                    for( sx = 0; sx < iscale_x; sx++ )
                        ofs[k++] = (int)(sy*srcstep + sx*cn);

                for( dx = 0; dx < dsize.width; dx++ )
                {
                    int j = dx * cn;
                    sx = iscale_x * j;
                    for( k = 0; k < cn; k++ )
                        xofs[j + k] = sx + k;
                }

                func( src, dst, ofs, xofs, iscale_x, iscale_y );
                return;
            }

            ResizeAreaFunc func = area_tab[depth];
            CV_Assert( func != 0 && cn <= 4 );

            AutoBuffer<DecimateAlpha> _xytab((src_width + src_height)*2);
            DecimateAlpha* xtab = _xytab.data(), *ytab = xtab + src_width*2;

            int xtab_size = computeResizeAreaTab(src_width, dsize.width, cn, scale_x, xtab);
            int ytab_size = computeResizeAreaTab(src_height, dsize.height, 1, scale_y, ytab);

            AutoBuffer<int> _tabofs(dsize.height + 1);
            int* tabofs = _tabofs.data();
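            // tabofs[dy] records where the ytab entries contributing to destination
            // row dy begin, so each output row can be processed independently.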
            for( k = 0, dy = 0; k < ytab_size; k++ )
            {
                if( k == 0 || ytab[k].di != ytab[k-1].di )
                {
                    assert( ytab[k].di == dy );
                    tabofs[dy++] = k;
                }
            }
            tabofs[dy] = ytab_size;

            func( src, dst, xtab, xtab_size, ytab, ytab_size, tabofs );
            return;
        }
    }

    int xmin = 0, xmax = dsize.width, width = dsize.width*cn;
    bool area_mode = interpolation == INTER_AREA;
    bool fixpt = depth == CV_8U;
    float fx, fy;
    ResizeFunc func=0;
    int ksize=0, ksize2;
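    // ksize is the kernel support in taps: 2 for linear/area, 4 for cubic, 8 for Lanczos-4.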
    if( interpolation == INTER_CUBIC )
        ksize = 4, func = cubic_tab[depth];
    else if( interpolation == INTER_LANCZOS4 )
        ksize = 8, func = lanczos4_tab[depth];
    else if( interpolation == INTER_LINEAR || interpolation == INTER_AREA )
        ksize = 2, func = linear_tab[depth];
    else
        CV_Error( CV_StsBadArg, "Unknown interpolation method" );
    ksize2 = ksize/2;

    CV_Assert( func != 0 );
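    // Single allocation laid out as xofs[width] and yofs[dsize.height] (int),
    // followed by the horizontal coefficients alpha[width*ksize] and vertical
    // coefficients beta[dsize.height*ksize] (float). For the CV_8U fixed-point
    // path, ialpha/ibeta reinterpret the same memory as short coefficients.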

    AutoBuffer<uchar> _buffer((width + dsize.height)*(sizeof(int) + sizeof(float)*ksize));
    int* xofs = (int*)_buffer.data();
    int* yofs = xofs + width;
    float* alpha = (float*)(yofs + dsize.height);
    short* ialpha = (short*)alpha;
    float* beta = alpha + width*ksize;
    short* ibeta = ialpha + width*ksize;
    float cbuf[MAX_ESIZE] = {0};

    for( dx = 0; dx < dsize.width; dx++ )
    {
        if( !area_mode )
        {
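            // Map the destination pixel center (dx + 0.5) back into the source image,
            // then split into the integer column sx and the fractional weight fx.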
            fx = (float)((dx+0.5)*scale_x - 0.5);
            sx = cvFloor(fx);
            fx -= sx;
        }
        else
        {
            sx = cvFloor(dx*scale_x);
            fx = (float)((dx+1) - (sx+1)*inv_scale_x);
            fx = fx <= 0 ? 0.f : fx - cvFloor(fx);
        }

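        // Columns whose kernel footprint crosses the image border are excluded from
        // the interior range [xmin, xmax) that the row functions can process without
        // boundary checks.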
        if( sx < ksize2-1 )
        {
            xmin = dx+1;
            if( sx < 0 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))
                fx = 0, sx = 0;
        }

        if( sx + ksize2 >= src_width )
        {
            xmax = std::min( xmax, dx );
            if( sx >= src_width-1 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))
                fx = 0, sx = src_width-1;
        }

        for( k = 0, sx *= cn; k < cn; k++ )
            xofs[dx*cn + k] = sx + k;

        if( interpolation == INTER_CUBIC )
            interpolateCubic( fx, cbuf );
        else if( interpolation == INTER_LANCZOS4 )
            interpolateLanczos4( fx, cbuf );
        else
        {
            cbuf[0] = 1.f - fx;
            cbuf[1] = fx;
        }
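        // Store the horizontal coefficients, replicated once per channel; on the
        // CV_8U path they are quantized to fixed point via INTER_RESIZE_COEF_SCALE.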
        if( fixpt )
        {
            for( k = 0; k < ksize; k++ )
                ialpha[dx*cn*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE);
            for( ; k < cn*ksize; k++ )
                ialpha[dx*cn*ksize + k] = ialpha[dx*cn*ksize + k - ksize];
        }
        else
        {
            for( k = 0; k < ksize; k++ )
                alpha[dx*cn*ksize + k] = cbuf[k];
            for( ; k < cn*ksize; k++ )
                alpha[dx*cn*ksize + k] = alpha[dx*cn*ksize + k - ksize];
        }
    }

    for( dy = 0; dy < dsize.height; dy++ )
    {
        if( !area_mode )
        {
            fy = (float)((dy+0.5)*scale_y - 0.5);
            sy = cvFloor(fy);
            fy -= sy;
        }
        else
        {
            sy = cvFloor(dy*scale_y);
            fy = (float)((dy+1) - (sy+1)*inv_scale_y);
            fy = fy <= 0 ? 0.f : fy - cvFloor(fy);
        }

        yofs[dy] = sy;
        if( interpolation == INTER_CUBIC )
            interpolateCubic( fy, cbuf );
        else if( interpolation == INTER_LANCZOS4 )
            interpolateLanczos4( fy, cbuf );
        else
        {
            cbuf[0] = 1.f - fy;
            cbuf[1] = fy;
        }

        if( fixpt )
        {
            for( k = 0; k < ksize; k++ )
                ibeta[dy*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE);
        }
        else
        {
            for( k = 0; k < ksize; k++ )
                beta[dy*ksize + k] = cbuf[k];
        }
    }

    func( src, dst, xofs, fixpt ? (void*)ialpha : (void*)alpha, yofs,
          fixpt ? (void*)ibeta : (void*)beta, xmin, xmax, ksize );
}

} // cv::hal::
} // cv::

//==================================================================================================

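// Illustrative usage of the public API (a sketch only, not part of the build):
//
//     cv::Mat src = cv::imread("input.png"), dst;
//     // halve both dimensions; INTER_AREA is the usual choice for decimation
//     cv::resize(src, dst, cv::Size(), 0.5, 0.5, cv::INTER_AREA);
//     // resize to an explicit size; the scale factors are then derived internally
//     cv::resize(src, dst, cv::Size(640, 480), 0, 0, cv::INTER_LINEAR);
//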
void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
                 double inv_scale_x, double inv_scale_y, int interpolation )
{
    CV_INSTRUMENT_REGION();

    Size ssize = _src.size();

    CV_Assert( !ssize.empty() );
    if( dsize.empty() )
    {
        CV_Assert(inv_scale_x > 0); CV_Assert(inv_scale_y > 0);
        dsize = Size(saturate_cast<int>(ssize.width*inv_scale_x),
                     saturate_cast<int>(ssize.height*inv_scale_y));
        CV_Assert( !dsize.empty() );
    }
    else
    {
        inv_scale_x = (double)dsize.width/ssize.width;
        inv_scale_y = (double)dsize.height/ssize.height;
        CV_Assert(inv_scale_x > 0); CV_Assert(inv_scale_y > 0);
    }

    if (interpolation == INTER_LINEAR_EXACT && (_src.depth() == CV_32F || _src.depth() == CV_64F))
        interpolation = INTER_LINEAR; // the depth isn't supported by the bit-exact path; fall back to generic resize

    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() && _src.cols() > 10 && _src.rows() > 10,
               ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation))

    // Keep a reference to the source alive. Resolves issue 13577 in the case of src == dst.
    UMat srcUMat;
    if (_src.isUMat())
        srcUMat = _src.getUMat();

    Mat src = _src.getMat();
    _dst.create(dsize, src.type());
    Mat dst = _dst.getMat();

    if (dsize == ssize)
    {
        // Source and destination are of the same size; use a simple copy.
        src.copyTo(dst);
        return;
    }

    hal::resize(src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows, inv_scale_x, inv_scale_y, interpolation);
}


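// Legacy C API wrapper: the destination array is preallocated, so the scale
// factors are derived from the dst/src size ratio.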
CV_IMPL void
cvResize( const CvArr* srcarr, CvArr* dstarr, int method )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src.type() == dst.type() );
    cv::resize( src, dst, dst.size(), (double)dst.cols/src.cols,
        (double)dst.rows/src.rows, method );
}

/* End of file. */