/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* ////////////////////////////////////////////////////////////////////
//
//  Geometrical transforms on images and matrices: rotation, zoom etc.
//
// */

#include "precomp.hpp"
#include "resize.hpp"

namespace cv
{
namespace opt_SSE4_1
{

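// Nearest-neighbour row resizer for 2-byte pixels (e.g. CV_16UC1 or CV_8UC2).
// Each destination row y is mapped back to source row floor(y*ify); the
// precomputed x_ofs table holds the source byte offset for every destination
// column, so the inner loop is a pure gather-and-store.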
class resizeNNInvokerSSE2 :
    public ParallelLoopBody
{
public:
    resizeNNInvokerSSE2(const Mat& _src, Mat &_dst, int *_x_ofs, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs),
        ify(_ify)
    {
    }

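// When building with the Intel compiler, ask for SSE4.2 code generation for
// the method below; other compilers rely on the flags this file is built with.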
#if defined(__INTEL_COMPILER)
#pragma optimization_parameter target_arch=SSE4.2
#endif
    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        Size ssize = src.size(), dsize = dst.size();
        int y, x;
        int width = dsize.width;
        int sseWidth = width - (width & 0x7);  // widest multiple of 8 columns
        for(y = range.start; y < range.end; y++)
        {
            uchar* D = dst.data + dst.step*y;
            uchar* Dstart = D;
            int sy = std::min(cvFloor(y*ify), ssize.height-1);
            const uchar* S = src.data + sy*src.step;
            __m128i CV_DECL_ALIGNED(64) pixels = _mm_set1_epi16(0);
            // Gather eight 2-byte pixels via the x_ofs byte-offset table and
            // flush them to the destination with one unaligned 16-byte store.
            for(x = 0; x < sseWidth; x += 8)
            {
                ushort imm = *(ushort*)(S + x_ofs[x + 0]);
                pixels = _mm_insert_epi16(pixels, imm, 0);
                imm = *(ushort*)(S + x_ofs[x + 1]);
                pixels = _mm_insert_epi16(pixels, imm, 1);
                imm = *(ushort*)(S + x_ofs[x + 2]);
                pixels = _mm_insert_epi16(pixels, imm, 2);
                imm = *(ushort*)(S + x_ofs[x + 3]);
                pixels = _mm_insert_epi16(pixels, imm, 3);
                imm = *(ushort*)(S + x_ofs[x + 4]);
                pixels = _mm_insert_epi16(pixels, imm, 4);
                imm = *(ushort*)(S + x_ofs[x + 5]);
                pixels = _mm_insert_epi16(pixels, imm, 5);
                imm = *(ushort*)(S + x_ofs[x + 6]);
                pixels = _mm_insert_epi16(pixels, imm, 6);
                imm = *(ushort*)(S + x_ofs[x + 7]);
                pixels = _mm_insert_epi16(pixels, imm, 7);
                _mm_storeu_si128((__m128i*)D, pixels);
                D += 16;
            }
            // Scalar tail for the last (width & 7) columns.
            for(; x < width; x++)
            {
                *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
            }
        }
    }

private:
    const Mat& src;
    Mat& dst;
    int* x_ofs;
    double ify;

    resizeNNInvokerSSE2(const resizeNNInvokerSSE2&);
    resizeNNInvokerSSE2& operator=(const resizeNNInvokerSSE2&);
};

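// Same scheme for 4-byte pixels (e.g. CV_8UC4 or CV_32FC1): four pixels per
// iteration, gathered with _mm_insert_epi32, which requires SSE4.1.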
class resizeNNInvokerSSE4 :
    public ParallelLoopBody
{
public:
    resizeNNInvokerSSE4(const Mat& _src, Mat &_dst, int *_x_ofs, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs),
        ify(_ify)
    {
    }
#if defined(__INTEL_COMPILER)
#pragma optimization_parameter target_arch=SSE4.2
#endif
    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        Size ssize = src.size(), dsize = dst.size();
        int y, x;
        int width = dsize.width;
        int sseWidth = width - (width & 0x3);  // widest multiple of 4 columns
        for(y = range.start; y < range.end; y++)
        {
            uchar* D = dst.data + dst.step*y;
            uchar* Dstart = D;
            int sy = std::min(cvFloor(y*ify), ssize.height-1);
            const uchar* S = src.data + sy*src.step;
            __m128i CV_DECL_ALIGNED(64) pixels = _mm_set1_epi16(0);
            // Gather four 4-byte pixels via the x_ofs byte-offset table and
            // flush them to the destination with one unaligned 16-byte store.
            for(x = 0; x < sseWidth; x += 4)
            {
                int imm = *(int*)(S + x_ofs[x + 0]);
                pixels = _mm_insert_epi32(pixels, imm, 0);
                imm = *(int*)(S + x_ofs[x + 1]);
                pixels = _mm_insert_epi32(pixels, imm, 1);
                imm = *(int*)(S + x_ofs[x + 2]);
                pixels = _mm_insert_epi32(pixels, imm, 2);
                imm = *(int*)(S + x_ofs[x + 3]);
                pixels = _mm_insert_epi32(pixels, imm, 3);
                _mm_storeu_si128((__m128i*)D, pixels);
                D += 16;
            }
            // Scalar tail for the last (width & 3) columns.
            for(; x < width; x++)
            {
                *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
            }
        }
    }

private:
    const Mat& src;
    Mat& dst;
    int* x_ofs;
    double ify;

    resizeNNInvokerSSE4(const resizeNNInvokerSSE4&);
    resizeNNInvokerSSE4& operator=(const resizeNNInvokerSSE4&);
};

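// Entry point for the 2-byte-per-pixel case: runs the invoker over the given
// row range, letting parallel_for_ split the work across threads.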
void resizeNN2_SSE4_1(const Range& range, const Mat& src, Mat &dst, int *x_ofs, double ify)
{
    resizeNNInvokerSSE2 invoker(src, dst, x_ofs, ify);
    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
}

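// Entry point for the 4-byte-per-pixel case; identical dispatch, wider pixels.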
void resizeNN4_SSE4_1(const Range& range, const Mat& src, Mat &dst, int *x_ofs, double ify)
{
    resizeNNInvokerSSE4 invoker(src, dst, x_ofs, ify);
    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
}

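// Vertical pass of Lanczos4 interpolation: for each column, accumulates a
// weighted sum of eight consecutive source rows (weights in beta[0..7]) and
// converts the float result to ushort with saturation, eight outputs per
// iteration. Returns how many elements were written so the caller can finish
// the remainder with scalar code.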
int VResizeLanczos4Vec_32f16u_SSE41(const float** src, ushort* dst, const float* beta, int width)
{
    const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
        *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
    int x = 0;
    __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
        v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
        v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
        v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);

    for (; x <= width - 8; x += 8)
    {
        // Weighted sum of the eight source rows, low four lanes...
        __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));

        // ...and the high four lanes.
        __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4)));

        // Round to int32, then pack to ushort with unsigned saturation
        // (_mm_packus_epi32 is SSE4.1).
        __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0);
        __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1);

        _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dsti0, v_dsti1));
    }

    return x;  // elements written; the caller handles the remaining tail
}

}  // namespace opt_SSE4_1
}  // namespace cv
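
/* A sketch of how these kernels are typically reached (the real dispatch lives
   in the generic resize code and may differ): cv::resize checks for SSE4.1
   support and a 2- or 4-byte pixel size before taking this path, e.g.

       cv::Mat src = cv::imread("input.png");        // CV_8UC3 -> not this path
       cv::Mat bgra, half;
       cv::cvtColor(src, bgra, cv::COLOR_BGR2BGRA);  // 4-byte pixels (CV_8UC4)
       cv::resize(bgra, half, cv::Size(), 0.5, 0.5, cv::INTER_NEAREST);

   With CV_8UC4 input and INTER_NEAREST, the whole image can go through
   resizeNN4_SSE4_1 above. */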
/* End of file. */