/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* ////////////////////////////////////////////////////////////////////
//
//  Geometrical transforms on images and matrices: rotation, zoom etc.
//
// */
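
/* This translation unit is built with SSE4.1 code generation enabled and
   provides the cv::opt_SSE4_1 kernels declared in resize.hpp: nearest-neighbor
   row resizing for 2- and 4-byte pixels, and the vertical (row-accumulation)
   pass of Lanczos4 interpolation for 32f -> 16u data. */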

#include "precomp.hpp"
#include "resize.hpp"

namespace cv
{
namespace opt_SSE4_1
{

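// Nearest-neighbor resize for images with 2-byte pixels (e.g. CV_8UC2 or
// CV_16UC1). Each parallel body handles a stripe of destination rows: for a
// destination row it picks the nearest source row via ify, then gathers
// pixels through the precomputed byte offsets in x_ofs. Despite the "SSE2"
// name, this class lives in (and is compiled as part of) the SSE4.1 unit.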
class resizeNNInvokerSSE2 :
    public ParallelLoopBody
{
public:
    resizeNNInvokerSSE2(const Mat& _src, Mat &_dst, int *_x_ofs, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs),
        ify(_ify)
    {
    }

#if defined(__INTEL_COMPILER)
#pragma optimization_parameter target_arch=SSE4.2
#endif
    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        Size ssize = src.size(), dsize = dst.size();
        int y, x;
        int width = dsize.width;
        int sseWidth = width - (width & 0x7);
        for(y = range.start; y < range.end; y++)
        {
            uchar* D = dst.data + dst.step*y;
            uchar* Dstart = D;
            int sy = std::min(cvFloor(y*ify), ssize.height-1);
            const uchar* S = src.data + sy*src.step;
            __m128i CV_DECL_ALIGNED(64) pixels = _mm_set1_epi16(0);
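            // Gather 8 destination pixels per iteration: each PINSRW pulls one
            // 2-byte pixel from its source offset, and a single unaligned
            // store writes all 16 bytes.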
            for(x = 0; x < sseWidth; x += 8)
            {
                ushort imm = *(ushort*)(S + x_ofs[x + 0]);
                pixels = _mm_insert_epi16(pixels, imm, 0);
                imm = *(ushort*)(S + x_ofs[x + 1]);
                pixels = _mm_insert_epi16(pixels, imm, 1);
                imm = *(ushort*)(S + x_ofs[x + 2]);
                pixels = _mm_insert_epi16(pixels, imm, 2);
                imm = *(ushort*)(S + x_ofs[x + 3]);
                pixels = _mm_insert_epi16(pixels, imm, 3);
                imm = *(ushort*)(S + x_ofs[x + 4]);
                pixels = _mm_insert_epi16(pixels, imm, 4);
                imm = *(ushort*)(S + x_ofs[x + 5]);
                pixels = _mm_insert_epi16(pixels, imm, 5);
                imm = *(ushort*)(S + x_ofs[x + 6]);
                pixels = _mm_insert_epi16(pixels, imm, 6);
                imm = *(ushort*)(S + x_ofs[x + 7]);
                pixels = _mm_insert_epi16(pixels, imm, 7);
                _mm_storeu_si128((__m128i*)D, pixels);
                D += 16;
            }
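            // Scalar tail: copy the remaining (width & 0x7) pixels one at a time.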
            for(; x < width; x++)
            {
                *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
            }
        }
    }

private:
    const Mat& src;
    Mat& dst;
    int* x_ofs;
    double ify;

    resizeNNInvokerSSE2(const resizeNNInvokerSSE2&);
    resizeNNInvokerSSE2& operator=(const resizeNNInvokerSSE2&);
};

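// The same scheme for 4-byte pixels (e.g. CV_8UC4 or CV_32SC1): PINSRD
// (an SSE4.1 instruction) gathers 4 pixels per iteration instead of 8.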
class resizeNNInvokerSSE4 :
    public ParallelLoopBody
{
public:
    resizeNNInvokerSSE4(const Mat& _src, Mat &_dst, int *_x_ofs, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs),
        ify(_ify)
    {
    }
#if defined(__INTEL_COMPILER)
#pragma optimization_parameter target_arch=SSE4.2
#endif
    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        Size ssize = src.size(), dsize = dst.size();
        int y, x;
        int width = dsize.width;
        int sseWidth = width - (width & 0x3);
        for(y = range.start; y < range.end; y++)
        {
            uchar* D = dst.data + dst.step*y;
            uchar* Dstart = D;
            int sy = std::min(cvFloor(y*ify), ssize.height-1);
            const uchar* S = src.data + sy*src.step;
            __m128i CV_DECL_ALIGNED(64) pixels = _mm_set1_epi16(0);
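            // Gather 4 destination pixels per iteration, 4 bytes each, then
            // store the full 16-byte register with one unaligned write.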
            for(x = 0; x < sseWidth; x += 4)
            {
                int imm = *(int*)(S + x_ofs[x + 0]);
                pixels = _mm_insert_epi32(pixels, imm, 0);
                imm = *(int*)(S + x_ofs[x + 1]);
                pixels = _mm_insert_epi32(pixels, imm, 1);
                imm = *(int*)(S + x_ofs[x + 2]);
                pixels = _mm_insert_epi32(pixels, imm, 2);
                imm = *(int*)(S + x_ofs[x + 3]);
                pixels = _mm_insert_epi32(pixels, imm, 3);
                _mm_storeu_si128((__m128i*)D, pixels);
                D += 16;
            }
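            // Scalar tail: copy the remaining (width & 0x3) pixels one at a time.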
            for(; x < width; x++)
            {
                *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
            }
        }
    }

private:
    const Mat& src;
    Mat& dst;
    int* x_ofs;
    double ify;

    resizeNNInvokerSSE4(const resizeNNInvokerSSE4&);
    resizeNNInvokerSSE4& operator=(const resizeNNInvokerSSE4&);
};

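// Entry point for nearest-neighbor resizing of 2-byte pixels. x_ofs is assumed
// to hold, for every destination column, the byte offset of the nearest source
// pixel within a row (built by the generic resize code, not in this file).
// The nstripes hint keeps each parallel stripe at roughly 2^16 output elements.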
void resizeNN2_SSE4_1(const Range& range, const Mat& src, Mat &dst, int *x_ofs, double ify)
{
    resizeNNInvokerSSE2 invoker(src, dst, x_ofs, ify);
    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
}

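// The same entry point for 4-byte pixels.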
void resizeNN4_SSE4_1(const Range& range, const Mat& src, Mat &dst, int *x_ofs, double ify)
{
    resizeNNInvokerSSE4 invoker(src, dst, x_ofs, ify);
    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
}

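// Vertical pass of Lanczos4 interpolation for float source rows and ushort
// output. For each column x it evaluates the scalar expression
//
//     dst[x] = saturate_cast<ushort>(beta[0]*src[0][x] + ... + beta[7]*src[7][x])
//
// eight columns at a time, and returns the number of elements written (the
// caller is expected to finish any remainder with scalar code).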
int VResizeLanczos4Vec_32f16u_SSE41(const float** src, ushort* dst, const float* beta, int width)
{
    const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
    int x = 0;
    __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
           v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
           v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
           v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);

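    // Accumulate two 4-float sums per iteration, convert each to int32 with
    // rounding (_mm_cvtps_epi32), then pack to ushort with unsigned
    // saturation (_mm_packus_epi32, SSE4.1).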
    for (; x <= width - 8; x += 8)
    {
        __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));

        __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4)));

        __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0);
        __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1);

        _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dsti0, v_dsti1));
    }

    return x;
}

} // namespace opt_SSE4_1
} // namespace cv
/* End of file. */