1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html
4
5
6 #include "precomp.hpp"
7 #include "stat.hpp"
8
9 namespace cv {
10 CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
11
12 SumFunc getSumFunc(int depth);
13
14 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
15
// Generic fallback used when no vectorized specialization exists for the
// (T, ST) element/accumulator pair: it reports that 0 elements were
// processed, so sum_() falls through to its scalar loops for the whole row.
template <typename T, typename ST>
struct Sum_SIMD
{
    int operator () (const T *, const uchar *, ST *, int, int) const
    {
        return 0;
    }
};
24
25 #if CV_SIMD
26
// Vectorized row sum for unmasked 8-bit unsigned data with 1, 2 or 4
// interleaved channels. Folds per-channel partial sums into dst (32-bit)
// and returns the number of pixels fully processed; the scalar code in
// sum_() finishes the remainder.
template <>
struct Sum_SIMD<uchar, int>
{
    int operator () (const uchar * src0, const uchar * mask, int * dst, int len, int cn) const
    {
        // Only the unmasked case with a channel count dividing the SIMD
        // lane count is vectorized; everything else defers to scalar code.
        if (mask || (cn != 1 && cn != 2 && cn != 4))
            return 0;
        len *= cn;  // treat the interleaved row as one flat scalar array

        int x = 0;
        v_uint32 v_sum = vx_setzero_u32();

        int len0 = len & -v_uint8::nlanes;  // round down to whole 8-bit vectors
        while (x < len0)
        {
            // Accumulate into 16-bit lanes in bounded chunks before widening:
            // a chunk spans 256*v_uint16::nlanes elements with a step of
            // v_uint8::nlanes (= 2*v_uint16::nlanes), i.e. at most 128 inner
            // iterations; each iteration adds two u8 values (<= 510) per u16
            // lane, so 128 * 510 = 65280 <= 65535 and the 16-bit partial
            // sums cannot overflow.
            const int len_tmp = min(x + 256*v_uint16::nlanes, len0);
            v_uint16 v_sum16 = vx_setzero_u16();
            for (; x < len_tmp; x += v_uint8::nlanes)
            {
                v_uint16 v_src0, v_src1;
                v_expand(vx_load(src0 + x), v_src0, v_src1);
                v_sum16 += v_src0 + v_src1;
            }
            // Widen the 16-bit chunk sums and fold into the 32-bit accumulator.
            v_uint32 v_half0, v_half1;
            v_expand(v_sum16, v_half0, v_half1);
            v_sum += v_half0 + v_half1;
        }
        // Tail: one half-width expanding load, then one quarter-width one,
        // if they still fit in the remaining elements.
        if (x <= len - v_uint16::nlanes)
        {
            v_uint32 v_half0, v_half1;
            v_expand(vx_load_expand(src0 + x), v_half0, v_half1);
            v_sum += v_half0 + v_half1;
            x += v_uint16::nlanes;
        }
        if (x <= len - v_uint32::nlanes)
        {
            v_sum += vx_load_expand_q(src0 + x);
            x += v_uint32::nlanes;
        }

        if (cn == 1)
            *dst += v_reduce_sum(v_sum);
        else
        {
            // cn (2 or 4) divides every lane count used above, so 32-bit
            // lane i only ever accumulated elements of channel i % cn.
            uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_uint32::nlanes];
            v_store_aligned(ar, v_sum);
            for (int i = 0; i < v_uint32::nlanes; ++i)
                dst[i % cn] += ar[i];
        }
        v_cleanup();

        return x / cn;  // pixels, not scalars
    }
};
81
82 template <>
83 struct Sum_SIMD<schar, int>
84 {
operator ()cv::Sum_SIMD85 int operator () (const schar * src0, const uchar * mask, int * dst, int len, int cn) const
86 {
87 if (mask || (cn != 1 && cn != 2 && cn != 4))
88 return 0;
89 len *= cn;
90
91 int x = 0;
92 v_int32 v_sum = vx_setzero_s32();
93
94 int len0 = len & -v_int8::nlanes;
95 while (x < len0)
96 {
97 const int len_tmp = min(x + 256*v_int16::nlanes, len0);
98 v_int16 v_sum16 = vx_setzero_s16();
99 for (; x < len_tmp; x += v_int8::nlanes)
100 {
101 v_int16 v_src0, v_src1;
102 v_expand(vx_load(src0 + x), v_src0, v_src1);
103 v_sum16 += v_src0 + v_src1;
104 }
105 v_int32 v_half0, v_half1;
106 v_expand(v_sum16, v_half0, v_half1);
107 v_sum += v_half0 + v_half1;
108 }
109 if (x <= len - v_int16::nlanes)
110 {
111 v_int32 v_half0, v_half1;
112 v_expand(vx_load_expand(src0 + x), v_half0, v_half1);
113 v_sum += v_half0 + v_half1;
114 x += v_int16::nlanes;
115 }
116 if (x <= len - v_int32::nlanes)
117 {
118 v_sum += vx_load_expand_q(src0 + x);
119 x += v_int32::nlanes;
120 }
121
122 if (cn == 1)
123 *dst += v_reduce_sum(v_sum);
124 else
125 {
126 int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_int32::nlanes];
127 v_store_aligned(ar, v_sum);
128 for (int i = 0; i < v_int32::nlanes; ++i)
129 dst[i % cn] += ar[i];
130 }
131 v_cleanup();
132
133 return x / cn;
134 }
135 };
136
137 template <>
138 struct Sum_SIMD<ushort, int>
139 {
operator ()cv::Sum_SIMD140 int operator () (const ushort * src0, const uchar * mask, int * dst, int len, int cn) const
141 {
142 if (mask || (cn != 1 && cn != 2 && cn != 4))
143 return 0;
144 len *= cn;
145
146 int x = 0;
147 v_uint32 v_sum = vx_setzero_u32();
148
149 for (; x <= len - v_uint16::nlanes; x += v_uint16::nlanes)
150 {
151 v_uint32 v_src0, v_src1;
152 v_expand(vx_load(src0 + x), v_src0, v_src1);
153 v_sum += v_src0 + v_src1;
154 }
155 if (x <= len - v_uint32::nlanes)
156 {
157 v_sum += vx_load_expand(src0 + x);
158 x += v_uint32::nlanes;
159 }
160
161 if (cn == 1)
162 *dst += v_reduce_sum(v_sum);
163 else
164 {
165 uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_uint32::nlanes];
166 v_store_aligned(ar, v_sum);
167 for (int i = 0; i < v_uint32::nlanes; ++i)
168 dst[i % cn] += ar[i];
169 }
170 v_cleanup();
171
172 return x / cn;
173 }
174 };
175
176 template <>
177 struct Sum_SIMD<short, int>
178 {
operator ()cv::Sum_SIMD179 int operator () (const short * src0, const uchar * mask, int * dst, int len, int cn) const
180 {
181 if (mask || (cn != 1 && cn != 2 && cn != 4))
182 return 0;
183 len *= cn;
184
185 int x = 0;
186 v_int32 v_sum = vx_setzero_s32();
187
188 for (; x <= len - v_int16::nlanes; x += v_int16::nlanes)
189 {
190 v_int32 v_src0, v_src1;
191 v_expand(vx_load(src0 + x), v_src0, v_src1);
192 v_sum += v_src0 + v_src1;
193 }
194 if (x <= len - v_int32::nlanes)
195 {
196 v_sum += vx_load_expand(src0 + x);
197 x += v_int32::nlanes;
198 }
199
200 if (cn == 1)
201 *dst += v_reduce_sum(v_sum);
202 else
203 {
204 int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_int32::nlanes];
205 v_store_aligned(ar, v_sum);
206 for (int i = 0; i < v_int32::nlanes; ++i)
207 dst[i % cn] += ar[i];
208 }
209 v_cleanup();
210
211 return x / cn;
212 }
213 };
214
215 #if CV_SIMD_64F
// Vectorized row sum for unmasked 32-bit signed data with 1, 2 or 4
// interleaved channels; per-channel partial sums are accumulated in double
// precision in dst. Returns the number of pixels processed here.
template <>
struct Sum_SIMD<int, double>
{
    int operator () (const int * src0, const uchar * mask, double * dst, int len, int cn) const
    {
        // Only the unmasked case with cn dividing the lane count is handled.
        if (mask || (cn != 1 && cn != 2 && cn != 4))
            return 0;
        len *= cn;  // treat interleaved channels as one flat array

        int x = 0;
        // Two accumulators: v_sum0 collects the low halves of each int
        // vector after conversion to double, v_sum1 the high halves.
        v_float64 v_sum0 = vx_setzero_f64();
        v_float64 v_sum1 = vx_setzero_f64();

        // Process two int vectors per iteration.
        for (; x <= len - 2 * v_int32::nlanes; x += 2 * v_int32::nlanes)
        {
            v_int32 v_src0 = vx_load(src0 + x);
            v_int32 v_src1 = vx_load(src0 + x + v_int32::nlanes);
            v_sum0 += v_cvt_f64(v_src0) + v_cvt_f64(v_src1);
            v_sum1 += v_cvt_f64_high(v_src0) + v_cvt_f64_high(v_src1);
        }

#if CV_SIMD256 || CV_SIMD512
        // Wide registers: a single merged vector already has >= 4 double
        // lanes, enough to scatter into up to 4 channels via i % cn.
        double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_float64::nlanes];
        v_store_aligned(ar, v_sum0 + v_sum1);
        for (int i = 0; i < v_float64::nlanes; ++i)
            dst[i % cn] += ar[i];
#else
        // 128-bit registers hold only 2 doubles, so both accumulators are
        // stored back to back to cover up to 4 channels.
        double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * v_float64::nlanes];
        v_store_aligned(ar, v_sum0);
        v_store_aligned(ar + v_float64::nlanes, v_sum1);
        for (int i = 0; i < 2 * v_float64::nlanes; ++i)
            dst[i % cn] += ar[i];
#endif
        v_cleanup();

        // Elements past the last full 2-vector block are left to sum_().
        return x / cn;
    }
};
254
255 template <>
256 struct Sum_SIMD<float, double>
257 {
operator ()cv::Sum_SIMD258 int operator () (const float * src0, const uchar * mask, double * dst, int len, int cn) const
259 {
260 if (mask || (cn != 1 && cn != 2 && cn != 4))
261 return 0;
262 len *= cn;
263
264 int x = 0;
265 v_float64 v_sum0 = vx_setzero_f64();
266 v_float64 v_sum1 = vx_setzero_f64();
267
268 for (; x <= len - 2 * v_float32::nlanes; x += 2 * v_float32::nlanes)
269 {
270 v_float32 v_src0 = vx_load(src0 + x);
271 v_float32 v_src1 = vx_load(src0 + x + v_float32::nlanes);
272 v_sum0 += v_cvt_f64(v_src0) + v_cvt_f64(v_src1);
273 v_sum1 += v_cvt_f64_high(v_src0) + v_cvt_f64_high(v_src1);
274 }
275
276 #if CV_SIMD256 || CV_SIMD512
277 double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_float64::nlanes];
278 v_store_aligned(ar, v_sum0 + v_sum1);
279 for (int i = 0; i < v_float64::nlanes; ++i)
280 dst[i % cn] += ar[i];
281 #else
282 double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * v_float64::nlanes];
283 v_store_aligned(ar, v_sum0);
284 v_store_aligned(ar + v_float64::nlanes, v_sum1);
285 for (int i = 0; i < 2 * v_float64::nlanes; ++i)
286 dst[i % cn] += ar[i];
287 #endif
288 v_cleanup();
289
290 return x / cn;
291 }
292 };
293 #endif
294 #endif
295
296 template<typename T, typename ST>
// Row sum with optional mask.
// - Unmasked: lets the Sum_SIMD specialization fold a prefix of the row
//   into dst, then finishes the remaining pixels channel-by-channel;
//   returns len.
// - Masked: accumulates only pixels whose mask byte is non-zero and
//   returns the count of such pixels.
template<typename T, typename ST>
static int sum_(const T* src0, const uchar* mask, ST* dst, int len, int cn )
{
    const T* src = src0;
    if( !mask )
    {
        Sum_SIMD<T, ST> vop;
        // i = number of pixels the vector path already folded into dst;
        // k = number of leading "remainder" channels (cn modulo 4).
        int i = vop(src0, mask, dst, len, cn), k = cn % 4;
        src += i * cn;

        // Dedicated loops for the cn % 4 leading channels.
        if( k == 1 )
        {
            ST s0 = dst[0];

#if CV_ENABLE_UNROLLED
            for(; i <= len - 4; i += 4, src += cn*4 )
                s0 += src[0] + src[cn] + src[cn*2] + src[cn*3];
#endif
            for( ; i < len; i++, src += cn )
                s0 += src[0];
            dst[0] = s0;
        }
        else if( k == 2 )
        {
            ST s0 = dst[0], s1 = dst[1];
            for( ; i < len; i++, src += cn )
            {
                s0 += src[0];
                s1 += src[1];
            }
            dst[0] = s0;
            dst[1] = s1;
        }
        else if( k == 3 )
        {
            ST s0 = dst[0], s1 = dst[1], s2 = dst[2];
            for( ; i < len; i++, src += cn )
            {
                s0 += src[0];
                s1 += src[1];
                s2 += src[2];
            }
            dst[0] = s0;
            dst[1] = s1;
            dst[2] = s2;
        }

        // Remaining channels, four at a time.
        // NOTE(review): this loop reuses i, which the k == 1/2/3 branches
        // advance to len; for cn > 4 with cn % 4 != 0 the groups below
        // would therefore be skipped. Callers appear to pass cn <= 4 only
        // (where this is correct) — confirm before using larger cn.
        for( ; k < cn; k += 4 )
        {
            src = src0 + i*cn + k;
            ST s0 = dst[k], s1 = dst[k+1], s2 = dst[k+2], s3 = dst[k+3];
            for( ; i < len; i++, src += cn )
            {
                s0 += src[0]; s1 += src[1];
                s2 += src[2]; s3 += src[3];
            }
            dst[k] = s0;
            dst[k+1] = s1;
            dst[k+2] = s2;
            dst[k+3] = s3;
        }
        return len;
    }

    // Masked path: nzm counts pixels whose mask byte is non-zero.
    int i, nzm = 0;
    if( cn == 1 )
    {
        ST s = dst[0];
        for( i = 0; i < len; i++ )
            if( mask[i] )
            {
                s += src[i];
                nzm++;
            }
        dst[0] = s;
    }
    else if( cn == 3 )
    {
        // Dedicated fast path for 3-channel data.
        ST s0 = dst[0], s1 = dst[1], s2 = dst[2];
        for( i = 0; i < len; i++, src += 3 )
            if( mask[i] )
            {
                s0 += src[0];
                s1 += src[1];
                s2 += src[2];
                nzm++;
            }
        dst[0] = s0;
        dst[1] = s1;
        dst[2] = s2;
    }
    else
    {
        // Generic masked case for any other channel count.
        for( i = 0; i < len; i++, src += cn )
            if( mask[i] )
            {
                int k = 0;
#if CV_ENABLE_UNROLLED
                for( ; k <= cn - 4; k += 4 )
                {
                    ST s0, s1;
                    s0 = dst[k] + src[k];
                    s1 = dst[k+1] + src[k+1];
                    dst[k] = s0; dst[k+1] = s1;
                    s0 = dst[k+2] + src[k+2];
                    s1 = dst[k+3] + src[k+3];
                    dst[k+2] = s0; dst[k+3] = s1;
                }
#endif
                for( ; k < cn; k++ )
                    dst[k] += src[k];
                nzm++;
            }
    }
    return nzm;
}
412
413
sum8u(const uchar * src,const uchar * mask,int * dst,int len,int cn)414 static int sum8u( const uchar* src, const uchar* mask, int* dst, int len, int cn )
415 { CV_INSTRUMENT_REGION(); return sum_(src, mask, dst, len, cn); }
416
sum8s(const schar * src,const uchar * mask,int * dst,int len,int cn)417 static int sum8s( const schar* src, const uchar* mask, int* dst, int len, int cn )
418 { CV_INSTRUMENT_REGION(); return sum_(src, mask, dst, len, cn); }
419
sum16u(const ushort * src,const uchar * mask,int * dst,int len,int cn)420 static int sum16u( const ushort* src, const uchar* mask, int* dst, int len, int cn )
421 { CV_INSTRUMENT_REGION(); return sum_(src, mask, dst, len, cn); }
422
sum16s(const short * src,const uchar * mask,int * dst,int len,int cn)423 static int sum16s( const short* src, const uchar* mask, int* dst, int len, int cn )
424 { CV_INSTRUMENT_REGION(); return sum_(src, mask, dst, len, cn); }
425
sum32s(const int * src,const uchar * mask,double * dst,int len,int cn)426 static int sum32s( const int* src, const uchar* mask, double* dst, int len, int cn )
427 { CV_INSTRUMENT_REGION(); return sum_(src, mask, dst, len, cn); }
428
sum32f(const float * src,const uchar * mask,double * dst,int len,int cn)429 static int sum32f( const float* src, const uchar* mask, double* dst, int len, int cn )
430 { CV_INSTRUMENT_REGION(); return sum_(src, mask, dst, len, cn); }
431
sum64f(const double * src,const uchar * mask,double * dst,int len,int cn)432 static int sum64f( const double* src, const uchar* mask, double* dst, int len, int cn )
433 { CV_INSTRUMENT_REGION(); return sum_(src, mask, dst, len, cn); }
434
// Returns the row-sum worker for the given element depth.
// The table is indexed by the OpenCV depth code (CV_8U=0, CV_8S=1,
// CV_16U=2, CV_16S=3, CV_32S=4, CV_32F=5, CV_64F=6); the trailing 0
// marks depth 7 (CV_16F) as unsupported, so callers must check for NULL.
SumFunc getSumFunc(int depth)
{
    static SumFunc sumTab[] =
    {
        (SumFunc)GET_OPTIMIZED(sum8u), (SumFunc)sum8s,
        (SumFunc)sum16u, (SumFunc)sum16s,
        (SumFunc)sum32s,
        (SumFunc)GET_OPTIMIZED(sum32f), (SumFunc)sum64f,
        0
    };

    return sumTab[depth];
}
448
449 #endif
450
451 CV_CPU_OPTIMIZATION_NAMESPACE_END
452 } // namespace
453