1 /*
2 * By downloading, copying, installing or using the software you agree to this license.
3 * If you do not agree to this license, do not download, install,
4 * copy or use the software.
5 *
6 *
7 * License Agreement
8 * For Open Source Computer Vision Library
9 * (3-clause BSD License)
10 *
11 * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
12 * Third party copyrights are property of their respective owners.
13 *
14 * Redistribution and use in source and binary forms, with or without modification,
15 * are permitted provided that the following conditions are met:
16 *
17 * * Redistributions of source code must retain the above copyright notice,
18 * this list of conditions and the following disclaimer.
19 *
20 * * Redistributions in binary form must reproduce the above copyright notice,
21 * this list of conditions and the following disclaimer in the documentation
22 * and/or other materials provided with the distribution.
23 *
24 * * Neither the names of the copyright holders nor the names of the contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * This software is provided by the copyright holders and contributors "as is" and
29 * any express or implied warranties, including, but not limited to, the implied
30 * warranties of merchantability and fitness for a particular purpose are disclaimed.
31 * In no event shall copyright holders or contributors be liable for any direct,
32 * indirect, incidental, special, exemplary, or consequential damages
33 * (including, but not limited to, procurement of substitute goods or services;
34 * loss of use, data, or profits; or business interruption) however caused
35 * and on any theory of liability, whether in contract, strict liability,
36 * or tort (including negligence or otherwise) arising in any way out of
37 * the use of this software, even if advised of the possibility of such damage.
38 */
39
40
41 #include "common.hpp"
42 #include "vtransform.hpp"
43
44 #include <cstring>
45
46 namespace CAROTENE_NS {
47
accumulate(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,s16 * dstBase,ptrdiff_t dstStride)48 void accumulate(const Size2D &size,
49 const u8 *srcBase, ptrdiff_t srcStride,
50 s16 *dstBase, ptrdiff_t dstStride)
51 {
52 internal::assertSupportedConfiguration();
53 #ifdef CAROTENE_NEON
54 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
55 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
56
57 for (size_t i = 0; i < size.height; ++i)
58 {
59 const u8* src = internal::getRowPtr(srcBase, srcStride, i);
60 s16* dst = internal::getRowPtr(dstBase, dstStride, i);
61 size_t j = 0;
62
63 for (; j < roiw16; j += 16)
64 {
65 internal::prefetch(src + j);
66 internal::prefetch(dst + j);
67 uint8x16_t v_src = vld1q_u8(src + j);
68 int16x8_t v_dst0 = vld1q_s16(dst + j);
69 int16x8_t v_dst1 = vld1q_s16(dst + j + 8);
70 int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
71 int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
72 v_dst0 = vqaddq_s16(v_dst0, v_src0);
73 v_dst1 = vqaddq_s16(v_dst1, v_src1);
74 vst1q_s16(dst + j, v_dst0);
75 vst1q_s16(dst + j + 8, v_dst1);
76 }
77 for (; j < roiw8; j += 8)
78 {
79 uint8x8_t v_src = vld1_u8(src + j);
80 int16x8_t v_src16 = vreinterpretq_s16_u16(vmovl_u8(v_src));
81 int16x8_t v_dst = vld1q_s16(dst + j);
82 v_dst = vqaddq_s16(v_dst, v_src16);
83 vst1q_s16(dst + j, v_dst);
84 }
85
86 for (; j < size.width; j++)
87 dst[j] = internal::saturate_cast<s16>(src[j] + dst[j]);
88 }
89 #else
90 (void)size;
91 (void)srcBase;
92 (void)srcStride;
93 (void)dstBase;
94 (void)dstStride;
95 #endif
96 }
97
98 #ifdef CAROTENE_NEON
99
100 namespace {
101
102 template <int shift>
accumulateSquareConst(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,s16 * dstBase,ptrdiff_t dstStride)103 void accumulateSquareConst(const Size2D &size,
104 const u8 *srcBase, ptrdiff_t srcStride,
105 s16 *dstBase, ptrdiff_t dstStride)
106 {
107 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
108 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
109
110 for (size_t i = 0; i < size.height; ++i)
111 {
112 const u8* src = internal::getRowPtr(srcBase, srcStride, i);
113 s16* dst = internal::getRowPtr(dstBase, dstStride, i);
114 size_t j = 0;
115
116 for (; j < roiw16; j += 16)
117 {
118 internal::prefetch(src + j);
119 internal::prefetch(dst + j);
120 uint8x16_t v_src = vld1q_u8(src + j);
121 int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8);
122 int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
123 int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
124
125 int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0);
126 v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst0))),
127 vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst0))));
128
129 v_srclo = vget_low_s16(v_src1);
130 v_srchi = vget_high_s16(v_src1);
131 v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst1))),
132 vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst1))));
133
134 vst1q_s16(dst + j, v_dst0);
135 vst1q_s16(dst + j + 8, v_dst1);
136 }
137 for (; j < roiw8; j += 8)
138 {
139 int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
140 int16x8_t v_dst = vld1q_s16(dst + j);
141 int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src);
142 v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst))),
143 vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst))));
144 vst1q_s16(dst + j, v_dst);
145 }
146
147 for (; j < size.width; j++)
148 {
149 s32 srcVal = src[j];
150 dst[j] = internal::saturate_cast<s16>(dst[j] + ((srcVal * srcVal) >> shift));
151 }
152 }
153 }
154
155 template <>
accumulateSquareConst(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,s16 * dstBase,ptrdiff_t dstStride)156 void accumulateSquareConst<0>(const Size2D &size,
157 const u8 *srcBase, ptrdiff_t srcStride,
158 s16 *dstBase, ptrdiff_t dstStride)
159 {
160 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
161 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
162
163 for (size_t i = 0; i < size.height; ++i)
164 {
165 const u8* src = internal::getRowPtr(srcBase, srcStride, i);
166 s16* dst = internal::getRowPtr(dstBase, dstStride, i);
167 size_t j = 0;
168
169 for (; j < roiw16; j += 16)
170 {
171 internal::prefetch(src + j);
172 internal::prefetch(dst + j);
173 uint8x16_t v_src = vld1q_u8(src + j);
174 int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8);
175 int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
176 int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
177
178 int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0);
179 v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst0))),
180 vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst0))));
181
182 v_srclo = vget_low_s16(v_src1);
183 v_srchi = vget_high_s16(v_src1);
184 v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst1))),
185 vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst1))));
186
187 vst1q_s16(dst + j, v_dst0);
188 vst1q_s16(dst + j + 8, v_dst1);
189 }
190 for (; j < roiw8; j += 8)
191 {
192 int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
193 int16x8_t v_dst = vld1q_s16(dst + j);
194 int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src);
195 v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst))),
196 vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst))));
197 vst1q_s16(dst + j, v_dst);
198 }
199
200 for (; j < size.width; j++)
201 {
202 s32 srcVal = src[j];
203 dst[j] = internal::saturate_cast<s16>(dst[j] + srcVal * srcVal);
204 }
205 }
206 }
207
208 typedef void (* accumulateSquareConstFunc)(const Size2D &size,
209 const u8 *srcBase, ptrdiff_t srcStride,
210 s16 *dstBase, ptrdiff_t dstStride);
211
212 } // namespace
213
214 #endif
215
accumulateSquare(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,s16 * dstBase,ptrdiff_t dstStride,u32 shift)216 void accumulateSquare(const Size2D &size,
217 const u8 *srcBase, ptrdiff_t srcStride,
218 s16 *dstBase, ptrdiff_t dstStride,
219 u32 shift)
220 {
221 if (shift >= 16)
222 {
223 for (size_t i = 0; i < size.height; ++i)
224 {
225 s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
226 std::memset(dst, 0, sizeof(s16) * size.width);
227 }
228 return;
229 }
230
231 internal::assertSupportedConfiguration();
232
233 #ifdef CAROTENE_NEON
234 // this ugly contruction is needed to avoid:
235 // /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
236 // return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1);
237
238 accumulateSquareConstFunc funcs[16] =
239 {
240 accumulateSquareConst<0>,
241 accumulateSquareConst<1>,
242 accumulateSquareConst<2>,
243 accumulateSquareConst<3>,
244 accumulateSquareConst<4>,
245 accumulateSquareConst<5>,
246 accumulateSquareConst<6>,
247 accumulateSquareConst<7>,
248 accumulateSquareConst<8>,
249 accumulateSquareConst<9>,
250 accumulateSquareConst<10>,
251 accumulateSquareConst<11>,
252 accumulateSquareConst<12>,
253 accumulateSquareConst<13>,
254 accumulateSquareConst<14>,
255 accumulateSquareConst<15>
256 }, func = funcs[shift];
257
258 func(size, srcBase, srcStride, dstBase, dstStride);
259 #else
260 (void)size;
261 (void)srcBase;
262 (void)srcStride;
263 (void)dstBase;
264 (void)dstStride;
265 (void)shift;
266 #endif
267 }
268
269 #ifdef CAROTENE_NEON
270
271 namespace {
272
273 struct AccumulateWeightedHalf
274 {
275 typedef u8 type;
276
operator ()CAROTENE_NS::__anona2d356970211::AccumulateWeightedHalf277 void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
278 uint8x16_t & v_dst) const
279 {
280 v_dst = vhaddq_u8(v_src0, v_src1);
281 }
282
operator ()CAROTENE_NS::__anona2d356970211::AccumulateWeightedHalf283 void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1,
284 uint8x8_t & v_dst) const
285 {
286 v_dst = vhadd_u8(v_src0, v_src1);
287 }
288
operator ()CAROTENE_NS::__anona2d356970211::AccumulateWeightedHalf289 void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
290 {
291 dst[0] = ((u16)(src0[0]) + src1[0]) >> 1;
292 }
293 };
294
295 struct AccumulateWeighted
296 {
297 typedef u8 type;
298
299 float alpha, beta;
300 float32x4_t v_alpha, v_beta;
301
AccumulateWeightedCAROTENE_NS::__anona2d356970211::AccumulateWeighted302 explicit AccumulateWeighted(float _alpha) :
303 alpha(_alpha), beta(1 - _alpha)
304 {
305 v_alpha = vdupq_n_f32(alpha);
306 v_beta = vdupq_n_f32(beta);
307 }
308
operator ()CAROTENE_NS::__anona2d356970211::AccumulateWeighted309 void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
310 uint8x16_t & v_dst) const
311 {
312 uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0));
313 uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1));
314 float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta),
315 v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))));
316 float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta),
317 v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))));
318 uint16x8_t v_dst0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
319 vmovn_u32(vcvtq_u32_f32(v_dst1f)));
320
321 v_src0_p = vmovl_u8(vget_high_u8(v_src0));
322 v_src1_p = vmovl_u8(vget_high_u8(v_src1));
323 v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta),
324 v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))));
325 v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta),
326 v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))));
327 uint16x8_t v_dst1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
328 vmovn_u32(vcvtq_u32_f32(v_dst1f)));
329
330 v_dst = vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1));
331 }
332
operator ()CAROTENE_NS::__anona2d356970211::AccumulateWeighted333 void operator() (const uint8x8_t & _v_src0, const uint8x8_t & _v_src1,
334 uint8x8_t & v_dst) const
335 {
336 uint16x8_t v_src0 = vmovl_u8(_v_src0), v_src1 = vmovl_u8(_v_src1);
337
338 float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), v_beta),
339 v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))));
340 float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), v_beta),
341 v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))));
342 uint16x8_t _v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
343 vmovn_u32(vcvtq_u32_f32(v_dst1f)));
344
345 v_dst = vmovn_u16(_v_dst);
346 }
347
operator ()CAROTENE_NS::__anona2d356970211::AccumulateWeighted348 void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
349 {
350 dst[0] = beta * src1[0] + alpha * src0[0];
351 }
352 };
353
354 } // namespace
355
356 #endif
357
accumulateWeighted(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,u8 * dstBase,ptrdiff_t dstStride,f32 alpha)358 void accumulateWeighted(const Size2D &size,
359 const u8 *srcBase, ptrdiff_t srcStride,
360 u8 *dstBase, ptrdiff_t dstStride,
361 f32 alpha)
362 {
363 if (alpha == 0.0f)
364 return;
365 if (alpha == 1.0f)
366 {
367 for (size_t i = 0; i < size.height; ++i)
368 {
369 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
370 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
371 std::memcpy(dst, src, sizeof(u8) * size.width);
372 }
373 return;
374 }
375
376 internal::assertSupportedConfiguration();
377
378 #ifdef CAROTENE_NEON
379 // in this case we can use the following scheme:
380 // dst[p] = (src[p] + dst[p]) >> 1
381 // which is faster
382 if (alpha == 0.5f)
383 {
384 internal::vtransform(size,
385 srcBase, srcStride,
386 dstBase, dstStride,
387 dstBase, dstStride,
388 AccumulateWeightedHalf());
389
390 return;
391 }
392
393 internal::vtransform(size,
394 srcBase, srcStride,
395 dstBase, dstStride,
396 dstBase, dstStride,
397 AccumulateWeighted(alpha));
398 #else
399 (void)size;
400 (void)srcBase;
401 (void)srcStride;
402 (void)dstBase;
403 (void)dstStride;
404 (void)alpha;
405 #endif
406 }
407
408 } //namespace CAROTENE_NS
409