1 /*
2  * By downloading, copying, installing or using the software you agree to this license.
3  * If you do not agree to this license, do not download, install,
4  * copy or use the software.
5  *
6  *
7  *                           License Agreement
8  *                For Open Source Computer Vision Library
9  *                        (3-clause BSD License)
10  *
11  * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
12  * Third party copyrights are property of their respective owners.
13  *
14  * Redistribution and use in source and binary forms, with or without modification,
15  * are permitted provided that the following conditions are met:
16  *
17  *   * Redistributions of source code must retain the above copyright notice,
18  *     this list of conditions and the following disclaimer.
19  *
20  *   * Redistributions in binary form must reproduce the above copyright notice,
21  *     this list of conditions and the following disclaimer in the documentation
22  *     and/or other materials provided with the distribution.
23  *
24  *   * Neither the names of the copyright holders nor the names of the contributors
25  *     may be used to endorse or promote products derived from this software
26  *     without specific prior written permission.
27  *
28  * This software is provided by the copyright holders and contributors "as is" and
29  * any express or implied warranties, including, but not limited to, the implied
30  * warranties of merchantability and fitness for a particular purpose are disclaimed.
31  * In no event shall copyright holders or contributors be liable for any direct,
32  * indirect, incidental, special, exemplary, or consequential damages
33  * (including, but not limited to, procurement of substitute goods or services;
34  * loss of use, data, or profits; or business interruption) however caused
35  * and on any theory of liability, whether in contract, strict liability,
36  * or tort (including negligence or otherwise) arising in any way out of
37  * the use of this software, even if advised of the possibility of such damage.
38  */
39 
40 
41 #include "common.hpp"
42 #include "vtransform.hpp"
43 
44 #include <cstring>
45 
46 namespace CAROTENE_NS {
47 
accumulate(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,s16 * dstBase,ptrdiff_t dstStride)48 void accumulate(const Size2D &size,
49                 const u8 *srcBase, ptrdiff_t srcStride,
50                 s16 *dstBase, ptrdiff_t dstStride)
51 {
52     internal::assertSupportedConfiguration();
53 #ifdef CAROTENE_NEON
54     size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
55     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
56 
57     for (size_t i = 0; i < size.height; ++i)
58     {
59         const u8* src = internal::getRowPtr(srcBase, srcStride, i);
60         s16* dst = internal::getRowPtr(dstBase, dstStride, i);
61         size_t j = 0;
62 
63         for (; j < roiw16; j += 16)
64         {
65             internal::prefetch(src + j);
66             internal::prefetch(dst + j);
67             uint8x16_t v_src = vld1q_u8(src + j);
68             int16x8_t v_dst0 = vld1q_s16(dst + j);
69             int16x8_t v_dst1 = vld1q_s16(dst + j + 8);
70             int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
71             int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
72             v_dst0 = vqaddq_s16(v_dst0, v_src0);
73             v_dst1 = vqaddq_s16(v_dst1, v_src1);
74             vst1q_s16(dst + j, v_dst0);
75             vst1q_s16(dst + j + 8, v_dst1);
76         }
77         for (; j < roiw8; j += 8)
78         {
79             uint8x8_t v_src = vld1_u8(src + j);
80             int16x8_t v_src16 = vreinterpretq_s16_u16(vmovl_u8(v_src));
81             int16x8_t v_dst = vld1q_s16(dst + j);
82             v_dst = vqaddq_s16(v_dst, v_src16);
83             vst1q_s16(dst + j, v_dst);
84         }
85 
86         for (; j < size.width; j++)
87             dst[j] = internal::saturate_cast<s16>(src[j] + dst[j]);
88     }
89 #else
90     (void)size;
91     (void)srcBase;
92     (void)srcStride;
93     (void)dstBase;
94     (void)dstStride;
95 #endif
96 }
97 
98 #ifdef CAROTENE_NEON
99 
100 namespace {
101 
102 template <int shift>
accumulateSquareConst(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,s16 * dstBase,ptrdiff_t dstStride)103 void accumulateSquareConst(const Size2D &size,
104                            const u8 *srcBase, ptrdiff_t srcStride,
105                            s16 *dstBase, ptrdiff_t dstStride)
106 {
107     size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
108     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
109 
110     for (size_t i = 0; i < size.height; ++i)
111     {
112         const u8* src = internal::getRowPtr(srcBase, srcStride, i);
113         s16* dst = internal::getRowPtr(dstBase, dstStride, i);
114         size_t j = 0;
115 
116         for (; j < roiw16; j += 16)
117         {
118             internal::prefetch(src + j);
119             internal::prefetch(dst + j);
120             uint8x16_t v_src = vld1q_u8(src + j);
121             int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8);
122             int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
123             int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
124 
125             int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0);
126             v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst0))),
127                                   vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst0))));
128 
129             v_srclo = vget_low_s16(v_src1);
130             v_srchi = vget_high_s16(v_src1);
131             v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst1))),
132                                   vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst1))));
133 
134             vst1q_s16(dst + j, v_dst0);
135             vst1q_s16(dst + j + 8, v_dst1);
136         }
137         for (; j < roiw8; j += 8)
138         {
139             int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
140             int16x8_t v_dst = vld1q_s16(dst + j);
141             int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src);
142             v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst))),
143                                  vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst))));
144             vst1q_s16(dst + j, v_dst);
145         }
146 
147         for (; j < size.width; j++)
148         {
149             s32 srcVal = src[j];
150             dst[j] = internal::saturate_cast<s16>(dst[j] + ((srcVal * srcVal) >> shift));
151         }
152     }
153 }
154 
155 template <>
accumulateSquareConst(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,s16 * dstBase,ptrdiff_t dstStride)156 void accumulateSquareConst<0>(const Size2D &size,
157                               const u8 *srcBase, ptrdiff_t srcStride,
158                               s16 *dstBase, ptrdiff_t dstStride)
159 {
160     size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
161     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
162 
163     for (size_t i = 0; i < size.height; ++i)
164     {
165         const u8* src = internal::getRowPtr(srcBase, srcStride, i);
166         s16* dst = internal::getRowPtr(dstBase, dstStride, i);
167         size_t j = 0;
168 
169         for (; j < roiw16; j += 16)
170         {
171             internal::prefetch(src + j);
172             internal::prefetch(dst + j);
173             uint8x16_t v_src = vld1q_u8(src + j);
174             int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8);
175             int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
176             int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
177 
178             int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0);
179             v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst0))),
180                                   vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst0))));
181 
182             v_srclo = vget_low_s16(v_src1);
183             v_srchi = vget_high_s16(v_src1);
184             v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst1))),
185                                   vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst1))));
186 
187             vst1q_s16(dst + j, v_dst0);
188             vst1q_s16(dst + j + 8, v_dst1);
189         }
190         for (; j < roiw8; j += 8)
191         {
192             int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
193             int16x8_t v_dst = vld1q_s16(dst + j);
194             int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src);
195             v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst))),
196                                  vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst))));
197             vst1q_s16(dst + j, v_dst);
198         }
199 
200         for (; j < size.width; j++)
201         {
202             s32 srcVal = src[j];
203             dst[j] = internal::saturate_cast<s16>(dst[j] + srcVal * srcVal);
204         }
205     }
206 }
207 
208 typedef void (* accumulateSquareConstFunc)(const Size2D &size,
209                                            const u8 *srcBase, ptrdiff_t srcStride,
210                                            s16 *dstBase, ptrdiff_t dstStride);
211 
212 } // namespace
213 
214 #endif
215 
accumulateSquare(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,s16 * dstBase,ptrdiff_t dstStride,u32 shift)216 void accumulateSquare(const Size2D &size,
217                       const u8 *srcBase, ptrdiff_t srcStride,
218                       s16 *dstBase, ptrdiff_t dstStride,
219                       u32 shift)
220 {
221     if (shift >= 16)
222     {
223         for (size_t i = 0; i < size.height; ++i)
224         {
225             s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
226             std::memset(dst, 0, sizeof(s16) * size.width);
227         }
228         return;
229     }
230 
231     internal::assertSupportedConfiguration();
232 
233 #ifdef CAROTENE_NEON
234     // this ugly contruction is needed to avoid:
235     // /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
236     // return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1);
237 
238     accumulateSquareConstFunc funcs[16] =
239     {
240         accumulateSquareConst<0>,
241         accumulateSquareConst<1>,
242         accumulateSquareConst<2>,
243         accumulateSquareConst<3>,
244         accumulateSquareConst<4>,
245         accumulateSquareConst<5>,
246         accumulateSquareConst<6>,
247         accumulateSquareConst<7>,
248         accumulateSquareConst<8>,
249         accumulateSquareConst<9>,
250         accumulateSquareConst<10>,
251         accumulateSquareConst<11>,
252         accumulateSquareConst<12>,
253         accumulateSquareConst<13>,
254         accumulateSquareConst<14>,
255         accumulateSquareConst<15>
256     }, func = funcs[shift];
257 
258     func(size, srcBase, srcStride, dstBase, dstStride);
259 #else
260     (void)size;
261     (void)srcBase;
262     (void)srcStride;
263     (void)dstBase;
264     (void)dstStride;
265     (void)shift;
266 #endif
267 }
268 
269 #ifdef CAROTENE_NEON
270 
271 namespace {
272 
273 struct AccumulateWeightedHalf
274 {
275     typedef u8 type;
276 
operator ()CAROTENE_NS::__anona2d356970211::AccumulateWeightedHalf277     void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
278                      uint8x16_t & v_dst) const
279     {
280         v_dst = vhaddq_u8(v_src0, v_src1);
281     }
282 
operator ()CAROTENE_NS::__anona2d356970211::AccumulateWeightedHalf283     void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1,
284                      uint8x8_t & v_dst) const
285     {
286         v_dst = vhadd_u8(v_src0, v_src1);
287     }
288 
operator ()CAROTENE_NS::__anona2d356970211::AccumulateWeightedHalf289     void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
290     {
291         dst[0] = ((u16)(src0[0]) + src1[0]) >> 1;
292     }
293 };
294 
295 struct AccumulateWeighted
296 {
297     typedef u8 type;
298 
299     float alpha, beta;
300     float32x4_t v_alpha, v_beta;
301 
AccumulateWeightedCAROTENE_NS::__anona2d356970211::AccumulateWeighted302     explicit AccumulateWeighted(float _alpha) :
303         alpha(_alpha), beta(1 - _alpha)
304     {
305         v_alpha = vdupq_n_f32(alpha);
306         v_beta = vdupq_n_f32(beta);
307     }
308 
operator ()CAROTENE_NS::__anona2d356970211::AccumulateWeighted309     void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
310                      uint8x16_t & v_dst) const
311     {
312         uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0));
313         uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1));
314         float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta),
315                                         v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))));
316         float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta),
317                                         v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))));
318         uint16x8_t v_dst0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
319                                          vmovn_u32(vcvtq_u32_f32(v_dst1f)));
320 
321         v_src0_p = vmovl_u8(vget_high_u8(v_src0));
322         v_src1_p = vmovl_u8(vget_high_u8(v_src1));
323         v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta),
324                             v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))));
325         v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta),
326                             v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))));
327         uint16x8_t v_dst1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
328                                          vmovn_u32(vcvtq_u32_f32(v_dst1f)));
329 
330         v_dst = vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1));
331     }
332 
operator ()CAROTENE_NS::__anona2d356970211::AccumulateWeighted333     void operator() (const uint8x8_t & _v_src0, const uint8x8_t & _v_src1,
334                      uint8x8_t & v_dst) const
335     {
336         uint16x8_t v_src0 = vmovl_u8(_v_src0), v_src1 = vmovl_u8(_v_src1);
337 
338         float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), v_beta),
339                                         v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))));
340         float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), v_beta),
341                                         v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))));
342         uint16x8_t _v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
343                                         vmovn_u32(vcvtq_u32_f32(v_dst1f)));
344 
345         v_dst = vmovn_u16(_v_dst);
346     }
347 
operator ()CAROTENE_NS::__anona2d356970211::AccumulateWeighted348     void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
349     {
350         dst[0] = beta * src1[0] + alpha * src0[0];
351     }
352 };
353 
354 } // namespace
355 
356 #endif
357 
accumulateWeighted(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,u8 * dstBase,ptrdiff_t dstStride,f32 alpha)358 void accumulateWeighted(const Size2D &size,
359                         const u8 *srcBase, ptrdiff_t srcStride,
360                         u8 *dstBase, ptrdiff_t dstStride,
361                         f32 alpha)
362 {
363     if (alpha == 0.0f)
364         return;
365     if (alpha == 1.0f)
366     {
367         for (size_t i = 0; i < size.height; ++i)
368         {
369             const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
370             u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
371             std::memcpy(dst, src, sizeof(u8) * size.width);
372         }
373         return;
374     }
375 
376     internal::assertSupportedConfiguration();
377 
378 #ifdef CAROTENE_NEON
379     // in this case we can use the following scheme:
380     // dst[p] = (src[p] + dst[p]) >> 1
381     // which is faster
382     if (alpha == 0.5f)
383     {
384         internal::vtransform(size,
385                              srcBase, srcStride,
386                              dstBase, dstStride,
387                              dstBase, dstStride,
388                              AccumulateWeightedHalf());
389 
390         return;
391     }
392 
393     internal::vtransform(size,
394                      srcBase, srcStride,
395                      dstBase, dstStride,
396                      dstBase, dstStride,
397                      AccumulateWeighted(alpha));
398 #else
399     (void)size;
400     (void)srcBase;
401     (void)srcStride;
402     (void)dstBase;
403     (void)dstStride;
404     (void)alpha;
405 #endif
406 }
407 
408 } //namespace CAROTENE_NS
409