1 /*
2  * By downloading, copying, installing or using the software you agree to this license.
3  * If you do not agree to this license, do not download, install,
4  * copy or use the software.
5  *
6  *
7  *                           License Agreement
8  *                For Open Source Computer Vision Library
9  *                        (3-clause BSD License)
10  *
11  * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
12  * Third party copyrights are property of their respective owners.
13  *
14  * Redistribution and use in source and binary forms, with or without modification,
15  * are permitted provided that the following conditions are met:
16  *
17  *   * Redistributions of source code must retain the above copyright notice,
18  *     this list of conditions and the following disclaimer.
19  *
20  *   * Redistributions in binary form must reproduce the above copyright notice,
21  *     this list of conditions and the following disclaimer in the documentation
22  *     and/or other materials provided with the distribution.
23  *
24  *   * Neither the names of the copyright holders nor the names of the contributors
25  *     may be used to endorse or promote products derived from this software
26  *     without specific prior written permission.
27  *
28  * This software is provided by the copyright holders and contributors "as is" and
29  * any express or implied warranties, including, but not limited to, the implied
30  * warranties of merchantability and fitness for a particular purpose are disclaimed.
31  * In no event shall copyright holders or contributors be liable for any direct,
32  * indirect, incidental, special, exemplary, or consequential damages
33  * (including, but not limited to, procurement of substitute goods or services;
34  * loss of use, data, or profits; or business interruption) however caused
35  * and on any theory of liability, whether in contract, strict liability,
36  * or tort (including negligence or otherwise) arising in any way out of
37  * the use of this software, even if advised of the possibility of such damage.
38  */
39 
40 #include "common.hpp"
41 #include "vtransform.hpp"
42 
43 namespace CAROTENE_NS {
44 
45 #ifdef CAROTENE_NEON
46 
47 namespace {
48 
49 using namespace internal;
50 
51 template <typename T> struct TypeTraits;
52 template <> struct TypeTraits< u8> { typedef u16 wide;                     typedef  u8 unsign; typedef  uint8x16_t vec128; };
53 template <> struct TypeTraits< s8> { typedef s16 wide;                     typedef  u8 unsign; typedef   int8x16_t vec128; };
54 template <> struct TypeTraits<u16> { typedef u32 wide; typedef  u8 narrow; typedef u16 unsign; typedef  uint16x8_t vec128; };
55 template <> struct TypeTraits<s16> { typedef s32 wide; typedef  s8 narrow; typedef u16 unsign; typedef   int16x8_t vec128; };
56 template <> struct TypeTraits<u32> { typedef u64 wide; typedef u16 narrow; typedef u32 unsign; typedef  uint32x4_t vec128; };
57 template <> struct TypeTraits<s32> { typedef s64 wide; typedef s16 narrow; typedef u32 unsign; typedef   int32x4_t vec128; };
58 template <> struct TypeTraits<f32> { typedef f64 wide;                                         typedef float32x4_t vec128; };
59 
60 template <typename T> struct wAdd
61 {
62     typedef T type;
63 
64     f32 alpha, beta, gamma;
65     typedef typename TypeTraits<T>::wide wtype;
66     wAdd<wtype> wideAdd;
wAddCAROTENE_NS::__anone3ae816c0111::wAdd67     wAdd(f32 _alpha, f32 _beta, f32 _gamma):
68         alpha(_alpha), beta(_beta), gamma(_gamma),
69         wideAdd(_alpha, _beta, _gamma) {}
70 
operator ()CAROTENE_NS::__anone3ae816c0111::wAdd71     void operator() (const typename VecTraits<T>::vec128 & v_src0,
72                      const typename VecTraits<T>::vec128 & v_src1,
73                      typename VecTraits<T>::vec128 & v_dst) const
74     {
75         typename VecTraits<wtype>::vec128 vrl, vrh;
76         wideAdd(vmovl( vget_low(v_src0)), vmovl( vget_low(v_src1)), vrl);
77         wideAdd(vmovl(vget_high(v_src0)), vmovl(vget_high(v_src1)), vrh);
78 
79         v_dst = vcombine(vqmovn(vrl), vqmovn(vrh));
80     }
81 
operator ()CAROTENE_NS::__anone3ae816c0111::wAdd82     void operator() (const typename VecTraits<T>::vec64 & v_src0,
83                      const typename VecTraits<T>::vec64 & v_src1,
84                      typename VecTraits<T>::vec64 & v_dst) const
85     {
86         typename VecTraits<wtype>::vec128 vr;
87         wideAdd(vmovl(v_src0), vmovl(v_src1), vr);
88 
89         v_dst = vqmovn(vr);
90     }
91 
operator ()CAROTENE_NS::__anone3ae816c0111::wAdd92     void operator() (const T * src0, const T * src1, T * dst) const
93     {
94         dst[0] = saturate_cast<T>(alpha*src0[0] + beta*src1[0] + gamma);
95     }
96 };
97 
98 template <> struct wAdd<s32>
99 {
100     typedef s32 type;
101 
102     f32 alpha, beta, gamma;
103     float32x4_t valpha, vbeta, vgamma;
wAddCAROTENE_NS::__anone3ae816c0111::wAdd104     wAdd(f32 _alpha, f32 _beta, f32 _gamma):
105         alpha(_alpha), beta(_beta), gamma(_gamma)
106     {
107         valpha = vdupq_n_f32(_alpha);
108         vbeta = vdupq_n_f32(_beta);
109         vgamma = vdupq_n_f32(_gamma + 0.5);
110     }
111 
operator ()CAROTENE_NS::__anone3ae816c0111::wAdd112     void operator() (const typename VecTraits<s32>::vec128 & v_src0,
113                      const typename VecTraits<s32>::vec128 & v_src1,
114                      typename VecTraits<s32>::vec128 & v_dst) const
115     {
116         float32x4_t vs1 = vcvtq_f32_s32(v_src0);
117         float32x4_t vs2 = vcvtq_f32_s32(v_src1);
118 
119         vs1 = vmlaq_f32(vgamma, vs1, valpha);
120         vs1 = vmlaq_f32(vs1, vs2, vbeta);
121         v_dst = vcvtq_s32_f32(vs1);
122     }
123 
operator ()CAROTENE_NS::__anone3ae816c0111::wAdd124     void operator() (const typename VecTraits<s32>::vec64 & v_src0,
125                      const typename VecTraits<s32>::vec64 & v_src1,
126                      typename VecTraits<s32>::vec64 & v_dst) const
127     {
128         float32x2_t vs1 = vcvt_f32_s32(v_src0);
129         float32x2_t vs2 = vcvt_f32_s32(v_src1);
130 
131         vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
132         vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
133         v_dst = vcvt_s32_f32(vs1);
134     }
135 
operator ()CAROTENE_NS::__anone3ae816c0111::wAdd136     void operator() (const s32 * src0, const s32 * src1, s32 * dst) const
137     {
138         dst[0] = saturate_cast<s32>(alpha*src0[0] + beta*src1[0] + gamma);
139     }
140 };
141 
142 template <> struct wAdd<u32>
143 {
144     typedef u32 type;
145 
146     f32 alpha, beta, gamma;
147     float32x4_t valpha, vbeta, vgamma;
wAddCAROTENE_NS::__anone3ae816c0111::wAdd148     wAdd(f32 _alpha, f32 _beta, f32 _gamma):
149         alpha(_alpha), beta(_beta), gamma(_gamma)
150     {
151         valpha = vdupq_n_f32(_alpha);
152         vbeta = vdupq_n_f32(_beta);
153         vgamma = vdupq_n_f32(_gamma + 0.5);
154     }
155 
operator ()CAROTENE_NS::__anone3ae816c0111::wAdd156     void operator() (const typename VecTraits<u32>::vec128 & v_src0,
157                      const typename VecTraits<u32>::vec128 & v_src1,
158                      typename VecTraits<u32>::vec128 & v_dst) const
159     {
160         float32x4_t vs1 = vcvtq_f32_u32(v_src0);
161         float32x4_t vs2 = vcvtq_f32_u32(v_src1);
162 
163         vs1 = vmlaq_f32(vgamma, vs1, valpha);
164         vs1 = vmlaq_f32(vs1, vs2, vbeta);
165         v_dst = vcvtq_u32_f32(vs1);
166     }
167 
operator ()CAROTENE_NS::__anone3ae816c0111::wAdd168     void operator() (const typename VecTraits<u32>::vec64 & v_src0,
169                      const typename VecTraits<u32>::vec64 & v_src1,
170                      typename VecTraits<u32>::vec64 & v_dst) const
171     {
172         float32x2_t vs1 = vcvt_f32_u32(v_src0);
173         float32x2_t vs2 = vcvt_f32_u32(v_src1);
174 
175         vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
176         vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
177         v_dst = vcvt_u32_f32(vs1);
178     }
179 
operator ()CAROTENE_NS::__anone3ae816c0111::wAdd180     void operator() (const u32 * src0, const u32 * src1, u32 * dst) const
181     {
182         dst[0] = saturate_cast<u32>(alpha*src0[0] + beta*src1[0] + gamma);
183     }
184 };
185 
186 template <> struct wAdd<f32>
187 {
188     typedef f32 type;
189 
190     f32 alpha, beta, gamma;
191     float32x4_t valpha, vbeta, vgamma;
wAddCAROTENE_NS::__anone3ae816c0111::wAdd192     wAdd(f32 _alpha, f32 _beta, f32 _gamma):
193         alpha(_alpha), beta(_beta), gamma(_gamma)
194     {
195         valpha = vdupq_n_f32(_alpha);
196         vbeta = vdupq_n_f32(_beta);
197         vgamma = vdupq_n_f32(_gamma + 0.5);
198     }
199 
operator ()CAROTENE_NS::__anone3ae816c0111::wAdd200     void operator() (const typename VecTraits<f32>::vec128 & v_src0,
201                      const typename VecTraits<f32>::vec128 & v_src1,
202                      typename VecTraits<f32>::vec128 & v_dst) const
203     {
204         float32x4_t vs1 = vmlaq_f32(vgamma, v_src0, valpha);
205         v_dst = vmlaq_f32(vs1, v_src1, vbeta);
206     }
207 
operator ()CAROTENE_NS::__anone3ae816c0111::wAdd208     void operator() (const typename VecTraits<f32>::vec64 & v_src0,
209                      const typename VecTraits<f32>::vec64 & v_src1,
210                      typename VecTraits<f32>::vec64 & v_dst) const
211     {
212         float32x2_t vs1 = vmla_f32(vget_low(vgamma), v_src0, vget_low(valpha));
213         v_dst = vmla_f32(vs1, v_src1, vget_low(vbeta));
214 
215     }
216 
operator ()CAROTENE_NS::__anone3ae816c0111::wAdd217     void operator() (const f32 * src0, const f32 * src1, f32 * dst) const
218     {
219         dst[0] = alpha*src0[0] + beta*src1[0] + gamma;
220     }
221 };
222 
223 } // namespace
224 
/*
 * NEON build: defines the addWeighted() overload for element type `type`.
 * The generated function checks the configuration, builds the matching wAdd
 * functor from (alpha, beta, gamma) and passes it to internal::vtransform
 * together with the two sources and the destination.
 */
#define IMPL_ADDWEIGHTED(type)                                        \
void addWeighted(const Size2D &size,                                  \
                 const type * src0Base, ptrdiff_t src0Stride,         \
                 const type * src1Base, ptrdiff_t src1Stride,         \
                 type * dstBase, ptrdiff_t dstStride,                 \
                 f32 alpha, f32 beta, f32 gamma)                      \
{                                                                     \
    internal::assertSupportedConfiguration();                         \
    wAdd<type> weightedSum(alpha, beta, gamma);                       \
    internal::vtransform(size,                                        \
                         src0Base, src0Stride,                        \
                         src1Base, src1Stride,                        \
                         dstBase, dstStride,                          \
                         weightedSum);                                \
}
242 
243 #else
244 
/*
 * Non-NEON build: defines an addWeighted() overload for element type `type`
 * that ignores all of its arguments and only calls
 * internal::assertSupportedConfiguration().
 */
#define IMPL_ADDWEIGHTED(type)                                              \
void addWeighted(const Size2D &          /* size */,                        \
                 const type *            /* src0Base */,                    \
                 ptrdiff_t               /* src0Stride */,                  \
                 const type *            /* src1Base */,                    \
                 ptrdiff_t               /* src1Stride */,                  \
                 type *                  /* dstBase */,                     \
                 ptrdiff_t               /* dstStride */,                   \
                 f32 /* alpha */, f32 /* beta */, f32 /* gamma */)          \
{                                                                           \
    internal::assertSupportedConfiguration();                               \
}
254 
255 #endif
256 
// Emit one addWeighted() overload per supported element type, using whichever
// IMPL_ADDWEIGHTED definition was selected above (NEON implementation or stub).
IMPL_ADDWEIGHTED(u8)
IMPL_ADDWEIGHTED(s8)
IMPL_ADDWEIGHTED(u16)
IMPL_ADDWEIGHTED(s16)
IMPL_ADDWEIGHTED(u32)
IMPL_ADDWEIGHTED(s32)
IMPL_ADDWEIGHTED(f32)
264 
265 } // namespace CAROTENE_NS
266