1 /*
2  * By downloading, copying, installing or using the software you agree to this license.
3  * If you do not agree to this license, do not download, install,
4  * copy or use the software.
5  *
6  *
7  *                           License Agreement
8  *                For Open Source Computer Vision Library
9  *                        (3-clause BSD License)
10  *
11  * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
12  * Third party copyrights are property of their respective owners.
13  *
14  * Redistribution and use in source and binary forms, with or without modification,
15  * are permitted provided that the following conditions are met:
16  *
17  *   * Redistributions of source code must retain the above copyright notice,
18  *     this list of conditions and the following disclaimer.
19  *
20  *   * Redistributions in binary form must reproduce the above copyright notice,
21  *     this list of conditions and the following disclaimer in the documentation
22  *     and/or other materials provided with the distribution.
23  *
24  *   * Neither the names of the copyright holders nor the names of the contributors
25  *     may be used to endorse or promote products derived from this software
26  *     without specific prior written permission.
27  *
28  * This software is provided by the copyright holders and contributors "as is" and
29  * any express or implied warranties, including, but not limited to, the implied
30  * warranties of merchantability and fitness for a particular purpose are disclaimed.
31  * In no event shall copyright holders or contributors be liable for any direct,
32  * indirect, incidental, special, exemplary, or consequential damages
33  * (including, but not limited to, procurement of substitute goods or services;
34  * loss of use, data, or profits; or business interruption) however caused
35  * and on any theory of liability, whether in contract, strict liability,
36  * or tort (including negligence or otherwise) arising in any way out of
37  * the use of this software, even if advised of the possibility of such damage.
38  */
39 
40 #include "common.hpp"
41 
42 #include <cstring>
43 
44 namespace CAROTENE_NS {
45 
46 #ifdef CAROTENE_NEON
47 
48 namespace {
49 
50 template <int shift>
lshiftConst(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,s16 * dstBase,ptrdiff_t dstStride)51 void lshiftConst(const Size2D &size,
52                  const u8 * srcBase, ptrdiff_t srcStride,
53                  s16 * dstBase, ptrdiff_t dstStride)
54 {
55     size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
56     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
57 
58     for (size_t i = 0; i < size.height; ++i)
59     {
60         const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
61         s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
62         size_t j = 0;
63 
64         for (; j < roiw16; j += 16)
65         {
66             internal::prefetch(src + j);
67             uint8x16_t v_src = vld1q_u8(src + j);
68             int16x8_t v_dst0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
69             int16x8_t v_dst1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
70 
71             vst1q_s16(dst + j, vshlq_n_s16(v_dst0, shift));
72             vst1q_s16(dst + j + 8, vshlq_n_s16(v_dst1, shift));
73         }
74         for (; j < roiw8; j += 8)
75         {
76             int16x8_t v_dst = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
77             vst1q_s16(dst + j, vshlq_n_s16(v_dst, shift));
78         }
79 
80         for (; j < size.width; j++)
81         {
82             dst[j] = ((s16)src[j] << shift);
83         }
84     }
85 }
86 
87 template <>
lshiftConst(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,s16 * dstBase,ptrdiff_t dstStride)88 void lshiftConst<0>(const Size2D &size,
89                     const u8 * srcBase, ptrdiff_t srcStride,
90                     s16 * dstBase, ptrdiff_t dstStride)
91 {
92     size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
93     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
94 
95     for (size_t i = 0; i < size.height; ++i)
96     {
97         const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
98         s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
99         size_t j = 0;
100 
101         for (; j < roiw16; j += 16)
102         {
103             internal::prefetch(src + j);
104             uint8x16_t v_src = vld1q_u8(src + j);
105             int16x8_t v_dst0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
106             int16x8_t v_dst1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
107 
108             vst1q_s16(dst + j, v_dst0);
109             vst1q_s16(dst + j + 8, v_dst1);
110         }
111         for (; j < roiw8; j += 8)
112         {
113             int16x8_t v_dst = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
114             vst1q_s16(dst + j, v_dst);
115         }
116 
117         for (; j < size.width; j++)
118         {
119             dst[j] = (s16)src[j];
120         }
121     }
122 }
123 
124 template <int shift>
rshiftConst(const Size2D & size,const s16 * srcBase,ptrdiff_t srcStride,u8 * dstBase,ptrdiff_t dstStride,CONVERT_POLICY cpolicy)125 void rshiftConst(const Size2D &size,
126                  const s16 * srcBase, ptrdiff_t srcStride,
127                  u8 * dstBase, ptrdiff_t dstStride,
128                  CONVERT_POLICY cpolicy)
129 {
130     size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
131     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
132 
133     for (size_t i = 0; i < size.height; ++i)
134     {
135         const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
136         u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
137         size_t j = 0;
138 
139         if (cpolicy == CONVERT_POLICY_SATURATE)
140         {
141             for (; j < roiw16; j += 16)
142             {
143                 internal::prefetch(src + j);
144                 int16x8_t v_src0 = vshrq_n_s16(vld1q_s16(src + j), shift),
145                           v_src1 = vshrq_n_s16(vld1q_s16(src + j + 8), shift);
146                 uint8x16_t v_dst = vcombine_u8(vqmovun_s16(v_src0),
147                                                vqmovun_s16(v_src1));
148                 vst1q_u8(dst + j, v_dst);
149             }
150             for (; j < roiw8; j += 8)
151             {
152                 int16x8_t v_src = vshrq_n_s16(vld1q_s16(src + j), shift);
153                 vst1_u8(dst + j, vqmovun_s16(v_src));
154             }
155 
156             for (; j < size.width; j++)
157             {
158                 dst[j] = internal::saturate_cast<u8>((src[j] >> shift));
159             }
160         }
161         else // CONVERT_POLICY_WRAP
162         {
163             for (; j < roiw16; j += 16)
164             {
165                 internal::prefetch(src + j);
166                 int16x8_t v_src0 = vshrq_n_s16(vld1q_s16(src + j), shift),
167                           v_src1 = vshrq_n_s16(vld1q_s16(src + j + 8), shift);
168                 int8x16_t v_dst = vcombine_s8(vmovn_s16(v_src0),
169                                               vmovn_s16(v_src1));
170                 vst1q_u8(dst + j, vreinterpretq_u8_s8(v_dst));
171             }
172             for (; j < roiw8; j += 8)
173             {
174                 int16x8_t v_src = vshrq_n_s16(vld1q_s16(src + j), shift);
175                 vst1_u8(dst + j, vreinterpret_u8_s8(vmovn_s16(v_src)));
176             }
177 
178             for (; j < size.width; j++)
179             {
180                 dst[j] = (u8)((src[j] >> shift));
181             }
182         }
183     }
184 }
185 
186 template <>
rshiftConst(const Size2D & size,const s16 * srcBase,ptrdiff_t srcStride,u8 * dstBase,ptrdiff_t dstStride,CONVERT_POLICY cpolicy)187 void rshiftConst<0>(const Size2D &size,
188                     const s16 * srcBase, ptrdiff_t srcStride,
189                     u8 * dstBase, ptrdiff_t dstStride,
190                     CONVERT_POLICY cpolicy)
191 {
192     size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
193     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
194 
195     for (size_t i = 0; i < size.height; ++i)
196     {
197         const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
198         u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
199         size_t j = 0;
200 
201         if (cpolicy == CONVERT_POLICY_SATURATE)
202         {
203             for (; j < roiw16; j += 16)
204             {
205                 internal::prefetch(src + j);
206                 int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
207                 uint8x16_t v_dst = vcombine_u8(vqmovun_s16(v_src0), vqmovun_s16(v_src1));
208                 vst1q_u8(dst + j, v_dst);
209             }
210             for (; j < roiw8; j += 8)
211             {
212                 int16x8_t v_src = vld1q_s16(src + j);
213                 vst1_u8(dst + j, vqmovun_s16(v_src));
214             }
215 
216             for (; j < size.width; j++)
217             {
218                 dst[j] = internal::saturate_cast<u8>(src[j]);
219             }
220         }
221         else // CONVERT_POLICY_WRAP
222         {
223             for (; j < roiw16; j += 16)
224             {
225                 internal::prefetch(src + j);
226                 int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
227                 int8x16_t v_dst = vcombine_s8(vmovn_s16(v_src0), vmovn_s16(v_src1));
228                 vst1q_u8(dst + j, vreinterpretq_u8_s8(v_dst));
229             }
230             for (; j < roiw8; j += 8)
231             {
232                 int16x8_t v_src = vld1q_s16(src + j);
233                 vst1_u8(dst + j, vreinterpret_u8_s8(vmovn_s16(v_src)));
234             }
235 
236             for (; j < size.width; j++)
237             {
238                 dst[j] = (u8)src[j];
239             }
240         }
241     }
242 }
243 
// Pointer to an lshiftConst<N> instantiation. lshift() stores one pointer per
// shift amount in a table and indexes it with the runtime shift value.
typedef void (* lshiftConstFunc)(const Size2D &size,
                                const u8 * srcBase, ptrdiff_t srcStride,
                                s16 * dstBase, ptrdiff_t dstStride);

// Pointer to an rshiftConst<N> instantiation. rshift() stores one pointer per
// shift amount in a table and indexes it with the runtime shift value; the
// conversion policy is forwarded as the last argument.
typedef void (* rshiftConstFunc)(const Size2D &size,
                                const s16 * srcBase, ptrdiff_t srcStride,
                                u8 * dstBase, ptrdiff_t dstStride,
                                CONVERT_POLICY cpolicy);
252 
253 } // namespace
254 
255 #endif
256 
lshift(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,s16 * dstBase,ptrdiff_t dstStride,u32 shift)257 void lshift(const Size2D &size,
258             const u8 * srcBase, ptrdiff_t srcStride,
259             s16 * dstBase, ptrdiff_t dstStride,
260             u32 shift)
261 {
262     internal::assertSupportedConfiguration();
263 
264 #ifdef CAROTENE_NEON
265     if (shift >= 16u)
266     {
267         for (size_t i = 0; i < size.height; ++i)
268         {
269             s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
270             std::memset(dst, 0, sizeof(s16) * size.width);
271         }
272         return;
273     }
274 
275     // this ugly contruction is needed to avoid:
276     // /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
277     // return (int16x8_t)__builtin_neon_vshl_nv8hi (__a, __b, 1);
278 
279     lshiftConstFunc funcs[16] =
280     {
281         lshiftConst<0>,
282         lshiftConst<1>,
283         lshiftConst<2>,
284         lshiftConst<3>,
285         lshiftConst<4>,
286         lshiftConst<5>,
287         lshiftConst<6>,
288         lshiftConst<7>,
289         lshiftConst<8>,
290         lshiftConst<9>,
291         lshiftConst<10>,
292         lshiftConst<11>,
293         lshiftConst<12>,
294         lshiftConst<13>,
295         lshiftConst<14>,
296         lshiftConst<15>
297     }, func = funcs[shift];
298 
299     func(size, srcBase, srcStride, dstBase, dstStride);
300 #else
301     (void)size;
302     (void)srcBase;
303     (void)srcStride;
304     (void)dstBase;
305     (void)dstStride;
306     (void)shift;
307 #endif
308 }
309 
rshift(const Size2D & size,const s16 * srcBase,ptrdiff_t srcStride,u8 * dstBase,ptrdiff_t dstStride,u32 shift,CONVERT_POLICY cpolicy)310 void rshift(const Size2D &size,
311             const s16 * srcBase, ptrdiff_t srcStride,
312             u8 * dstBase, ptrdiff_t dstStride,
313             u32 shift, CONVERT_POLICY cpolicy)
314 {
315     internal::assertSupportedConfiguration();
316 
317 #ifdef CAROTENE_NEON
318     if (shift >= 16)
319     {
320         if (cpolicy == CONVERT_POLICY_WRAP)
321         {
322             size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
323             size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
324             int16x8_t v_zero = vdupq_n_s16(0);
325 
326             for (size_t i = 0; i < size.height; ++i)
327             {
328                 const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
329                 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
330                 size_t j = 0;
331 
332                 for (; j < roiw16; j += 16)
333                 {
334                     internal::prefetch(src + j);
335                     int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
336                     uint8x16_t v_dst = vcombine_u8(vmovn_u16(vcltq_s16(v_src0, v_zero)),
337                                                    vmovn_u16(vcltq_s16(v_src1, v_zero)));
338                     vst1q_u8(dst + j, v_dst);
339                 }
340                 for (; j < roiw8; j += 8)
341                 {
342                     int16x8_t v_src = vld1q_s16(src + j);
343                     vst1_u8(dst + j, vmovn_u16(vcltq_s16(v_src, v_zero)));
344                 }
345 
346                 for (; j < size.width; j++)
347                 {
348                     dst[j] = src[j] >= 0 ? 0 : 255;
349                 }
350             }
351         }
352         else
353         {
354             for (size_t i = 0; i < size.height; ++i)
355             {
356                 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
357                 std::memset(dst, 0, sizeof(u8) * size.width);
358             }
359         }
360         return;
361     }
362 
363     // this ugly contruction is needed to avoid:
364     // /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
365     // return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1);
366 
367     rshiftConstFunc funcs[16] =
368     {
369         rshiftConst<0>,
370         rshiftConst<1>,
371         rshiftConst<2>,
372         rshiftConst<3>,
373         rshiftConst<4>,
374         rshiftConst<5>,
375         rshiftConst<6>,
376         rshiftConst<7>,
377         rshiftConst<8>,
378         rshiftConst<9>,
379         rshiftConst<10>,
380         rshiftConst<11>,
381         rshiftConst<12>,
382         rshiftConst<13>,
383         rshiftConst<14>,
384         rshiftConst<15>
385     }, func = funcs[shift];
386 
387     func(size, srcBase, srcStride, dstBase, dstStride, cpolicy);
388 #else
389     (void)size;
390     (void)srcBase;
391     (void)srcStride;
392     (void)dstBase;
393     (void)dstStride;
394     (void)shift;
395     (void)cpolicy;
396 #endif
397 }
398 
399 } // namespace CAROTENE_NS
400