1 /*
2 * By downloading, copying, installing or using the software you agree to this license.
3 * If you do not agree to this license, do not download, install,
4 * copy or use the software.
5 *
6 *
7 * License Agreement
8 * For Open Source Computer Vision Library
9 * (3-clause BSD License)
10 *
11 * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
12 * Third party copyrights are property of their respective owners.
13 *
14 * Redistribution and use in source and binary forms, with or without modification,
15 * are permitted provided that the following conditions are met:
16 *
17 * * Redistributions of source code must retain the above copyright notice,
18 * this list of conditions and the following disclaimer.
19 *
20 * * Redistributions in binary form must reproduce the above copyright notice,
21 * this list of conditions and the following disclaimer in the documentation
22 * and/or other materials provided with the distribution.
23 *
24 * * Neither the names of the copyright holders nor the names of the contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * This software is provided by the copyright holders and contributors "as is" and
29 * any express or implied warranties, including, but not limited to, the implied
30 * warranties of merchantability and fitness for a particular purpose are disclaimed.
31 * In no event shall copyright holders or contributors be liable for any direct,
32 * indirect, incidental, special, exemplary, or consequential damages
33 * (including, but not limited to, procurement of substitute goods or services;
34 * loss of use, data, or profits; or business interruption) however caused
35 * and on any theory of liability, whether in contract, strict liability,
36 * or tort (including negligence or otherwise) arising in any way out of
37 * the use of this software, even if advised of the possibility of such damage.
38 */
39
40 #include "common.hpp"
41
42 #include <cstring>
43
44 namespace CAROTENE_NS {
45
46 #ifdef CAROTENE_NEON
47
48 namespace {
49
50 template <int shift>
lshiftConst(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,s16 * dstBase,ptrdiff_t dstStride)51 void lshiftConst(const Size2D &size,
52 const u8 * srcBase, ptrdiff_t srcStride,
53 s16 * dstBase, ptrdiff_t dstStride)
54 {
55 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
56 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
57
58 for (size_t i = 0; i < size.height; ++i)
59 {
60 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
61 s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
62 size_t j = 0;
63
64 for (; j < roiw16; j += 16)
65 {
66 internal::prefetch(src + j);
67 uint8x16_t v_src = vld1q_u8(src + j);
68 int16x8_t v_dst0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
69 int16x8_t v_dst1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
70
71 vst1q_s16(dst + j, vshlq_n_s16(v_dst0, shift));
72 vst1q_s16(dst + j + 8, vshlq_n_s16(v_dst1, shift));
73 }
74 for (; j < roiw8; j += 8)
75 {
76 int16x8_t v_dst = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
77 vst1q_s16(dst + j, vshlq_n_s16(v_dst, shift));
78 }
79
80 for (; j < size.width; j++)
81 {
82 dst[j] = ((s16)src[j] << shift);
83 }
84 }
85 }
86
87 template <>
lshiftConst(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,s16 * dstBase,ptrdiff_t dstStride)88 void lshiftConst<0>(const Size2D &size,
89 const u8 * srcBase, ptrdiff_t srcStride,
90 s16 * dstBase, ptrdiff_t dstStride)
91 {
92 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
93 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
94
95 for (size_t i = 0; i < size.height; ++i)
96 {
97 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
98 s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
99 size_t j = 0;
100
101 for (; j < roiw16; j += 16)
102 {
103 internal::prefetch(src + j);
104 uint8x16_t v_src = vld1q_u8(src + j);
105 int16x8_t v_dst0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
106 int16x8_t v_dst1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
107
108 vst1q_s16(dst + j, v_dst0);
109 vst1q_s16(dst + j + 8, v_dst1);
110 }
111 for (; j < roiw8; j += 8)
112 {
113 int16x8_t v_dst = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
114 vst1q_s16(dst + j, v_dst);
115 }
116
117 for (; j < size.width; j++)
118 {
119 dst[j] = (s16)src[j];
120 }
121 }
122 }
123
124 template <int shift>
rshiftConst(const Size2D & size,const s16 * srcBase,ptrdiff_t srcStride,u8 * dstBase,ptrdiff_t dstStride,CONVERT_POLICY cpolicy)125 void rshiftConst(const Size2D &size,
126 const s16 * srcBase, ptrdiff_t srcStride,
127 u8 * dstBase, ptrdiff_t dstStride,
128 CONVERT_POLICY cpolicy)
129 {
130 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
131 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
132
133 for (size_t i = 0; i < size.height; ++i)
134 {
135 const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
136 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
137 size_t j = 0;
138
139 if (cpolicy == CONVERT_POLICY_SATURATE)
140 {
141 for (; j < roiw16; j += 16)
142 {
143 internal::prefetch(src + j);
144 int16x8_t v_src0 = vshrq_n_s16(vld1q_s16(src + j), shift),
145 v_src1 = vshrq_n_s16(vld1q_s16(src + j + 8), shift);
146 uint8x16_t v_dst = vcombine_u8(vqmovun_s16(v_src0),
147 vqmovun_s16(v_src1));
148 vst1q_u8(dst + j, v_dst);
149 }
150 for (; j < roiw8; j += 8)
151 {
152 int16x8_t v_src = vshrq_n_s16(vld1q_s16(src + j), shift);
153 vst1_u8(dst + j, vqmovun_s16(v_src));
154 }
155
156 for (; j < size.width; j++)
157 {
158 dst[j] = internal::saturate_cast<u8>((src[j] >> shift));
159 }
160 }
161 else // CONVERT_POLICY_WRAP
162 {
163 for (; j < roiw16; j += 16)
164 {
165 internal::prefetch(src + j);
166 int16x8_t v_src0 = vshrq_n_s16(vld1q_s16(src + j), shift),
167 v_src1 = vshrq_n_s16(vld1q_s16(src + j + 8), shift);
168 int8x16_t v_dst = vcombine_s8(vmovn_s16(v_src0),
169 vmovn_s16(v_src1));
170 vst1q_u8(dst + j, vreinterpretq_u8_s8(v_dst));
171 }
172 for (; j < roiw8; j += 8)
173 {
174 int16x8_t v_src = vshrq_n_s16(vld1q_s16(src + j), shift);
175 vst1_u8(dst + j, vreinterpret_u8_s8(vmovn_s16(v_src)));
176 }
177
178 for (; j < size.width; j++)
179 {
180 dst[j] = (u8)((src[j] >> shift));
181 }
182 }
183 }
184 }
185
186 template <>
rshiftConst(const Size2D & size,const s16 * srcBase,ptrdiff_t srcStride,u8 * dstBase,ptrdiff_t dstStride,CONVERT_POLICY cpolicy)187 void rshiftConst<0>(const Size2D &size,
188 const s16 * srcBase, ptrdiff_t srcStride,
189 u8 * dstBase, ptrdiff_t dstStride,
190 CONVERT_POLICY cpolicy)
191 {
192 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
193 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
194
195 for (size_t i = 0; i < size.height; ++i)
196 {
197 const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
198 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
199 size_t j = 0;
200
201 if (cpolicy == CONVERT_POLICY_SATURATE)
202 {
203 for (; j < roiw16; j += 16)
204 {
205 internal::prefetch(src + j);
206 int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
207 uint8x16_t v_dst = vcombine_u8(vqmovun_s16(v_src0), vqmovun_s16(v_src1));
208 vst1q_u8(dst + j, v_dst);
209 }
210 for (; j < roiw8; j += 8)
211 {
212 int16x8_t v_src = vld1q_s16(src + j);
213 vst1_u8(dst + j, vqmovun_s16(v_src));
214 }
215
216 for (; j < size.width; j++)
217 {
218 dst[j] = internal::saturate_cast<u8>(src[j]);
219 }
220 }
221 else // CONVERT_POLICY_WRAP
222 {
223 for (; j < roiw16; j += 16)
224 {
225 internal::prefetch(src + j);
226 int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
227 int8x16_t v_dst = vcombine_s8(vmovn_s16(v_src0), vmovn_s16(v_src1));
228 vst1q_u8(dst + j, vreinterpretq_u8_s8(v_dst));
229 }
230 for (; j < roiw8; j += 8)
231 {
232 int16x8_t v_src = vld1q_s16(src + j);
233 vst1_u8(dst + j, vreinterpret_u8_s8(vmovn_s16(v_src)));
234 }
235
236 for (; j < size.width; j++)
237 {
238 dst[j] = (u8)src[j];
239 }
240 }
241 }
242 }
243
244 typedef void (* lshiftConstFunc)(const Size2D &size,
245 const u8 * srcBase, ptrdiff_t srcStride,
246 s16 * dstBase, ptrdiff_t dstStride);
247
248 typedef void (* rshiftConstFunc)(const Size2D &size,
249 const s16 * srcBase, ptrdiff_t srcStride,
250 u8 * dstBase, ptrdiff_t dstStride,
251 CONVERT_POLICY cpolicy);
252
253 } // namespace
254
255 #endif
256
lshift(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,s16 * dstBase,ptrdiff_t dstStride,u32 shift)257 void lshift(const Size2D &size,
258 const u8 * srcBase, ptrdiff_t srcStride,
259 s16 * dstBase, ptrdiff_t dstStride,
260 u32 shift)
261 {
262 internal::assertSupportedConfiguration();
263
264 #ifdef CAROTENE_NEON
265 if (shift >= 16u)
266 {
267 for (size_t i = 0; i < size.height; ++i)
268 {
269 s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
270 std::memset(dst, 0, sizeof(s16) * size.width);
271 }
272 return;
273 }
274
275 // this ugly contruction is needed to avoid:
276 // /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
277 // return (int16x8_t)__builtin_neon_vshl_nv8hi (__a, __b, 1);
278
279 lshiftConstFunc funcs[16] =
280 {
281 lshiftConst<0>,
282 lshiftConst<1>,
283 lshiftConst<2>,
284 lshiftConst<3>,
285 lshiftConst<4>,
286 lshiftConst<5>,
287 lshiftConst<6>,
288 lshiftConst<7>,
289 lshiftConst<8>,
290 lshiftConst<9>,
291 lshiftConst<10>,
292 lshiftConst<11>,
293 lshiftConst<12>,
294 lshiftConst<13>,
295 lshiftConst<14>,
296 lshiftConst<15>
297 }, func = funcs[shift];
298
299 func(size, srcBase, srcStride, dstBase, dstStride);
300 #else
301 (void)size;
302 (void)srcBase;
303 (void)srcStride;
304 (void)dstBase;
305 (void)dstStride;
306 (void)shift;
307 #endif
308 }
309
rshift(const Size2D & size,const s16 * srcBase,ptrdiff_t srcStride,u8 * dstBase,ptrdiff_t dstStride,u32 shift,CONVERT_POLICY cpolicy)310 void rshift(const Size2D &size,
311 const s16 * srcBase, ptrdiff_t srcStride,
312 u8 * dstBase, ptrdiff_t dstStride,
313 u32 shift, CONVERT_POLICY cpolicy)
314 {
315 internal::assertSupportedConfiguration();
316
317 #ifdef CAROTENE_NEON
318 if (shift >= 16)
319 {
320 if (cpolicy == CONVERT_POLICY_WRAP)
321 {
322 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
323 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
324 int16x8_t v_zero = vdupq_n_s16(0);
325
326 for (size_t i = 0; i < size.height; ++i)
327 {
328 const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
329 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
330 size_t j = 0;
331
332 for (; j < roiw16; j += 16)
333 {
334 internal::prefetch(src + j);
335 int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
336 uint8x16_t v_dst = vcombine_u8(vmovn_u16(vcltq_s16(v_src0, v_zero)),
337 vmovn_u16(vcltq_s16(v_src1, v_zero)));
338 vst1q_u8(dst + j, v_dst);
339 }
340 for (; j < roiw8; j += 8)
341 {
342 int16x8_t v_src = vld1q_s16(src + j);
343 vst1_u8(dst + j, vmovn_u16(vcltq_s16(v_src, v_zero)));
344 }
345
346 for (; j < size.width; j++)
347 {
348 dst[j] = src[j] >= 0 ? 0 : 255;
349 }
350 }
351 }
352 else
353 {
354 for (size_t i = 0; i < size.height; ++i)
355 {
356 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
357 std::memset(dst, 0, sizeof(u8) * size.width);
358 }
359 }
360 return;
361 }
362
363 // this ugly contruction is needed to avoid:
364 // /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
365 // return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1);
366
367 rshiftConstFunc funcs[16] =
368 {
369 rshiftConst<0>,
370 rshiftConst<1>,
371 rshiftConst<2>,
372 rshiftConst<3>,
373 rshiftConst<4>,
374 rshiftConst<5>,
375 rshiftConst<6>,
376 rshiftConst<7>,
377 rshiftConst<8>,
378 rshiftConst<9>,
379 rshiftConst<10>,
380 rshiftConst<11>,
381 rshiftConst<12>,
382 rshiftConst<13>,
383 rshiftConst<14>,
384 rshiftConst<15>
385 }, func = funcs[shift];
386
387 func(size, srcBase, srcStride, dstBase, dstStride, cpolicy);
388 #else
389 (void)size;
390 (void)srcBase;
391 (void)srcStride;
392 (void)dstBase;
393 (void)dstStride;
394 (void)shift;
395 (void)cpolicy;
396 #endif
397 }
398
399 } // namespace CAROTENE_NS
400