1 /*
2 * By downloading, copying, installing or using the software you agree to this license.
3 * If you do not agree to this license, do not download, install,
4 * copy or use the software.
5 *
6 *
7 * License Agreement
8 * For Open Source Computer Vision Library
9 * (3-clause BSD License)
10 *
11 * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
12 * Third party copyrights are property of their respective owners.
13 *
14 * Redistribution and use in source and binary forms, with or without modification,
15 * are permitted provided that the following conditions are met:
16 *
17 * * Redistributions of source code must retain the above copyright notice,
18 * this list of conditions and the following disclaimer.
19 *
20 * * Redistributions in binary form must reproduce the above copyright notice,
21 * this list of conditions and the following disclaimer in the documentation
22 * and/or other materials provided with the distribution.
23 *
24 * * Neither the names of the copyright holders nor the names of the contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * This software is provided by the copyright holders and contributors "as is" and
29 * any express or implied warranties, including, but not limited to, the implied
30 * warranties of merchantability and fitness for a particular purpose are disclaimed.
31 * In no event shall copyright holders or contributors be liable for any direct,
32 * indirect, incidental, special, exemplary, or consequential damages
33 * (including, but not limited to, procurement of substitute goods or services;
34 * loss of use, data, or profits; or business interruption) however caused
35 * and on any theory of liability, whether in contract, strict liability,
36 * or tort (including negligence or otherwise) arising in any way out of
37 * the use of this software, even if advised of the possibility of such damage.
38 */
39
40 #include "common.hpp"
41 #include "vtransform.hpp"
42
43 namespace CAROTENE_NS {
44
extract2(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,u8 * dstBase,ptrdiff_t dstStride,u32 coi)45 void extract2(const Size2D &size,
46 const u8 * srcBase, ptrdiff_t srcStride,
47 u8 * dstBase, ptrdiff_t dstStride,
48 u32 coi)
49 {
50 internal::assertSupportedConfiguration();
51 #ifdef CAROTENE_NEON
52 #ifndef __ANDROID__
53 size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
54 #endif
55 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
56
57 for (size_t i = 0u; i < size.height; ++i)
58 {
59 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
60 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
61 size_t sj = 0u, dj = 0u;
62
63 #ifndef __ANDROID__
64 for (; dj < roiw32; sj += 64, dj += 32)
65 {
66 internal::prefetch(src + sj);
67
68 uint8x16x2_t v_src = vld2q_u8(src + sj);
69 vst1q_u8(dst + dj, v_src.val[coi]);
70
71 v_src = vld2q_u8(src + sj + 32);
72 vst1q_u8(dst + dj + 16, v_src.val[coi]);
73 }
74 #endif
75
76 for (; dj < roiw8; sj += 16, dj += 8)
77 {
78 uint8x8x2_t v_src = vld2_u8(src + sj);
79 vst1_u8(dst + dj, v_src.val[coi]);
80 }
81
82 for (; dj < size.width; sj += 2, ++dj)
83 {
84 dst[dj] = src[sj + coi];
85 }
86 }
87 #else
88 (void)size;
89 (void)srcBase;
90 (void)srcStride;
91 (void)dstBase;
92 (void)dstStride;
93 (void)coi;
94 #endif
95 }
96
extract3(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,u8 * dstBase,ptrdiff_t dstStride,u32 coi)97 void extract3(const Size2D &size,
98 const u8 * srcBase, ptrdiff_t srcStride,
99 u8 * dstBase, ptrdiff_t dstStride,
100 u32 coi)
101 {
102 internal::assertSupportedConfiguration();
103 #ifdef CAROTENE_NEON
104 #ifndef __ANDROID__
105 size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
106 #endif
107 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
108
109 for (size_t i = 0u; i < size.height; ++i)
110 {
111 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
112 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
113 size_t sj = 0u, dj = 0u;
114
115 #ifndef __ANDROID__
116 for (; dj < roiw32; sj += 96, dj += 32)
117 {
118 internal::prefetch(src + sj);
119
120 uint8x16x3_t v_src = vld3q_u8(src + sj);
121 vst1q_u8(dst + dj, v_src.val[coi]);
122
123 v_src = vld3q_u8(src + sj + 48);
124 vst1q_u8(dst + dj + 16, v_src.val[coi]);
125 }
126 #endif
127
128 for (; dj < roiw8; sj += 24, dj += 8)
129 {
130 uint8x8x3_t v_src = vld3_u8(src + sj);
131 vst1_u8(dst + dj, v_src.val[coi]);
132 }
133
134 for (; dj < size.width; sj += 3, ++dj)
135 {
136 dst[dj] = src[sj + coi];
137 }
138 }
139 #else
140 (void)size;
141 (void)srcBase;
142 (void)srcStride;
143 (void)dstBase;
144 (void)dstStride;
145 (void)coi;
146 #endif
147 }
148
extract4(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,u8 * dstBase,ptrdiff_t dstStride,u32 coi)149 void extract4(const Size2D &size,
150 const u8 * srcBase, ptrdiff_t srcStride,
151 u8 * dstBase, ptrdiff_t dstStride,
152 u32 coi)
153 {
154 internal::assertSupportedConfiguration();
155 #ifdef CAROTENE_NEON
156 #ifndef __ANDROID__
157 size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
158 #endif
159 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
160
161 for (size_t i = 0u; i < size.height; ++i)
162 {
163 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
164 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
165 size_t sj = 0u, dj = 0u;
166
167 #ifndef __ANDROID__
168 for (; dj < roiw32; sj += 128, dj += 32)
169 {
170 internal::prefetch(src + sj);
171
172 uint8x16x4_t v_src = vld4q_u8(src + sj);
173 vst1q_u8(dst + dj, v_src.val[coi]);
174
175 v_src = vld4q_u8(src + sj + 64);
176 vst1q_u8(dst + dj + 16, v_src.val[coi]);
177 }
178 #endif
179
180 for (; dj < roiw8; sj += 32, dj += 8)
181 {
182 uint8x8x4_t v_src = vld4_u8(src + sj);
183 vst1_u8(dst + dj, v_src.val[coi]);
184 }
185
186 for (; dj < size.width; sj += 4, ++dj)
187 {
188 dst[dj] = src[sj + coi];
189 }
190 }
191 #else
192 (void)size;
193 (void)srcBase;
194 (void)srcStride;
195 (void)dstBase;
196 (void)dstStride;
197 (void)coi;
198 #endif
199 }
200
// ---------------------------------------------------------------------------
// Macro machinery that stamps out the split2/split3/split4 kernels below.
// Each splitN de-interleaves an N-channel row into N separate planes.
// Preprocessor text is expansion-order sensitive; code kept byte-identical.
// ---------------------------------------------------------------------------

// Expand macro##_LINE once per destination plane index 0..N-1.
#define FILL_LINES2(macro,type) \
    macro##_LINE(type,0) \
    macro##_LINE(type,1)
#define FILL_LINES3(macro,type) \
    FILL_LINES2(macro,type) \
    macro##_LINE(type,2)
#define FILL_LINES4(macro,type) \
    FILL_LINES3(macro,type) \
    macro##_LINE(type,3)

// Formal-argument fragment: appends ", type * dstNBase, ptrdiff_t dstNStride".
#define FARG_LINE(type, n) , type * dst##n##Base, ptrdiff_t dst##n##Stride

#ifdef CAROTENE_NEON

// Per-row destination pointer for plane n.
#define VROW_LINE(type, n) type * dst##n = internal::getRowPtr(dst##n##Base, dst##n##Stride, i);
// 128-bit vector store of de-interleaved plane n.
#define VST1Q_LINE(type, n) vst1q_##type(dst##n + dj, v_src.val[n]);
// 64-bit vector store of de-interleaved plane n.
#define VST1_LINE(type, n) vst1_##type(dst##n + dj, v_src.val[n]);
// Scalar store of plane n, used by the tail loop.
#define SST_LINE(type, n) dst##n[dj] = src[sj + n];

// Shift/add multiplies by the channel count (2, 3, 4).
#define MUL2(val) (val << 1)
#define MUL3(val) (MUL2(val) + val)
#define MUL4(val) (val << 2)

// Continuity predicates: all destination strides equal the source stride,
// so the 2-D ROI can be collapsed into one long row. Note each expands to a
// dangling "&&" completed by the condition that follows at the use site.
#define CONTDST2 srcStride == dst0Stride && \
                 srcStride == dst1Stride &&
#define CONTDST3 srcStride == dst0Stride && \
                 srcStride == dst1Stride && \
                 srcStride == dst2Stride &&
#define CONTDST4 srcStride == dst0Stride && \
                 srcStride == dst1Stride && \
                 srcStride == dst2Stride && \
                 srcStride == dst3Stride &&

// 32-bit GCC older than 4.7 produced poor code for the vldN/vstN intrinsics,
// so the quad-register inner step is written in inline asm for it.
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)

#define SPLIT_ASM2(sgn, bits) __asm__ ( \
                                          "vld2." #bits " {d0, d2}, [%[in0]] \n\t" \
                                          "vld2." #bits " {d1, d3}, [%[in1]] \n\t" \
                                          "vst1." #bits " {d0-d1}, [%[out0]] \n\t" \
                                          "vst1." #bits " {d2-d3}, [%[out1]] \n\t" \
                                          : \
                                          : [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), \
                                            [in0] "r" (src + sj), [in1] "r" (src + sj + MUL2(8)/sizeof(sgn##bits)) \
                                          : "d0","d1","d2","d3" \
                                      );
#define SPLIT_ASM3(sgn, bits) __asm__ ( \
                                          "vld3." #bits " {d0, d2, d4}, [%[in0]] \n\t" \
                                          "vld3." #bits " {d1, d3, d5}, [%[in1]] \n\t" \
                                          "vst1." #bits " {d0-d1}, [%[out0]] \n\t" \
                                          "vst1." #bits " {d2-d3}, [%[out1]] \n\t" \
                                          "vst1." #bits " {d4-d5}, [%[out2]] \n\t" \
                                          : \
                                          : [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), \
                                            [in0] "r" (src + sj), [in1] "r" (src + sj + MUL3(8)/sizeof(sgn##bits)) \
                                          : "d0","d1","d2","d3","d4","d5" \
                                      );
#define SPLIT_ASM4(sgn, bits) __asm__ ( \
                                          "vld4." #bits " {d0, d2, d4, d6}, [%[in0]] \n\t" \
                                          "vld4." #bits " {d1, d3, d5, d7}, [%[in1]] \n\t" \
                                          "vst1." #bits " {d0-d1}, [%[out0]] \n\t" \
                                          "vst1." #bits " {d2-d3}, [%[out1]] \n\t" \
                                          "vst1." #bits " {d4-d5}, [%[out2]] \n\t" \
                                          "vst1." #bits " {d6-d7}, [%[out3]] \n\t" \
                                          : \
                                          : [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), [out3] "r" (dst3 + dj), \
                                            [in0] "r" (src + sj), [in1] "r" (src + sj + MUL4(8)/sizeof(sgn##bits)) \
                                          : "d0","d1","d2","d3","d4","d5","d6","d7" \
                                      );

// One 128-bit-per-plane step of the split loop (inline-asm flavour).
#define SPLIT_QUAD(sgn, bits, n) { \
                                     internal::prefetch(src + sj); \
                                     SPLIT_ASM##n(sgn, bits) \
                                 }

#else

// One 128-bit-per-plane step of the split loop (intrinsics flavour).
#define SPLIT_QUAD(sgn, bits, n) { \
                                     internal::prefetch(src + sj); \
                                     vec128 v_src = vld##n##q_##sgn##bits(src + sj); \
                                     FILL_LINES##n(VST1Q, sgn##bits) \
                                 }

#endif

// Generates: void splitN(size, srcBase, srcStride, dst0Base, dst0Stride, ...).
// De-interleaves an N-channel image into N planes: a 128-bit main pass, one
// optional 64-bit step, then a scalar tail.
// NOTE(review): the continuity test compares strides against size.width
// (element count) — confirm stride units match for the 16/32-bit instances.
#define SPLIT(sgn,bits,n) void split##n(const Size2D &_size, \
                                        const sgn##bits * srcBase, ptrdiff_t srcStride \
                                        FILL_LINES##n(FARG, sgn##bits) ) \
{ \
    internal::assertSupportedConfiguration(); \
    Size2D size(_size); \
    if (CONTDST##n \
        dst0Stride == (ptrdiff_t)(size.width)) \
    { \
        size.width *= size.height; \
        size.height = 1; \
    } \
    typedef internal::VecTraits<sgn##bits, n>::vec128 vec128; \
    size_t roiw16 = size.width >= (16/sizeof(sgn##bits)-1) ? size.width - (16/sizeof(sgn##bits)-1) : 0; \
    typedef internal::VecTraits<sgn##bits, n>::vec64 vec64; \
    size_t roiw8 = size.width >= (8/sizeof(sgn##bits)-1) ? size.width - (8/sizeof(sgn##bits)-1) : 0; \
    \
    for (size_t i = 0u; i < size.height; ++i) \
    { \
        const sgn##bits * src = internal::getRowPtr(srcBase, srcStride, i); \
        FILL_LINES##n(VROW, sgn##bits) \
        size_t sj = 0u, dj = 0u; \
        \
        for (; dj < roiw16; sj += MUL##n(16)/sizeof(sgn##bits), dj += 16/sizeof(sgn##bits)) \
            SPLIT_QUAD(sgn, bits, n) \
        \
        if (dj < roiw8) \
        { \
            vec64 v_src = vld##n##_##sgn##bits(src + sj); \
            FILL_LINES##n(VST1, sgn##bits) \
            sj += MUL##n(8)/sizeof(sgn##bits); \
            dj += 8/sizeof(sgn##bits); \
        } \
        \
        for (; dj < size.width; sj += n, ++dj) \
        { \
            FILL_LINES##n(SST, sgn##bits) \
        } \
    } \
}

// 64-bit-element variant: only 64-bit de-interleaving loads exist for these
// lanes, so each iteration handles exactly one N-channel pixel.
#define SPLIT64(sgn,n) void split##n(const Size2D &_size, \
                                     const sgn##64 * srcBase, ptrdiff_t srcStride \
                                     FILL_LINES##n(FARG, sgn##64) ) \
{ \
    internal::assertSupportedConfiguration(); \
    Size2D size(_size); \
    if (CONTDST##n \
        dst0Stride == (ptrdiff_t)(size.width)) \
    { \
        size.width *= size.height; \
        size.height = 1; \
    } \
    typedef internal::VecTraits<sgn##64, n>::vec64 vec64; \
    \
    for (size_t i = 0u; i < size.height; ++i) \
    { \
        const sgn##64 * src = internal::getRowPtr(srcBase, srcStride, i); \
        FILL_LINES##n(VROW, sgn##64) \
        size_t sj = 0u, dj = 0u; \
        \
        for (; dj < size.width; sj += n, ++dj) \
        { \
            vec64 v_src = vld##n##_##sgn##64(src + sj); \
            FILL_LINES##n(VST1, sgn##64) \
        } \
    } \
}

// Same old-GCC inline-asm workaround for the alpha-splitting kernel.
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)

// One step of split4: 4-channel load, 3-channel store + separate alpha store.
#define ALPHA_QUAD(sgn, bits) { \
                                  internal::prefetch(src + sj); \
                                  __asm__ ( \
                                      "vld4." #bits " {d0, d2, d4, d6}, [%[in0]] \n\t" \
                                      "vld4." #bits " {d1, d3, d5, d7}, [%[in1]] \n\t" \
                                      "vst3." #bits " {d0, d2, d4}, [%[out3_1]] \n\t" \
                                      "vst3." #bits " {d1, d3, d5}, [%[out3_2]] \n\t" \
                                      "vst1." #bits " {d6-d7}, [%[out1]] \n\t" \
                                      : \
                                      : [out3_1] "r" (dst3 + d3j), [out3_2] "r" (dst3 + d3j + 24/sizeof(sgn##bits)), [out1] "r" (dst1 + d1j), \
                                        [in0] "r" (src + sj), [in1] "r" (src + sj + 32/sizeof(sgn##bits)) \
                                      : "d0","d1","d2","d3","d4","d5","d6","d7" \
                                  ); \
                              }

#else

// Intrinsics flavour: the union reinterprets the first three lanes of the
// 4-plane load as a 3-plane value for the interleaved vst3 store.
#define ALPHA_QUAD(sgn, bits) { \
                                  internal::prefetch(src + sj); \
                                  union { vec128_4 v4; vec128_3 v3; } vals; \
                                  vals.v4 = vld4q_##sgn##bits(src + sj); \
                                  vst3q_##sgn##bits(dst3 + d3j, vals.v3); \
                                  vst1q_##sgn##bits(dst1 + d1j, vals.v4.val[3]); \
                              }

#endif

// Generates split4 specialized for RGBA -> RGB plane + alpha plane.
// NOTE(review): the 64-bit step hardcodes vst3_u8/vst1_u8 rather than
// vst3_##sgn##bits — harmless for the sole (u,8) instantiation below.
#define SPLIT4ALPHA(sgn,bits) void split4(const Size2D &_size, \
                                          const sgn##bits * srcBase, ptrdiff_t srcStride, \
                                          sgn##bits * dst3Base, ptrdiff_t dst3Stride, \
                                          sgn##bits * dst1Base, ptrdiff_t dst1Stride) \
{ \
    internal::assertSupportedConfiguration(); \
    Size2D size(_size); \
    if (srcStride == dst3Stride && \
        srcStride == dst1Stride && \
        srcStride == (ptrdiff_t)(size.width)) \
    { \
        size.width *= size.height; \
        size.height = 1; \
    } \
    typedef internal::VecTraits<sgn##bits, 4>::vec128 vec128_4; \
    typedef internal::VecTraits<sgn##bits, 3>::vec128 vec128_3; \
    size_t roiw16 = size.width >= (16/sizeof(sgn##bits)-1) ? size.width - (16/sizeof(sgn##bits)-1) : 0; \
    typedef internal::VecTraits<sgn##bits, 4>::vec64 vec64_4; \
    typedef internal::VecTraits<sgn##bits, 3>::vec64 vec64_3; \
    size_t roiw8 = size.width >= (8/sizeof(sgn##bits)-1) ? size.width - (8/sizeof(sgn##bits)-1) : 0; \
    \
    for (size_t i = 0u; i < size.height; ++i) \
    { \
        const sgn##bits * src = internal::getRowPtr(srcBase, srcStride, i); \
        sgn##bits * dst3 = internal::getRowPtr(dst3Base, dst3Stride, i); \
        sgn##bits * dst1 = internal::getRowPtr(dst1Base, dst1Stride, i); \
        size_t sj = 0u, d3j = 0u, d1j = 0u; \
        \
        for (; d1j < roiw16; sj += MUL4(16)/sizeof(sgn##bits), d3j += MUL3(16)/sizeof(sgn##bits), \
                                                            d1j += 16/sizeof(sgn##bits)) \
            ALPHA_QUAD(sgn, bits) \
        \
        if (d1j < roiw8) \
        { \
            union { vec64_4 v4; vec64_3 v3; } vals; \
            vals.v4 = vld4_##sgn##bits(src + sj); \
            vst3_u8(dst3 + d3j, vals.v3); \
            vst1_u8(dst1 + d1j, vals.v4.val[3]); \
            sj += MUL4(8)/sizeof(sgn##bits); \
            d3j += MUL3(8)/sizeof(sgn##bits); \
            d1j += 8/sizeof(sgn##bits); \
        } \
        \
        for (; d1j < size.width; sj += 4, d3j += 3, ++d1j) \
        { \
            dst3[d3j+0] = src[sj + 0]; \
            dst3[d3j+1] = src[sj + 1]; \
            dst3[d3j+2] = src[sj + 2]; \
            dst1[d1j] = src[sj + 3]; \
        } \
    } \
}

#else

// Non-NEON stubs: assertSupportedConfiguration() rejects the call; the
// (void) casts silence unused-parameter warnings.
#define VOID_LINE(type, n) (void)dst##n##Base; (void)dst##n##Stride;

#define SPLIT(sgn,bits,n) void split##n(const Size2D &size, \
                                        const sgn##bits * srcBase, ptrdiff_t srcStride \
                                        FILL_LINES##n(FARG, sgn##bits) ) \
{ \
    internal::assertSupportedConfiguration(); \
    (void)size; \
    (void)srcBase; \
    (void)srcStride; \
    FILL_LINES##n(VOID, sgn##bits) \
}

#define SPLIT64(sgn,n) SPLIT(sgn,64,n)

#define SPLIT4ALPHA(sgn,bits) void split4(const Size2D &size, \
                                          const sgn##bits * srcBase, ptrdiff_t srcStride, \
                                          sgn##bits * dst3Base, ptrdiff_t dst3Stride, \
                                          sgn##bits * dst1Base, ptrdiff_t dst1Stride) \
{ \
    internal::assertSupportedConfiguration(); \
    (void)size; \
    (void)srcBase; \
    (void)srcStride; \
    (void)dst3Base; \
    (void)dst3Stride; \
    (void)dst1Base; \
    (void)dst1Stride; \
}

#endif //CAROTENE_NEON

// Instantiate split2/split3/split4 for u8, u16 and s32 elements.
SPLIT(u, 8,2)
SPLIT(u, 8,3)
SPLIT(u, 8,4)
SPLIT(u,16,2)
SPLIT(u,16,3)
SPLIT(u,16,4)
SPLIT(s,32,2)
SPLIT(s,32,3)
SPLIT(s,32,4)

// 64-bit (s64) element variants.
SPLIT64(s, 2)
SPLIT64(s, 3)
SPLIT64(s, 4)

// RGBA -> RGB + alpha specialization for u8.
SPLIT4ALPHA(u,8)
485
486 } // namespace CAROTENE_NS
487