1 /*
2 * By downloading, copying, installing or using the software you agree to this license.
3 * If you do not agree to this license, do not download, install,
4 * copy or use the software.
5 *
6 *
7 * License Agreement
8 * For Open Source Computer Vision Library
9 * (3-clause BSD License)
10 *
11 * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
12 * Third party copyrights are property of their respective owners.
13 *
14 * Redistribution and use in source and binary forms, with or without modification,
15 * are permitted provided that the following conditions are met:
16 *
17 * * Redistributions of source code must retain the above copyright notice,
18 * this list of conditions and the following disclaimer.
19 *
20 * * Redistributions in binary form must reproduce the above copyright notice,
21 * this list of conditions and the following disclaimer in the documentation
22 * and/or other materials provided with the distribution.
23 *
24 * * Neither the names of the copyright holders nor the names of the contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * This software is provided by the copyright holders and contributors "as is" and
29 * any express or implied warranties, including, but not limited to, the implied
30 * warranties of merchantability and fitness for a particular purpose are disclaimed.
31 * In no event shall copyright holders or contributors be liable for any direct,
32 * indirect, incidental, special, exemplary, or consequential damages
33 * (including, but not limited to, procurement of substitute goods or services;
34 * loss of use, data, or profits; or business interruption) however caused
35 * and on any theory of liability, whether in contract, strict liability,
36 * or tort (including negligence or otherwise) arising in any way out of
37 * the use of this software, even if advised of the possibility of such damage.
38 */
39
40 #include "common.hpp"
41 #include "vtransform.hpp"
42
43 namespace CAROTENE_NS {
44
// Repeat a per-channel macro ("macro##_LINE(type, idx)") once for each of the
// n interleaved source channels (n = 2, 3 or 4).
#define FILL_LINES2(macro,type) \
    macro##_LINE(type,0) \
    macro##_LINE(type,1)
#define FILL_LINES3(macro,type) \
    FILL_LINES2(macro,type) \
    macro##_LINE(type,2)
#define FILL_LINES4(macro,type) \
    FILL_LINES3(macro,type) \
    macro##_LINE(type,3)

// Expands to one "srcNBase / srcNStride" parameter pair in a function
// signature; used as FILL_LINESn(FARG, type) to build the full argument list.
#define FARG_LINE(type, n) , const type * src##n##Base, ptrdiff_t src##n##Stride
56
57 #ifdef CAROTENE_NEON
58
// Per-channel building blocks, each stamped out n times via FILL_LINESn:
//   VROW : fetch the row pointer for source plane n
//   PREF : software-prefetch ahead in plane n
//   VLD1Q: 128-bit load from plane n into lane n of the multi-vector
//   PRLD : prefetch plus 128-bit load combined
//   VLD1 : 64-bit load from plane n into lane n
//   SLD  : scalar tail copy of one element from plane n
#define VROW_LINE(type, n) const type * src##n = internal::getRowPtr(src##n##Base, src##n##Stride, i);
#define PREF_LINE(type, n) internal::prefetch(src##n + sj);
#define VLD1Q_LINE(type, n) v_dst.val[n] = vld1q_##type(src##n + sj);
#define PRLD_LINE(type, n) internal::prefetch(src##n + sj); v_dst.val[n] = vld1q_##type(src##n + sj);
#define VLD1_LINE(type, n) v_dst.val[n] = vld1_##type(src##n + sj);
#define SLD_LINE(type, n) dst[dj + n] = src##n[sj];

// Compile-time multiply by the channel count using shifts/adds only.
#define MUL2(val) (val << 1)
#define MUL3(val) (MUL2(val) + val)
#define MUL4(val) (val << 2)

// Continuity predicates: true when all n source strides equal the destination
// stride, so the 2-D ROI can be collapsed into a single long row.  Note each
// expansion deliberately ends with a trailing "&&" completed by the caller.
#define CONTSRC2 dstStride == src0Stride && \
                 dstStride == src1Stride &&
#define CONTSRC3 dstStride == src0Stride && \
                 dstStride == src1Stride && \
                 dstStride == src2Stride &&
#define CONTSRC4 dstStride == src0Stride && \
                 dstStride == src1Stride && \
                 dstStride == src2Stride && \
                 dstStride == src3Stride &&
79
// On 32-bit GCC 4.x before 4.7 (and not clang) the 16-element merge step is
// done with hand-written inline assembly; every other toolchain uses the vstN
// intrinsics below.  NOTE(review): the asm path presumably works around weak
// intrinsic codegen in those old compilers -- kept verbatim, do not "clean up".
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)

// Each MERGE_ASMn loads one q register (a d-register pair) per plane, then
// interleaves with two vstN stores: the even d registers hold the low 8
// elements of each plane, the odd d registers the high 8, and out1 points
// MULn(8)/sizeof(element) elements past out0.
#define MERGE_ASM2(sgn, bits) __asm__ ( \
                                          "vld1." #bits " {d0-d1}, [%[in0]]            \n\t" \
                                          "vld1." #bits " {d2-d3}, [%[in1]]            \n\t" \
                                          "vst2." #bits " {d0, d2}, [%[out0]]          \n\t" \
                                          "vst2." #bits " {d1, d3}, [%[out1]]          \n\t" \
                                          : \
                                          : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), \
                                            [out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL2(8)/sizeof(sgn##bits)) \
                                          : "d0","d1","d2","d3" \
                                      );
#define MERGE_ASM3(sgn, bits) __asm__ ( \
                                          "vld1." #bits " {d0-d1}, [%[in0]]            \n\t" \
                                          "vld1." #bits " {d2-d3}, [%[in1]]            \n\t" \
                                          "vld1." #bits " {d4-d5}, [%[in2]]            \n\t" \
                                          "vst3." #bits " {d0, d2, d4}, [%[out0]]      \n\t" \
                                          "vst3." #bits " {d1, d3, d5}, [%[out1]]      \n\t" \
                                          : \
                                          : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), \
                                            [out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL3(8)/sizeof(sgn##bits)) \
                                          : "d0","d1","d2","d3","d4","d5" \
                                      );
#define MERGE_ASM4(sgn, bits) __asm__ ( \
                                          "vld1." #bits " {d0-d1}, [%[in0]]            \n\t" \
                                          "vld1." #bits " {d2-d3}, [%[in1]]            \n\t" \
                                          "vld1." #bits " {d4-d5}, [%[in2]]            \n\t" \
                                          "vld1." #bits " {d6-d7}, [%[in3]]            \n\t" \
                                          "vst4." #bits " {d0, d2, d4, d6}, [%[out0]]  \n\t" \
                                          "vst4." #bits " {d1, d3, d5, d7}, [%[out1]]  \n\t" \
                                          : \
                                          : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), [in3] "r" (src3 + sj), \
                                            [out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL4(8)/sizeof(sgn##bits)) \
                                          : "d0","d1","d2","d3","d4","d5","d6","d7" \
                                      );

// Asm flavour: prefetch each plane, then hand off to the MERGE_ASMn above.
#define MERGE_QUAD(sgn, bits, n) { \
                                     FILL_LINES##n(PREF, sgn##bits) \
                                     MERGE_ASM##n(sgn, bits) \
                                 }

#else

// Intrinsic flavour: prefetch + load each plane into one lane of the
// multi-vector, then store the whole group interleaved with vstNq.
#define MERGE_QUAD(sgn, bits, n) { \
                                     vec128 v_dst; \
                                     /*FILL_LINES##n(PREF, sgn##bits) \
                                     FILL_LINES##n(VLD1Q, sgn##bits)*/ \
                                     FILL_LINES##n(PRLD, sgn##bits) \
                                     vst##n##q_##sgn##bits(dst + dj, v_dst); \
                                 }

#endif
132
// COMBINE(sgn,bits,n) emits combine<n>() for element type sgn##bits:
// interleave n planar sources into one n-channel destination.  Each row is
// processed in three stages: 16-byte vector steps (MERGE_QUAD), one optional
// 8-byte vector step, then a scalar tail.
#define COMBINE(sgn,bits,n) void combine##n(const Size2D &_size                                             \
                                            FILL_LINES##n(FARG, sgn##bits),                                 \
                                            sgn##bits * dstBase, ptrdiff_t dstStride)                       \
{                                                                                                           \
    internal::assertSupportedConfiguration();                                                               \
    Size2D size(_size);                                                                                     \
    if (CONTSRC##n                                                                                          \
        dstStride == (ptrdiff_t)(size.width))                                                               \
    {                                                                                                       \
        /* all rows are contiguous: fold the ROI into a single long row */                                  \
        size.width *= size.height;                                                                          \
        size.height = 1;                                                                                    \
    }                                                                                                       \
    typedef internal::VecTraits<sgn##bits, n>::vec128 vec128;                                               \
    size_t roiw16 = size.width >= (16/sizeof(sgn##bits) - 1) ? size.width - (16/sizeof(sgn##bits) - 1) : 0; \
    typedef internal::VecTraits<sgn##bits, n>::vec64 vec64;                                                 \
    size_t roiw8 = size.width >= (8/sizeof(sgn##bits) - 1) ? size.width - (8/sizeof(sgn##bits) - 1) : 0;    \
                                                                                                            \
    for (size_t i = 0u; i < size.height; ++i)                                                               \
    {                                                                                                       \
        FILL_LINES##n(VROW, sgn##bits)                                                                      \
        sgn##bits * dst = internal::getRowPtr(dstBase, dstStride, i);                                       \
        size_t sj = 0u, dj = 0u;                                                                            \
                                                                                                            \
        for (; sj < roiw16; sj += 16/sizeof(sgn##bits), dj += MUL##n(16)/sizeof(sgn##bits))                 \
            MERGE_QUAD(sgn, bits, n)                                                                        \
                                                                                                            \
        if ( sj < roiw8 )                                                                                   \
        {                                                                                                   \
            vec64 v_dst;                                                                                    \
            FILL_LINES##n(VLD1, sgn##bits)                                                                  \
            vst##n##_##sgn##bits(dst + dj, v_dst);                                                          \
            sj += 8/sizeof(sgn##bits); dj += MUL##n(8)/sizeof(sgn##bits);                                   \
        }                                                                                                   \
                                                                                                            \
        for (; sj < size.width; ++sj, dj += n)                                                              \
        {                                                                                                   \
            FILL_LINES##n(SLD, sgn##bits)                                                                   \
        }                                                                                                   \
    }                                                                                                       \
}
173
// COMBINE64(sgn,n) emits combine<n>() for 64-bit elements: one element fills
// a whole d register, so every iteration loads n elements (one per plane)
// into vec64 lanes and stores them interleaved with vstn -- no wide/vector
// main loop is needed.
#define COMBINE64(sgn,n) void combine##n(const Size2D &_size                     \
                                         FILL_LINES##n(FARG, sgn##64),           \
                                         sgn##64 * dstBase, ptrdiff_t dstStride) \
{                                                                                \
    internal::assertSupportedConfiguration();                                    \
    Size2D size(_size);                                                          \
    if (CONTSRC##n                                                               \
        dstStride == (ptrdiff_t)(size.width))                                    \
    {                                                                            \
        /* contiguous rows: treat the ROI as a single long row */                \
        size.width *= size.height;                                               \
        size.height = 1;                                                         \
    }                                                                            \
    typedef internal::VecTraits<sgn##64, n>::vec64 vec64;                        \
                                                                                 \
    for (size_t i = 0u; i < size.height; ++i)                                    \
    {                                                                            \
        FILL_LINES##n(VROW, sgn##64)                                             \
        sgn##64 * dst = internal::getRowPtr(dstBase, dstStride, i);              \
        size_t sj = 0u, dj = 0u;                                                 \
                                                                                 \
        for (; sj < size.width; ++sj, dj += n)                                   \
        {                                                                        \
            vec64 v_dst;                                                         \
            FILL_LINES##n(VLD1, sgn##64)                                         \
            vst##n##_##sgn##64(dst + dj, v_dst);                                 \
            /*FILL_LINES##n(SLD, sgn##64)*/                                      \
        }                                                                        \
    }                                                                            \
}
203
204 #else
205
// NEON-less build: each combine<n> becomes a stub that consumes its arguments
// (silencing unused-parameter warnings) after assertSupportedConfiguration()
// reports the unsupported configuration.
#define VOID_LINE(type, n) (void)src##n##Base; (void)src##n##Stride;

#define COMBINE(sgn,bits,n) void combine##n(const Size2D &size                   \
                                            FILL_LINES##n(FARG, sgn##bits),      \
                                            sgn##bits * dstBase, ptrdiff_t dstStride) \
{                                                                                \
    internal::assertSupportedConfiguration();                                    \
    (void)size;                                                                  \
    FILL_LINES##n(VOID, sgn##bits)                                               \
    (void)dstBase;                                                               \
    (void)dstStride;                                                             \
}
#define COMBINE64(sgn,n) COMBINE(sgn,64,n)
219
220 #endif //CAROTENE_NEON
221
// Public instantiations: combine2/3/4 for u8, u16, s32 and s64 elements
// (NEON interleave intrinsics are sign-agnostic, so one signedness per width
// covers both).
COMBINE(u, 8,2)
COMBINE(u, 8,3)
COMBINE(u, 8,4)
COMBINE(u,16,2)
COMBINE(u,16,3)
COMBINE(u,16,4)
COMBINE(s,32,2)
COMBINE(s,32,3)
COMBINE(s,32,4)
COMBINE64(s, 2)
COMBINE64(s, 3)
COMBINE64(s, 4)
234
combineYUYV(const Size2D & size,const u8 * srcyBase,ptrdiff_t srcyStride,const u8 * srcuBase,ptrdiff_t srcuStride,const u8 * srcvBase,ptrdiff_t srcvStride,u8 * dstBase,ptrdiff_t dstStride)235 void combineYUYV(const Size2D &size,
236 const u8 * srcyBase, ptrdiff_t srcyStride,
237 const u8 * srcuBase, ptrdiff_t srcuStride,
238 const u8 * srcvBase, ptrdiff_t srcvStride,
239 u8 * dstBase, ptrdiff_t dstStride)
240 {
241 internal::assertSupportedConfiguration();
242 #ifdef CAROTENE_NEON
243 #ifndef __ANDROID__
244 size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
245 #endif
246 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
247
248 for (size_t i = 0u; i < size.height; i += 1)
249 {
250 const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i);
251 const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i);
252 const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i);
253 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
254 size_t syj = 0u, sj = 0u, dj = 0u;
255
256 #ifndef __ANDROID__
257 for (; sj < roiw32; sj += 32, syj += 64, dj += 128)
258 {
259 internal::prefetch(srcy + syj);
260 internal::prefetch(srcu + sj);
261 internal::prefetch(srcv + sj);
262
263 uint8x16x2_t v_y = vld2q_u8(srcy + syj);
264 uint8x16x4_t v_dst;
265 v_dst.val[0] = v_y.val[0];
266 v_dst.val[1] = vld1q_u8(srcu + sj);
267 v_dst.val[2] = v_y.val[1];
268 v_dst.val[3] = vld1q_u8(srcv + sj);
269 vst4q_u8(dst + dj, v_dst);
270
271 v_y = vld2q_u8(srcy + syj + 32);
272 v_dst.val[0] = v_y.val[0];
273 v_dst.val[1] = vld1q_u8(srcu + sj + 16);
274 v_dst.val[2] = v_y.val[1];
275 v_dst.val[3] = vld1q_u8(srcv + sj + 16);
276 vst4q_u8(dst + dj + 64, v_dst);
277 }
278 #endif
279
280 for (; sj < roiw8; sj += 8, syj += 16, dj += 32)
281 {
282 uint8x8x2_t v_y = vld2_u8(srcy + syj);
283 uint8x8x4_t v_dst;
284 v_dst.val[0] = v_y.val[0];
285 v_dst.val[1] = vld1_u8(srcu + sj);
286 v_dst.val[2] = v_y.val[1];
287 v_dst.val[3] = vld1_u8(srcv + sj);
288 vst4_u8(dst + dj, v_dst);
289 }
290
291 for (; sj < size.width; ++sj, syj += 2, dj += 4)
292 {
293 dst[dj] = srcy[syj];
294 dst[dj + 1] = srcu[sj];
295 dst[dj + 2] = srcy[syj + 1];
296 dst[dj + 3] = srcv[sj];
297 }
298 }
299 #else
300 (void)size;
301 (void)srcyBase;
302 (void)srcyStride;
303 (void)srcuBase;
304 (void)srcuStride;
305 (void)srcvBase;
306 (void)srcvStride;
307 (void)dstBase;
308 (void)dstStride;
309 #endif
310 }
311
combineUYVY(const Size2D & size,const u8 * srcyBase,ptrdiff_t srcyStride,const u8 * srcuBase,ptrdiff_t srcuStride,const u8 * srcvBase,ptrdiff_t srcvStride,u8 * dstBase,ptrdiff_t dstStride)312 void combineUYVY(const Size2D &size,
313 const u8 * srcyBase, ptrdiff_t srcyStride,
314 const u8 * srcuBase, ptrdiff_t srcuStride,
315 const u8 * srcvBase, ptrdiff_t srcvStride,
316 u8 * dstBase, ptrdiff_t dstStride)
317 {
318 internal::assertSupportedConfiguration();
319 #ifdef CAROTENE_NEON
320 #ifndef __ANDROID__
321 size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
322 #endif
323 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
324
325 for (size_t i = 0u; i < size.height; ++i)
326 {
327 const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i);
328 const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i);
329 const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i);
330 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
331 size_t syj = 0u, sj = 0u, dj = 0u;
332
333 #ifndef __ANDROID__
334 for (; sj < roiw32; sj += 32, syj += 64, dj += 128)
335 {
336 internal::prefetch(srcy + syj);
337 internal::prefetch(srcu + sj);
338 internal::prefetch(srcv + sj);
339
340 uint8x16x2_t v_y = vld2q_u8(srcy + syj);
341 uint8x16x4_t v_dst;
342 v_dst.val[0] = vld1q_u8(srcu + sj);
343 v_dst.val[1] = v_y.val[0];
344 v_dst.val[2] = vld1q_u8(srcv + sj);
345 v_dst.val[3] = v_y.val[1];
346 vst4q_u8(dst + dj, v_dst);
347
348 v_y = vld2q_u8(srcy + syj + 32);
349 v_dst.val[0] = vld1q_u8(srcu + sj + 16);
350 v_dst.val[1] = v_y.val[0];
351 v_dst.val[2] = vld1q_u8(srcv + sj + 16);
352 v_dst.val[3] = v_y.val[1];
353 vst4q_u8(dst + dj + 64, v_dst);
354 }
355 #endif
356
357 for (; sj < roiw8; sj += 8, syj += 16, dj += 32)
358 {
359 uint8x8x2_t v_y = vld2_u8(srcy + syj);
360 uint8x8x4_t v_dst;
361 v_dst.val[0] = vld1_u8(srcu + sj);
362 v_dst.val[1] = v_y.val[0];
363 v_dst.val[2] = vld1_u8(srcv + sj);
364 v_dst.val[3] = v_y.val[1];
365 vst4_u8(dst + dj, v_dst);
366 }
367
368 for (; sj < size.width; ++sj, syj += 2, dj += 4)
369 {
370 dst[dj] = srcu[sj];
371 dst[dj + 1] = srcy[syj];
372 dst[dj + 2] = srcv[sj];
373 dst[dj + 3] = srcy[syj + 1];
374 }
375 }
376 #else
377 (void)size;
378 (void)srcyBase;
379 (void)srcyStride;
380 (void)srcuBase;
381 (void)srcuStride;
382 (void)srcvBase;
383 (void)srcvStride;
384 (void)dstBase;
385 (void)dstStride;
386 #endif
387 }
388
389 } // namespace CAROTENE_NS
390