1 /*
2  * By downloading, copying, installing or using the software you agree to this license.
3  * If you do not agree to this license, do not download, install,
4  * copy or use the software.
5  *
6  *
7  *                           License Agreement
8  *                For Open Source Computer Vision Library
9  *                        (3-clause BSD License)
10  *
11  * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
12  * Third party copyrights are property of their respective owners.
13  *
14  * Redistribution and use in source and binary forms, with or without modification,
15  * are permitted provided that the following conditions are met:
16  *
17  *   * Redistributions of source code must retain the above copyright notice,
18  *     this list of conditions and the following disclaimer.
19  *
20  *   * Redistributions in binary form must reproduce the above copyright notice,
21  *     this list of conditions and the following disclaimer in the documentation
22  *     and/or other materials provided with the distribution.
23  *
24  *   * Neither the names of the copyright holders nor the names of the contributors
25  *     may be used to endorse or promote products derived from this software
26  *     without specific prior written permission.
27  *
28  * This software is provided by the copyright holders and contributors "as is" and
29  * any express or implied warranties, including, but not limited to, the implied
30  * warranties of merchantability and fitness for a particular purpose are disclaimed.
31  * In no event shall copyright holders or contributors be liable for any direct,
32  * indirect, incidental, special, exemplary, or consequential damages
33  * (including, but not limited to, procurement of substitute goods or services;
34  * loss of use, data, or profits; or business interruption) however caused
35  * and on any theory of liability, whether in contract, strict liability,
36  * or tort (including negligence or otherwise) arising in any way out of
37  * the use of this software, even if advised of the possibility of such damage.
38  */
39 
40 #include "common.hpp"
41 #include "vtransform.hpp"
42 
43 namespace CAROTENE_NS {
44 
/* FILL_LINESn(macro, type) expands macro##_LINE(type, k) for k = 0..n-1,
 * generating one statement (or argument fragment) per source plane. */
#define FILL_LINES2(macro,type) \
            macro##_LINE(type,0) \
            macro##_LINE(type,1)
#define FILL_LINES3(macro,type) \
            FILL_LINES2(macro,type) \
            macro##_LINE(type,2)
#define FILL_LINES4(macro,type) \
            FILL_LINES3(macro,type) \
            macro##_LINE(type,3)

/* Expands to a ", base pointer, stride" parameter pair for source plane n. */
#define  FARG_LINE(type, n) , const type * src##n##Base, ptrdiff_t src##n##Stride
56 
#ifdef CAROTENE_NEON

/* Per-plane statement generators, instantiated through FILL_LINESn:
 *   VROW  - compute the row pointer for plane n
 *   PREF  - prefetch the upcoming chunk of plane n
 *   VLD1Q - load a 128-bit vector from plane n into lane n of v_dst
 *   PRLD  - prefetch combined with the 128-bit load
 *   VLD1  - load a 64-bit vector from plane n into lane n of v_dst
 *   SLD   - scalar copy of one element from plane n into the interleaved dst */
#define  VROW_LINE(type, n) const type * src##n = internal::getRowPtr(src##n##Base, src##n##Stride, i);
#define  PREF_LINE(type, n) internal::prefetch(src##n + sj);
#define VLD1Q_LINE(type, n) v_dst.val[n] = vld1q_##type(src##n + sj);
#define  PRLD_LINE(type, n) internal::prefetch(src##n + sj); v_dst.val[n] = vld1q_##type(src##n + sj);
#define  VLD1_LINE(type, n) v_dst.val[n] = vld1_##type(src##n + sj);
#define   SLD_LINE(type, n) dst[dj + n] = src##n[sj];

/* Constant multiplies by the channel count, written as shifts/adds;
 * used to advance the interleaved destination index. */
#define MUL2(val) (val << 1)
#define MUL3(val) (MUL2(val) + val)
#define MUL4(val) (val << 2)

/* CONTSRCn expands to the stride-equality tests (note the trailing &&)
 * that allow an n-plane image to be processed as one contiguous row. */
#define CONTSRC2 dstStride == src0Stride && \
                 dstStride == src1Stride &&
#define CONTSRC3 dstStride == src0Stride && \
                 dstStride == src1Stride && \
                 dstStride == src2Stride &&
#define CONTSRC4 dstStride == src0Stride && \
                 dstStride == src1Stride && \
                 dstStride == src2Stride && \
                 dstStride == src3Stride &&
79 
/* On 32-bit GCC 4.x before 4.7 (and not clang) the interleaving-store
 * intrinsics are replaced with hand-written inline assembly: one 128-bit
 * vld1 per plane, then a vst2/vst3/vst4 interleaved store issued in two
 * halves (low d-registers to out0, high d-registers to out1). */
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)

#define MERGE_ASM2(sgn, bits) __asm__ ( \
                                          "vld1." #bits " {d0-d1}, [%[in0]]             \n\t" \
                                          "vld1." #bits " {d2-d3}, [%[in1]]             \n\t" \
                                          "vst2." #bits " {d0, d2}, [%[out0]]           \n\t" \
                                          "vst2." #bits " {d1, d3}, [%[out1]]           \n\t" \
                                          : \
                                          : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), \
                                            [out0]  "r" (dst + dj), [out1]  "r" (dst + dj + MUL2(8)/sizeof(sgn##bits)) \
                                          : "d0","d1","d2","d3" \
                                      );
#define MERGE_ASM3(sgn, bits) __asm__ ( \
                                          "vld1." #bits " {d0-d1}, [%[in0]]             \n\t" \
                                          "vld1." #bits " {d2-d3}, [%[in1]]             \n\t" \
                                          "vld1." #bits " {d4-d5}, [%[in2]]             \n\t" \
                                          "vst3." #bits " {d0, d2, d4}, [%[out0]]       \n\t" \
                                          "vst3." #bits " {d1, d3, d5}, [%[out1]]       \n\t" \
                                          : \
                                          : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), \
                                            [out0]  "r" (dst + dj), [out1]  "r" (dst + dj + MUL3(8)/sizeof(sgn##bits)) \
                                          : "d0","d1","d2","d3","d4","d5" \
                                      );
#define MERGE_ASM4(sgn, bits) __asm__ ( \
                                          "vld1." #bits " {d0-d1}, [%[in0]]             \n\t" \
                                          "vld1." #bits " {d2-d3}, [%[in1]]             \n\t" \
                                          "vld1." #bits " {d4-d5}, [%[in2]]             \n\t" \
                                          "vld1." #bits " {d6-d7}, [%[in3]]             \n\t" \
                                          "vst4." #bits " {d0, d2, d4, d6}, [%[out0]]   \n\t" \
                                          "vst4." #bits " {d1, d3, d5, d7}, [%[out1]]   \n\t" \
                                          : \
                                          : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), [in3] "r" (src3 + sj), \
                                            [out0]  "r" (dst + dj), [out1]  "r" (dst + dj + MUL4(8)/sizeof(sgn##bits)) \
                                          : "d0","d1","d2","d3","d4","d5","d6","d7" \
                                      );

/* Inline-asm variant: prefetch each plane, then interleave with MERGE_ASMn. */
#define MERGE_QUAD(sgn, bits, n) { \
                                     FILL_LINES##n(PREF, sgn##bits) \
                                     MERGE_ASM##n(sgn, bits) \
                                 }

#else

/* Intrinsic variant: prefetch and load one 128-bit vector per plane into
 * the multi-vector v_dst, then store them interleaved with a single vstNq. */
#define MERGE_QUAD(sgn, bits, n) { \
                                     vec128 v_dst; \
                                     /*FILL_LINES##n(PREF, sgn##bits) \
                                     FILL_LINES##n(VLD1Q, sgn##bits)*/ \
                                     FILL_LINES##n(PRLD, sgn##bits) \
                                     vst##n##q_##sgn##bits(dst + dj, v_dst); \
                                 }

#endif
132 
/* COMBINE(sgn, bits, n) instantiates combine<n>(): interleave n source
 * planes of element type sgn##bits into one n-channel destination.
 * When all strides match the row width, the rows are flattened into a
 * single logical row; each row is then processed in 128-bit vector
 * chunks, at most one 64-bit chunk, and a scalar tail. */
#define COMBINE(sgn,bits,n) void combine##n(const Size2D &_size                                             \
                                        FILL_LINES##n(FARG, sgn##bits),                                     \
                                        sgn##bits * dstBase, ptrdiff_t dstStride)                           \
{                                                                                                           \
    internal::assertSupportedConfiguration();                                                               \
    Size2D size(_size);                                                                                     \
    if (CONTSRC##n                                                                                          \
        dstStride == (ptrdiff_t)(size.width))                                                               \
    {                                                                                                       \
        size.width *= size.height;                                                                          \
        size.height = 1;                                                                                    \
    }                                                                                                       \
    typedef internal::VecTraits<sgn##bits, n>::vec128 vec128;                                               \
    size_t roiw16 = size.width >= (16/sizeof(sgn##bits) - 1) ? size.width - (16/sizeof(sgn##bits) - 1) : 0; \
    typedef internal::VecTraits<sgn##bits, n>::vec64 vec64;                                                 \
    size_t roiw8 = size.width >= (8/sizeof(sgn##bits) - 1) ? size.width - (8/sizeof(sgn##bits) - 1) : 0;    \
                                                                                                            \
    for (size_t i = 0u; i < size.height; ++i)                                                               \
    {                                                                                                       \
        FILL_LINES##n(VROW, sgn##bits)                                                                      \
        sgn##bits * dst = internal::getRowPtr(dstBase, dstStride, i);                                       \
        size_t sj = 0u, dj = 0u;                                                                            \
                                                                                                            \
        for (; sj < roiw16; sj += 16/sizeof(sgn##bits), dj += MUL##n(16)/sizeof(sgn##bits))                 \
            MERGE_QUAD(sgn, bits, n)                                                                        \
                                                                                                            \
        if ( sj < roiw8 )                                                                                   \
        {                                                                                                   \
            vec64 v_dst;                                                                                    \
            FILL_LINES##n(VLD1, sgn##bits)                                                                  \
            vst##n##_##sgn##bits(dst + dj, v_dst);                                                          \
            sj += 8/sizeof(sgn##bits); dj += MUL##n(8)/sizeof(sgn##bits);                                   \
        }                                                                                                   \
                                                                                                            \
        for (; sj < size.width; ++sj, dj += n)                                                              \
        {                                                                                                   \
            FILL_LINES##n(SLD, sgn##bits)                                                                   \
        }                                                                                                   \
    }                                                                                                       \
}
173 
/* COMBINE64(sgn, n) instantiates combine<n>() for 64-bit elements.
 * Only 64-bit vectors are used (one element per vector), so every
 * element is handled by a single vld1-per-plane / vstN pair. */
#define COMBINE64(sgn,n) void combine##n(const Size2D &_size                                                \
                                               FILL_LINES##n(FARG, sgn##64),                                \
                                               sgn##64 * dstBase, ptrdiff_t dstStride)                      \
{                                                                                                           \
    internal::assertSupportedConfiguration();                                                               \
    Size2D size(_size);                                                                                     \
    if (CONTSRC##n                                                                                          \
        dstStride == (ptrdiff_t)(size.width))                                                               \
    {                                                                                                       \
        size.width *= size.height;                                                                          \
        size.height = 1;                                                                                    \
    }                                                                                                       \
    typedef internal::VecTraits<sgn##64, n>::vec64 vec64;                                                   \
                                                                                                            \
    for (size_t i = 0u; i < size.height; ++i)                                                               \
    {                                                                                                       \
        FILL_LINES##n(VROW, sgn##64)                                                                        \
        sgn##64 * dst = internal::getRowPtr(dstBase, dstStride, i);                                         \
        size_t sj = 0u, dj = 0u;                                                                            \
                                                                                                            \
        for (; sj < size.width; ++sj, dj += n)                                                              \
        {                                                                                                   \
            vec64 v_dst;                                                                                    \
            FILL_LINES##n(VLD1, sgn##64)                                                                    \
            vst##n##_##sgn##64(dst + dj, v_dst);                                                            \
            /*FILL_LINES##n(SLD, sgn##64)*/                                                                 \
        }                                                                                                   \
    }                                                                                                       \
}
203 
#else

/* Non-NEON fallback: assertSupportedConfiguration() reports the
 * unsupported build, and the arguments are consumed to silence
 * unused-parameter warnings. */
#define  VOID_LINE(type, n) (void)src##n##Base; (void)src##n##Stride;

#define COMBINE(sgn,bits,n) void combine##n(const Size2D &size                                              \
                                        FILL_LINES##n(FARG, sgn##bits),                                     \
                                        sgn##bits * dstBase, ptrdiff_t dstStride)                           \
{                                                                                                           \
    internal::assertSupportedConfiguration();                                                               \
    (void)size;                                                                                             \
    FILL_LINES##n(VOID, sgn##bits)                                                                          \
    (void)dstBase;                                                                                          \
    (void)dstStride;                                                                                        \
}
#define COMBINE64(sgn,n) COMBINE(sgn,64,n)

#endif //CAROTENE_NEON
221 
// Instantiate combine2/combine3/combine4 for u8, u16, s32 and s64 elements.
COMBINE(u, 8,2)
COMBINE(u, 8,3)
COMBINE(u, 8,4)
COMBINE(u,16,2)
COMBINE(u,16,3)
COMBINE(u,16,4)
COMBINE(s,32,2)
COMBINE(s,32,3)
COMBINE(s,32,4)
COMBINE64(s, 2)
COMBINE64(s, 3)
COMBINE64(s, 4)
234 
combineYUYV(const Size2D & size,const u8 * srcyBase,ptrdiff_t srcyStride,const u8 * srcuBase,ptrdiff_t srcuStride,const u8 * srcvBase,ptrdiff_t srcvStride,u8 * dstBase,ptrdiff_t dstStride)235 void combineYUYV(const Size2D &size,
236                  const u8 * srcyBase, ptrdiff_t srcyStride,
237                  const u8 * srcuBase, ptrdiff_t srcuStride,
238                  const u8 * srcvBase, ptrdiff_t srcvStride,
239                  u8 * dstBase, ptrdiff_t dstStride)
240 {
241     internal::assertSupportedConfiguration();
242 #ifdef CAROTENE_NEON
243 #ifndef __ANDROID__
244     size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
245 #endif
246     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
247 
248     for (size_t i = 0u; i < size.height; i += 1)
249     {
250         const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i);
251         const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i);
252         const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i);
253         u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
254         size_t syj = 0u, sj = 0u, dj = 0u;
255 
256 #ifndef __ANDROID__
257         for (; sj < roiw32; sj += 32, syj += 64, dj += 128)
258         {
259             internal::prefetch(srcy + syj);
260             internal::prefetch(srcu + sj);
261             internal::prefetch(srcv + sj);
262 
263             uint8x16x2_t v_y = vld2q_u8(srcy + syj);
264             uint8x16x4_t v_dst;
265             v_dst.val[0] = v_y.val[0];
266             v_dst.val[1] = vld1q_u8(srcu + sj);
267             v_dst.val[2] = v_y.val[1];
268             v_dst.val[3] = vld1q_u8(srcv + sj);
269             vst4q_u8(dst + dj, v_dst);
270 
271             v_y = vld2q_u8(srcy + syj + 32);
272             v_dst.val[0] = v_y.val[0];
273             v_dst.val[1] = vld1q_u8(srcu + sj + 16);
274             v_dst.val[2] = v_y.val[1];
275             v_dst.val[3] = vld1q_u8(srcv + sj + 16);
276             vst4q_u8(dst + dj + 64, v_dst);
277         }
278 #endif
279 
280         for (; sj < roiw8; sj += 8, syj += 16, dj += 32)
281         {
282             uint8x8x2_t v_y = vld2_u8(srcy + syj);
283             uint8x8x4_t v_dst;
284             v_dst.val[0] = v_y.val[0];
285             v_dst.val[1] = vld1_u8(srcu + sj);
286             v_dst.val[2] = v_y.val[1];
287             v_dst.val[3] = vld1_u8(srcv + sj);
288             vst4_u8(dst + dj, v_dst);
289         }
290 
291         for (; sj < size.width; ++sj, syj += 2, dj += 4)
292         {
293             dst[dj] = srcy[syj];
294             dst[dj + 1] = srcu[sj];
295             dst[dj + 2] = srcy[syj + 1];
296             dst[dj + 3] = srcv[sj];
297         }
298     }
299 #else
300     (void)size;
301     (void)srcyBase;
302     (void)srcyStride;
303     (void)srcuBase;
304     (void)srcuStride;
305     (void)srcvBase;
306     (void)srcvStride;
307     (void)dstBase;
308     (void)dstStride;
309 #endif
310 }
311 
combineUYVY(const Size2D & size,const u8 * srcyBase,ptrdiff_t srcyStride,const u8 * srcuBase,ptrdiff_t srcuStride,const u8 * srcvBase,ptrdiff_t srcvStride,u8 * dstBase,ptrdiff_t dstStride)312 void combineUYVY(const Size2D &size,
313                  const u8 * srcyBase, ptrdiff_t srcyStride,
314                  const u8 * srcuBase, ptrdiff_t srcuStride,
315                  const u8 * srcvBase, ptrdiff_t srcvStride,
316                  u8 * dstBase, ptrdiff_t dstStride)
317 {
318     internal::assertSupportedConfiguration();
319 #ifdef CAROTENE_NEON
320 #ifndef __ANDROID__
321     size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
322 #endif
323     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
324 
325     for (size_t i = 0u; i < size.height; ++i)
326     {
327         const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i);
328         const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i);
329         const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i);
330         u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
331         size_t syj = 0u, sj = 0u, dj = 0u;
332 
333 #ifndef __ANDROID__
334         for (; sj < roiw32; sj += 32, syj += 64, dj += 128)
335         {
336             internal::prefetch(srcy + syj);
337             internal::prefetch(srcu + sj);
338             internal::prefetch(srcv + sj);
339 
340             uint8x16x2_t v_y = vld2q_u8(srcy + syj);
341             uint8x16x4_t v_dst;
342             v_dst.val[0] = vld1q_u8(srcu + sj);
343             v_dst.val[1] = v_y.val[0];
344             v_dst.val[2] = vld1q_u8(srcv + sj);
345             v_dst.val[3] = v_y.val[1];
346             vst4q_u8(dst + dj, v_dst);
347 
348             v_y = vld2q_u8(srcy + syj + 32);
349             v_dst.val[0] = vld1q_u8(srcu + sj + 16);
350             v_dst.val[1] = v_y.val[0];
351             v_dst.val[2] = vld1q_u8(srcv + sj + 16);
352             v_dst.val[3] = v_y.val[1];
353             vst4q_u8(dst + dj + 64, v_dst);
354         }
355 #endif
356 
357         for (; sj < roiw8; sj += 8, syj += 16, dj += 32)
358         {
359             uint8x8x2_t v_y = vld2_u8(srcy + syj);
360             uint8x8x4_t v_dst;
361             v_dst.val[0] = vld1_u8(srcu + sj);
362             v_dst.val[1] = v_y.val[0];
363             v_dst.val[2] = vld1_u8(srcv + sj);
364             v_dst.val[3] = v_y.val[1];
365             vst4_u8(dst + dj, v_dst);
366         }
367 
368         for (; sj < size.width; ++sj, syj += 2, dj += 4)
369         {
370             dst[dj] = srcu[sj];
371             dst[dj + 1] = srcy[syj];
372             dst[dj + 2] = srcv[sj];
373             dst[dj + 3] = srcy[syj + 1];
374         }
375     }
376 #else
377     (void)size;
378     (void)srcyBase;
379     (void)srcyStride;
380     (void)srcuBase;
381     (void)srcuStride;
382     (void)srcvBase;
383     (void)srcvStride;
384     (void)dstBase;
385     (void)dstStride;
386 #endif
387 }
388 
389 } // namespace CAROTENE_NS
390