1 /*
2  * By downloading, copying, installing or using the software you agree to this license.
3  * If you do not agree to this license, do not download, install,
4  * copy or use the software.
5  *
6  *
7  *                           License Agreement
8  *                For Open Source Computer Vision Library
9  *                        (3-clause BSD License)
10  *
11  * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
12  * Third party copyrights are property of their respective owners.
13  *
14  * Redistribution and use in source and binary forms, with or without modification,
15  * are permitted provided that the following conditions are met:
16  *
17  *   * Redistributions of source code must retain the above copyright notice,
18  *     this list of conditions and the following disclaimer.
19  *
20  *   * Redistributions in binary form must reproduce the above copyright notice,
21  *     this list of conditions and the following disclaimer in the documentation
22  *     and/or other materials provided with the distribution.
23  *
24  *   * Neither the names of the copyright holders nor the names of the contributors
25  *     may be used to endorse or promote products derived from this software
26  *     without specific prior written permission.
27  *
28  * This software is provided by the copyright holders and contributors "as is" and
29  * any express or implied warranties, including, but not limited to, the implied
30  * warranties of merchantability and fitness for a particular purpose are disclaimed.
31  * In no event shall copyright holders or contributors be liable for any direct,
32  * indirect, incidental, special, exemplary, or consequential damages
33  * (including, but not limited to, procurement of substitute goods or services;
34  * loss of use, data, or profits; or business interruption) however caused
35  * and on any theory of liability, whether in contract, strict liability,
36  * or tort (including negligence or otherwise) arising in any way out of
37  * the use of this software, even if advised of the possibility of such damage.
38  */
39 
40 #include "common.hpp"
41 #include "vtransform.hpp"
42 
43 namespace CAROTENE_NS {
44 
extract2(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,u8 * dstBase,ptrdiff_t dstStride,u32 coi)45 void extract2(const Size2D &size,
46               const u8 * srcBase, ptrdiff_t srcStride,
47               u8 * dstBase, ptrdiff_t dstStride,
48               u32 coi)
49 {
50     internal::assertSupportedConfiguration();
51 #ifdef CAROTENE_NEON
52 #ifndef __ANDROID__
53     size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
54 #endif
55     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
56 
57     for (size_t i = 0u; i < size.height; ++i)
58     {
59         const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
60         u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
61         size_t sj = 0u, dj = 0u;
62 
63 #ifndef __ANDROID__
64         for (; dj < roiw32; sj += 64, dj += 32)
65         {
66             internal::prefetch(src + sj);
67 
68             uint8x16x2_t v_src = vld2q_u8(src + sj);
69             vst1q_u8(dst + dj, v_src.val[coi]);
70 
71             v_src = vld2q_u8(src + sj + 32);
72             vst1q_u8(dst + dj + 16, v_src.val[coi]);
73         }
74 #endif
75 
76         for (; dj < roiw8; sj += 16, dj += 8)
77         {
78             uint8x8x2_t v_src = vld2_u8(src + sj);
79             vst1_u8(dst + dj, v_src.val[coi]);
80         }
81 
82         for (; dj < size.width; sj += 2, ++dj)
83         {
84             dst[dj] = src[sj + coi];
85         }
86     }
87 #else
88     (void)size;
89     (void)srcBase;
90     (void)srcStride;
91     (void)dstBase;
92     (void)dstStride;
93     (void)coi;
94 #endif
95 }
96 
extract3(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,u8 * dstBase,ptrdiff_t dstStride,u32 coi)97 void extract3(const Size2D &size,
98               const u8 * srcBase, ptrdiff_t srcStride,
99               u8 * dstBase, ptrdiff_t dstStride,
100               u32 coi)
101 {
102     internal::assertSupportedConfiguration();
103 #ifdef CAROTENE_NEON
104 #ifndef __ANDROID__
105     size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
106 #endif
107     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
108 
109     for (size_t i = 0u; i < size.height; ++i)
110     {
111         const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
112         u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
113         size_t sj = 0u, dj = 0u;
114 
115 #ifndef __ANDROID__
116         for (; dj < roiw32; sj += 96, dj += 32)
117         {
118             internal::prefetch(src + sj);
119 
120             uint8x16x3_t v_src = vld3q_u8(src + sj);
121             vst1q_u8(dst + dj, v_src.val[coi]);
122 
123             v_src = vld3q_u8(src + sj + 48);
124             vst1q_u8(dst + dj + 16, v_src.val[coi]);
125         }
126 #endif
127 
128         for (; dj < roiw8; sj += 24, dj += 8)
129         {
130             uint8x8x3_t v_src = vld3_u8(src + sj);
131             vst1_u8(dst + dj, v_src.val[coi]);
132         }
133 
134         for (; dj < size.width; sj += 3, ++dj)
135         {
136             dst[dj] = src[sj + coi];
137         }
138     }
139 #else
140     (void)size;
141     (void)srcBase;
142     (void)srcStride;
143     (void)dstBase;
144     (void)dstStride;
145     (void)coi;
146 #endif
147 }
148 
extract4(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,u8 * dstBase,ptrdiff_t dstStride,u32 coi)149 void extract4(const Size2D &size,
150               const u8 * srcBase, ptrdiff_t srcStride,
151               u8 * dstBase, ptrdiff_t dstStride,
152               u32 coi)
153 {
154     internal::assertSupportedConfiguration();
155 #ifdef CAROTENE_NEON
156 #ifndef __ANDROID__
157     size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
158 #endif
159     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
160 
161     for (size_t i = 0u; i < size.height; ++i)
162     {
163         const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
164         u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
165         size_t sj = 0u, dj = 0u;
166 
167 #ifndef __ANDROID__
168         for (; dj < roiw32; sj += 128, dj += 32)
169         {
170             internal::prefetch(src + sj);
171 
172             uint8x16x4_t v_src = vld4q_u8(src + sj);
173             vst1q_u8(dst + dj, v_src.val[coi]);
174 
175             v_src = vld4q_u8(src + sj + 64);
176             vst1q_u8(dst + dj + 16, v_src.val[coi]);
177         }
178 #endif
179 
180         for (; dj < roiw8; sj += 32, dj += 8)
181         {
182             uint8x8x4_t v_src = vld4_u8(src + sj);
183             vst1_u8(dst + dj, v_src.val[coi]);
184         }
185 
186         for (; dj < size.width; sj += 4, ++dj)
187         {
188             dst[dj] = src[sj + coi];
189         }
190     }
191 #else
192     (void)size;
193     (void)srcBase;
194     (void)srcStride;
195     (void)dstBase;
196     (void)dstStride;
197     (void)coi;
198 #endif
199 }
200 
/*
 * FILL_LINES<k>(prefix, elem) stamps out prefix##_LINE(elem, i) for i = 0 .. k-1,
 * in ascending order and with no separator between the expansions.
 */
#define FILL_LINES2(prefix,elem) \
            prefix##_LINE(elem,0) prefix##_LINE(elem,1)
#define FILL_LINES3(prefix,elem) \
            prefix##_LINE(elem,0) prefix##_LINE(elem,1) prefix##_LINE(elem,2)
#define FILL_LINES4(prefix,elem) \
            prefix##_LINE(elem,0) prefix##_LINE(elem,1) prefix##_LINE(elem,2) prefix##_LINE(elem,3)

/* Appends ", elem * dst<idx>Base, ptrdiff_t dst<idx>Stride" to a parameter list. */
#define FARG_LINE(elem, idx) , elem * dst##idx##Base, ptrdiff_t dst##idx##Stride
212 
213 #ifdef CAROTENE_NEON
214 
/*
 * Per-plane statements stamped out through FILL_LINES<n>. They rely on names that
 * must exist at the expansion site: i, sj, dj, src, v_src and dst<k>Base / dst<k>Stride.
 *   VROW_LINE  - declares dst<k> as the row pointer of plane k for row i
 *   VST1Q_LINE - stores 128-bit vector v_src.val[k] at dst<k> + dj
 *   VST1_LINE  - stores 64-bit vector v_src.val[k] at dst<k> + dj
 *   SST_LINE   - scalar copy of element k of the current source pixel
 */
#define VROW_LINE(T, k)  T * dst##k = internal::getRowPtr(dst##k##Base, dst##k##Stride, i);
#define VST1Q_LINE(T, k) vst1q_##T(&dst##k[dj], v_src.val[k]);
#define VST1_LINE(T, k)  vst1_##T(&dst##k[dj], v_src.val[k]);
#define SST_LINE(T, k)   dst##k[dj] = *(src + sj + k);
219 
/*
 * Multiply by the channel count (2, 3 or 4) using shifts and adds.
 * The argument is parenthesised so that an expression argument is scaled as a
 * whole, e.g. MUL2(a | b) yields (a | b) * 2 rather than a | (b * 2).
 */
#define MUL2(val) ((val) << 1)
#define MUL3(val) (MUL2(val) + (val))
#define MUL4(val) ((val) << 2)
223 
/*
 * CONTDST<n> expands to "dst<k>Stride == srcStride &&" for k = 0 .. n-1.
 * Every expansion ends with a dangling "&&", so exactly one more condition has
 * to follow it at the point of use.
 * NOTE(review): this compares the stride of the interleaved source with the
 * strides of the single-channel planes; a packed n-channel source row is n times
 * longer than a packed plane row - confirm that equality is the intended test.
 */
#define CONTDST2 dst0Stride == srcStride && \
                 dst1Stride == srcStride &&
#define CONTDST3 CONTDST2 \
                 dst2Stride == srcStride &&
#define CONTDST4 CONTDST3 \
                 dst3Stride == srcStride &&
233 
234 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
235 
/*
 * Hand-written ARM32 NEON sequences for old GCC (4.x before 4.7).
 * SPLIT_ASM<n> de-interleaves two consecutive 64-bit groups of n-channel pixels
 * (in0 = src + sj, in1 = the following group) into d-register pairs and stores
 * each pair as one 128-bit vector to dst<k> + dj.
 *
 * The statements read and write memory only through pointer operands, which the
 * compiler cannot see into; the "memory" clobber tells it that memory is accessed
 * so it neither caches values across the asm nor reorders memory accesses around it.
 */
#define SPLIT_ASM2(sgn, bits) __asm__ ( \
                                          "vld2." #bits " {d0, d2}, [%[in0]]            \n\t" \
                                          "vld2." #bits " {d1, d3}, [%[in1]]            \n\t" \
                                          "vst1." #bits " {d0-d1}, [%[out0]]            \n\t" \
                                          "vst1." #bits " {d2-d3}, [%[out1]]            \n\t" \
                                          : \
                                          : [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), \
                                            [in0]  "r" (src + sj), [in1]  "r" (src + sj + MUL2(8)/sizeof(sgn##bits)) \
                                          : "d0","d1","d2","d3","memory" \
                                      );
#define SPLIT_ASM3(sgn, bits) __asm__ ( \
                                          "vld3." #bits " {d0, d2, d4}, [%[in0]]        \n\t" \
                                          "vld3." #bits " {d1, d3, d5}, [%[in1]]        \n\t" \
                                          "vst1." #bits " {d0-d1}, [%[out0]]            \n\t" \
                                          "vst1." #bits " {d2-d3}, [%[out1]]            \n\t" \
                                          "vst1." #bits " {d4-d5}, [%[out2]]            \n\t" \
                                          : \
                                          : [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), \
                                            [in0]  "r" (src + sj), [in1]  "r" (src + sj + MUL3(8)/sizeof(sgn##bits)) \
                                          : "d0","d1","d2","d3","d4","d5","memory" \
                                      );
#define SPLIT_ASM4(sgn, bits) __asm__ ( \
                                          "vld4." #bits " {d0, d2, d4, d6}, [%[in0]]    \n\t" \
                                          "vld4." #bits " {d1, d3, d5, d7}, [%[in1]]    \n\t" \
                                          "vst1." #bits " {d0-d1}, [%[out0]]            \n\t" \
                                          "vst1." #bits " {d2-d3}, [%[out1]]            \n\t" \
                                          "vst1." #bits " {d4-d5}, [%[out2]]            \n\t" \
                                          "vst1." #bits " {d6-d7}, [%[out3]]            \n\t" \
                                          : \
                                          : [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), [out3] "r" (dst3 + dj), \
                                            [in0]  "r" (src + sj), [in1]  "r" (src + sj + MUL4(8)/sizeof(sgn##bits)) \
                                          : "d0","d1","d2","d3","d4","d5","d6","d7","memory" \
                                      );

/* One 128-bit step of the split loop: prefetch the source, then run the asm sequence. */
#define SPLIT_QUAD(sgn, bits, n) { \
                                     internal::prefetch(src + sj); \
                                     SPLIT_ASM##n(sgn, bits) \
                                 }
274 
275 #else
276 
/*
 * One 128-bit step of the split loop, intrinsic flavour: prefetch the source,
 * de-interleave `count` channels with vld<count>q and store every resulting
 * vector to its plane through VST1Q_LINE (which expects the local to be named v_src).
 */
#define SPLIT_QUAD(sign, width, count) { \
                                     internal::prefetch(&src[sj]); \
                                     vec128 v_src = vld##count##q_##sign##width(&src[sj]); \
                                     FILL_LINES##count(VST1Q, sign##width) \
                                 }
282 
283 #endif
284 
/* One "&& dst<k>Stride == dstRowBytes" term per destination plane (stamped out by FILL_LINES<n>). */
#ifndef DSTPACKED_LINE
#define DSTPACKED_LINE(type, n) && dst##n##Stride == dstRowBytes
#endif

/*
 * SPLIT(sgn, bits, n) defines split<n>(): de-interleaves an n-channel image of
 * sgn##bits elements into n single-channel planes dst0 .. dst<n-1>.
 *
 *   _size                      - width and height of the image, in pixels
 *   srcBase, srcStride         - first row and row stride of the interleaved source
 *   dst<k>Base, dst<k>Stride   - first row and row stride of plane k
 *
 * Each row is processed with 128-bit steps, then at most one 64-bit step, then a
 * scalar tail.
 *
 * When no buffer has row padding the image is processed as one long row. A packed
 * plane row is width * sizeof(element) and a packed source row is n times that.
 * NOTE(review): this assumes strides are expressed in bytes - confirm against
 * internal::getRowPtr.
 */
#define SPLIT(sgn,bits,n) void split##n(const Size2D &_size,                                            \
                                    const sgn##bits * srcBase, ptrdiff_t srcStride                      \
                                    FILL_LINES##n(FARG, sgn##bits) )                                    \
{                                                                                                       \
    internal::assertSupportedConfiguration();                                                           \
    Size2D size(_size);                                                                                 \
    /* Size of one tightly packed plane row, taken before the width is changed below. */                \
    const ptrdiff_t dstRowBytes = (ptrdiff_t)(size.width * sizeof(sgn##bits));                          \
    if (srcStride == (ptrdiff_t)(n) * dstRowBytes                                                       \
        FILL_LINES##n(DSTPACKED, sgn##bits))                                                            \
    {                                                                                                   \
        /* Rows follow each other without gaps in every buffer: flatten to a single row. */             \
        size.width *= size.height;                                                                      \
        size.height = 1;                                                                                \
    }                                                                                                   \
    typedef internal::VecTraits<sgn##bits, n>::vec128 vec128;                                           \
    /* Positions below roiw16 / roiw8 still have a full 128-bit / 64-bit group left. */                 \
    size_t roiw16 = size.width >= (16/sizeof(sgn##bits)-1) ? size.width - (16/sizeof(sgn##bits)-1) : 0; \
    typedef internal::VecTraits<sgn##bits, n>::vec64 vec64;                                             \
    size_t roiw8 = size.width >= (8/sizeof(sgn##bits)-1) ? size.width - (8/sizeof(sgn##bits)-1) : 0;    \
                                                                                                        \
    for (size_t i = 0u; i < size.height; ++i)                                                           \
    {                                                                                                   \
        const sgn##bits * src = internal::getRowPtr(srcBase, srcStride, i);                             \
        FILL_LINES##n(VROW, sgn##bits)                                                                  \
        size_t sj = 0u, dj = 0u;                                                                        \
                                                                                                        \
        for (; dj < roiw16; sj += MUL##n(16)/sizeof(sgn##bits), dj += 16/sizeof(sgn##bits))             \
            SPLIT_QUAD(sgn, bits, n)                                                                    \
                                                                                                        \
        /* Fewer than one 128-bit group remains, so a single 64-bit step is enough. */                  \
        if (dj < roiw8)                                                                                 \
        {                                                                                               \
            vec64 v_src = vld##n##_##sgn##bits(src + sj);                                               \
            FILL_LINES##n(VST1, sgn##bits)                                                              \
            sj += MUL##n(8)/sizeof(sgn##bits);                                                          \
            dj += 8/sizeof(sgn##bits);                                                                  \
        }                                                                                               \
                                                                                                        \
        for (; dj < size.width; sj += n, ++dj)                                                          \
        {                                                                                               \
            FILL_LINES##n(SST, sgn##bits)                                                               \
        }                                                                                               \
    }                                                                                                   \
}
325 
/* One "&& dst<k>Stride == dstRowBytes" term per destination plane (stamped out by FILL_LINES<n>). */
#ifndef DSTPACKED_LINE
#define DSTPACKED_LINE(type, n) && dst##n##Stride == dstRowBytes
#endif

/*
 * SPLIT64(sgn, n) defines split<n>() for 64-bit elements (sgn##64): de-interleaves
 * an n-channel image into n single-channel planes, one pixel per iteration
 * (a 64-bit vector holds exactly one element).
 *
 *   _size                      - width and height of the image, in pixels
 *   srcBase, srcStride         - first row and row stride of the interleaved source
 *   dst<k>Base, dst<k>Stride   - first row and row stride of plane k
 *
 * When no buffer has row padding the image is processed as one long row. A packed
 * plane row is width * sizeof(element) and a packed source row is n times that.
 * NOTE(review): this assumes strides are expressed in bytes - confirm against
 * internal::getRowPtr.
 */
#define SPLIT64(sgn,n) void split##n(const Size2D &_size,                                               \
                                     const sgn##64 * srcBase, ptrdiff_t srcStride                       \
                                     FILL_LINES##n(FARG, sgn##64) )                                     \
{                                                                                                       \
    internal::assertSupportedConfiguration();                                                           \
    Size2D size(_size);                                                                                 \
    /* Size of one tightly packed plane row, taken before the width is changed below. */                \
    const ptrdiff_t dstRowBytes = (ptrdiff_t)(size.width * sizeof(sgn##64));                            \
    if (srcStride == (ptrdiff_t)(n) * dstRowBytes                                                       \
        FILL_LINES##n(DSTPACKED, sgn##64))                                                              \
    {                                                                                                   \
        /* Rows follow each other without gaps in every buffer: flatten to a single row. */             \
        size.width *= size.height;                                                                      \
        size.height = 1;                                                                                \
    }                                                                                                   \
    typedef internal::VecTraits<sgn##64, n>::vec64 vec64;                                               \
                                                                                                        \
    for (size_t i = 0u; i < size.height; ++i)                                                           \
    {                                                                                                   \
        const sgn##64 * src = internal::getRowPtr(srcBase, srcStride, i);                               \
        FILL_LINES##n(VROW, sgn##64)                                                                    \
        size_t sj = 0u, dj = 0u;                                                                        \
                                                                                                        \
        for (; dj < size.width; sj += n, ++dj)                                                          \
        {                                                                                               \
            vec64 v_src = vld##n##_##sgn##64(src + sj);                                                 \
            FILL_LINES##n(VST1, sgn##64)                                                                \
        }                                                                                               \
    }                                                                                                   \
}
353 
354 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
355 
/*
 * One 128-bit step of the colour/alpha split for old GCC (4.x before 4.7) on ARM32:
 * de-interleaves two consecutive 64-bit groups of 4-channel pixels, re-interleaves
 * the first three channels into dst3 (two vst3 stores) and stores the fourth
 * channel to dst1.
 *
 * The asm reads and writes memory only through pointer operands, so the "memory"
 * clobber is required to stop the compiler from caching values across it or
 * reordering memory accesses around it.
 */
#define ALPHA_QUAD(sgn, bits) { \
                                  internal::prefetch(src + sj); \
                                  __asm__ ( \
                                      "vld4." #bits " {d0, d2, d4, d6}, [%[in0]]    \n\t" \
                                      "vld4." #bits " {d1, d3, d5, d7}, [%[in1]]    \n\t" \
                                      "vst3." #bits " {d0, d2, d4}, [%[out3_1]]     \n\t" \
                                      "vst3." #bits " {d1, d3, d5}, [%[out3_2]]     \n\t" \
                                      "vst1." #bits " {d6-d7}, [%[out1]]            \n\t" \
                                      : \
                                      : [out3_1] "r" (dst3 + d3j), [out3_2] "r" (dst3 + d3j + 24/sizeof(sgn##bits)), [out1] "r" (dst1 + d1j), \
                                        [in0]  "r" (src + sj), [in1]  "r" (src + sj + 32/sizeof(sgn##bits)) \
                                      : "d0","d1","d2","d3","d4","d5","d6","d7","memory" \
                                  ); \
                              }
370 
371 #else
372 
/*
 * One 128-bit step of the colour/alpha split, intrinsic flavour: loads four
 * de-interleaved channel vectors, stores the first three re-interleaved to dst3
 * and the fourth to dst1.
 * The three-channel value is obtained by viewing the four-channel load through a
 * union - presumably vec128_3 matches the leading three vectors of vec128_4;
 * confirm against internal::VecTraits.
 */
#define ALPHA_QUAD(sgn, bits) { \
                                  internal::prefetch(&src[sj]); \
                                  union { vec128_4 all4; vec128_3 first3; } px; \
                                  px.all4 = vld4q_##sgn##bits(&src[sj]); \
                                  vst3q_##sgn##bits(&dst3[d3j], px.first3); \
                                  vst1q_##sgn##bits(&dst1[d1j], px.all4.val[3]); \
                              }
380 
381 #endif
382 
/*
 * SPLIT4ALPHA(sgn, bits) defines split4(): splits a 4-channel interleaved image
 * of sgn##bits elements into a 3-channel interleaved image (first three channels)
 * and a single-channel image (fourth channel).
 *
 *   _size                 - width and height of the image, in pixels
 *   srcBase, srcStride    - first row and row stride of the 4-channel source
 *   dst3Base, dst3Stride  - first row and row stride of the 3-channel output
 *   dst1Base, dst1Stride  - first row and row stride of the single-channel output
 *
 * Each row is processed with 128-bit steps, then at most one 64-bit step, then a
 * scalar tail. The 64-bit step uses the stores selected by sgn/bits, like the
 * 128-bit step does.
 *
 * When no buffer has row padding the image is processed as one long row. Packed
 * rows are 4x, 3x and 1x the size of a single-channel row respectively.
 * NOTE(review): this assumes strides are expressed in bytes - confirm against
 * internal::getRowPtr.
 */
#define SPLIT4ALPHA(sgn,bits) void split4(const Size2D &_size,                                          \
                                          const sgn##bits * srcBase, ptrdiff_t srcStride,               \
                                          sgn##bits * dst3Base, ptrdiff_t dst3Stride,                   \
                                          sgn##bits * dst1Base, ptrdiff_t dst1Stride)                   \
{                                                                                                       \
    internal::assertSupportedConfiguration();                                                           \
    Size2D size(_size);                                                                                 \
    /* Size of one tightly packed single-channel row, taken before the width changes. */                \
    const ptrdiff_t planeRowBytes = (ptrdiff_t)(size.width * sizeof(sgn##bits));                        \
    if (srcStride  == 4 * planeRowBytes &&                                                              \
        dst3Stride == 3 * planeRowBytes &&                                                              \
        dst1Stride == planeRowBytes)                                                                    \
    {                                                                                                   \
        /* Rows follow each other without gaps in every buffer: flatten to a single row. */             \
        size.width *= size.height;                                                                      \
        size.height = 1;                                                                                \
    }                                                                                                   \
    typedef internal::VecTraits<sgn##bits, 4>::vec128 vec128_4;                                         \
    typedef internal::VecTraits<sgn##bits, 3>::vec128 vec128_3;                                         \
    /* Positions below roiw16 / roiw8 still have a full 128-bit / 64-bit group left. */                 \
    size_t roiw16 = size.width >= (16/sizeof(sgn##bits)-1) ? size.width - (16/sizeof(sgn##bits)-1) : 0; \
    typedef internal::VecTraits<sgn##bits, 4>::vec64 vec64_4;                                           \
    typedef internal::VecTraits<sgn##bits, 3>::vec64 vec64_3;                                           \
    size_t roiw8 = size.width >= (8/sizeof(sgn##bits)-1) ? size.width - (8/sizeof(sgn##bits)-1) : 0;    \
                                                                                                        \
    for (size_t i = 0u; i < size.height; ++i)                                                           \
    {                                                                                                   \
        const sgn##bits * src = internal::getRowPtr(srcBase, srcStride, i);                             \
        sgn##bits * dst3 = internal::getRowPtr(dst3Base, dst3Stride, i);                                \
        sgn##bits * dst1 = internal::getRowPtr(dst1Base, dst1Stride, i);                                \
        size_t sj = 0u, d3j = 0u, d1j = 0u;                                                             \
                                                                                                        \
        for (; d1j < roiw16; sj += MUL4(16)/sizeof(sgn##bits), d3j += MUL3(16)/sizeof(sgn##bits),       \
                                                               d1j += 16/sizeof(sgn##bits))             \
            ALPHA_QUAD(sgn, bits)                                                                       \
                                                                                                        \
        /* Fewer than one 128-bit group remains, so a single 64-bit step is enough. */                  \
        if (d1j < roiw8)                                                                                \
        {                                                                                               \
            /* Same union view as the 128-bit step: v3 aliases the first three vectors of v4. */        \
            union { vec64_4 v4; vec64_3 v3; } vals;                                                     \
            vals.v4 = vld4_##sgn##bits(src + sj);                                                       \
            vst3_##sgn##bits(dst3 + d3j, vals.v3);                                                      \
            vst1_##sgn##bits(dst1 + d1j, vals.v4.val[3]);                                               \
            sj += MUL4(8)/sizeof(sgn##bits);                                                            \
            d3j += MUL3(8)/sizeof(sgn##bits);                                                           \
            d1j += 8/sizeof(sgn##bits);                                                                 \
        }                                                                                               \
                                                                                                        \
        for (; d1j < size.width; sj += 4, d3j += 3, ++d1j)                                              \
        {                                                                                               \
            dst3[d3j+0] = src[sj + 0];                                                                  \
            dst3[d3j+1] = src[sj + 1];                                                                  \
            dst3[d3j+2] = src[sj + 2];                                                                  \
            dst1[d1j]   = src[sj + 3];                                                                  \
        }                                                                                               \
    }                                                                                                   \
}
435 
436 #else
437 
/* Marks the base pointer and stride of destination plane idx as intentionally unused. */
#define VOID_LINE(elem, idx) (void)dst##idx##Base, (void)dst##idx##Stride;

/*
 * Fallback used when CAROTENE_NEON is not defined: split<n>() keeps the same
 * signature, calls internal::assertSupportedConfiguration() and otherwise
 * ignores all of its arguments.
 */
#define SPLIT(sgn,bits,n) void split##n(const Size2D &size,                                          \
                                    const sgn##bits * srcBase, ptrdiff_t srcStride                   \
                                    FILL_LINES##n(FARG, sgn##bits) )                                 \
{                                                                                                    \
    internal::assertSupportedConfiguration();                                                        \
    (void)size, (void)srcBase, (void)srcStride;                                                      \
    FILL_LINES##n(VOID, sgn##bits)                                                                   \
}

/* The 64-bit variant needs no special handling in the fallback. */
#define SPLIT64(sgn,n) SPLIT(sgn,64,n)
452 
/*
 * Fallback used when CAROTENE_NEON is not defined: the colour/alpha split4()
 * keeps the same signature, calls internal::assertSupportedConfiguration() and
 * otherwise ignores all of its arguments.
 */
#define SPLIT4ALPHA(sgn,bits) void split4(const Size2D &size,                                        \
                                          const sgn##bits * srcBase, ptrdiff_t srcStride,            \
                                          sgn##bits * dst3Base, ptrdiff_t dst3Stride,                \
                                          sgn##bits * dst1Base, ptrdiff_t dst1Stride)                \
{                                                                                                    \
    internal::assertSupportedConfiguration();                                                        \
    (void)size, (void)srcBase, (void)srcStride;                                                      \
    (void)dst3Base, (void)dst3Stride;                                                                \
    (void)dst1Base, (void)dst1Stride;                                                                \
}
467 
468 #endif //CAROTENE_NEON
469 
470 SPLIT(u, 8,2)
471 SPLIT(u, 8,3)
472 SPLIT(u, 8,4)
473 SPLIT(u,16,2)
474 SPLIT(u,16,3)
475 SPLIT(u,16,4)
476 SPLIT(s,32,2)
477 SPLIT(s,32,3)
478 SPLIT(s,32,4)
479 
480 SPLIT64(s, 2)
481 SPLIT64(s, 3)
482 SPLIT64(s, 4)
483 
484 SPLIT4ALPHA(u,8)
485 
486 } // namespace CAROTENE_NS
487