/* * By downloading, copying, installing or using the software you agree to this license. * If you do not agree to this license, do not download, install, * copy or use the software. * * * License Agreement * For Open Source Computer Vision Library * (3-clause BSD License) * * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. * Third party copyrights are property of their respective owners. * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * * Neither the names of the copyright holders nor the names of the contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * This software is provided by the copyright holders and contributors "as is" and * any express or implied warranties, including, but not limited to, the implied * warranties of merchantability and fitness for a particular purpose are disclaimed. * In no event shall copyright holders or contributors be liable for any direct, * indirect, incidental, special, exemplary, or consequential damages * (including, but not limited to, procurement of substitute goods or services; * loss of use, data, or profits; or business interruption) however caused * and on any theory of liability, whether in contract, strict liability, * or tort (including negligence or otherwise) arising in any way out of * the use of this software, even if advised of the possibility of such damage. */ #include "common.hpp" #include "vtransform.hpp" namespace CAROTENE_NS { #define FILL_LINES2(macro,type) \ macro##_LINE(type,0) \ macro##_LINE(type,1) #define FILL_LINES3(macro,type) \ FILL_LINES2(macro,type) \ macro##_LINE(type,2) #define FILL_LINES4(macro,type) \ FILL_LINES3(macro,type) \ macro##_LINE(type,3) #define FARG_LINE(type, n) , const type * src##n##Base, ptrdiff_t src##n##Stride #ifdef CAROTENE_NEON #define VROW_LINE(type, n) const type * src##n = internal::getRowPtr(src##n##Base, src##n##Stride, i); #define PREF_LINE(type, n) internal::prefetch(src##n + sj); #define VLD1Q_LINE(type, n) v_dst.val[n] = vld1q_##type(src##n + sj); #define PRLD_LINE(type, n) internal::prefetch(src##n + sj); v_dst.val[n] = vld1q_##type(src##n + sj); #define VLD1_LINE(type, n) v_dst.val[n] = vld1_##type(src##n + sj); #define SLD_LINE(type, n) dst[dj + n] = src##n[sj]; #define MUL2(val) (val << 1) #define MUL3(val) (MUL2(val) + val) #define MUL4(val) (val << 2) #define CONTSRC2 dstStride == src0Stride && \ dstStride == src1Stride && #define CONTSRC3 dstStride == src0Stride && \ dstStride == src1Stride && \ dstStride == src2Stride && #define CONTSRC4 dstStride == src0Stride && \ dstStride == src1Stride && \ dstStride == src2Stride && \ dstStride == src3Stride && #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__) #define MERGE_ASM2(sgn, bits) __asm__ ( \ "vld1." #bits " {d0-d1}, [%[in0]] \n\t" \ "vld1." #bits " {d2-d3}, [%[in1]] \n\t" \ "vst2." #bits " {d0, d2}, [%[out0]] \n\t" \ "vst2." #bits " {d1, d3}, [%[out1]] \n\t" \ : \ : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), \ [out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL2(8)/sizeof(sgn##bits)) \ : "d0","d1","d2","d3" \ ); #define MERGE_ASM3(sgn, bits) __asm__ ( \ "vld1." #bits " {d0-d1}, [%[in0]] \n\t" \ "vld1." #bits " {d2-d3}, [%[in1]] \n\t" \ "vld1." #bits " {d4-d5}, [%[in2]] \n\t" \ "vst3." #bits " {d0, d2, d4}, [%[out0]] \n\t" \ "vst3." #bits " {d1, d3, d5}, [%[out1]] \n\t" \ : \ : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), \ [out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL3(8)/sizeof(sgn##bits)) \ : "d0","d1","d2","d3","d4","d5" \ ); #define MERGE_ASM4(sgn, bits) __asm__ ( \ "vld1." #bits " {d0-d1}, [%[in0]] \n\t" \ "vld1." #bits " {d2-d3}, [%[in1]] \n\t" \ "vld1." #bits " {d4-d5}, [%[in2]] \n\t" \ "vld1." #bits " {d6-d7}, [%[in3]] \n\t" \ "vst4." #bits " {d0, d2, d4, d6}, [%[out0]] \n\t" \ "vst4." #bits " {d1, d3, d5, d7}, [%[out1]] \n\t" \ : \ : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), [in3] "r" (src3 + sj), \ [out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL4(8)/sizeof(sgn##bits)) \ : "d0","d1","d2","d3","d4","d5","d6","d7" \ ); #define MERGE_QUAD(sgn, bits, n) { \ FILL_LINES##n(PREF, sgn##bits) \ MERGE_ASM##n(sgn, bits) \ } #else #define MERGE_QUAD(sgn, bits, n) { \ vec128 v_dst; \ /*FILL_LINES##n(PREF, sgn##bits) \ FILL_LINES##n(VLD1Q, sgn##bits)*/ \ FILL_LINES##n(PRLD, sgn##bits) \ vst##n##q_##sgn##bits(dst + dj, v_dst); \ } #endif #define COMBINE(sgn,bits,n) void combine##n(const Size2D &_size \ FILL_LINES##n(FARG, sgn##bits), \ sgn##bits * dstBase, ptrdiff_t dstStride) \ { \ internal::assertSupportedConfiguration(); \ Size2D size(_size); \ if (CONTSRC##n \ dstStride == (ptrdiff_t)(size.width)) \ { \ size.width *= size.height; \ size.height = 1; \ } \ typedef internal::VecTraits::vec128 vec128; \ size_t roiw16 = size.width >= (16/sizeof(sgn##bits) - 1) ? size.width - (16/sizeof(sgn##bits) - 1) : 0; \ typedef internal::VecTraits::vec64 vec64; \ size_t roiw8 = size.width >= (8/sizeof(sgn##bits) - 1) ? size.width - (8/sizeof(sgn##bits) - 1) : 0; \ \ for (size_t i = 0u; i < size.height; ++i) \ { \ FILL_LINES##n(VROW, sgn##bits) \ sgn##bits * dst = internal::getRowPtr(dstBase, dstStride, i); \ size_t sj = 0u, dj = 0u; \ \ for (; sj < roiw16; sj += 16/sizeof(sgn##bits), dj += MUL##n(16)/sizeof(sgn##bits)) \ MERGE_QUAD(sgn, bits, n) \ \ if ( sj < roiw8 ) \ { \ vec64 v_dst; \ FILL_LINES##n(VLD1, sgn##bits) \ vst##n##_##sgn##bits(dst + dj, v_dst); \ sj += 8/sizeof(sgn##bits); dj += MUL##n(8)/sizeof(sgn##bits); \ } \ \ for (; sj < size.width; ++sj, dj += n) \ { \ FILL_LINES##n(SLD, sgn##bits) \ } \ } \ } #define COMBINE64(sgn,n) void combine##n(const Size2D &_size \ FILL_LINES##n(FARG, sgn##64), \ sgn##64 * dstBase, ptrdiff_t dstStride) \ { \ internal::assertSupportedConfiguration(); \ Size2D size(_size); \ if (CONTSRC##n \ dstStride == (ptrdiff_t)(size.width)) \ { \ size.width *= size.height; \ size.height = 1; \ } \ typedef internal::VecTraits::vec64 vec64; \ \ for (size_t i = 0u; i < size.height; ++i) \ { \ FILL_LINES##n(VROW, sgn##64) \ sgn##64 * dst = internal::getRowPtr(dstBase, dstStride, i); \ size_t sj = 0u, dj = 0u; \ \ for (; sj < size.width; ++sj, dj += n) \ { \ vec64 v_dst; \ FILL_LINES##n(VLD1, sgn##64) \ vst##n##_##sgn##64(dst + dj, v_dst); \ /*FILL_LINES##n(SLD, sgn##64)*/ \ } \ } \ } #else #define VOID_LINE(type, n) (void)src##n##Base; (void)src##n##Stride; #define COMBINE(sgn,bits,n) void combine##n(const Size2D &size \ FILL_LINES##n(FARG, sgn##bits), \ sgn##bits * dstBase, ptrdiff_t dstStride) \ { \ internal::assertSupportedConfiguration(); \ (void)size; \ FILL_LINES##n(VOID, sgn##bits) \ (void)dstBase; \ (void)dstStride; \ } #define COMBINE64(sgn,n) COMBINE(sgn,64,n) #endif //CAROTENE_NEON COMBINE(u, 8,2) COMBINE(u, 8,3) COMBINE(u, 8,4) COMBINE(u,16,2) COMBINE(u,16,3) COMBINE(u,16,4) COMBINE(s,32,2) COMBINE(s,32,3) COMBINE(s,32,4) COMBINE64(s, 2) COMBINE64(s, 3) COMBINE64(s, 4) void combineYUYV(const Size2D &size, const u8 * srcyBase, ptrdiff_t srcyStride, const u8 * srcuBase, ptrdiff_t srcuStride, const u8 * srcvBase, ptrdiff_t srcvStride, u8 * dstBase, ptrdiff_t dstStride) { internal::assertSupportedConfiguration(); #ifdef CAROTENE_NEON #ifndef __ANDROID__ size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; #endif size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; for (size_t i = 0u; i < size.height; i += 1) { const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i); const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i); const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i); u8 * dst = internal::getRowPtr(dstBase, dstStride, i); size_t syj = 0u, sj = 0u, dj = 0u; #ifndef __ANDROID__ for (; sj < roiw32; sj += 32, syj += 64, dj += 128) { internal::prefetch(srcy + syj); internal::prefetch(srcu + sj); internal::prefetch(srcv + sj); uint8x16x2_t v_y = vld2q_u8(srcy + syj); uint8x16x4_t v_dst; v_dst.val[0] = v_y.val[0]; v_dst.val[1] = vld1q_u8(srcu + sj); v_dst.val[2] = v_y.val[1]; v_dst.val[3] = vld1q_u8(srcv + sj); vst4q_u8(dst + dj, v_dst); v_y = vld2q_u8(srcy + syj + 32); v_dst.val[0] = v_y.val[0]; v_dst.val[1] = vld1q_u8(srcu + sj + 16); v_dst.val[2] = v_y.val[1]; v_dst.val[3] = vld1q_u8(srcv + sj + 16); vst4q_u8(dst + dj + 64, v_dst); } #endif for (; sj < roiw8; sj += 8, syj += 16, dj += 32) { uint8x8x2_t v_y = vld2_u8(srcy + syj); uint8x8x4_t v_dst; v_dst.val[0] = v_y.val[0]; v_dst.val[1] = vld1_u8(srcu + sj); v_dst.val[2] = v_y.val[1]; v_dst.val[3] = vld1_u8(srcv + sj); vst4_u8(dst + dj, v_dst); } for (; sj < size.width; ++sj, syj += 2, dj += 4) { dst[dj] = srcy[syj]; dst[dj + 1] = srcu[sj]; dst[dj + 2] = srcy[syj + 1]; dst[dj + 3] = srcv[sj]; } } #else (void)size; (void)srcyBase; (void)srcyStride; (void)srcuBase; (void)srcuStride; (void)srcvBase; (void)srcvStride; (void)dstBase; (void)dstStride; #endif } void combineUYVY(const Size2D &size, const u8 * srcyBase, ptrdiff_t srcyStride, const u8 * srcuBase, ptrdiff_t srcuStride, const u8 * srcvBase, ptrdiff_t srcvStride, u8 * dstBase, ptrdiff_t dstStride) { internal::assertSupportedConfiguration(); #ifdef CAROTENE_NEON #ifndef __ANDROID__ size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; #endif size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; for (size_t i = 0u; i < size.height; ++i) { const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i); const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i); const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i); u8 * dst = internal::getRowPtr(dstBase, dstStride, i); size_t syj = 0u, sj = 0u, dj = 0u; #ifndef __ANDROID__ for (; sj < roiw32; sj += 32, syj += 64, dj += 128) { internal::prefetch(srcy + syj); internal::prefetch(srcu + sj); internal::prefetch(srcv + sj); uint8x16x2_t v_y = vld2q_u8(srcy + syj); uint8x16x4_t v_dst; v_dst.val[0] = vld1q_u8(srcu + sj); v_dst.val[1] = v_y.val[0]; v_dst.val[2] = vld1q_u8(srcv + sj); v_dst.val[3] = v_y.val[1]; vst4q_u8(dst + dj, v_dst); v_y = vld2q_u8(srcy + syj + 32); v_dst.val[0] = vld1q_u8(srcu + sj + 16); v_dst.val[1] = v_y.val[0]; v_dst.val[2] = vld1q_u8(srcv + sj + 16); v_dst.val[3] = v_y.val[1]; vst4q_u8(dst + dj + 64, v_dst); } #endif for (; sj < roiw8; sj += 8, syj += 16, dj += 32) { uint8x8x2_t v_y = vld2_u8(srcy + syj); uint8x8x4_t v_dst; v_dst.val[0] = vld1_u8(srcu + sj); v_dst.val[1] = v_y.val[0]; v_dst.val[2] = vld1_u8(srcv + sj); v_dst.val[3] = v_y.val[1]; vst4_u8(dst + dj, v_dst); } for (; sj < size.width; ++sj, syj += 2, dj += 4) { dst[dj] = srcu[sj]; dst[dj + 1] = srcy[syj]; dst[dj + 2] = srcv[sj]; dst[dj + 3] = srcy[syj + 1]; } } #else (void)size; (void)srcyBase; (void)srcyStride; (void)srcuBase; (void)srcuStride; (void)srcvBase; (void)srcvStride; (void)dstBase; (void)dstStride; #endif } } // namespace CAROTENE_NS