carotene/src/accumulate.cpp

/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */


#include "common.hpp"
#include "vtransform.hpp"

#include <cstring>

namespace CAROTENE_NS {

void accumulate(const Size2D &size,
                const u8 *srcBase, ptrdiff_t srcStride,
                s16 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t i = 0; i < size.height; ++i)
    {
        const u8* src = internal::getRowPtr(srcBase, srcStride, i);
        s16* dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        for (; j < roiw16; j += 16)
        {
            internal::prefetch(src + j);
            internal::prefetch(dst + j);
            uint8x16_t v_src = vld1q_u8(src + j);
            int16x8_t v_dst0 = vld1q_s16(dst + j);
            int16x8_t v_dst1 = vld1q_s16(dst + j + 8);
            int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
            int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
            v_dst0 = vqaddq_s16(v_dst0, v_src0);
            v_dst1 = vqaddq_s16(v_dst1, v_src1);
            vst1q_s16(dst + j, v_dst0);
            vst1q_s16(dst + j + 8, v_dst1);
        }
        for (; j < roiw8; j += 8)
        {
            uint8x8_t v_src = vld1_u8(src + j);
            int16x8_t v_src16 = vreinterpretq_s16_u16(vmovl_u8(v_src));
            int16x8_t v_dst = vld1q_s16(dst + j);
            v_dst = vqaddq_s16(v_dst, v_src16);
            vst1q_s16(dst + j, v_dst);
        }

        for (; j < size.width; j++)
            dst[j] = internal::saturate_cast<s16>(src[j] + dst[j]);
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
#endif
}

#ifdef CAROTENE_NEON

namespace {

template <int shift>
void accumulateSquareConst(const Size2D &size,
                           const u8 *srcBase, ptrdiff_t srcStride,
                           s16 *dstBase, ptrdiff_t dstStride)
{
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t i = 0; i < size.height; ++i)
    {
        const u8* src = internal::getRowPtr(srcBase, srcStride, i);
        s16* dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        for (; j < roiw16; j += 16)
        {
            internal::prefetch(src + j);
            internal::prefetch(dst + j);
            uint8x16_t v_src = vld1q_u8(src + j);
            int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8);
            int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
            int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));

            int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0);
            v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst0))),
                                  vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst0))));

            v_srclo = vget_low_s16(v_src1);
            v_srchi = vget_high_s16(v_src1);
            v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst1))),
                                  vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst1))));

            vst1q_s16(dst + j, v_dst0);
            vst1q_s16(dst + j + 8, v_dst1);
        }
        for (; j < roiw8; j += 8)
        {
            int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
            int16x8_t v_dst = vld1q_s16(dst + j);
            int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src);
            v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst))),
                                 vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst))));
            vst1q_s16(dst + j, v_dst);
        }

        for (; j < size.width; j++)
        {
            s32 srcVal = src[j];
            dst[j] = internal::saturate_cast<s16>(dst[j] + ((srcVal * srcVal) >> shift));
        }
    }
}

template <>
void accumulateSquareConst<0>(const Size2D &size,
                              const u8 *srcBase, ptrdiff_t srcStride,
                              s16 *dstBase, ptrdiff_t dstStride)
{
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t i = 0; i < size.height; ++i)
    {
        const u8* src = internal::getRowPtr(srcBase, srcStride, i);
        s16* dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        for (; j < roiw16; j += 16)
        {
            internal::prefetch(src + j);
            internal::prefetch(dst + j);
            uint8x16_t v_src = vld1q_u8(src + j);
            int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8);
            int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
            int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));

            int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0);
            v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst0))),
                                  vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst0))));

            v_srclo = vget_low_s16(v_src1);
            v_srchi = vget_high_s16(v_src1);
            v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst1))),
                                  vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst1))));

            vst1q_s16(dst + j, v_dst0);
            vst1q_s16(dst + j + 8, v_dst1);
        }
        for (; j < roiw8; j += 8)
        {
            int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
            int16x8_t v_dst = vld1q_s16(dst + j);
            int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src);
            v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst))),
                                 vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst))));
            vst1q_s16(dst + j, v_dst);
        }

        for (; j < size.width; j++)
        {
            s32 srcVal = src[j];
            dst[j] = internal::saturate_cast<s16>(dst[j] + srcVal * srcVal);
        }
    }
}

typedef void (* accumulateSquareConstFunc)(const Size2D &size,
                                           const u8 *srcBase, ptrdiff_t srcStride,
                                           s16 *dstBase, ptrdiff_t dstStride);

} // namespace

#endif

void accumulateSquare(const Size2D &size,
                      const u8 *srcBase, ptrdiff_t srcStride,
                      s16 *dstBase, ptrdiff_t dstStride,
                      u32 shift)
{
    if (shift >= 16)
    {
        for (size_t i = 0; i < size.height; ++i)
        {
            s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
            std::memset(dst, 0, sizeof(s16) * size.width);
        }
        return;
    }

    internal::assertSupportedConfiguration();

#ifdef CAROTENE_NEON
    // this ugly contruction is needed to avoid:
    // /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
    // return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1);

    accumulateSquareConstFunc funcs[16] =
    {
        accumulateSquareConst<0>,
        accumulateSquareConst<1>,
        accumulateSquareConst<2>,
        accumulateSquareConst<3>,
        accumulateSquareConst<4>,
        accumulateSquareConst<5>,
        accumulateSquareConst<6>,
        accumulateSquareConst<7>,
        accumulateSquareConst<8>,
        accumulateSquareConst<9>,
        accumulateSquareConst<10>,
        accumulateSquareConst<11>,
        accumulateSquareConst<12>,
        accumulateSquareConst<13>,
        accumulateSquareConst<14>,
        accumulateSquareConst<15>
    }, func = funcs[shift];

    func(size, srcBase, srcStride, dstBase, dstStride);
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)shift;
#endif
}

#ifdef CAROTENE_NEON

namespace {

struct AccumulateWeightedHalf
{
    typedef u8 type;

    void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
                     uint8x16_t & v_dst) const
    {
        v_dst = vhaddq_u8(v_src0, v_src1);
    }

    void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1,
                     uint8x8_t & v_dst) const
    {
        v_dst = vhadd_u8(v_src0, v_src1);
    }

    void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
    {
        dst[0] = ((u16)(src0[0]) + src1[0]) >> 1;
    }
};

struct AccumulateWeighted
{
    typedef u8 type;

    float alpha, beta;
    float32x4_t v_alpha, v_beta;

    explicit AccumulateWeighted(float _alpha) :
        alpha(_alpha), beta(1 - _alpha)
    {
        v_alpha = vdupq_n_f32(alpha);
        v_beta = vdupq_n_f32(beta);
    }

    void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
                     uint8x16_t & v_dst) const
    {
        uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0));
        uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1));
        float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta),
                                        v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))));
        float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta),
                                        v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))));
        uint16x8_t v_dst0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
                                         vmovn_u32(vcvtq_u32_f32(v_dst1f)));

        v_src0_p = vmovl_u8(vget_high_u8(v_src0));
        v_src1_p = vmovl_u8(vget_high_u8(v_src1));
        v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta),
                            v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))));
        v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta),
                            v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))));
        uint16x8_t v_dst1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
                                         vmovn_u32(vcvtq_u32_f32(v_dst1f)));

        v_dst = vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1));
    }

    void operator() (const uint8x8_t & _v_src0, const uint8x8_t & _v_src1,
                     uint8x8_t & v_dst) const
    {
        uint16x8_t v_src0 = vmovl_u8(_v_src0), v_src1 = vmovl_u8(_v_src1);

        float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), v_beta),
                                        v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))));
        float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), v_beta),
                                        v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))));
        uint16x8_t _v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
                                        vmovn_u32(vcvtq_u32_f32(v_dst1f)));

        v_dst = vmovn_u16(_v_dst);
    }

    void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
    {
        dst[0] = beta * src1[0] + alpha * src0[0];
    }
};

} // namespace

#endif

void accumulateWeighted(const Size2D &size,
                        const u8 *srcBase, ptrdiff_t srcStride,
                        u8 *dstBase, ptrdiff_t dstStride,
                        f32 alpha)
{
    if (alpha == 0.0f)
        return;
    if (alpha == 1.0f)
    {
        for (size_t i = 0; i < size.height; ++i)
        {
            const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
            u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
            std::memcpy(dst, src, sizeof(u8) * size.width);
        }
        return;
    }

    internal::assertSupportedConfiguration();

#ifdef CAROTENE_NEON
    // in this case we can use the following scheme:
    // dst[p] = (src[p] + dst[p]) >> 1
    // which is faster
    if (alpha == 0.5f)
    {
        internal::vtransform(size,
                             srcBase, srcStride,
                             dstBase, dstStride,
                             dstBase, dstStride,
                             AccumulateWeightedHalf());

        return;
    }

    internal::vtransform(size,
                     srcBase, srcStride,
                     dstBase, dstStride,
                     dstBase, dstStride,
                     AccumulateWeighted(alpha));
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)alpha;
#endif
}

} //namespace CAROTENE_NS