// gdal-3.3.3/gcore/gdal_avx2_emulation.hpp

/******************************************************************************
 * Project:  GDAL
 * Purpose:  AVX2 emulation with SSE2, plus SSE2 emulation of a few SSE4.1 intrinsics
 * Author:   Even Rouault <even dot rouault at spatialys dot com>
 *
 ******************************************************************************
 * Copyright (c) 2016, Even Rouault <even dot rouault at spatialys dot com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 ****************************************************************************/

#ifndef GDAL_AVX2_EMULATION_H_INCLUDED
#define GDAL_AVX2_EMULATION_H_INCLUDED

#include <emmintrin.h>

#ifdef __SSE4_1__
#include <smmintrin.h>

// SSE4.1 is enabled (__SSE4_1__ defined): the GDALmm_* names are plain
// aliases of the native 128-bit intrinsics, so no emulation is needed.
#define GDALmm_min_epu16   _mm_min_epu16
#define GDALmm_max_epu16   _mm_max_epu16
#define GDALmm_mullo_epi32 _mm_mullo_epi32

#else
// Emulation of SSE4.1 _mm_min_epu16, _mm_max_epu16 and _mm_mullo_epi32 with SSE2 only

// Per-lane unsigned 16-bit "x <= y" comparison.
// Each 16-bit lane of the result is 0xFFFF where x <= y, and 0 otherwise.
static inline __m128i GDALAVX2Emul_mm_cmple_epu16 (__m128i x, __m128i y)
{
    // Unsigned saturating subtraction clamps x - y to 0 exactly when x <= y.
    const __m128i saturated_diff = _mm_subs_epu16(x, y);
    const __m128i all_zero = _mm_setzero_si128();
    return _mm_cmpeq_epi16(saturated_diff, all_zero);
}

// Bitwise select: every result bit comes from then_reg where the matching
// mask bit is 1, and from else_reg where it is 0.
static inline __m128i GDALAVX2Emul_mm_ternary(__m128i mask,
                                      __m128i then_reg,
                                      __m128i else_reg)
{
    const __m128i from_then = _mm_and_si128(mask, then_reg);
    // _mm_andnot_si128(a, b) computes (~a) & b.
    const __m128i from_else = _mm_andnot_si128(mask, else_reg);
    return _mm_or_si128(from_then, from_else);
}

// SSE2-only stand-in for _mm_min_epu16: per-lane minimum of unsigned
// 16-bit integers.
static inline __m128i GDALmm_min_epu16 (__m128i x, __m128i y)
{
    // Keep x in the lanes where x <= y, otherwise keep y.
    return GDALAVX2Emul_mm_ternary(GDALAVX2Emul_mm_cmple_epu16(x, y), x, y);
}

// SSE2-only stand-in for _mm_max_epu16: per-lane maximum of unsigned
// 16-bit integers.
static inline __m128i GDALmm_max_epu16 (__m128i x, __m128i y)
{
    // Keep y in the lanes where x <= y, otherwise keep x.
    return GDALAVX2Emul_mm_ternary(GDALAVX2Emul_mm_cmple_epu16(x, y), y, x);
}

// SSE2-only stand-in for _mm_mullo_epi32: per-lane 32-bit multiplication
// keeping the low 32 bits of each product.
static inline __m128i GDALmm_mullo_epi32 (__m128i x, __m128i y)
{
    // _mm_mul_epu32 multiplies 32-bit lanes 0 and 2 into two 64-bit products.
    const __m128i prod_even = _mm_mul_epu32(x, y);
    // Shifting both operands right by 4 bytes brings lanes 1 and 3 into the
    // positions that _mm_mul_epu32 reads.
    const __m128i prod_odd = _mm_mul_epu32(_mm_srli_si128(x, 4),
                                           _mm_srli_si128(y, 4));
    // Pack the low 32 bits of both products into lanes 0 and 1.
    // _MM_SHUFFLE(0, 0, 2, 0) is the same immediate as 2 << 2.
    const __m128i low_even = _mm_shuffle_epi32(prod_even, _MM_SHUFFLE(0, 0, 2, 0));
    const __m128i low_odd = _mm_shuffle_epi32(prod_odd, _MM_SHUFFLE(0, 0, 2, 0));
    // Interleave back into the original lane order: even0, odd0, even1, odd1.
    return _mm_unpacklo_epi32(low_even, low_odd);
}
#endif // __SSE4_1__


#ifdef __AVX2__

#include <immintrin.h>

// AVX2 is enabled (__AVX2__ defined): GDALm256i is the native 256-bit
// integer register type.
typedef __m256i GDALm256i;

// Each GDALmm256_* name below is a plain alias of the native AVX2 intrinsic
// of the same name, so no emulation is involved in this branch.
#define GDALmm256_set1_epi8             _mm256_set1_epi8
#define GDALmm256_set1_epi16            _mm256_set1_epi16
#define GDALmm256_setzero_si256         _mm256_setzero_si256
#define GDALmm256_load_si256            _mm256_load_si256
#define GDALmm256_store_si256           _mm256_store_si256
#define GDALmm256_storeu_si256          _mm256_storeu_si256
#define GDALmm256_cmpeq_epi8            _mm256_cmpeq_epi8
#define GDALmm256_sad_epu8              _mm256_sad_epu8
#define GDALmm256_add_epi32             _mm256_add_epi32
#define GDALmm256_andnot_si256          _mm256_andnot_si256
#define GDALmm256_and_si256             _mm256_and_si256
#define GDALmm256_or_si256              _mm256_or_si256
#define GDALmm256_min_epu8              _mm256_min_epu8
#define GDALmm256_max_epu8              _mm256_max_epu8
#define GDALmm256_extracti128_si256     _mm256_extracti128_si256
#define GDALmm256_cvtepu8_epi16         _mm256_cvtepu8_epi16
#define GDALmm256_madd_epi16            _mm256_madd_epi16
#define GDALmm256_min_epu16             _mm256_min_epu16
#define GDALmm256_max_epu16             _mm256_max_epu16
#define GDALmm256_cvtepu16_epi32        _mm256_cvtepu16_epi32
#define GDALmm256_cvtepu16_epi64        _mm256_cvtepu16_epi64
#define GDALmm256_cvtepu32_epi64        _mm256_cvtepu32_epi64
#define GDALmm256_mullo_epi32           _mm256_mullo_epi32
#define GDALmm256_add_epi64             _mm256_add_epi64
#define GDALmm256_add_epi16             _mm256_add_epi16
#define GDALmm256_sub_epi16             _mm256_sub_epi16
#define GDALmm256_min_epi16             _mm256_min_epi16
#define GDALmm256_max_epi16             _mm256_max_epi16

#else

// Emulated 256-bit integer register used when AVX2 is not available:
// a pair of 128-bit SSE registers.
typedef struct
{
    __m128i low;   // first 16 bytes in memory order; selected by extract index 0
    __m128i high;  // following 16 bytes; selected by any non-zero extract index
} GDALm256i;

// Emulation of _mm256_set1_epi8: replicates the byte c into every 8-bit
// lane of both 128-bit halves.
static inline GDALm256i GDALmm256_set1_epi8(char c)
{
    const __m128i splat = _mm_set1_epi8(c);
    GDALm256i reg;
    reg.high = splat;
    reg.low = splat;
    return reg;
}

// Emulation of _mm256_set1_epi16: replicates the 16-bit value s into every
// 16-bit lane of both 128-bit halves.
static inline GDALm256i GDALmm256_set1_epi16(short s)
{
    const __m128i splat = _mm_set1_epi16(s);
    GDALm256i reg;
    reg.high = splat;
    reg.low = splat;
    return reg;
}

// Emulation of _mm256_setzero_si256: returns a register whose two halves
// are all-zero.
static inline GDALm256i GDALmm256_setzero_si256()
{
    const __m128i zero = _mm_setzero_si128();
    GDALm256i reg;
    reg.high = zero;
    reg.low = zero;
    return reg;
}

// Emulation of _mm256_load_si256: reads 32 bytes starting at p as two
// consecutive 128-bit halves.
// Both halves go through the aligned load _mm_load_si128, so p must be at
// least 16-byte aligned.
static inline GDALm256i GDALmm256_load_si256(GDALm256i const * p)
{
    // View the 32 bytes as two adjacent 16-byte vectors.
    const __m128i* const halves = reinterpret_cast<const __m128i*>(p);
    GDALm256i reg;
    reg.low = _mm_load_si128(halves);
    reg.high = _mm_load_si128(halves + 1);
    return reg;
}

// Emulation of _mm256_store_si256: writes reg to the 32 bytes starting at
// p, low half first.
// Both halves go through the aligned store _mm_store_si128, so p must be at
// least 16-byte aligned.
static inline void GDALmm256_store_si256(GDALm256i * p, GDALm256i reg)
{
    // View the destination as two adjacent 16-byte vectors.
    __m128i* const halves = reinterpret_cast<__m128i*>(p);
    _mm_store_si128(halves, reg.low);
    _mm_store_si128(halves + 1, reg.high);
}

// Emulation of _mm256_storeu_si256: writes reg to the 32 bytes starting at
// p, low half first, using unaligned 128-bit stores.
static inline void GDALmm256_storeu_si256(GDALm256i * p, GDALm256i reg)
{
    // View the destination as two adjacent 16-byte vectors.
    __m128i* const halves = reinterpret_cast<__m128i*>(p);
    _mm_storeu_si128(halves, reg.low);
    _mm_storeu_si128(halves + 1, reg.high);
}

// Generates a two-operand emulated 256-bit function named mm256name that
// applies the 128-bit operation mm128name independently to the low halves
// and to the high halves of its arguments.
#define DEFINE_BINARY_MM256(mm256name, mm128name) \
static inline GDALm256i mm256name(GDALm256i r1, GDALm256i r2) \
{ \
    GDALm256i halves; \
    halves.low = mm128name(r1.low, r2.low); \
    halves.high = mm128name(r1.high, r2.high); \
    return halves; \
}

// Emulated binary operations, each applying the named 128-bit operation to
// the low and high halves separately. The three GDALmm_* operands resolve
// either to the native SSE4.1 intrinsic or to its SSE2 emulation, depending
// on whether __SSE4_1__ is defined.
DEFINE_BINARY_MM256(GDALmm256_cmpeq_epi8, _mm_cmpeq_epi8)
DEFINE_BINARY_MM256(GDALmm256_sad_epu8, _mm_sad_epu8)
DEFINE_BINARY_MM256(GDALmm256_add_epi32, _mm_add_epi32)
DEFINE_BINARY_MM256(GDALmm256_andnot_si256, _mm_andnot_si128)
DEFINE_BINARY_MM256(GDALmm256_and_si256, _mm_and_si128)
DEFINE_BINARY_MM256(GDALmm256_or_si256, _mm_or_si128)
DEFINE_BINARY_MM256(GDALmm256_min_epu8, _mm_min_epu8)
DEFINE_BINARY_MM256(GDALmm256_max_epu8, _mm_max_epu8)
DEFINE_BINARY_MM256(GDALmm256_madd_epi16, _mm_madd_epi16)
DEFINE_BINARY_MM256(GDALmm256_min_epu16, GDALmm_min_epu16)
DEFINE_BINARY_MM256(GDALmm256_max_epu16, GDALmm_max_epu16)
DEFINE_BINARY_MM256(GDALmm256_mullo_epi32, GDALmm_mullo_epi32)
DEFINE_BINARY_MM256(GDALmm256_add_epi64, _mm_add_epi64)
DEFINE_BINARY_MM256(GDALmm256_add_epi16, _mm_add_epi16)
DEFINE_BINARY_MM256(GDALmm256_sub_epi16, _mm_sub_epi16)
DEFINE_BINARY_MM256(GDALmm256_min_epi16, _mm_min_epi16)
DEFINE_BINARY_MM256(GDALmm256_max_epi16, _mm_max_epi16)

// Emulation of _mm256_extracti128_si256: returns one 128-bit half of reg.
// index == 0 selects the low half; any other value selects the high half.
static inline __m128i GDALmm256_extracti128_si256(GDALm256i reg, int index)
{
    if( index == 0 )
    {
        return reg.low;
    }
    return reg.high;
}

// Emulation of _mm256_cvtepu8_epi16: zero-extends the sixteen unsigned
// 8-bit lanes of reg128 to sixteen 16-bit lanes.
// The low half of the result holds source bytes 0..7 and the high half
// holds source bytes 8..15.
static inline GDALm256i GDALmm256_cvtepu8_epi16(__m128i reg128)
{
    const __m128i zero = _mm_setzero_si128();
    GDALm256i reg;
    // Interleaving with zero bytes places a 0 above each source byte,
    // which is exactly an unsigned 8 -> 16 bit widening.
    reg.low = _mm_unpacklo_epi8(reg128, zero);
    // unpackhi reads the upper 8 bytes directly, so no shuffle is needed to
    // bring them down to the lower half first.
    reg.high = _mm_unpackhi_epi8(reg128, zero);
    return reg;
}

// Emulation of _mm256_cvtepu16_epi32: zero-extends the eight unsigned
// 16-bit lanes of reg128 to eight 32-bit lanes.
// The low half of the result holds source lanes 0..3 and the high half
// holds source lanes 4..7.
static inline GDALm256i GDALmm256_cvtepu16_epi32(__m128i reg128)
{
    const __m128i zero = _mm_setzero_si128();
    GDALm256i reg;
    // Interleaving with zero words places a 0 above each source word,
    // which is exactly an unsigned 16 -> 32 bit widening.
    reg.low = _mm_unpacklo_epi16(reg128, zero);
    // unpackhi reads the upper four words directly, so no shuffle is needed
    // to bring them down to the lower half first.
    reg.high = _mm_unpackhi_epi16(reg128, zero);
    return reg;
}

// Emulation of _mm256_cvtepu16_epi64: zero-extends the four lowest
// unsigned 16-bit lanes of reg128 to four 64-bit lanes.
// The low half of the result holds source lanes 0..1 and the high half
// holds source lanes 2..3; source lanes 4..7 are ignored.
static inline GDALm256i GDALmm256_cvtepu16_epi64(__m128i reg128)
{
    const __m128i zero = _mm_setzero_si128();
    // First widening step, done once: source lanes 0..3 as four 32-bit
    // values.
    const __m128i widened32 = _mm_unpacklo_epi16(reg128, zero);
    GDALm256i reg;
    // Second widening step: 32-bit values 0..1 go to the low half and
    // 32-bit values 2..3 go to the high half, each zero-extended to 64 bits.
    reg.low = _mm_unpacklo_epi32(widened32, zero);
    reg.high = _mm_unpackhi_epi32(widened32, zero);
    return reg;
}

// Emulation of _mm256_cvtepu32_epi64: zero-extends the four unsigned
// 32-bit lanes of reg128 to four 64-bit lanes.
// The low half of the result holds source lanes 0..1 and the high half
// holds source lanes 2..3.
static inline GDALm256i GDALmm256_cvtepu32_epi64(__m128i reg128)
{
    const __m128i zero = _mm_setzero_si128();
    GDALm256i reg;
    // Interleaving with zero dwords places a 0 above each source dword,
    // which is exactly an unsigned 32 -> 64 bit widening.
    reg.low = _mm_unpacklo_epi32(reg128, zero);
    // unpackhi reads the upper two dwords directly, so no shuffle is needed
    // to bring them down to the lower half first.
    reg.high = _mm_unpackhi_epi32(reg128, zero);
    return reg;
}

#endif

#endif /* GDAL_AVX2_EMULATION_H_INCLUDED */