/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_INT_CONVERSION_HPP
#define XSIMD_INT_CONVERSION_HPP

#include "xsimd_base.hpp"

namespace xsimd
{
    namespace detail
    {
        /************************************
         * conversion of 8 int8 <-> 8 int32 *
         ************************************/

        // a contains 8 int8 in its low half
        __m256i xsimd_cvtepi8_epi32(__m128i a);
        __m256i xsimd_cvtepu8_epi32(__m128i a);

        // Returns a vector containing 8 int8 in its low half
        __m128i xsimd_cvtepi32_epi8(__m256i a);
        __m128i xsimd_cvtepi32_epu8(__m256i a);

        /*************************************
         * conversion of 8 int16 <-> 8 int32 *
         *************************************/

        // a contains 8 int16
        __m256i xsimd_cvtepi16_epi32(__m128i a);
        __m256i xsimd_cvtepu16_epi32(__m128i a);

        // Returns a vector containing 8 int16
        __m128i xsimd_cvtepi32_epi16(__m256i a);
        __m128i xsimd_cvtepi32_epu16(__m256i a);
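
        // Usage sketch (illustrative only; a hypothetical caller, not code
        // from this header): widening the 8 low int8 lanes of an __m128i to
        // int32 and narrowing them back.
        //
        //     __m128i bytes  = _mm_setr_epi8(1, -2, 3, -4, 5, -6, 7, -8,
        //                                    0, 0, 0, 0, 0, 0, 0, 0);
        //     __m256i wide   = xsimd_cvtepi8_epi32(bytes); // 8 int32 lanes
        //     __m128i narrow = xsimd_cvtepi32_epi8(wide);  // low half: 8 int8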

        /******************
         * Implementation *
         ******************/

        inline __m256i xsimd_cvtepi8_epi32(__m128i a)
        {
#if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION
            __m256i res = _mm256_cvtepi8_epi32(a);
#else
            // Sign-extend by interleaving each lane with its sign mask:
            // int8 -> int16 in the low half, then int16 -> int32 halves.
            __m128i mask = _mm_cmplt_epi8(a, _mm_set1_epi8(0));
            __m128i tmp1 = _mm_unpacklo_epi8(a, mask);
            mask = _mm_cmplt_epi16(tmp1, _mm_set1_epi16(0));
            __m128i tmp2 = _mm_unpacklo_epi16(tmp1, mask);
            __m128i tmp3 = _mm_unpackhi_epi16(tmp1, mask);
            __m256i res = _mm256_castsi128_si256(tmp2);
            res = _mm256_insertf128_si256(res, tmp3, 1);
#endif
            return res;
        }

        inline __m256i xsimd_cvtepu8_epi32(__m128i a)
        {
#if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION
            __m256i res = _mm256_cvtepu8_epi32(a);
#else
            // Zero-extend by interleaving with zeros instead of a sign mask.
            __m128i tmp1 = _mm_unpacklo_epi8(a, _mm_set1_epi8(0));
            __m128i tmp2 = _mm_unpacklo_epi16(tmp1, _mm_set1_epi16(0));
            __m128i tmp3 = _mm_unpackhi_epi16(tmp1, _mm_set1_epi16(0));
            __m256i res = _mm256_castsi128_si256(tmp2);
            res = _mm256_insertf128_si256(res, tmp3, 1);
#endif
            return res;
        }

        inline __m128i xsimd_cvtepi32_epi8(__m256i a)
        {
#if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION
            // _mm256_packs_epi32 packs within 128-bit lanes, so reorder the
            // 64-bit blocks before the final pack down to int8.
            __m256i tmp2 = _mm256_packs_epi32(a, a);
            __m256i tmp3 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(3, 1, 2, 0));
            __m256i tmp4 = _mm256_packs_epi16(tmp3, _mm256_set1_epi16(0));
            __m128i res = _mm256_castsi256_si128(tmp4);
#else
            __m128i tmp_hi = _mm256_extractf128_si256(a, 1);
            __m128i tmp_lo = _mm256_castsi256_si128(a);
            tmp_hi = _mm_packs_epi32(tmp_hi, tmp_hi);
            tmp_lo = _mm_packs_epi32(tmp_lo, tmp_lo);
            __m128i res = _mm_unpacklo_epi64(tmp_lo, tmp_hi);
            res = _mm_packs_epi16(res, _mm_set1_epi16(0));
#endif
            return res;
        }

        inline __m128i xsimd_cvtepi32_epu8(__m256i a)
        {
#if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION
            __m256i tmp2 = _mm256_packs_epi32(a, a);
            __m256i tmp3 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(3, 1, 2, 0));
            __m256i tmp4 = _mm256_packus_epi16(tmp3, _mm256_set1_epi16(0));
            __m128i res = _mm256_castsi256_si128(tmp4);
#else
            __m128i tmp_hi = _mm256_extractf128_si256(a, 1);
            __m128i tmp_lo = _mm256_castsi256_si128(a);
            tmp_hi = _mm_packs_epi32(tmp_hi, tmp_hi);
            tmp_lo = _mm_packs_epi32(tmp_lo, tmp_lo);
            __m128i res = _mm_unpacklo_epi64(tmp_lo, tmp_hi);
            res = _mm_packus_epi16(res, _mm_set1_epi16(0));
#endif
            return res;
        }

        inline __m256i xsimd_cvtepi16_epi32(__m128i a)
        {
#if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION
            __m256i res = _mm256_cvtepi16_epi32(a);
#else
            // Sign-extend int16 -> int32 by interleaving with the sign mask.
            __m128i mask = _mm_cmplt_epi16(a, _mm_set1_epi16(0));
            __m128i tmp1 = _mm_unpacklo_epi16(a, mask);
            __m128i tmp2 = _mm_unpackhi_epi16(a, mask);
            __m256i res = _mm256_castsi128_si256(tmp1);
            res = _mm256_insertf128_si256(res, tmp2, 1);
#endif
            return res;
        }

        inline __m256i xsimd_cvtepu16_epi32(__m128i a)
        {
#if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION
            __m256i res = _mm256_cvtepu16_epi32(a);
#else
            __m128i tmp1 = _mm_unpacklo_epi16(a, _mm_set1_epi16(0));
            __m128i tmp2 = _mm_unpackhi_epi16(a, _mm_set1_epi16(0));
            __m256i res = _mm256_castsi128_si256(tmp1);
            res = _mm256_insertf128_si256(res, tmp2, 1);
#endif
            return res;
        }

        inline __m128i xsimd_cvtepi32_epi16(__m256i a)
        {
#if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION
            __m256i tmp1 = _mm256_packs_epi32(a, a);
            __m256i tmp2 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(3, 1, 2, 0));
            __m128i res = _mm256_castsi256_si128(tmp2);
#else
            __m128i tmp_hi = _mm256_extractf128_si256(a, 1);
            __m128i tmp_lo = _mm256_castsi256_si128(a);
            tmp_hi = _mm_packs_epi32(tmp_hi, tmp_hi);
            tmp_lo = _mm_packs_epi32(tmp_lo, tmp_lo);
            __m128i res = _mm_unpacklo_epi64(tmp_lo, tmp_hi);
#endif
            return res;
        }

        inline __m128i xsimd_cvtepi32_epu16(__m256i a)
        {
#if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION
            __m256i tmp1 = _mm256_packus_epi32(a, a);
            __m256i tmp2 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(3, 1, 2, 0));
            __m128i res = _mm256_castsi256_si128(tmp2);
#else
            __m128i tmp_hi = _mm256_extractf128_si256(a, 1);
            __m128i tmp_lo = _mm256_castsi256_si128(a);
            tmp_hi = _mm_packus_epi32(tmp_hi, tmp_hi);
            tmp_lo = _mm_packus_epi32(tmp_lo, tmp_lo);
            __m128i res = _mm_unpacklo_epi64(tmp_lo, tmp_hi);
#endif
            return res;
        }
    }
}

#endif
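
// A minimal compile-and-run sketch for these helpers, kept in a comment so
// the header itself is unchanged (illustrative only; assumes an AVX-capable
// build and that this header is on the include path):
//
//     #include <cstdint>
//     #include <cstdio>
//     #include <immintrin.h>
//     #include "xsimd_int_conversion.hpp"
//
//     int main()
//     {
//         __m128i in = _mm_setr_epi16(-1, 2, -3, 4, -5, 6, -7, 8);
//         __m256i out = xsimd::detail::xsimd_cvtepi16_epi32(in);
//         alignas(32) int32_t buf[8];
//         _mm256_store_si256(reinterpret_cast<__m256i*>(buf), out);
//         for (int i = 0; i < 8; ++i)
//         {
//             std::printf("%d ", buf[i]); // prints: -1 2 -3 4 -5 6 -7 8
//         }
//         return 0;
//     }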