/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
* Martin Renou                                                             *
* Copyright (c) QuantStack                                                 *
*                                                                          *
* Distributed under the terms of the BSD 3-Clause License.                 *
*                                                                          *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_INT_CONVERSION_HPP
#define XSIMD_INT_CONVERSION_HPP

#include "xsimd_base.hpp"

namespace xsimd
{
    namespace detail
    {
        /************************************
         * conversion of 8 int8 <-> 8 int32 *
         ************************************/

        // a contains 8 int8 in its low half
        __m256i xsimd_cvtepi8_epi32(__m128i a);
        __m256i xsimd_cvtepu8_epi32(__m128i a);

        // Returns a vector containing 8 int8 in its low half
        __m128i xsimd_cvtepi32_epi8(__m256i a);
        __m128i xsimd_cvtepi32_epu8(__m256i a);

        /*************************************
         * conversion of 8 int16 <-> 8 int32 *
         *************************************/

        // a contains 8 int16
        __m256i xsimd_cvtepi16_epi32(__m128i a);
        __m256i xsimd_cvtepu16_epi32(__m128i a);

        // Returns a vector containing 8 int16
        __m128i xsimd_cvtepi32_epi16(__m256i a);
        __m128i xsimd_cvtepi32_epu16(__m256i a);

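        // Illustrative sketch only (not part of the API): these helpers widen
        // packed integers before 32-bit arithmetic and narrow the result back
        // afterwards. Assuming 8 int8 values loaded into the low half of an
        // __m128i, a round trip might look like:
        //
        //     __m128i bytes = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src));
        //     __m256i widened = xsimd_cvtepi8_epi32(bytes);    // 8 int32 lanes
        //     // ... operate on the 8 int32 lanes ...
        //     __m128i narrowed = xsimd_cvtepi32_epi8(widened); // back to 8 int8 (saturated)
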
        /******************
         * Implementation *
         ******************/

        inline __m256i xsimd_cvtepi8_epi32(__m128i a)
        {
#if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION
            __m256i res = _mm256_cvtepi8_epi32(a);
#else
            // Sign-extend manually: build a mask of 0xFF for negative lanes,
            // then interleave it with the input to widen 8 -> 16 -> 32 bits.
            __m128i mask = _mm_cmplt_epi8(a, _mm_set1_epi8(0));
            __m128i tmp1 = _mm_unpacklo_epi8(a, mask);
            mask = _mm_cmplt_epi16(tmp1, _mm_set1_epi16(0));
            __m128i tmp2 = _mm_unpacklo_epi16(tmp1, mask);
            __m128i tmp3 = _mm_unpackhi_epi16(tmp1, mask);
            __m256i res = _mm256_castsi128_si256(tmp2);
            res = _mm256_insertf128_si256(res, tmp3, 1);
#endif
            return res;
        }

        inline __m256i xsimd_cvtepu8_epi32(__m128i a)
        {
#if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION
            __m256i res = _mm256_cvtepu8_epi32(a);
#else
            __m128i tmp1 = _mm_unpacklo_epi8(a, _mm_set1_epi8(0));
            __m128i tmp2 = _mm_unpacklo_epi16(tmp1, _mm_set1_epi16(0));
            __m128i tmp3 = _mm_unpackhi_epi16(tmp1, _mm_set1_epi16(0));
            __m256i res = _mm256_castsi128_si256(tmp2);
            res = _mm256_insertf128_si256(res, tmp3, 1);
#endif
            return res;
        }

        inline __m128i xsimd_cvtepi32_epi8(__m256i a)
        {
#if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION
            // Narrow with signed saturation 32 -> 16 -> 8 bits, permuting the
            // 64-bit lanes so the results land in the low half of the register.
            __m256i tmp2 = _mm256_packs_epi32(a, a);
            __m256i tmp3 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(3, 1, 2, 0));
            __m256i tmp4 = _mm256_packs_epi16(tmp3, _mm256_set1_epi16(0));
            __m128i res = _mm256_castsi256_si128(tmp4);
#else
            __m128i tmp_hi = _mm256_extractf128_si256(a, 1);
            __m128i tmp_lo = _mm256_castsi256_si128(a);
            tmp_hi = _mm_packs_epi32(tmp_hi, tmp_hi);
            tmp_lo = _mm_packs_epi32(tmp_lo, tmp_lo);
            __m128i res = _mm_unpacklo_epi64(tmp_lo, tmp_hi);
            res = _mm_packs_epi16(res, _mm_set1_epi16(0));
#endif
            return res;
        }

        inline __m128i xsimd_cvtepi32_epu8(__m256i a)
        {
#if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION
            __m256i tmp2 = _mm256_packs_epi32(a, a);
            __m256i tmp3 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(3, 1, 2, 0));
            __m256i tmp4 = _mm256_packus_epi16(tmp3, _mm256_set1_epi16(0));
            __m128i res = _mm256_castsi256_si128(tmp4);
#else
            __m128i tmp_hi = _mm256_extractf128_si256(a, 1);
            __m128i tmp_lo = _mm256_castsi256_si128(a);
            tmp_hi = _mm_packs_epi32(tmp_hi, tmp_hi);
            tmp_lo = _mm_packs_epi32(tmp_lo, tmp_lo);
            __m128i res = _mm_unpacklo_epi64(tmp_lo, tmp_hi);
            res = _mm_packus_epi16(res, _mm_set1_epi16(0));
#endif
            return res;
        }

        inline __m256i xsimd_cvtepi16_epi32(__m128i a)
        {
#if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION
            __m256i res = _mm256_cvtepi16_epi32(a);
#else
            __m128i mask = _mm_cmplt_epi16(a, _mm_set1_epi16(0));
            __m128i tmp1 = _mm_unpacklo_epi16(a, mask);
            __m128i tmp2 = _mm_unpackhi_epi16(a, mask);
            __m256i res = _mm256_castsi128_si256(tmp1);
            res = _mm256_insertf128_si256(res, tmp2, 1);
#endif
            return res;
        }

        inline __m256i xsimd_cvtepu16_epi32(__m128i a)
        {
#if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION
            __m256i res = _mm256_cvtepu16_epi32(a);
#else
            __m128i tmp1 = _mm_unpacklo_epi16(a, _mm_set1_epi16(0));
            __m128i tmp2 = _mm_unpackhi_epi16(a, _mm_set1_epi16(0));
            __m256i res = _mm256_castsi128_si256(tmp1);
            res = _mm256_insertf128_si256(res, tmp2, 1);
#endif
            return res;
        }

        inline __m128i xsimd_cvtepi32_epi16(__m256i a)
        {
#if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION
            __m256i tmp1 = _mm256_packs_epi32(a, a);
            __m256i tmp2 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(3, 1, 2, 0));
            __m128i res = _mm256_castsi256_si128(tmp2);
#else
            __m128i tmp_hi = _mm256_extractf128_si256(a, 1);
            __m128i tmp_lo = _mm256_castsi256_si128(a);
            tmp_hi = _mm_packs_epi32(tmp_hi, tmp_hi);
            tmp_lo = _mm_packs_epi32(tmp_lo, tmp_lo);
            __m128i res = _mm_unpacklo_epi64(tmp_lo, tmp_hi);
#endif
            return res;
        }

        inline __m128i xsimd_cvtepi32_epu16(__m256i a)
        {
#if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION
            __m256i tmp1 = _mm256_packus_epi32(a, a);
            __m256i tmp2 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(3, 1, 2, 0));
            __m128i res = _mm256_castsi256_si128(tmp2);
#else
            __m128i tmp_hi = _mm256_extractf128_si256(a, 1);
            __m128i tmp_lo = _mm256_castsi256_si128(a);
            tmp_hi = _mm_packus_epi32(tmp_hi, tmp_hi);
            tmp_lo = _mm_packus_epi32(tmp_lo, tmp_lo);
            __m128i res = _mm_unpacklo_epi64(tmp_lo, tmp_hi);
#endif
            return res;
        }
    }
}

#endif