1/*
2 * Copyright (C) 2014 the FFLAS-FFPACK group
3 *
4 * Written by   Bastien Vialla<bastien.vialla@lirmm.fr>
5 * Brice Boyer (briceboyer) <boyer.brice@gmail.com>
6 *
7 *
8 * ========LICENCE========
9 * This file is part of the library FFLAS-FFPACK.
10 *
11 * FFLAS-FFPACK is free software: you can redistribute it and/or modify
12 * it under the terms of the  GNU Lesser General Public
13 * License as published by the Free Software Foundation; either
14 * version 2.1 of the License, or (at your option) any later version.
15 *
16 * This library is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19 * Lesser General Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with this library; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
24 * ========LICENCE========
25 *.
26 */
27
28#ifndef __FFLASFFPACK_fflas_ffpack_utils_simd256_INL
29#define __FFLASFFPACK_fflas_ffpack_utils_simd256_INL
30
struct Simd256fp_base {
#if defined(__FFLASFFPACK_HAVE_AVX_INSTRUCTIONS)

    /// Name of the Simd struct.
    static inline const std::string type_string () {
        return std::string("Simd256");
    }

    /**
     * Assemble a 256-bit vector of doubles out of 128-bit lanes taken from a and b.
     * The template argument s is passed unchanged as the imm8 control of
     * _mm256_permute2f128_pd.
     * Args   : [a0, a1]   (two 128-bit lanes)
     *          [b0, b1]   (two 128-bit lanes)
     * Return : [s[0..3]?a0:a1:b0:b1, s[4..7]?a0:a1:b0:b1]
     */
    template<int s>
    static INLINE CONST __m256d permute128(const __m256d a, const __m256d b) {
        const __m256d shuffled = _mm256_permute2f128_pd(a, b, s);
        return shuffled;
    }

    /**
     * Single-precision counterpart of permute128: same lane selection,
     * performed with _mm256_permute2f128_ps.
     * Args   : [a0, a1]   (two 128-bit lanes)
     *          [b0, b1]   (two 128-bit lanes)
     * Return : [s[0..3]?a0:a1:b0:b1, s[4..7]?a0:a1:b0:b1]
     */
    template<int s>
    static INLINE CONST __m256 permute128(const __m256 a, const __m256 b) {
        const __m256 shuffled = _mm256_permute2f128_ps(a, b, s);
        return shuffled;
    }

    /**
     * Combine the low 128-bit lanes of a and b.
     * Args   : [a0, a1]   (two 128-bit lanes)
     *          [b0, b1]   (two 128-bit lanes)
     * Return : [a0, b0]
     */
    static INLINE CONST __m256d unpacklo128(const __m256d a, const __m256d b) {
        // Control 0x20: low nibble 0 picks a's low lane, high nibble 2 picks b's low lane.
        return _mm256_permute2f128_pd(a, b, 0x20);
    }
    static INLINE CONST __m256 unpacklo128(const __m256 a, const __m256 b) {
        // Control 0x20: low nibble 0 picks a's low lane, high nibble 2 picks b's low lane.
        return _mm256_permute2f128_ps(a, b, 0x20);
    }

    /**
     * Combine the high 128-bit lanes of a and b.
     * Args   : [a0, a1]   (two 128-bit lanes)
     *          [b0, b1]   (two 128-bit lanes)
     * Return : [a1, b1]
     */
    static INLINE CONST __m256d unpackhi128(const __m256d a, const __m256d b) {
        // Control 0x31: low nibble 1 picks a's high lane, high nibble 3 picks b's high lane.
        return _mm256_permute2f128_pd(a, b, 0x31);
    }
    static INLINE CONST __m256 unpackhi128(const __m256 a, const __m256 b) {
        // Control 0x31: low nibble 1 picks a's high lane, high nibble 3 picks b's high lane.
        return _mm256_permute2f128_ps(a, b, 0x31);
    }

#endif
};
73
74struct Simd256i_base {
75
76    /*
77     * alias to 256 bit simd register
78     */
79    using vect_t = __m256i;
80
81    /* Name of the Simd struct */
82    static inline const std::string type_string () { return "Simd256"; }
83
84    /*
85     *  Return vector of type vect_t with all elements set to zero
86     *  Return [0, ...,0]
87     */
88    static INLINE CONST vect_t zero() { return _mm256_setzero_si256(); }
89
90#if defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS)
91
92    // CLANG < 3.8 does not implement m256_bslli_epi128 nor _mmm256_bsrli_epi128
93#if defined(__clang__)
94#if  __clang_major < 3 || (__clang_major__  == 3 && __clang_minor__ < 8)
95#define _mm256_bslli_epi128(a, count) _mm256_slli_si256((a), (count))
96#define _mm256_bsrli_epi128(a, count) _mm256_srli_si256((a), (count))
97#endif
98#endif
99
100    /*
101     * Shift packed 128-bit integers in a left by s bits while shifting in zeros, and store the results in vect_t.
102     * Args   : [a0, a1] int128_t
103     * Return : [a0 << (s*8), a1 << (s*8)] int128_t
104     */
105    template<uint8_t s>
106    static INLINE CONST vect_t sll128(const vect_t a) { return _mm256_bslli_epi128(a, s); }
107
108    /*
109     * Shift packed 128-bit integers in a right by s while shifting in zeros, and store the results in vect_t.
110     * Args   : [a0, a1] int128_t
111     * Return : [a0 << (s*8), a1 << (s*8)] int128_t
112     */
113    template<uint8_t s>
114    static INLINE CONST vect_t srl128(const vect_t a) { return _mm256_bsrli_epi128(a, s); }
115
116    /*
117     * Compute the bitwise AND and store the results in vect_t.
118     * Args   : [a0, ..., a255]
119     *		   [b0, ..., b255]
120     * Return : [a0 AND b0, ..., a255 AND b255]
121     */
122    static INLINE CONST vect_t vand(const vect_t a, const vect_t b) { return _mm256_and_si256(b, a); }
123
124    /*
125     * Compute the bitwise OR and store the results in vect_t.
126     * Args   : [a0, ..., a255]
127     *		   [b0, ..., b255]
128     * Return : [a0 OR b0, ..., a255 OR b255]
129     */
130    static INLINE CONST vect_t vor(const vect_t a, const vect_t b) { return _mm256_or_si256(b, a); }
131
132    /*
133     * Compute the bitwise XOR and store the results in vect_t.
134     * Args   : [a0, ..., a255]
135     *		   [b0, ..., b255]
136     * Return : [a0 XOR b0, ..., a255 XOR b255]
137     */
138    static INLINE CONST vect_t vxor(const vect_t a, const vect_t b) { return _mm256_xor_si256(b, a); }
139
140    /*
141     * Compute the bitwise NOT AND and store the results in vect_t.
142     * Args   : [a0, ..., a255]
143     *		   [b0, ..., b255]
144     * Return : [(NOT a0) AND b0, ..., (NOT a255) AND b255]
145     */
146    static INLINE CONST vect_t vandnot(const vect_t a, const vect_t b) { return _mm256_andnot_si256(a, b); }
147
148    /*
149     * Shuffle 128-bit integers in a and b using the control in imm8, and store the results in dst.
150     * Args   :	[a0, a1] int128_t
151     *			[b0, b1] int128_t
152     * Return : [s[0..3]?a0:a1:b0:b1, s[4..7]?a0:a1:b0:b1] int128_t
153     */
154    template<int s>
155    static INLINE CONST vect_t permute128(const vect_t a, const vect_t b) {
156        return _mm256_permute2x128_si256(a, b, s);
157    }
158
159    /*
160     * Unpack and interleave 128-bit integers from the low half of a and b, and store the results in dst.
161     * Args   : [a0, a1] int128_t
162     [b0, b1] int128_t
163     * Return : [a0, b0] int128_t
164     */
165    static INLINE CONST vect_t unpacklo128(const vect_t a, const vect_t b) { return permute128<0x20>(a, b); }
166
167    /*
168     * Unpack and interleave 128-bit integers from the high half of a and b, and store the results in dst.
169     * Args   : [a0, a1] int128_t
170     [b0, b1] int128_t
171     * Return : [a1, b1] int128_t
172     */
173    static INLINE CONST vect_t unpackhi128(const vect_t a, const vect_t b) { return permute128<0x31>(a, b); }
174#endif
175};
176
/*
 * Implementation struct for 256-bit SIMD, keyed on four compile-time
 * properties of the element type: whether it is arithmetic, whether it is
 * integral, whether it is signed, and its size in bytes.
 * Only declared here; the specializations are presumably supplied by the
 * simd256_*.inl files included further down (not verified from this file).
 */
template <bool IsArithmetic, bool IsIntegral, bool IsSigned, int ByteSize>
struct Simd256_impl;

/*
 * Simd256<T> is the Simd256_impl instantiation whose four arguments are
 * computed from the element type T with the standard type traits and sizeof.
 */
template <typename T>
using Simd256 = Simd256_impl<std::is_arithmetic<T>::value,
                             std::is_integral<T>::value,
                             std::is_signed<T>::value,
                             sizeof(T)>;
182
183#include "simd256_float.inl"
184#include "simd256_double.inl"
185
186#ifdef SIMD_INT
// Too many missing instructions on int8_t
188
189#if defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS)
190#ifdef __x86_64__
191#include "simd256_int64.inl"
192#endif
193#include "simd256_int32.inl"
194#include "simd256_int16.inl"
195#endif
196
197#endif //#ifdef SIMD_INT
198
199
200#endif // __FFLASFFPACK_fflas_ffpack_utils_simd256_INL
201/* -*- mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
202// vim:sts=4:sw=4:ts=4:et:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
203