/*
 * Copyright (C) 2014 the FFLAS-FFPACK group
 *
 * Written by   Bastien Vialla <bastien.vialla@lirmm.fr>
 *              Brice Boyer (briceboyer) <boyer.brice@gmail.com>
 *
 * ========LICENCE========
 * This file is part of the library FFLAS-FFPACK.
 *
 * FFLAS-FFPACK is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 * ========LICENCE========
 */

#ifndef __FFLASFFPACK_fflas_ffpack_utils_simd256_double_INL
#define __FFLASFFPACK_fflas_ffpack_utils_simd256_double_INL

#if not (defined(__FFLASFFPACK_HAVE_AVX_INSTRUCTIONS) or defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS))
#error "You need AVX instructions to perform 256-bit operations on double"
#endif

/*
 * Simd256 specialized for double
 */
template <> struct Simd256_impl<true, false, true, 8> : public Simd256fp_base {
    /*
     * alias to the 256-bit SIMD register type
     */
    using vect_t = __m256d;

    /*
     * define the scalar type corresponding to the specialization
     */
    using scalar_t = double;

    /*
     * number of scalar_t in a simd register
     */
    static const constexpr size_t vect_size = 4;

    /*
     * alignment required by a scalar_t pointer to be loaded into a vect_t
     */
    static const constexpr size_t alignment = 32;

    /*
     * Check if the pointer p is a multiple of alignment
     */
    template <class T> static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; }

    /*
     * Check if the number n is a multiple of vect_size
     */
    template <class T> static constexpr bool compliant(T n) { return n % vect_size == 0; }

    /*
     * Return vector of type vect_t with all elements set to zero.
     * Return [0,0,0,0]
     */
    static INLINE CONST vect_t zero() { return _mm256_setzero_pd(); }

    /*
     * Broadcast double-precision (64-bit) floating-point value x to all elements of vect_t.
     * Return [x,x,x,x]
     */
    static INLINE CONST vect_t set1(const scalar_t x) { return _mm256_set1_pd(x); }

    /*
     * Set packed double-precision (64-bit) floating-point elements in vect_t with the supplied values.
     * Return [x1,x2,x3,x4]
     */
    static INLINE CONST vect_t set(const scalar_t x1, const scalar_t x2, const scalar_t x3, const scalar_t x4) {
        return _mm256_set_pd(x4, x3, x2, x1);
    }
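
    /*
     * Note: _mm256_set_pd takes its arguments from the highest lane down, so the
     * call above is reversed on purpose: x1 lands in lane 0 and x4 in lane 3.
     * Illustrative example: storing set(1., 2., 3., 4.) to memory yields the
     * array {1., 2., 3., 4.}.
     */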

    /*
     * Gather double-precision (64-bit) floating-point elements with indexes idx[0], ..., idx[3] from the address p
     * into vect_t.
     * Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]]
     */
    template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) {
        return _mm256_set_pd(p[idx[3]], p[idx[2]], p[idx[1]], p[idx[0]]);
    }
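
    /*
     * Illustrative example: with p = {1., 2., 3., 4.} and idx = {3, 0, 2, 1},
     * gather(p, idx) returns [4., 1., 3., 2.].
     */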

    /*
     * Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into vect_t.
     * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
     * Return [p[0], p[1], p[2], p[3]]
     */
    static INLINE PURE vect_t load(const scalar_t *const p) { return _mm256_load_pd(p); }

    /*
     * Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into vect_t.
     * p does not need to be aligned on any particular boundary.
     * Return [p[0], p[1], p[2], p[3]]
     */
    static INLINE PURE vect_t loadu(const scalar_t *const p) { return _mm256_loadu_pd(p); }
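
    /*
     * A minimal sketch of how a caller might choose between the two loads, using
     * only members defined in this struct:
     *
     *   vect_t v = valid(p) ? load(p) : loadu(p);
     *
     * In hot loops one would rather align the data once and call load directly.
     */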

    /*
     * Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from v into memory at
     * address p.
     * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
     */
    static INLINE void store(const scalar_t *p, const vect_t v) { _mm256_store_pd(const_cast<scalar_t *>(p), v); }

    /*
     * Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from v into memory at
     * address p.
     * p does not need to be aligned on any particular boundary.
     */
    static INLINE void storeu(const scalar_t *p, const vect_t v) { _mm256_storeu_pd(const_cast<scalar_t *>(p), v); }

    /*
     * Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from v into memory using
     * a non-temporal memory hint.
     * p must be aligned on a 32-byte boundary or a general-protection exception may be generated.
     */
    static INLINE void stream(const scalar_t *p, const vect_t v) { _mm256_stream_pd(const_cast<scalar_t *>(p), v); }

    /*
     * Permute double-precision (64-bit) floating-point elements in a across the full 256-bit register using the
     * control in imm8, and store the results in dst.
     * Args   : [a0, a1, a2, a3] double
     * Return : [a[s[0..1]], a[s[2..3]], a[s[4..5]], a[s[6..7]]] double
     */
#if defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS)
    template<uint8_t s>
    static INLINE CONST vect_t shuffle(const vect_t a) {
        return _mm256_permute4x64_pd(a, s);
    }
#endif
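
    /*
     * Illustrative example: each 2-bit field of s selects one source element, so
     * shuffle<0x1B>(a) (0x1B = 0b00011011, i.e. fields 3,2,1,0) reverses the
     * register: [a0, a1, a2, a3] -> [a3, a2, a1, a0].
     */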

    /*
     * Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane
     * in a and b, and store the results in dst.
     * Args   : [a0, a1, a2, a3] double
     *          [b0, b1, b2, b3] double
     * Return : [a0, b0, a2, b2] double
     */
    static INLINE CONST vect_t unpacklo_twice(const vect_t a, const vect_t b) { return _mm256_unpacklo_pd(a, b); }

    /*
     * Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane
     * in a and b, and store the results in dst.
     * Args   : [a0, a1, a2, a3] double
     *          [b0, b1, b2, b3] double
     * Return : [a1, b1, a3, b3] double
     */
    static INLINE CONST vect_t unpackhi_twice(const vect_t a, const vect_t b) { return _mm256_unpackhi_pd(a, b); }

    /*
     * Blend packed double-precision (64-bit) floating-point elements from a and b using control mask s, and store the
     * results in dst: element i comes from b when bit i of s is set, from a otherwise.
     * Args   : [a0, a1, a2, a3] double
     *          [b0, b1, b2, b3] double
     * Return : [s[0]?b0:a0, s[1]?b1:a1, s[2]?b2:a2, s[3]?b3:a3] double
     */
    template<uint8_t s>
    static INLINE CONST vect_t blend(const vect_t a, const vect_t b) {
        return _mm256_blend_pd(a, b, s);
    }
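
    /*
     * Illustrative example: blend<0b1010>(a, b) selects b where the control bit
     * is set, yielding [a0, b1, a2, b3].
     */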

    /*
     * Blend packed double-precision (64-bit) floating-point elements from a and b using mask, and store the results
     * in dst: element i comes from b when the sign (most significant) bit of mask element i is set, from a otherwise.
     * Args   : [a0, a1, a2, a3] double
     *          [b0, b1, b2, b3] double
     * Return : [mask[63]?b0:a0, mask[127]?b1:a1, mask[191]?b2:a2, mask[255]?b3:a3] double
     */
    static INLINE CONST vect_t blendv(const vect_t a, const vect_t b, const vect_t mask) {
        return _mm256_blendv_pd(a, b, mask);
    }

    /*
     * Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [a0+b0, a1+b1, a2+b2, a3+b3]
     */
    static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm256_add_pd(a, b); }

    static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); }

    /*
     * Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit)
     * floating-point elements in a, and store the results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [a0-b0, a1-b1, a2-b2, a3-b3]
     */
    static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm256_sub_pd(a, b); }

    static INLINE CONST vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); }

    /*
     * Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [a0*b0, a1*b1, a2*b2, a3*b3]
     */
    static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return _mm256_mul_pd(a, b); }

    static INLINE CONST vect_t mulin(vect_t &a, const vect_t b) { return a = mul(a, b); }

    /*
     * Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b,
     * and store the results in dst.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [a0/b0, a1/b1, a2/b2, a3/b3]
     */
    static INLINE CONST vect_t div(const vect_t a, const vect_t b) { return _mm256_div_pd(a, b); }

    /*
     * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to
     * packed elements in c, and store the results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3]
     * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3]
     */
    static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) {
#ifdef __FMA__
        return _mm256_fmadd_pd(a, b, c);
#else
        return add(c, mul(a, b));
#endif
    }
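
    /*
     * Note: with __FMA__ the multiply and add are fused in one instruction with a
     * single rounding; the fallback add(c, mul(a, b)) rounds twice, so the two
     * paths can differ in the last bit. The same applies to fnmadd and fmsub below.
     */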

    static INLINE CONST vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); }

    /*
     * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate
     * result to packed elements in c, and store the results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3]
     * Return : [-(a0*b0)+c0, -(a1*b1)+c1, -(a2*b2)+c2, -(a3*b3)+c3]
     */
    static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) {
#ifdef __FMA__
        return _mm256_fnmadd_pd(a, b, c);
#else
        return sub(c, mul(a, b));
#endif
    }

    static INLINE CONST vect_t fnmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmadd(c, a, b); }

    /*
     * Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from
     * the intermediate result, and store the results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3]
     * Return : [a0*b0-c0, a1*b1-c1, a2*b2-c2, a3*b3-c3]
     */
    static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) {
#ifdef __FMA__
        return _mm256_fmsub_pd(a, b, c);
#else
        return sub(mul(a, b), c);
#endif
    }

    static INLINE CONST vect_t fmsubin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsub(c, a, b); }

    /*
     * Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results
     * in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [(a0==b0) ? 0xFFFFFFFFFFFFFFFF : 0,
     *           (a1==b1) ? 0xFFFFFFFFFFFFFFFF : 0,
     *           (a2==b2) ? 0xFFFFFFFFFFFFFFFF : 0,
     *           (a3==b3) ? 0xFFFFFFFFFFFFFFFF : 0]
     */
    static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); }

    /*
     * Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the
     * results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [(a0<b0) ? 0xFFFFFFFFFFFFFFFF : 0,
     *           (a1<b1) ? 0xFFFFFFFFFFFFFFFF : 0,
     *           (a2<b2) ? 0xFFFFFFFFFFFFFFFF : 0,
     *           (a3<b3) ? 0xFFFFFFFFFFFFFFFF : 0]
     */
    static INLINE CONST vect_t lesser(const vect_t a, const vect_t b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); }

    /*
     * Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store
     * the results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [(a0<=b0) ? 0xFFFFFFFFFFFFFFFF : 0,
     *           (a1<=b1) ? 0xFFFFFFFFFFFFFFFF : 0,
     *           (a2<=b2) ? 0xFFFFFFFFFFFFFFFF : 0,
     *           (a3<=b3) ? 0xFFFFFFFFFFFFFFFF : 0]
     */
    static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); }

    /*
     * Compare packed double-precision (64-bit) floating-point elements in a and b for greater-than, and store the
     * results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [(a0>b0) ? 0xFFFFFFFFFFFFFFFF : 0,
     *           (a1>b1) ? 0xFFFFFFFFFFFFFFFF : 0,
     *           (a2>b2) ? 0xFFFFFFFFFFFFFFFF : 0,
     *           (a3>b3) ? 0xFFFFFFFFFFFFFFFF : 0]
     */
    static INLINE CONST vect_t greater(const vect_t a, const vect_t b) { return _mm256_cmp_pd(a, b, _CMP_GT_OS); }

    /*
     * Compare packed double-precision (64-bit) floating-point elements in a and b for greater-than-or-equal, and store
     * the results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [(a0>=b0) ? 0xFFFFFFFFFFFFFFFF : 0,
     *           (a1>=b1) ? 0xFFFFFFFFFFFFFFFF : 0,
     *           (a2>=b2) ? 0xFFFFFFFFFFFFFFFF : 0,
     *           (a3>=b3) ? 0xFFFFFFFFFFFFFFFF : 0]
     */
    static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return _mm256_cmp_pd(a, b, _CMP_GE_OS); }
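
    /*
     * Note: eq uses the ordered, non-signaling predicate (_CMP_EQ_OQ), while the
     * inequality comparisons use ordered, signaling predicates (_CMP_*_OS); all
     * of them yield 0 (false) whenever either operand is NaN.
     */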

    /*
     * Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and store the
     * results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [a0 AND b0, a1 AND b1, a2 AND b2, a3 AND b3]
     */
    static INLINE CONST vect_t vand(const vect_t a, const vect_t b) { return _mm256_and_pd(a, b); }

    /*
     * Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b, and store the
     * results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [a0 OR b0, a1 OR b1, a2 OR b2, a3 OR b3]
     */
    static INLINE CONST vect_t vor(const vect_t a, const vect_t b) { return _mm256_or_pd(a, b); }

    /*
     * Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b, and store the
     * results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [a0 XOR b0, a1 XOR b1, a2 XOR b2, a3 XOR b3]
     */
    static INLINE CONST vect_t vxor(const vect_t a, const vect_t b) { return _mm256_xor_pd(a, b); }

    /*
     * Compute the bitwise NOT-AND (NOT of a, AND b) of packed double-precision (64-bit) floating-point elements in a
     * and b, and store the results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [NOT(a0) AND b0, NOT(a1) AND b1, NOT(a2) AND b2, NOT(a3) AND b3]
     */
    static INLINE CONST vect_t vandnot(const vect_t a, const vect_t b) { return _mm256_andnot_pd(a, b); }

    /*
     * Round the packed double-precision (64-bit) floating-point elements in a down to an integer value, and store the
     * results as packed double-precision floating-point elements in vect_t.
     * Args   : [a0, a1, a2, a3]
     * Return : [floor(a0), floor(a1), floor(a2), floor(a3)]
     */
    static INLINE CONST vect_t floor(const vect_t a) { return _mm256_floor_pd(a); }

    /*
     * Round the packed double-precision (64-bit) floating-point elements in a up to an integer value, and store the
     * results as packed double-precision floating-point elements in vect_t.
     * Args   : [a0, a1, a2, a3]
     * Return : [ceil(a0), ceil(a1), ceil(a2), ceil(a3)]
     */
    static INLINE CONST vect_t ceil(const vect_t a) { return _mm256_ceil_pd(a); }

    /*
     * Round the packed double-precision (64-bit) floating-point elements in a to the nearest integer (ties to even),
     * and store the results as packed double-precision floating-point elements in vect_t.
     * Args   : [a0, a1, a2, a3]
     * Return : [round(a0), round(a1), round(a2), round(a3)]
     */
    static INLINE CONST vect_t round(const vect_t a) {
        return _mm256_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    }

    /*
     * Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in a and b, and pack the
     * results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [a0+a1, b0+b1, a2+a3, b2+b3]
     */
    static INLINE CONST vect_t hadd(const vect_t a, const vect_t b) { return _mm256_hadd_pd(a, b); }

    /*
     * Horizontally add all double-precision (64-bit) floating-point elements in a.
     * Args   : [a0, a1, a2, a3]
     * Return : a0+a1+a2+a3
     */
    static INLINE CONST scalar_t hadd_to_scal(const vect_t a) {
        return ((const scalar_t *)&a)[0] + ((const scalar_t *)&a)[1] + ((const scalar_t *)&a)[2] +
               ((const scalar_t *)&a)[3];
    }
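
    /*
     * Note: the element accesses above type-pun the register through a scalar_t
     * pointer; this is accepted for __m256d by the compilers FFLAS-FFPACK targets,
     * though storing to a temporary array would be the strictly portable option.
     */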
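    /*
     * Reduce each element of C modulo P. This relies on the FLOAT_MOD and NORML_MOD helper macros defined elsewhere
     * in FFLAS-FFPACK: FLOAT_MOD performs the floating-point reduction of C by P using the precomputed inverse INVP
     * (Q receives the quotient), and NORML_MOD then adds P or NEGP where needed so that every element lies within
     * [MIN, MAX] (T is a scratch register).
     */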
    static INLINE vect_t mod(vect_t &C, const vect_t &P, const vect_t &INVP, const vect_t &NEGP, const vect_t &MIN,
                             const vect_t &MAX, vect_t &Q, vect_t &T) {
        FLOAT_MOD(C, P, INVP, Q);
        NORML_MOD(C, P, NEGP, MIN, MAX, Q, T);

        return C;
    }

};
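
/*
 * A minimal usage sketch (illustrative only, not part of the API), assuming
 * 32-byte aligned arrays whose length n is a multiple of vect_size:
 *
 *   using simd = Simd256_impl<true, false, true, 8>;
 *   void axpy(double *y, const double *x, double a, size_t n) {
 *       simd::vect_t va = simd::set1(a);
 *       for (size_t i = 0; i < n; i += simd::vect_size) {
 *           simd::vect_t vy = simd::load(y + i);
 *           vy = simd::fmadd(vy, va, simd::load(x + i)); // vy += a * x[i..i+3]
 *           simd::store(y + i, vy);
 *       }
 *   }
 */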

#endif // __FFLASFFPACK_fflas_ffpack_utils_simd256_double_INL
/* -*- mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
// vim:sts=4:sw=4:ts=4:et:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s