1 /* This file is part of the gf2x library.
2 
3    Copyright 2010, 2013, 2015
4    Richard Brent, Pierrick Gaudry, Emmanuel Thome', Paul Zimmermann
5 
6    This program is free software; you can redistribute it and/or modify it
7    under the terms of either:
8     - If the archive contains a file named toom-gpl.c (not a trivial
9     placeholder), the GNU General Public License as published by the Free
10     Software Foundation; either version 3 of the License, or (at your
11     option) any later version.
12     - If the archive contains a file named toom-gpl.c which is a trivial
13     placeholder, the GNU Lesser General Public License as published by
14     the Free Software Foundation; either version 2.1 of the License, or
15     (at your option) any later version.
16 
17    This program is distributed in the hope that it will be useful, but WITHOUT
18    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19    FITNESS FOR A PARTICULAR PURPOSE.  See the license text for more details.
20 
21    You should have received a copy of the GNU General Public License as
22    well as the GNU Lesser General Public License along with this program;
23    see the files COPYING and COPYING.LIB.  If not, write to the Free
24    Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25    02110-1301, USA.
26 */
27 
28 #ifndef GF2X_MUL6_H_
29 #define GF2X_MUL6_H_
30 
31 #include "gf2x.h"
32 /* All gf2x source files for lowlevel functions must include gf2x-small.h
33  * This is mandatory for the tuning mechanism. */
34 #include "gf2x/gf2x-small.h"
35 
36 #if GF2X_WORDSIZE != 64
37 #error "This code is for 64-bit only"
38 #endif
39 
40 #ifndef GF2X_HAVE_PCLMUL_SUPPORT
41 #error "This code needs pclmul support"
42 #endif
43 
44 /* TODO: if somebody comes up with a neat way to improve the interface so
45  * as to remove the false dependency on pclmul, that would be nice.
46  */
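/* The mul2 helper defined below is a specialized version: it receives its
 * operands by value as __m128i (which avoids loads), and it writes its
 * result through an __m128i pointer, i.e. it relies on the destination
 * being aligned, so that aligned stores are possible. */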
49 #define PXOR(lop, rop) _mm_xor_si128((lop), (rop))
50 #define PXOR3(op1, op2, op3) PXOR(op1, PXOR(op2, op3))
51 #define PXOR4(op1, op2, op3, op4) PXOR(op1, PXOR3(op2, op3, op4))
52 #define PZERO    _mm_setzero_si128()
53 
54 static inline void
GF2X_FUNC(mul6clk2_mul2)55 GF2X_FUNC(mul6clk2_mul2)(__m128i * t, __m128i ss1, __m128i ss2)
56 {
57     __m128i t00 = _mm_clmulepi64_si128(ss1, ss2, 0);
58     __m128i t11 = _mm_clmulepi64_si128(ss1, ss2, 0x11);
59     ss1 = PXOR(ss1, _mm_shuffle_epi32(ss1, _MM_SHUFFLE(1,0,3,2)));
60     ss2 = PXOR(ss2, _mm_shuffle_epi32(ss2, _MM_SHUFFLE(1,0,3,2)));
61     __m128i tk = PXOR(PXOR(t00, t11), _mm_clmulepi64_si128(ss1, ss2, 0));
62     t[0] = PXOR(t00, _mm_unpacklo_epi64(PZERO, tk));
63     t[1] = PXOR(t11, _mm_unpackhi_epi64(tk, PZERO));
64 }
65 
66 
67 /* variant with 6 calls to mul2, i.e., 18 multiplications */
68 GF2X_STORAGE_CLASS_mul6
gf2x_mul6(unsigned long * c,const unsigned long * a,const unsigned long * b)69 void gf2x_mul6 (unsigned long *c, const unsigned long *a, const unsigned long *b)
70 {
71     __m128i aa[3], bb[3];
72     __m128i p0[2], p1[2], p2[2];
73     __m128i pp0[2], pp1[2], pp2[2];
74     __m128i a0 = _mm_loadu_si128((__m128i*)(a));
75     __m128i a1 = _mm_loadu_si128((__m128i*)(a+2));
76     __m128i a2 = _mm_loadu_si128((__m128i*)(a+4));
77     __m128i b0 = _mm_loadu_si128((__m128i*)(b));
78     __m128i b1 = _mm_loadu_si128((__m128i*)(b+2));
79     __m128i b2 = _mm_loadu_si128((__m128i*)(b+4));
80     aa[0] = PXOR(a1, a2);
81     aa[1] = PXOR(a0, a2);
82     aa[2] = PXOR(a0, a1);
83     bb[0] = PXOR(b1, b2);
84     bb[1] = PXOR(b0, b2);
85     bb[2] = PXOR(b0, b1);
86     GF2X_FUNC(mul6clk2_mul2)(p0, a0, b0);
87     GF2X_FUNC(mul6clk2_mul2)(p1, a1, b1);
88     GF2X_FUNC(mul6clk2_mul2)(p2, a2, b2);
89     GF2X_FUNC(mul6clk2_mul2)(pp0, aa[0], bb[0]);
90     GF2X_FUNC(mul6clk2_mul2)(pp1, aa[1], bb[1]);
91     GF2X_FUNC(mul6clk2_mul2)(pp2, aa[2], bb[2]);
92     _mm_storeu_si128((__m128i*)(c + 0), p0[0]);
93     _mm_storeu_si128((__m128i*)(c + 2),
94             PXOR(PXOR3(p0[0], p1[0], pp2[0])       , p0[1]));
95     _mm_storeu_si128((__m128i*)(c + 4),
96             PXOR(PXOR4(p0[0], p1[0], p2[0], pp1[0]), PXOR3(p0[1], p1[1], pp2[1])));
97     _mm_storeu_si128((__m128i*)(c + 6),
98             PXOR(PXOR3(pp0[0], p1[0], p2[0])       , PXOR4(p0[1], p1[1], p2[1], pp1[1])));
99     _mm_storeu_si128((__m128i*)(c + 8),
100             PXOR(p2[0]                             , PXOR3(pp0[1], p1[1], p2[1])));
101     _mm_storeu_si128((__m128i*)(c + 10),                            p2[1]);
102 }
103 
104 #undef PXOR
105 #undef PXOR3
106 #undef PXOR4
107 #undef PZERO
108 #endif  /* GF2X_MUL6_H_ */
109