1 /* This file is part of the gf2x library.
2
3 Copyright 2010, 2012, 2013, 2015
4 Richard Brent, Pierrick Gaudry, Emmanuel Thome', Paul Zimmermann
5
6 This program is free software; you can redistribute it and/or modify it
7 under the terms of either:
8 - If the archive contains a file named toom-gpl.c (not a trivial
9 placeholder), the GNU General Public License as published by the Free
10 Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
12 - If the archive contains a file named toom-gpl.c which is a trivial
13 placeholder, the GNU Lesser General Public License as published by
14 the Free Software Foundation; either version 2.1 of the License, or
15 (at your option) any later version.
16
17 This program is distributed in the hope that it will be useful, but WITHOUT
18 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 FITNESS FOR A PARTICULAR PURPOSE. See the license text for more details.
20
21 You should have received a copy of the GNU General Public License as
22 well as the GNU Lesser General Public License along with this program;
23 see the files COPYING and COPYING.LIB. If not, write to the Free
24 Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 02110-1301, USA.
26 */
27
28 #ifndef GF2X_MUL7_H_
29 #define GF2X_MUL7_H_
30
31 #include "gf2x.h"
32 /* All gf2x source files for lowlevel functions must include gf2x-small.h
33 * This is mandatory for the tuning mechanism. */
34 #include "gf2x/gf2x-small.h"
35
36 #if GF2X_WORDSIZE != 64
37 #error "This code is for 64-bit only"
38 #endif
39
40 #ifndef GF2X_HAVE_PCLMUL_SUPPORT
41 #error "This code needs pclmul support"
42 #endif
43
44 #define PXOR(lop, rop) _mm_xor_si128((lop), (rop))
45 #define PXOR3(op1, op2, op3) PXOR(op1, PXOR(op2, op3))
46 #define PZERO _mm_setzero_si128()
47
48 /* TODO: if somebody comes up with a neat way to improve the interface so
49 * as to remove the false dependency on pclmul, that would be nice.
50 */
GF2X_FUNC(mul7k3_mul2)51 static inline void GF2X_FUNC(mul7k3_mul2)(__m128i *t, __m128i ss1,
52 __m128i ss2)
53 {
54 __m128i t00 = _mm_clmulepi64_si128(ss1, ss2, 0);
55 __m128i t11 = _mm_clmulepi64_si128(ss1, ss2, 0x11);
56 ss1 = PXOR(ss1, _mm_shuffle_epi32(ss1, _MM_SHUFFLE(1, 0, 3, 2)));
57 ss2 = PXOR(ss2, _mm_shuffle_epi32(ss2, _MM_SHUFFLE(1, 0, 3, 2)));
58 __m128i tk = PXOR(PXOR(t00, t11), _mm_clmulepi64_si128(ss1, ss2, 0));
59 t[0] = PXOR(t00, _mm_unpacklo_epi64(PZERO, tk));
60 t[1] = PXOR(t11, _mm_unpackhi_epi64(tk, PZERO));
61 }
62 static inline void
GF2X_FUNC(mul7k3_mul2b)63 GF2X_FUNC(mul7k3_mul2b)(__m128i *t, __m128i ss1, __m128i ss2, const unsigned long * sc)
64 {
65 __m128i t00 = _mm_clmulepi64_si128(ss1, ss2, 0);
66 ss1 = PXOR(ss1, _mm_shuffle_epi32(ss1, _MM_SHUFFLE(1, 0, 3, 2)));
67 ss2 = PXOR(ss2, _mm_shuffle_epi32(ss2, _MM_SHUFFLE(1, 0, 3, 2)));
68 __m128i c = _mm_loadu_si128((__m128i*)sc);
69 __m128i tk = PXOR(PXOR(t00, c), _mm_clmulepi64_si128(ss1, ss2, 0));
70 t[0] = PXOR(t00, _mm_unpacklo_epi64(PZERO, tk));
71 t[1] = PXOR(c, _mm_unpackhi_epi64(tk, PZERO));
72 }
73 static inline void
GF2X_FUNC(mul7k3_mul2c)74 GF2X_FUNC(mul7k3_mul2c)(__m128i *t, __m128i ss1, __m128i ss2, unsigned long *sc)
75 {
76 __m128i t00 = _mm_clmulepi64_si128(ss1, ss2, 0);
77 __m128i t11 = _mm_clmulepi64_si128(ss1, ss2, 0x11);
78 ss1 = PXOR(ss1, _mm_shuffle_epi32(ss1, _MM_SHUFFLE(1, 0, 3, 2)));
79 ss2 = PXOR(ss2, _mm_shuffle_epi32(ss2, _MM_SHUFFLE(1, 0, 3, 2)));
80 __m128i tk = PXOR(PXOR(t00, t11), _mm_clmulepi64_si128(ss1, ss2, 0));
81 _mm_storeu_si128((__m128i*)sc, t11);
82 t[0] = PXOR(t00, _mm_unpacklo_epi64(PZERO, tk));
83 t[1] = PXOR(t11, _mm_unpackhi_epi64(tk, PZERO));
84 }
85
86 /* specialized Karatsuba with 3 calls to mul2, i.e., 9 multiplications
87 {d,2} <- {a+3,1} * {b+3,1} */
88 GF2X_STORAGE_CLASS_mul4
GF2X_FUNC(mul7k3_mul4c)89 void GF2X_FUNC(mul7k3_mul4c) (unsigned long *c, const unsigned long *a, const unsigned long *b, unsigned long *d)
90 {
91 __m128i ab[2];
92 __m128i lo[2], hi[2];
93 __m128i a0 = _mm_loadu_si128((__m128i*)a);
94 __m128i a2 = _mm_loadu_si128((__m128i*)(a+2));
95 __m128i b0 = _mm_loadu_si128((__m128i*)b);
96 __m128i b2 = _mm_loadu_si128((__m128i*)(b+2));
97 GF2X_FUNC(mul7k3_mul2)(lo, a0, b0);
98 GF2X_FUNC(mul7k3_mul2c)(hi, a2, b2, d);
99 __m128i middle = PXOR(lo[1], hi[0]);
100 GF2X_FUNC(mul7k3_mul2)(ab, PXOR(a0, a2), PXOR(b0, b2));
101 _mm_storeu_si128((__m128i*)(c + 0), lo[0]);
102 _mm_storeu_si128((__m128i*)(c + 2), PXOR3(ab[0], lo[0], middle));
103 _mm_storeu_si128((__m128i*)(c + 4), PXOR3(ab[1], hi[1], middle));
104 _mm_storeu_si128((__m128i*)(c + 6), hi[1]);
105 }
106
107 /* specialized Karatsuba with 3 calls to mul2, i.e., 9 multiplications,
108 assume {d,2} = {a+3,1} * {b+3,1} */
109 GF2X_STORAGE_CLASS_mul4
GF2X_FUNC(mul7k3_mul4b)110 void GF2X_FUNC(mul7k3_mul4b) (unsigned long *c, const unsigned long *a, const unsigned long *b, unsigned long *d)
111 {
112 __m128i ab[2];
113 __m128i lo[2], hi[2];
114 __m128i a0 = _mm_loadu_si128((__m128i*)a);
115 __m128i a2 = _mm_loadu_si128((__m128i*)(a+2));
116 __m128i b0 = _mm_loadu_si128((__m128i*)b);
117 __m128i b2 = _mm_loadu_si128((__m128i*)(b+2));
118 GF2X_FUNC(mul7k3_mul2)(lo, a0, b0);
119 GF2X_FUNC(mul7k3_mul2b)(hi, a2, b2, d);
120 __m128i middle = PXOR(lo[1], hi[0]);
121 GF2X_FUNC(mul7k3_mul2)(ab, PXOR(a0, a2), PXOR(b0, b2));
122 _mm_storeu_si128((__m128i*)(c + 0), lo[0]);
123 _mm_storeu_si128((__m128i*)(c + 2), PXOR3(ab[0], lo[0], middle));
124 _mm_storeu_si128((__m128i*)(c + 4), PXOR3(ab[1], hi[1], middle));
125 _mm_storeu_si128((__m128i*)(c + 6), hi[1]);
126 }
127
128 #undef PXOR
129 #undef PXOR3
130 #undef PZERO
131
132 /* based on mul7k.c, version with M(3)+2M(4)-1=23 multiplications */
133 GF2X_STORAGE_CLASS_mul7
gf2x_mul7(unsigned long * c,const unsigned long * a,const unsigned long * b)134 void gf2x_mul7 (unsigned long *c, const unsigned long *a, const unsigned long *b)
135 {
136 unsigned long aa[4], bb[4], ab[8], ab4, ab5, ab6, ab7, d[2];
137
138 gf2x_mul3 (c+8, a+4, b+4);
139 GF2X_FUNC(mul7k3_mul4c) (c, a, b, d);
140 aa[0] = a[0] ^ a[4];
141 aa[1] = a[1] ^ a[5];
142 aa[2] = a[2] ^ a[6];
143 aa[3] = a[3];
144 bb[0] = b[0] ^ b[4];
145 bb[1] = b[1] ^ b[5];
146 bb[2] = b[2] ^ b[6];
147 bb[3] = b[3];
148 GF2X_FUNC(mul7k3_mul4b) (ab, aa, bb, d);
149 ab4 = ab[4] ^ c[4];
150 ab5 = ab[5] ^ c[5];
151 ab6 = ab[6] ^ c[6];
152 ab7 = ab[7] ^ c[7];
153 c[4] ^= ab[0] ^ c[0] ^ c[8];
154 c[5] ^= ab[1] ^ c[1] ^ c[9];
155 c[6] ^= ab[2] ^ c[2] ^ c[10];
156 c[7] ^= ab[3] ^ c[3] ^ c[11];
157 c[8] ^= ab4 ^ c[12];
158 c[9] ^= ab5 ^ c[13];
159 c[10] ^= ab6;
160 c[11] ^= ab7;
161 }
162
163 #endif /* GF2X_MUL7_H_ */
164