1 /* This file is part of the gf2x library.
2
3 Copyright 2010, 2012, 2013, 2015
4 Richard Brent, Pierrick Gaudry, Emmanuel Thome', Paul Zimmermann
5
6 This program is free software; you can redistribute it and/or modify it
7 under the terms of either:
8 - If the archive contains a file named toom-gpl.c (not a trivial
9 placeholder), the GNU General Public License as published by the Free
10 Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
12 - If the archive contains a file named toom-gpl.c which is a trivial
13 placeholder, the GNU Lesser General Public License as published by
14 the Free Software Foundation; either version 2.1 of the License, or
15 (at your option) any later version.
16
17 This program is distributed in the hope that it will be useful, but WITHOUT
18 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 FITNESS FOR A PARTICULAR PURPOSE. See the license text for more details.
20
21 You should have received a copy of the GNU General Public License as
22 well as the GNU Lesser General Public License along with this program;
23 see the files COPYING and COPYING.LIB. If not, write to the Free
24 Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 02110-1301, USA.
26 */
27
28 #ifndef GF2X_MUL9_H_
29 #define GF2X_MUL9_H_
30
31 #include "gf2x.h"
32 /* All gf2x source files for lowlevel functions must include gf2x-small.h
33 * This is mandatory for the tuning mechanism. */
34 #include "gf2x/gf2x-small.h"
35
36 #if GF2X_WORDSIZE != 64
37 #error "This code is for 64-bit only"
38 #endif
39
40 #ifndef GF2X_HAVE_PCLMUL_SUPPORT
41 #error "This code needs pclmul support"
42 #endif
43
44 /* TODO: if somebody comes up with a neat way to improve the interface so
45 * as to remove the false dependency on pclmul, that would be nice.
46 */
47 static inline __m128i
GF2X_FUNC(mul9clk2_mul1)48 GF2X_FUNC(mul9clk2_mul1) (unsigned long a, unsigned long b)
49 {
50 __m128i aa = _gf2x_mm_setr_epi64(a, 0);
51 __m128i bb = _gf2x_mm_setr_epi64(b, 0);
52 return _mm_clmulepi64_si128(aa, bb, 0);
53 }
54
55 #define PXOR(lop, rop) _mm_xor_si128((lop), (rop))
56 #define PXOR3(op1, op2, op3) PXOR(op1, PXOR(op2, op3))
57 #define PXOR4(op1, op2, op3, op4) PXOR(op1, PXOR3(op2, op3, op4))
58 #define PXOR5(op1, op2, op3, op4, op5) PXOR(op1, PXOR4(op2, op3, op4, op5))
59 #define PXOR6(op1, op2, op3, op4, op5, op6) PXOR(op1, PXOR5(op2, op3, op4, op5, op6))
60 #define PXOR7(op1, op2, op3, op4, op5, op6, op7) PXOR(op1, PXOR6(op2, op3, op4, op5, op6, op7))
61 #define PZERO _mm_setzero_si128()
62
63 /* uses the variant of Karatsuba with 6 multiplications */
GF2X_FUNC(mul9clk2_mul3)64 static void GF2X_FUNC(mul9clk2_mul3) (__m128i *ce, __m128i *co, const unsigned long *a, const unsigned long *b)
65 {
66 unsigned long aa[3], bb[3];
67 __m128i p0, p1, p2;
68 __m128i pp0, pp1, pp2;
69 aa[0] = a[1]^a[2];
70 aa[1] = a[0]^a[2];
71 aa[2] = a[0]^a[1];
72 bb[0] = b[1]^b[2];
73 bb[1] = b[0]^b[2];
74 bb[2] = b[0]^b[1];
75 p0 = GF2X_FUNC(mul9clk2_mul1)(a[0], b[0]);
76 p1 = GF2X_FUNC(mul9clk2_mul1)(a[1], b[1]);
77 p2 = GF2X_FUNC(mul9clk2_mul1)(a[2], b[2]);
78 pp0 = GF2X_FUNC(mul9clk2_mul1)(aa[0], bb[0]);
79 pp1 = GF2X_FUNC(mul9clk2_mul1)(aa[1], bb[1]);
80 pp2 = GF2X_FUNC(mul9clk2_mul1)(aa[2], bb[2]);
81
82 ce[0] = p0;
83 ce[1] = PXOR4(p0, p1, p2, pp1);
84 ce[2] = p2;
85
86 co[0] = PXOR3(p0, p1, pp2);
87 co[1] = PXOR3(pp0, p1, p2);
88 }
89
90 /* recursive application of the Karatsuba-3 algorithm with 6 multiplies,
91 i.e., with a total of 6*6=36 multiplies */
92 GF2X_STORAGE_CLASS_mul9
gf2x_mul9(unsigned long * c,const unsigned long * a,const unsigned long * b)93 void gf2x_mul9 (unsigned long *c, const unsigned long *a, const unsigned long *b)
94 {
95 unsigned long aa[9], bb[9];
96
97 /*
98 * a = al(t) + t^3 * am(t) + t^6 * ah(t)
99 * b = bl(t) + t^3 * bm(t) + t^6 * bh(t)
100 *
101 * aa0 = am + ah
102 * aa1 = al + ah
103 * aa2 = al + am
104 *
105 * p0e + z^64 * p0o = al * bl
106 * p1e + z^64 * p1o = am * bm
107 * p2e + z^64 * p2o = ah * bh
108 *
109 * aa0 * bb0 = (am*bm + ah*bh + am*bh + ah*bm)
110 * aa1 * bb1 = (al*bl + ah*bh + al*bh + ah*bl)
111 * aa2 * bb2 = (al*bl + am*bm + al*bm + am*bl)
112 *
113 * a*b = al*bl * t^0
114 * + (aa2*bb2 + al*bl + am*bm) * t^3
115 * + (am*bm + al*bl + ah*bh + aa1 * bb1) * t^6
116 * + (aa0*bb0 + ah*bh + am*bm) * t^9
117 * + ah*bh * t^12
118 */
119 aa[0] = a[3]^a[6];
120 aa[1] = a[4]^a[7];
121 aa[2] = a[5]^a[8];
122
123 aa[3] = a[0]^a[6];
124 aa[4] = a[1]^a[7];
125 aa[5] = a[2]^a[8];
126
127 aa[6] = a[0]^a[3];
128 aa[7] = a[1]^a[4];
129 aa[8] = a[2]^a[5];
130
131 bb[0] = b[3]^b[6];
132 bb[1] = b[4]^b[7];
133 bb[2] = b[5]^b[8];
134
135 bb[3] = b[0]^b[6];
136 bb[4] = b[1]^b[7];
137 bb[5] = b[2]^b[8];
138
139 bb[6] = b[0]^b[3];
140 bb[7] = b[1]^b[4];
141 bb[8] = b[2]^b[5];
142
143
144 __m128i p0e[3], p0o[2];
145 __m128i p1e[3], p1o[2];
146 __m128i p2e[3], p2o[2];
147 __m128i q0e[3], q0o[2];
148 __m128i q1e[3], q1o[2];
149 __m128i q2e[3], q2o[2];
150 GF2X_FUNC(mul9clk2_mul3) (p0e, p0o, a+0, b+0);
151 GF2X_FUNC(mul9clk2_mul3) (p1e, p1o, a+3, b+3);
152 GF2X_FUNC(mul9clk2_mul3) (p2e, p2o, a+6, b+6);
153 GF2X_FUNC(mul9clk2_mul3) (q0e, q0o, aa+0, bb+0);
154 GF2X_FUNC(mul9clk2_mul3) (q1e, q1o, aa+3, bb+3);
155 GF2X_FUNC(mul9clk2_mul3) (q2e, q2o, aa+6, bb+6);
156
157 __m128i e,h,l;
158 e = p0e[0];
159 l = p0o[0];
160 _mm_storeu_si128((__m128i*)(c), PXOR(e, _mm_slli_si128(l, 8)));
161
162 e = p0e[1]; h=l; l= PXOR4(p0o[1], p1e[0], q2e[0], p0e[0]);
163 _mm_storeu_si128((__m128i*)(c+2), PXOR3(e, _mm_slli_si128(l, 8), _mm_srli_si128(h, 8)));
164
165 e = PXOR4(p0e[2], p0o[0], p1o[0], q2o[0]); h=l; l= PXOR3(p1e[1], q2e[1], p0e[1]);
166 _mm_storeu_si128((__m128i*)(c+4), PXOR3(e, _mm_slli_si128(l, 8), _mm_srli_si128(h, 8)));
167
168 e = PXOR7(p0e[0], q1e[0], p2e[0], p1e[0], p0o[1], p1o[1], q2o[1]); h=l; l= PXOR7(p0o[0], q1o[0], p2o[0], p1o[0], p1e[2], q2e[2], p0e[2]);
169 _mm_storeu_si128((__m128i*)(c+6), PXOR3(e, _mm_slli_si128(l, 8), _mm_srli_si128(h, 8)));
170
171 e = PXOR4(p0e[1], q1e[1], p2e[1], p1e[1]); h=l; l= PXOR7(p0o[1], q1o[1], p2o[1], p1o[1], q0e[0], p1e[0], p2e[0]);
172 _mm_storeu_si128((__m128i*)(c+8), PXOR3(e, _mm_slli_si128(l, 8), _mm_srli_si128(h, 8)));
173
174 e = PXOR7(p0e[2], q1e[2], p2e[2], p1e[2], q0o[0], p1o[0], p2o[0]); h=l; l= PXOR3(q0e[1], p1e[1], p2e[1]);
175 _mm_storeu_si128((__m128i*)(c+10), PXOR3(e, _mm_slli_si128(l, 8), _mm_srli_si128(h, 8)));
176
177 e = PXOR4(p2e[0], q0o[1], p1o[1], p2o[1]); h=l; l= PXOR4(p2o[0], q0e[2], p1e[2], p2e[2]);
178 _mm_storeu_si128((__m128i*)(c+12), PXOR3(e, _mm_slli_si128(l, 8), _mm_srli_si128(h, 8)));
179
180 e = p2e[1]; h=l; l=p2o[1];
181 _mm_storeu_si128((__m128i*)(c+14), PXOR3(e, _mm_slli_si128(l, 8), _mm_srli_si128(h, 8)));
182
183 e = p2e[2]; h=l;
184 _mm_storeu_si128((__m128i*)(c+16), PXOR(e, _mm_srli_si128(h, 8)));
185
186 /*
187 * a*b = al*bl * t^0
188 * + (aa2*bb2 + al*bl + am*bm) * t^3
189 * + (am*bm + al*bl + ah*bh + aa1 * bb1) * t^6
190 * + (aa0*bb0 + ah*bh + am*bm) * t^9
191 * + ah*bh * t^12
192 *
193 * this is the 64-bit breakdown
194 c[0] = p0e[0] ^ ;
195 c[1] = p0e[1] ^ p0o[0] ;
196 c[2] = p0e[2] ^ p0o[1] ;
197 c[3] = p0e[3] ^ p0o[2]p1e[0]q2e[0]p0e[0];
198 c[4] = p0e[4]p0o[0]p1o[0]q2o[0] ^ p0o[3]p1e[1]q2e[1]p0e[1];
199 c[5] = p0e[5]p0o[1]p1o[1]q2o[1] ^ p1e[2]q2e[2]p0e[2];
200 c[6] = p0e[0]q1e[0]p2e[0]p1e[0]p0o[2]p1o[2]q2o[2] ^ p1e[3]q2e[3]p0e[3];
201 c[7] = p0e[1]q1e[1]p2e[1]p1e[1]p0o[3]p1o[3]q2o[3] ^ p0o[0]q1o[0]p2o[0]p1o[0]p1e[4]q2e[4]p0e[4];
202 c[8] = p0e[2]q1e[2]p2e[2]p1e[2] ^ p0o[1]q1o[1]p2o[1]p1o[1]p1e[5]q2e[5]p0e[5];
203 c[9] = p0e[3]q1e[3]p2e[3]p1e[3] ^ p0o[2]q1o[2]p2o[2]p1o[2]q0e[0]p1e[0]p2e[0];
204 c[10] = p0e[4]q1e[4]p2e[4]p1e[4]q0o[0]p1o[0]p2o[0] ^ p0o[3]q1o[3]p2o[3]p1o[3]q0e[1]p1e[1]p2e[1];
205 c[11] = p0e[5]q1e[5]p2e[5]p1e[5]q0o[1]p1o[1]p2o[1] ^ q0e[2]p1e[2]p2e[2];
206 c[12] = p2e[0] q0o[2]p1o[2]p2o[2] ^ q0e[3]p1e[3]p2e[3];
207 c[13] = p2e[1] q0o[3]p1o[3]p2o[3] ^ p2o[0]q0e[4]p1e[4]p2e[4];
208 c[14] = p2e[2] ^ p2o[1]q0e[5]p1e[5]p2e[5];
209 c[15] = p2e[3] ^ p2o[2] ;
210 c[16] = p2e[4] ^ p2o[3] ;
211 c[17] = p2e[5] ^ ;
212 */
213 }
214
215 #undef PXOR
216 #undef PXOR3
217 #undef PXOR4
218 #undef PXOR5
219 #undef PXOR6
220 #undef PXOR7
221 #undef PZERO
222 #endif /* GF2X_MUL9_H_ */
223