1 /* This file is part of the gf2x library.
2
3 Copyright 2010, 2012, 2013, 2015
4 Richard Brent, Pierrick Gaudry, Emmanuel Thome', Paul Zimmermann
5
6 This program is free software; you can redistribute it and/or modify it
7 under the terms of either:
8 - If the archive contains a file named toom-gpl.c (not a trivial
9 placeholder), the GNU General Public License as published by the Free
10 Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
12 - If the archive contains a file named toom-gpl.c which is a trivial
13 placeholder, the GNU Lesser General Public License as published by
14 the Free Software Foundation; either version 2.1 of the License, or
15 (at your option) any later version.
16
17 This program is distributed in the hope that it will be useful, but WITHOUT
18 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 FITNESS FOR A PARTICULAR PURPOSE. See the license text for more details.
20
21 You should have received a copy of the GNU General Public License as
22 well as the GNU Lesser General Public License along with this program;
23 see the files COPYING and COPYING.LIB. If not, write to the Free
24 Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 02110-1301, USA.
26 */
27
28 #ifndef GF2X_MUL9_H_
29 #define GF2X_MUL9_H_
30
31 #include "gf2x.h"
32 /* All gf2x source files for lowlevel functions must include gf2x-small.h
33 * This is mandatory for the tuning mechanism. */
34 #include "gf2x/gf2x-small.h"
35
36 #if GF2X_WORDSIZE != 64
37 #error "This code is for 64-bit only"
38 #endif
39
40 #ifndef GF2X_HAVE_PCLMUL_SUPPORT
41 #error "This code needs pclmul support"
42 #endif
43
44 #define PXOR(lop, rop) _mm_xor_si128((lop), (rop))
45 #define PXOR3(op1, op2, op3) PXOR(op1, PXOR(op2, op3))
46 #define PXOR4(op1, op2, op3, op4) PXOR(op1, PXOR3(op2, op3, op4))
47 #define PXOR5(op1, op2, op3, op4, op5) PXOR(op1, PXOR4(op2, op3, op4, op5))
48 #define PXOR6(op1, op2, op3, op4, op5, op6) PXOR(op1, PXOR5(op2, op3, op4, op5, op6))
49 #define PXOR7(op1, op2, op3, op4, op5, op6, op7) PXOR(op1, PXOR6(op2, op3, op4, op5, op6, op7))
50 #define PZERO _mm_setzero_si128()
51
52 /* variant with 30 multiplications */
53 GF2X_STORAGE_CLASS_mul9
gf2x_mul9(unsigned long * c,const unsigned long * a,const unsigned long * b)54 void gf2x_mul9 (unsigned long *c, const unsigned long *a, const unsigned long *b)
55 {
56 /* Taken from Cenk & Ozbudak 2009 */
57 /* We reserve one more to follow notations of the paper */
58 __m128i ab[9] = {
59 _gf2x_mm_setr_epi64(a[0], b[0]),
60 _gf2x_mm_setr_epi64(a[1], b[1]),
61 _gf2x_mm_setr_epi64(a[2], b[2]),
62 _gf2x_mm_setr_epi64(a[3], b[3]),
63 _gf2x_mm_setr_epi64(a[4], b[4]),
64 _gf2x_mm_setr_epi64(a[5], b[5]),
65 _gf2x_mm_setr_epi64(a[6], b[6]),
66 _gf2x_mm_setr_epi64(a[7], b[7]),
67 _gf2x_mm_setr_epi64(a[8], b[8]),
68 };
69 __m128i pab[30];
70
71 #if 0
72 pab[ 0] = ab[0]^ab[1]^ab[2]^ab[3]^ab[4]^ab[5]^ab[6]^ab[7]^ab[8];
73 pab[ 1] = ab[0]^ ab[2]^ ab[4]^ ab[6]^ ab[8];
74 pab[ 2] = ab[1]^ab[2]^ab[3]^ ab[5]^ ab[8];
75 pab[ 3] = ab[0]^ ab[2]^ab[3]^ ab[5]^ab[6]^ ab[8];
76 pab[ 4] = ab[0]^ab[1]^ ab[3]^ab[4]^ ab[6]^ab[7];
77 pab[ 5] = ab[0]^ ab[3]^ab[4]^ab[5]^ ab[7];
78 pab[ 6] = ab[1]^ab[2]^ab[3]^ ab[6]^ ab[8];
79 pab[ 7] = ab[2]^ ab[4]^ab[5]^ab[6];
80 pab[ 8] = ab[2]^ab[3]^ab[4]^ ab[6];
81 pab[ 9] = ab[1]^ ab[3]^ ab[5]^ ab[7];
82 pab[10] = ab[0]^ab[1]^ ab[4]^ ab[6]^ab[7]^ab[8];
83 pab[11] = ab[0]^ ab[3]^ ab[5]^ab[6]^ab[7];
84 pab[12] = ab[0]^ab[1]^ ab[4]^ab[5]^ ab[8];
85 pab[13] = ab[1]^ab[2]^ ab[4]^ab[5]^ ab[7]^ab[8];
86 pab[14] = ab[0]^ab[1]^ ab[3]^ ab[6]^ab[7]^ab[8];
87 pab[15] = ab[1]^ ab[3]^ab[4]^ab[5]^ ab[8];
88 pab[16] = ab[0]^ ab[2]^ab[3]^ab[4]^ ab[7];
89 pab[17] = ab[1]^ ab[4]^ab[5]^ab[6]^ ab[8];
90 pab[18] = ab[0]^ ab[2]^ ab[5]^ab[6]^ab[7];
91 pab[19] = ab[2]^ab[3]^ ab[6]^ab[7];
92 pab[20] = ab[6]^ ab[8];
93 pab[21] = ab[0]^ ab[2];
94 pab[22] = ab[0]^ab[1];
95 pab[23] = ab[0];
96 pab[24] = ab[1];
97 pab[25] = ab[7];
98 pab[26] = ab[7]^ab[8];
99 pab[27] = ab[6];
100 pab[28] = ab[8];
101 pab[29] = ab[2];
102 #else
103 /* same as above, but optimized with Maple's codegen[optimize] function
104 with 'tryhard' option: 89 XORs -> 46 XORs */
105 __m128i t51, t52, t53, t54, t55, t56, t57, t58, t59, t60, t61, t62,
106 t63, t64, t65, t66, t67, t68, t69, t70, t71, t72, t73, t74, t75;
107 t51 = ab[8];
108 t55 = ab[4];
109 t75 = PXOR(t51, t55);
110 t54 = ab[5];
111 t57 = ab[2];
112 t74 = PXOR(t54, t57);
113 t56 = ab[3];
114 t58 = ab[1];
115 t73 = PXOR(t56, t58);
116 t59 = ab[0];
117 t72 = PXOR(t59, t57);
118 t71 = PXOR(t58, t75);
119 t52 = ab[7];
120 t70 = PXOR3(t52, t56, t59);
121 t53 = ab[6];
122 t69 = PXOR3(t53, t56, t57);
123 t68 = PXOR(t53, t72);
124 t67 = PXOR(t54, t71);
125 t66 = PXOR(t59, t71);
126 t65 = PXOR3(t51, t57, t73);
127 t64 = PXOR(t53, t70);
128 t63 = PXOR(t55, t70);
129 t62 = PXOR(t54, t68);
130 t61 = PXOR(t58, t64);
131 t60 = PXOR(t55, t61);
132 pab[0] = PXOR3(t51, t60, t74);
133 pab[1] = PXOR(t68, t75);
134 pab[2] = PXOR(t54, t65);
135 pab[3] = PXOR3(t56, t51, t62);
136 pab[4] = t60;
137 pab[5] = PXOR(t54, t63);
138 pab[6] = PXOR(t53, t65);
139 pab[7] = PXOR3(t55, t53, t74);
140 pab[8] = PXOR(t55, t69);
141 pab[9] = PXOR3(t54, t52, t73);
142 pab[10] = PXOR3(t53, t52, t66);
143 pab[11] = PXOR(t54, t64);
144 pab[12] = PXOR(t54, t66);
145 pab[13] = PXOR3(t57, t52, t67);
146 pab[14] = PXOR(t51, t61);
147 pab[15] = PXOR(t56, t67);
148 pab[16] = PXOR(t57, t63);
149 pab[17] = PXOR(t53, t67);
150 pab[18] = PXOR(t52, t62);
151 pab[19] = PXOR(t52, t69);
152 pab[20] = PXOR(t53, t51);
153 pab[21] = t72;
154 pab[22] = PXOR(t59, t58);
155 pab[23] = t59;
156 pab[24] = t58;
157 pab[25] = t52;
158 pab[26] = PXOR(t52, t51);
159 pab[27] = t53;
160 pab[28] = t51;
161 pab[29] = t57;
162 #endif
163
164 int i;
165 for (i = 0; i < 30; ++i)
166 pab[i] = _mm_clmulepi64_si128(pab[i], pab[i], 0x10);
167
168 __m128i cc[17];
169
170 #if 0
171 cc[0 ] = pab[23];
172 cc[1 ] = pab[22]^pab[23]^pab[24];
173 cc[2 ] = pab[21]^pab[23]^pab[24]^pab[29];
174 cc[3 ] = pab[28]^pab[17]^pab[2]^pab[15]^pab[7]^pab[6]^pab[5]^pab[29]^pab[21]^pab[22]^pab[12]^pab[19]^pab[9]^pab[13]^pab[11]^pab[3]^pab[26]^pab[20]^pab[27];
175 cc[4 ] = pab[4]^pab[3]^pab[10]^pab[11]^pab[6]^pab[2]^pab[8]^pab[14]^pab[9]^pab[22]^pab[23]^pab[24]^pab[1]^pab[20]^pab[27]^pab[28]^pab[25];
176 cc[5 ] = pab[26]^pab[25]^pab[28]^pab[0]^pab[9]^pab[21]^pab[23]^pab[29]^pab[24]^pab[1]^pab[3]^pab[13]^pab[14]^pab[5]^pab[18]^pab[16]^pab[11]^pab[15];
177 cc[6 ] = pab[26]^pab[12]^pab[19]^pab[21]^pab[23]^pab[29]^pab[4]^pab[3]^pab[14]^pab[5]^pab[18]^pab[22]^pab[1]^pab[20]^pab[27];
178 cc[7 ] = pab[20]^pab[27]^pab[28]^pab[25]^pab[23]^pab[0]^pab[15]^pab[7]^pab[11]^pab[6]^pab[14]^pab[5]^pab[18];
179 cc[8 ] = pab[0]^pab[23]^pab[24]^pab[10]^pab[15]^pab[7]^pab[2]^pab[18]^pab[14]^pab[17]^pab[22]^pab[26]^pab[25]^pab[28];
180 cc[9 ] = pab[21]^pab[23]^pab[29]^pab[24]^pab[0]^pab[16]^pab[11]^pab[7]^pab[10]^pab[2]^pab[8]^pab[18]^pab[5]^pab[28];
181 cc[10] = pab[12]^pab[0]^pab[19]^pab[9]^pab[21]^pab[29]^pab[22]^pab[3]^pab[13]^pab[16]^pab[11]^pab[7]^pab[10]^pab[20]^pab[27]^pab[28]^pab[26];
182 cc[11] = pab[16]^pab[11]^pab[7]^pab[10]^pab[17]^pab[5]^pab[2]^pab[0]^pab[9]^pab[4]^pab[3]^pab[22]^pab[23]^pab[24]^pab[1]^pab[20]^pab[27]^pab[28]^pab[25];
183 cc[12] = pab[26]^pab[25]^pab[28]^pab[8]^pab[14]^pab[5]^pab[17]^pab[10]^pab[6]^pab[16]^pab[15]^pab[3]^pab[13]^pab[1]^pab[9]^pab[21]^pab[23]^pab[29]^pab[24];
184 cc[13] = pab[8]^pab[18]^pab[2]^pab[15]^pab[16]^pab[5]^pab[29]^pab[21]^pab[23]^pab[22]^pab[0]^pab[12]^pab[19]^pab[1]^pab[11]^pab[4]^pab[3]^pab[26]^pab[20]^pab[27];
185 cc[14] = pab[20]^pab[27]^pab[28]^pab[25];
186 cc[15] = pab[25]^pab[26]^pab[28];
187 cc[16] = pab[28];
188 #else
189 /* same as above, optimized with codegen[optimize] with 'tryhard' */
190 __m128i t100, t101, t102, t103, t104, t105, t106, t107, t108, t109, t110,
191 t111, t112, t113, t114, t115, t116, t117, t118, t119, t120, t121, t122,
192 t123, t124, t125, t126, t127, t128, t129, t130, t77, t79, t80, t82, t83,
193 t87, t88, t89, t90, t91, t92, t94, t95, t96, t97, t98, t99;
194 t82 = pab[23];
195 t87 = pab[18];
196 t130 = PXOR(t82, t87);
197 t77 = pab[28];
198 t98 = pab[7];
199 t129 = PXOR(t77, t98);
200 t79 = pab[26];
201 t83 = pab[22];
202 t128 = PXOR(t79, t83);
203 t90 = pab[15];
204 t91 = pab[14];
205 t127 = PXOR(t90, t91);
206 t97 = pab[8];
207 t99 = pab[6];
208 t126 = PXOR(t97, t99);
209 t100 = pab[5];
210 t125 = PXOR(t100, t90);
211 t117 = PXOR(pab[27], pab[20]);
212 t80 = pab[25];
213 t118 = PXOR(t77, t80);
214 t112 = PXOR(t117, t118);
215 t94 = pab[11];
216 t124 = PXOR(t112, t94);
217 t103 = pab[2];
218 t105 = pab[0];
219 t123 = PXOR(t103, t105);
220 t89 = pab[16];
221 t122 = PXOR3(t89, t94, t97);
222 t121 = PXOR3(t100, t105, t98);
223 t102 = pab[3];
224 t104 = pab[1];
225 t96 = pab[9];
226 t120 = PXOR3(t102, t104, t96);
227 t119 = PXOR(pab[29], pab[21]);
228 t116 = PXOR(pab[24], t82);
229 t115 = PXOR(t79, t118);
230 t114 = PXOR(t83, t116);
231 t113 = PXOR(t116, t119);
232 t95 = pab[10];
233 t111 = PXOR5(t87, t95, t116, t123, t129);
234 t110 = PXOR6(t102, pab[19], pab[12], t117, t119, t128);
235 t92 = pab[13];
236 t109 = PXOR5(t92, t94, t96, t110, t129);
237 t101 = pab[4];
238 t108 = PXOR5(t100, t101, t104, t110, t130);
239 t107 = PXOR6(t101, t103, t95, t114, t120, t124);
240 t106 = PXOR7(t89, t91, t92, t113, t115, t120, t125);
241 t88 = pab[17];
242 cc[0] = t82;
243 cc[1] = t114;
244 cc[2] = t113;
245 cc[3] = PXOR5(t88, t99, t103, t109, t125);
246 cc[4] = PXOR3(t91, t107, t126);
247 cc[5] = PXOR4(t87, t94, t105, t106);
248 cc[6] = PXOR(t91, t108);
249 cc[7] = PXOR5(t99, t121, t124, t127, t130);
250 cc[8] = PXOR5(t88, t80, t111, t127, t128);
251 cc[9] = PXOR4(t100, t111, t119, t122);
252 cc[10] = PXOR4(t89, t95, t105, t109);
253 cc[11] = PXOR4(t88, t89, t107, t121);
254 cc[12] = PXOR4(t88, t95, t106, t126);
255 cc[13] = PXOR4(t90, t108, t122, t123);
256 cc[14] = t112;
257 cc[15] = t115;
258 cc[16] = t77;
259 #endif
260
261 _mm_storeu_si128((__m128i*)(c), PXOR(cc[0], _mm_slli_si128(cc[1], 8)));
262 _mm_storeu_si128((__m128i*)(c+2), PXOR3(cc[2] , _mm_srli_si128(cc[1], 8), _mm_slli_si128(cc[3], 8)));
263 _mm_storeu_si128((__m128i*)(c+4), PXOR3(cc[4] , _mm_srli_si128(cc[3], 8), _mm_slli_si128(cc[5], 8)));
264 _mm_storeu_si128((__m128i*)(c+6), PXOR3(cc[6] , _mm_srli_si128(cc[5], 8), _mm_slli_si128(cc[7], 8)));
265 _mm_storeu_si128((__m128i*)(c+8), PXOR3(cc[8] , _mm_srli_si128(cc[7], 8), _mm_slli_si128(cc[9], 8)));
266 _mm_storeu_si128((__m128i*)(c+10), PXOR3(cc[10], _mm_srli_si128(cc[9], 8), _mm_slli_si128(cc[11], 8)));
267 _mm_storeu_si128((__m128i*)(c+12), PXOR3(cc[12], _mm_srli_si128(cc[11], 8), _mm_slli_si128(cc[13], 8)));
268 _mm_storeu_si128((__m128i*)(c+14), PXOR3(cc[14], _mm_srli_si128(cc[13], 8), _mm_slli_si128(cc[15], 8)));
269 _mm_storeu_si128((__m128i*)(c+16), PXOR(cc[16], _mm_srli_si128(cc[15], 8)));
270 #undef PXOR
271 #undef PXOR3
272 #undef PXOR4
273 #undef PXOR5
274 #undef PXOR6
275 #undef PXOR7
276 #undef PZERO
277 }
278
279 #endif /* GF2X_MUL9_H_ */
280