1 /* This file is part of the gf2x library.
2 
3    Copyright 2010, 2012, 2013, 2015
4    Richard Brent, Pierrick Gaudry, Emmanuel Thome', Paul Zimmermann
5 
6    This program is free software; you can redistribute it and/or modify it
7    under the terms of either:
8     - If the archive contains a file named toom-gpl.c (not a trivial
9     placeholder), the GNU General Public License as published by the Free
10     Software Foundation; either version 3 of the License, or (at your
11     option) any later version.
12     - If the archive contains a file named toom-gpl.c which is a trivial
13     placeholder, the GNU Lesser General Public License as published by
14     the Free Software Foundation; either version 2.1 of the License, or
15     (at your option) any later version.
16 
17    This program is distributed in the hope that it will be useful, but WITHOUT
18    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19    FITNESS FOR A PARTICULAR PURPOSE.  See the license text for more details.
20 
21    You should have received a copy of the GNU General Public License as
22    well as the GNU Lesser General Public License along with this program;
23    see the files COPYING and COPYING.LIB.  If not, write to the Free
24    Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25    02110-1301, USA.
26 */
27 
28 #ifndef GF2X_MUL9_H_
29 #define GF2X_MUL9_H_
30 
31 #include "gf2x.h"
32 /* All gf2x source files for lowlevel functions must include gf2x-small.h
33  * This is mandatory for the tuning mechanism. */
34 #include "gf2x/gf2x-small.h"
35 
36 #if GF2X_WORDSIZE != 64
37 #error "This code is for 64-bit only"
38 #endif
39 
40 #ifndef GF2X_HAVE_PCLMUL_SUPPORT
41 #error "This code needs pclmul support"
42 #endif
43 
44 #define PXOR(lop, rop) _mm_xor_si128((lop), (rop))
45 #define PXOR3(op1, op2, op3) PXOR(op1, PXOR(op2, op3))
46 #define PXOR4(op1, op2, op3, op4) PXOR(op1, PXOR3(op2, op3, op4))
47 #define PXOR5(op1, op2, op3, op4, op5) PXOR(op1, PXOR4(op2, op3, op4, op5))
48 #define PXOR6(op1, op2, op3, op4, op5, op6) PXOR(op1, PXOR5(op2, op3, op4, op5, op6))
49 #define PXOR7(op1, op2, op3, op4, op5, op6, op7) PXOR(op1, PXOR6(op2, op3, op4, op5, op6, op7))
50 #define PZERO    _mm_setzero_si128()
51 
52 /* variant with 30 multiplications */
53 GF2X_STORAGE_CLASS_mul9
gf2x_mul9(unsigned long * c,const unsigned long * a,const unsigned long * b)54 void gf2x_mul9 (unsigned long *c, const unsigned long *a, const unsigned long *b)
55 {
56     /* Taken from Cenk & Ozbudak 2009 */
57     /* We reserve one more to follow notations of the paper */
58     __m128i ab[9] = {
59         _gf2x_mm_setr_epi64(a[0], b[0]),
60         _gf2x_mm_setr_epi64(a[1], b[1]),
61         _gf2x_mm_setr_epi64(a[2], b[2]),
62         _gf2x_mm_setr_epi64(a[3], b[3]),
63         _gf2x_mm_setr_epi64(a[4], b[4]),
64         _gf2x_mm_setr_epi64(a[5], b[5]),
65         _gf2x_mm_setr_epi64(a[6], b[6]),
66         _gf2x_mm_setr_epi64(a[7], b[7]),
67         _gf2x_mm_setr_epi64(a[8], b[8]),
68     };
69     __m128i pab[30];
70 
71 #if 0
72     pab[ 0] = ab[0]^ab[1]^ab[2]^ab[3]^ab[4]^ab[5]^ab[6]^ab[7]^ab[8];
73     pab[ 1] = ab[0]^      ab[2]^      ab[4]^      ab[6]^      ab[8];
74     pab[ 2] =       ab[1]^ab[2]^ab[3]^      ab[5]^            ab[8];
75     pab[ 3] = ab[0]^      ab[2]^ab[3]^      ab[5]^ab[6]^      ab[8];
76     pab[ 4] = ab[0]^ab[1]^      ab[3]^ab[4]^      ab[6]^ab[7];
77     pab[ 5] = ab[0]^            ab[3]^ab[4]^ab[5]^      ab[7];
78     pab[ 6] =       ab[1]^ab[2]^ab[3]^            ab[6]^      ab[8];
79     pab[ 7] =             ab[2]^      ab[4]^ab[5]^ab[6];
80     pab[ 8] =             ab[2]^ab[3]^ab[4]^      ab[6];
81     pab[ 9] =       ab[1]^      ab[3]^      ab[5]^      ab[7];
82     pab[10] = ab[0]^ab[1]^            ab[4]^      ab[6]^ab[7]^ab[8];
83     pab[11] = ab[0]^            ab[3]^      ab[5]^ab[6]^ab[7];
84     pab[12] = ab[0]^ab[1]^            ab[4]^ab[5]^            ab[8];
85     pab[13] =       ab[1]^ab[2]^      ab[4]^ab[5]^      ab[7]^ab[8];
86     pab[14] = ab[0]^ab[1]^      ab[3]^            ab[6]^ab[7]^ab[8];
87     pab[15] =       ab[1]^      ab[3]^ab[4]^ab[5]^            ab[8];
88     pab[16] = ab[0]^      ab[2]^ab[3]^ab[4]^            ab[7];
89     pab[17] =       ab[1]^            ab[4]^ab[5]^ab[6]^      ab[8];
90     pab[18] = ab[0]^      ab[2]^            ab[5]^ab[6]^ab[7];
91     pab[19] =             ab[2]^ab[3]^            ab[6]^ab[7];
92     pab[20] =                                     ab[6]^      ab[8];
93     pab[21] = ab[0]^      ab[2];
94     pab[22] = ab[0]^ab[1];
95     pab[23] = ab[0];
96     pab[24] =       ab[1];
97     pab[25] =                                           ab[7];
98     pab[26] =                                           ab[7]^ab[8];
99     pab[27] =                                     ab[6];
100     pab[28] =                                                 ab[8];
101     pab[29] =             ab[2];
102 #else
103     /* same as above, but optimized with Maple's codegen[optimize] function
104        with 'tryhard' option: 89 XORs -> 46 XORs */
105     __m128i t51, t52, t53, t54, t55, t56, t57, t58, t59, t60, t61, t62,
106       t63, t64, t65, t66, t67, t68, t69, t70, t71, t72, t73, t74, t75;
107     t51 = ab[8];
108     t55 = ab[4];
109     t75 = PXOR(t51, t55);
110     t54 = ab[5];
111     t57 = ab[2];
112     t74 = PXOR(t54, t57);
113     t56 = ab[3];
114     t58 = ab[1];
115     t73 = PXOR(t56, t58);
116     t59 = ab[0];
117     t72 = PXOR(t59, t57);
118     t71 = PXOR(t58, t75);
119     t52 = ab[7];
120     t70 = PXOR3(t52, t56, t59);
121     t53 = ab[6];
122     t69 = PXOR3(t53, t56, t57);
123     t68 = PXOR(t53, t72);
124     t67 = PXOR(t54, t71);
125     t66 = PXOR(t59, t71);
126     t65 = PXOR3(t51, t57, t73);
127     t64 = PXOR(t53, t70);
128     t63 = PXOR(t55, t70);
129     t62 = PXOR(t54, t68);
130     t61 = PXOR(t58, t64);
131     t60 = PXOR(t55, t61);
132     pab[0] = PXOR3(t51, t60, t74);
133     pab[1] = PXOR(t68, t75);
134     pab[2] = PXOR(t54, t65);
135     pab[3] = PXOR3(t56, t51, t62);
136     pab[4] = t60;
137     pab[5] = PXOR(t54, t63);
138     pab[6] = PXOR(t53, t65);
139     pab[7] = PXOR3(t55, t53, t74);
140     pab[8] = PXOR(t55, t69);
141     pab[9] = PXOR3(t54, t52, t73);
142     pab[10] = PXOR3(t53, t52, t66);
143     pab[11] = PXOR(t54, t64);
144     pab[12] = PXOR(t54, t66);
145     pab[13] = PXOR3(t57, t52, t67);
146     pab[14] = PXOR(t51, t61);
147     pab[15] = PXOR(t56, t67);
148     pab[16] = PXOR(t57, t63);
149     pab[17] = PXOR(t53, t67);
150     pab[18] = PXOR(t52, t62);
151     pab[19] = PXOR(t52, t69);
152     pab[20] = PXOR(t53, t51);
153     pab[21] = t72;
154     pab[22] = PXOR(t59, t58);
155     pab[23] = t59;
156     pab[24] = t58;
157     pab[25] = t52;
158     pab[26] = PXOR(t52, t51);
159     pab[27] = t53;
160     pab[28] = t51;
161     pab[29] = t57;
162 #endif
163 
164     int i;
165     for (i = 0; i < 30; ++i)
166         pab[i] = _mm_clmulepi64_si128(pab[i], pab[i], 0x10);
167 
168     __m128i cc[17];
169 
170 #if 0
171     cc[0 ] = pab[23];
172     cc[1 ] = pab[22]^pab[23]^pab[24];
173     cc[2 ] = pab[21]^pab[23]^pab[24]^pab[29];
174     cc[3 ] = pab[28]^pab[17]^pab[2]^pab[15]^pab[7]^pab[6]^pab[5]^pab[29]^pab[21]^pab[22]^pab[12]^pab[19]^pab[9]^pab[13]^pab[11]^pab[3]^pab[26]^pab[20]^pab[27];
175     cc[4 ] = pab[4]^pab[3]^pab[10]^pab[11]^pab[6]^pab[2]^pab[8]^pab[14]^pab[9]^pab[22]^pab[23]^pab[24]^pab[1]^pab[20]^pab[27]^pab[28]^pab[25];
176     cc[5 ] = pab[26]^pab[25]^pab[28]^pab[0]^pab[9]^pab[21]^pab[23]^pab[29]^pab[24]^pab[1]^pab[3]^pab[13]^pab[14]^pab[5]^pab[18]^pab[16]^pab[11]^pab[15];
177     cc[6 ] = pab[26]^pab[12]^pab[19]^pab[21]^pab[23]^pab[29]^pab[4]^pab[3]^pab[14]^pab[5]^pab[18]^pab[22]^pab[1]^pab[20]^pab[27];
178     cc[7 ] = pab[20]^pab[27]^pab[28]^pab[25]^pab[23]^pab[0]^pab[15]^pab[7]^pab[11]^pab[6]^pab[14]^pab[5]^pab[18];
179     cc[8 ] = pab[0]^pab[23]^pab[24]^pab[10]^pab[15]^pab[7]^pab[2]^pab[18]^pab[14]^pab[17]^pab[22]^pab[26]^pab[25]^pab[28];
180     cc[9 ] = pab[21]^pab[23]^pab[29]^pab[24]^pab[0]^pab[16]^pab[11]^pab[7]^pab[10]^pab[2]^pab[8]^pab[18]^pab[5]^pab[28];
181     cc[10] = pab[12]^pab[0]^pab[19]^pab[9]^pab[21]^pab[29]^pab[22]^pab[3]^pab[13]^pab[16]^pab[11]^pab[7]^pab[10]^pab[20]^pab[27]^pab[28]^pab[26];
182     cc[11] = pab[16]^pab[11]^pab[7]^pab[10]^pab[17]^pab[5]^pab[2]^pab[0]^pab[9]^pab[4]^pab[3]^pab[22]^pab[23]^pab[24]^pab[1]^pab[20]^pab[27]^pab[28]^pab[25];
183     cc[12] = pab[26]^pab[25]^pab[28]^pab[8]^pab[14]^pab[5]^pab[17]^pab[10]^pab[6]^pab[16]^pab[15]^pab[3]^pab[13]^pab[1]^pab[9]^pab[21]^pab[23]^pab[29]^pab[24];
184     cc[13] = pab[8]^pab[18]^pab[2]^pab[15]^pab[16]^pab[5]^pab[29]^pab[21]^pab[23]^pab[22]^pab[0]^pab[12]^pab[19]^pab[1]^pab[11]^pab[4]^pab[3]^pab[26]^pab[20]^pab[27];
185     cc[14] = pab[20]^pab[27]^pab[28]^pab[25];
186     cc[15] = pab[25]^pab[26]^pab[28];
187     cc[16] = pab[28];
188 #else
189     /* same as above, optimized with codegen[optimize] with 'tryhard' */
190     __m128i t100, t101, t102, t103, t104, t105, t106, t107, t108, t109, t110,
191       t111, t112, t113, t114, t115, t116, t117, t118, t119, t120, t121, t122,
192       t123, t124, t125, t126, t127, t128, t129, t130, t77, t79, t80, t82, t83,
193       t87, t88, t89, t90, t91, t92, t94, t95, t96, t97, t98, t99;
194     t82 = pab[23];
195     t87 = pab[18];
196     t130 = PXOR(t82, t87);
197     t77 = pab[28];
198     t98 = pab[7];
199     t129 = PXOR(t77, t98);
200     t79 = pab[26];
201     t83 = pab[22];
202     t128 = PXOR(t79, t83);
203     t90 = pab[15];
204     t91 = pab[14];
205     t127 = PXOR(t90, t91);
206     t97 = pab[8];
207     t99 = pab[6];
208     t126 = PXOR(t97, t99);
209     t100 = pab[5];
210     t125 = PXOR(t100, t90);
211     t117 = PXOR(pab[27], pab[20]);
212     t80 = pab[25];
213     t118 = PXOR(t77, t80);
214     t112 = PXOR(t117, t118);
215     t94 = pab[11];
216     t124 = PXOR(t112, t94);
217     t103 = pab[2];
218     t105 = pab[0];
219     t123 = PXOR(t103, t105);
220     t89 = pab[16];
221     t122 = PXOR3(t89, t94, t97);
222     t121 = PXOR3(t100, t105, t98);
223     t102 = pab[3];
224     t104 = pab[1];
225     t96 = pab[9];
226     t120 = PXOR3(t102, t104, t96);
227     t119 = PXOR(pab[29], pab[21]);
228     t116 = PXOR(pab[24], t82);
229     t115 = PXOR(t79, t118);
230     t114 = PXOR(t83, t116);
231     t113 = PXOR(t116, t119);
232     t95 = pab[10];
233     t111 = PXOR5(t87, t95, t116, t123, t129);
234     t110 = PXOR6(t102, pab[19], pab[12], t117, t119, t128);
235     t92 = pab[13];
236     t109 = PXOR5(t92, t94, t96, t110, t129);
237     t101 = pab[4];
238     t108 = PXOR5(t100, t101, t104, t110, t130);
239     t107 = PXOR6(t101, t103, t95, t114, t120, t124);
240     t106 = PXOR7(t89, t91, t92, t113, t115, t120, t125);
241     t88 = pab[17];
242     cc[0] = t82;
243     cc[1] = t114;
244     cc[2] = t113;
245     cc[3] = PXOR5(t88, t99, t103, t109, t125);
246     cc[4] = PXOR3(t91, t107, t126);
247     cc[5] = PXOR4(t87, t94, t105, t106);
248     cc[6] = PXOR(t91, t108);
249     cc[7] = PXOR5(t99, t121, t124, t127, t130);
250     cc[8] = PXOR5(t88, t80, t111, t127, t128);
251     cc[9] = PXOR4(t100, t111, t119, t122);
252     cc[10] = PXOR4(t89, t95, t105, t109);
253     cc[11] = PXOR4(t88, t89, t107, t121);
254     cc[12] = PXOR4(t88, t95, t106, t126);
255     cc[13] = PXOR4(t90, t108, t122, t123);
256     cc[14] = t112;
257     cc[15] = t115;
258     cc[16] = t77;
259 #endif
260 
261     _mm_storeu_si128((__m128i*)(c),    PXOR(cc[0],                           _mm_slli_si128(cc[1], 8)));
262     _mm_storeu_si128((__m128i*)(c+2),  PXOR3(cc[2] , _mm_srli_si128(cc[1], 8), _mm_slli_si128(cc[3], 8)));
263     _mm_storeu_si128((__m128i*)(c+4),  PXOR3(cc[4] , _mm_srli_si128(cc[3], 8), _mm_slli_si128(cc[5], 8)));
264     _mm_storeu_si128((__m128i*)(c+6),  PXOR3(cc[6] , _mm_srli_si128(cc[5], 8), _mm_slli_si128(cc[7], 8)));
265     _mm_storeu_si128((__m128i*)(c+8),  PXOR3(cc[8] , _mm_srli_si128(cc[7], 8), _mm_slli_si128(cc[9], 8)));
266     _mm_storeu_si128((__m128i*)(c+10), PXOR3(cc[10], _mm_srli_si128(cc[9], 8), _mm_slli_si128(cc[11], 8)));
267     _mm_storeu_si128((__m128i*)(c+12), PXOR3(cc[12], _mm_srli_si128(cc[11], 8), _mm_slli_si128(cc[13], 8)));
268     _mm_storeu_si128((__m128i*)(c+14), PXOR3(cc[14], _mm_srli_si128(cc[13], 8), _mm_slli_si128(cc[15], 8)));
269     _mm_storeu_si128((__m128i*)(c+16), PXOR(cc[16], _mm_srli_si128(cc[15], 8)));
270 #undef PXOR
271 #undef PXOR3
272 #undef PXOR4
273 #undef PXOR5
274 #undef PXOR6
275 #undef PXOR7
276 #undef PZERO
277 }
278 
279 #endif  /* GF2X_MUL9_H_ */
280