1 /* This file is part of the gf2x library.
2
3 Copyright 2010, 2012, 2013, 2015
4 Richard Brent, Pierrick Gaudry, Emmanuel Thome', Paul Zimmermann
5 Nicolas Estibals (for this file)
6
7 This program is free software; you can redistribute it and/or modify it
8 under the terms of either:
9 - If the archive contains a file named toom-gpl.c (not a trivial
10 placeholder), the GNU General Public License as published by the Free
11 Software Foundation; either version 3 of the License, or (at your
12 option) any later version.
13 - If the archive contains a file named toom-gpl.c which is a trivial
14 placeholder, the GNU Lesser General Public License as published by
15 the Free Software Foundation; either version 2.1 of the License, or
16 (at your option) any later version.
17
18 This program is distributed in the hope that it will be useful, but WITHOUT
19 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
20 FITNESS FOR A PARTICULAR PURPOSE. See the license text for more details.
21
22 You should have received a copy of the GNU General Public License as
23 well as the GNU Lesser General Public License along with this program;
24 see the files COPYING and COPYING.LIB. If not, write to the Free
25 Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26 02110-1301, USA.
27 */
28 #ifndef GF2X_MUL7_H_
29 #define GF2X_MUL7_H_
30
31 #include "gf2x.h"
32 /* All gf2x source files for lowlevel functions must include gf2x-small.h
33 * This is mandatory for the tuning mechanism. */
34 #include "gf2x/gf2x-small.h"
35
36 #if GF2X_WORDSIZE != 64
37 #error "This code is for 64-bit only"
38 #endif
39
40 #ifndef GF2X_HAVE_PCLMUL_SUPPORT
41 #error "This code needs pclmul support"
42 #endif
43
44 /* TODO: if somebody comes up with a neat way to improve the interface so
45 * as to remove the false dependency on pclmul, that would be nice.
46 */
47 static inline __m128i
GF2X_FUNC(mul7cl_mul1)48 GF2X_FUNC(mul7cl_mul1) (unsigned long a, unsigned long b)
49 {
50 return _mm_clmulepi64_si128(_gf2x_mm_setr_epi64(a,0), _gf2x_mm_setr_epi64(b,0), 0);
51 }
52
53
54 /* variant with 22 multiplications */
55 GF2X_STORAGE_CLASS_mul7
gf2x_mul7(unsigned long * c,const unsigned long * a,const unsigned long * b)56 void gf2x_mul7 (unsigned long *c, const unsigned long *a, const unsigned long *b)
57 {
58 #define PXOR(lop, rop) _mm_xor_si128((lop), (rop))
59 #define PXOR3(op1, op2, op3) PXOR(op1, PXOR(op2, op3))
60 #define PXOR4(op1, op2, op3, op4) PXOR(op1, PXOR3(op2, op3, op4))
61 #define PXOR5(op1, op2, op3, op4, op5) PXOR(op1, PXOR4(op2, op3, op4, op5))
62 #define PXOR6(op1, op2, op3, op4, op5, op6) PXOR(op1, PXOR5(op2, op3, op4, op5, op6))
63 #define PZERO _mm_setzero_si128()
64 /* Montgomery formulae with 22 multiplications, see
65 Five, Six, and Seven-Term {K}aratsuba-Like Formulae,
66 IEEE Transactions on Computers, volume 54, number 3, p. 362-369, 2005 */
67 unsigned long ta[5], tb[5], pa[22], pb[22];
68 ta[0] = a[0] ^ a[4];
69 ta[1] = a[3] ^ a[5];
70 ta[2] = a[2] ^ a[6];
71 ta[3] = a[1] ^ ta[0];
72 ta[4] = ta[1] ^ ta[2];
73 pa[0] = a[6];
74 pa[1] = a[5];
75 pa[2] = a[5] ^ a[6];
76 pa[3] = a[4];
77 pa[4] = a[4] ^ a[6];
78 pa[5] = a[3];
79 pa[6] = ta[1];
80 pa[7] = a[2];
81 pa[8] = ta[2];
82 pa[9] = a[1];
83 pa[10]= a[1] ^ a[3];
84 pa[11]= a[1] ^ a[2] ^ a[4] ^ a[5];
85 pa[12]= a[1] ^ ta[4];
86 pa[13]= a[0];
87 pa[14]= ta[0];
88 pa[15]= a[0] ^ a[2];
89 pa[16]= a[0] ^ ta[4];
90 pa[17]= a[3] ^ ta[0] ^ ta[2];
91 pa[18]= a[0] ^ a[1];
92 pa[19]= a[3] ^ a[6] ^ ta[3];
93 pa[20]= ta[1] ^ ta[3];
94 pa[21]= ta[3] ^ ta[4];
95
96 tb[0] = b[0] ^ b[4];
97 tb[1] = b[3] ^ b[5];
98 tb[2] = b[2] ^ b[6];
99 tb[3] = b[1] ^ tb[0];
100 tb[4] = tb[1] ^ tb[2];
101 pb[0] = b[6];
102 pb[1] = b[5];
103 pb[2] = b[5] ^ b[6];
104 pb[3] = b[4];
105 pb[4] = b[4] ^ b[6];
106 pb[5] = b[3];
107 pb[6] = tb[1];
108 pb[7] = b[2];
109 pb[8] = tb[2];
110 pb[9] = b[1];
111 pb[10]= b[1] ^ b[3];
112 pb[11]= b[1] ^ b[2] ^ b[4] ^ b[5];
113 pb[12]= b[1] ^ tb[4];
114 pb[13]= b[0];
115 pb[14]= tb[0];
116 pb[15]= b[0] ^ b[2];
117 pb[16]= b[0] ^ tb[4];
118 pb[17]= b[3] ^ tb[0] ^ tb[2];
119 pb[18]= b[0] ^ b[1];
120 pb[19]= b[3] ^ b[6] ^ tb[3];
121 pb[20]= tb[1] ^ tb[3];
122 pb[21]= tb[3] ^ tb[4];
123
124 __m128i p[22];
125
126 p[0] = GF2X_FUNC(mul7cl_mul1)(pa[0], pb[0]);
127 p[1] = GF2X_FUNC(mul7cl_mul1)(pa[1], pb[1]);
128 p[2] = GF2X_FUNC(mul7cl_mul1)(pa[2], pb[2]);
129 p[3] = GF2X_FUNC(mul7cl_mul1)(pa[3], pb[3]);
130 p[4] = GF2X_FUNC(mul7cl_mul1)(pa[4], pb[4]);
131 p[5] = GF2X_FUNC(mul7cl_mul1)(pa[5], pb[5]);
132 p[6] = GF2X_FUNC(mul7cl_mul1)(pa[6], pb[6]);
133 p[7] = GF2X_FUNC(mul7cl_mul1)(pa[7], pb[7]);
134 p[8] = GF2X_FUNC(mul7cl_mul1)(pa[8], pb[8]);
135 p[9] = GF2X_FUNC(mul7cl_mul1)(pa[9], pb[9]);
136 p[10]= GF2X_FUNC(mul7cl_mul1)(pa[10], pb[10]);
137 p[11]= GF2X_FUNC(mul7cl_mul1)(pa[11], pb[11]);
138 p[12]= GF2X_FUNC(mul7cl_mul1)(pa[12], pb[12]);
139 p[13]= GF2X_FUNC(mul7cl_mul1)(pa[13], pb[13]);
140 p[14]= GF2X_FUNC(mul7cl_mul1)(pa[14], pb[14]);
141 p[15]= GF2X_FUNC(mul7cl_mul1)(pa[15], pb[15]);
142 p[16]= GF2X_FUNC(mul7cl_mul1)(pa[16], pb[16]);
143 p[17]= GF2X_FUNC(mul7cl_mul1)(pa[17], pb[17]);
144 p[18]= GF2X_FUNC(mul7cl_mul1)(pa[18], pb[18]);
145 p[19]= GF2X_FUNC(mul7cl_mul1)(pa[19], pb[19]);
146 p[20]= GF2X_FUNC(mul7cl_mul1)(pa[20], pb[20]);
147 p[21]= GF2X_FUNC(mul7cl_mul1)(pa[21], pb[21]);
148
149 __m128i t[13];
150
151 t[0] = PXOR(p[0], p[1]);
152 t[1] = PXOR(p[9], p[13]);
153 t[2] = PXOR(p[3], p[6]);
154 t[3] = PXOR(p[7], p[10]);
155 t[4] = PXOR(p[11], p[18]);
156 t[5] = PXOR(p[4], t[3]);
157 t[6] = PXOR(p[15], t[2]);
158 t[7] = PXOR(p[20], t[5]);
159 t[8] = PXOR(p[5], p[14]);
160 t[9] = PXOR(p[2], p[17]);
161 t[10] = PXOR(p[5], p[8]);
162 t[11] = PXOR(p[21], t[6]);
163 t[12] = PXOR(p[16], t[4]);
164
165 __m128i cc[13];
166
167 cc[0] = p[13];
168 cc[2] = PXOR3(p[7], p[15], t[1]);
169 cc[4] = PXOR4(p[3], t[1], t[3], t[8]);
170 cc[6] = PXOR5(p[0], p[12], p[13], t[7], t[11]);
171 cc[8] = PXOR4(p[7], t[0], t[2], t[10]);
172 cc[10] = PXOR3(p[3], p[4], t[0]);
173 cc[12] = p[0];
174
175 cc[1] = PXOR(p[18], t[1]);
176 cc[3] = PXOR3(t[7], t[9], t[12]);
177 cc[5] = PXOR6(p[2], p[11], p[19], t[1], t[10], t[11]);
178 cc[7] = PXOR5(p[21], t[0], t[5], t[8], t[12]);
179 cc[9] = PXOR5(p[12], p[19], t[4], t[6], t[9]);
180 cc[11] = PXOR(p[2], t[0]);
181
182 _mm_storeu_si128((__m128i*)(c), PXOR(cc[0] , _mm_slli_si128(cc[1], 8)));
183 _mm_storeu_si128((__m128i*)(c+2), PXOR(cc[2] , PXOR(_mm_srli_si128(cc[1], 8), _mm_slli_si128(cc[3], 8))));
184 _mm_storeu_si128((__m128i*)(c+4), PXOR(cc[4] , PXOR(_mm_srli_si128(cc[3], 8), _mm_slli_si128(cc[5], 8))));
185 _mm_storeu_si128((__m128i*)(c+6), PXOR(cc[6] , PXOR(_mm_srli_si128(cc[5], 8), _mm_slli_si128(cc[7], 8))));
186 _mm_storeu_si128((__m128i*)(c+8), PXOR(cc[8] , PXOR(_mm_srli_si128(cc[7], 8), _mm_slli_si128(cc[9], 8))));
187 _mm_storeu_si128((__m128i*)(c+10), PXOR(cc[10], PXOR(_mm_srli_si128(cc[9], 8), _mm_slli_si128(cc[11], 8))));
188 _mm_storeu_si128((__m128i*)(c+12), PXOR(cc[12], _mm_srli_si128(cc[11], 8)));
189 #undef PXOR
190 #undef PXOR3
191 #undef PXOR4
192 #undef PXOR5
193 #undef PXOR6
194 #undef PZERO
195 }
196
197 #endif /* GF2X_MUL7_H_ */
198