1 /* This file is part of the gf2x library.
2
3 Copyright 2010, 2012, 2013, 2015
4 Richard Brent, Pierrick Gaudry, Emmanuel Thome', Paul Zimmermann
5
6 This program is free software; you can redistribute it and/or modify it
7 under the terms of either:
8 - If the archive contains a file named toom-gpl.c (not a trivial
9 placeholder), the GNU General Public License as published by the Free
10 Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
12 - If the archive contains a file named toom-gpl.c which is a trivial
13 placeholder, the GNU Lesser General Public License as published by
14 the Free Software Foundation; either version 2.1 of the License, or
15 (at your option) any later version.
16
17 This program is distributed in the hope that it will be useful, but WITHOUT
18 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 FITNESS FOR A PARTICULAR PURPOSE. See the license text for more details.
20
21 You should have received a copy of the GNU General Public License as
22 well as the GNU Lesser General Public License along with this program;
23 see the files COPYING and COPYING.LIB. If not, write to the Free
24 Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 02110-1301, USA.
26 */
27
28 #ifndef GF2X_MUL9_H_
29 #define GF2X_MUL9_H_
30
31 #include "gf2x.h"
32 /* All gf2x source files for lowlevel functions must include gf2x-small.h
33 * This is mandatory for the tuning mechanism. */
34 #include "gf2x/gf2x-small.h"
35
36 #if GF2X_WORDSIZE != 64
37 #error "This code is for 64-bit only"
38 #endif
39
40 #ifndef GF2X_HAVE_PCLMUL_SUPPORT
41 #error "This code needs pclmul support"
42 #endif
43
44 /* TODO: if somebody comes up with a neat way to improve the interface so
45 * as to remove the false dependency on pclmul, that would be nice.
46 */
47 static inline __m128i
GF2X_FUNC(mul9k3_mul1)48 GF2X_FUNC(mul9k3_mul1) (unsigned long a, unsigned long b)
49 {
50 __m128i aa = _gf2x_mm_setr_epi64(a, 0);
51 __m128i bb = _gf2x_mm_setr_epi64(b, 0);
52 return _mm_clmulepi64_si128(aa, bb, 0);
53 }
54
55 #define PXOR(lop, rop) _mm_xor_si128((lop), (rop))
56 #define PXOR3(op1, op2, op3) PXOR(op1, PXOR(op2, op3))
57 #define PXOR4(op1, op2, op3, op4) PXOR(op1, PXOR3(op2, op3, op4))
58 #define PXOR5(op1, op2, op3, op4, op5) PXOR(op1, PXOR4(op2, op3, op4, op5))
59 #define PZERO _mm_setzero_si128()
60
61 /* same as mul5, but stores a[4]*b[4] into {d,2} */
62 GF2X_STORAGE_CLASS_mul5
GF2X_FUNC(mul9k3_mul5c)63 void GF2X_FUNC(mul9k3_mul5c) (unsigned long *c, const unsigned long *a,
64 const unsigned long *b, unsigned long *d)
65 {
66 /* Montgomery formulae with 13 multiplications */
67 unsigned long ta[3], tb[3], pa[8], pb[8];
68 __m128i p0, p2, p4, p6, p8, p10, p12, p14, p16, p18, p20, p22, p24;
69 __m128i t0, t2, t4, t6, t8, t10, t12;
70 ta[0] = a[0] ^ a[4] ; tb[0] = b[0] ^ b[4];
71 ta[1] = a[1] ^ a[2] ; tb[1] = b[1] ^ b[2];
72 ta[2] = a[3] ^ ta[0] ; tb[2] = b[3] ^ tb[0];
73 pa[0] = ta[1] ^ ta[2] ; pb[0] = tb[1] ^ tb[2];
74 pa[1] = a[2] ^ ta[2] ; pb[1] = b[2] ^ tb[2];
75 pa[2] = ta[0] ^ ta[1] ; pb[2] = tb[0] ^ tb[1];
76 pa[3] = a[1] ^ ta[2] ; pb[3] = b[1] ^ tb[2];
77 pa[4] = a[0] ^ a[2] ^ a[3] ; pb[4] = b[0] ^ b[2] ^ b[3];
78 pa[5] = a[4] ^ ta[1] ; pb[5] = b[4] ^ tb[1];
79 pa[6] = a[3] ^ a[4] ; pb[6] = b[3] ^ b[4];
80 pa[7] = a[0] ^ a[1] ; pb[7] = b[0] ^ b[1];
81 p0 = GF2X_FUNC(mul9k3_mul1)(pa[0], pb[0]);
82 p2 = GF2X_FUNC(mul9k3_mul1)(pa[1], pb[1]);
83 p4 = GF2X_FUNC(mul9k3_mul1)(pa[2], pb[2]);
84 p6 = GF2X_FUNC(mul9k3_mul1)(pa[3], pb[3]);
85 p8 = GF2X_FUNC(mul9k3_mul1)(pa[4], pb[4]);
86 p10 = GF2X_FUNC(mul9k3_mul1)(pa[5], pb[5]);
87 p12 = GF2X_FUNC(mul9k3_mul1)(pa[6], pb[6]);
88 p14 = GF2X_FUNC(mul9k3_mul1)(pa[7], pb[7]);
89 p16 = GF2X_FUNC(mul9k3_mul1)(ta[0], tb[0]);
90 p18 = GF2X_FUNC(mul9k3_mul1)(a[4], b[4]);
91 p20 = GF2X_FUNC(mul9k3_mul1)(a[3], b[3]);
92 p22 = GF2X_FUNC(mul9k3_mul1)(a[1], b[1]);
93 p24 = GF2X_FUNC(mul9k3_mul1)(a[0], b[0]);
94 t0 = PXOR(p14, p24);
95 t2 = PXOR(p12, p18);
96 t4 = PXOR(p2, p16);
97 t6 = PXOR(p0, p6);
98 t8 = PXOR(p4, p16);
99 t10 = PXOR(p10, t0);
100 t12 = PXOR(p8, t2);
101
102 __m128i ce0 = p24;
103 __m128i ce2 = PXOR3(p18, t8, t10);
104 __m128i ce4 = PXOR5(p0, p20, p22, t10, t12);
105 __m128i ce6 = PXOR3(p24, t4, t12);
106 __m128i ce8 = p18;
107
108 __m128i co1 = PXOR(p22, t0);
109 __m128i co3 = PXOR3(t2, t4, t6);
110 __m128i co5 = PXOR3(t0, t6, t8);
111 __m128i co7 = PXOR(p20, t2);
112
113 _mm_storeu_si128((__m128i*)(d), p18); /* a[4] * b[4] */
114 _mm_storeu_si128((__m128i*)(c), PXOR(ce0, _mm_slli_si128(co1, 8)));
115 _mm_storeu_si128((__m128i*)(c+2), PXOR3(ce2 , _mm_srli_si128(co1, 8) , _mm_slli_si128(co3, 8)));
116 _mm_storeu_si128((__m128i*)(c+4), PXOR3(ce4 , _mm_srli_si128(co3, 8) , _mm_slli_si128(co5, 8)));
117 _mm_storeu_si128((__m128i*)(c+6), PXOR3(ce6 , _mm_srli_si128(co5, 8) , _mm_slli_si128(co7, 8)));
118 _mm_storeu_si128((__m128i*)(c+8), PXOR(ce8 , _mm_srli_si128(co7, 8)));
119 }
120
121 /* same as mul5, but assumes {d,2} contains a[4]*b[4] */
122 GF2X_STORAGE_CLASS_mul5
GF2X_FUNC(mul9k3_mul5b)123 void GF2X_FUNC(mul9k3_mul5b) (unsigned long *c, const unsigned long *a,
124 const unsigned long *b, const unsigned long *d)
125 {
126 /* Montgomery formulae with 13 multiplications */
127 unsigned long ta[3], tb[3], pa[8], pb[8];
128 __m128i p0, p2, p4, p6, p8, p10, p12, p14, p16, p18, p20, p22, p24;
129 __m128i t0, t2, t4, t6, t8, t10, t12;
130
131 ta[0] = a[0] ^ a[4] ; tb[0] = b[0] ^ b[4];
132 ta[1] = a[1] ^ a[2] ; tb[1] = b[1] ^ b[2];
133 ta[2] = a[3] ^ ta[0] ; tb[2] = b[3] ^ tb[0];
134 pa[0] = ta[1] ^ ta[2] ; pb[0] = tb[1] ^ tb[2];
135 pa[1] = a[2] ^ ta[2] ; pb[1] = b[2] ^ tb[2];
136 pa[2] = ta[0] ^ ta[1] ; pb[2] = tb[0] ^ tb[1];
137 pa[3] = a[1] ^ ta[2] ; pb[3] = b[1] ^ tb[2];
138 pa[4] = a[0] ^ a[2] ^ a[3] ; pb[4] = b[0] ^ b[2] ^ b[3];
139 pa[5] = a[4] ^ ta[1] ; pb[5] = b[4] ^ tb[1];
140 pa[6] = a[3] ^ a[4] ; pb[6] = b[3] ^ b[4];
141 pa[7] = a[0] ^ a[1] ; pb[7] = b[0] ^ b[1];
142 p0 = GF2X_FUNC(mul9k3_mul1)(pa[0], pb[0]);
143 p2 = GF2X_FUNC(mul9k3_mul1)(pa[1], pb[1]);
144 p4 = GF2X_FUNC(mul9k3_mul1)(pa[2], pb[2]);
145 p6 = GF2X_FUNC(mul9k3_mul1)(pa[3], pb[3]);
146 p8 = GF2X_FUNC(mul9k3_mul1)(pa[4], pb[4]);
147 p10 = GF2X_FUNC(mul9k3_mul1)(pa[5], pb[5]);
148 p12 = GF2X_FUNC(mul9k3_mul1)(pa[6], pb[6]);
149 p14 = GF2X_FUNC(mul9k3_mul1)(pa[7], pb[7]);
150 p16 = GF2X_FUNC(mul9k3_mul1)(ta[0], tb[0]);
151 /* p18 = GF2X_FUNC(mul9k3_mul1)(a[4], b[4]); */
152 p18 = _mm_loadu_si128((__m128i *) d);
153 p20 = GF2X_FUNC(mul9k3_mul1)(a[3], b[3]);
154 p22 = GF2X_FUNC(mul9k3_mul1)(a[1], b[1]);
155 p24 = GF2X_FUNC(mul9k3_mul1)(a[0], b[0]);
156 t0 = PXOR(p14, p24);
157 t2 = PXOR(p12, p18);
158 t4 = PXOR(p2, p16);
159 t6 = PXOR(p0, p6);
160 t8 = PXOR(p4, p16);
161 t10 = PXOR(p10, t0);
162 t12 = PXOR(p8, t2);
163
164 __m128i ce0 = p24;
165 __m128i ce2 = PXOR3(p18, t8, t10);
166 __m128i ce4 = PXOR5(p0, p20, p22, t10, t12);
167 __m128i ce6 = PXOR3(p24, t4, t12);
168 __m128i ce8 = p18;
169
170 __m128i co1 = PXOR(p22, t0);
171 __m128i co3 = PXOR3(t2, t4, t6);
172 __m128i co5 = PXOR3(t0, t6, t8);
173 __m128i co7 = PXOR(p20, t2);
174
175 _mm_storeu_si128((__m128i*)(c), PXOR(ce0, _mm_slli_si128(co1, 8)));
176 _mm_storeu_si128((__m128i*)(c+2), PXOR3(ce2, _mm_srli_si128(co1, 8), _mm_slli_si128(co3, 8)));
177 _mm_storeu_si128((__m128i*)(c+4), PXOR3(ce4, _mm_srli_si128(co3, 8), _mm_slli_si128(co5, 8)));
178 _mm_storeu_si128((__m128i*)(c+6), PXOR3(ce6, _mm_srli_si128(co5, 8), _mm_slli_si128(co7, 8)));
179 _mm_storeu_si128((__m128i*)(c+8), PXOR(ce8, _mm_srli_si128(co7, 8)));
180 }
181
182 /* (based on mul9k.c)
183 1 call to mul4 and 2 calls to mul5 minus one multiplication, i.e.,
184 34 multiplications with mul5clk_c */
185 GF2X_STORAGE_CLASS_mul9
gf2x_mul9(unsigned long * c,const unsigned long * a,const unsigned long * b)186 void gf2x_mul9 (unsigned long *c, const unsigned long *a, const unsigned long *b)
187 {
188 unsigned long aa[5], bb[5], ab[10], ab5, ab6, ab7, ab8, ab9, d[2];
189
190 gf2x_mul4 (c+10, a+5, b+5);
191 GF2X_FUNC(mul9k3_mul5c) (c, a, b, d); /* a[4]*b[4] is cached in d */
192 aa[0] = a[0] ^ a[5];
193 aa[1] = a[1] ^ a[6];
194 aa[2] = a[2] ^ a[7];
195 aa[3] = a[3] ^ a[8];
196 aa[4] = a[4];
197 bb[0] = b[0] ^ b[5];
198 bb[1] = b[1] ^ b[6];
199 bb[2] = b[2] ^ b[7];
200 bb[3] = b[3] ^ b[8];
201 bb[4] = b[4];
202 GF2X_FUNC(mul9k3_mul5b) (ab, aa, bb, d);
203 ab5 = ab[5] ^ c[5];
204 ab6 = ab[6] ^ c[6];
205 ab7 = ab[7] ^ c[7];
206 ab8 = ab[8] ^ c[8];
207 ab9 = ab[9] ^ c[9];
208 c[5] ^= ab[0] ^ c[0] ^ c[10];
209 c[6] ^= ab[1] ^ c[1] ^ c[11];
210 c[7] ^= ab[2] ^ c[2] ^ c[12];
211 c[8] ^= ab[3] ^ c[3] ^ c[13];
212 c[9] ^= ab[4] ^ c[4] ^ c[14];
213 c[10] ^= ab5 ^ c[15];
214 c[11] ^= ab6 ^ c[16];
215 c[12] ^= ab7 ^ c[17];
216 c[13] ^= ab8;
217 c[14] ^= ab9;
218 }
219
220
221 #undef PXOR
222 #undef PXOR3
223 #undef PXOR4
224 #undef PXOR5
225 #undef PZERO
226 #endif /* GF2X_MUL9_H_ */
227