// gf2n_simd.cpp - written and placed in the public domain by Jeffrey Walton
//                 Also based on PCLMULQDQ code by Jankowski, Laurent and
//                 O'Mahony from Intel (see reference below).
//
//    This source file uses intrinsics and built-ins to gain access to
//    CLMUL, ARMv8a, and Power8 instructions. A separate source file is
//    needed because additional CXXFLAGS are required to enable the
//    appropriate instruction sets in some build configurations.
//
//    Several speedups were taken from "Intel Polynomial Multiplication
//    Instruction and its Usage for Elliptic Curve Cryptography" by
//    Krzysztof Jankowski, Pierre Laurent and Aidan O'Mahony,
//    https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/polynomial-multiplication-instructions-paper.pdf
//    There may be more speedups available; see https://eprint.iacr.org/2011/589.pdf.
//    The IACR paper performs by hand some optimizations that the compiler
//    is expected to perform, like Common Subexpression Elimination to save
//    on variables (among others). Because the compiler may miss those
//    optimizations, the IACR paper is still useful. However, that code is
//    GPL3 and therefore off-limits for some users of the library...
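//
//    All three implementations below follow the same plan. A 128-bit
//    carry-less product is formed with one level of Karatsuba over the
//    64-bit halves: for a = a1*x^64 + a0 and b = b1*x^64 + b0 (with
//    coefficients in GF(2)),
//        a*b = a1*b1*x^128 + ((a1+a0)*(b1+b0) + a1*b1 + a0*b0)*x^64 + a0*b0
//    where '+' denotes XOR (addition in GF(2)). The 256x256 multiply
//    applies the same split once more over the 128-bit halves, and the
//    result is reduced modulo the field polynomial of GF(2^233).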

#include "pch.h"
#include "config.h"

#ifndef CRYPTOPP_IMPORTS

#include "gf2n.h"

#if (CRYPTOPP_CLMUL_AVAILABLE)
# include <emmintrin.h>
# include <wmmintrin.h>
#endif

#if (CRYPTOPP_ARM_PMULL_AVAILABLE)
# include "arm_simd.h"
#endif

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
# include "ppc_simd.h"
#endif

// Squash MS LNK4221 and libtool warnings
extern const char GF2N_SIMD_FNAME[] = __FILE__;

ANONYMOUS_NAMESPACE_BEGIN

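// For reference, a portable bitwise sketch of the 64x64 -> 128-bit
// carry-less multiply that PMULL, PCLMULQDQ and VPMSUMD provide in
// hardware. This helper is illustrative only; it is not part of the
// original dispatch and is never called by the routines below.
inline void F2N_Multiply_64x64_Reference(CryptoPP::word64& r1, CryptoPP::word64& r0,
    CryptoPP::word64 a, CryptoPP::word64 b)
{
    r1 = r0 = 0;
    for (unsigned int i = 0; i < 64; ++i)
    {
        if ((b >> i) & 1)
        {
            // XOR a shifted copy of 'a' into the 128-bit accumulator r1:r0
            r0 ^= a << i;
            if (i > 0)
                r1 ^= a >> (64 - i);
        }
    }
}
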
// ************************** ARMv8 ************************** //

using CryptoPP::word;

#if (CRYPTOPP_ARM_PMULL_AVAILABLE)

// c1c0 = a * b
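// PMULL_00 multiplies the low 64-bit lanes of its arguments and PMULL_11
// the high lanes (see arm_simd.h). The third product below is the
// Karatsuba middle term (a0^a1)*(b0^b1), which is folded back into the
// low and high halves of the result.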
inline void
F2N_Multiply_128x128_ARMv8(uint64x2_t& c1, uint64x2_t& c0, const uint64x2_t& a, const uint64x2_t& b)
{
    uint64x2_t t1, t2, z0={0};

    c0 = PMULL_00(a, b);
    c1 = PMULL_11(a, b);
    t1 = vmovq_n_u64(vgetq_lane_u64(a, 1));
    t1 = veorq_u64(a, t1);
    t2 = vmovq_n_u64(vgetq_lane_u64(b, 1));
    t2 = veorq_u64(b, t2);
    t1 = PMULL_00(t1, t2);
    t1 = veorq_u64(c0, t1);
    t1 = veorq_u64(c1, t1);
    t2 = t1;
    t1 = vextq_u64(z0, t1, 1);
    t2 = vextq_u64(t2, z0, 1);
    c0 = veorq_u64(c0, t1);
    c1 = veorq_u64(c1, t2);
}

// c3c2c1c0 = a1a0 * b1b0
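// One more level of Karatsuba over the 128-bit halves: three 128x128
// multiplies instead of four, with the middle product folded into c2c1.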
inline void
F2N_Multiply_256x256_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1, uint64x2_t& c0,
    const uint64x2_t& b1, const uint64x2_t& b0, const uint64x2_t& a1, const uint64x2_t& a0)
{
    uint64x2_t c4, c5;
    uint64x2_t x0=a0, x1=a1, y0=b0, y1=b1;

    F2N_Multiply_128x128_ARMv8(c1, c0, x0, y0);
    F2N_Multiply_128x128_ARMv8(c3, c2, x1, y1);

    x0 = veorq_u64(x0, x1);
    y0 = veorq_u64(y0, y1);

    F2N_Multiply_128x128_ARMv8(c5, c4, x0, y0);

    c4 = veorq_u64(c4, c0);
    c4 = veorq_u64(c4, c2);
    c5 = veorq_u64(c5, c1);
    c5 = veorq_u64(c5, c3);
    c1 = veorq_u64(c1, c4);
    c2 = veorq_u64(c2, c5);
}

// c3c2c1c0 = a1a0 * a1a0
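// Squaring over GF(2) has no cross terms ((u+v)^2 = u^2 + v^2), so the
// 256-bit square is just the four 64x64 carry-less squares of the words.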
inline void
F2N_Square_256_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1,
    uint64x2_t& c0, const uint64x2_t& a1, const uint64x2_t& a0)
{
    c0 = PMULL_00(a0, a0);
    c1 = PMULL_11(a0, a0);
    c2 = PMULL_00(a1, a1);
    c3 = PMULL_11(a1, a1);
}

// x = (x << n), z = 0
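// 128-bit left shift: each 64-bit lane is shifted and the bits that fall
// out of the low lane are OR'd into the bottom of the high lane.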
template <unsigned int N>
inline uint64x2_t ShiftLeft128_ARMv8(uint64x2_t x)
{
    uint64x2_t u=x, v, z={0};
    x = vshlq_n_u64(x, N);
    u = vshrq_n_u64(u, (64-N));
    v = vcombine_u64(vget_low_u64(z), vget_low_u64(u));
    x = vorrq_u64(x, v);
    return x;
}

// c1c0 = c3c2c1c0 MOD p. This is a Barrett reduction. For background see
// the Intel paper or https://github.com/antonblanchard/crc32-vpmsum.
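// The reduction targets GF(2^233) with the NIST trinomial x^233 + x^74 + 1;
// the shift counts follow from 233 = 3*64 + 41 (64-41 = 23) and 74 = 64 + 10,
// and the final mask keeps only the low 233 bits of the result c1c0.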
inline void
GF2NT_233_Reduce_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1, uint64x2_t& c0)
{
    const unsigned int mask[4] = {
        0xffffffff, 0xffffffff, 0xffffffff, 0x000001ff,
    };

    uint64x2_t b3, b2, b1, /*b0,*/ a1, a0, m0, z0={0};
    m0 = vreinterpretq_u64_u32(vld1q_u32(mask));
    b1 = c1; a1 = c1;
    a0 = vcombine_u64(vget_low_u64(c1), vget_low_u64(z0));
    a1 = vshlq_n_u64(a1, 23);
    a1 = vshrq_n_u64(a1, 23);
    c1 = vorrq_u64(a1, a0);
    b2 = vshrq_n_u64(c2, (64-23));
    c3 = ShiftLeft128_ARMv8<23>(c3);
    a0 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
    c3 = vorrq_u64(c3, a0);
    b1 = vshrq_n_u64(b1, (64-23));
    c2 = ShiftLeft128_ARMv8<23>(c2);
    a0 = vcombine_u64(vget_high_u64(b1), vget_high_u64(z0));
    c2 = vorrq_u64(c2, a0);
    b3 = c3;
    b2 = vshrq_n_u64(c2, (64-10));
    b3 = ShiftLeft128_ARMv8<10>(b3);
    a0 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
    b3 = vorrq_u64(b3, a0);
    a0 = vcombine_u64(vget_high_u64(c3), vget_high_u64(z0));
    b3 = veorq_u64(b3, a0);
    b1 = vshrq_n_u64(b3, (64-23));
    b3 = ShiftLeft128_ARMv8<23>(b3);
    b3 = vcombine_u64(vget_high_u64(b3), vget_high_u64(z0));
    b3 = vorrq_u64(b3, b1);
    c2 = veorq_u64(c2, b3);
    b3 = c3;
    b2 = vshrq_n_u64(c2, (64-10));
    b3 = ShiftLeft128_ARMv8<10>(b3);
    b2 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
    b3 = vorrq_u64(b3, b2);
    b2 = c2;
    b2 = ShiftLeft128_ARMv8<10>(b2);
    a0 = vcombine_u64(vget_low_u64(z0), vget_low_u64(b2));
    c2 = veorq_u64(c2, a0);
    a0 = vcombine_u64(vget_low_u64(z0), vget_low_u64(b3));
    a1 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
    a0 = vorrq_u64(a0, a1);
    c3 = veorq_u64(c3, a0);
    c0 = veorq_u64(c0, c2);
    c1 = veorq_u64(c1, c3);
    c1 = vandq_u64(c1, m0);
}

#endif

// ************************** SSE ************************** //

#if (CRYPTOPP_CLMUL_AVAILABLE)

using CryptoPP::word;

// c1c0 = a * b
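// The PCLMULQDQ immediate selects the 64-bit halves: 0x00 multiplies the
// low qwords and 0x11 the high qwords. Otherwise the routine mirrors the
// ARMv8 version above.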
inline void
F2N_Multiply_128x128_CLMUL(__m128i& c1, __m128i& c0, const __m128i& a, const __m128i& b)
{
    __m128i t1, t2;

    c0 = _mm_clmulepi64_si128(a, b, 0x00);
    c1 = _mm_clmulepi64_si128(a, b, 0x11);
    t1 = _mm_shuffle_epi32(a, 0xEE);
    t1 = _mm_xor_si128(a, t1);
    t2 = _mm_shuffle_epi32(b, 0xEE);
    t2 = _mm_xor_si128(b, t2);
    t1 = _mm_clmulepi64_si128(t1, t2, 0x00);
    t1 = _mm_xor_si128(c0, t1);
    t1 = _mm_xor_si128(c1, t1);
    t2 = t1;
    t1 = _mm_slli_si128(t1, 8);
    t2 = _mm_srli_si128(t2, 8);
    c0 = _mm_xor_si128(c0, t1);
    c1 = _mm_xor_si128(c1, t2);
}

// c3c2c1c0 = a1a0 * b1b0
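// Same one-level Karatsuba combination over the 128-bit halves as the
// ARMv8 routine above: three 128x128 multiplies instead of four.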
inline void
F2N_Multiply_256x256_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1, __m128i& c0,
    const __m128i& b1, const __m128i& b0, const __m128i& a1, const __m128i& a0)
{
    __m128i c4, c5;
    __m128i x0=a0, x1=a1, y0=b0, y1=b1;

    F2N_Multiply_128x128_CLMUL(c1, c0, x0, y0);
    F2N_Multiply_128x128_CLMUL(c3, c2, x1, y1);

    x0 = _mm_xor_si128(x0, x1);
    y0 = _mm_xor_si128(y0, y1);

    F2N_Multiply_128x128_CLMUL(c5, c4, x0, y0);

    c4 = _mm_xor_si128(c4, c0);
    c4 = _mm_xor_si128(c4, c2);
    c5 = _mm_xor_si128(c5, c1);
    c5 = _mm_xor_si128(c5, c3);
    c1 = _mm_xor_si128(c1, c4);
    c2 = _mm_xor_si128(c2, c5);
}

// c3c2c1c0 = a1a0 * a1a0
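// As with the ARMv8 version: squaring over GF(2) has no cross terms, so
// only the four 64x64 carry-less squares are needed.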
inline void
F2N_Square_256_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1,
    __m128i& c0, const __m128i& a1, const __m128i& a0)
{
    c0 = _mm_clmulepi64_si128(a0, a0, 0x00);
    c1 = _mm_clmulepi64_si128(a0, a0, 0x11);
    c2 = _mm_clmulepi64_si128(a1, a1, 0x00);
    c3 = _mm_clmulepi64_si128(a1, a1, 0x11);
}

// x = (x << n), z = 0
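// 128-bit left shift across the two 64-bit lanes; z must be zero.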
template <unsigned int N>
inline __m128i ShiftLeft128_SSE(__m128i x, const __m128i& z)
{
    __m128i u=x, v;
    x = _mm_slli_epi64(x, N);
    u = _mm_srli_epi64(u, (64-N));
    v = _mm_unpacklo_epi64(z, u);
    x = _mm_or_si128(x, v);
    return x;
}

// c1c0 = c3c2c1c0 MOD p. This is a Barrett reduction. For background see
// the Intel paper or https://github.com/antonblanchard/crc32-vpmsum.
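// Same reduction as GF2NT_233_Reduce_ARMv8: trinomial x^233 + x^74 + 1,
// with the final mask keeping only the low 233 bits.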
inline void
GF2NT_233_Reduce_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1, __m128i& c0)
{
    const unsigned int m[4] = {
        0xffffffff, 0xffffffff, 0xffffffff, 0x000001ff
    };

    __m128i b3, b2, b1, /*b0,*/ a1, a0, m0, z0;
    m0 = _mm_set_epi32(m[3], m[2], m[1], m[0]);
    z0 = _mm_setzero_si128();
    b1 = c1; a1 = c1;
    a0 = _mm_move_epi64(c1);
    a1 = _mm_slli_epi64(a1, 23);
    a1 = _mm_srli_epi64(a1, 23);
    c1 = _mm_or_si128(a1, a0);
    b2 = _mm_srli_epi64(c2, (64-23));
    c3 = ShiftLeft128_SSE<23>(c3, z0);
    a0 = _mm_unpackhi_epi64(b2, z0);
    c3 = _mm_or_si128(c3, a0);
    b1 = _mm_srli_epi64(b1, (64-23));
    c2 = ShiftLeft128_SSE<23>(c2, z0);
    a0 = _mm_unpackhi_epi64(b1, z0);
    c2 = _mm_or_si128(c2, a0);
    b3 = c3;
    b2 = _mm_srli_epi64(c2, (64-10));
    b3 = ShiftLeft128_SSE<10>(b3, z0);
    a0 = _mm_unpackhi_epi64(b2, z0);
    b3 = _mm_or_si128(b3, a0);
    a0 = _mm_unpackhi_epi64(c3, z0);
    b3 = _mm_xor_si128(b3, a0);
    b1 = _mm_srli_epi64(b3, (64-23));
    b3 = ShiftLeft128_SSE<23>(b3, z0);
    b3 = _mm_unpackhi_epi64(b3, z0);
    b3 = _mm_or_si128(b3, b1);
    c2 = _mm_xor_si128(c2, b3);
    b3 = c3;
    b2 = _mm_srli_epi64(c2, (64-10));
    b3 = ShiftLeft128_SSE<10>(b3, z0);
    b2 = _mm_unpackhi_epi64(b2, z0);
    b3 = _mm_or_si128(b3, b2);
    b2 = c2;
    b2 = ShiftLeft128_SSE<10>(b2, z0);
    a0 = _mm_unpacklo_epi64(z0, b2);
    c2 = _mm_xor_si128(c2, a0);
    a0 = _mm_unpacklo_epi64(z0, b3);
    a1 = _mm_unpackhi_epi64(b2, z0);
    a0 = _mm_or_si128(a0, a1);
    c3 = _mm_xor_si128(c3, a0);
    c0 = _mm_xor_si128(c0, c2);
    c1 = _mm_xor_si128(c1, c3);
    c1 = _mm_and_si128(c1, m0);
}

#endif

// ************************* Power8 ************************* //

#if (CRYPTOPP_POWER8_VMULL_AVAILABLE) && 0

using CryptoPP::byte;
using CryptoPP::word;
using CryptoPP::uint8x16_p;
using CryptoPP::uint64x2_p;

using CryptoPP::VecLoad;
using CryptoPP::VecStore;

using CryptoPP::VecOr;
using CryptoPP::VecXor;
using CryptoPP::VecAnd;

using CryptoPP::VecPermute;
using CryptoPP::VecMergeLow;
using CryptoPP::VecMergeHigh;
using CryptoPP::VecShiftLeft;
using CryptoPP::VecShiftRight;

using CryptoPP::VecIntelMultiply00;
using CryptoPP::VecIntelMultiply11;

// c1c0 = a * b
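// VecIntelMultiply00/VecIntelMultiply11 wrap the POWER8 polynomial
// multiply (vpmsumd) and follow the Intel lane convention (low*low and
// high*high), so the routine mirrors the CLMUL and PMULL versions above.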
inline void
F2N_Multiply_128x128_POWER8(uint64x2_p& c1, uint64x2_p& c0, const uint64x2_p& a, const uint64x2_p& b)
{
    uint64x2_p t1, t2;
    const uint64x2_p z0={0};

    c0 = VecIntelMultiply00(a, b);
    c1 = VecIntelMultiply11(a, b);
    t1 = VecMergeLow(a, a);
    t1 = VecXor(a, t1);
    t2 = VecMergeLow(b, b);
    t2 = VecXor(b, t2);
    t1 = VecIntelMultiply00(t1, t2);
    t1 = VecXor(c0, t1);
    t1 = VecXor(c1, t1);
    t2 = t1;
    t1 = VecMergeHigh(z0, t1);
    t2 = VecMergeLow(t2, z0);
    c0 = VecXor(c0, t1);
    c1 = VecXor(c1, t2);
}

// c3c2c1c0 = a1a0 * b1b0
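// Same one-level Karatsuba combination as the ARMv8 and CLMUL routines.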
inline void
F2N_Multiply_256x256_POWER8(uint64x2_p& c3, uint64x2_p& c2, uint64x2_p& c1, uint64x2_p& c0,
    const uint64x2_p& b1, const uint64x2_p& b0, const uint64x2_p& a1, const uint64x2_p& a0)
{
    uint64x2_p c4, c5;
    uint64x2_p x0=a0, x1=a1, y0=b0, y1=b1;

    F2N_Multiply_128x128_POWER8(c1, c0, x0, y0);
    F2N_Multiply_128x128_POWER8(c3, c2, x1, y1);

    x0 = VecXor(x0, x1);
    y0 = VecXor(y0, y1);

    F2N_Multiply_128x128_POWER8(c5, c4, x0, y0);

    c4 = VecXor(c4, c0);
    c4 = VecXor(c4, c2);
    c5 = VecXor(c5, c1);
    c5 = VecXor(c5, c3);
    c1 = VecXor(c1, c4);
    c2 = VecXor(c2, c5);
}

// c3c2c1c0 = a1a0 * a1a0
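// Four 64x64 carry-less squares, as in the other implementations.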
inline void
F2N_Square_256_POWER8(uint64x2_p& c3, uint64x2_p& c2, uint64x2_p& c1,
    uint64x2_p& c0, const uint64x2_p& a1, const uint64x2_p& a0)
{
    c0 = VecIntelMultiply00(a0, a0);
    c1 = VecIntelMultiply11(a0, a0);
    c2 = VecIntelMultiply00(a1, a1);
    c3 = VecIntelMultiply11(a1, a1);
}

// x = (x << n), z = 0
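// 128-bit left shift across the two 64-bit lanes.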
template <unsigned int N>
inline uint64x2_p ShiftLeft128_POWER8(uint64x2_p x)
{
    uint64x2_p u=x, v;
    const uint64x2_p z={0};

    x = VecShiftLeft<N>(x);
    u = VecShiftRight<64-N>(u);
    v = VecMergeHigh(z, u);
    x = VecOr(x, v);
    return x;
}

// c1c0 = c3c2c1c0 MOD p. This is a Barrett reduction. For background see
// the Intel paper or https://github.com/antonblanchard/crc32-vpmsum.
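// Same reduction as the ARMv8 and CLMUL routines: trinomial
// x^233 + x^74 + 1, with the final mask keeping only the low 233 bits.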
inline void
GF2NT_233_Reduce_POWER8(uint64x2_p& c3, uint64x2_p& c2, uint64x2_p& c1, uint64x2_p& c0)
{
    const uint64_t mod[] = {W64LIT(0xffffffffffffffff), W64LIT(0x01ffffffffff)};
    const uint64x2_p m0 = (uint64x2_p)VecLoad(mod);

    uint64x2_p b3, b2, b1, /*b0,*/ a1, a0;
    const uint64x2_p z0={0};

    b1 = c1; a1 = c1;
    a0 = VecMergeHigh(c1, z0);
    a1 = VecShiftLeft<23>(a1);
    a1 = VecShiftRight<23>(a1);
    c1 = VecOr(a1, a0);
    b2 = VecShiftRight<64-23>(c2);
    c3 = ShiftLeft128_POWER8<23>(c3);
    a0 = VecMergeLow(b2, z0);
    c3 = VecOr(c3, a0);
    b1 = VecShiftRight<64-23>(b1);
    c2 = ShiftLeft128_POWER8<23>(c2);
    a0 = VecMergeLow(b1, z0);
    c2 = VecOr(c2, a0);
    b3 = c3;
    b2 = VecShiftRight<64-10>(c2);
    b3 = ShiftLeft128_POWER8<10>(b3);
    a0 = VecMergeLow(b2, z0);
    b3 = VecOr(b3, a0);
    a0 = VecMergeLow(c3, z0);
    b3 = VecXor(b3, a0);
    b1 = VecShiftRight<64-23>(b3);
    b3 = ShiftLeft128_POWER8<23>(b3);
    b3 = VecMergeLow(b3, z0);
    b3 = VecOr(b3, b1);
    c2 = VecXor(c2, b3);
    b3 = c3;
    b2 = VecShiftRight<64-10>(c2);
    b3 = ShiftLeft128_POWER8<10>(b3);
    b2 = VecMergeLow(b2, z0);
    b3 = VecOr(b3, b2);
    b2 = c2;
    b2 = ShiftLeft128_POWER8<10>(b2);
    a0 = VecMergeHigh(z0, b2);
    c2 = VecXor(c2, a0);
    a0 = VecMergeHigh(z0, b3);
    a1 = VecMergeLow(b2, z0);
    a0 = VecOr(a0, a1);
    c3 = VecXor(c3, a0);
    c0 = VecXor(c0, c2);
    c1 = VecXor(c1, c3);
    c1 = VecAnd(c1, m0);
}

#endif

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)

#if (CRYPTOPP_CLMUL_AVAILABLE)

void
GF2NT_233_Multiply_Reduce_CLMUL(const word* pA, const word* pB, word* pC)
{
    enum {S=sizeof(__m128i)/sizeof(word)};
    __m128i a0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pA+0*S));
    __m128i a1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pA+1*S));
    __m128i b0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pB+0*S));
    __m128i b1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pB+1*S));

    __m128i c0, c1, c2, c3;
    F2N_Multiply_256x256_CLMUL(c3, c2, c1, c0, a1, a0, b1, b0);
    GF2NT_233_Reduce_CLMUL(c3, c2, c1, c0);

    _mm_storeu_si128(reinterpret_cast<__m128i*>(pC+0*S), c0);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(pC+1*S), c1);
}

void
GF2NT_233_Square_Reduce_CLMUL(const word* pA, word* pC)
{
    enum {S=sizeof(__m128i)/sizeof(word)};
    __m128i a0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pA+0*S));
    __m128i a1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pA+1*S));

    __m128i c0, c1, c2, c3;
    F2N_Square_256_CLMUL(c3, c2, c1, c0, a1, a0);
    GF2NT_233_Reduce_CLMUL(c3, c2, c1, c0);

    _mm_storeu_si128(reinterpret_cast<__m128i*>(pC+0*S), c0);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(pC+1*S), c1);
}

#elif (CRYPTOPP_ARM_PMULL_AVAILABLE)

void
GF2NT_233_Multiply_Reduce_ARMv8(const word* pA, const word* pB, word* pC)
{
    // word is either 32-bit or 64-bit, depending on the platform.
    // Load using a 32-bit pointer to avoid possible alignment issues.
    const uint32_t* pAA = reinterpret_cast<const uint32_t*>(pA);
    const uint32_t* pBB = reinterpret_cast<const uint32_t*>(pB);

    uint64x2_t a0 = vreinterpretq_u64_u32(vld1q_u32(pAA+0));
    uint64x2_t a1 = vreinterpretq_u64_u32(vld1q_u32(pAA+4));
    uint64x2_t b0 = vreinterpretq_u64_u32(vld1q_u32(pBB+0));
    uint64x2_t b1 = vreinterpretq_u64_u32(vld1q_u32(pBB+4));

    uint64x2_t c0, c1, c2, c3;
    F2N_Multiply_256x256_ARMv8(c3, c2, c1, c0, a1, a0, b1, b0);
    GF2NT_233_Reduce_ARMv8(c3, c2, c1, c0);

    uint32_t* pCC = reinterpret_cast<uint32_t*>(pC);
    vst1q_u32(pCC+0, vreinterpretq_u32_u64(c0));
    vst1q_u32(pCC+4, vreinterpretq_u32_u64(c1));
}

void
GF2NT_233_Square_Reduce_ARMv8(const word* pA, word* pC)
{
    // word is either 32-bit or 64-bit, depending on the platform.
    // Load using a 32-bit pointer to avoid possible alignment issues.
    const uint32_t* pAA = reinterpret_cast<const uint32_t*>(pA);
    uint64x2_t a0 = vreinterpretq_u64_u32(vld1q_u32(pAA+0));
    uint64x2_t a1 = vreinterpretq_u64_u32(vld1q_u32(pAA+4));

    uint64x2_t c0, c1, c2, c3;
    F2N_Square_256_ARMv8(c3, c2, c1, c0, a1, a0);
    GF2NT_233_Reduce_ARMv8(c3, c2, c1, c0);

    uint32_t* pCC = reinterpret_cast<uint32_t*>(pC);
    vst1q_u32(pCC+0, vreinterpretq_u32_u64(c0));
    vst1q_u32(pCC+4, vreinterpretq_u32_u64(c1));
}

#elif (CRYPTOPP_POWER8_VMULL_AVAILABLE) && 0

void
GF2NT_233_Multiply_Reduce_POWER8(const word* pA, const word* pB, word* pC)
{
    // word is either 32-bit or 64-bit, depending on the platform.
    // Load using a byte pointer to avoid possible alignment issues.
    const byte* pAA = reinterpret_cast<const byte*>(pA);
    const byte* pBB = reinterpret_cast<const byte*>(pB);

    uint64x2_p a0 = (uint64x2_p)VecLoad(pAA+0);
    uint64x2_p a1 = (uint64x2_p)VecLoad(pAA+16);
    uint64x2_p b0 = (uint64x2_p)VecLoad(pBB+0);
    uint64x2_p b1 = (uint64x2_p)VecLoad(pBB+16);

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8_t mb[] = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
    const uint8x16_p m = (uint8x16_p)VecLoad(mb);
    a0 = VecPermute(a0, m);
    a1 = VecPermute(a1, m);
    b0 = VecPermute(b0, m);
    b1 = VecPermute(b1, m);
#endif

    uint64x2_p c0, c1, c2, c3;
    F2N_Multiply_256x256_POWER8(c3, c2, c1, c0, a1, a0, b1, b0);
    GF2NT_233_Reduce_POWER8(c3, c2, c1, c0);

#if (CRYPTOPP_BIG_ENDIAN)
    c0 = VecPermute(c0, m);
    c1 = VecPermute(c1, m);
#endif

    byte* pCC = reinterpret_cast<byte*>(pC);
    VecStore(c0, pCC+0);
    VecStore(c1, pCC+16);
}

void
GF2NT_233_Square_Reduce_POWER8(const word* pA, word* pC)
{
    // word is either 32-bit or 64-bit, depending on the platform.
    // Load using a byte pointer to avoid possible alignment issues.
    const byte* pAA = reinterpret_cast<const byte*>(pA);
    uint64x2_p a0 = (uint64x2_p)VecLoad(pAA+0);
    uint64x2_p a1 = (uint64x2_p)VecLoad(pAA+16);

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8_t mb[] = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
    const uint8x16_p m = (uint8x16_p)VecLoad(mb);
    a0 = VecPermute(a0, m);
    a1 = VecPermute(a1, m);
#endif

    uint64x2_p c0, c1, c2, c3;
    F2N_Square_256_POWER8(c3, c2, c1, c0, a1, a0);
    GF2NT_233_Reduce_POWER8(c3, c2, c1, c0);

#if (CRYPTOPP_BIG_ENDIAN)
    c0 = VecPermute(c0, m);
    c1 = VecPermute(c1, m);
#endif

    byte* pCC = reinterpret_cast<byte*>(pC);
    VecStore(c0, pCC+0);
    VecStore(c1, pCC+16);
}

#endif

NAMESPACE_END

#endif  // CRYPTOPP_IMPORTS