1 /* This file is part of the gf2x library.
2 
3    Copyright 2010, 2012, 2013, 2015
4    Richard Brent, Pierrick Gaudry, Emmanuel Thome', Paul Zimmermann
5 
6    This program is free software; you can redistribute it and/or modify it
7    under the terms of either:
8     - If the archive contains a file named toom-gpl.c (not a trivial
9     placeholder), the GNU General Public License as published by the Free
10     Software Foundation; either version 3 of the License, or (at your
11     option) any later version.
12     - If the archive contains a file named toom-gpl.c which is a trivial
13     placeholder, the GNU Lesser General Public License as published by
14     the Free Software Foundation; either version 2.1 of the License, or
15     (at your option) any later version.
16 
17    This program is distributed in the hope that it will be useful, but WITHOUT
18    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19    FITNESS FOR A PARTICULAR PURPOSE.  See the license text for more details.
20 
21    You should have received a copy of the GNU General Public License as
22    well as the GNU Lesser General Public License along with this program;
23    see the files COPYING and COPYING.LIB.  If not, write to the Free
24    Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25    02110-1301, USA.
26 */
27 
28 #ifndef GF2X_MUL7_H_
29 #define GF2X_MUL7_H_
30 
31 #include "gf2x.h"
32 /* All gf2x source files for lowlevel functions must include gf2x-small.h
33  * This is mandatory for the tuning mechanism. */
34 #include "gf2x/gf2x-small.h"
35 
36 #if GF2X_WORDSIZE != 64
37 #error "This code is for 64-bit only"
38 #endif
39 
40 #ifndef GF2X_HAVE_PCLMUL_SUPPORT
41 #error "This code needs pclmul support"
42 #endif
43 
44 #define PXOR(lop, rop) _mm_xor_si128((lop), (rop))
45 #define PXOR3(op1, op2, op3) PXOR(op1, PXOR(op2, op3))
46 #define PZERO    _mm_setzero_si128()
47 
48 /* TODO: if somebody comes up with a neat way to improve the interface so
49  * as to remove the false dependency on pclmul, that would be nice.
50  */
GF2X_FUNC(mul7k3_mul2)51 static inline void GF2X_FUNC(mul7k3_mul2)(__m128i *t, __m128i ss1,
52 	       __m128i ss2)
53 {
54     __m128i t00 = _mm_clmulepi64_si128(ss1, ss2, 0);
55     __m128i t11 = _mm_clmulepi64_si128(ss1, ss2, 0x11);
56     ss1 = PXOR(ss1, _mm_shuffle_epi32(ss1, _MM_SHUFFLE(1, 0, 3, 2)));
57     ss2 = PXOR(ss2, _mm_shuffle_epi32(ss2, _MM_SHUFFLE(1, 0, 3, 2)));
58     __m128i tk = PXOR(PXOR(t00, t11), _mm_clmulepi64_si128(ss1, ss2, 0));
59     t[0] = PXOR(t00, _mm_unpacklo_epi64(PZERO, tk));
60     t[1] = PXOR(t11, _mm_unpackhi_epi64(tk, PZERO));
61 }
62 static inline void
GF2X_FUNC(mul7k3_mul2b)63 GF2X_FUNC(mul7k3_mul2b)(__m128i *t, __m128i ss1, __m128i ss2, const unsigned long * sc)
64 {
65     __m128i t00 = _mm_clmulepi64_si128(ss1, ss2, 0);
66     ss1 = PXOR(ss1, _mm_shuffle_epi32(ss1, _MM_SHUFFLE(1, 0, 3, 2)));
67     ss2 = PXOR(ss2, _mm_shuffle_epi32(ss2, _MM_SHUFFLE(1, 0, 3, 2)));
68     __m128i c = _mm_loadu_si128((__m128i*)sc);
69     __m128i tk = PXOR(PXOR(t00, c), _mm_clmulepi64_si128(ss1, ss2, 0));
70     t[0] = PXOR(t00, _mm_unpacklo_epi64(PZERO, tk));
71     t[1] = PXOR(c, _mm_unpackhi_epi64(tk, PZERO));
72 }
73 static inline void
GF2X_FUNC(mul7k3_mul2c)74 GF2X_FUNC(mul7k3_mul2c)(__m128i *t, __m128i ss1, __m128i ss2, unsigned long *sc)
75 {
76     __m128i t00 = _mm_clmulepi64_si128(ss1, ss2, 0);
77     __m128i t11 = _mm_clmulepi64_si128(ss1, ss2, 0x11);
78     ss1 = PXOR(ss1, _mm_shuffle_epi32(ss1, _MM_SHUFFLE(1, 0, 3, 2)));
79     ss2 = PXOR(ss2, _mm_shuffle_epi32(ss2, _MM_SHUFFLE(1, 0, 3, 2)));
80     __m128i tk = PXOR(PXOR(t00, t11), _mm_clmulepi64_si128(ss1, ss2, 0));
81     _mm_storeu_si128((__m128i*)sc, t11);
82     t[0] = PXOR(t00, _mm_unpacklo_epi64(PZERO, tk));
83     t[1] = PXOR(t11, _mm_unpackhi_epi64(tk, PZERO));
84 }
85 
86 /* specialized Karatsuba with 3 calls to mul2, i.e., 9 multiplications
87    {d,2} <- {a+3,1} * {b+3,1} */
88 GF2X_STORAGE_CLASS_mul4
GF2X_FUNC(mul7k3_mul4c)89 void GF2X_FUNC(mul7k3_mul4c) (unsigned long *c, const unsigned long *a, const unsigned long *b, unsigned long *d)
90 {
91   __m128i ab[2];
92   __m128i lo[2], hi[2];
93   __m128i a0 = _mm_loadu_si128((__m128i*)a);
94   __m128i a2 = _mm_loadu_si128((__m128i*)(a+2));
95   __m128i b0 = _mm_loadu_si128((__m128i*)b);
96   __m128i b2 = _mm_loadu_si128((__m128i*)(b+2));
97   GF2X_FUNC(mul7k3_mul2)(lo, a0, b0);
98   GF2X_FUNC(mul7k3_mul2c)(hi, a2, b2, d);
99   __m128i middle = PXOR(lo[1], hi[0]);
100   GF2X_FUNC(mul7k3_mul2)(ab, PXOR(a0, a2), PXOR(b0, b2));
101   _mm_storeu_si128((__m128i*)(c + 0), lo[0]);
102   _mm_storeu_si128((__m128i*)(c + 2), PXOR3(ab[0], lo[0], middle));
103   _mm_storeu_si128((__m128i*)(c + 4), PXOR3(ab[1], hi[1], middle));
104   _mm_storeu_si128((__m128i*)(c + 6), hi[1]);
105 }
106 
107 /* specialized Karatsuba with 3 calls to mul2, i.e., 9 multiplications,
108    assume {d,2} = {a+3,1} * {b+3,1} */
109 GF2X_STORAGE_CLASS_mul4
GF2X_FUNC(mul7k3_mul4b)110 void GF2X_FUNC(mul7k3_mul4b) (unsigned long *c, const unsigned long *a, const unsigned long *b, unsigned long *d)
111 {
112   __m128i ab[2];
113   __m128i lo[2], hi[2];
114   __m128i a0 = _mm_loadu_si128((__m128i*)a);
115   __m128i a2 = _mm_loadu_si128((__m128i*)(a+2));
116   __m128i b0 = _mm_loadu_si128((__m128i*)b);
117   __m128i b2 = _mm_loadu_si128((__m128i*)(b+2));
118   GF2X_FUNC(mul7k3_mul2)(lo, a0, b0);
119   GF2X_FUNC(mul7k3_mul2b)(hi, a2, b2, d);
120   __m128i middle = PXOR(lo[1], hi[0]);
121   GF2X_FUNC(mul7k3_mul2)(ab, PXOR(a0, a2), PXOR(b0, b2));
122   _mm_storeu_si128((__m128i*)(c + 0), lo[0]);
123   _mm_storeu_si128((__m128i*)(c + 2), PXOR3(ab[0], lo[0], middle));
124   _mm_storeu_si128((__m128i*)(c + 4), PXOR3(ab[1], hi[1], middle));
125   _mm_storeu_si128((__m128i*)(c + 6), hi[1]);
126 }
127 
128 #undef PXOR
129 #undef PXOR3
130 #undef PZERO
131 
132 /* based on mul7k.c, version with M(3)+2M(4)-1=23 multiplications */
133 GF2X_STORAGE_CLASS_mul7
gf2x_mul7(unsigned long * c,const unsigned long * a,const unsigned long * b)134 void gf2x_mul7 (unsigned long *c, const unsigned long *a, const unsigned long *b)
135 {
136     unsigned long aa[4], bb[4], ab[8], ab4, ab5, ab6, ab7, d[2];
137 
138     gf2x_mul3 (c+8, a+4, b+4);
139     GF2X_FUNC(mul7k3_mul4c) (c, a, b, d);
140     aa[0] = a[0] ^ a[4];
141     aa[1] = a[1] ^ a[5];
142     aa[2] = a[2] ^ a[6];
143     aa[3] = a[3];
144     bb[0] = b[0] ^ b[4];
145     bb[1] = b[1] ^ b[5];
146     bb[2] = b[2] ^ b[6];
147     bb[3] = b[3];
148     GF2X_FUNC(mul7k3_mul4b) (ab, aa, bb, d);
149     ab4 = ab[4] ^ c[4];
150     ab5 = ab[5] ^ c[5];
151     ab6 = ab[6] ^ c[6];
152     ab7 = ab[7] ^ c[7];
153     c[4] ^= ab[0] ^ c[0] ^ c[8];
154     c[5] ^= ab[1] ^ c[1] ^ c[9];
155     c[6] ^= ab[2] ^ c[2] ^ c[10];
156     c[7] ^= ab[3] ^ c[3] ^ c[11];
157     c[8] ^= ab4 ^ c[12];
158     c[9] ^= ab5 ^ c[13];
159     c[10] ^= ab6;
160     c[11] ^= ab7;
161 }
162 
163 #endif  /* GF2X_MUL7_H_ */
164