1 /* This file is part of the gf2x library.
2 
3    Copyright 2010, 2012, 2013, 2015
4    Richard Brent, Pierrick Gaudry, Emmanuel Thome', Paul Zimmermann
5    Nicolas Estibals (for this file)
6 
7    This program is free software; you can redistribute it and/or modify it
8    under the terms of either:
9     - If the archive contains a file named toom-gpl.c (not a trivial
10     placeholder), the GNU General Public License as published by the Free
11     Software Foundation; either version 3 of the License, or (at your
12     option) any later version.
13     - If the archive contains a file named toom-gpl.c which is a trivial
14     placeholder, the GNU Lesser General Public License as published by
15     the Free Software Foundation; either version 2.1 of the License, or
16     (at your option) any later version.
17 
18    This program is distributed in the hope that it will be useful, but WITHOUT
19    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
20    FITNESS FOR A PARTICULAR PURPOSE.  See the license text for more details.
21 
22    You should have received a copy of the GNU General Public License as
23    well as the GNU Lesser General Public License along with this program;
24    see the files COPYING and COPYING.LIB.  If not, write to the Free
25    Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26    02110-1301, USA.
27 */
28 #ifndef GF2X_MUL7_H_
29 #define GF2X_MUL7_H_
30 
31 #include "gf2x.h"
32 /* All gf2x source files for lowlevel functions must include gf2x-small.h
33  * This is mandatory for the tuning mechanism. */
34 #include "gf2x/gf2x-small.h"
35 
36 #if GF2X_WORDSIZE != 64
37 #error "This code is for 64-bit only"
38 #endif
39 
40 #ifndef GF2X_HAVE_PCLMUL_SUPPORT
41 #error "This code needs pclmul support"
42 #endif
43 
44 /* TODO: if somebody comes up with a neat way to improve the interface so
45  * as to remove the false dependency on pclmul, that would be nice.
46  */
47 static inline __m128i
GF2X_FUNC(mul7cl_mul1)48 GF2X_FUNC(mul7cl_mul1) (unsigned long a, unsigned long b)
49 {
50     return _mm_clmulepi64_si128(_gf2x_mm_setr_epi64(a,0), _gf2x_mm_setr_epi64(b,0), 0);
51 }
52 
53 
54 /* variant with 22 multiplications */
55 GF2X_STORAGE_CLASS_mul7
gf2x_mul7(unsigned long * c,const unsigned long * a,const unsigned long * b)56 void gf2x_mul7 (unsigned long *c, const unsigned long *a, const unsigned long *b)
57 {
58 #define PXOR(lop, rop) _mm_xor_si128((lop), (rop))
59 #define PXOR3(op1, op2, op3) PXOR(op1, PXOR(op2, op3))
60 #define PXOR4(op1, op2, op3, op4) PXOR(op1, PXOR3(op2, op3, op4))
61 #define PXOR5(op1, op2, op3, op4, op5) PXOR(op1, PXOR4(op2, op3, op4, op5))
62 #define PXOR6(op1, op2, op3, op4, op5, op6) PXOR(op1, PXOR5(op2, op3, op4, op5, op6))
63 #define PZERO    _mm_setzero_si128()
64     /* Montgomery formulae with 22 multiplications, see
65      Five, Six, and Seven-Term {K}aratsuba-Like Formulae,
66      IEEE Transactions on Computers, volume 54, number 3, p. 362-369, 2005 */
67     unsigned long ta[5], tb[5], pa[22], pb[22];
68     ta[0] = a[0] ^ a[4];
69     ta[1] = a[3] ^ a[5];
70     ta[2] = a[2] ^ a[6];
71     ta[3] = a[1] ^ ta[0];
72     ta[4] = ta[1] ^ ta[2];
73     pa[0] = a[6];
74     pa[1] = a[5];
75     pa[2] = a[5] ^ a[6];
76     pa[3] = a[4];
77     pa[4] = a[4] ^ a[6];
78     pa[5] = a[3];
79     pa[6] = ta[1];
80     pa[7] = a[2];
81     pa[8] = ta[2];
82     pa[9] = a[1];
83     pa[10]= a[1] ^ a[3];
84     pa[11]= a[1] ^ a[2] ^ a[4] ^ a[5];
85     pa[12]= a[1] ^ ta[4];
86     pa[13]= a[0];
87     pa[14]= ta[0];
88     pa[15]= a[0] ^ a[2];
89     pa[16]= a[0] ^ ta[4];
90     pa[17]= a[3] ^ ta[0] ^ ta[2];
91     pa[18]= a[0] ^ a[1];
92     pa[19]= a[3] ^ a[6] ^ ta[3];
93     pa[20]= ta[1] ^ ta[3];
94     pa[21]= ta[3] ^ ta[4];
95 
96     tb[0] = b[0] ^ b[4];
97     tb[1] = b[3] ^ b[5];
98     tb[2] = b[2] ^ b[6];
99     tb[3] = b[1] ^ tb[0];
100     tb[4] = tb[1] ^ tb[2];
101     pb[0] = b[6];
102     pb[1] = b[5];
103     pb[2] = b[5] ^ b[6];
104     pb[3] = b[4];
105     pb[4] = b[4] ^ b[6];
106     pb[5] = b[3];
107     pb[6] = tb[1];
108     pb[7] = b[2];
109     pb[8] = tb[2];
110     pb[9] = b[1];
111     pb[10]= b[1] ^ b[3];
112     pb[11]= b[1] ^ b[2] ^ b[4] ^ b[5];
113     pb[12]= b[1] ^ tb[4];
114     pb[13]= b[0];
115     pb[14]= tb[0];
116     pb[15]= b[0] ^ b[2];
117     pb[16]= b[0] ^ tb[4];
118     pb[17]= b[3] ^ tb[0] ^ tb[2];
119     pb[18]= b[0] ^ b[1];
120     pb[19]= b[3] ^ b[6] ^ tb[3];
121     pb[20]= tb[1] ^ tb[3];
122     pb[21]= tb[3] ^ tb[4];
123 
124     __m128i p[22];
125 
126     p[0] = GF2X_FUNC(mul7cl_mul1)(pa[0], pb[0]);
127     p[1] = GF2X_FUNC(mul7cl_mul1)(pa[1], pb[1]);
128     p[2] = GF2X_FUNC(mul7cl_mul1)(pa[2], pb[2]);
129     p[3] = GF2X_FUNC(mul7cl_mul1)(pa[3], pb[3]);
130     p[4] = GF2X_FUNC(mul7cl_mul1)(pa[4], pb[4]);
131     p[5] = GF2X_FUNC(mul7cl_mul1)(pa[5], pb[5]);
132     p[6] = GF2X_FUNC(mul7cl_mul1)(pa[6], pb[6]);
133     p[7] = GF2X_FUNC(mul7cl_mul1)(pa[7], pb[7]);
134     p[8] = GF2X_FUNC(mul7cl_mul1)(pa[8], pb[8]);
135     p[9] = GF2X_FUNC(mul7cl_mul1)(pa[9], pb[9]);
136     p[10]= GF2X_FUNC(mul7cl_mul1)(pa[10], pb[10]);
137     p[11]= GF2X_FUNC(mul7cl_mul1)(pa[11], pb[11]);
138     p[12]= GF2X_FUNC(mul7cl_mul1)(pa[12], pb[12]);
139     p[13]= GF2X_FUNC(mul7cl_mul1)(pa[13], pb[13]);
140     p[14]= GF2X_FUNC(mul7cl_mul1)(pa[14], pb[14]);
141     p[15]= GF2X_FUNC(mul7cl_mul1)(pa[15], pb[15]);
142     p[16]= GF2X_FUNC(mul7cl_mul1)(pa[16], pb[16]);
143     p[17]= GF2X_FUNC(mul7cl_mul1)(pa[17], pb[17]);
144     p[18]= GF2X_FUNC(mul7cl_mul1)(pa[18], pb[18]);
145     p[19]= GF2X_FUNC(mul7cl_mul1)(pa[19], pb[19]);
146     p[20]= GF2X_FUNC(mul7cl_mul1)(pa[20], pb[20]);
147     p[21]= GF2X_FUNC(mul7cl_mul1)(pa[21], pb[21]);
148 
149     __m128i t[13];
150 
151     t[0]  = PXOR(p[0], p[1]);
152     t[1]  = PXOR(p[9], p[13]);
153     t[2]  = PXOR(p[3], p[6]);
154     t[3]  = PXOR(p[7], p[10]);
155     t[4]  = PXOR(p[11], p[18]);
156     t[5]  = PXOR(p[4], t[3]);
157     t[6]  = PXOR(p[15], t[2]);
158     t[7]  = PXOR(p[20], t[5]);
159     t[8]  = PXOR(p[5], p[14]);
160     t[9]  = PXOR(p[2], p[17]);
161     t[10] = PXOR(p[5], p[8]);
162     t[11] = PXOR(p[21], t[6]);
163     t[12] = PXOR(p[16], t[4]);
164 
165     __m128i cc[13];
166 
167     cc[0]  = p[13];
168     cc[2]  = PXOR3(p[7], p[15], t[1]);
169     cc[4]  = PXOR4(p[3], t[1], t[3], t[8]);
170     cc[6]  = PXOR5(p[0], p[12], p[13], t[7], t[11]);
171     cc[8]  = PXOR4(p[7], t[0], t[2], t[10]);
172     cc[10] = PXOR3(p[3], p[4], t[0]);
173     cc[12] = p[0];
174 
175     cc[1]  = PXOR(p[18], t[1]);
176     cc[3]  = PXOR3(t[7], t[9], t[12]);
177     cc[5]  = PXOR6(p[2], p[11], p[19], t[1], t[10], t[11]);
178     cc[7]  = PXOR5(p[21], t[0], t[5], t[8], t[12]);
179     cc[9]  = PXOR5(p[12], p[19], t[4], t[6], t[9]);
180     cc[11] = PXOR(p[2], t[0]);
181 
182     _mm_storeu_si128((__m128i*)(c),    PXOR(cc[0] ,                           _mm_slli_si128(cc[1], 8)));
183     _mm_storeu_si128((__m128i*)(c+2),  PXOR(cc[2] , PXOR(_mm_srli_si128(cc[1], 8), _mm_slli_si128(cc[3], 8))));
184     _mm_storeu_si128((__m128i*)(c+4),  PXOR(cc[4] , PXOR(_mm_srli_si128(cc[3], 8), _mm_slli_si128(cc[5], 8))));
185     _mm_storeu_si128((__m128i*)(c+6),  PXOR(cc[6] , PXOR(_mm_srli_si128(cc[5], 8), _mm_slli_si128(cc[7], 8))));
186     _mm_storeu_si128((__m128i*)(c+8),  PXOR(cc[8] , PXOR(_mm_srli_si128(cc[7], 8), _mm_slli_si128(cc[9], 8))));
187     _mm_storeu_si128((__m128i*)(c+10), PXOR(cc[10], PXOR(_mm_srli_si128(cc[9], 8), _mm_slli_si128(cc[11], 8))));
188     _mm_storeu_si128((__m128i*)(c+12), PXOR(cc[12],  _mm_srli_si128(cc[11], 8)));
189 #undef PXOR
190 #undef PXOR3
191 #undef PXOR4
192 #undef PXOR5
193 #undef PXOR6
194 #undef PZERO
195 }
196 
197 #endif  /* GF2X_MUL7_H_ */
198