1 /* This file is part of the gf2x library.
2 
3    Copyright 2010, 2012, 2013, 2015
4    Richard Brent, Pierrick Gaudry, Emmanuel Thome', Paul Zimmermann
5 
6    This program is free software; you can redistribute it and/or modify it
7    under the terms of either:
8     - If the archive contains a file named toom-gpl.c (not a trivial
9     placeholder), the GNU General Public License as published by the Free
10     Software Foundation; either version 3 of the License, or (at your
11     option) any later version.
12     - If the archive contains a file named toom-gpl.c which is a trivial
13     placeholder, the GNU Lesser General Public License as published by
14     the Free Software Foundation; either version 2.1 of the License, or
15     (at your option) any later version.
16 
17    This program is distributed in the hope that it will be useful, but WITHOUT
18    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19    FITNESS FOR A PARTICULAR PURPOSE.  See the license text for more details.
20 
21    You should have received a copy of the GNU General Public License as
22    well as the GNU Lesser General Public License along with this program;
23    see the files COPYING and COPYING.LIB.  If not, write to the Free
24    Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25    02110-1301, USA.
26 */
27 
28 #ifndef GF2X_MUL9_H_
29 #define GF2X_MUL9_H_
30 
31 #include "gf2x.h"
32 /* All gf2x source files for lowlevel functions must include gf2x-small.h
33  * This is mandatory for the tuning mechanism. */
34 #include "gf2x/gf2x-small.h"
35 
36 #if GF2X_WORDSIZE != 64
37 #error "This code is for 64-bit only"
38 #endif
39 
40 #ifndef GF2X_HAVE_PCLMUL_SUPPORT
41 #error "This code needs pclmul support"
42 #endif
43 
44 /* TODO: if somebody comes up with a neat way to improve the interface so
45  * as to remove the false dependency on pclmul, that would be nice.
46  */
47 static inline __m128i
GF2X_FUNC(mul9k3_mul1)48 GF2X_FUNC(mul9k3_mul1) (unsigned long a, unsigned long b)
49 {
50     __m128i aa = _gf2x_mm_setr_epi64(a, 0);
51     __m128i bb = _gf2x_mm_setr_epi64(b, 0);
52     return _mm_clmulepi64_si128(aa, bb, 0);
53 }
54 
55 #define PXOR(lop, rop) _mm_xor_si128((lop), (rop))
56 #define PXOR3(op1, op2, op3) PXOR(op1, PXOR(op2, op3))
57 #define PXOR4(op1, op2, op3, op4) PXOR(op1, PXOR3(op2, op3, op4))
58 #define PXOR5(op1, op2, op3, op4, op5) PXOR(op1, PXOR4(op2, op3, op4, op5))
59 #define PZERO    _mm_setzero_si128()
60 
61 /* same as mul5, but stores a[4]*b[4] into {d,2} */
62 GF2X_STORAGE_CLASS_mul5
GF2X_FUNC(mul9k3_mul5c)63 void GF2X_FUNC(mul9k3_mul5c) (unsigned long *c, const unsigned long *a,
64                         const unsigned long *b, unsigned long *d)
65 {
66   /* Montgomery formulae with 13 multiplications */
67   unsigned long ta[3], tb[3], pa[8], pb[8];
68   __m128i p0, p2, p4, p6, p8, p10, p12, p14, p16, p18, p20, p22, p24;
69   __m128i t0, t2, t4, t6, t8, t10, t12;
70   ta[0] = a[0]  ^ a[4]         ; tb[0] = b[0]  ^ b[4];
71   ta[1] = a[1]  ^ a[2]         ; tb[1] = b[1]  ^ b[2];
72   ta[2] = a[3]  ^ ta[0]        ; tb[2] = b[3]  ^ tb[0];
73   pa[0] = ta[1] ^ ta[2]        ; pb[0] = tb[1] ^ tb[2];
74   pa[1] = a[2]  ^ ta[2]        ; pb[1] = b[2]  ^ tb[2];
75   pa[2] = ta[0] ^ ta[1]        ; pb[2] = tb[0] ^ tb[1];
76   pa[3] = a[1]  ^ ta[2]        ; pb[3] = b[1]  ^ tb[2];
77   pa[4] = a[0]  ^ a[2]  ^ a[3] ; pb[4] = b[0]  ^ b[2]  ^ b[3];
78   pa[5] = a[4]  ^ ta[1]        ; pb[5] = b[4]  ^ tb[1];
79   pa[6] = a[3]  ^ a[4]         ; pb[6] = b[3]  ^ b[4];
80   pa[7] = a[0]  ^ a[1]         ; pb[7] = b[0]  ^ b[1];
81   p0  = GF2X_FUNC(mul9k3_mul1)(pa[0], pb[0]);
82   p2  = GF2X_FUNC(mul9k3_mul1)(pa[1], pb[1]);
83   p4  = GF2X_FUNC(mul9k3_mul1)(pa[2], pb[2]);
84   p6  = GF2X_FUNC(mul9k3_mul1)(pa[3], pb[3]);
85   p8  = GF2X_FUNC(mul9k3_mul1)(pa[4], pb[4]);
86   p10 = GF2X_FUNC(mul9k3_mul1)(pa[5], pb[5]);
87   p12 = GF2X_FUNC(mul9k3_mul1)(pa[6], pb[6]);
88   p14 = GF2X_FUNC(mul9k3_mul1)(pa[7], pb[7]);
89   p16 = GF2X_FUNC(mul9k3_mul1)(ta[0], tb[0]);
90   p18 = GF2X_FUNC(mul9k3_mul1)(a[4],  b[4]);
91   p20 = GF2X_FUNC(mul9k3_mul1)(a[3],  b[3]);
92   p22 = GF2X_FUNC(mul9k3_mul1)(a[1],  b[1]);
93   p24 = GF2X_FUNC(mul9k3_mul1)(a[0],  b[0]);
94   t0  = PXOR(p14, p24);
95   t2  = PXOR(p12, p18);
96   t4  = PXOR(p2, p16);
97   t6  = PXOR(p0, p6);
98   t8  = PXOR(p4, p16);
99   t10 = PXOR(p10, t0);
100   t12 = PXOR(p8, t2);
101 
102   __m128i ce0 = p24;
103   __m128i ce2 = PXOR3(p18, t8, t10);
104   __m128i ce4 = PXOR5(p0, p20, p22, t10, t12);
105   __m128i ce6 = PXOR3(p24, t4, t12);
106   __m128i ce8 = p18;
107 
108   __m128i co1 = PXOR(p22, t0);
109   __m128i co3 = PXOR3(t2, t4, t6);
110   __m128i co5 = PXOR3(t0, t6, t8);
111   __m128i co7 = PXOR(p20, t2);
112 
113   _mm_storeu_si128((__m128i*)(d),   p18); /* a[4] * b[4] */
114   _mm_storeu_si128((__m128i*)(c),   PXOR(ce0, _mm_slli_si128(co1, 8)));
115   _mm_storeu_si128((__m128i*)(c+2), PXOR3(ce2 ,  _mm_srli_si128(co1, 8) ,  _mm_slli_si128(co3, 8)));
116   _mm_storeu_si128((__m128i*)(c+4), PXOR3(ce4 ,  _mm_srli_si128(co3, 8) ,  _mm_slli_si128(co5, 8)));
117   _mm_storeu_si128((__m128i*)(c+6), PXOR3(ce6 ,  _mm_srli_si128(co5, 8) ,  _mm_slli_si128(co7, 8)));
118   _mm_storeu_si128((__m128i*)(c+8), PXOR(ce8 ,  _mm_srli_si128(co7, 8)));
119 }
120 
121 /* same as mul5, but assumes {d,2} contains a[4]*b[4] */
122 GF2X_STORAGE_CLASS_mul5
GF2X_FUNC(mul9k3_mul5b)123 void GF2X_FUNC(mul9k3_mul5b) (unsigned long *c, const unsigned long *a,
124                         const unsigned long *b, const unsigned long *d)
125 {
126   /* Montgomery formulae with 13 multiplications */
127   unsigned long ta[3], tb[3], pa[8], pb[8];
128   __m128i p0, p2, p4, p6, p8, p10, p12, p14, p16, p18, p20, p22, p24;
129   __m128i t0, t2, t4, t6, t8, t10, t12;
130 
131   ta[0] = a[0]  ^ a[4]         ; tb[0] = b[0]  ^ b[4];
132   ta[1] = a[1]  ^ a[2]         ; tb[1] = b[1]  ^ b[2];
133   ta[2] = a[3]  ^ ta[0]        ; tb[2] = b[3]  ^ tb[0];
134   pa[0] = ta[1] ^ ta[2]        ; pb[0] = tb[1] ^ tb[2];
135   pa[1] = a[2]  ^ ta[2]        ; pb[1] = b[2]  ^ tb[2];
136   pa[2] = ta[0] ^ ta[1]        ; pb[2] = tb[0] ^ tb[1];
137   pa[3] = a[1]  ^ ta[2]        ; pb[3] = b[1]  ^ tb[2];
138   pa[4] = a[0]  ^ a[2]  ^ a[3] ; pb[4] = b[0]  ^ b[2]  ^ b[3];
139   pa[5] = a[4]  ^ ta[1]        ; pb[5] = b[4]  ^ tb[1];
140   pa[6] = a[3]  ^ a[4]         ; pb[6] = b[3]  ^ b[4];
141   pa[7] = a[0]  ^ a[1]         ; pb[7] = b[0]  ^ b[1];
142   p0  = GF2X_FUNC(mul9k3_mul1)(pa[0], pb[0]);
143   p2  = GF2X_FUNC(mul9k3_mul1)(pa[1], pb[1]);
144   p4  = GF2X_FUNC(mul9k3_mul1)(pa[2], pb[2]);
145   p6  = GF2X_FUNC(mul9k3_mul1)(pa[3], pb[3]);
146   p8  = GF2X_FUNC(mul9k3_mul1)(pa[4], pb[4]);
147   p10 = GF2X_FUNC(mul9k3_mul1)(pa[5], pb[5]);
148   p12 = GF2X_FUNC(mul9k3_mul1)(pa[6], pb[6]);
149   p14 = GF2X_FUNC(mul9k3_mul1)(pa[7], pb[7]);
150   p16 = GF2X_FUNC(mul9k3_mul1)(ta[0], tb[0]);
151   /* p18 = GF2X_FUNC(mul9k3_mul1)(a[4],  b[4]); */
152   p18 = _mm_loadu_si128((__m128i *) d);
153   p20 = GF2X_FUNC(mul9k3_mul1)(a[3],  b[3]);
154   p22 = GF2X_FUNC(mul9k3_mul1)(a[1],  b[1]);
155   p24 = GF2X_FUNC(mul9k3_mul1)(a[0],  b[0]);
156   t0  = PXOR(p14, p24);
157   t2  = PXOR(p12, p18);
158   t4  = PXOR(p2, p16);
159   t6  = PXOR(p0, p6);
160   t8  = PXOR(p4, p16);
161   t10 = PXOR(p10, t0);
162   t12 = PXOR(p8, t2);
163 
164   __m128i ce0 = p24;
165   __m128i ce2 = PXOR3(p18, t8, t10);
166   __m128i ce4 = PXOR5(p0, p20, p22, t10, t12);
167   __m128i ce6 = PXOR3(p24, t4, t12);
168   __m128i ce8 = p18;
169 
170   __m128i co1 = PXOR(p22, t0);
171   __m128i co3 = PXOR3(t2, t4, t6);
172   __m128i co5 = PXOR3(t0, t6, t8);
173   __m128i co7 = PXOR(p20, t2);
174 
175   _mm_storeu_si128((__m128i*)(c),   PXOR(ce0, _mm_slli_si128(co1, 8)));
176   _mm_storeu_si128((__m128i*)(c+2), PXOR3(ce2, _mm_srli_si128(co1, 8), _mm_slli_si128(co3, 8)));
177   _mm_storeu_si128((__m128i*)(c+4), PXOR3(ce4, _mm_srli_si128(co3, 8), _mm_slli_si128(co5, 8)));
178   _mm_storeu_si128((__m128i*)(c+6), PXOR3(ce6, _mm_srli_si128(co5, 8), _mm_slli_si128(co7, 8)));
179   _mm_storeu_si128((__m128i*)(c+8), PXOR(ce8, _mm_srli_si128(co7, 8)));
180 }
181 
182 /* (based on mul9k.c)
183    1 call to mul4 and 2 calls to mul5 minus one multiplication, i.e.,
184    34 multiplications with mul5clk_c */
185 GF2X_STORAGE_CLASS_mul9
gf2x_mul9(unsigned long * c,const unsigned long * a,const unsigned long * b)186 void gf2x_mul9 (unsigned long *c, const unsigned long *a, const unsigned long *b)
187 {
188   unsigned long aa[5], bb[5], ab[10], ab5, ab6, ab7, ab8, ab9, d[2];
189 
190     gf2x_mul4 (c+10, a+5, b+5);
191     GF2X_FUNC(mul9k3_mul5c) (c, a, b, d); /* a[4]*b[4] is cached in d */
192     aa[0] = a[0] ^ a[5];
193     aa[1] = a[1] ^ a[6];
194     aa[2] = a[2] ^ a[7];
195     aa[3] = a[3] ^ a[8];
196     aa[4] = a[4];
197     bb[0] = b[0] ^ b[5];
198     bb[1] = b[1] ^ b[6];
199     bb[2] = b[2] ^ b[7];
200     bb[3] = b[3] ^ b[8];
201     bb[4] = b[4];
202     GF2X_FUNC(mul9k3_mul5b) (ab, aa, bb, d);
203     ab5 = ab[5] ^ c[5];
204     ab6 = ab[6] ^ c[6];
205     ab7 = ab[7] ^ c[7];
206     ab8 = ab[8] ^ c[8];
207     ab9 = ab[9] ^ c[9];
208     c[5] ^= ab[0] ^ c[0] ^ c[10];
209     c[6] ^= ab[1] ^ c[1] ^ c[11];
210     c[7] ^= ab[2] ^ c[2] ^ c[12];
211     c[8] ^= ab[3] ^ c[3] ^ c[13];
212     c[9] ^= ab[4] ^ c[4] ^ c[14];
213     c[10] ^= ab5 ^ c[15];
214     c[11] ^= ab6 ^ c[16];
215     c[12] ^= ab7 ^ c[17];
216     c[13] ^= ab8;
217     c[14] ^= ab9;
218 }
219 
220 
221 #undef PXOR
222 #undef PXOR3
223 #undef PXOR4
224 #undef PXOR5
225 #undef PZERO
226 #endif  /* GF2X_MUL9_H_ */
227