1 /* This file is part of the gf2x library.
2 
3    Copyright 2010, 2012, 2013, 2015
4    Richard Brent, Pierrick Gaudry, Emmanuel Thome', Paul Zimmermann
5 
6    This program is free software; you can redistribute it and/or modify it
7    under the terms of either:
8     - If the archive contains a file named toom-gpl.c (not a trivial
9     placeholder), the GNU General Public License as published by the Free
10     Software Foundation; either version 3 of the License, or (at your
11     option) any later version.
12     - If the archive contains a file named toom-gpl.c which is a trivial
13     placeholder, the GNU Lesser General Public License as published by
14     the Free Software Foundation; either version 2.1 of the License, or
15     (at your option) any later version.
16 
17    This program is distributed in the hope that it will be useful, but WITHOUT
18    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19    FITNESS FOR A PARTICULAR PURPOSE.  See the license text for more details.
20 
21    You should have received a copy of the GNU General Public License as
22    well as the GNU Lesser General Public License along with this program;
23    see the files COPYING and COPYING.LIB.  If not, write to the Free
24    Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25    02110-1301, USA.
26 */
27 
28 #ifndef GF2X_MUL9_H_
29 #define GF2X_MUL9_H_
30 
31 #include "gf2x.h"
32 /* All gf2x source files for lowlevel functions must include gf2x-small.h
33  * This is mandatory for the tuning mechanism. */
34 #include "gf2x/gf2x-small.h"
35 
36 #if GF2X_WORDSIZE != 64
37 #error "This code is for 64-bit only"
38 #endif
39 
40 #ifndef GF2X_HAVE_PCLMUL_SUPPORT
41 #error "This code needs pclmul support"
42 #endif
43 
44 /* TODO: if somebody comes up with a neat way to improve the interface so
45  * as to remove the false dependency on pclmul, that would be nice.
46  */
47 static inline __m128i
GF2X_FUNC(mul9clk2_mul1)48 GF2X_FUNC(mul9clk2_mul1) (unsigned long a, unsigned long b)
49 {
50     __m128i aa = _gf2x_mm_setr_epi64(a, 0);
51     __m128i bb = _gf2x_mm_setr_epi64(b, 0);
52     return _mm_clmulepi64_si128(aa, bb, 0);
53 }
54 
55 #define PXOR(lop, rop) _mm_xor_si128((lop), (rop))
56 #define PXOR3(op1, op2, op3) PXOR(op1, PXOR(op2, op3))
57 #define PXOR4(op1, op2, op3, op4) PXOR(op1, PXOR3(op2, op3, op4))
58 #define PXOR5(op1, op2, op3, op4, op5) PXOR(op1, PXOR4(op2, op3, op4, op5))
59 #define PXOR6(op1, op2, op3, op4, op5, op6) PXOR(op1, PXOR5(op2, op3, op4, op5, op6))
60 #define PXOR7(op1, op2, op3, op4, op5, op6, op7) PXOR(op1, PXOR6(op2, op3, op4, op5, op6, op7))
61 #define PZERO    _mm_setzero_si128()
62 
63 /* uses the variant of Karatsuba with 6 multiplications */
GF2X_FUNC(mul9clk2_mul3)64 static void GF2X_FUNC(mul9clk2_mul3) (__m128i *ce, __m128i *co, const unsigned long *a, const unsigned long *b)
65 {
66   unsigned long aa[3], bb[3];
67   __m128i p0, p1, p2;
68   __m128i pp0, pp1, pp2;
69   aa[0] = a[1]^a[2];
70   aa[1] = a[0]^a[2];
71   aa[2] = a[0]^a[1];
72   bb[0] = b[1]^b[2];
73   bb[1] = b[0]^b[2];
74   bb[2] = b[0]^b[1];
75   p0  = GF2X_FUNC(mul9clk2_mul1)(a[0], b[0]);
76   p1  = GF2X_FUNC(mul9clk2_mul1)(a[1], b[1]);
77   p2  = GF2X_FUNC(mul9clk2_mul1)(a[2], b[2]);
78   pp0 = GF2X_FUNC(mul9clk2_mul1)(aa[0], bb[0]);
79   pp1 = GF2X_FUNC(mul9clk2_mul1)(aa[1], bb[1]);
80   pp2 = GF2X_FUNC(mul9clk2_mul1)(aa[2], bb[2]);
81 
82   ce[0] = p0;
83   ce[1] = PXOR4(p0, p1, p2, pp1);
84   ce[2] = p2;
85 
86   co[0] = PXOR3(p0, p1, pp2);
87   co[1] = PXOR3(pp0, p1, p2);
88 }
89 
90 /* recursive application of the Karatsuba-3 algorithm with 6 multiplies,
91    i.e., with a total of 6*6=36 multiplies */
92 GF2X_STORAGE_CLASS_mul9
gf2x_mul9(unsigned long * c,const unsigned long * a,const unsigned long * b)93 void gf2x_mul9 (unsigned long *c, const unsigned long *a, const unsigned long *b)
94 {
95   unsigned long aa[9], bb[9];
96 
97   /*
98    * a = al(t) + t^3 * am(t) + t^6 * ah(t)
99    * b = bl(t) + t^3 * bm(t) + t^6 * bh(t)
100    *
101    * aa0 = am + ah
102    * aa1 = al + ah
103    * aa2 = al + am
104    *
105    * p0e + z^64 * p0o = al * bl
106    * p1e + z^64 * p1o = am * bm
107    * p2e + z^64 * p2o = ah * bh
108    *
109    * aa0 * bb0   = (am*bm + ah*bh + am*bh + ah*bm)
110    * aa1 * bb1   = (al*bl + ah*bh + al*bh + ah*bl)
111    * aa2 * bb2   = (al*bl + am*bm + al*bm + am*bl)
112    *
113    * a*b =  al*bl * t^0
114    *      + (aa2*bb2 + al*bl + am*bm) * t^3
115    *      + (am*bm + al*bl + ah*bh + aa1 * bb1) * t^6
116    *      + (aa0*bb0 + ah*bh + am*bm) * t^9
117    *      + ah*bh * t^12
118    */
119   aa[0] = a[3]^a[6];
120   aa[1] = a[4]^a[7];
121   aa[2] = a[5]^a[8];
122 
123   aa[3] = a[0]^a[6];
124   aa[4] = a[1]^a[7];
125   aa[5] = a[2]^a[8];
126 
127   aa[6] = a[0]^a[3];
128   aa[7] = a[1]^a[4];
129   aa[8] = a[2]^a[5];
130 
131   bb[0] = b[3]^b[6];
132   bb[1] = b[4]^b[7];
133   bb[2] = b[5]^b[8];
134 
135   bb[3] = b[0]^b[6];
136   bb[4] = b[1]^b[7];
137   bb[5] = b[2]^b[8];
138 
139   bb[6] = b[0]^b[3];
140   bb[7] = b[1]^b[4];
141   bb[8] = b[2]^b[5];
142 
143 
144   __m128i p0e[3], p0o[2];
145   __m128i p1e[3], p1o[2];
146   __m128i p2e[3], p2o[2];
147   __m128i q0e[3], q0o[2];
148   __m128i q1e[3], q1o[2];
149   __m128i q2e[3], q2o[2];
150   GF2X_FUNC(mul9clk2_mul3) (p0e, p0o, a+0, b+0);
151   GF2X_FUNC(mul9clk2_mul3) (p1e, p1o, a+3, b+3);
152   GF2X_FUNC(mul9clk2_mul3) (p2e, p2o, a+6, b+6);
153   GF2X_FUNC(mul9clk2_mul3) (q0e, q0o, aa+0, bb+0);
154   GF2X_FUNC(mul9clk2_mul3) (q1e, q1o, aa+3, bb+3);
155   GF2X_FUNC(mul9clk2_mul3) (q2e, q2o, aa+6, bb+6);
156 
157   __m128i e,h,l;
158   e = p0e[0];
159   l = p0o[0];
160   _mm_storeu_si128((__m128i*)(c), PXOR(e, _mm_slli_si128(l, 8)));
161 
162   e = p0e[1];                      h=l; l= PXOR4(p0o[1], p1e[0], q2e[0], p0e[0]);
163   _mm_storeu_si128((__m128i*)(c+2), PXOR3(e, _mm_slli_si128(l, 8), _mm_srli_si128(h, 8)));
164 
165   e = PXOR4(p0e[2], p0o[0], p1o[0], q2o[0]); h=l; l= PXOR3(p1e[1], q2e[1], p0e[1]);
166   _mm_storeu_si128((__m128i*)(c+4), PXOR3(e, _mm_slli_si128(l, 8), _mm_srli_si128(h, 8)));
167 
168   e = PXOR7(p0e[0], q1e[0], p2e[0], p1e[0], p0o[1], p1o[1], q2o[1]); h=l; l= PXOR7(p0o[0], q1o[0], p2o[0], p1o[0], p1e[2], q2e[2], p0e[2]);
169   _mm_storeu_si128((__m128i*)(c+6), PXOR3(e, _mm_slli_si128(l, 8), _mm_srli_si128(h, 8)));
170 
171   e = PXOR4(p0e[1], q1e[1], p2e[1], p1e[1]); h=l; l= PXOR7(p0o[1], q1o[1], p2o[1], p1o[1], q0e[0], p1e[0], p2e[0]);
172   _mm_storeu_si128((__m128i*)(c+8), PXOR3(e, _mm_slli_si128(l, 8), _mm_srli_si128(h, 8)));
173 
174   e = PXOR7(p0e[2], q1e[2], p2e[2], p1e[2], q0o[0], p1o[0], p2o[0]); h=l; l= PXOR3(q0e[1], p1e[1], p2e[1]);
175   _mm_storeu_si128((__m128i*)(c+10), PXOR3(e, _mm_slli_si128(l, 8), _mm_srli_si128(h, 8)));
176 
177   e = PXOR4(p2e[0], q0o[1], p1o[1], p2o[1]); h=l; l= PXOR4(p2o[0], q0e[2], p1e[2], p2e[2]);
178   _mm_storeu_si128((__m128i*)(c+12), PXOR3(e, _mm_slli_si128(l, 8), _mm_srli_si128(h, 8)));
179 
180   e = p2e[1]; h=l; l=p2o[1];
181   _mm_storeu_si128((__m128i*)(c+14), PXOR3(e, _mm_slli_si128(l, 8), _mm_srli_si128(h, 8)));
182 
183   e = p2e[2]; h=l;
184   _mm_storeu_si128((__m128i*)(c+16), PXOR(e, _mm_srli_si128(h, 8)));
185 
186   /*
187    * a*b =  al*bl * t^0
188    *      + (aa2*bb2 + al*bl + am*bm) * t^3
189    *      + (am*bm + al*bl + ah*bh + aa1 * bb1) * t^6
190    *      + (aa0*bb0 + ah*bh + am*bm) * t^9
191    *      + ah*bh * t^12
192    *
193    * this is the 64-bit breakdown
194   c[0]  =                   p0e[0]                   ^                                           ;
195   c[1]  =                   p0e[1]                   ^                   p0o[0]                  ;
196   c[2]  =                   p0e[2]                   ^                   p0o[1]                  ;
197   c[3]  =                   p0e[3]                   ^                   p0o[2]p1e[0]q2e[0]p0e[0];
198   c[4]  =                   p0e[4]p0o[0]p1o[0]q2o[0] ^                   p0o[3]p1e[1]q2e[1]p0e[1];
199   c[5]  =                   p0e[5]p0o[1]p1o[1]q2o[1] ^                         p1e[2]q2e[2]p0e[2];
200   c[6]  = p0e[0]q1e[0]p2e[0]p1e[0]p0o[2]p1o[2]q2o[2] ^                         p1e[3]q2e[3]p0e[3];
201   c[7]  = p0e[1]q1e[1]p2e[1]p1e[1]p0o[3]p1o[3]q2o[3] ^ p0o[0]q1o[0]p2o[0]p1o[0]p1e[4]q2e[4]p0e[4];
202   c[8]  = p0e[2]q1e[2]p2e[2]p1e[2]                   ^ p0o[1]q1o[1]p2o[1]p1o[1]p1e[5]q2e[5]p0e[5];
203   c[9]  = p0e[3]q1e[3]p2e[3]p1e[3]                   ^ p0o[2]q1o[2]p2o[2]p1o[2]q0e[0]p1e[0]p2e[0];
204   c[10] = p0e[4]q1e[4]p2e[4]p1e[4]q0o[0]p1o[0]p2o[0] ^ p0o[3]q1o[3]p2o[3]p1o[3]q0e[1]p1e[1]p2e[1];
205   c[11] = p0e[5]q1e[5]p2e[5]p1e[5]q0o[1]p1o[1]p2o[1] ^                         q0e[2]p1e[2]p2e[2];
206   c[12] = p2e[0]                  q0o[2]p1o[2]p2o[2] ^                         q0e[3]p1e[3]p2e[3];
207   c[13] = p2e[1]                  q0o[3]p1o[3]p2o[3] ^                   p2o[0]q0e[4]p1e[4]p2e[4];
208   c[14] = p2e[2]                                     ^                   p2o[1]q0e[5]p1e[5]p2e[5];
209   c[15] = p2e[3]                                     ^                   p2o[2]                  ;
210   c[16] = p2e[4]                                     ^                   p2o[3]                  ;
211   c[17] = p2e[5]                                     ^                                           ;
212   */
213 }
214 
215 #undef PXOR
216 #undef PXOR3
217 #undef PXOR4
218 #undef PXOR5
219 #undef PXOR6
220 #undef PXOR7
221 #undef PZERO
222 #endif  /* GF2X_MUL9_H_ */
223