/* This file is part of the gf2x library.

   Copyright 2007, 2008, 2009, 2010, 2013, 2015
   Richard Brent, Pierrick Gaudry, Emmanuel Thome', Paul Zimmermann

   This program is free software; you can redistribute it and/or modify it
   under the terms of either:
    - If the archive contains a file named toom-gpl.c (not a trivial
    placeholder), the GNU General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.
    - If the archive contains a file named toom-gpl.c which is a trivial
    placeholder, the GNU Lesser General Public License as published by
    the Free Software Foundation; either version 2.1 of the License, or
    (at your option) any later version.

   This program is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE.  See the license text for more details.

   You should have received a copy of the GNU General Public License as
   well as the GNU Lesser General Public License along with this program;
   see the files COPYING and COPYING.LIB.  If not, write to the Free
   Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
   02110-1301, USA.
*/

/* Implements 128x128 -> 256 bit product using SSE2 instructions. */
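
/* Usage sketch (illustrative only; the variable names below are not part of
 * this file): with GF2X_WORDSIZE == 32, each operand is 4 words (128 bits)
 * and the product fills 8 words (256 bits).
 *
 *     unsigned long a[4], b[4], c[8];
 *     // ... fill a and b with the two 128-bit operands ...
 *     gf2x_mul4(c, a, b);   // c[0..7] receives the full 256-bit product
 */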

#ifndef GF2X_MUL4_H_
#define GF2X_MUL4_H_

#include "gf2x.h"
/* All gf2x source files for lowlevel functions must include gf2x-small.h
 * This is mandatory for the tuning mechanism. */
#include "gf2x/gf2x-small.h"

#if GF2X_WORDSIZE != 32
#error "This code is for 32-bit only"
#endif

#ifndef GF2X_HAVE_SSE2_SUPPORT
#error "This code needs sse-2 support"
#endif

#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ == 3 && (__GNUC_PATCHLEVEL__ == 0 || __GNUC_PATCHLEVEL__ == 1)
#warning "Your GCC version is buggy. Binary fields may fail randomly"
/* Gcc bug reports 37101 and 37340 -- the only convenient fix is to
 * upgrade to 4.3.2 */
#endif

/* This has been edited without testing */

GF2X_STORAGE_CLASS_mul4
void gf2x_mul4(unsigned long *t, unsigned long const *s1,
               unsigned long const *s2)
{
#define SHL(x, r) _mm_slli_epi64((x), (r))
#define SHR(x, r) _mm_srli_epi64((x), (r))
#define SHLD(x, r) _mm_slli_si128((x), (r) >> 3)
#define SHRD(x, r) _mm_srli_si128((x), (r) >> 3)
#define PZERO    _mm_setzero_si128()
#define PADD(x, y)      _mm_add_epi64((x), (y))
#define PSUB(x, y)      _mm_sub_epi64((x), (y))
#define PNEG(x)      PSUB(PZERO, (x))
#define PXOR(lop, rop) _mm_xor_si128((lop), (rop))
#define XOREQ(lop, rop) lop = _mm_xor_si128((lop), (rop))
#define PAND(lop, rop) _mm_and_si128((lop), (rop))

    __m128i u;
    __m128i t0;
    __m128i t1;
    __m128i t2;

    __m128i g[16];
    /* sequence update walk */
    g[0] = PZERO;
    g[1] = _gf2x_mm_setr_epi32(s2[0], s2[1], s2[2], s2[3]);
    g[2] = SHL(g[1], 1);
    g[3] = PXOR(g[2], g[1]);
    g[4] = SHL(g[2], 1);
    g[5] = PXOR(g[4], g[1]);
    g[6] = SHL(g[3], 1);
    g[7] = PXOR(g[6], g[1]);
    g[8] = SHL(g[4], 1);
    g[9] = PXOR(g[8], g[1]);
    g[10] = SHL(g[5], 1);
    g[11] = PXOR(g[10], g[1]);
    g[12] = SHL(g[6], 1);
    g[13] = PXOR(g[12], g[1]);
    g[14] = SHL(g[7], 1);
    g[15] = PXOR(g[14], g[1]);
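
    /* At this point g[i] holds the 4-bit window table i(x) * s2(x),
     * i = 0..15, as a 128-bit value.  Since SHL shifts the two 64-bit
     * halves of the register independently, up to 3 top bits of each
     * half of s2 are dropped for the shifted entries; the repair steps
     * after the two accumulation rounds below add those missing
     * contributions back. */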

    /* round 0 */
    u = g[s1[0]       & 15];
    t0  = u;
    u = g[s1[0] >>  4 & 15];
    XOREQ(t0, SHL(u,  4)); t1  = SHR(u, 60);
    u = g[s1[0] >>  8 & 15];
    XOREQ(t0, SHL(u,  8)); XOREQ(t1, SHR(u, 56));
    u = g[s1[0] >> 12 & 15];
    XOREQ(t0, SHL(u, 12)); XOREQ(t1, SHR(u, 52));
    u = g[s1[0] >> 16 & 15];
    XOREQ(t0, SHL(u, 16)); XOREQ(t1, SHR(u, 48));
    u = g[s1[0] >> 20 & 15];
    XOREQ(t0, SHL(u, 20)); XOREQ(t1, SHR(u, 44));
    u = g[s1[0] >> 24 & 15];
    XOREQ(t0, SHL(u, 24)); XOREQ(t1, SHR(u, 40));
    u = g[s1[0] >> 28 & 15];
    XOREQ(t0, SHL(u, 28)); XOREQ(t1, SHR(u, 36));
    u = g[s1[1]       & 15];
    XOREQ(t0, SHL(u, 32)); XOREQ(t1, SHR(u, 32));
    u = g[s1[1] >>  4 & 15];
    XOREQ(t0, SHL(u, 36)); XOREQ(t1, SHR(u, 28));
    u = g[s1[1] >>  8 & 15];
    XOREQ(t0, SHL(u, 40)); XOREQ(t1, SHR(u, 24));
    u = g[s1[1] >> 12 & 15];
    XOREQ(t0, SHL(u, 44)); XOREQ(t1, SHR(u, 20));
    u = g[s1[1] >> 16 & 15];
    XOREQ(t0, SHL(u, 48)); XOREQ(t1, SHR(u, 16));
    u = g[s1[1] >> 20 & 15];
    XOREQ(t0, SHL(u, 52)); XOREQ(t1, SHR(u, 12));
    u = g[s1[1] >> 24 & 15];
    XOREQ(t0, SHL(u, 56)); XOREQ(t1, SHR(u,  8));
    u = g[s1[1] >> 28 & 15];
    XOREQ(t0, SHL(u, 60)); XOREQ(t1, SHR(u,  4));

    /* round 1 */
    u = g[s1[2]       & 15];
    XOREQ(t1, u);
    u = g[s1[2] >>  4 & 15];
    XOREQ(t1, SHL(u,  4)); t2  = SHR(u, 60);
    u = g[s1[2] >>  8 & 15];
    XOREQ(t1, SHL(u,  8)); XOREQ(t2, SHR(u, 56));
    u = g[s1[2] >> 12 & 15];
    XOREQ(t1, SHL(u, 12)); XOREQ(t2, SHR(u, 52));
    u = g[s1[2] >> 16 & 15];
    XOREQ(t1, SHL(u, 16)); XOREQ(t2, SHR(u, 48));
    u = g[s1[2] >> 20 & 15];
    XOREQ(t1, SHL(u, 20)); XOREQ(t2, SHR(u, 44));
    u = g[s1[2] >> 24 & 15];
    XOREQ(t1, SHL(u, 24)); XOREQ(t2, SHR(u, 40));
    u = g[s1[2] >> 28 & 15];
    XOREQ(t1, SHL(u, 28)); XOREQ(t2, SHR(u, 36));
    u = g[s1[3]       & 15];
    XOREQ(t1, SHL(u, 32)); XOREQ(t2, SHR(u, 32));
    u = g[s1[3] >>  4 & 15];
    XOREQ(t1, SHL(u, 36)); XOREQ(t2, SHR(u, 28));
    u = g[s1[3] >>  8 & 15];
    XOREQ(t1, SHL(u, 40)); XOREQ(t2, SHR(u, 24));
    u = g[s1[3] >> 12 & 15];
    XOREQ(t1, SHL(u, 44)); XOREQ(t2, SHR(u, 20));
    u = g[s1[3] >> 16 & 15];
    XOREQ(t1, SHL(u, 48)); XOREQ(t2, SHR(u, 16));
    u = g[s1[3] >> 20 & 15];
    XOREQ(t1, SHL(u, 52)); XOREQ(t2, SHR(u, 12));
    u = g[s1[3] >> 24 & 15];
    XOREQ(t1, SHL(u, 56)); XOREQ(t2, SHR(u,  8));
    u = g[s1[3] >> 28 & 15];
    XOREQ(t1, SHL(u, 60)); XOREQ(t2, SHR(u,  4));
    /* end */

    /* repair steps */
    /* repair section 200711-200803 */
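    /* The table entries g[2], g[4], g[8] (and those built from them) were
     * produced with per-64-bit-lane shifts, so the top 1 to 3 bits of each
     * 64-bit half of s2 never made it into g[].  The steps below re-inject
     * those contributions: w is an all-ones/all-zeros lane mask taken from
     * a high bit of s2 (via g[1], g[2], g[4]), and the 0x77777777 masks
     * pick out, within each 4-bit window of s1, the bits that selected a
     * shifted table entry. */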
    __m128i v1 = SHR(_gf2x_mm_setr_epi32(s1[0], s1[1], s1[0], s1[1]), 1);
    __m128i v2 = SHR(_gf2x_mm_setr_epi32(s1[2], s1[3], s1[2], s1[3]), 1);
    __m128i m = _gf2x_mm_set1_epi32_c(0x77777777);
    __m128i w = PNEG(SHR(g[1],63));
    v1 = PAND(v1, m);
    XOREQ(t1, PAND(v1, w));
    v2 = PAND(v2, m);
    XOREQ(t2, PAND(v2, w));
    w = PNEG(SHR(g[2],63));
    v1 = PAND(SHR(v1, 1), m);
    XOREQ(t1, PAND(v1, w));
    v2 = PAND(SHR(v2, 1), m);
    XOREQ(t2, PAND(v2, w));
    w = PNEG(SHR(g[4],63));
    v1 = PAND(SHR(v1, 1), m);
    XOREQ(t1, PAND(v1, w));
    v2 = PAND(SHR(v2, 1), m);
    XOREQ(t2, PAND(v2, w));

    /* store result */
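    /* t0, t1 and t2 hold the product at offsets of 0, 64 and 128 bits
     * respectively; the unpack intrinsics split t1 across the two 128-bit
     * stores so that t[0..7] ends up as t0 ^ (t1 << 64) ^ (t2 << 128). */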
    _mm_storeu_si128((__m128i *)(t),  PXOR(t0, _mm_unpacklo_epi64(PZERO, t1)));
    _mm_storeu_si128((__m128i *)(t+4),PXOR(t2, _mm_unpackhi_epi64(t1, PZERO)));
#undef PAND
#undef XOREQ
#undef PXOR
#undef PZERO
#undef PNEG
#undef PADD
#undef PSUB
#undef SHL
#undef SHR
#undef SHLD
#undef SHRD
}
#endif  /* GF2X_MUL4_H_ */