1 /* This file is part of the gf2x library.
2
3 Copyright 2007, 2008, 2009, 2010, 2012, 2013, 2015
4 Richard Brent, Pierrick Gaudry, Emmanuel Thome', Paul Zimmermann
5
6 This program is free software; you can redistribute it and/or modify it
7 under the terms of either:
8 - If the archive contains a file named toom-gpl.c (not a trivial
9 placeholder), the GNU General Public License as published by the Free
10 Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
12 - If the archive contains a file named toom-gpl.c which is a trivial
13 placeholder, the GNU Lesser General Public License as published by
14 the Free Software Foundation; either version 2.1 of the License, or
15 (at your option) any later version.
16
17 This program is distributed in the hope that it will be useful, but WITHOUT
18 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 FITNESS FOR A PARTICULAR PURPOSE. See the license text for more details.
20
21 You should have received a copy of the GNU General Public License as
22 well as the GNU Lesser General Public License along with this program;
23 see the files COPYING and COPYING.LIB. If not, write to the Free
24 Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 02110-1301, USA.
26 */
27
28 /* Implements 128x128 -> 256 bit product using SSE2 instructions. */
29
30 #ifndef GF2X_MUL2_H_
31 #define GF2X_MUL2_H_
32
33 #include "gf2x.h"
34 /* All gf2x source files for lowlevel functions must include gf2x-small.h
35 * This is mandatory for the tuning mechanism. */
36 #include "gf2x/gf2x-small.h"
37
38 #if GF2X_WORDSIZE != 64
39 #error "This code is for 64-bit only"
40 #endif
41
42 #ifndef GF2X_HAVE_SSE2_SUPPORT
43 #error "This code needs sse-2 support"
44 #endif
45
46 #if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ == 3 && (__GNUC_PATCHLEVEL__ == 0 || __GNUC_PATCHLEVEL__ == 1)
47 #warning "Your GCC version is buggy. Binary fields may fail randomly"
48 /* Gcc bug reports 37101 and 37340 -- the only convenient fix is to
49 * upgrade to 4.3.2 */
50 #endif
51
52 /* This code has been modified in comparison to the previously existing
53 * one, in order to use Intel intrinsics exclusively, and not rely on gcc
54 * syntax.
55 *
56 * The Intel C++ Intrinsics Reference is a good read. However I can't
57 * find the official link right now. An old version can be found in the wild:
58 *
59 * http://www.info.univ-angers.fr/~richer/ens/l3info/ao/intel_intrinsics.pdf
60 *
61 * I don't even know if this document evolved. Probably not.
62 *
63 * It says: ``Document Number: 312482-003US''
64 */
65 GF2X_STORAGE_CLASS_mul2
gf2x_mul2(unsigned long * t,unsigned long const * s1,unsigned long const * s2)66 void gf2x_mul2(unsigned long * t, unsigned long const * s1,
67 unsigned long const * s2)
68 {
69 #define SHL(x, r) _mm_slli_epi64((x), (r))
70 #define SHR(x, r) _mm_srli_epi64((x), (r))
71 #define SHLD(x, r) _mm_slli_si128((x), (r) >> 3)
72 #define SHRD(x, r) _mm_srli_si128((x), (r) >> 3)
73 #define XOREQ(lop, rop) lop = _mm_xor_si128((lop), (rop))
74 #define PXOR(lop, rop) _mm_xor_si128((lop), (rop))
75 #define PAND(lop, rop) _mm_and_si128((lop), (rop))
76 #define PZERO _mm_setzero_si128()
77 #define PSUB(x, y) _mm_sub_epi64((x), (y))
78 #define PNEG(x) PSUB(PZERO, (x))
79
80 __m128i u;
81 __m128i t0;
82 __m128i t1;
83 __m128i t2;
84
85 __m128i g[16];
86 __m128i w;
87 // __m128i m = _mm_set1_epi32(0xeeeeeeee);
88 __m128i m = _gf2x_mm_set1_epi64_c(0xeeeeeeeeeeeeeeee);
89 /* sequence update walk */
90 __m128i b0 = _mm_loadu_si128((__m128i*) s2);
91 g[ 0] = _mm_setzero_si128();
92 g[ 1] = b0;
93 __m128i v1 = _mm_loadu_si128((__m128i*) s1);
94 w = PNEG(SHR(b0,63));
95 __m128i v2 = _mm_unpackhi_epi64(v1, v1);
96 v1 = _mm_unpacklo_epi64(v1, v1);
97 v1 = SHR(PAND(v1, m), 1); t1 = PAND(v1, w);
98 g[ 2] = SHL(b0, 1); g[ 3] = PXOR(g[ 2], b0);
99 v2 = SHR(PAND(v2, m), 1); t2 = PAND(v2, w);
100 g[ 4] = SHL(g[ 2], 1); g[ 5] = PXOR(g[ 4], b0);
101 w = PNEG(SHR(g[ 2],63));
102 g[ 6] = SHL(g[ 3], 1); g[ 7] = PXOR(g[ 6], b0);
103 v1 = SHR(PAND(v1, m), 1); XOREQ(t1, PAND(v1, w));
104 g[ 8] = SHL(g[ 4], 1); g[ 9] = PXOR(g[ 8], b0);
105 v2 = SHR(PAND(v2, m), 1); XOREQ(t2, PAND(v2, w));
106 g[10] = SHL(g[ 5], 1); g[11] = PXOR(g[10], b0);
107 w = PNEG(SHR(g[4],63));
108 g[12] = SHL(g[ 6], 1); g[13] = PXOR(g[12], b0);
109 v1 = SHR(PAND(v1, m), 1); XOREQ(t1, PAND(v1, w));
110 g[14] = SHL(g[ 7], 1); g[15] = PXOR(g[14], b0);
111 v2 = SHR(PAND(v2, m), 1); XOREQ(t2, PAND(v2, w));
112
113
114 /* round 0 */
115 u = g[s1[0] & 15]; t0 = u;
116 u = g[s1[0] >> 4 & 15]; XOREQ(t0, SHL(u, 4)); XOREQ(t1, SHR(u, 60));
117 u = g[s1[0] >> 8 & 15]; XOREQ(t0, SHL(u, 8)); XOREQ(t1, SHR(u, 56));
118 u = g[s1[0] >> 12 & 15]; XOREQ(t0, SHL(u, 12)); XOREQ(t1, SHR(u, 52));
119 u = g[s1[0] >> 16 & 15]; XOREQ(t0, SHL(u, 16)); XOREQ(t1, SHR(u, 48));
120 u = g[s1[0] >> 20 & 15]; XOREQ(t0, SHL(u, 20)); XOREQ(t1, SHR(u, 44));
121 u = g[s1[0] >> 24 & 15]; XOREQ(t0, SHL(u, 24)); XOREQ(t1, SHR(u, 40));
122 u = g[s1[0] >> 28 & 15]; XOREQ(t0, SHL(u, 28)); XOREQ(t1, SHR(u, 36));
123 u = g[s1[0] >> 32 & 15]; XOREQ(t0, SHL(u, 32)); XOREQ(t1, SHR(u, 32));
124 u = g[s1[0] >> 36 & 15]; XOREQ(t0, SHL(u, 36)); XOREQ(t1, SHR(u, 28));
125 u = g[s1[0] >> 40 & 15]; XOREQ(t0, SHL(u, 40)); XOREQ(t1, SHR(u, 24));
126 u = g[s1[0] >> 44 & 15]; XOREQ(t0, SHL(u, 44)); XOREQ(t1, SHR(u, 20));
127 u = g[s1[0] >> 48 & 15]; XOREQ(t0, SHL(u, 48)); XOREQ(t1, SHR(u, 16));
128 u = g[s1[0] >> 52 & 15]; XOREQ(t0, SHL(u, 52)); XOREQ(t1, SHR(u, 12));
129 u = g[s1[0] >> 56 & 15]; XOREQ(t0, SHL(u, 56)); XOREQ(t1, SHR(u, 8));
130 u = g[s1[0] >> 60 & 15]; XOREQ(t0, SHL(u, 60)); XOREQ(t1, SHR(u, 4));
131
132 /* round 1 */
133 u = g[s1[1] & 15]; XOREQ(t1, u);
134 u = g[s1[1] >> 4 & 15]; XOREQ(t1, SHL(u, 4)); XOREQ(t2, SHR(u, 60));
135 u = g[s1[1] >> 8 & 15]; XOREQ(t1, SHL(u, 8)); XOREQ(t2, SHR(u, 56));
136 u = g[s1[1] >> 12 & 15]; XOREQ(t1, SHL(u, 12)); XOREQ(t2, SHR(u, 52));
137 u = g[s1[1] >> 16 & 15]; XOREQ(t1, SHL(u, 16)); XOREQ(t2, SHR(u, 48));
138 u = g[s1[1] >> 20 & 15]; XOREQ(t1, SHL(u, 20)); XOREQ(t2, SHR(u, 44));
139 u = g[s1[1] >> 24 & 15]; XOREQ(t1, SHL(u, 24)); XOREQ(t2, SHR(u, 40));
140 u = g[s1[1] >> 28 & 15]; XOREQ(t1, SHL(u, 28)); XOREQ(t2, SHR(u, 36));
141 u = g[s1[1] >> 32 & 15]; XOREQ(t1, SHL(u, 32)); XOREQ(t2, SHR(u, 32));
142 u = g[s1[1] >> 36 & 15]; XOREQ(t1, SHL(u, 36)); XOREQ(t2, SHR(u, 28));
143 u = g[s1[1] >> 40 & 15]; XOREQ(t1, SHL(u, 40)); XOREQ(t2, SHR(u, 24));
144 u = g[s1[1] >> 44 & 15]; XOREQ(t1, SHL(u, 44)); XOREQ(t2, SHR(u, 20));
145 u = g[s1[1] >> 48 & 15]; XOREQ(t1, SHL(u, 48)); XOREQ(t2, SHR(u, 16));
146 u = g[s1[1] >> 52 & 15]; XOREQ(t1, SHL(u, 52)); XOREQ(t2, SHR(u, 12));
147 u = g[s1[1] >> 56 & 15]; XOREQ(t1, SHL(u, 56)); XOREQ(t2, SHR(u, 8));
148 u = g[s1[1] >> 60 & 15]; XOREQ(t1, SHL(u, 60)); XOREQ(t2, SHR(u, 4));
149 /* end */
150
151 /* store result */
152 _mm_storeu_si128((__m128i*)t, PXOR(t0, SHLD(t1, 64)));
153 _mm_storeu_si128((__m128i*)(t+2), PXOR(t2, SHRD(t1, 64)));
154
155 #undef PNEG
156 #undef PSUB
157 #undef PZERO
158 #undef SHL
159 #undef SHR
160 #undef SHLD
161 #undef SHRD
162 #undef XOREQ
163 #undef PXOR
164 #undef PAND
165 }
166
167 #endif /* GF2X_MUL2_H_ */
168