1 /* Copyright (c) 2014 Cryptography Research, Inc.
2 * Released under the MIT License. See LICENSE.txt for license information.
3 */
4
5 #include "f_field.h"
6
void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
    /*
     * Field multiplication c = a * b over GF(p), with elements stored as
     * 8 limbs in radix 2^56 (see the 56-bit mask below).  The final wrap
     * step feeds the top carry into limbs 0 AND 4, which is the reduction
     * rule for the Goldilocks prime p = 2^448 - 2^224 - 1 (NOTE(review):
     * prime inferred from the wrap pattern; confirm against f_field.h).
     *
     * widemul/mac/msb are helpers from f_field.h; from their use here they
     * appear to be a 64x64->128 widening multiply, multiply-accumulate,
     * and multiply-subtract respectively -- TODO confirm in f_field.h.
     */
    const uint64_t *a = as->limb, *b = bs->limb;
    uint64_t *c = cs->limb;

    __uint128_t accum0 = 0, accum1 = 0, accum2;
    uint64_t mask = (1ull<<56) - 1;

    /* Folded halves for the Karatsuba-style recombination:
     * aa[i] = a[i] + a[i+4], bb[i] = b[i] + b[i+4],
     * and bbb[i] = bb[i] + b[i+4] = b[i] + 2*b[i+4]. */
    uint64_t aa[4] VECTOR_ALIGNED, bb[4] VECTOR_ALIGNED, bbb[4] VECTOR_ALIGNED;

    /* For some reason clang doesn't vectorize this without prompting? */
    unsigned int i;
    for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
        ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
        ((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i];
        ((uint64xn_t*)bbb)[i] = ((const uint64xn_t*)bb)[i] + ((const uint64xn_t*)(&b[4]))[i];
    }
    /* Scalar equivalent of the vectorized loop above:
    for (int i=0; i<4; i++) {
        aa[i] = a[i] + a[i+4];
        bb[i] = b[i] + b[i+4];
    }
    */

    /* Column for output limbs 3 (accum1) and 7 (accum0).  accum2 collects
     * the low-half partial sum (a_lo * b_lo terms), which is shared by
     * both recombined accumulators below. */
    accum2  = widemul(&a[0],&b[3]);
    accum0  = widemul(&aa[0],&bb[3]);
    accum1  = widemul(&a[4],&b[7]);

    mac(&accum2, &a[1], &b[2]);
    mac(&accum0, &aa[1], &bb[2]);
    mac(&accum1, &a[5], &b[6]);

    mac(&accum2, &a[2], &b[1]);
    mac(&accum0, &aa[2], &bb[1]);
    mac(&accum1, &a[6], &b[5]);

    mac(&accum2, &a[3], &b[0]);
    mac(&accum0, &aa[3], &bb[0]);
    mac(&accum1, &a[7], &b[4]);

    /* Recombine: subtract/add the shared low-half sum. */
    accum0 -= accum2;
    accum1 += accum2;

    c[3] = ((uint64_t)(accum1)) & mask;
    c[7] = ((uint64_t)(accum0)) & mask;

    /* Carry the high parts forward into the next column pair. */
    accum0 >>= 56;
    accum1 >>= 56;

    /* Column for output limbs 0 (accum0) and 4 (accum1). */
    mac(&accum0, &aa[1],&bb[3]);
    mac(&accum1, &a[5], &b[7]);
    mac(&accum0, &aa[2], &bb[2]);
    mac(&accum1, &a[6], &b[6]);
    mac(&accum0, &aa[3], &bb[1]);
    accum1 += accum0;

    accum2 = widemul(&a[0],&b[0]);
    accum1 -= accum2;
    accum0 += accum2;

    /* msb() removes a_lo*b_lo cross terms that were double-counted via
     * the folded aa*bb products accumulated above. */
    msb(&accum0, &a[1], &b[3]);
    msb(&accum0, &a[2], &b[2]);
    mac(&accum1, &a[7], &b[5]);
    msb(&accum0, &a[3], &b[1]);
    mac(&accum1, &aa[0], &bb[0]);
    mac(&accum0, &a[4], &b[4]);

    c[0] = ((uint64_t)(accum0)) & mask;
    c[4] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Column for output limbs 1 (accum0) and 5 (accum1); bbb supplies the
     * b_lo + 2*b_hi factor needed by the folded high-column terms. */
    accum2 = widemul(&a[2],&b[7]);
    mac(&accum0, &a[6], &bb[3]);
    mac(&accum1, &aa[2], &bbb[3]);

    mac(&accum2, &a[3], &b[6]);
    mac(&accum0, &a[7], &bb[2]);
    mac(&accum1, &aa[3], &bbb[2]);

    mac(&accum2, &a[0],&b[1]);
    mac(&accum1, &aa[0], &bb[1]);
    mac(&accum0, &a[4], &b[5]);

    mac(&accum2, &a[1], &b[0]);
    mac(&accum1, &aa[1], &bb[0]);
    mac(&accum0, &a[5], &b[4]);

    accum1 -= accum2;
    accum0 += accum2;

    c[1] = ((uint64_t)(accum0)) & mask;
    c[5] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Column for output limbs 2 (accum0) and 6 (accum1). */
    accum2 = widemul(&a[3],&b[7]);
    mac(&accum0, &a[7], &bb[3]);
    mac(&accum1, &aa[3], &bbb[3]);

    mac(&accum2, &a[0],&b[2]);
    mac(&accum1, &aa[0], &bb[2]);
    mac(&accum0, &a[4], &b[6]);

    mac(&accum2, &a[1], &b[1]);
    mac(&accum1, &aa[1], &bb[1]);
    mac(&accum0, &a[5], &b[5]);

    mac(&accum2, &a[2], &b[0]);
    mac(&accum1, &aa[2], &bb[0]);
    mac(&accum0, &a[6], &b[4]);

    accum1 -= accum2;
    accum0 += accum2;

    c[2] = ((uint64_t)(accum0)) & mask;
    c[6] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Fold the carries out of limbs 2/6 into limbs 3/7. */
    accum0 += c[3];
    accum1 += c[7];
    c[3] = ((uint64_t)(accum0)) & mask;
    c[7] = ((uint64_t)(accum1)) & mask;

    /* we could almost stop here, but it wouldn't be stable, so... */

    accum0 >>= 56;
    accum1 >>= 56;
    /* Reduction wrap: the carry out of limb 7 (accum1) re-enters at limbs
     * 0 and 4; the carry out of limb 3 (accum0) enters limb 4.  Result is
     * left only partially carried (limbs may slightly exceed 56 bits),
     * which callers of this representation evidently tolerate. */
    c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
    c[0] += ((uint64_t)(accum1));
}
141
void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
    /*
     * Multiply a field element by a small (32-bit) unsigned scalar:
     * c = a * b, partially reduced.  The two 4-limb halves are processed
     * in lockstep; the final carries wrap according to the Goldilocks
     * prime's reduction rule (top carry feeds limbs 0 and 4).
     */
    const uint64_t *src = as->limb;
    uint64_t *dst = cs->limb;

    const uint64_t mask = (1ull<<56) - 1;
    __uint128_t lo = 0, hi = 0;

    /* Schoolbook scalar multiply with on-the-fly carry propagation,
     * low half (limbs 0..3) and high half (limbs 4..7) side by side. */
    unsigned int k;
    for (k = 0; k < 4; k++) {
        mac_rm(&lo, b, &src[k]);
        mac_rm(&hi, b, &src[k+4]);
        dst[k]   = ((uint64_t)lo) & mask;  lo >>= 56;
        dst[k+4] = ((uint64_t)hi) & mask;  hi >>= 56;
    }

    /* Carry out of limb 3 plus the wrapped carry out of limb 7 both land
     * in limb 4; propagate one step into limb 5. */
    lo += hi + dst[4];
    dst[4] = ((uint64_t)lo) & mask;
    dst[5] += (uint64_t)(lo >> 56);

    /* The carry out of limb 7 also wraps into limb 0; one step into 1. */
    hi += dst[0];
    dst[0] = ((uint64_t)hi) & mask;
    dst[1] += (uint64_t)(hi >> 56);
}
181
void gf_sqr (gf_s *__restrict__ cs, const gf as) {
    /*
     * Field squaring c = a^2: the same folded-Karatsuba column structure
     * as gf_mul, but exploiting symmetry (a[i]*a[j] terms are doubled via
     * mac2/msb2/widemul2 instead of being computed twice).  Limbs are
     * radix 2^56; the final wrap into limbs 0 and 4 matches the
     * Goldilocks prime p = 2^448 - 2^224 - 1 (NOTE(review): inferred from
     * the wrap pattern; confirm against f_field.h).
     */
    const uint64_t *a = as->limb;
    uint64_t *c = cs->limb;

    __uint128_t accum0 = 0, accum1 = 0, accum2;
    uint64_t mask = (1ull<<56) - 1;

    /* Folded half: aa[i] = a[i] + a[i+4]. */
    uint64_t aa[4] VECTOR_ALIGNED;

    /* For some reason clang doesn't vectorize this without prompting? */
    unsigned int i;
    for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
        ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
    }

    /* Column for output limbs 3 (accum1) and 7 (accum0).  Only half of
     * the symmetric cross terms are accumulated; the <<1 when storing
     * c[3]/c[7] below supplies the doubling. */
    accum2 = widemul(&a[0],&a[3]);
    accum0 = widemul(&aa[0],&aa[3]);
    accum1 = widemul(&a[4],&a[7]);

    mac(&accum2, &a[1], &a[2]);
    mac(&accum0, &aa[1], &aa[2]);
    mac(&accum1, &a[5], &a[6]);

    /* Recombine with the shared low-half partial sum. */
    accum0 -= accum2;
    accum1 += accum2;

    c[3] = ((uint64_t)(accum1))<<1 & mask;
    c[7] = ((uint64_t)(accum0))<<1 & mask;

    /* Shift by 55, not 56: the stored limbs were doubled (<<1), so the
     * carried-out portion must be doubled too. */
    accum0 >>= 55;
    accum1 >>= 55;

    /* Column for output limbs 0 (accum0) and 4 (accum1); mac2/msb2
     * accumulate pre-doubled cross terms, plain mac the square terms. */
    mac2(&accum0, &aa[1],&aa[3]);
    mac2(&accum1, &a[5], &a[7]);
    mac(&accum0, &aa[2], &aa[2]);
    accum1 += accum0;

    msb2(&accum0, &a[1], &a[3]);
    mac(&accum1, &a[6], &a[6]);

    accum2 = widemul(&a[0],&a[0]);
    accum1 -= accum2;
    accum0 += accum2;

    msb(&accum0, &a[2], &a[2]);
    mac(&accum1, &aa[0], &aa[0]);
    mac(&accum0, &a[4], &a[4]);

    c[0] = ((uint64_t)(accum0)) & mask;
    c[4] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Column for output limbs 1 (accum0) and 5 (accum1). */
    accum2 = widemul2(&aa[2],&aa[3]);
    msb2(&accum0, &a[2], &a[3]);
    mac2(&accum1, &a[6], &a[7]);

    accum1 += accum2;
    accum0 += accum2;

    accum2 = widemul2(&a[0],&a[1]);
    mac2(&accum1, &aa[0], &aa[1]);
    mac2(&accum0, &a[4], &a[5]);

    accum1 -= accum2;
    accum0 += accum2;

    c[1] = ((uint64_t)(accum0)) & mask;
    c[5] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Column for output limbs 2 (accum0) and 6 (accum1). */
    accum2 = widemul(&aa[3],&aa[3]);
    msb(&accum0, &a[3], &a[3]);
    mac(&accum1, &a[7], &a[7]);

    accum1 += accum2;
    accum0 += accum2;

    accum2 = widemul2(&a[0],&a[2]);
    mac2(&accum1, &aa[0], &aa[2]);
    mac2(&accum0, &a[4], &a[6]);

    mac(&accum2, &a[1], &a[1]);
    mac(&accum1, &aa[1], &aa[1]);
    mac(&accum0, &a[5], &a[5]);

    accum1 -= accum2;
    accum0 += accum2;

    c[2] = ((uint64_t)(accum0)) & mask;
    c[6] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Fold the carries out of limbs 2/6 into limbs 3/7. */
    accum0 += c[3];
    accum1 += c[7];
    c[3] = ((uint64_t)(accum0)) & mask;
    c[7] = ((uint64_t)(accum1)) & mask;

    /* we could almost stop here, but it wouldn't be stable, so... */

    accum0 >>= 56;
    accum1 >>= 56;
    /* Reduction wrap: carry out of limb 7 (accum1) re-enters at limbs 0
     * and 4; carry out of limb 3 (accum0) enters limb 4.  Result is only
     * partially carried, same convention as gf_mul. */
    c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
    c[0] += ((uint64_t)(accum1));
}
292