1 /* Copyright (c) 2014 Cryptography Research, Inc.
2  * Released under the MIT License.  See LICENSE.txt for license information.
3  */
4 
5 #include "f_field.h"
6 
/* Multiply two field elements: cs = as * bs.
 *
 * A field element is 8 limbs in radix 2^56 (8*56 = 448 bits; mask
 * below is 2^56 - 1).  The structure -- Karatsuba half-sums
 * aa = a_lo + a_hi, and final carries that wrap from the top into
 * limbs 4 and 0 -- is the multiplier for the golden-ratio prime
 * p = 2^448 - 2^224 - 1 (Ed448-Goldilocks).  NOTE(review): p is
 * inferred from the reduction pattern; confirm against f_field.h.
 *
 * widemul/mac/msb are macros from f_field.h (not shown); from usage
 * they compute, add, and subtract 64x64 -> 128-bit products in a
 * __uint128_t accumulator.
 *
 * The result is left partially reduced: all limbs are masked to 56
 * bits except c[0] and c[4], which absorb a final small carry and may
 * slightly exceed 2^56 - 1.
 */
void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
    const uint64_t *a = as->limb, *b = bs->limb;
    uint64_t *c = cs->limb;

    __uint128_t accum0 = 0, accum1 = 0, accum2;
    uint64_t mask = (1ull<<56) - 1;  /* one radix-2^56 limb */

    uint64_t aa[4] VECTOR_ALIGNED, bb[4] VECTOR_ALIGNED, bbb[4] VECTOR_ALIGNED;

    /* Karatsuba half-sums:
     *   aa[i]  = a[i] + a[i+4]
     *   bb[i]  = b[i] + b[i+4]
     *   bbb[i] = bb[i] + b[i+4]   (= b[i] + 2*b[i+4])
     * For some reason clang doesn't vectorize this without prompting? */
    unsigned int i;
    for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
        ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
        ((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i];
        ((uint64xn_t*)bbb)[i] = ((const uint64xn_t*)bb)[i] + ((const uint64xn_t*)(&b[4]))[i];
    }

    /* Product columns 3 (low half) and 7 (high half).  Three partial
     * sums run interleaved for instruction-level parallelism:
     * accum2 over a_lo*b_lo, accum0 over aa*bb, accum1 over a_hi*b_hi. */
    accum2  = widemul(&a[0],&b[3]);
    accum0  = widemul(&aa[0],&bb[3]);
    accum1  = widemul(&a[4],&b[7]);

    mac(&accum2, &a[1], &b[2]);
    mac(&accum0, &aa[1], &bb[2]);
    mac(&accum1, &a[5], &b[6]);

    mac(&accum2, &a[2], &b[1]);
    mac(&accum0, &aa[2], &bb[1]);
    mac(&accum1, &a[6], &b[5]);

    mac(&accum2, &a[3], &b[0]);
    mac(&accum0, &aa[3], &bb[0]);
    mac(&accum1, &a[7], &b[4]);

    /* Karatsuba recombination: accum1 becomes column 3, accum0 column 7. */
    accum0 -= accum2;
    accum1 += accum2;

    c[3] = ((uint64_t)(accum1)) & mask;
    c[7] = ((uint64_t)(accum0)) & mask;

    /* Carry the excess above 56 bits into the next columns. */
    accum0 >>= 56;
    accum1 >>= 56;

    /* Product columns 0 and 4. */
    mac(&accum0, &aa[1],&bb[3]);
    mac(&accum1, &a[5], &b[7]);
    mac(&accum0, &aa[2], &bb[2]);
    mac(&accum1, &a[6], &b[6]);
    mac(&accum0, &aa[3], &bb[1]);
    accum1 += accum0;

    accum2 = widemul(&a[0],&b[0]);
    accum1 -= accum2;
    accum0 += accum2;

    msb(&accum0, &a[1], &b[3]);
    msb(&accum0, &a[2], &b[2]);
    mac(&accum1, &a[7], &b[5]);
    msb(&accum0, &a[3], &b[1]);
    mac(&accum1, &aa[0], &bb[0]);
    mac(&accum0, &a[4], &b[4]);

    c[0] = ((uint64_t)(accum0)) & mask;
    c[4] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Product columns 1 and 5. */
    accum2  = widemul(&a[2],&b[7]);
    mac(&accum0, &a[6], &bb[3]);
    mac(&accum1, &aa[2], &bbb[3]);

    mac(&accum2, &a[3], &b[6]);
    mac(&accum0, &a[7], &bb[2]);
    mac(&accum1, &aa[3], &bbb[2]);

    mac(&accum2, &a[0],&b[1]);
    mac(&accum1, &aa[0], &bb[1]);
    mac(&accum0, &a[4], &b[5]);

    mac(&accum2, &a[1], &b[0]);
    mac(&accum1, &aa[1], &bb[0]);
    mac(&accum0, &a[5], &b[4]);

    accum1 -= accum2;
    accum0 += accum2;

    c[1] = ((uint64_t)(accum0)) & mask;
    c[5] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Product columns 2 and 6. */
    accum2  = widemul(&a[3],&b[7]);
    mac(&accum0, &a[7], &bb[3]);
    mac(&accum1, &aa[3], &bbb[3]);

    mac(&accum2, &a[0],&b[2]);
    mac(&accum1, &aa[0], &bb[2]);
    mac(&accum0, &a[4], &b[6]);

    mac(&accum2, &a[1], &b[1]);
    mac(&accum1, &aa[1], &bb[1]);
    mac(&accum0, &a[5], &b[5]);

    mac(&accum2, &a[2], &b[0]);
    mac(&accum1, &aa[2], &bb[0]);
    mac(&accum0, &a[6], &b[4]);

    accum1 -= accum2;
    accum0 += accum2;

    c[2] = ((uint64_t)(accum0)) & mask;
    c[6] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Add the carries out of columns 2 and 6 into limbs 3 and 7. */
    accum0 += c[3];
    accum1 += c[7];
    c[3] = ((uint64_t)(accum0)) & mask;
    c[7] = ((uint64_t)(accum1)) & mask;

    /* we could almost stop here, but it wouldn't be stable, so... */

    /* Final fold: the carry out of limb 3 goes into limb 4; the carry
     * out of limb 7 wraps around (2^448 == 2^224 + 1 mod p) into both
     * limb 4 and limb 0. */
    accum0 >>= 56;
    accum1 >>= 56;
    c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
    c[0] += ((uint64_t)(accum1));
}
141 
/* Multiply a field element by a 32-bit scalar: cs = as * b.
 *
 * A field element is 8 limbs in radix 2^56.  Two independent carry
 * chains (limbs 0..3 and limbs 4..7) are walked in lockstep, then the
 * leftover carries are folded back: the carry out of limb 3 feeds
 * limb 4, and the carry out of limb 7 wraps into both limb 4 and
 * limb 0 -- the same golden-ratio reduction used by gf_mul in this
 * file.  widemul_rm/mac_rm (compute / accumulate a 64x64 -> 128-bit
 * product) come from f_field.h.
 *
 * Result is partially reduced: limbs masked to 56 bits except c[1]
 * and c[5], which absorb a final small carry.
 */
void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
    const uint64_t *a = as->limb;
    uint64_t *c = cs->limb;

    const uint64_t mask = (1ull<<56) - 1;

    /* Seed both chains with the bottom limb of each half. */
    __uint128_t lo = widemul_rm(b, &a[0]);
    __uint128_t hi = widemul_rm(b, &a[4]);
    c[0] = (uint64_t)lo & mask;  lo >>= 56;
    c[4] = (uint64_t)hi & mask;  hi >>= 56;

    /* Propagate through limbs 1..3 and 5..7 in lockstep. */
    unsigned int i;
    for (i=1; i<4; i++) {
        mac_rm(&lo, b, &a[i]);
        mac_rm(&hi, b, &a[i+4]);
        c[i]   = (uint64_t)lo & mask;  lo >>= 56;
        c[i+4] = (uint64_t)hi & mask;  hi >>= 56;
    }

    /* Fold the carries: out of limb 3 into limb 4; out of limb 7
     * wraps into both limb 4 and limb 0. */
    lo += hi + c[4];
    c[4] = (uint64_t)lo & mask;
    c[5] += (uint64_t)(lo >> 56);

    hi += c[0];
    c[0] = (uint64_t)hi & mask;
    c[1] += (uint64_t)(hi >> 56);
}
181 
/* Square a field element: cs = as^2.
 *
 * Same limb layout (8 limbs, radix 2^56) and golden-ratio reduction
 * as gf_mul above, but with roughly half the partial products: the
 * off-diagonal terms a[i]*a[j] (i != j) occur twice in a square, so
 * they are doubled -- either via the *2 macro variants or via the
 * <<1 / >>55 handling of columns 3 and 7.
 *
 * widemul/mac/msb and their *2 (doubled-product) variants come from
 * f_field.h (not shown).  Result is partially reduced as in gf_mul:
 * limbs masked to 56 bits except c[0] and c[4], which absorb a final
 * small carry.
 */
void gf_sqr (gf_s *__restrict__ cs, const gf as) {
    const uint64_t *a = as->limb;
    uint64_t *c = cs->limb;

    __uint128_t accum0 = 0, accum1 = 0, accum2;
    uint64_t mask = (1ull<<56) - 1;  /* one radix-2^56 limb */

    uint64_t aa[4] VECTOR_ALIGNED;

    /* Karatsuba half-sum aa[i] = a[i] + a[i+4].
     * For some reason clang doesn't vectorize this without prompting? */
    unsigned int i;
    for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
      ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
    }

    /* Columns 3 and 7: cross terms only, each appearing twice in a
     * square, so the stored limbs are doubled below. */
    accum2  = widemul(&a[0],&a[3]);
    accum0  = widemul(&aa[0],&aa[3]);
    accum1  = widemul(&a[4],&a[7]);

    mac(&accum2, &a[1], &a[2]);
    mac(&accum0, &aa[1], &aa[2]);
    mac(&accum1, &a[5], &a[6]);

    /* Karatsuba recombination: accum1 -> column 3, accum0 -> column 7. */
    accum0 -= accum2;
    accum1 += accum2;

    c[3] = ((uint64_t)(accum1))<<1 & mask;
    c[7] = ((uint64_t)(accum0))<<1 & mask;

    /* >>55 rather than >>56: the carry must carry the doubling too
     * ((x<<1)>>56 == x>>55). */
    accum0 >>= 55;
    accum1 >>= 55;

    /* Columns 0 and 4: doubled cross terms plus square diagonals. */
    mac2(&accum0, &aa[1],&aa[3]);
    mac2(&accum1, &a[5], &a[7]);
    mac(&accum0, &aa[2], &aa[2]);
    accum1 += accum0;

    msb2(&accum0, &a[1], &a[3]);
    mac(&accum1, &a[6], &a[6]);

    accum2 = widemul(&a[0],&a[0]);
    accum1 -= accum2;
    accum0 += accum2;

    msb(&accum0, &a[2], &a[2]);
    mac(&accum1, &aa[0], &aa[0]);
    mac(&accum0, &a[4], &a[4]);

    c[0] = ((uint64_t)(accum0)) & mask;
    c[4] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Columns 1 and 5. */
    accum2  = widemul2(&aa[2],&aa[3]);
    msb2(&accum0, &a[2], &a[3]);
    mac2(&accum1, &a[6], &a[7]);

    accum1 += accum2;
    accum0 += accum2;

    accum2  = widemul2(&a[0],&a[1]);
    mac2(&accum1, &aa[0], &aa[1]);
    mac2(&accum0, &a[4], &a[5]);

    accum1 -= accum2;
    accum0 += accum2;

    c[1] = ((uint64_t)(accum0)) & mask;
    c[5] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Columns 2 and 6. */
    accum2  = widemul(&aa[3],&aa[3]);
    msb(&accum0, &a[3], &a[3]);
    mac(&accum1, &a[7], &a[7]);

    accum1 += accum2;
    accum0 += accum2;

    accum2  = widemul2(&a[0],&a[2]);
    mac2(&accum1, &aa[0], &aa[2]);
    mac2(&accum0, &a[4], &a[6]);

    mac(&accum2, &a[1], &a[1]);
    mac(&accum1, &aa[1], &aa[1]);
    mac(&accum0, &a[5], &a[5]);

    accum1 -= accum2;
    accum0 += accum2;

    c[2] = ((uint64_t)(accum0)) & mask;
    c[6] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Add the carries out of columns 2 and 6 into limbs 3 and 7. */
    accum0 += c[3];
    accum1 += c[7];
    c[3] = ((uint64_t)(accum0)) & mask;
    c[7] = ((uint64_t)(accum1)) & mask;

    /* we could almost stop here, but it wouldn't be stable, so... */

    /* Final fold: carry out of limb 3 into limb 4; carry out of limb 7
     * wraps (2^448 == 2^224 + 1 mod p) into limbs 4 and 0. */
    accum0 >>= 56;
    accum1 >>= 56;
    c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
    c[0] += ((uint64_t)(accum1));
}
292