/***********************************************************************
 * Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille                *
 * Distributed under the MIT software license, see the accompanying    *
 * file COPYING or https://www.opensource.org/licenses/mit-license.php.*
 ***********************************************************************/
6
/**
 * Changelog:
 * - March 2013, Diederik Huys:    original version
 * - November 2014, Pieter Wuille: updated to use Peter Dettman's parallel multiplication algorithm
 * - December 2014, Pieter Wuille: converted from YASM to GCC inline assembly
 */
13
14 #ifndef SECP256K1_FIELD_INNER5X52_IMPL_H
15 #define SECP256K1_FIELD_INNER5X52_IMPL_H
16
secp256k1_fe_mul_inner(uint64_t * r,const uint64_t * a,const uint64_t * SECP256K1_RESTRICT b)17 SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) {
18 /**
19 * Registers: rdx:rax = multiplication accumulator
20 * r9:r8 = c
21 * r15:rcx = d
22 * r10-r14 = a0-a4
23 * rbx = b
24 * rdi = r
25 * rsi = a / t?
26 */
27 uint64_t tmp1, tmp2, tmp3;
28 __asm__ __volatile__(
29 "movq 0(%%rsi),%%r10\n"
30 "movq 8(%%rsi),%%r11\n"
31 "movq 16(%%rsi),%%r12\n"
32 "movq 24(%%rsi),%%r13\n"
33 "movq 32(%%rsi),%%r14\n"
34
35 /* d += a3 * b0 */
36 "movq 0(%%rbx),%%rax\n"
37 "mulq %%r13\n"
38 "movq %%rax,%%rcx\n"
39 "movq %%rdx,%%r15\n"
40 /* d += a2 * b1 */
41 "movq 8(%%rbx),%%rax\n"
42 "mulq %%r12\n"
43 "addq %%rax,%%rcx\n"
44 "adcq %%rdx,%%r15\n"
45 /* d += a1 * b2 */
46 "movq 16(%%rbx),%%rax\n"
47 "mulq %%r11\n"
48 "addq %%rax,%%rcx\n"
49 "adcq %%rdx,%%r15\n"
50 /* d = a0 * b3 */
51 "movq 24(%%rbx),%%rax\n"
52 "mulq %%r10\n"
53 "addq %%rax,%%rcx\n"
54 "adcq %%rdx,%%r15\n"
55 /* c = a4 * b4 */
56 "movq 32(%%rbx),%%rax\n"
57 "mulq %%r14\n"
58 "movq %%rax,%%r8\n"
59 "movq %%rdx,%%r9\n"
60 /* d += (c & M) * R */
61 "movq $0xfffffffffffff,%%rdx\n"
62 "andq %%rdx,%%rax\n"
63 "movq $0x1000003d10,%%rdx\n"
64 "mulq %%rdx\n"
65 "addq %%rax,%%rcx\n"
66 "adcq %%rdx,%%r15\n"
67 /* c >>= 52 (%%r8 only) */
68 "shrdq $52,%%r9,%%r8\n"
69 /* t3 (tmp1) = d & M */
70 "movq %%rcx,%%rsi\n"
71 "movq $0xfffffffffffff,%%rdx\n"
72 "andq %%rdx,%%rsi\n"
73 "movq %%rsi,%q1\n"
74 /* d >>= 52 */
75 "shrdq $52,%%r15,%%rcx\n"
76 "xorq %%r15,%%r15\n"
77 /* d += a4 * b0 */
78 "movq 0(%%rbx),%%rax\n"
79 "mulq %%r14\n"
80 "addq %%rax,%%rcx\n"
81 "adcq %%rdx,%%r15\n"
82 /* d += a3 * b1 */
83 "movq 8(%%rbx),%%rax\n"
84 "mulq %%r13\n"
85 "addq %%rax,%%rcx\n"
86 "adcq %%rdx,%%r15\n"
87 /* d += a2 * b2 */
88 "movq 16(%%rbx),%%rax\n"
89 "mulq %%r12\n"
90 "addq %%rax,%%rcx\n"
91 "adcq %%rdx,%%r15\n"
92 /* d += a1 * b3 */
93 "movq 24(%%rbx),%%rax\n"
94 "mulq %%r11\n"
95 "addq %%rax,%%rcx\n"
96 "adcq %%rdx,%%r15\n"
97 /* d += a0 * b4 */
98 "movq 32(%%rbx),%%rax\n"
99 "mulq %%r10\n"
100 "addq %%rax,%%rcx\n"
101 "adcq %%rdx,%%r15\n"
102 /* d += c * R */
103 "movq %%r8,%%rax\n"
104 "movq $0x1000003d10,%%rdx\n"
105 "mulq %%rdx\n"
106 "addq %%rax,%%rcx\n"
107 "adcq %%rdx,%%r15\n"
108 /* t4 = d & M (%%rsi) */
109 "movq %%rcx,%%rsi\n"
110 "movq $0xfffffffffffff,%%rdx\n"
111 "andq %%rdx,%%rsi\n"
112 /* d >>= 52 */
113 "shrdq $52,%%r15,%%rcx\n"
114 "xorq %%r15,%%r15\n"
115 /* tx = t4 >> 48 (tmp3) */
116 "movq %%rsi,%%rax\n"
117 "shrq $48,%%rax\n"
118 "movq %%rax,%q3\n"
119 /* t4 &= (M >> 4) (tmp2) */
120 "movq $0xffffffffffff,%%rax\n"
121 "andq %%rax,%%rsi\n"
122 "movq %%rsi,%q2\n"
123 /* c = a0 * b0 */
124 "movq 0(%%rbx),%%rax\n"
125 "mulq %%r10\n"
126 "movq %%rax,%%r8\n"
127 "movq %%rdx,%%r9\n"
128 /* d += a4 * b1 */
129 "movq 8(%%rbx),%%rax\n"
130 "mulq %%r14\n"
131 "addq %%rax,%%rcx\n"
132 "adcq %%rdx,%%r15\n"
133 /* d += a3 * b2 */
134 "movq 16(%%rbx),%%rax\n"
135 "mulq %%r13\n"
136 "addq %%rax,%%rcx\n"
137 "adcq %%rdx,%%r15\n"
138 /* d += a2 * b3 */
139 "movq 24(%%rbx),%%rax\n"
140 "mulq %%r12\n"
141 "addq %%rax,%%rcx\n"
142 "adcq %%rdx,%%r15\n"
143 /* d += a1 * b4 */
144 "movq 32(%%rbx),%%rax\n"
145 "mulq %%r11\n"
146 "addq %%rax,%%rcx\n"
147 "adcq %%rdx,%%r15\n"
148 /* u0 = d & M (%%rsi) */
149 "movq %%rcx,%%rsi\n"
150 "movq $0xfffffffffffff,%%rdx\n"
151 "andq %%rdx,%%rsi\n"
152 /* d >>= 52 */
153 "shrdq $52,%%r15,%%rcx\n"
154 "xorq %%r15,%%r15\n"
155 /* u0 = (u0 << 4) | tx (%%rsi) */
156 "shlq $4,%%rsi\n"
157 "movq %q3,%%rax\n"
158 "orq %%rax,%%rsi\n"
159 /* c += u0 * (R >> 4) */
160 "movq $0x1000003d1,%%rax\n"
161 "mulq %%rsi\n"
162 "addq %%rax,%%r8\n"
163 "adcq %%rdx,%%r9\n"
164 /* r[0] = c & M */
165 "movq %%r8,%%rax\n"
166 "movq $0xfffffffffffff,%%rdx\n"
167 "andq %%rdx,%%rax\n"
168 "movq %%rax,0(%%rdi)\n"
169 /* c >>= 52 */
170 "shrdq $52,%%r9,%%r8\n"
171 "xorq %%r9,%%r9\n"
172 /* c += a1 * b0 */
173 "movq 0(%%rbx),%%rax\n"
174 "mulq %%r11\n"
175 "addq %%rax,%%r8\n"
176 "adcq %%rdx,%%r9\n"
177 /* c += a0 * b1 */
178 "movq 8(%%rbx),%%rax\n"
179 "mulq %%r10\n"
180 "addq %%rax,%%r8\n"
181 "adcq %%rdx,%%r9\n"
182 /* d += a4 * b2 */
183 "movq 16(%%rbx),%%rax\n"
184 "mulq %%r14\n"
185 "addq %%rax,%%rcx\n"
186 "adcq %%rdx,%%r15\n"
187 /* d += a3 * b3 */
188 "movq 24(%%rbx),%%rax\n"
189 "mulq %%r13\n"
190 "addq %%rax,%%rcx\n"
191 "adcq %%rdx,%%r15\n"
192 /* d += a2 * b4 */
193 "movq 32(%%rbx),%%rax\n"
194 "mulq %%r12\n"
195 "addq %%rax,%%rcx\n"
196 "adcq %%rdx,%%r15\n"
197 /* c += (d & M) * R */
198 "movq %%rcx,%%rax\n"
199 "movq $0xfffffffffffff,%%rdx\n"
200 "andq %%rdx,%%rax\n"
201 "movq $0x1000003d10,%%rdx\n"
202 "mulq %%rdx\n"
203 "addq %%rax,%%r8\n"
204 "adcq %%rdx,%%r9\n"
205 /* d >>= 52 */
206 "shrdq $52,%%r15,%%rcx\n"
207 "xorq %%r15,%%r15\n"
208 /* r[1] = c & M */
209 "movq %%r8,%%rax\n"
210 "movq $0xfffffffffffff,%%rdx\n"
211 "andq %%rdx,%%rax\n"
212 "movq %%rax,8(%%rdi)\n"
213 /* c >>= 52 */
214 "shrdq $52,%%r9,%%r8\n"
215 "xorq %%r9,%%r9\n"
216 /* c += a2 * b0 */
217 "movq 0(%%rbx),%%rax\n"
218 "mulq %%r12\n"
219 "addq %%rax,%%r8\n"
220 "adcq %%rdx,%%r9\n"
221 /* c += a1 * b1 */
222 "movq 8(%%rbx),%%rax\n"
223 "mulq %%r11\n"
224 "addq %%rax,%%r8\n"
225 "adcq %%rdx,%%r9\n"
226 /* c += a0 * b2 (last use of %%r10 = a0) */
227 "movq 16(%%rbx),%%rax\n"
228 "mulq %%r10\n"
229 "addq %%rax,%%r8\n"
230 "adcq %%rdx,%%r9\n"
231 /* fetch t3 (%%r10, overwrites a0), t4 (%%rsi) */
232 "movq %q2,%%rsi\n"
233 "movq %q1,%%r10\n"
234 /* d += a4 * b3 */
235 "movq 24(%%rbx),%%rax\n"
236 "mulq %%r14\n"
237 "addq %%rax,%%rcx\n"
238 "adcq %%rdx,%%r15\n"
239 /* d += a3 * b4 */
240 "movq 32(%%rbx),%%rax\n"
241 "mulq %%r13\n"
242 "addq %%rax,%%rcx\n"
243 "adcq %%rdx,%%r15\n"
244 /* c += (d & M) * R */
245 "movq %%rcx,%%rax\n"
246 "movq $0xfffffffffffff,%%rdx\n"
247 "andq %%rdx,%%rax\n"
248 "movq $0x1000003d10,%%rdx\n"
249 "mulq %%rdx\n"
250 "addq %%rax,%%r8\n"
251 "adcq %%rdx,%%r9\n"
252 /* d >>= 52 (%%rcx only) */
253 "shrdq $52,%%r15,%%rcx\n"
254 /* r[2] = c & M */
255 "movq %%r8,%%rax\n"
256 "movq $0xfffffffffffff,%%rdx\n"
257 "andq %%rdx,%%rax\n"
258 "movq %%rax,16(%%rdi)\n"
259 /* c >>= 52 */
260 "shrdq $52,%%r9,%%r8\n"
261 "xorq %%r9,%%r9\n"
262 /* c += t3 */
263 "addq %%r10,%%r8\n"
264 /* c += d * R */
265 "movq %%rcx,%%rax\n"
266 "movq $0x1000003d10,%%rdx\n"
267 "mulq %%rdx\n"
268 "addq %%rax,%%r8\n"
269 "adcq %%rdx,%%r9\n"
270 /* r[3] = c & M */
271 "movq %%r8,%%rax\n"
272 "movq $0xfffffffffffff,%%rdx\n"
273 "andq %%rdx,%%rax\n"
274 "movq %%rax,24(%%rdi)\n"
275 /* c >>= 52 (%%r8 only) */
276 "shrdq $52,%%r9,%%r8\n"
277 /* c += t4 (%%r8 only) */
278 "addq %%rsi,%%r8\n"
279 /* r[4] = c */
280 "movq %%r8,32(%%rdi)\n"
281 : "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3)
282 : "b"(b), "D"(r)
283 : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
284 );
285 }
286
secp256k1_fe_sqr_inner(uint64_t * r,const uint64_t * a)287 SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) {
288 /**
289 * Registers: rdx:rax = multiplication accumulator
290 * r9:r8 = c
291 * rcx:rbx = d
292 * r10-r14 = a0-a4
293 * r15 = M (0xfffffffffffff)
294 * rdi = r
295 * rsi = a / t?
296 */
297 uint64_t tmp1, tmp2, tmp3;
298 __asm__ __volatile__(
299 "movq 0(%%rsi),%%r10\n"
300 "movq 8(%%rsi),%%r11\n"
301 "movq 16(%%rsi),%%r12\n"
302 "movq 24(%%rsi),%%r13\n"
303 "movq 32(%%rsi),%%r14\n"
304 "movq $0xfffffffffffff,%%r15\n"
305
306 /* d = (a0*2) * a3 */
307 "leaq (%%r10,%%r10,1),%%rax\n"
308 "mulq %%r13\n"
309 "movq %%rax,%%rbx\n"
310 "movq %%rdx,%%rcx\n"
311 /* d += (a1*2) * a2 */
312 "leaq (%%r11,%%r11,1),%%rax\n"
313 "mulq %%r12\n"
314 "addq %%rax,%%rbx\n"
315 "adcq %%rdx,%%rcx\n"
316 /* c = a4 * a4 */
317 "movq %%r14,%%rax\n"
318 "mulq %%r14\n"
319 "movq %%rax,%%r8\n"
320 "movq %%rdx,%%r9\n"
321 /* d += (c & M) * R */
322 "andq %%r15,%%rax\n"
323 "movq $0x1000003d10,%%rdx\n"
324 "mulq %%rdx\n"
325 "addq %%rax,%%rbx\n"
326 "adcq %%rdx,%%rcx\n"
327 /* c >>= 52 (%%r8 only) */
328 "shrdq $52,%%r9,%%r8\n"
329 /* t3 (tmp1) = d & M */
330 "movq %%rbx,%%rsi\n"
331 "andq %%r15,%%rsi\n"
332 "movq %%rsi,%q1\n"
333 /* d >>= 52 */
334 "shrdq $52,%%rcx,%%rbx\n"
335 "xorq %%rcx,%%rcx\n"
336 /* a4 *= 2 */
337 "addq %%r14,%%r14\n"
338 /* d += a0 * a4 */
339 "movq %%r10,%%rax\n"
340 "mulq %%r14\n"
341 "addq %%rax,%%rbx\n"
342 "adcq %%rdx,%%rcx\n"
343 /* d+= (a1*2) * a3 */
344 "leaq (%%r11,%%r11,1),%%rax\n"
345 "mulq %%r13\n"
346 "addq %%rax,%%rbx\n"
347 "adcq %%rdx,%%rcx\n"
348 /* d += a2 * a2 */
349 "movq %%r12,%%rax\n"
350 "mulq %%r12\n"
351 "addq %%rax,%%rbx\n"
352 "adcq %%rdx,%%rcx\n"
353 /* d += c * R */
354 "movq %%r8,%%rax\n"
355 "movq $0x1000003d10,%%rdx\n"
356 "mulq %%rdx\n"
357 "addq %%rax,%%rbx\n"
358 "adcq %%rdx,%%rcx\n"
359 /* t4 = d & M (%%rsi) */
360 "movq %%rbx,%%rsi\n"
361 "andq %%r15,%%rsi\n"
362 /* d >>= 52 */
363 "shrdq $52,%%rcx,%%rbx\n"
364 "xorq %%rcx,%%rcx\n"
365 /* tx = t4 >> 48 (tmp3) */
366 "movq %%rsi,%%rax\n"
367 "shrq $48,%%rax\n"
368 "movq %%rax,%q3\n"
369 /* t4 &= (M >> 4) (tmp2) */
370 "movq $0xffffffffffff,%%rax\n"
371 "andq %%rax,%%rsi\n"
372 "movq %%rsi,%q2\n"
373 /* c = a0 * a0 */
374 "movq %%r10,%%rax\n"
375 "mulq %%r10\n"
376 "movq %%rax,%%r8\n"
377 "movq %%rdx,%%r9\n"
378 /* d += a1 * a4 */
379 "movq %%r11,%%rax\n"
380 "mulq %%r14\n"
381 "addq %%rax,%%rbx\n"
382 "adcq %%rdx,%%rcx\n"
383 /* d += (a2*2) * a3 */
384 "leaq (%%r12,%%r12,1),%%rax\n"
385 "mulq %%r13\n"
386 "addq %%rax,%%rbx\n"
387 "adcq %%rdx,%%rcx\n"
388 /* u0 = d & M (%%rsi) */
389 "movq %%rbx,%%rsi\n"
390 "andq %%r15,%%rsi\n"
391 /* d >>= 52 */
392 "shrdq $52,%%rcx,%%rbx\n"
393 "xorq %%rcx,%%rcx\n"
394 /* u0 = (u0 << 4) | tx (%%rsi) */
395 "shlq $4,%%rsi\n"
396 "movq %q3,%%rax\n"
397 "orq %%rax,%%rsi\n"
398 /* c += u0 * (R >> 4) */
399 "movq $0x1000003d1,%%rax\n"
400 "mulq %%rsi\n"
401 "addq %%rax,%%r8\n"
402 "adcq %%rdx,%%r9\n"
403 /* r[0] = c & M */
404 "movq %%r8,%%rax\n"
405 "andq %%r15,%%rax\n"
406 "movq %%rax,0(%%rdi)\n"
407 /* c >>= 52 */
408 "shrdq $52,%%r9,%%r8\n"
409 "xorq %%r9,%%r9\n"
410 /* a0 *= 2 */
411 "addq %%r10,%%r10\n"
412 /* c += a0 * a1 */
413 "movq %%r10,%%rax\n"
414 "mulq %%r11\n"
415 "addq %%rax,%%r8\n"
416 "adcq %%rdx,%%r9\n"
417 /* d += a2 * a4 */
418 "movq %%r12,%%rax\n"
419 "mulq %%r14\n"
420 "addq %%rax,%%rbx\n"
421 "adcq %%rdx,%%rcx\n"
422 /* d += a3 * a3 */
423 "movq %%r13,%%rax\n"
424 "mulq %%r13\n"
425 "addq %%rax,%%rbx\n"
426 "adcq %%rdx,%%rcx\n"
427 /* c += (d & M) * R */
428 "movq %%rbx,%%rax\n"
429 "andq %%r15,%%rax\n"
430 "movq $0x1000003d10,%%rdx\n"
431 "mulq %%rdx\n"
432 "addq %%rax,%%r8\n"
433 "adcq %%rdx,%%r9\n"
434 /* d >>= 52 */
435 "shrdq $52,%%rcx,%%rbx\n"
436 "xorq %%rcx,%%rcx\n"
437 /* r[1] = c & M */
438 "movq %%r8,%%rax\n"
439 "andq %%r15,%%rax\n"
440 "movq %%rax,8(%%rdi)\n"
441 /* c >>= 52 */
442 "shrdq $52,%%r9,%%r8\n"
443 "xorq %%r9,%%r9\n"
444 /* c += a0 * a2 (last use of %%r10) */
445 "movq %%r10,%%rax\n"
446 "mulq %%r12\n"
447 "addq %%rax,%%r8\n"
448 "adcq %%rdx,%%r9\n"
449 /* fetch t3 (%%r10, overwrites a0),t4 (%%rsi) */
450 "movq %q2,%%rsi\n"
451 "movq %q1,%%r10\n"
452 /* c += a1 * a1 */
453 "movq %%r11,%%rax\n"
454 "mulq %%r11\n"
455 "addq %%rax,%%r8\n"
456 "adcq %%rdx,%%r9\n"
457 /* d += a3 * a4 */
458 "movq %%r13,%%rax\n"
459 "mulq %%r14\n"
460 "addq %%rax,%%rbx\n"
461 "adcq %%rdx,%%rcx\n"
462 /* c += (d & M) * R */
463 "movq %%rbx,%%rax\n"
464 "andq %%r15,%%rax\n"
465 "movq $0x1000003d10,%%rdx\n"
466 "mulq %%rdx\n"
467 "addq %%rax,%%r8\n"
468 "adcq %%rdx,%%r9\n"
469 /* d >>= 52 (%%rbx only) */
470 "shrdq $52,%%rcx,%%rbx\n"
471 /* r[2] = c & M */
472 "movq %%r8,%%rax\n"
473 "andq %%r15,%%rax\n"
474 "movq %%rax,16(%%rdi)\n"
475 /* c >>= 52 */
476 "shrdq $52,%%r9,%%r8\n"
477 "xorq %%r9,%%r9\n"
478 /* c += t3 */
479 "addq %%r10,%%r8\n"
480 /* c += d * R */
481 "movq %%rbx,%%rax\n"
482 "movq $0x1000003d10,%%rdx\n"
483 "mulq %%rdx\n"
484 "addq %%rax,%%r8\n"
485 "adcq %%rdx,%%r9\n"
486 /* r[3] = c & M */
487 "movq %%r8,%%rax\n"
488 "andq %%r15,%%rax\n"
489 "movq %%rax,24(%%rdi)\n"
490 /* c >>= 52 (%%r8 only) */
491 "shrdq $52,%%r9,%%r8\n"
492 /* c += t4 (%%r8 only) */
493 "addq %%rsi,%%r8\n"
494 /* r[4] = c */
495 "movq %%r8,32(%%rdi)\n"
496 : "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3)
497 : "D"(r)
498 : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
499 );
500 }
501
502 #endif /* SECP256K1_FIELD_INNER5X52_IMPL_H */
503