/**********************************************************************
 * Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille               *
 * Distributed under the MIT software license, see the accompanying   *
 * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
 **********************************************************************/

/**
 * Changelog:
 * - March 2013, Diederik Huys:    original version
 * - November 2014, Pieter Wuille: updated to use Peter Dettman's parallel multiplication algorithm
 * - December 2014, Pieter Wuille: converted from YASM to GCC inline assembly
 */

#ifndef SECP256K1_FIELD_INNER5X52_IMPL_H
#define SECP256K1_FIELD_INNER5X52_IMPL_H

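/* Both routines below operate on field elements in the 5x52 representation:
 *   a = a0 + a1*2^52 + a2*2^104 + a3*2^156 + a4*2^208,
 * where the top limb a4 carries at most 48 significant bits when fully
 * reduced. Since p = 2^256 - 0x1000003D1, a product term of weight
 * 2^(52*(k+5)) folds back onto the term of weight 2^(52*k) with multiplier
 *   2^260 mod p = 0x1000003D1 * 2^4 = 0x1000003D10,
 * which is the constant R used throughout the assembly; the final fold of
 * the top 4 bits of t4 instead uses R >> 4 = 0x1000003D1 directly.
 */
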
SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) {
/**
 * Registers: rdx:rax = multiplication accumulator
 *            r9:r8   = c
 *            r15:rcx = d
 *            r10-r14 = a0-a4
 *            rbx     = b
 *            rdi     = r
 *            rsi     = a, then scratch for t3/t4/u0
 */
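/* tmp1, tmp2 and tmp3 (memory operands %q1, %q2 and %q3) spill t3, t4 and tx
 * to the stack, as every general-purpose register is committed to the values
 * listed above. */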
  uint64_t tmp1, tmp2, tmp3;
__asm__ __volatile__(
    "movq 0(%%rsi),%%r10\n"
    "movq 8(%%rsi),%%r11\n"
    "movq 16(%%rsi),%%r12\n"
    "movq 24(%%rsi),%%r13\n"
    "movq 32(%%rsi),%%r14\n"

    /* d = a3 * b0 */
    "movq 0(%%rbx),%%rax\n"
    "mulq %%r13\n"
    "movq %%rax,%%rcx\n"
    "movq %%rdx,%%r15\n"
    /* d += a2 * b1 */
    "movq 8(%%rbx),%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a1 * b2 */
    "movq 16(%%rbx),%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a0 * b3 */
    "movq 24(%%rbx),%%rax\n"
    "mulq %%r10\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* c = a4 * b4 */
    "movq 32(%%rbx),%%rax\n"
    "mulq %%r14\n"
    "movq %%rax,%%r8\n"
    "movq %%rdx,%%r9\n"
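    /* c holds p8 = a4 * b4, of weight 2^416 = 2^(156+260). As 2^260 == R
     * (mod p), it folds onto d in two steps: the low 52 bits onto t3 here,
     * the remainder onto t4 via the later "d += c * R". */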
    /* d += (c & M) * R */
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* c >>= 52 (%%r8 only) */
    "shrdq $52,%%r9,%%r8\n"
    /* t3 (tmp1) = d & M */
    "movq %%rcx,%%rsi\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rsi\n"
    "movq %%rsi,%q1\n"
    /* d >>= 52 */
    "shrdq $52,%%r15,%%rcx\n"
    "xorq %%r15,%%r15\n"
    /* d += a4 * b0 */
    "movq 0(%%rbx),%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a3 * b1 */
    "movq 8(%%rbx),%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a2 * b2 */
    "movq 16(%%rbx),%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a1 * b3 */
    "movq 24(%%rbx),%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a0 * b4 */
    "movq 32(%%rbx),%%rax\n"
    "mulq %%r10\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += c * R */
    "movq %%r8,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* t4 = d & M (%%rsi) */
    "movq %%rcx,%%rsi\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rsi\n"
    /* d >>= 52 */
    "shrdq $52,%%r15,%%rcx\n"
    "xorq %%r15,%%r15\n"
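    /* A fully reduced field element has only 48 bits in its top limb, so bits
     * 48..51 of t4 carry weight 2^(208+48) = 2^256 == R >> 4 (mod p). They are
     * split off as tx and packed below into u0 = ((d & M) << 4) | tx, letting
     * a single multiplication by R >> 4 fold both contributions into limb 0. */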
    /* tx = t4 >> 48 (tmp3) */
    "movq %%rsi,%%rax\n"
    "shrq $48,%%rax\n"
    "movq %%rax,%q3\n"
    /* t4 &= (M >> 4) (tmp2) */
    "movq $0xffffffffffff,%%rax\n"
    "andq %%rax,%%rsi\n"
    "movq %%rsi,%q2\n"
    /* c = a0 * b0 */
    "movq 0(%%rbx),%%rax\n"
    "mulq %%r10\n"
    "movq %%rax,%%r8\n"
    "movq %%rdx,%%r9\n"
    /* d += a4 * b1 */
    "movq 8(%%rbx),%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a3 * b2 */
    "movq 16(%%rbx),%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a2 * b3 */
    "movq 24(%%rbx),%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a1 * b4 */
    "movq 32(%%rbx),%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* u0 = d & M (%%rsi) */
    "movq %%rcx,%%rsi\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rsi\n"
    /* d >>= 52 */
    "shrdq $52,%%r15,%%rcx\n"
    "xorq %%r15,%%r15\n"
    /* u0 = (u0 << 4) | tx (%%rsi) */
    "shlq $4,%%rsi\n"
    "movq %q3,%%rax\n"
    "orq %%rax,%%rsi\n"
    /* c += u0 * (R >> 4) */
    "movq $0x1000003d1,%%rax\n"
    "mulq %%rsi\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* r[0] = c & M */
    "movq %%r8,%%rax\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "movq %%rax,0(%%rdi)\n"
    /* c >>= 52 */
    "shrdq $52,%%r9,%%r8\n"
    "xorq %%r9,%%r9\n"
    /* c += a1 * b0 */
    "movq 0(%%rbx),%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* c += a0 * b1 */
    "movq 8(%%rbx),%%rax\n"
    "mulq %%r10\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d += a4 * b2 */
    "movq 16(%%rbx),%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a3 * b3 */
    "movq 24(%%rbx),%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a2 * b4 */
    "movq 32(%%rbx),%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* c += (d & M) * R */
    "movq %%rcx,%%rax\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d >>= 52 */
    "shrdq $52,%%r15,%%rcx\n"
    "xorq %%r15,%%r15\n"
    /* r[1] = c & M */
    "movq %%r8,%%rax\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "movq %%rax,8(%%rdi)\n"
    /* c >>= 52 */
    "shrdq $52,%%r9,%%r8\n"
    "xorq %%r9,%%r9\n"
    /* c += a2 * b0 */
    "movq 0(%%rbx),%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* c += a1 * b1 */
    "movq 8(%%rbx),%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* c += a0 * b2 (last use of %%r10 = a0) */
    "movq 16(%%rbx),%%rax\n"
    "mulq %%r10\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* fetch t3 (%%r10, overwrites a0), t4 (%%rsi) */
    "movq %q2,%%rsi\n"
    "movq %q1,%%r10\n"
    /* d += a4 * b3 */
    "movq 24(%%rbx),%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a3 * b4 */
    "movq 32(%%rbx),%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* c += (d & M) * R */
    "movq %%rcx,%%rax\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d >>= 52 (%%rcx only) */
    "shrdq $52,%%r15,%%rcx\n"
    /* r[2] = c & M */
    "movq %%r8,%%rax\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "movq %%rax,16(%%rdi)\n"
    /* c >>= 52 */
    "shrdq $52,%%r9,%%r8\n"
    "xorq %%r9,%%r9\n"
    /* c += t3 */
    "addq %%r10,%%r8\n"
    /* c += d * R */
    "movq %%rcx,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* r[3] = c & M */
    "movq %%r8,%%rax\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "movq %%rax,24(%%rdi)\n"
    /* c >>= 52 (%%r8 only) */
    "shrdq $52,%%r9,%%r8\n"
    /* c += t4 (%%r8 only) */
    "addq %%rsi,%%r8\n"
    /* r[4] = c */
    "movq %%r8,32(%%rdi)\n"
: "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3)
: "b"(b), "D"(r)
: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
);
}

SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) {
/**
 * Registers: rdx:rax = multiplication accumulator
 *            r9:r8   = c
 *            rcx:rbx = d
 *            r10-r14 = a0-a4
 *            r15     = M (0xfffffffffffff)
 *            rdi     = r
 *            rsi     = a, then scratch for t3/t4/u0
 */
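/* The flow mirrors secp256k1_fe_mul_inner above, with two square-specific
 * changes: each cross term a_i*a_j (i != j) occurs twice in the expansion, so
 * one factor is doubled ("leaq (%%rN,%%rN,1),%%rax" forms 2*aN without
 * clobbering aN; a4 and a0 are doubled in place once their single-width uses
 * are done), and since no b pointer is needed, %rbx holds the low half of d
 * while %r15 caches M, avoiding the repeated immediate loads of the multiply. */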
  uint64_t tmp1, tmp2, tmp3;
__asm__ __volatile__(
    "movq 0(%%rsi),%%r10\n"
    "movq 8(%%rsi),%%r11\n"
    "movq 16(%%rsi),%%r12\n"
    "movq 24(%%rsi),%%r13\n"
    "movq 32(%%rsi),%%r14\n"
    "movq $0xfffffffffffff,%%r15\n"

    /* d = (a0*2) * a3 */
    "leaq (%%r10,%%r10,1),%%rax\n"
    "mulq %%r13\n"
    "movq %%rax,%%rbx\n"
    "movq %%rdx,%%rcx\n"
    /* d += (a1*2) * a2 */
    "leaq (%%r11,%%r11,1),%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* c = a4 * a4 */
    "movq %%r14,%%rax\n"
    "mulq %%r14\n"
    "movq %%rax,%%r8\n"
    "movq %%rdx,%%r9\n"
    /* d += (c & M) * R */
    "andq %%r15,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* c >>= 52 (%%r8 only) */
    "shrdq $52,%%r9,%%r8\n"
    /* t3 (tmp1) = d & M */
    "movq %%rbx,%%rsi\n"
    "andq %%r15,%%rsi\n"
    "movq %%rsi,%q1\n"
    /* d >>= 52 */
    "shrdq $52,%%rcx,%%rbx\n"
    "xorq %%rcx,%%rcx\n"
    /* a4 *= 2 */
    "addq %%r14,%%r14\n"
    /* d += a0 * a4 */
    "movq %%r10,%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* d += (a1*2) * a3 */
    "leaq (%%r11,%%r11,1),%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* d += a2 * a2 */
    "movq %%r12,%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* d += c * R */
    "movq %%r8,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* t4 = d & M (%%rsi) */
    "movq %%rbx,%%rsi\n"
    "andq %%r15,%%rsi\n"
    /* d >>= 52 */
    "shrdq $52,%%rcx,%%rbx\n"
    "xorq %%rcx,%%rcx\n"
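    /* Same top-limb fold as in the multiplication above: bits 48..51 of t4
     * become tx and are merged into u0, so one multiplication by R >> 4
     * reduces both contributions. */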
    /* tx = t4 >> 48 (tmp3) */
    "movq %%rsi,%%rax\n"
    "shrq $48,%%rax\n"
    "movq %%rax,%q3\n"
    /* t4 &= (M >> 4) (tmp2) */
    "movq $0xffffffffffff,%%rax\n"
    "andq %%rax,%%rsi\n"
    "movq %%rsi,%q2\n"
    /* c = a0 * a0 */
    "movq %%r10,%%rax\n"
    "mulq %%r10\n"
    "movq %%rax,%%r8\n"
    "movq %%rdx,%%r9\n"
    /* d += a1 * a4 */
    "movq %%r11,%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* d += (a2*2) * a3 */
    "leaq (%%r12,%%r12,1),%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* u0 = d & M (%%rsi) */
    "movq %%rbx,%%rsi\n"
    "andq %%r15,%%rsi\n"
    /* d >>= 52 */
    "shrdq $52,%%rcx,%%rbx\n"
    "xorq %%rcx,%%rcx\n"
    /* u0 = (u0 << 4) | tx (%%rsi) */
    "shlq $4,%%rsi\n"
    "movq %q3,%%rax\n"
    "orq %%rax,%%rsi\n"
    /* c += u0 * (R >> 4) */
    "movq $0x1000003d1,%%rax\n"
    "mulq %%rsi\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* r[0] = c & M */
    "movq %%r8,%%rax\n"
    "andq %%r15,%%rax\n"
    "movq %%rax,0(%%rdi)\n"
    /* c >>= 52 */
    "shrdq $52,%%r9,%%r8\n"
    "xorq %%r9,%%r9\n"
    /* a0 *= 2 */
    "addq %%r10,%%r10\n"
    /* c += a0 * a1 */
    "movq %%r10,%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d += a2 * a4 */
    "movq %%r12,%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* d += a3 * a3 */
    "movq %%r13,%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* c += (d & M) * R */
    "movq %%rbx,%%rax\n"
    "andq %%r15,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d >>= 52 */
    "shrdq $52,%%rcx,%%rbx\n"
    "xorq %%rcx,%%rcx\n"
    /* r[1] = c & M */
    "movq %%r8,%%rax\n"
    "andq %%r15,%%rax\n"
    "movq %%rax,8(%%rdi)\n"
    /* c >>= 52 */
    "shrdq $52,%%r9,%%r8\n"
    "xorq %%r9,%%r9\n"
    /* c += a0 * a2 (last use of %%r10) */
    "movq %%r10,%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* fetch t3 (%%r10, overwrites a0), t4 (%%rsi) */
    "movq %q2,%%rsi\n"
    "movq %q1,%%r10\n"
    /* c += a1 * a1 */
    "movq %%r11,%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d += a3 * a4 */
    "movq %%r13,%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* c += (d & M) * R */
    "movq %%rbx,%%rax\n"
    "andq %%r15,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d >>= 52 (%%rbx only) */
    "shrdq $52,%%rcx,%%rbx\n"
    /* r[2] = c & M */
    "movq %%r8,%%rax\n"
    "andq %%r15,%%rax\n"
    "movq %%rax,16(%%rdi)\n"
    /* c >>= 52 */
    "shrdq $52,%%r9,%%r8\n"
    "xorq %%r9,%%r9\n"
    /* c += t3 */
    "addq %%r10,%%r8\n"
    /* c += d * R */
    "movq %%rbx,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* r[3] = c & M */
    "movq %%r8,%%rax\n"
    "andq %%r15,%%rax\n"
    "movq %%rax,24(%%rdi)\n"
    /* c >>= 52 (%%r8 only) */
    "shrdq $52,%%r9,%%r8\n"
    /* c += t4 (%%r8 only) */
    "addq %%rsi,%%r8\n"
    /* r[4] = c */
    "movq %%r8,32(%%rdi)\n"
: "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3)
: "D"(r)
: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
);
}

#endif /* SECP256K1_FIELD_INNER5X52_IMPL_H */