// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

// ----------------------------------------------------------------------------
// Square, z := x^2
// Input x[8]; output z[16]
//
//    extern void bignum_sqr_8_16_alt (uint64_t z[static 16], uint64_t x[static 8]);
//
// Standard x86-64 ABI: RDI = z, RSI = x
// Microsoft x64 ABI:   RCX = z, RDX = x
// ----------------------------------------------------------------------------

// The S2N_BN_* helper macros used below and at the entry label are not
// defined in this file; presumably they come from this header.

#include "s2n_bignum_internal.h"

// Everything that follows uses Intel operand order (op dst, src) with bare
// register names.

        .intel_syntax noprefix
        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16_alt)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16_alt)
        .text

// Input arguments. The function body always addresses the operands through
// rdi (z, the 16-word result) and rsi (x, the 8-word input); under
// WINDOWS_ABI the entry code first copies rcx/rdx into them.

#define z rdi
#define x rsi

// Rotating 3-word window (carry, high, low) into which the products for one
// result word are accumulated. The role of each register shifts by one
// position from one result word to the next.

#define t0 r8
#define t1 r9
#define t2 r10

// Low and high words of a local sum of cross products, so that the whole sum
// can be doubled once instead of doubling every product separately.

#define u0 rcx
#define u1 r11

// combadd(c,h,l,numa,numb): the key "multiply and add" step,
// (c,h,l) += numa * numb
//
// Loads numa into rax, multiplies it by the 64-bit memory operand numb and
// adds the 128-bit product rdx:rax into h:l, with the carry going into c.
// Overwrites rax, rdx and the flags.
//
// Keep commentary out of the macro body: it is built from backslash-continued
// lines, so a "//" comment there would swallow the following instructions.

#define combadd(c,h,l,numa,numb)                \
        mov     rax, numa;                      \
        mul     QWORD PTR numb;                 \
        add     l, rax;                         \
        adc     h, rdx;                         \
        adc     c, 0

// combaddz(c,h,l,numa,numb): set up an initial window,
// (c,h,l) = numa * numb
//
// Same multiply as combadd, but the product rdx:rax is moved into h:l rather
// than added, and c is cleared. Previous contents of c, h and l are
// discarded. Overwrites rax, rdx and the flags (via the xor).

#define combaddz(c,h,l,numa,numb)               \
        mov     rax, numa;                      \
        mul     QWORD PTR numb;                 \
        xor     c, c;                           \
        mov     l, rax;                         \
        mov     h, rdx

// doubladd(c,h,l,hh,ll): doubling step,
// (c,h,l) = 2 * (c,hh,ll) + (0,h,l)
//
// First doubles the 3-word value (c,hh,ll) in place with an add/adc chain,
// then adds the doubled hh:ll into h:l with the carry going into c.
// Note that hh and ll are overwritten with their doubled values.
// Overwrites the flags.

#define doubladd(c,h,l,hh,ll)                   \
        add     ll, ll;                         \
        adc     hh, hh;                         \
        adc     c, c;                           \
        add     l, ll;                          \
        adc     h, hh;                          \
        adc     c, 0

// combadd1(c,h,l,numa): square term incorporation,
// (c,h,l) += numa^2
//
// Loads numa into rax, squares it with "mul rax" and adds the 128-bit result
// rdx:rax into h:l, with the carry going into c.
// Overwrites rax, rdx and the flags.

#define combadd1(c,h,l,numa)                    \
        mov     rax, numa;                      \
        mul     rax;                            \
        add     l, rax;                         \
        adc     h, rdx;                         \
        adc     c, 0

// combads(h,l,numa): short form of the square step for use where no carry
// out of the top word is expected,
// (h,l) += numa^2
//
// Identical to combadd1 except that there is no third word: any carry out of
// h is simply dropped. Overwrites rax, rdx and the flags.

#define combads(h,l,numa)                       \
        mov     rax, numa;                      \
        mul     rax;                            \
        add     l, rax;                         \
        adc     h, rdx

// combadd2(c,h,l,numa,numb): doubling directly before adding, for result
// words that have a single non-square term,
// (c,h,l) += 2 * numa * numb
//
// The product rdx:rax is doubled in place; the bit shifted out of rdx is
// added to c before the doubled product is added into h:l, whose own carry
// is then added to c as well. Overwrites rax, rdx and the flags.

#define combadd2(c,h,l,numa,numb)               \
        mov     rax, numa;                      \
        mul     QWORD PTR numb;                 \
        add     rax, rax;                       \
        adc     rdx, rdx;                       \
        adc     c, 0;                           \
        add     l, rax;                         \
        adc     h, rdx;                         \
        adc     c, 0

// ----------------------------------------------------------------------------
// bignum_sqr_8_16_alt: z[0..15] := x[0..7]^2
//
// In the notes below xi means the 64-bit word at [x + 8*i].
//
// Result word k is the sum of all xi*xj with i + j = k, plus whatever was
// carried up from word k-1. Cross products (i != j) occur twice in the
// square, so they are either doubled one at a time (combadd2) or summed in
// (u1,u0) and doubled once (doubladd). Each result word is stored as soon as
// it is complete, and the three window registers then rotate roles.
//
// Because low words of z are stored while x is still being read, an output
// buffer overlapping the input would overwrite words that are read later;
// callers presumably must pass non-overlapping buffers.
//
// Registers written: rax and rdx (multiplier results), rcx and r11 (u0, u1),
// r8-r10 (t0-t2), and the flags. Under WINDOWS_ABI, rdi and rsi are pushed on
// entry and popped before returning.
// ----------------------------------------------------------------------------

S2N_BN_SYMBOL(bignum_sqr_8_16_alt):
        endbr64

// Windows shim: preserve rdi/rsi and move the arguments from rcx/rdx into
// the registers the rest of the code expects.

#if WINDOWS_ABI
        push    rdi
        push    rsi
        mov     rdi, rcx
        mov     rsi, rdx
#endif

// Result term 0: x0^2. The low word is final; the high word seeds the window.

        mov     rax, [x]
        mul     rax

        mov     [z], rax
        mov     t0, rdx
        xor     t1, t1

// Result term 1: 2*x0*x1, window (t2,t1,t0)

        xor     t2, t2
        combadd2(t2,t1,t0,[x],[x+8])
        mov     [z+8], t0

// Result term 2: x1^2 + 2*x0*x2, window (t0,t2,t1)

        xor     t0, t0
        combadd1(t0,t2,t1,[x+8])
        combadd2(t0,t2,t1,[x],[x+16])
        mov     [z+16], t1

// Result term 3: 2*(x0*x3 + x1*x2), window (t1,t0,t2)

        combaddz(t1,u1,u0,[x],[x+24])
        combadd(t1,u1,u0,[x+8],[x+16])
        doubladd(t1,t0,t2,u1,u0)
        mov     [z+24], t2

// Result term 4: 2*(x0*x4 + x1*x3) + x2^2, window (t2,t1,t0)

        combaddz(t2,u1,u0,[x],[x+32])
        combadd(t2,u1,u0,[x+8],[x+24])
        doubladd(t2,t1,t0,u1,u0)
        combadd1(t2,t1,t0,[x+16])
        mov     [z+32], t0

// Result term 5: 2*(x0*x5 + x1*x4 + x2*x3), window (t0,t2,t1)

        combaddz(t0,u1,u0,[x],[x+40])
        combadd(t0,u1,u0,[x+8],[x+32])
        combadd(t0,u1,u0,[x+16],[x+24])
        doubladd(t0,t2,t1,u1,u0)
        mov     [z+40], t1

// Result term 6: 2*(x0*x6 + x1*x5 + x2*x4) + x3^2, window (t1,t0,t2)

        combaddz(t1,u1,u0,[x],[x+48])
        combadd(t1,u1,u0,[x+8],[x+40])
        combadd(t1,u1,u0,[x+16],[x+32])
        doubladd(t1,t0,t2,u1,u0)
        combadd1(t1,t0,t2,[x+24])
        mov     [z+48], t2

// Result term 7: 2*(x0*x7 + x1*x6 + x2*x5 + x3*x4), window (t2,t1,t0)

        combaddz(t2,u1,u0,[x],[x+56])
        combadd(t2,u1,u0,[x+8],[x+48])
        combadd(t2,u1,u0,[x+16],[x+40])
        combadd(t2,u1,u0,[x+24],[x+32])
        doubladd(t2,t1,t0,u1,u0)
        mov     [z+56], t0

// Result term 8: 2*(x1*x7 + x2*x6 + x3*x5) + x4^2, window (t0,t2,t1)

        combaddz(t0,u1,u0,[x+8],[x+56])
        combadd(t0,u1,u0,[x+16],[x+48])
        combadd(t0,u1,u0,[x+24],[x+40])
        doubladd(t0,t2,t1,u1,u0)
        combadd1(t0,t2,t1,[x+32])
        mov     [z+64], t1

// Result term 9: 2*(x2*x7 + x3*x6 + x4*x5), window (t1,t0,t2)

        combaddz(t1,u1,u0,[x+16],[x+56])
        combadd(t1,u1,u0,[x+24],[x+48])
        combadd(t1,u1,u0,[x+32],[x+40])
        doubladd(t1,t0,t2,u1,u0)
        mov     [z+72], t2

// Result term 10: 2*(x3*x7 + x4*x6) + x5^2, window (t2,t1,t0)

        combaddz(t2,u1,u0,[x+24],[x+56])
        combadd(t2,u1,u0,[x+32],[x+48])
        doubladd(t2,t1,t0,u1,u0)
        combadd1(t2,t1,t0,[x+40])
        mov     [z+80], t0

// Result term 11: 2*(x4*x7 + x5*x6), window (t0,t2,t1)

        combaddz(t0,u1,u0,[x+32],[x+56])
        combadd(t0,u1,u0,[x+40],[x+48])
        doubladd(t0,t2,t1,u1,u0)
        mov     [z+88], t1

// Result term 12: 2*x5*x7 + x6^2, window (t1,t0,t2)

        xor     t1, t1
        combadd2(t1,t0,t2,[x+40],[x+56])
        combadd1(t1,t0,t2,[x+48])
        mov     [z+96], t2

// Result term 13: 2*x6*x7, window (t2,t1,t0)

        xor     t2, t2
        combadd2(t2,t1,t0,[x+48],[x+56])
        mov     [z+104], t0

// Result term 14: x7^2 added into the two remaining words (t2,t1), using the
// short form that keeps no carry word.

        combads(t2,t1,[x+56])
        mov     [z+112], t1

// Result term 15: the top word left in the window

        mov     [z+120], t2

// Return, first restoring rsi/rdi in the reverse order of the entry pushes
// when built for the Windows ABI.

#if WINDOWS_ABI
        pop    rsi
        pop    rdi
#endif
        ret

// On Linux/ELF builds, emit an empty .note.GNU-stack section; this is the
// usual marker telling the linker that this object does not need an
// executable stack.

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
