xref: /openbsd/lib/libcrypto/bn/arch/amd64/bignum_sqr.S (revision d415bd75)
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2//
3// Permission to use, copy, modify, and/or distribute this software for any
4// purpose with or without fee is hereby granted, provided that the above
5// copyright notice and this permission notice appear in all copies.
6//
7// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
10// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
12// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
13// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
14
15// ----------------------------------------------------------------------------
16// Square z := x^2
17// Input x[n]; output z[k]
18//
19//    extern void bignum_sqr
20//     (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x);
21//
22// Does the "z := x^2" operation where x is n digits and result z is k.
23// Truncates the result in general unless k >= 2 * n
24//
25// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n, RCX = x
26// Microsoft x64 ABI:   RCX = k, RDX = z, R8 = n, R9 = x
27// ----------------------------------------------------------------------------
28
29#include "s2n_bignum_internal.h"
30
31        .intel_syntax noprefix
32        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr)
33        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr)
34        .text
35
36// First three are where arguments come in, but n is moved.
37
38#define p rdi
39#define z rsi
40#define x rcx
41#define n r8
42
43// These are always local scratch since multiplier result is in these
44
45#define a rax
46#define d rdx
47
48// Other variables
49
50#define i rbx
51#define ll rbp
52#define hh r9
53#define k r10
54#define y r11
55#define htop r12
56#define l r13
57#define h r14
58#define c r15
59
60// Short versions
61
62#define llshort ebp
63
64S2N_BN_SYMBOL(bignum_sqr):
65	endbr64
66
67#if WINDOWS_ABI
68        push    rdi
69        push    rsi
70        mov     rdi, rcx
71        mov     rsi, rdx
72        mov     rdx, r8
73        mov     rcx, r9
74#endif
75
76// We use too many registers, and also we need rax:rdx for multiplications
77
78        push    rbx
79        push    rbp
80        push    r12
81        push    r13
82        push    r14
83        push    r15
84        mov     n, rdx
85
86// If p = 0 the result is trivial and nothing needs doing
87
88        test    p, p
89        jz      end
90
91// initialize (hh,ll) = 0
92
93        xor     llshort, llshort
94        xor     hh, hh
95
96// Iterate outer loop from k = 0 ... k = p - 1 producing result digits
97
98        xor     k, k
99
100outerloop:
101
102// First let bot = MAX 0 (k + 1 - n) and top = MIN (k + 1) n
103// We want to accumulate all x[i] * x[k - i] for bot <= i < top
104// For the optimization of squaring we avoid duplication and do
105// 2 * x[i] * x[k - i] for i < htop, where htop = MIN ((k+1)/2) n
106// Initialize i = bot; in fact just compute bot as i directly.
107
108        xor     c, c
109        lea     i, [k+1]
110        mov     htop, i
111        shr     htop, 1
112        sub     i, n
113        cmovc   i, c
114        cmp     htop, n
115        cmovnc  htop, n
116
117// Initialize the three-part local sum (c,h,l); c was already done above
118
119        xor     l, l
120        xor     h, h
121
122// If htop <= bot then main doubled part of the sum is empty
123
124        cmp     i, htop
125        jnc     nosumming
126
127// Use a moving pointer for [y] = x[k-i] for the cofactor
128
129        mov     a, k
130        sub     a, i
131        lea     y, [x+8*a]
132
133// Do the main part of the sum x[i] * x[k - i] for 2 * i < k
134
135innerloop:
136        mov     a, [x+8*i]
137        mul     QWORD PTR [y]
138        add     l, a
139        adc     h, d
140        adc     c, 0
141        sub     y, 8
142        inc     i
143        cmp     i, htop
144        jc      innerloop
145
146// Now double it
147
148        add     l, l
149        adc     h, h
150        adc     c, c
151
152// If k is even (which means 2 * i = k) and i < n add the extra x[i]^2 term
153
154nosumming:
155        test    k, 1
156        jnz     innerend
157        cmp     i, n
158        jnc     innerend
159
160        mov     a, [x+8*i]
161        mul     a
162        add     l, a
163        adc     h, d
164        adc     c, 0
165
166// Now add the local sum into the global sum, store and shift
167
168innerend:
169        add     l, ll
170        mov     [z+8*k], l
171        adc     h, hh
172        mov     ll, h
173        adc     c, 0
174        mov     hh, c
175
176        inc     k
177        cmp     k, p
178        jc      outerloop
179
180// Restore registers and return
181
182end:
183        pop     r15
184        pop     r14
185        pop     r13
186        pop     r12
187        pop     rbp
188        pop     rbx
189#if WINDOWS_ABI
190        pop    rsi
191        pop    rdi
192#endif
193        ret
194
195#if defined(__linux__) && defined(__ELF__)
196.section .note.GNU-stack,"",%progbits
197#endif
198