// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

// ----------------------------------------------------------------------------
// Square, z := x^2
// Input x[8]; output z[16]
//
//    extern void bignum_sqr_8_16_alt (uint64_t z[static 16], uint64_t x[static 8]);
//
// Standard x86-64 ABI: RDI = z, RSI = x
// Microsoft x64 ABI:   RCX = z, RDX = x
// ----------------------------------------------------------------------------

#include "s2n_bignum_internal.h"

        .intel_syntax noprefix
        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16_alt)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16_alt)
        .text

// Input arguments

#define z rdi
#define x rsi

// Other variables used as a rotating 3-word window to add terms to

#define t0 r8
#define t1 r9
#define t2 r10

// Additional temporaries for local windows to share doublings

#define u0 rcx
#define u1 r11

// Macro for the key "multiply and add to (c,h,l)" step

#define combadd(c,h,l,numa,numb)                \
        mov     rax, numa;                      \
        mul     QWORD PTR numb;                 \
        add     l, rax;                         \
        adc     h, rdx;                         \
        adc     c, 0

// Set up initial window (c,h,l) = numa * numb

#define combaddz(c,h,l,numa,numb)               \
        mov     rax, numa;                      \
        mul     QWORD PTR numb;                 \
        xor     c, c;                           \
        mov     l, rax;                         \
        mov     h, rdx

// Doubling step (c,h,l) = 2 * (c,hh,ll) + (0,h,l)

#define doubladd(c,h,l,hh,ll)                   \
        add     ll, ll;                         \
        adc     hh, hh;                         \
        adc     c, c;                           \
        add     l, ll;                          \
        adc     h, hh;                          \
        adc     c, 0

// Square term incorporation (c,h,l) += numa^2

#define combadd1(c,h,l,numa)                    \
        mov     rax, numa;                      \
        mul     rax;                            \
        add     l, rax;                         \
        adc     h, rdx;                         \
        adc     c, 0

// A short form where we don't expect a top carry

#define combads(h,l,numa)                       \
        mov     rax, numa;                      \
        mul     rax;                            \
        add     l, rax;                         \
        adc     h, rdx

// A version doubling directly before adding, for single non-square terms

#define combadd2(c,h,l,numa,numb)               \
        mov     rax, numa;                      \
        mul     QWORD PTR numb;                 \
        add     rax, rax;                       \
        adc     rdx, rdx;                       \
        adc     c, 0;                           \
        add     l, rax;                         \
        adc     h, rdx;                         \
        adc     c, 0

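// ----------------------------------------------------------------------------
// In outline (as a reading aid for the macros above), each result term k
// accumulates the cross products x[i] * x[j] with i + j = k once in the
// local window (u1,u0), then doubles that sum as a whole and folds it into
// the rotating window. For example, result term 3 below is built as
//
//      (t1,u1,u0)  = x[0]*x[3]                        (combaddz)
//      (t1,u1,u0) += x[1]*x[2]                        (combadd)
//      (t1,t0,t2)  = 2 * (t1,u1,u0) + (0,t0,t2)       (doubladd)
//      z[3]        = t2
//
// Even-indexed terms also add the single square x[k/2]^2 (combadd1, or
// combads for the final x[7]^2), while combadd2 doubles a lone cross
// product in place for result terms 1, 2, 12 and 13.
// ----------------------------------------------------------------------------
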
S2N_BN_SYMBOL(bignum_sqr_8_16_alt):
        endbr64

#if WINDOWS_ABI
        push    rdi
        push    rsi
        mov     rdi, rcx
        mov     rsi, rdx
#endif

// Result term 0

        mov     rax, [x]
        mul     rax

        mov     [z], rax
        mov     t0, rdx
        xor     t1, t1

// Result term 1

        xor     t2, t2
        combadd2(t2,t1,t0,[x],[x+8])
        mov     [z+8], t0

// Result term 2

        xor     t0, t0
        combadd1(t0,t2,t1,[x+8])
        combadd2(t0,t2,t1,[x],[x+16])
        mov     [z+16], t1

// Result term 3

        combaddz(t1,u1,u0,[x],[x+24])
        combadd(t1,u1,u0,[x+8],[x+16])
        doubladd(t1,t0,t2,u1,u0)
        mov     [z+24], t2

// Result term 4

        combaddz(t2,u1,u0,[x],[x+32])
        combadd(t2,u1,u0,[x+8],[x+24])
        doubladd(t2,t1,t0,u1,u0)
        combadd1(t2,t1,t0,[x+16])
        mov     [z+32], t0

// Result term 5

        combaddz(t0,u1,u0,[x],[x+40])
        combadd(t0,u1,u0,[x+8],[x+32])
        combadd(t0,u1,u0,[x+16],[x+24])
        doubladd(t0,t2,t1,u1,u0)
        mov     [z+40], t1

// Result term 6

        combaddz(t1,u1,u0,[x],[x+48])
        combadd(t1,u1,u0,[x+8],[x+40])
        combadd(t1,u1,u0,[x+16],[x+32])
        doubladd(t1,t0,t2,u1,u0)
        combadd1(t1,t0,t2,[x+24])
        mov     [z+48], t2

// Result term 7

        combaddz(t2,u1,u0,[x],[x+56])
        combadd(t2,u1,u0,[x+8],[x+48])
        combadd(t2,u1,u0,[x+16],[x+40])
        combadd(t2,u1,u0,[x+24],[x+32])
        doubladd(t2,t1,t0,u1,u0)
        mov     [z+56], t0

// Result term 8

        combaddz(t0,u1,u0,[x+8],[x+56])
        combadd(t0,u1,u0,[x+16],[x+48])
        combadd(t0,u1,u0,[x+24],[x+40])
        doubladd(t0,t2,t1,u1,u0)
        combadd1(t0,t2,t1,[x+32])
        mov     [z+64], t1

// Result term 9

        combaddz(t1,u1,u0,[x+16],[x+56])
        combadd(t1,u1,u0,[x+24],[x+48])
        combadd(t1,u1,u0,[x+32],[x+40])
        doubladd(t1,t0,t2,u1,u0)
        mov     [z+72], t2

// Result term 10

        combaddz(t2,u1,u0,[x+24],[x+56])
        combadd(t2,u1,u0,[x+32],[x+48])
        doubladd(t2,t1,t0,u1,u0)
        combadd1(t2,t1,t0,[x+40])
        mov     [z+80], t0

// Result term 11

        combaddz(t0,u1,u0,[x+32],[x+56])
        combadd(t0,u1,u0,[x+40],[x+48])
        doubladd(t0,t2,t1,u1,u0)
        mov     [z+88], t1

// Result term 12

        xor     t1, t1
        combadd2(t1,t0,t2,[x+40],[x+56])
        combadd1(t1,t0,t2,[x+48])
        mov     [z+96], t2

// Result term 13

        xor     t2, t2
        combadd2(t2,t1,t0,[x+48],[x+56])
        mov     [z+104], t0

// Result term 14

        combads(t2,t1,[x+56])
        mov     [z+112], t1

// Result term 15

        mov     [z+120], t2

// Return

#if WINDOWS_ABI
        pop     rsi
        pop     rdi
#endif
        ret

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif