1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2// 3// Permission to use, copy, modify, and/or distribute this software for any 4// purpose with or without fee is hereby granted, provided that the above 5// copyright notice and this permission notice appear in all copies. 6// 7// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 8// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 9// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 10// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 11// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 12// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 13// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 14 15// ---------------------------------------------------------------------------- 16// Square z := x^2 17// Input x[n]; output z[k] 18// 19// extern void bignum_sqr 20// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x); 21// 22// Does the "z := x^2" operation where x is n digits and result z is k. 23// Truncates the result in general unless k >= 2 * n 24// 25// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n, RCX = x 26// Microsoft x64 ABI: RCX = k, RDX = z, R8 = n, R9 = x 27// ---------------------------------------------------------------------------- 28 29#include "s2n_bignum_internal.h" 30 31 .intel_syntax noprefix 32 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr) 33 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr) 34 .text 35 36// First three are where arguments come in, but n is moved. 37 38#define p rdi 39#define z rsi 40#define x rcx 41#define n r8 42 43// These are always local scratch since multiplier result is in these 44 45#define a rax 46#define d rdx 47 48// Other variables 49 50#define i rbx 51#define ll rbp 52#define hh r9 53#define k r10 54#define y r11 55#define htop r12 56#define l r13 57#define h r14 58#define c r15 59 60// Short versions 61 62#define llshort ebp 63 64S2N_BN_SYMBOL(bignum_sqr): 65 endbr64 66 67#if WINDOWS_ABI 68 push rdi 69 push rsi 70 mov rdi, rcx 71 mov rsi, rdx 72 mov rdx, r8 73 mov rcx, r9 74#endif 75 76// We use too many registers, and also we need rax:rdx for multiplications 77 78 push rbx 79 push rbp 80 push r12 81 push r13 82 push r14 83 push r15 84 mov n, rdx 85 86// If p = 0 the result is trivial and nothing needs doing 87 88 test p, p 89 jz end 90 91// initialize (hh,ll) = 0 92 93 xor llshort, llshort 94 xor hh, hh 95 96// Iterate outer loop from k = 0 ... k = p - 1 producing result digits 97 98 xor k, k 99 100outerloop: 101 102// First let bot = MAX 0 (k + 1 - n) and top = MIN (k + 1) n 103// We want to accumulate all x[i] * x[k - i] for bot <= i < top 104// For the optimization of squaring we avoid duplication and do 105// 2 * x[i] * x[k - i] for i < htop, where htop = MIN ((k+1)/2) n 106// Initialize i = bot; in fact just compute bot as i directly. 107 108 xor c, c 109 lea i, [k+1] 110 mov htop, i 111 shr htop, 1 112 sub i, n 113 cmovc i, c 114 cmp htop, n 115 cmovnc htop, n 116 117// Initialize the three-part local sum (c,h,l); c was already done above 118 119 xor l, l 120 xor h, h 121 122// If htop <= bot then main doubled part of the sum is empty 123 124 cmp i, htop 125 jnc nosumming 126 127// Use a moving pointer for [y] = x[k-i] for the cofactor 128 129 mov a, k 130 sub a, i 131 lea y, [x+8*a] 132 133// Do the main part of the sum x[i] * x[k - i] for 2 * i < k 134 135innerloop: 136 mov a, [x+8*i] 137 mul QWORD PTR [y] 138 add l, a 139 adc h, d 140 adc c, 0 141 sub y, 8 142 inc i 143 cmp i, htop 144 jc innerloop 145 146// Now double it 147 148 add l, l 149 adc h, h 150 adc c, c 151 152// If k is even (which means 2 * i = k) and i < n add the extra x[i]^2 term 153 154nosumming: 155 test k, 1 156 jnz innerend 157 cmp i, n 158 jnc innerend 159 160 mov a, [x+8*i] 161 mul a 162 add l, a 163 adc h, d 164 adc c, 0 165 166// Now add the local sum into the global sum, store and shift 167 168innerend: 169 add l, ll 170 mov [z+8*k], l 171 adc h, hh 172 mov ll, h 173 adc c, 0 174 mov hh, c 175 176 inc k 177 cmp k, p 178 jc outerloop 179 180// Restore registers and return 181 182end: 183 pop r15 184 pop r14 185 pop r13 186 pop r12 187 pop rbp 188 pop rbx 189#if WINDOWS_ABI 190 pop rsi 191 pop rdi 192#endif 193 ret 194 195#if defined(__linux__) && defined(__ELF__) 196.section .note.GNU-stack,"",%progbits 197#endif 198