// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

// ----------------------------------------------------------------------------
// Multiply-add with single-word multiplier, z := z + c * y
// Inputs c, y[n]; outputs function return (carry-out) and z[k]
//
//    extern uint64_t bignum_cmadd
//     (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
//
// Does the "z := z + c * y" operation where y is n digits and the result z
// is p digits, where p denotes the output size k of the prototype above.
// Truncates the result in general.
//
// The return value is a high/carry word that is meaningful when p = n + 1, or
// more generally when n <= p and the result fits in p + 1 digits. In these
// cases it gives the top digit of the (p + 1)-digit result.
//
// Standard x86-64 ABI: RDI = k, RSI = z, RDX = c, RCX = n, R8 = y, returns RAX
// Microsoft x64 ABI:   RCX = k, RDX = z, R8 = c, R9 = n, [RSP+40] = y, returns RAX
// ----------------------------------------------------------------------------

#include "s2n_bignum_internal.h"

        .intel_syntax noprefix
        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmadd)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmadd)
        .text

// Register roles. "p" is the output digit count (k in the prototype) and
// "x" is the input digit array (y in the prototype). The multiplier arrives
// in rdx but is moved to "c" because every "mul" overwrites rdx.

#define p rdi
#define z rsi
#define c r9
#define n rcx
#define x r8

// i = digit index, h = running high word / carry word

#define i r10
#define h r11

// r = carry mask scratch; rbx is callee-saved so it is pushed and popped

#define r rbx

// 32-bit views, used for short immediate loads that zero-extend to 64 bits

#define hshort r11d
#define ishort r10d

S2N_BN_SYMBOL(bignum_cmadd):
        _CET_ENDBR

// On Windows, save the registers that are callee-saved there but used here,
// then shuffle the arguments into the standard x86-64 ABI registers.
// The fifth argument was at [rsp+40] on entry; the two pushes below move it
// a further 16 bytes away, hence [rsp+56].

#if WINDOWS_ABI
        push    rdi
        push    rsi
        mov     rdi, rcx
        mov     rsi, rdx
        mov     rdx, r8
        mov     rcx, r9
        mov     r8, [rsp+56]
#endif

// Seems hard to avoid one more register

        push    rbx

// First clamp the input size n := min(p,n) since we can never need to read
// past the p'th term of the input to generate p-digit output.
// Subtract p := p - min(n,p) so it holds the size of the extra tail needed

        cmp     p, n
        cmovc   n, p
        sub     p, n

// Initialize high part h = 0; if n = 0 do nothing but return that zero

        xor     h, h
        test    n, n
        jz      bignum_cmadd_end

// Move c into a safer register as multiplies overwrite rdx

        mov     c, rdx

// Initialization of the loop: 2^64 * CF + [h,z_0'] = z_0 + c * x_0
// The "mov" and "dec" below do not modify CF, so the carry out of the
// "add" is still live at the branch and at whichever label is reached.

        mov     rax, [x]
        mul     c
        add     [z], rax
        mov     h, rdx
        mov     ishort, 1
        dec     n
        jz      bignum_cmadd_hightail

// Main loop, where we always have CF + previous high part h to add in.
// Per iteration:
//   h  := h + z_i + CF             (carry-out captured as a mask in r)
//   [rdx,rax] := c * x_i
//   rdx := rdx + carry from above  (subtracting the 0 / all-ones mask)
//   z_i := rax + h                 (carry-out stays in CF for the next round)
//   h  := rdx
// "inc" and "dec" are used for the counters because they leave CF untouched.

bignum_cmadd_loop:
        adc     h, [z+8*i]
        sbb     r, r
        mov     rax, [x+8*i]
        mul     c
        sub     rdx, r
        add     rax, h
        mov     [z+8*i], rax
        mov     h, rdx
        inc     i
        dec     n
        jnz     bignum_cmadd_loop

// Fold the last pending carry into the high word

bignum_cmadd_hightail:
        adc     h, 0

// Propagate the carry all the way to the end with h as extra carry word.
// If there are no digits of z beyond those already processed (p = 0 here),
// h itself is the return value.

bignum_cmadd_tail:
        test    p, p
        jz      bignum_cmadd_end

// Add h into the first tail digit, then zero h. The zeroing uses "mov"
// rather than "xor" so that the carry out of the "add" is preserved.

        add     [z+8*i], h
        mov     hshort, 0
        inc     i
        dec     p
        jz      bignum_cmadd_highend

// Ripple the carry through the remaining tail digits (h = 0 from here on)

bignum_cmadd_tloop:
        adc     [z+8*i], h
        inc     i
        dec     p
        jnz     bignum_cmadd_tloop

// Capture the final carry-out as the high word

bignum_cmadd_highend:

        adc     h, 0

// Return the high/carry word

bignum_cmadd_end:
        mov     rax, h

        pop     rbx
#if WINDOWS_ABI
        pop     rsi
        pop     rdi
#endif
        ret

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif