# mp_limb_t mulredc1_6(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y,
#                 const mp_limb_t *m, mp_limb_t inv_m);
#
# Multiplies the 6-word number y[0..5] by the single word x, adds u*m[0..5]
# with u = (x * y[0] * inv_m) mod 2^64, drops the lowest word of the sum and
# stores the next six words in z[0..5]. The carry out of the top word (0 or 1)
# is returned in %rax. No final subtraction of m is performed here.
# NOTE(review): the low word of the sum is discarded without being checked;
# this presumably relies on inv_m = -1/m[0] mod 2^64 - confirm against callers.
#
# Linux:   z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8
#          Needs %rbx, %rsp, %rbp, %r12-%r15 restored
# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 0x28(%rsp) at entry
#          (read below as 72(%rsp), i.e. after four 8-byte pushes)
#          Needs %rbx, %rbp, %rdi, %rsi, %r12...%r15 restored
#
# Registers written: %rax, %rcx, %rdx, %rsi, %r9, %r10, %r11, and flags;
# %rbx and %r14 (plus %rsi and %rdi on Windows) are saved and restored.
# %rdi is additionally written on Windows only.



include(`config.m4')

ifdef(`WINDOWS64_ABI',
`define(`Y_PARAM', `%r8')dnl
define(`INVM_PARAM',`72(%rsp)')dnl'
,
`define(`Y_PARAM', `%rdx')dnl
define(`INVM_PARAM',`%r8')dnl'
)dnl
	TEXT
.p2align 6 # Opteron L1 code cache line is 64 bytes long
	GLOBL GSYM_PREFIX`'mulredc1_6
	TYPE(GSYM_PREFIX`'mulredc1_`'6,`function')

# Implements multiplication and REDC for one 6-word input number
# and a multiplier of one word. The loop over j is fully unrolled below.
ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI')

# Values that are referenced only once in the loop over j go into r8 .. r14.
# In the (unrolled) loop over j, x, y, m, and u are constant.
# tmp[j] and tmp[j+1] are updated frequently. These values
# stay in registers and are referenced as
# YP = y, MP = m,
# X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry
# After every pass the m4 names T0 and T1 are exchanged, so the register
# behind each name alternates between %rsi and %rbx.

define(`T0', `%rsi')dnl
define(`T1', `%rbx')dnl
define(`CY', `%rcx')dnl
define(`CYl', `%ecx')dnl
define(`CYb', `%cl')dnl
define(`X', `%r14')dnl		 # register that holds x value
define(`U', `%r11')dnl
define(`YP', `%r9')dnl		 # register that points to the y array
define(`MP', `%r10')dnl		 # register that points to the m array
define(`ZP', `%rdi')dnl		 # register that holds z

`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U
`#'                `YP' = YP, `MP' = MP

GSYM_PREFIX`'mulredc1_6:


#########################################################################
# i = 0 pass
#########################################################################

`#' register values at loop entry: YP = y, MP = m

# We need to compute u

	movq	(Y_PARAM), %rax		# rax = y[0] (time critical, do first)
	pushq	%rbx			# callee-saved, used for T0/T1
	pushq	%r14			# callee-saved, used for X
ifdef(`WINDOWS64_ABI',
`	pushq	%rsi			# callee-saved on Windows, used for T0/T1
	pushq	%rdi			# callee-saved on Windows, used for ZP
	movq	%r9, MP			# store m in MP
	movq	Y_PARAM, YP
	movq	%rcx, ZP
	movq	%rdx, X'
,
`	movq	Y_PARAM, YP
	movq	%rcx, MP
	movq	%rsi, X			# store x in X
	# ZP is same as passed in'
)

	xorl	CYl, CYl		# set %CY to 0; later setc writes only
					# the low byte, upper bits stay clear

	mulq	X			# rdx:rax = y[0] * x

	movq 	%rax, T0		# Move low word of product to T0
	movq	%rdx, T1		# Move high word of product to T1

	imulq	INVM_PARAM, %rax	# %rax = (x*y[0]*inv_m) mod 2^64
	movq	%rax, U			# this is the new u value

	mulq	(MP)			# multiply u*m[0]
	addq	%rax, T0		# Now %T0 = 0, need not be stored
	movq	8(YP), %rax		# Fetch y[1] (movq leaves flags intact)
	adcq	%rdx, T1		#
	setc	CYb
	# CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence
	# CY:T1 <= 2*2^64 - 4

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 1
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = value to store in tmp[j], T1 undefined
# CY = carry into T1 (is <= 1, produced by setc)
# We have CY:T0 <= 2 * 2^64 - 2 (the pair named CY:T1 before the swap)

	movq	CY, T1			# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X			# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0		# Add low word to T0
	movq	8(MP), %rax		# Fetch m[j] into %rax
	adcq	%rdx, T1		# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!

	mulq	U			# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax		# Add T0 and low word; sum lands in
					# %rax, T0 is dead from here on
	movq	%rax, 0(ZP)		# Store low word of sum in z[1-1]
	movq	16(YP), %rax		# Fetch y[j+1] = y[2] into %rax
	adcq	%rdx, T1		# Add high word with carry to T1
	setc	CYb			# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 2
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = value to store in tmp[j], T1 undefined
# CY = carry into T1 (is <= 1, produced by setc)
# We have CY:T0 <= 2 * 2^64 - 2 (the pair named CY:T1 before the swap)

	movq	CY, T1			# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X			# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0		# Add low word to T0
	movq	16(MP), %rax		# Fetch m[j] into %rax
	adcq	%rdx, T1		# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!

	mulq	U			# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax		# Add T0 and low word
	movq	%rax, 8(ZP)		# Store low word of sum in z[2-1]
	movq	24(YP), %rax		# Fetch y[j+1] = y[3] into %rax
	adcq	%rdx, T1		# Add high word with carry to T1
	setc	CYb			# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 3
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = value to store in tmp[j], T1 undefined
# CY = carry into T1 (is <= 1, produced by setc)
# We have CY:T0 <= 2 * 2^64 - 2 (the pair named CY:T1 before the swap)

	movq	CY, T1			# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X			# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0		# Add low word to T0
	movq	24(MP), %rax		# Fetch m[j] into %rax
	adcq	%rdx, T1		# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!

	mulq	U			# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax		# Add T0 and low word
	movq	%rax, 16(ZP)		# Store low word of sum in z[3-1]
	movq	32(YP), %rax		# Fetch y[j+1] = y[4] into %rax
	adcq	%rdx, T1		# Add high word with carry to T1
	setc	CYb			# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 4
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = value to store in tmp[j], T1 undefined
# CY = carry into T1 (is <= 1, produced by setc)
# We have CY:T0 <= 2 * 2^64 - 2 (the pair named CY:T1 before the swap)

	movq	CY, T1			# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X			# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0		# Add low word to T0
	movq	32(MP), %rax		# Fetch m[j] into %rax
	adcq	%rdx, T1		# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!

	mulq	U			# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax		# Add T0 and low word
	movq	%rax, 24(ZP)		# Store low word of sum in z[4-1]
	movq	40(YP), %rax		# Fetch y[j+1] = y[5] into %rax
	adcq	%rdx, T1		# Add high word with carry to T1
	setc	CYb			# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 5. Don't fetch new data from y[j+1].
# Unlike the earlier passes, the low word is accumulated in T0 itself and
# both T0 and T1 are written out, followed by the final carry.

	movq	CY, T1			# T1 = CY <= 1

	mulq	X			# y[j] * x
	addq	%rax, T0		# Add low word to T0
	movq	40(MP), %rax		# Fetch m[j] into %rax
	adcq	%rdx, T1		# Add high word with carry to T1
	mulq    U			# m[j]*u
	addq	%rax, T0		# Add low word to T0
	movq	T0, 32(ZP)		# Store T0 in z[j-1] = z[4]
	adcq	%rdx, T1		# Add high word with carry to T1
	movq	T1, 40(ZP)		# Store T1 in z[j] = z[5]
	setc	CYb			# %CY <= 1 (movq above left CF intact)

	movq	CY, %rax		# use carry as return value
ifdef(`WINDOWS64_ABI',
`	popq	%rdi
	popq	%rsi
') dnl
	popq	%r14			# restore in reverse order of the pushes
	popq	%rbx
	ret