; This Source Code Form is subject to the terms of the Mozilla Public
; License, v. 2.0. If a copy of the MPL was not distributed with this
; file, You can obtain one at http://mozilla.org/MPL/2.0/.

;
; This code is converted from mpi_amd64_gas.asm for MASM for x64.
;
; Syntax: MASM (Intel operand order).  ABI: Microsoft x64.
;
; Both procedures receive their arguments in rcx, rdx, r8d, r9 and start by
; copying them into the registers the original GAS code used:
;
;       rdi = r       (destination vector)
;       rsi = a       (source vector)
;       rcx = digit   (multiplier)
;       r8  = number of digits still to process
;       r9  = running carry (becomes the return value)
;       rax, rdx = operands/result of MUL (rdx:rax = rax * rcx)
;       r11 = next digit of a, loaded ahead of its multiply
;       r10 = digit of r to be added (s_mpv_mul_add_vec64 only)
;
; rdi and rsi are pushed on entry and popped before returning.
;
; NOTE(review): the procedures push non-volatile registers but are not
; declared with PROC FRAME / .pushreg / .endprolog, so no unwind data is
; emitted for them -- confirm whether that is acceptable for this project.
;

; ------------------------------------------------------------------------
;
;  Implementation of s_mpv_mul_set_vec which exploits
;  the 64X64->128 bit unsigned multiply instruction.
;
; ------------------------------------------------------------------------

; r = a * digit, r and a are vectors of length len
; returns the carry digit (0 when len is 0)
; r and a are 64 bit aligned.
; len is taken from r8d and zero-extended, i.e. it is treated as unsigned.
;
; uint64_t
; s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
;

.CODE

s_mpv_mul_set_vec64 PROC

        ; Compatibility shim for parameter registers:
        ; GAS (System V) and MASM (Microsoft x64) pass arguments in
        ; different registers, so move them to where the loop expects them.

        push    rdi
        push    rsi

        mov     rdi, rcx                ; rdi = r
        mov     rsi, rdx                ; rsi = a
        mov     edx, r8d                ; rdx = len, upper 32 bits cleared
        mov     rcx, r9                 ; rcx = digit

        ; The carry must be cleared BEFORE the length test: r9 still holds
        ; digit here, and L17 returns r9, so an empty vector would otherwise
        ; return digit instead of 0.
        xor     r9, r9                  ; cy = 0
        test    rdx, rdx
        jz      L17                     ; len == 0: return 0
        mov     r8, rdx                 ; r8 = len; rdx is clobbered by mul

L15:
        ; Unrolled path: eight digits per iteration while r8 >= 8.
        cmp     r8, 8
        jb      L16

        mov     rax, [rsi]
        mov     r11, [rsi + 8]          ; fetch a[1] ahead of its multiply
        mul     rcx                     ; rdx:rax = a[0] * digit
        add     rax, r9                 ; + carry in
        adc     rdx, 0
        mov     [rdi], rax
        mov     r9, rdx                 ; carry out

        mov     rax, r11
        mov     r11, [rsi + 16]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi + 8], rax
        mov     r9, rdx

        mov     rax, r11
        mov     r11, [rsi + 24]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi + 16], rax
        mov     r9, rdx

        mov     rax, r11
        mov     r11, [rsi + 32]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi + 24], rax
        mov     r9, rdx

        mov     rax, r11
        mov     r11, [rsi + 40]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi + 32], rax
        mov     r9, rdx

        mov     rax, r11
        mov     r11, [rsi + 48]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi + 40], rax
        mov     r9, rdx

        mov     rax, r11
        mov     r11, [rsi + 56]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi + 48], rax
        mov     r9, rdx

        mov     rax, r11
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi + 56], rax
        mov     r9, rdx

        add     rsi, 64                 ; advance both vectors by 8 digits
        add     rdi, 64
        sub     r8, 8
        jnz     L15                     ; more digits left
        jmp     L17

L16:
        ; Tail: 1..7 digits remain (r8 is non-zero and below 8 here).
        mov     rax, [rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17

        mov     rax, [rsi + 8]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi + 8], rax
        mov     r9, rdx
        dec     r8
        jz      L17

        mov     rax, [rsi + 16]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi + 16], rax
        mov     r9, rdx
        dec     r8
        jz      L17

        mov     rax, [rsi + 24]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi + 24], rax
        mov     r9, rdx
        dec     r8
        jz      L17

        mov     rax, [rsi + 32]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi + 32], rax
        mov     r9, rdx
        dec     r8
        jz      L17

        mov     rax, [rsi + 40]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi + 40], rax
        mov     r9, rdx
        dec     r8
        jz      L17

        ; Seventh tail digit is always the last one, so fall through to L17.
        mov     rax, [rsi + 48]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi + 48], rax
        mov     r9, rdx

L17:
        mov     rax, r9                 ; return the final carry
        pop     rsi
        pop     rdi
        ret

s_mpv_mul_set_vec64 ENDP


;------------------------------------------------------------------------
;
; Implementation of s_mpv_mul_add_vec which exploits
; the 64X64->128 bit unsigned multiply instruction.
;
;------------------------------------------------------------------------

; r += a * digit, r and a are vectors of length len
; returns the carry digit (0 when len is 0)
; r and a are 64 bit aligned.
; len is taken from r8d and zero-extended, i.e. it is treated as unsigned.
;
; uint64_t
; s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
;

s_mpv_mul_add_vec64 PROC

        ; Compatibility shim for parameter registers:
        ; GAS (System V) and MASM (Microsoft x64) pass arguments in
        ; different registers, so move them to where the loop expects them.

        push    rdi
        push    rsi

        mov     rdi, rcx                ; rdi = r
        mov     rsi, rdx                ; rsi = a
        mov     edx, r8d                ; rdx = len, upper 32 bits cleared
        mov     rcx, r9                 ; rcx = digit

        ; The carry must be cleared BEFORE the length test: r9 still holds
        ; digit here, and L27 returns r9, so an empty vector would otherwise
        ; return digit instead of 0.
        xor     r9, r9                  ; cy = 0
        test    rdx, rdx
        jz      L27                     ; len == 0: return 0
        mov     r8, rdx                 ; r8 = len; rdx is clobbered by mul

L25:
        ; Unrolled path: eight digits per iteration while r8 >= 8.
        cmp     r8, 8
        jb      L26

        mov     rax, [rsi]
        mov     r10, [rdi]
        mov     r11, [rsi + 8]          ; fetch a[1] ahead of its multiply
        mul     rcx                     ; rdx:rax = a[0] * digit
        add     rax, r10                ; + r[0]
        adc     rdx, 0
        mov     r10, [rdi + 8]          ; fetch r[1] for the next step
        add     rax, r9                 ; + carry in
        adc     rdx, 0
        mov     [rdi], rax
        mov     r9, rdx                 ; carry out

        mov     rax, r11
        mov     r11, [rsi + 16]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [rdi + 16]
        add     rax, r9
        adc     rdx, 0
        mov     [rdi + 8], rax
        mov     r9, rdx

        mov     rax, r11
        mov     r11, [rsi + 24]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [rdi + 24]
        add     rax, r9
        adc     rdx, 0
        mov     [rdi + 16], rax
        mov     r9, rdx

        mov     rax, r11
        mov     r11, [rsi + 32]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [rdi + 32]
        add     rax, r9
        adc     rdx, 0
        mov     [rdi + 24], rax
        mov     r9, rdx

        mov     rax, r11
        mov     r11, [rsi + 40]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [rdi + 40]
        add     rax, r9
        adc     rdx, 0
        mov     [rdi + 32], rax
        mov     r9, rdx

        mov     rax, r11
        mov     r11, [rsi + 48]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [rdi + 48]
        add     rax, r9
        adc     rdx, 0
        mov     [rdi + 40], rax
        mov     r9, rdx

        mov     rax, r11
        mov     r11, [rsi + 56]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [rdi + 56]
        add     rax, r9
        adc     rdx, 0
        mov     [rdi + 48], rax
        mov     r9, rdx

        mov     rax, r11
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [rdi + 56], rax
        mov     r9, rdx

        add     rsi, 64                 ; advance both vectors by 8 digits
        add     rdi, 64
        sub     r8, 8
        jnz     L25                     ; more digits left
        jmp     L27

L26:
        ; Tail: 1..7 digits remain (r8 is non-zero and below 8 here).
        mov     rax, [rsi]
        mov     r10, [rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27

        mov     rax, [rsi + 8]
        mov     r10, [rdi + 8]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [rdi + 8], rax
        mov     r9, rdx
        dec     r8
        jz      L27

        mov     rax, [rsi + 16]
        mov     r10, [rdi + 16]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [rdi + 16], rax
        mov     r9, rdx
        dec     r8
        jz      L27

        mov     rax, [rsi + 24]
        mov     r10, [rdi + 24]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [rdi + 24], rax
        mov     r9, rdx
        dec     r8
        jz      L27

        mov     rax, [rsi + 32]
        mov     r10, [rdi + 32]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [rdi + 32], rax
        mov     r9, rdx
        dec     r8
        jz      L27

        mov     rax, [rsi + 40]
        mov     r10, [rdi + 40]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [rdi + 40], rax
        mov     r9, rdx
        dec     r8
        jz      L27

        ; Seventh tail digit is always the last one, so fall through to L27.
        mov     rax, [rsi + 48]
        mov     r10, [rdi + 48]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [rdi + 48], rax
        mov     r9, rdx

L27:
        mov     rax, r9                 ; return the final carry

        pop     rsi
        pop     rdi
        ret

s_mpv_mul_add_vec64 ENDP

END