// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !math_big_pure_go
// +build !math_big_pure_go

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.

// func mulWW(x, y Word) (z1, z0 Word)
//
// Full-width multiply: MULQ leaves the product of AX and its operand in
// DX:AX, so z1 receives the high word and z0 the low word.
TEXT ·mulWW(SB),NOSPLIT,$0
	MOVQ x+0(FP), AX
	MULQ y+8(FP)		// DX:AX = x * y
	MOVQ AX, z0+24(FP)	// low word
	MOVQ DX, z1+16(FP)	// high word
	RET


// Carry handling used by addVV and subVV below:
//   save:    SBBQ Rx, Rx   leaves Rx = -1 if CF was set, 0 otherwise.
//   restore: ADDQ Rx, Rx   sets CF again if Rx was -1, clears it if Rx was 0.
// This is faster than using rotate instructions.

// func addVV(z, x, y []Word) (c Word)
//
// Computes z[i] = x[i] + y[i] + carry for i in [0, len(z)) and returns the
// final carry (0 or 1) in c.
TEXT ·addVV(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z_len+8(FP), DI	// n = len(z)

	MOVQ $0, SI		// i = 0
	MOVQ $0, CX		// saved carry = 0

	// Turn the JL below into a JMP to bypass the unrolled loop.
	SUBQ $4, DI		// n -= 4
	JL addvv_tail		// fewer than 4 words: skip the unrolled loop

addvv_unrolled:
	// Four words per iteration; entered with n >= 0.
	// All four x words are loaded before any z word is stored.
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	ADDQ CX, CX		// restore CF (MOVQ above does not touch flags)
	ADCQ 0(R9)(SI*8), R11
	ADCQ 8(R9)(SI*8), R12
	ADCQ 16(R9)(SI*8), R13
	ADCQ 24(R9)(SI*8), R14
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE addvv_unrolled	// another full group of 4 remains

addvv_tail:
	ADDQ $4, DI		// n += 4 (undo the bias)
	JLE addvv_done		// nothing left

addvv_word:
	// One word per iteration; entered with n > 0.
	MOVQ 0(R8)(SI*8), R11
	ADDQ CX, CX		// restore CF
	ADCQ 0(R9)(SI*8), R11
	MOVQ R11, 0(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG addvv_word

addvv_done:
	NEGQ CX			// saved carry is 0 or -1; return 0 or 1
	MOVQ CX, c+72(FP)
	RET


// func subVV(z, x, y []Word) (c Word)
//
// Same structure as addVV, with SBBQ taking the place of ADCQ:
// z[i] = x[i] - y[i] - borrow, and the final borrow (0 or 1) is returned in c.
TEXT ·subVV(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z_len+8(FP), DI	// n = len(z)

	MOVQ $0, SI		// i = 0
	MOVQ $0, CX		// saved borrow = 0

	// Turn the JL below into a JMP to bypass the unrolled loop.
	SUBQ $4, DI		// n -= 4
	JL subvv_tail		// fewer than 4 words: skip the unrolled loop

subvv_unrolled:
	// Four words per iteration; entered with n >= 0.
	// All four x words are loaded before any z word is stored.
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	ADDQ CX, CX		// restore CF (MOVQ above does not touch flags)
	SBBQ 0(R9)(SI*8), R11
	SBBQ 8(R9)(SI*8), R12
	SBBQ 16(R9)(SI*8), R13
	SBBQ 24(R9)(SI*8), R14
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE subvv_unrolled	// another full group of 4 remains

subvv_tail:
	ADDQ $4, DI		// n += 4 (undo the bias)
	JLE subvv_done		// nothing left

subvv_word:
	// One word per iteration; entered with n > 0.
	MOVQ 0(R8)(SI*8), R11
	ADDQ CX, CX		// restore CF
	SBBQ 0(R9)(SI*8), R11
	MOVQ R11, 0(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG subvv_word

subvv_done:
	NEGQ CX			// saved borrow is 0 or -1; return 0 or 1
	MOVQ CX, c+72(FP)
	RET


// func addVW(z, x []Word, y Word) (c Word)
//
// Adds the single word y into the vector x, writing the result to z and
// returning the final carry in c. Vectors longer than 32 words are handed
// to addVWlarge via a tail jump.
TEXT ·addVW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI	// n = len(z)
	CMPQ DI, $32
	JG addvw_large		// n > 32: use the large-vector routine
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), CX	// c = y

	MOVQ $0, SI		// i = 0

	// Turn the JL below into a JMP to bypass the unrolled loop.
	SUBQ $4, DI		// n -= 4
	JL addvw_tail		// fewer than 4 words: skip the unrolled loop

addvw_unrolled:
	// Four words per iteration; entered with n >= 0.
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	ADDQ CX, R11		// add incoming c to the lowest word
	ADCQ $0, R12		// ripple the carry upward
	ADCQ $0, R13
	ADCQ $0, R14
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	SBBQ CX, CX		// save CF (stores above leave flags alone)
	NEGQ CX			// c = 0 or 1

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE addvw_unrolled	// another full group of 4 remains

addvw_tail:
	ADDQ $4, DI		// n += 4 (undo the bias)
	JLE addvw_done		// nothing left

addvw_word:
	// One word per iteration; entered with n > 0.
	ADDQ 0(R8)(SI*8), CX	// c += x[i]
	MOVQ CX, 0(R10)(SI*8)	// z[i] = sum
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG addvw_word

addvw_done:
	MOVQ CX, c+56(FP)	// return c
	RET

addvw_large:
	JMP ·addVWlarge(SB)


// func subVW(z, x []Word, y Word) (c Word)
//
// Same structure as addVW, with SUBQ/SBBQ taking the place of ADDQ/ADCQ.
// Vectors longer than 32 words are handed to subVWlarge via a tail jump.
TEXT ·subVW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI	// n = len(z)
	CMPQ DI, $32
	JG subvw_large		// n > 32: use the large-vector routine
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), CX	// c = y

	MOVQ $0, SI		// i = 0

	// Turn the JL below into a JMP to bypass the unrolled loop.
	SUBQ $4, DI		// n -= 4
	JL subvw_tail		// fewer than 4 words: skip the unrolled loop

subvw_unrolled:
	// Four words per iteration; entered with n >= 0.
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	SUBQ CX, R11		// subtract incoming c from the lowest word
	SBBQ $0, R12		// ripple the borrow upward
	SBBQ $0, R13
	SBBQ $0, R14
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	SBBQ CX, CX		// save CF (stores above leave flags alone)
	NEGQ CX			// c = 0 or 1

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE subvw_unrolled	// another full group of 4 remains

subvw_tail:
	ADDQ $4, DI		// n += 4 (undo the bias)
	JLE subvw_done		// nothing left

subvw_word:
	// One word per iteration; entered with n > 0.
	MOVQ 0(R8)(SI*8), R11
	SUBQ CX, R11		// x[i] - c
	MOVQ R11, 0(R10)(SI*8)	// z[i] = difference
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG subvw_word

subvw_done:
	MOVQ CX, c+56(FP)	// return c
	RET

subvw_large:
	JMP ·subVWlarge(SB)


// func shlVU(z, x []Word, s uint) (c Word)
//
// Shifts the vector x left by s bits into z, walking from the most
// significant word down to word 0. The bits shifted out of the top word
// are returned in c. In the comments, ŝ denotes the complementary shift
// count, following the notation of the original comments.
TEXT ·shlVU(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), BX	// i = n, where n = len(z)
	SUBQ $1, BX		// i = n-1
	JL shl_empty		// i < 0, i.e. n <= 0

	// n > 0
	MOVQ x+24(FP), R8
	MOVQ z+0(FP), R10
	MOVQ s+48(FP), CX
	MOVQ $0, DX
	MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
	SHLQ CX, AX, DX		// w1>>ŝ: bits shifted out of the top word
	MOVQ DX, c+56(FP)

	CMPQ BX, $0
	JLE shl_last		// i <= 0: only z[0] remains

shl_loop:
	// i > 0
	MOVQ AX, DX		// w = w1
	MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
	SHLQ CX, AX, DX		// w<<s | w1>>ŝ
	MOVQ DX, (R10)(BX*8)	// z[i] = w<<s | w1>>ŝ
	SUBQ $1, BX		// i--
	JG shl_loop		// while i > 0

shl_last:
	// i <= 0
	SHLQ CX, AX		// w1<<s
	MOVQ AX, (R10)		// z[0] = w1<<s
	RET

shl_empty:
	MOVQ $0, c+56(FP)	// empty vector: c = 0
	RET


// func shrVU(z, x []Word, s uint) (c Word)
//
// Shifts the vector x right by s bits into z, walking from word 0 up to
// the most significant word. The bits shifted out of word 0 are returned
// in c. In the comments, ŝ denotes the complementary shift count,
// following the notation of the original comments.
TEXT ·shrVU(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), R11	// n = len(z)
	SUBQ $1, R11		// R11 = n-1
	JL shr_empty		// n-1 < 0, i.e. n <= 0

	// n > 0
	MOVQ x+24(FP), R8
	MOVQ z+0(FP), R10
	MOVQ s+48(FP), CX
	MOVQ $0, DX
	MOVQ (R8), AX		// w1 = x[0]
	SHRQ CX, AX, DX		// w1<<ŝ: bits shifted out of the bottom word
	MOVQ DX, c+56(FP)

	MOVQ $0, BX		// i = 0
	JMP shr_check

shr_loop:
	// i < n-1
	MOVQ AX, DX		// w = w1
	MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
	SHRQ CX, AX, DX		// w>>s | w1<<ŝ
	MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<ŝ
	ADDQ $1, BX		// i++

shr_check:
	CMPQ BX, R11
	JL shr_loop		// while i < n-1

	// i >= n-1
	SHRQ CX, AX		// w1>>s
	MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
	RET

shr_empty:
	MOVQ $0, c+56(FP)	// empty vector: c = 0
	RET


// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// For each i, multiplies x[i] by y, adds the running carry word (initially
// r), stores the low word of the result in z[i] and keeps the high word as
// the next carry. The last carry word is returned in c.
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z_len+8(FP), R11	// n = len(z)
	MOVQ r+56(FP), CX	// c = r
	MOVQ $0, BX		// i = 0

	CMPQ R11, $4
	JL muladd_check		// n < 4: go straight to the one-word loop

muladd_unrolled:
	// Four words per iteration; entered with i+4 <= n.
	MOVQ 0(R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ CX, AX		// add carry word into the low half
	ADCQ $0, DX		// propagate into the high half
	MOVQ DX, CX		// next carry word
	MOVQ AX, 0(R10)(BX*8)

	MOVQ 8(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ DX, CX
	MOVQ AX, 8(R10)(BX*8)

	MOVQ 16(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ DX, CX
	MOVQ AX, 16(R10)(BX*8)

	MOVQ 24(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ DX, CX
	MOVQ AX, 24(R10)(BX*8)

	ADDQ $4, BX		// i += 4

	LEAQ 4(BX), DX		// DX is free here: its value was moved to CX
	CMPQ DX, R11
	JLE muladd_unrolled	// while i+4 <= n
	JMP muladd_check

muladd_word:
	MOVQ (R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ DX, CX		// next carry word
	MOVQ AX, (R10)(BX*8)
	ADDQ $1, BX		// i++

muladd_check:
	CMPQ BX, R11
	JL muladd_word		// while i < n

	MOVQ CX, c+64(FP)	// return c
	RET


// func addMulVVW(z, x []Word, y Word) (c Word)
//
// For each i, adds x[i]*y plus the running carry word into z[i]; the last
// carry word is returned in c. When the support_adx byte equals 1 the
// MULX/ADCX/ADOX implementation at addmul_adx is used, otherwise the
// MULQ-based loops below.
TEXT ·addMulVVW(SB),NOSPLIT,$0
	CMPB ·support_adx(SB), $1
	JEQ addmul_adx
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z_len+8(FP), R11	// n = len(z)
	MOVQ $0, CX		// c = 0
	MOVQ $0, BX		// i = 0
	MOVQ R11, R12
	ANDQ $-2, R12		// R12 = n rounded down to an even count
	CMPQ R11, $2
	JAE addmul_pair		// n >= 2: run the two-word loop first
	JMP addmul_check

addmul_pair:
	// Two words per iteration.
	MOVQ (R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ (R10)(BX*8), AX	// + z[i]
	ADCQ $0, DX
	ADDQ CX, AX		// + carry word
	ADCQ $0, DX
	MOVQ AX, (R10)(BX*8)	// z[i] = low word
	MOVQ DX, CX		// next carry word

	MOVQ 8(R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i+1] * y
	ADDQ 8(R10)(BX*8), AX	// + z[i+1]
	ADCQ $0, DX
	ADDQ CX, AX		// + carry word
	ADCQ $0, DX
	MOVQ AX, 8(R10)(BX*8)	// z[i+1] = low word
	MOVQ DX, CX		// next carry word

	ADDQ $2, BX		// i += 2
	CMPQ BX, R12
	JL addmul_pair		// while i < even part of n
	JMP addmul_check

addmul_word:
	MOVQ (R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ CX, AX		// + carry word
	ADCQ $0, DX
	ADDQ AX, (R10)(BX*8)	// z[i] += low word
	ADCQ $0, DX
	MOVQ DX, CX		// next carry word
	ADDQ $1, BX		// i++

addmul_check:
	CMPQ BX, R11
	JL addmul_word		// while i < n

	MOVQ CX, c+56(FP)	// return c
	RET

addmul_adx:
	// MULX takes its implicit multiplicand from DX, so y lives there.
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), DX
	MOVQ z_len+8(FP), R11	// n = len(z)
	MOVQ $0, CX		// carry word
	MOVQ $0, BX		// i = 0
	CMPQ R11, $8
	JAE addmul_adx_setup	// n >= 8: run the eight-word loop first
	CMPQ BX, R11
	JL addmul_adx_word	// 0 < n < 8: one-word loop only
	MOVQ CX, c+56(FP)	// n == 0: return 0
	RET

addmul_adx_setup:
	MOVQ R11, R13
	ANDQ $-8, R13		// R13 = n rounded down to a multiple of 8

addmul_adx_loop:
	// Eight words per iteration. ADCX uses only CF and ADOX only OF, so
	// the carry-word chain and the z[i] accumulation chain run side by
	// side. R8 and R10 are advanced as running pointers in this loop.
	XORQ R9, R9		// R9 = 0 and both CF and OF cleared
	MULXQ (R8), SI, DI
	ADCXQ CX, SI
	ADOXQ (R10), SI
	MOVQ SI, (R10)

	MULXQ 8(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 8(R10), AX
	MOVQ AX, 8(R10)

	MULXQ 16(R8), SI, DI
	ADCXQ CX, SI
	ADOXQ 16(R10), SI
	MOVQ SI, 16(R10)

	MULXQ 24(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 24(R10), AX
	MOVQ AX, 24(R10)

	MULXQ 32(R8), SI, DI
	ADCXQ CX, SI
	ADOXQ 32(R10), SI
	MOVQ SI, 32(R10)

	MULXQ 40(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 40(R10), AX
	MOVQ AX, 40(R10)

	MULXQ 48(R8), SI, DI
	ADCXQ CX, SI
	ADOXQ 48(R10), SI
	MOVQ SI, 48(R10)

	MULXQ 56(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 56(R10), AX
	MOVQ AX, 56(R10)

	// Fold both pending carry bits into the carry word (R9 is zero).
	ADCXQ R9, CX
	ADOXQ R9, CX

	ADDQ $64, R8		// advance x by 8 words
	ADDQ $64, R10		// advance z by 8 words
	ADDQ $8, BX		// i += 8

	CMPQ BX, R13
	JL addmul_adx_loop	// while i < multiple-of-8 part of n

	// The one-word loop indexes from the slice bases, so reload them.
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	CMPQ BX, R11
	JL addmul_adx_word	// leftover words remain
	MOVQ CX, c+56(FP)	// return c
	RET

addmul_adx_word:
	MULXQ (R8)(BX*8), SI, DI	// DI:SI = x[i] * y
	ADDQ CX, SI		// + carry word
	ADCQ $0, DI
	ADDQ SI, (R10)(BX*8)	// z[i] += low word
	ADCQ $0, DI
	MOVQ DI, CX		// next carry word
	ADDQ $1, BX		// i++

	CMPQ BX, R11
	JL addmul_adx_word	// while i < n

	MOVQ CX, c+56(FP)	// return c
	RET