// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !math_big_pure_go
// +build !math_big_pure_go

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.

// func mulWW(x, y Word) (z1, z0 Word)
// 64x64 -> 128 bit unsigned multiply.
// NOTE(review): this code relies on MULHDU leaving the product in the
// implicit register pair R10 (high half) / R11 (low half).
TEXT ·mulWW(SB), NOSPLIT, $0
	MOVD x+0(FP), R3
	MOVD y+8(FP), R4
	MULHDU R3, R4
	MOVD R10, z1+16(FP)
	MOVD R11, z0+24(FP)
	RET

// Register mapping relative to the amd64 implementation:
// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
// func addVV(z, x, y []Word) (c Word)

// addVV dispatches through the addvectorfacility function pointer, which
// starts out pointing at addVV_check and is patched on first use to the
// best implementation for this machine (vector or scalar).
TEXT ·addVV(SB), NOSPLIT, $0
	MOVD addvectorfacility+0x00(SB), R1
	BR (R1)

// addVV_check tests ·hasVX once, stores the address of the chosen
// implementation back into addvectorfacility so that later calls go there
// directly, then tail-jumps to that implementation.
TEXT ·addVV_check(SB), NOSPLIT, $0
	MOVB ·hasVX(SB), R1
	CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported
	MOVD $addvectorfacility+0x00(SB), R1
	MOVD $·addVV_novec(SB), R2
	MOVD R2, 0(R1)

	// MOVD $·addVV_novec(SB), 0(R1)
	BR ·addVV_novec(SB)

vectorimpl:
	MOVD $addvectorfacility+0x00(SB), R1
	MOVD $·addVV_vec(SB), R2
	MOVD R2, 0(R1)

	// MOVD $·addVV_vec(SB), 0(R1)
	BR ·addVV_vec(SB)

// Function pointer for addVV; initialized to the check stub above.
GLOBL addvectorfacility+0x00(SB), NOPTR, $8
DATA addvectorfacility+0x00(SB)/8, $·addVV_check(SB)

// addVV_vec: z = x + y using the vector facility. The main loop handles
// 16 words (128 bytes) per iteration; leftovers are handled 4 words at a
// time, then 1 word at a time. VACQ/VACCCQ perform a 128-bit add /
// carry-out with carry-in, so the carry is chained through V25..V31 and
// the final carry of each iteration lives in V0.
TEXT ·addVV_vec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2

	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUB $4, R3  // n -= 4
	BLT v1      // if n < 0 goto v1
	SUB $12, R3 // n -= 16
	BLT A1      // if n < 0 goto A1

	MOVD R8, R5
	MOVD R9, R6
	MOVD R2, R7

	// n >= 0
	// regular loop body unrolled 16x
	VZERO V0 // c = 0

UU1:
	VLM 0(R5), V1, V4 // 64-bytes into V1..V8
	ADD $64, R5
	VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
	VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order

	VLM 0(R6), V9, V12 // 64-bytes into V9..V16
	ADD $64, R6
	VPDI $0x4, V9, V9, V9    // flip the doublewords to big-endian order
	VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order

	VACCCQ V1, V9, V0, V25
	VACQ V1, V9, V0, V17
	VACCCQ V2, V10, V25, V26
	VACQ V2, V10, V25, V18

	VLM 0(R5), V5, V6   // 32-bytes into V1..V8
	VLM 0(R6), V13, V14 // 32-bytes into V9..V16
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V3, V3, V3    // flip the doublewords to big-endian order
	VPDI $0x4, V4, V4, V4    // flip the doublewords to big-endian order
	VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
	VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order

	VACCCQ V3, V11, V26, V27
	VACQ V3, V11, V26, V19
	VACCCQ V4, V12, V27, V28
	VACQ V4, V12, V27, V20

	VLM 0(R5), V7, V8   // 32-bytes into V1..V8
	VLM 0(R6), V15, V16 // 32-bytes into V9..V16
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V5, V5, V5    // flip the doublewords to big-endian order
	VPDI $0x4, V6, V6, V6    // flip the doublewords to big-endian order
	VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
	VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order

	VACCCQ V5, V13, V28, V29
	VACQ V5, V13, V28, V21
	VACCCQ V6, V14, V29, V30
	VACQ V6, V14, V29, V22

	VPDI $0x4, V7, V7, V7    // flip the doublewords to big-endian order
	VPDI $0x4, V8, V8, V8    // flip the doublewords to big-endian order
	VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
	VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order

	VACCCQ V7, V15, V30, V31
	VACQ V7, V15, V30, V23
	VACCCQ V8, V16, V31, V0 // V0 has carry-over
	VACQ V8, V16, V31, V24

	VPDI $0x4, V17, V17, V17 // flip the doublewords to big-endian order
	VPDI $0x4, V18, V18, V18 // flip the doublewords to big-endian order
	VPDI $0x4, V19, V19, V19 // flip the doublewords to big-endian order
	VPDI $0x4, V20, V20, V20 // flip the doublewords to big-endian order
	VPDI $0x4, V21, V21, V21 // flip the doublewords to big-endian order
	VPDI $0x4, V22, V22, V22 // flip the doublewords to big-endian order
	VPDI $0x4, V23, V23, V23 // flip the doublewords to big-endian order
	VPDI $0x4, V24, V24, V24 // flip the doublewords to big-endian order
	VSTM V17, V24, 0(R7) // 128-bytes into z
	ADD $128, R7
	ADD $128, R10 // i += 16
	SUB $16, R3   // n -= 16
	BGE UU1       // if n >= 0 goto U1
	VLGVG $1, V0, R4 // put cf into R4
	NEG R4, R4       // save cf

A1:
	ADD $12, R3 // n += 16

	// s/JL/JMP/ below to disable the unrolled loop
	BLT v1 // if n < 0 goto v1

U1: // n >= 0
	// regular loop body unrolled 4x
	// Carry is kept negated in R4 between iterations because the loop
	// control ADD/SUB below clobber the condition code.
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	ADDC R4, R4 // restore CF
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD 8(R9)(R10*1), R11
	ADDE R11, R6
	MOVD 16(R9)(R10*1), R11
	ADDE R11, R7
	MOVD 24(R9)(R10*1), R11
	ADDE R11, R1
	MOVD R0, R4
	ADDE R4, R4 // save CF
	NEG R4, R4
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1       // if n >= 0 goto U1

v1:
	ADD $4, R3 // n += 4
	BLE E1     // if n <= 0 goto E1

L1: // n > 0
	ADDC R4, R4 // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	ADDE R4, R4 // save CF
	NEG R4, R4

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1      // if n > 0 goto L1

E1:
	NEG R4, R4
	MOVD R4, c+72(FP) // return c
	RET

// addVV_novec: scalar z = x + y for machines without the vector facility.
// Same carry-saving scheme as the scalar tail of addVV_vec: carry is
// materialized (negated) into R4 across iterations because the loop
// control ADD/SUB clobber the condition code.
TEXT ·addVV_novec(SB), NOSPLIT, $0
novec:
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2

	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUB $4, R3 // n -= 4
	BLT v1n    // if n < 0 goto v1n

U1n: // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	ADDC R4, R4 // restore CF
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD 8(R9)(R10*1), R11
	ADDE R11, R6
	MOVD 16(R9)(R10*1), R11
	ADDE R11, R7
	MOVD 24(R9)(R10*1), R11
	ADDE R11, R1
	MOVD R0, R4
	ADDE R4, R4 // save CF
	NEG R4, R4
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1n      // if n >= 0 goto U1n

v1n:
	ADD $4, R3 // n += 4
	BLE E1n    // if n <= 0 goto E1n

L1n: // n > 0
	ADDC R4, R4 // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	ADDE R4, R4 // save CF
	NEG R4, R4

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1n     // if n > 0 goto L1n

E1n:
	NEG R4, R4
	MOVD R4, c+72(FP) // return c
	RET

// subVV dispatches through the subvectorfacility function pointer,
// patched on first use exactly like addVV above.
TEXT ·subVV(SB), NOSPLIT, $0
	MOVD subvectorfacility+0x00(SB), R1
	BR (R1)

// subVV_check: one-time vector-facility probe and pointer patch for subVV.
TEXT ·subVV_check(SB), NOSPLIT, $0
	MOVB ·hasVX(SB), R1
	CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported
	MOVD $subvectorfacility+0x00(SB), R1
	MOVD $·subVV_novec(SB), R2
	MOVD R2, 0(R1)

	// MOVD $·subVV_novec(SB), 0(R1)
	BR ·subVV_novec(SB)

vectorimpl:
	MOVD $subvectorfacility+0x00(SB), R1
	MOVD $·subVV_vec(SB), R2
	MOVD R2, 0(R1)

	// MOVD $·subVV_vec(SB), 0(R1)
	BR ·subVV_vec(SB)

// Function pointer for subVV; initialized to the check stub above.
GLOBL subvectorfacility+0x00(SB), NOPTR, $8
DATA subvectorfacility+0x00(SB)/8, $·subVV_check(SB)

// Register mapping relative to the amd64 implementation:
// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
// Note the borrow convention: on s390x a subtraction carry of 1 means
// "no borrow", so the 128-bit chain is seeded with 1 in V0 below.
TEXT ·subVV_vec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2
	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUB $4, R3  // n -= 4
	BLT v1      // if n < 0 goto v1
	SUB $12, R3 // n -= 16
	BLT A1      // if n < 0 goto A1

	MOVD R8, R5
	MOVD R9, R6
	MOVD R2, R7

	// n >= 0
	// regular loop body unrolled 16x
	VZERO V0        // cf = 0
	MOVD $1, R4     // for 390 subtraction cf starts as 1 (no borrow)
	VLVGG $1, R4, V0 // put carry into V0

UU1:
	VLM 0(R5), V1, V4 // 64-bytes into V1..V8
	ADD $64, R5
	VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
	VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order

	VLM 0(R6), V9, V12 // 64-bytes into V9..V16
	ADD $64, R6
	VPDI $0x4, V9, V9, V9    // flip the doublewords to big-endian order
	VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order

	VSBCBIQ V1, V9, V0, V25
	VSBIQ V1, V9, V0, V17
	VSBCBIQ V2, V10, V25, V26
	VSBIQ V2, V10, V25, V18

	VLM 0(R5), V5, V6   // 32-bytes into V1..V8
	VLM 0(R6), V13, V14 // 32-bytes into V9..V16
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V3, V3, V3    // flip the doublewords to big-endian order
	VPDI $0x4, V4, V4, V4    // flip the doublewords to big-endian order
	VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
	VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order

	VSBCBIQ V3, V11, V26, V27
	VSBIQ V3, V11, V26, V19
	VSBCBIQ V4, V12, V27, V28
	VSBIQ V4, V12, V27, V20

	VLM 0(R5), V7, V8   // 32-bytes into V1..V8
	VLM 0(R6), V15, V16 // 32-bytes into V9..V16
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V5, V5, V5    // flip the doublewords to big-endian order
	VPDI $0x4, V6, V6, V6    // flip the doublewords to big-endian order
	VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
	VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order

	VSBCBIQ V5, V13, V28, V29
	VSBIQ V5, V13, V28, V21
	VSBCBIQ V6, V14, V29, V30
	VSBIQ V6, V14, V29, V22

	VPDI $0x4, V7, V7, V7    // flip the doublewords to big-endian order
	VPDI $0x4, V8, V8, V8    // flip the doublewords to big-endian order
	VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
	VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order

	VSBCBIQ V7, V15, V30, V31
	VSBIQ V7, V15, V30, V23
	VSBCBIQ V8, V16, V31, V0 // V0 has carry-over
	VSBIQ V8, V16, V31, V24

	VPDI $0x4, V17, V17, V17 // flip the doublewords to big-endian order
	VPDI $0x4, V18, V18, V18 // flip the doublewords to big-endian order
	VPDI $0x4, V19, V19, V19 // flip the doublewords to big-endian order
	VPDI $0x4, V20, V20, V20 // flip the doublewords to big-endian order
	VPDI $0x4, V21, V21, V21 // flip the doublewords to big-endian order
	VPDI $0x4, V22, V22, V22 // flip the doublewords to big-endian order
	VPDI $0x4, V23, V23, V23 // flip the doublewords to big-endian order
	VPDI $0x4, V24, V24, V24 // flip the doublewords to big-endian order
	VSTM V17, V24, 0(R7) // 128-bytes into z
	ADD $128, R7
	ADD $128, R10 // i += 16
	SUB $16, R3   // n -= 16
	BGE UU1       // if n >= 0 goto U1
	VLGVG $1, V0, R4 // put cf into R4
	SUB $1, R4       // save cf (cf=1 means no borrow -> R4 = 0 or -1)

A1:
	ADD $12, R3 // n += 16
	BLT v1      // if n < 0 goto v1

U1: // n >= 0
	// regular loop body unrolled 4x
	// Borrow is kept negated in R4; 0 - R4 via SUBC re-creates the CC.
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	MOVD R0, R11
	SUBC R4, R11 // restore CF
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD 8(R9)(R10*1), R11
	SUBE R11, R6
	MOVD 16(R9)(R10*1), R11
	SUBE R11, R7
	MOVD 24(R9)(R10*1), R11
	SUBE R11, R1
	MOVD R0, R4
	SUBE R4, R4 // save CF
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1       // if n >= 0 goto U1n

v1:
	ADD $4, R3 // n += 4
	BLE E1     // if n <= 0 goto E1

L1: // n > 0
	MOVD R0, R11
	SUBC R4, R11 // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	SUBE R4, R4 // save CF

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1      // if n > 0 goto L1n

E1:
	NEG R4, R4        // R4 was 0 or -1; final borrow as 0 or 1
	MOVD R4, c+72(FP) // return c
	RET

// Register mapping relative to the amd64 implementation:
// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
// subVV_novec: scalar z = x - y for machines without the vector facility.
TEXT ·subVV_novec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2

	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUB $4, R3 // n -= 4
	BLT v1     // if n < 0 goto v1

U1: // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	MOVD R0, R11
	SUBC R4, R11 // restore CF
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD 8(R9)(R10*1), R11
	SUBE R11, R6
	MOVD 16(R9)(R10*1), R11
	SUBE R11, R7
	MOVD 24(R9)(R10*1), R11
	SUBE R11, R1
	MOVD R0, R4
	SUBE R4, R4 // save CF
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1       // if n >= 0 goto U1

v1:
	ADD $4, R3 // n += 4
	BLE E1     // if n <= 0 goto E1

L1: // n > 0
	MOVD R0, R11
	SUBC R4, R11 // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	SUBE R4, R4 // save CF

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1      // if n > 0 goto L1

E1:
	NEG R4, R4
	MOVD R4, c+72(FP) // return c
	RET

// func addVW(z, x []Word, y Word) (c Word)
// Adds the single word y into x, storing into z. Once the carry dies out
// the remaining words are bulk-copied with MVC (see copySetup below).
TEXT ·addVW(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R5 // length of z
	MOVD x+24(FP), R6
	MOVD y+48(FP), R7 // c = y
	MOVD z+0(FP), R8

	CMPBEQ R5, $0, returnC // if len(z) == 0, we can have an early return

	// Add the first two words, and determine which path (copy path or loop path) to take based on the carry flag.
	ADDC 0(R6), R7
	MOVD R7, 0(R8)
	CMPBEQ R5, $1, returnResult // len(z) == 1
	MOVD $0, R9
	ADDE 8(R6), R9
	MOVD R9, 8(R8)
	CMPBEQ R5, $2, returnResult // len(z) == 2

	// Update the counters
	MOVD $16, R12    // i = 2
	MOVD $-2(R5), R5 // n = n - 2

loopOverEachWord:
	BRC $12, copySetup // carry = 0, copy the rest
	MOVD $1, R9

	// Originally we used the carry flag generated in the previous iteration
	// (i.e: ADDE could be used here to do the addition). However, since we
	// already know carry is 1 (otherwise we will go to copy section), we can use
	// ADDC here so the current iteration does not depend on the carry flag
	// generated in the previous iteration. This could be useful when branch prediction happens.
	ADDC 0(R6)(R12*1), R9
	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] + c

	MOVD $8(R12), R12          // i++
	BRCTG R5, loopOverEachWord // n--

// Return the current carry value
returnResult:
	MOVD $0, R0
	ADDE R0, R0
	MOVD R0, c+56(FP)
	RET

// Update position of x(R6) and z(R8) based on the current counter value and perform copying.
// With the assumption that x and z will not overlap with each other or x and z will
// point to same memory region, we can use a faster version of copy using only MVC here.
// In the following implementation, we have three copy loops, each copying a word, 4 words, and
// 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove.
copySetup:
	ADD R12, R6
	ADD R12, R8

	CMPBGE R5, $4, mediumLoop

smallLoop: // does a loop unrolling to copy word when n < 4
	CMPBEQ R5, $0, returnZero
	MVC $8, 0(R6), 0(R8)
	CMPBEQ R5, $1, returnZero
	MVC $8, 8(R6), 8(R8)
	CMPBEQ R5, $2, returnZero
	MVC $8, 16(R6), 16(R8)

returnZero:
	MOVD $0, c+56(FP) // return 0 as carry
	RET

mediumLoop:
	CMPBLT R5, $4, smallLoop
	CMPBLT R5, $32, mediumLoopBody

largeLoop: // Copying 256 bytes at a time.
	MVC $256, 0(R6), 0(R8)
	MOVD $256(R6), R6
	MOVD $256(R8), R8
	MOVD $-32(R5), R5
	CMPBGE R5, $32, largeLoop
	BR mediumLoop

mediumLoopBody: // Copying 32 bytes at a time
	MVC $32, 0(R6), 0(R8)
	MOVD $32(R6), R6
	MOVD $32(R8), R8
	MOVD $-4(R5), R5
	CMPBGE R5, $4, mediumLoopBody
	BR smallLoop

returnC:
	MOVD R7, c+56(FP)
	RET

// func subVW(z, x []Word, y Word) (c Word)
// Subtracts the single word y from x, storing into z. Once the borrow
// dies out the remaining words are bulk-copied with MVC.
TEXT ·subVW(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R5
	MOVD x+24(FP), R6
	MOVD y+48(FP), R7 // The borrow bit passed in
	MOVD z+0(FP), R8
	MOVD $0, R0 // R0 is a temporary variable used during computation. Ensure it has zero in it.

	CMPBEQ R5, $0, returnC // len(z) == 0, have an early return

	// Subtract the first two words, and determine which path (copy path or loop path) to take based on the borrow flag
	MOVD 0(R6), R9
	SUBC R7, R9
	MOVD R9, 0(R8)
	CMPBEQ R5, $1, returnResult
	MOVD 8(R6), R9
	SUBE R0, R9
	MOVD R9, 8(R8)
	CMPBEQ R5, $2, returnResult

	// Update the counters
	MOVD $16, R12    // i = 2
	MOVD $-2(R5), R5 // n = n - 2

loopOverEachWord:
	BRC $3, copySetup // no borrow, copy the rest
	MOVD 0(R6)(R12*1), R9

	// Originally we used the borrow flag generated in the previous iteration
	// (i.e: SUBE could be used here to do the subtraction). However, since we
	// already know borrow is 1 (otherwise we will go to copy section), we can
	// use SUBC here so the current iteration does not depend on the borrow flag
	// generated in the previous iteration. This could be useful when branch prediction happens.
	SUBC $1, R9
	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] - 1

	MOVD $8(R12), R12          // i++
	BRCTG R5, loopOverEachWord // n--

// return the current borrow value
returnResult:
	SUBE R0, R0
	NEG R0, R0
	MOVD R0, c+56(FP)
	RET

// Update position of x(R6) and z(R8) based on the current counter value and perform copying.
// With the assumption that x and z will not overlap with each other or x and z will
// point to same memory region, we can use a faster version of copy using only MVC here.
// In the following implementation, we have three copy loops, each copying a word, 4 words, and
// 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove.
copySetup:
	ADD R12, R6
	ADD R12, R8

	CMPBGE R5, $4, mediumLoop

smallLoop: // does a loop unrolling to copy word when n < 4
	CMPBEQ R5, $0, returnZero
	MVC $8, 0(R6), 0(R8)
	CMPBEQ R5, $1, returnZero
	MVC $8, 8(R6), 8(R8)
	CMPBEQ R5, $2, returnZero
	MVC $8, 16(R6), 16(R8)

returnZero:
	MOVD $0, c+56(FP) // return 0 as borrow
	RET

mediumLoop:
	CMPBLT R5, $4, smallLoop
	CMPBLT R5, $32, mediumLoopBody

largeLoop: // Copying 256 bytes at a time
	MVC $256, 0(R6), 0(R8)
	MOVD $256(R6), R6
	MOVD $256(R8), R8
	MOVD $-32(R5), R5
	CMPBGE R5, $32, largeLoop
	BR mediumLoop

mediumLoopBody: // Copying 32 bytes at a time
	MVC $32, 0(R6), 0(R8)
	MOVD $32(R6), R6
	MOVD $32(R8), R8
	MOVD $-4(R5), R5
	CMPBGE R5, $4, mediumLoopBody
	BR smallLoop

returnC:
	MOVD R7, c+56(FP)
	RET

// func shlVU(z, x []Word, s uint) (c Word)
// No assembly version; defer to the generic Go implementation.
TEXT ·shlVU(SB), NOSPLIT, $0
	BR ·shlVU_g(SB)

// func shrVU(z, x []Word, s uint) (c Word)
// No assembly version; defer to the generic Go implementation.
TEXT ·shrVU(SB), NOSPLIT, $0
	BR ·shrVU_g(SB)

// Register mapping relative to the amd64 implementation:
// CX = R4, r8 = r8, r9=r9, r10 = r2, r11 = r5, DX = r3, AX = r6, BX = R1, (R0 set to 0) + use R11 + use R7 for i
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// NOTE(review): MULHDU leaves the low half of the product in R11; the
// code below reads the high half from the second operand register (R6) —
// confirm against the Go s390x assembler's MULHDU expansion.
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOVD z+0(FP), R2
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD r+56(FP), R4 // c = r
	MOVD z_len+8(FP), R5
	MOVD $0, R1 // i*8 = 0 (byte offset into x and z)
	MOVD $0, R7 // i = 0
	MOVD $0, R0 // make sure it's zero
	BR E5

L5:
	MOVD (R8)(R1*1), R6
	MULHDU R9, R6
	ADDC R4, R11 // add to low order bits
	ADDE R0, R6
	MOVD R11, (R2)(R1*1)
	MOVD R6, R4
	ADD $8, R1 // i*8 + 8
	ADD $1, R7 // i++

E5:
	CMPBLT R7, R5, L5 // i < n

	MOVD R4, c+64(FP)
	RET

// func addMulVVW(z, x []Word, y Word) (c Word)
// Register mapping relative to the amd64 implementation:
// CX = R4, r8 = r8, r9=r9, r10 = r2, r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1, (R0 set to 0) + use R11 + use R7 for i
// The main loop is unrolled 2x; R12 = n with the low bit cleared is the
// trip count for the unrolled part, and L6 mops up an odd final word.
TEXT ·addMulVVW(SB), NOSPLIT, $0
	MOVD z+0(FP), R2
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z_len+8(FP), R5

	MOVD $0, R1 // i*8 = 0
	MOVD $0, R7 // i = 0
	MOVD $0, R0 // make sure it's zero
	MOVD $0, R4 // c = 0

	MOVD R5, R12
	AND $-2, R12 // R12 = n &^ 1
	CMPBGE R5, $2, A6
	BR E6

A6:
	MOVD (R8)(R1*1), R6
	MULHDU R9, R6
	MOVD (R2)(R1*1), R10
	ADDC R10, R11 // add to low order bits
	ADDE R0, R6
	ADDC R4, R11
	ADDE R0, R6
	MOVD R6, R4
	MOVD R11, (R2)(R1*1)

	MOVD (8)(R8)(R1*1), R6
	MULHDU R9, R6
	MOVD (8)(R2)(R1*1), R10
	ADDC R10, R11 // add to low order bits
	ADDE R0, R6
	ADDC R4, R11
	ADDE R0, R6
	MOVD R6, R4
	MOVD R11, (8)(R2)(R1*1)

	ADD $16, R1 // i*8 += 16
	ADD $2, R7  // i += 2

	CMPBLT R7, R12, A6
	BR E6

L6:
	MOVD (R8)(R1*1), R6
	MULHDU R9, R6
	MOVD (R2)(R1*1), R10
	ADDC R10, R11 // add to low order bits
	ADDE R0, R6
	ADDC R4, R11
	ADDE R0, R6
	MOVD R6, R4
	MOVD R11, (R2)(R1*1)

	ADD $8, R1 // i*8 + 8
	ADD $1, R7 // i++

E6:
	CMPBLT R7, R5, L6 // i < n

	MOVD R4, c+56(FP)
	RET