// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !math_big_pure_go && (ppc64 || ppc64le)
// +build !math_big_pure_go
// +build ppc64 ppc64le

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
//
// Conventions used throughout this file:
//
//   - R0 is only ever read, and is relied upon to hold zero: it is
//     compared against to test for zero and copied to clear a register.
//   - CA is the carry bit. After the subtract-with-carry instructions
//     (SUBC/SUBE), CA = 1 means "no borrow" and CA = 0 means "borrow".
//   - MOVDU is a load/store with update: the base register is advanced
//     by the displacement as part of the access.
//   - "BC 16, 0, label" is bdnz: decrement CTR and branch while CTR != 0.
//   - Labels (loop, tail, final, done) are local to each TEXT block.
//   - Only z_len is consulted for the element count; x (and y) are
//     presumably at least as long as z -- this is the caller's
//     responsibility and is not checked here.

// func mulWW(x, y Word) (z1, z0 Word)
// z1:z0 = x * y (unsigned), z1 being the high-order word.
TEXT ·mulWW(SB), NOSPLIT, $0
	MOVD   x+0(FP), R4
	MOVD   y+8(FP), R5
	MULHDU R4, R5, R6      // R6 = High-order(x*y)
	MULLD  R4, R5, R7      // R7 = Low-order(x*y)
	MOVD   R6, z1+16(FP)
	MOVD   R7, z0+24(FP)
	RET

// func addVV(z, x, y []Word) (c Word)
// z[i] = x[i] + y[i] for all i, carrying
//
// Register use:
//   R7        = number of elements still to process
//   R8/R9/R10 = running pointers into x, y, z
//   R5, CTR   = number of 4-element loop iterations
//   R4        = c, the carry out of the last element
TEXT ·addVV(SB), NOSPLIT, $0
	MOVD  z_len+8(FP), R7   // R7 = z_len
	MOVD  x+24(FP), R8      // R8 = x[]
	MOVD  y+48(FP), R9      // R9 = y[]
	MOVD  z+0(FP), R10      // R10 = z[]

	// If z_len = 0, we are done
	CMP   R0, R7
	MOVD  R0, R4            // R4 = c = 0
	BEQ   done

	// Process the first iteration out of the loop so we can
	// use MOVDU and avoid 3 index registers updates.
	MOVD  0(R8), R11        // R11 = x[i]
	MOVD  0(R9), R12        // R12 = y[i]
	ADD   $-1, R7           // R7 = z_len - 1
	ADDC  R12, R11, R15     // R15 = x[i] + y[i], set CA
	CMP   R0, R7
	MOVD  R15, 0(R10)       // z[i]
	BEQ   final             // If z_len was 1, we are done

	SRD   $2, R7, R5        // R5 = (z_len-1)/4 = number of full groups of 4
	CMP   R0, R5
	MOVD  R5, CTR           // Set up loop counter
	BEQ   tail              // If R5 = 0, we can't use the loop

	// Process 4 elements per iteration. Unrolling this loop
	// means a performance trade-off: we will lose performance
	// for small values of z_len (0.90x in the worst case), but
	// gain significant performance as z_len increases (up to
	// 1.45x).
loop:
	MOVD  8(R8), R11        // R11 = x[i]
	MOVD  16(R8), R12       // R12 = x[i+1]
	MOVD  24(R8), R14       // R14 = x[i+2]
	MOVDU 32(R8), R15       // R15 = x[i+3], R8 += 32
	MOVD  8(R9), R16        // R16 = y[i]
	MOVD  16(R9), R17       // R17 = y[i+1]
	MOVD  24(R9), R18       // R18 = y[i+2]
	MOVDU 32(R9), R19       // R19 = y[i+3], R9 += 32
	ADDE  R11, R16, R20     // R20 = x[i] + y[i] + CA
	ADDE  R12, R17, R21     // R21 = x[i+1] + y[i+1] + CA
	ADDE  R14, R18, R22     // R22 = x[i+2] + y[i+2] + CA
	ADDE  R15, R19, R23     // R23 = x[i+3] + y[i+3] + CA
	MOVD  R20, 8(R10)       // z[i]
	MOVD  R21, 16(R10)      // z[i+1]
	MOVD  R22, 24(R10)      // z[i+2]
	MOVDU R23, 32(R10)      // z[i+3], R10 += 32
	ADD   $-4, R7           // R7 -= 4 (elements remaining)
	BC    16, 0, loop       // bdnz

	// We may have more elements to read
	CMP   R0, R7
	BEQ   final

	// Process the remaining elements (at most 3), one at a time
tail:
	MOVDU 8(R8), R11        // R11 = x[i]
	MOVDU 8(R9), R16        // R16 = y[i]
	ADD   $-1, R7           // R7 -= 1 (elements remaining)
	ADDE  R11, R16, R20     // R20 = x[i] + y[i] + CA
	CMP   R0, R7
	MOVDU R20, 8(R10)       // z[i]
	BEQ   final             // If R7 = 0, we are done

	MOVDU 8(R8), R11
	MOVDU 8(R9), R16
	ADD   $-1, R7
	ADDE  R11, R16, R20
	CMP   R0, R7
	MOVDU R20, 8(R10)
	BEQ   final

	// Last possible element: no counter or pointer update needed.
	MOVD  8(R8), R11
	MOVD  8(R9), R16
	ADDE  R11, R16, R20
	MOVD  R20, 8(R10)

final:
	ADDZE R4                // Capture CA: R4 = 0 + CA

done:
	MOVD  R4, c+72(FP)
	RET

// func subVV(z, x, y []Word) (c Word)
// z[i] = x[i] - y[i] for all i, borrowing
//
// Same structure and register use as addVV. The borrow is tracked
// in CA with inverted sense (CA = 1 means no borrow), so the result
// c is the complement of the final CA.
TEXT ·subVV(SB), NOSPLIT, $0
	MOVD  z_len+8(FP), R7   // R7 = z_len
	MOVD  x+24(FP), R8      // R8 = x[]
	MOVD  y+48(FP), R9      // R9 = y[]
	MOVD  z+0(FP), R10      // R10 = z[]

	// If z_len = 0, we are done
	CMP   R0, R7
	MOVD  R0, R4            // R4 = c = 0
	BEQ   done

	// Process the first iteration out of the loop so we can
	// use MOVDU and avoid 3 index registers updates.
	MOVD  0(R8), R11        // R11 = x[i]
	MOVD  0(R9), R12        // R12 = y[i]
	ADD   $-1, R7           // R7 = z_len - 1
	SUBC  R12, R11, R15     // R15 = x[i] - y[i], set CA
	CMP   R0, R7
	MOVD  R15, 0(R10)       // z[i]
	BEQ   final             // If z_len was 1, we are done

	SRD   $2, R7, R5        // R5 = (z_len-1)/4 = number of full groups of 4
	CMP   R0, R5
	MOVD  R5, CTR           // Set up loop counter
	BEQ   tail              // If R5 = 0, we can't use the loop

	// Process 4 elements per iteration. Unrolling this loop
	// means a performance trade-off: we will lose performance
	// for small values of z_len (0.92x in the worst case), but
	// gain significant performance as z_len increases (up to
	// 1.45x).
loop:
	MOVD  8(R8), R11        // R11 = x[i]
	MOVD  16(R8), R12       // R12 = x[i+1]
	MOVD  24(R8), R14       // R14 = x[i+2]
	MOVDU 32(R8), R15       // R15 = x[i+3], R8 += 32
	MOVD  8(R9), R16        // R16 = y[i]
	MOVD  16(R9), R17       // R17 = y[i+1]
	MOVD  24(R9), R18       // R18 = y[i+2]
	MOVDU 32(R9), R19       // R19 = y[i+3], R9 += 32
	SUBE  R16, R11, R20     // R20 = x[i] - y[i] - (1 - CA)
	SUBE  R17, R12, R21     // R21 = x[i+1] - y[i+1] - (1 - CA)
	SUBE  R18, R14, R22     // R22 = x[i+2] - y[i+2] - (1 - CA)
	SUBE  R19, R15, R23     // R23 = x[i+3] - y[i+3] - (1 - CA)
	MOVD  R20, 8(R10)       // z[i]
	MOVD  R21, 16(R10)      // z[i+1]
	MOVD  R22, 24(R10)      // z[i+2]
	MOVDU R23, 32(R10)      // z[i+3], R10 += 32
	ADD   $-4, R7           // R7 -= 4 (elements remaining)
	BC    16, 0, loop       // bdnz

	// We may have more elements to read
	CMP   R0, R7
	BEQ   final

	// Process the remaining elements (at most 3), one at a time
tail:
	MOVDU 8(R8), R11        // R11 = x[i]
	MOVDU 8(R9), R16        // R16 = y[i]
	ADD   $-1, R7           // R7 -= 1 (elements remaining)
	SUBE  R16, R11, R20     // R20 = x[i] - y[i] - (1 - CA)
	CMP   R0, R7
	MOVDU R20, 8(R10)       // z[i]
	BEQ   final             // If R7 = 0, we are done

	MOVDU 8(R8), R11
	MOVDU 8(R9), R16
	ADD   $-1, R7
	SUBE  R16, R11, R20
	CMP   R0, R7
	MOVDU R20, 8(R10)
	BEQ   final

	// Last possible element: no counter or pointer update needed.
	MOVD  8(R8), R11
	MOVD  8(R9), R16
	SUBE  R16, R11, R20
	MOVD  R20, 8(R10)

final:
	ADDZE R4                // R4 = 0 + CA
	XOR   $1, R4            // c = borrow = 1 - CA

done:
	MOVD  R4, c+72(FP)
	RET

// func addVW(z, x []Word, y Word) (c Word)
// z[i] = x[i] + carry for all i, where the carry into element 0 is y.
//
// Register use:
//   R11     = number of elements still to process
//   R8/R10  = running pointers into x, z
//   R9, CTR = number of 4-element loop iterations
//   R4      = y on entry, c on exit (y is returned unchanged if z_len = 0)
TEXT ·addVW(SB), NOSPLIT, $0
	MOVD  z+0(FP), R10      // R10 = z[]
	MOVD  x+24(FP), R8      // R8 = x[]
	MOVD  y+48(FP), R4      // R4 = y = c
	MOVD  z_len+8(FP), R11  // R11 = z_len

	CMP   R0, R11           // If z_len is zero, return
	BEQ   done

	// We will process the first iteration out of the loop so we capture
	// the value of c. In the subsequent iterations, we will rely on the
	// value of CA set here.
	MOVD  0(R8), R20        // R20 = x[i]
	ADD   $-1, R11          // R11 = z_len - 1
	ADDC  R20, R4, R6       // R6 = x[i] + c, set CA
	CMP   R0, R11           // If z_len was 1, we are done
	MOVD  R6, 0(R10)        // z[i]
	BEQ   final

	// We will read 4 elements per iteration
	SRD   $2, R11, R9       // R9 = (z_len-1)/4 = number of full groups of 4
	DCBT  (R8)              // Cache-touch hint for the x data
	CMP   R0, R9
	MOVD  R9, CTR           // Set up the loop counter
	BEQ   tail              // If R9 = 0, we can't use the loop

loop:
	MOVD  8(R8), R20        // R20 = x[i]
	MOVD  16(R8), R21       // R21 = x[i+1]
	MOVD  24(R8), R22       // R22 = x[i+2]
	MOVDU 32(R8), R23       // R23 = x[i+3], R8 += 32
	ADDZE R20, R24          // R24 = x[i] + CA
	ADDZE R21, R25          // R25 = x[i+1] + CA
	ADDZE R22, R26          // R26 = x[i+2] + CA
	ADDZE R23, R27          // R27 = x[i+3] + CA
	MOVD  R24, 8(R10)       // z[i]
	MOVD  R25, 16(R10)      // z[i+1]
	MOVD  R26, 24(R10)      // z[i+2]
	MOVDU R27, 32(R10)      // z[i+3], R10 += 32
	ADD   $-4, R11          // R11 -= 4 (elements remaining)
	BC    16, 0, loop       // bdnz

	// We may have some elements to read
	CMP   R0, R11
	BEQ   final

	// Process the remaining elements (at most 3), one at a time
tail:
	MOVDU 8(R8), R20        // R20 = x[i]
	ADDZE R20, R24          // R24 = x[i] + CA
	ADD   $-1, R11          // R11 -= 1 (elements remaining)
	MOVDU R24, 8(R10)       // z[i]
	CMP   R0, R11
	BEQ   final

	MOVDU 8(R8), R20
	ADDZE R20, R24
	ADD   $-1, R11
	MOVDU R24, 8(R10)
	CMP   R0, R11
	BEQ   final

	// Last possible element: no counter or pointer update needed.
	MOVD  8(R8), R20
	ADDZE R20, R24
	MOVD  R24, 8(R10)

final:
	ADDZE R0, R4            // c = 0 + CA

done:
	MOVD  R4, c+56(FP)
	RET

// func subVW(z, x []Word, y Word) (c Word)
// z[i] = x[i] - borrow for all i, where the borrow into element 0 is y.
//
// Same structure and register use as addVW. The borrow is tracked
// in CA with inverted sense (CA = 1 means no borrow).
TEXT ·subVW(SB), NOSPLIT, $0
	MOVD  z+0(FP), R10      // R10 = z[]
	MOVD  x+24(FP), R8      // R8 = x[]
	MOVD  y+48(FP), R4      // R4 = y = c
	MOVD  z_len+8(FP), R11  // R11 = z_len

	CMP   R0, R11           // If z_len is zero, return
	BEQ   done

	// We will process the first iteration out of the loop so we capture
	// the value of c. In the subsequent iterations, we will rely on the
	// value of CA set here.
	MOVD  0(R8), R20        // R20 = x[i]
	ADD   $-1, R11          // R11 = z_len - 1
	SUBC  R4, R20, R6       // R6 = x[i] - c, set CA
	CMP   R0, R11           // If z_len was 1, we are done
	MOVD  R6, 0(R10)        // z[i]
	BEQ   final

	// We will read 4 elements per iteration
	SRD   $2, R11, R9       // R9 = (z_len-1)/4 = number of full groups of 4
	DCBT  (R8)              // Cache-touch hint for the x data
	CMP   R0, R9
	MOVD  R9, CTR           // Set up the loop counter
	BEQ   tail              // If R9 = 0, we can't use the loop

	// The loop here is almost the same as the one used in s390x, but
	// we don't need to capture CA every iteration because we've already
	// done that above.
loop:
	MOVD  8(R8), R20        // R20 = x[i]
	MOVD  16(R8), R21       // R21 = x[i+1]
	MOVD  24(R8), R22       // R22 = x[i+2]
	MOVDU 32(R8), R23       // R23 = x[i+3], R8 += 32
	SUBE  R0, R20           // R20 = x[i] - (1 - CA)
	SUBE  R0, R21           // R21 = x[i+1] - (1 - CA)
	SUBE  R0, R22           // R22 = x[i+2] - (1 - CA)
	SUBE  R0, R23           // R23 = x[i+3] - (1 - CA)
	MOVD  R20, 8(R10)       // z[i]
	MOVD  R21, 16(R10)      // z[i+1]
	MOVD  R22, 24(R10)      // z[i+2]
	MOVDU R23, 32(R10)      // z[i+3], R10 += 32
	ADD   $-4, R11          // R11 -= 4 (elements remaining)
	BC    16, 0, loop       // bdnz

	// We may have some elements to read
	CMP   R0, R11
	BEQ   final

	// Process the remaining elements (at most 3), one at a time
tail:
	MOVDU 8(R8), R20        // R20 = x[i]
	SUBE  R0, R20           // R20 = x[i] - (1 - CA)
	ADD   $-1, R11          // R11 -= 1 (elements remaining)
	MOVDU R20, 8(R10)       // z[i]
	CMP   R0, R11
	BEQ   final

	MOVDU 8(R8), R20
	SUBE  R0, R20
	ADD   $-1, R11
	MOVDU R20, 8(R10)
	CMP   R0, R11
	BEQ   final

	// Last possible element: no counter or pointer update needed.
	MOVD  8(R8), R20
	SUBE  R0, R20
	MOVD  R20, 8(R10)

final:
	// Capture CA
	SUBE  R4, R4            // R4 = CA - 1: 0 if no borrow, -1 if borrow
	NEG   R4, R4            // c = 1 if borrow, else 0

done:
	MOVD  R4, c+56(FP)
	RET

// shlVU has no assembly implementation here; it branches straight to
// shlVU_g, which is defined outside this file.
TEXT ·shlVU(SB), NOSPLIT, $0
	BR ·shlVU_g(SB)

// shrVU has no assembly implementation here; it branches straight to
// shrVU_g, which is defined outside this file.
TEXT ·shrVU(SB), NOSPLIT, $0
	BR ·shrVU_g(SB)

// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// For each i: c:z[i] = x[i]*y + c, with c = r going into element 0.
//
// Register use:
//   R11      = number of elements still to process
//   R8/R10   = running pointers into x, z
//   R9       = y
//   R14, CTR = number of 4-element loop iterations
//   R4       = r on entry, running carry word c afterwards
//              (r is returned unchanged if z_len = 0)
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOVD    z+0(FP), R10      // R10 = z[]
	MOVD    x+24(FP), R8      // R8 = x[]
	MOVD    y+48(FP), R9      // R9 = y
	MOVD    r+56(FP), R4      // R4 = r = c
	MOVD    z_len+8(FP), R11  // R11 = z_len

	CMP     R0, R11           // If z_len is zero, return
	BEQ     done

	// First element is handled outside the loop so that the loop and
	// tail can use update-form loads/stores with a fixed +8 offset.
	MOVD    0(R8), R20        // R20 = x[i]
	ADD     $-1, R11          // R11 = z_len - 1
	MULLD   R9, R20, R6       // R6 = z0 = Low-order(x[i]*y)
	MULHDU  R9, R20, R7       // R7 = z1 = High-order(x[i]*y)
	ADDC    R4, R6            // R6 = z0 + r
	ADDZE   R7                // R7 = z1 + CA
	CMP     R0, R11
	MOVD    R7, R4            // R4 = c
	MOVD    R6, 0(R10)        // z[i]
	BEQ     done              // If z_len was 1, we are done

	// We will read 4 elements per iteration
	SRD     $2, R11, R14      // R14 = (z_len-1)/4 = number of full groups of 4
	DCBT    (R8)              // Cache-touch hint for the x data
	CMP     R0, R14
	MOVD    R14, CTR          // Set up the loop counter
	BEQ     tail              // If R14 = 0, we can't use the loop

loop:
	MOVD    8(R8), R20        // R20 = x[i]
	MOVD    16(R8), R21       // R21 = x[i+1]
	MOVD    24(R8), R22       // R22 = x[i+2]
	MOVDU   32(R8), R23       // R23 = x[i+3], R8 += 32
	MULLD   R9, R20, R24      // R24 = z0[i]
	MULHDU  R9, R20, R20      // R20 = z1[i]
	ADDC    R4, R24           // R24 = z0[i] + c
	ADDZE   R20               // R20 = z1[i] + CA
	MULLD   R9, R21, R25      // R25 = z0[i+1]
	MULHDU  R9, R21, R21      // R21 = z1[i+1]
	ADDC    R20, R25          // R25 = z0[i+1] + carry word from i
	ADDZE   R21               // R21 = z1[i+1] + CA
	MULLD   R9, R22, R26      // R26 = z0[i+2]
	MULHDU  R9, R22, R22      // R22 = z1[i+2]
	MULLD   R9, R23, R27      // R27 = z0[i+3]
	MULHDU  R9, R23, R23      // R23 = z1[i+3]
	ADDC    R21, R26          // R26 = z0[i+2] + carry word from i+1
	ADDZE   R22               // R22 = z1[i+2] + CA
	MOVD    R24, 8(R10)       // z[i]
	MOVD    R25, 16(R10)      // z[i+1]
	ADDC    R22, R27          // R27 = z0[i+3] + carry word from i+2
	ADDZE   R23, R4           // update carry: c = z1[i+3] + CA
	MOVD    R26, 24(R10)      // z[i+2]
	MOVDU   R27, 32(R10)      // z[i+3], R10 += 32
	ADD     $-4, R11          // R11 -= 4 (elements remaining)
	BC      16, 0, loop       // bdnz

	// We may have some elements to read
	CMP     R0, R11
	BEQ     done

	// Process the remaining elements (at most 3), one at a time
tail:
	MOVDU   8(R8), R20        // R20 = x[i]
	MULLD   R9, R20, R24      // R24 = z0[i]
	MULHDU  R9, R20, R25      // R25 = z1[i]
	ADD     $-1, R11          // R11 -= 1 (elements remaining)
	ADDC    R4, R24           // R24 = z0[i] + c
	ADDZE   R25               // R25 = z1[i] + CA
	MOVDU   R24, 8(R10)       // z[i]
	CMP     R0, R11
	MOVD    R25, R4           // R4 = c
	BEQ     done              // If R11 = 0, we are done

	MOVDU   8(R8), R20
	MULLD   R9, R20, R24
	MULHDU  R9, R20, R25
	ADD     $-1, R11
	ADDC    R4, R24
	ADDZE   R25
	MOVDU   R24, 8(R10)
	CMP     R0, R11
	MOVD    R25, R4
	BEQ     done

	// Last possible element: the remaining-count register is not read
	// again, so it is not decremented here.
	MOVD    8(R8), R20
	MULLD   R9, R20, R24
	MULHDU  R9, R20, R25
	ADDC    R4, R24
	ADDZE   R25
	MOVD    R24, 8(R10)
	MOVD    R25, R4

done:
	MOVD    R4, c+64(FP)
	RET

// func addMulVVW(z, x []Word, y Word) (c Word)
// For each i: c:z[i] = z[i] + x[i]*y + c, starting with c = 0.
//
// Register use:
//   R3       = byte offset of element i into both x and z
//   R8/R10   = base pointers of x, z (not advanced)
//   R9       = y
//   R22, CTR = z_len, the loop trip count
//   R4       = running carry word c
TEXT ·addMulVVW(SB), NOSPLIT, $0
	MOVD    z+0(FP), R10      // R10 = z[]
	MOVD    x+24(FP), R8      // R8 = x[]
	MOVD    y+48(FP), R9      // R9 = y
	MOVD    z_len+8(FP), R22  // R22 = z_len

	MOVD    R0, R3            // R3 will be the index register
	CMP     R0, R22
	MOVD    R0, R4            // R4 = c = 0
	MOVD    R22, CTR          // Initialize loop counter
	BEQ     done              // If z_len is zero, return c = 0

loop:
	MOVD    (R8)(R3), R20     // Load x[i]
	MOVD    (R10)(R3), R21    // Load z[i]
	MULLD   R9, R20, R6       // R6 = Low-order(x[i]*y)
	MULHDU  R9, R20, R7       // R7 = High-order(x[i]*y)
	ADDC    R21, R6           // R6 = z0 = Low-order + z[i]
	ADDZE   R7                // R7 = z1 = High-order + CA
	ADDC    R4, R6            // R6 = z0 + c
	ADDZE   R7, R4            // c = z1 + CA
	MOVD    R6, (R10)(R3)     // Store z[i]
	ADD     $8, R3            // Advance to the next word
	BC      16, 0, loop       // bdnz

done:
	MOVD    R4, c+56(FP)
	RET