// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// Code for the perl script that generates the ppc64 assembler
// can be found in the cryptogams repository at the link below. It is based on
// the original from openssl.

// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91

// The differences in this and the original implementation are
// due to the calling conventions and initialization of constants.
// +build !gccgo,!purego

#include "textflag.h"

// Registers holding the five Go arguments for the whole function.
#define OUT R3
#define INP R4
#define LEN R5
#define KEY R6
#define CNT R7
// Scratch byte register used only by the tail loop.
#define TMP R15

// CONSTBASE addresses the consts<> table; BLOCKS is len/64 (whole blocks).
#define CONSTBASE R16
#define BLOCKS R17

// consts<>+0x00: the four 32-bit words 0x61707865, 0x3320646e, 0x79622d32,
// 0x6b206574, whose bytes spell the ASCII string "expand 32-byte k".
DATA consts<>+0x00(SB)/8, $0x3320646e61707865
DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
// consts<>+0x10 .. +0x4f are not referenced by the code in this function.
DATA consts<>+0x10(SB)/8, $0x0000000000000001
DATA consts<>+0x18(SB)/8, $0x0000000000000000
DATA consts<>+0x20(SB)/8, $0x0000000000000004
DATA consts<>+0x28(SB)/8, $0x0000000000000000
DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d
DATA consts<>+0x38(SB)/8, $0x0203000106070405
DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c
DATA consts<>+0x48(SB)/8, $0x0102030005060704
// consts<>+0x50 .. +0x8f: each of the four words above replicated into all
// four lanes of one 16-byte vector (loaded into V0-V3 per outer iteration).
DATA consts<>+0x50(SB)/8, $0x6170786561707865
DATA consts<>+0x58(SB)/8, $0x6170786561707865
DATA consts<>+0x60(SB)/8, $0x3320646e3320646e
DATA consts<>+0x68(SB)/8, $0x3320646e3320646e
DATA consts<>+0x70(SB)/8, $0x79622d3279622d32
DATA consts<>+0x78(SB)/8, $0x79622d3279622d32
DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
// consts<>+0x90: the words 0, 1, 2, 3 (per-lane block counter offsets).
DATA consts<>+0x90(SB)/8, $0x0000000100000000
DATA consts<>+0x98(SB)/8, $0x0000000300000002
GLOBL consts<>(SB), RODATA, $0xa0

// chaCha20_ctr32_vsx XORs len bytes from inp with ChaCha20 keystream and
// writes the result to out. Keystream is produced four 64-byte blocks at a
// time, one block per 32-bit vector lane. On return the 32-bit word at
// counter has been advanced by len/64 (whole blocks only).
//
// The 16 bytes at counter are loaded as one vector: word 0 is used as the
// block counter and words 1-3 are mixed in unchanged.
//
// NOTE(review): the code appears to assume len > 0. With len == 0 control
// reaches tail_vsx and loads CTR with 0 before a decrement-and-branch loop;
// confirm callers never pass 0.
//
//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
	MOVD out+0(FP), OUT
	MOVD inp+8(FP), INP
	MOVD len+16(FP), LEN
	MOVD key+24(FP), KEY
	MOVD counter+32(FP), CNT

	// Addressing for constants
	// R8-R11 are fixed byte offsets used as index registers below.
	MOVD $consts<>+0x00(SB), CONSTBASE
	MOVD $16, R8
	MOVD $32, R9
	MOVD $48, R10
	MOVD $64, R11
	// BLOCKS = LEN / 64, consumed at done_vsx.
	SRD $6, LEN, BLOCKS
	// V16
	LXVW4X (CONSTBASE)(R0), VS48
	// CONSTBASE now points at consts<>+0x50 (the splatted words).
	ADD $80,CONSTBASE

	// Load key into V17,V18
	LXVW4X (KEY)(R0), VS49
	LXVW4X (KEY)(R8), VS50

	// Load CNT, NONCE into V19
	LXVW4X (CNT)(R0), VS51

	// Clear V27
	VXOR V27, V27, V27

	// V28
	// CONSTBASE+64 is consts<>+0x90: the words 0, 1, 2, 3.
	LXVW4X (CONSTBASE)(R11), VS60

	// splat slot from V19 -> V26
	VSPLTW $0, V19, V26

	// Shift V19 through the zeroed V27 so that the word just splatted
	// (word 0) becomes zero while words 1-3 keep their values.
	VSLDOI $4, V19, V27, V19
	VSLDOI $12, V27, V19, V19

	// V26 = counter + {0,1,2,3}: one block counter per lane.
	VADDUWM V26, V28, V26

	// CTR = 10 iterations of loop_vsx.
	MOVD $10, R14
	MOVD R14, CTR

loop_outer_vsx:
	// V0, V1, V2, V3
	LXVW4X (R0)(CONSTBASE), VS32
	LXVW4X (R8)(CONSTBASE), VS33
	LXVW4X (R9)(CONSTBASE), VS34
	LXVW4X (R10)(CONSTBASE), VS35

	// splat values from V17, V18 into V4-V11
	VSPLTW $0, V17, V4
	VSPLTW $1, V17, V5
	VSPLTW $2, V17, V6
	VSPLTW $3, V17, V7
	VSPLTW $0, V18, V8
	VSPLTW $1, V18, V9
	VSPLTW $2, V18, V10
	VSPLTW $3, V18, V11

	// VOR
	// V12 = copy of the per-lane counters in V26.
	VOR V26, V26, V12

	// splat values from V19 -> V13, V14, V15
	VSPLTW $1, V19, V13
	VSPLTW $2, V19, V14
	VSPLTW $3, V19, V15

	// splat const values
	// Rotate amounts for VRLW: 16, 12, 8, 7. The 16 is written as -16
	// because the splat immediate is a 5-bit signed value; VRLW only
	// uses the low five bits of each word.
	VSPLTISW $-16, V27
	VSPLTISW $12, V28
	VSPLTISW $8, V29
	VSPLTISW $7, V30

loop_vsx:
	// First half: quarter rounds on (V0,V4,V8,V12), (V1,V5,V9,V13),
	// (V2,V6,V10,V14), (V3,V7,V11,V15).
	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VXOR V12, V0, V12
	VXOR V13, V1, V13
	VXOR V14, V2, V14
	VXOR V15, V3, V15

	VRLW V12, V27, V12
	VRLW V13, V27, V13
	VRLW V14, V27, V14
	VRLW V15, V27, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V28, V4
	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7

	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VXOR V12, V0, V12
	VXOR V13, V1, V13
	VXOR V14, V2, V14
	VXOR V15, V3, V15

	VRLW V12, V29, V12
	VRLW V13, V29, V13
	VRLW V14, V29, V14
	VRLW V15, V29, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V30, V4
	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7

	// Second half: quarter rounds on (V0,V5,V10,V15), (V1,V6,V11,V12),
	// (V2,V7,V8,V13), (V3,V4,V9,V14).
	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VXOR V15, V0, V15
	VXOR V12, V1, V12
	VXOR V13, V2, V13
	VXOR V14, V3, V14

	VRLW V15, V27, V15
	VRLW V12, V27, V12
	VRLW V13, V27, V13
	VRLW V14, V27, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7
	VRLW V4, V28, V4

	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VXOR V15, V0, V15
	VXOR V12, V1, V12
	VXOR V13, V2, V13
	VXOR V14, V3, V14

	VRLW V15, V29, V15
	VRLW V12, V29, V12
	VRLW V13, V29, V13
	VRLW V14, V29, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7
	VRLW V4, V30, V4
	// BO=16: decrement CTR and branch while it is non-zero.
	BC 16, LT, loop_vsx

	// Add the per-lane starting counters back into V12.
	VADDUWM V12, V26, V12

	// Merge/permute sequence that regroups V0-V15 so that the code below
	// can take block n from Vn, V(4+n), V(8+n), V(12+n). V27-V30 are
	// used as temporaries. The VMRGEW/VMRGOW instructions are emitted as
	// raw words.
	WORD $0x13600F8C		// VMRGEW V0, V1, V27
	WORD $0x13821F8C		// VMRGEW V2, V3, V28

	WORD $0x10000E8C		// VMRGOW V0, V1, V0
	WORD $0x10421E8C		// VMRGOW V2, V3, V2

	WORD $0x13A42F8C		// VMRGEW V4, V5, V29
	WORD $0x13C63F8C		// VMRGEW V6, V7, V30

	XXPERMDI VS32, VS34, $0, VS33
	XXPERMDI VS32, VS34, $3, VS35
	XXPERMDI VS59, VS60, $0, VS32
	XXPERMDI VS59, VS60, $3, VS34

	WORD $0x10842E8C		// VMRGOW V4, V5, V4
	WORD $0x10C63E8C		// VMRGOW V6, V7, V6

	WORD $0x13684F8C		// VMRGEW V8, V9, V27
	WORD $0x138A5F8C		// VMRGEW V10, V11, V28

	XXPERMDI VS36, VS38, $0, VS37
	XXPERMDI VS36, VS38, $3, VS39
	XXPERMDI VS61, VS62, $0, VS36
	XXPERMDI VS61, VS62, $3, VS38

	WORD $0x11084E8C		// VMRGOW V8, V9, V8
	WORD $0x114A5E8C		// VMRGOW V10, V11, V10

	WORD $0x13AC6F8C		// VMRGEW V12, V13, V29
	WORD $0x13CE7F8C		// VMRGEW V14, V15, V30

	XXPERMDI VS40, VS42, $0, VS41
	XXPERMDI VS40, VS42, $3, VS43
	XXPERMDI VS59, VS60, $0, VS40
	XXPERMDI VS59, VS60, $3, VS42

	WORD $0x118C6E8C		// VMRGOW V12, V13, V12
	WORD $0x11CE7E8C		// VMRGOW V14, V15, V14

	// Advance every lane counter by 4 for the next outer iteration.
	VSPLTISW $4, V27
	VADDUWM V26, V27, V26

	XXPERMDI VS44, VS46, $0, VS45
	XXPERMDI VS44, VS46, $3, VS47
	XXPERMDI VS61, VS62, $0, VS44
	XXPERMDI VS61, VS62, $3, VS46

	// Block 0: add V16-V19 (constants, key, zeroed-counter/nonce words)
	// to form the keystream in V0, V4, V8, V12.
	VADDUWM V0, V16, V0
	VADDUWM V4, V17, V4
	VADDUWM V8, V18, V8
	VADDUWM V12, V19, V12

	// Fewer than 64 bytes left: finish byte-wise.
	CMPU LEN, $64
	BLT tail_vsx

	// Bottom of loop
	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT
	// Tests the CMPU above (LEN was exactly 64); the ADDs in between
	// are not the record form.
	BEQ done_vsx

	// Block 1: move its rows into V0, V4, V8, V12.
	VADDUWM V1, V16, V0
	VADDUWM V5, V17, V4
	VADDUWM V9, V18, V8
	VADDUWM V13, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62
	VXOR V27, V0, V27

	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	// Index register is the GPR R10 (offset 48), as in the other three
	// store groups; a vector register name is not a valid index here.
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT
	BEQ done_vsx

	// Block 2.
	VADDUWM V2, V16, V0
	VADDUWM V6, V17, V4
	VADDUWM V10, V18, V8
	VADDUWM V14, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT
	BEQ done_vsx

	// Block 3.
	VADDUWM V3, V16, V0
	VADDUWM V7, V17, V4
	VADDUWM V11, V18, V8
	VADDUWM V15, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT

	// Re-arm the round counter, then loop if the last CMPU found that
	// LEN was not exactly 64 (i.e. input remains).
	MOVD $10, R14
	MOVD R14, CTR
	BNE loop_outer_vsx

done_vsx:
	// Increment counter by number of 64 byte blocks
	// The counter is a uint32, so use 32-bit accesses: a 64-bit
	// read-modify-write would also rewrite the adjacent word.
	MOVWZ (CNT), R14
	ADD BLOCKS, R14
	MOVW R14, (CNT)
	RET

tail_vsx:
	// R11 = start of the 64-byte scratch area at R1+32.
	// NOTE(review): the +32 presumably skips the fixed area at the bottom
	// of the frame; confirm against the ppc64 frame layout.
	ADD $32, R1, R11
	// One loop iteration per remaining byte.
	MOVD LEN, CTR

	// Save values on stack to copy from
	STXVW4X VS32, (R11)(R0)
	STXVW4X VS36, (R11)(R8)
	STXVW4X VS40, (R11)(R9)
	STXVW4X VS44, (R11)(R10)
	// Pre-decrement all three pointers: the loop uses update-form
	// accesses at offset 1.
	ADD $-1, R11, R12
	ADD $-1, INP
	ADD $-1, OUT

looptail_vsx:
	// Copying the result to OUT
	// in bytes.
	// KEY is no longer needed as a pointer and is reused as scratch.
	MOVBZU 1(R12), KEY
	MOVBZU 1(INP), TMP
	XOR KEY, TMP, KEY
	MOVBU KEY, 1(OUT)
	BC 16, LT, looptail_vsx

	// Clear the stack values
	// Overwrites the saved keystream with the contents of VS48 (V16).
	STXVW4X VS48, (R11)(R0)
	STXVW4X VS48, (R11)(R8)
	STXVW4X VS48, (R11)(R9)
	STXVW4X VS48, (R11)(R10)
	BR done_vsx