// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !gccgo,!appengine

#include "go_asm.h"
#include "textflag.h"

// This is an implementation of the ChaCha20 encryption algorithm as
// specified in RFC 7539. It uses vector instructions to compute
// 4 keystream blocks in parallel (256 bytes) which are then XORed
// with the bytes in the input slice.
//
// Operand order throughout is Go assembler order: sources first,
// destination last (e.g. "VAF a, b, c" computes c = b + a).

GLOBL ·constants<>(SB), RODATA|NOPTR, $32
// BSWAP: swap bytes in each 4-byte element
// (used as the VPERM selector: bytes 3,2,1,0 / 7,6,5,4 / ...)
DATA ·constants<>+0x00(SB)/4, $0x03020100
DATA ·constants<>+0x04(SB)/4, $0x07060504
DATA ·constants<>+0x08(SB)/4, $0x0b0a0908
DATA ·constants<>+0x0c(SB)/4, $0x0f0e0d0c
// J0: [j0, j1, j2, j3]
// (the four ChaCha20 constant words, state words 0-3)
DATA ·constants<>+0x10(SB)/4, $0x61707865
DATA ·constants<>+0x14(SB)/4, $0x3320646e
DATA ·constants<>+0x18(SB)/4, $0x79622d32
DATA ·constants<>+0x1c(SB)/4, $0x6b206574

// Vector register roles.
//   BSWAP      byte-swap selector for VPERM (constants+0x00)
//   J0         constant words j0..j3        (constants+0x10)
//   KEY0,KEY1  the eight key words
//   NONCE      [0, nonce[0], nonce[1], nonce[2]] after setup
//   CTR        per-block counters [c, c+1, c+2, c+3]
//   M0..M3     scratch: transpose temporaries and 64-byte message buffer
//   INC        [0,1,2,3] during setup, then [4,4,4,4]
//   X0..X15    working state; before the transpose, Xi holds state
//              word i of each of the four blocks being computed
#define BSWAP V5
#define J0    V6
#define KEY0  V7
#define KEY1  V8
#define NONCE V9
#define CTR   V10
#define M0    V11
#define M1    V12
#define M2    V13
#define M3    V14
#define INC   V15
#define X0    V16
#define X1    V17
#define X2    V18
#define X3    V19
#define X4    V20
#define X5    V21
#define X6    V22
#define X7    V23
#define X8    V24
#define X9    V25
#define X10   V26
#define X11   V27
#define X12   V28
#define X13   V29
#define X14   V30
#define X15   V31

#define NUM_ROUNDS 20

// ROUND4 performs four independent quarter rounds, interleaved one
// step at a time. For each argument group (x0, x1, x2, x3) it computes:
//   x0 += x1; x2 ^= x0; x2 <<<= 16
//   x3 += x2; x1 ^= x3; x1 <<<= 12
//   x0 += x1; x2 ^= x0; x2 <<<= 8
//   x3 += x2; x1 ^= x3; x1 <<<= 7
// Note the argument order within a group: in the usual (a, b, c, d)
// quarter-round notation the arguments are passed as (a, b, d, c).
#define ROUND4(a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3) \
	VAF    a1, a0, a0  \
	VAF    b1, b0, b0  \
	VAF    c1, c0, c0  \
	VAF    d1, d0, d0  \
	VX     a0, a2, a2  \
	VX     b0, b2, b2  \
	VX     c0, c2, c2  \
	VX     d0, d2, d2  \
	VERLLF $16, a2, a2 \
	VERLLF $16, b2, b2 \
	VERLLF $16, c2, c2 \
	VERLLF $16, d2, d2 \
	VAF    a2, a3, a3  \
	VAF    b2, b3, b3  \
	VAF    c2, c3, c3  \
	VAF    d2, d3, d3  \
	VX     a3, a1, a1  \
	VX     b3, b1, b1  \
	VX     c3, c1, c1  \
	VX     d3, d1, d1  \
	VERLLF $12, a1, a1 \
	VERLLF $12, b1, b1 \
	VERLLF $12, c1, c1 \
	VERLLF $12, d1, d1 \
	VAF    a1, a0, a0  \
	VAF    b1, b0, b0  \
	VAF    c1, c0, c0  \
	VAF    d1, d0, d0  \
	VX     a0, a2, a2  \
	VX     b0, b2, b2  \
	VX     c0, c2, c2  \
	VX     d0, d2, d2  \
	VERLLF $8, a2, a2  \
	VERLLF $8, b2, b2  \
	VERLLF $8, c2, c2  \
	VERLLF $8, d2, d2  \
	VAF    a2, a3, a3  \
	VAF    b2, b3, b3  \
	VAF    c2, c3, c3  \
	VAF    d2, d3, d3  \
	VX     a3, a1, a1  \
	VX     b3, b1, b1  \
	VX     c3, c1, c1  \
	VX     d3, d1, d1  \
	VERLLF $7, a1, a1  \
	VERLLF $7, b1, b1  \
	VERLLF $7, c1, c1  \
	VERLLF $7, d1, d1

// PERMUTE applies the VPERM selector mask to each of v0..v3 in place.
#define PERMUTE(mask, v0, v1, v2, v3) \
	VPERM v0, v0, mask, v0 \
	VPERM v1, v1, mask, v1 \
	VPERM v2, v2, mask, v2 \
	VPERM v3, v3, mask, v3

// ADDV adds vector x (word-wise) to each of v0..v3 in place.
#define ADDV(x, v0, v1, v2, v3) \
	VAF x, v0, v0 \
	VAF x, v1, v1 \
	VAF x, v2, v2 \
	VAF x, v3, v3

// XORV loads 64 bytes from off(src) into M0..M3, byte-swaps every
// word of v0..v3 with BSWAP, XORs the two and stores the 64 result
// bytes to off(dst). Clobbers M0..M3 and leaves v0..v3 byte-swapped.
#define XORV(off, dst, src, v0, v1, v2, v3) \
	VLM  off(src), M0, M3          \
	PERMUTE(BSWAP, v0, v1, v2, v3) \
	VX   v0, M0, M0                \
	VX   v1, M1, M1                \
	VX   v2, M2, M2                \
	VX   v3, M3, M3                \
	VSTM M0, M3, off(dst)

// SHUFFLE transposes the 4x4 matrix of 32-bit words held in a, b, c, d
// in place, using t, u, v, w as temporaries (which are clobbered).
#define SHUFFLE(a, b, c, d, t, u, v, w) \
	VMRHF a, c, t \ // t = {a[0], c[0], a[1], c[1]}
	VMRHF b, d, u \ // u = {b[0], d[0], b[1], d[1]}
	VMRLF a, c, v \ // v = {a[2], c[2], a[3], c[3]}
	VMRLF b, d, w \ // w = {b[2], d[2], b[3], d[3]}
	VMRHF t, u, a \ // a = {a[0], b[0], c[0], d[0]}
	VMRLF t, u, b \ // b = {a[1], b[1], c[1], d[1]}
	VMRHF v, w, c \ // c = {a[2], b[2], c[2], d[2]}
	VMRLF v, w, d // d = {a[3], b[3], c[3], d[3]}

// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
//
// XORs src with ChaCha20 keystream, 256 bytes (four 64-byte blocks)
// per iteration, and writes the result to dst.
//
// Preconditions (not checked here, the caller must guarantee them):
//   - len(src) is a non-zero multiple of 256. The main loop subtracts
//     256 per iteration and only exits when the remainder is exactly 0.
//   - dst has room for len(src) bytes; len(dst) is never read.
//
// On return *counter has been advanced by 4 for every 256 bytes
// processed (i.e. by the number of 64-byte blocks generated).
//
// General register use:
//   R0  length operand for VLL
//   R1  &constants during setup, then the double-round countdown
//   R2  dst pointer        R3  src pointer      R4  bytes remaining
//   R5  key                R6  nonce            R7  counter
TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
	MOVD $·constants<>(SB), R1
	MOVD dst+0(FP), R2         // R2=&dst[0]
	LMG  src+24(FP), R3, R4    // R3=&src[0] R4=len(src)
	MOVD key+48(FP), R5        // R5=key
	MOVD nonce+56(FP), R6      // R6=nonce
	MOVD counter+64(FP), R7    // R7=counter

	// load BSWAP and J0
	VLM (R1), BSWAP, J0

	// setup
	// NOTE(review): R0=95 is the VLL length operand; it looks like this
	// loads a full 16 bytes from the 12-byte nonce and relies on the
	// shift below to discard the extra word - confirm against the
	// z/Architecture definition of VLL.
	MOVD  $95, R0
	VLM   (R5), KEY0, KEY1
	VLL   R0, (R6), NONCE
	VZERO M0
	VLEIB $7, $32, M0          // byte 7 of M0 = 32: shift amount for VSRLB
	VSRLB M0, NONCE, NONCE     // shift right so words 1..3 hold the nonce and word 0 is zero

	// initialize counter values
	VLREPF (R7), CTR           // CTR = [c, c, c, c]
	VZERO  INC
	VLEIF  $1, $1, INC
	VLEIF  $2, $2, INC
	VLEIF  $3, $3, INC         // INC = [0, 1, 2, 3]
	VAF    INC, CTR, CTR       // CTR = [c, c+1, c+2, c+3]
	VREPIF $4, INC             // INC = [4, 4, 4, 4]

chacha:
	// Build the initial state for four blocks: Xi = state word i
	// replicated across all lanes, except X12 which carries the
	// four distinct block counters.
	VREPF $0, J0, X0
	VREPF $1, J0, X1
	VREPF $2, J0, X2
	VREPF $3, J0, X3
	VREPF $0, KEY0, X4
	VREPF $1, KEY0, X5
	VREPF $2, KEY0, X6
	VREPF $3, KEY0, X7
	VREPF $0, KEY1, X8
	VREPF $1, KEY1, X9
	VREPF $2, KEY1, X10
	VREPF $3, KEY1, X11
	VLR   CTR, X12
	VREPF $1, NONCE, X13
	VREPF $2, NONCE, X14
	VREPF $3, NONCE, X15

	MOVD $(NUM_ROUNDS/2), R1

loop:
	// one double round: column quarter rounds, then diagonal ones
	ROUND4(X0, X4, X12,  X8, X1, X5, X13,  X9, X2, X6, X14, X10, X3, X7, X15, X11)
	ROUND4(X0, X5, X15, X10, X1, X6, X12, X11, X2, X7, X13, X8,  X3, X4, X14, X9)

	ADD $-1, R1
	BNE loop

	// decrement length
	ADD $-256, R4

	// rearrange vectors
	// After each SHUFFLE a vector holds four consecutive state words
	// of a single block, so the matching input words can be added.
	// The per-block counter is added to X12 before its transpose,
	// because NONCE carries zero in the counter position.
	SHUFFLE(X0, X1, X2, X3, M0, M1, M2, M3)
	ADDV(J0, X0, X1, X2, X3)
	SHUFFLE(X4, X5, X6, X7, M0, M1, M2, M3)
	ADDV(KEY0, X4, X5, X6, X7)
	SHUFFLE(X8, X9, X10, X11, M0, M1, M2, M3)
	ADDV(KEY1, X8, X9, X10, X11)
	VAF CTR, X12, X12
	SHUFFLE(X12, X13, X14, X15, M0, M1, M2, M3)
	ADDV(NONCE, X12, X13, X14, X15)

	// increment counters
	VAF INC, CTR, CTR

	// xor keystream with plaintext
	XORV(0*64, R2, R3, X0, X4,  X8, X12)
	XORV(1*64, R2, R3, X1, X5,  X9, X13)
	XORV(2*64, R2, R3, X2, X6, X10, X14)
	XORV(3*64, R2, R3, X3, X7, X11, X15)

	// increment pointers
	MOVD $256(R2), R2
	MOVD $256(R3), R3

	CMPBNE  R4, $0, chacha

	// write the next unused block counter back to the caller
	VSTEF $0, CTR, (R7)
	RET