// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build go1.11,gc,!purego

#include "textflag.h"

// Number of passes through the "chacha" loop below. Each pass performs one
// column round followed by one diagonal round.
#define NUM_ROUNDS 10

// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
//
// XORs src with ChaCha keystream and stores the result to dst, four 64-byte
// blocks (256 bytes) per outer-loop iteration. Only the leading
// len(src) &^ 255 bytes are processed; any remaining tail bytes are left
// untouched, and if that rounded length is zero the routine returns without
// reading src, writing dst or changing *counter. After every 256 bytes the
// 32-bit value at *counter is advanced by 4 and written back.
//
// General-purpose register roles:
//   R1  = dst pointer (post-incremented by the stores)
//   R2  = src pointer (post-incremented by the loads)
//   R3  = len(src)
//   R4  = key pointer
//   R6  = nonce pointer
//   R7  = counter pointer
//   R10 = address of ·constants
//   R11 = address of ·incRotMatrix
//   R12 = src address at which processing stops
//   R13 = len(src) rounded down to a multiple of 256
//   R20 = running block counter value
//   R21 = remaining passes of the inner round loop
//
// Vector register roles inside the round loop:
//   V0..V3   = the four constant words, each replicated across 4 lanes
//   V4..V11  = the eight key words, each replicated across 4 lanes
//   V12      = counter + {0,1,2,3} (one lane per block)
//   V13..V15 = the three nonce words, each replicated across 4 lanes
//   V16..V19 = temporaries for the 12- and 7-bit rotations
//   V30      = lane offsets {0,1,2,3}
//   V31      = VTBL byte permutation implementing a 32-bit rotate-left by 8
TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
	MOVD	dst+0(FP), R1
	MOVD	src+24(FP), R2
	MOVD	src_len+32(FP), R3
	MOVD	key+48(FP), R4
	MOVD	nonce+56(FP), R6
	MOVD	counter+64(FP), R7

	MOVD	$·constants(SB), R10
	MOVD	$·incRotMatrix(SB), R11

	MOVW	(R7), R20

	// R13 = number of bytes covered by whole 256-byte groups.
	AND	$~255, R3, R13
	// The loop below is bottom-tested, so without this check a short input
	// would still get one full 256-byte pass.
	CBZ	R13, done
	ADD	R2, R13, R12 // R12 for block end

loop:
	MOVD	$NUM_ROUNDS, R21
	// V30 and V31 are overwritten with nonce words further down, so they are
	// reloaded at the top of every iteration.
	VLD1	(R11), [V30.S4, V31.S4]

	// load constants
	// VLD4R (R10), [V0.S4, V1.S4, V2.S4, V3.S4]
	WORD	$0x4D60E940

	// load keys
	// The two encodings below are the post-indexed form: each one advances R4
	// by 16, which is why R4 is rewound by 32 afterwards.
	// VLD4R 16(R4), [V4.S4, V5.S4, V6.S4, V7.S4]
	WORD	$0x4DFFE884
	// VLD4R 16(R4), [V8.S4, V9.S4, V10.S4, V11.S4]
	WORD	$0x4DFFE888
	SUB	$32, R4

	// load counter + nonce
	// VLD1R (R7), [V12.S4]
	WORD	$0x4D40C8EC

	// VLD3R (R6), [V13.S4, V14.S4, V15.S4]
	WORD	$0x4D40E8CD

	// update counter: give each lane its own block number
	VADD	V30.S4, V12.S4, V12.S4

chacha:
	// ---- column round ----

	// V0..V3 += V4..V7
	// V12..V15 <<<= ((V12..V15 XOR V0..V3), 16)
	VADD	V0.S4, V4.S4, V0.S4
	VADD	V1.S4, V5.S4, V1.S4
	VADD	V2.S4, V6.S4, V2.S4
	VADD	V3.S4, V7.S4, V3.S4
	VEOR	V12.B16, V0.B16, V12.B16
	VEOR	V13.B16, V1.B16, V13.B16
	VEOR	V14.B16, V2.B16, V14.B16
	VEOR	V15.B16, V3.B16, V15.B16
	// Swapping the two halfwords of each word is a rotate by 16.
	VREV32	V12.H8, V12.H8
	VREV32	V13.H8, V13.H8
	VREV32	V14.H8, V14.H8
	VREV32	V15.H8, V15.H8

	// V8..V11 += V12..V15
	// V4..V7 <<<= ((V4..V7 XOR V8..V11), 12)
	VADD	V8.S4, V12.S4, V8.S4
	VADD	V9.S4, V13.S4, V9.S4
	VADD	V10.S4, V14.S4, V10.S4
	VADD	V11.S4, V15.S4, V11.S4
	VEOR	V8.B16, V4.B16, V16.B16
	VEOR	V9.B16, V5.B16, V17.B16
	VEOR	V10.B16, V6.B16, V18.B16
	VEOR	V11.B16, V7.B16, V19.B16
	// Shift left, then shift-right-and-insert the bits that fell off.
	VSHL	$12, V16.S4, V4.S4
	VSHL	$12, V17.S4, V5.S4
	VSHL	$12, V18.S4, V6.S4
	VSHL	$12, V19.S4, V7.S4
	VSRI	$20, V16.S4, V4.S4
	VSRI	$20, V17.S4, V5.S4
	VSRI	$20, V18.S4, V6.S4
	VSRI	$20, V19.S4, V7.S4

	// V0..V3 += V4..V7
	// V12..V15 <<<= ((V12..V15 XOR V0..V3), 8)
	VADD	V0.S4, V4.S4, V0.S4
	VADD	V1.S4, V5.S4, V1.S4
	VADD	V2.S4, V6.S4, V2.S4
	VADD	V3.S4, V7.S4, V3.S4
	VEOR	V12.B16, V0.B16, V12.B16
	VEOR	V13.B16, V1.B16, V13.B16
	VEOR	V14.B16, V2.B16, V14.B16
	VEOR	V15.B16, V3.B16, V15.B16
	// Byte permutation through V31 performs the rotate by 8.
	VTBL	V31.B16, [V12.B16], V12.B16
	VTBL	V31.B16, [V13.B16], V13.B16
	VTBL	V31.B16, [V14.B16], V14.B16
	VTBL	V31.B16, [V15.B16], V15.B16

	// V8..V11 += V12..V15
	// V4..V7 <<<= ((V4..V7 XOR V8..V11), 7)
	VADD	V12.S4, V8.S4, V8.S4
	VADD	V13.S4, V9.S4, V9.S4
	VADD	V14.S4, V10.S4, V10.S4
	VADD	V15.S4, V11.S4, V11.S4
	VEOR	V8.B16, V4.B16, V16.B16
	VEOR	V9.B16, V5.B16, V17.B16
	VEOR	V10.B16, V6.B16, V18.B16
	VEOR	V11.B16, V7.B16, V19.B16
	VSHL	$7, V16.S4, V4.S4
	VSHL	$7, V17.S4, V5.S4
	VSHL	$7, V18.S4, V6.S4
	VSHL	$7, V19.S4, V7.S4
	VSRI	$25, V16.S4, V4.S4
	VSRI	$25, V17.S4, V5.S4
	VSRI	$25, V18.S4, V6.S4
	VSRI	$25, V19.S4, V7.S4

	// ---- diagonal round ----
	// Same four steps, with the second, third and fourth register groups
	// indexed one, two and three positions further along (mod 4).

	// V0..V3 += V5..V7, V4
	// V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16)
	VADD	V0.S4, V5.S4, V0.S4
	VADD	V1.S4, V6.S4, V1.S4
	VADD	V2.S4, V7.S4, V2.S4
	VADD	V3.S4, V4.S4, V3.S4
	VEOR	V15.B16, V0.B16, V15.B16
	VEOR	V12.B16, V1.B16, V12.B16
	VEOR	V13.B16, V2.B16, V13.B16
	VEOR	V14.B16, V3.B16, V14.B16
	VREV32	V12.H8, V12.H8
	VREV32	V13.H8, V13.H8
	VREV32	V14.H8, V14.H8
	VREV32	V15.H8, V15.H8

	// V10 += V15; V5 <<<= ((V10 XOR V5), 12)
	// ...
	VADD	V15.S4, V10.S4, V10.S4
	VADD	V12.S4, V11.S4, V11.S4
	VADD	V13.S4, V8.S4, V8.S4
	VADD	V14.S4, V9.S4, V9.S4
	VEOR	V10.B16, V5.B16, V16.B16
	VEOR	V11.B16, V6.B16, V17.B16
	VEOR	V8.B16, V7.B16, V18.B16
	VEOR	V9.B16, V4.B16, V19.B16
	VSHL	$12, V16.S4, V5.S4
	VSHL	$12, V17.S4, V6.S4
	VSHL	$12, V18.S4, V7.S4
	VSHL	$12, V19.S4, V4.S4
	VSRI	$20, V16.S4, V5.S4
	VSRI	$20, V17.S4, V6.S4
	VSRI	$20, V18.S4, V7.S4
	VSRI	$20, V19.S4, V4.S4

	// V0 += V5; V15 <<<= ((V0 XOR V15), 8)
	// ...
	VADD	V5.S4, V0.S4, V0.S4
	VADD	V6.S4, V1.S4, V1.S4
	VADD	V7.S4, V2.S4, V2.S4
	VADD	V4.S4, V3.S4, V3.S4
	VEOR	V0.B16, V15.B16, V15.B16
	VEOR	V1.B16, V12.B16, V12.B16
	VEOR	V2.B16, V13.B16, V13.B16
	VEOR	V3.B16, V14.B16, V14.B16
	VTBL	V31.B16, [V12.B16], V12.B16
	VTBL	V31.B16, [V13.B16], V13.B16
	VTBL	V31.B16, [V14.B16], V14.B16
	VTBL	V31.B16, [V15.B16], V15.B16

	// V10 += V15; V5 <<<= ((V10 XOR V5), 7)
	// ...
	VADD	V15.S4, V10.S4, V10.S4
	VADD	V12.S4, V11.S4, V11.S4
	VADD	V13.S4, V8.S4, V8.S4
	VADD	V14.S4, V9.S4, V9.S4
	VEOR	V10.B16, V5.B16, V16.B16
	VEOR	V11.B16, V6.B16, V17.B16
	VEOR	V8.B16, V7.B16, V18.B16
	VEOR	V9.B16, V4.B16, V19.B16
	VSHL	$7, V16.S4, V5.S4
	VSHL	$7, V17.S4, V6.S4
	VSHL	$7, V18.S4, V7.S4
	VSHL	$7, V19.S4, V4.S4
	VSRI	$25, V16.S4, V5.S4
	VSRI	$25, V17.S4, V6.S4
	VSRI	$25, V18.S4, V7.S4
	VSRI	$25, V19.S4, V4.S4

	SUB	$1, R21
	CBNZ	R21, chacha

	// ---- add the input state back into the permuted state ----

	// VLD4R (R10), [V16.S4, V17.S4, V18.S4, V19.S4]
	WORD	$0x4D60E950

	// VLD4R 16(R4), [V20.S4, V21.S4, V22.S4, V23.S4]
	WORD	$0x4DFFE894
	// V30 still holds the lane offsets here; it must be consumed before the
	// nonce load below overwrites it.
	VADD	V30.S4, V12.S4, V12.S4
	VADD	V16.S4, V0.S4, V0.S4
	VADD	V17.S4, V1.S4, V1.S4
	VADD	V18.S4, V2.S4, V2.S4
	VADD	V19.S4, V3.S4, V3.S4
	// VLD4R 16(R4), [V24.S4, V25.S4, V26.S4, V27.S4]
	WORD	$0x4DFFE898
	// restore R4
	SUB	$32, R4

	// load counter + nonce
	// VLD1R (R7), [V28.S4]
	WORD	$0x4D40C8FC
	// VLD3R (R6), [V29.S4, V30.S4, V31.S4]
	WORD	$0x4D40E8DD

	VADD	V20.S4, V4.S4, V4.S4
	VADD	V21.S4, V5.S4, V5.S4
	VADD	V22.S4, V6.S4, V6.S4
	VADD	V23.S4, V7.S4, V7.S4
	VADD	V24.S4, V8.S4, V8.S4
	VADD	V25.S4, V9.S4, V9.S4
	VADD	V26.S4, V10.S4, V10.S4
	VADD	V27.S4, V11.S4, V11.S4
	VADD	V28.S4, V12.S4, V12.S4
	VADD	V29.S4, V13.S4, V13.S4
	VADD	V30.S4, V14.S4, V14.S4
	VADD	V31.S4, V15.S4, V15.S4

	// ---- transpose from word-per-register to block-contiguous order ----
	// First interleave 32-bit lanes of adjacent register pairs ...
	VZIP1	V1.S4, V0.S4, V16.S4
	VZIP2	V1.S4, V0.S4, V17.S4
	VZIP1	V3.S4, V2.S4, V18.S4
	VZIP2	V3.S4, V2.S4, V19.S4
	VZIP1	V5.S4, V4.S4, V20.S4
	VZIP2	V5.S4, V4.S4, V21.S4
	VZIP1	V7.S4, V6.S4, V22.S4
	VZIP2	V7.S4, V6.S4, V23.S4
	VZIP1	V9.S4, V8.S4, V24.S4
	VZIP2	V9.S4, V8.S4, V25.S4
	VZIP1	V11.S4, V10.S4, V26.S4
	VZIP2	V11.S4, V10.S4, V27.S4
	VZIP1	V13.S4, V12.S4, V28.S4
	VZIP2	V13.S4, V12.S4, V29.S4
	VZIP1	V15.S4, V14.S4, V30.S4
	VZIP2	V15.S4, V14.S4, V31.S4
	// ... then interleave 64-bit lanes. Each group of temporaries is reused
	// as the destination of a 64-byte src load as soon as it has been consumed.
	VZIP1	V18.D2, V16.D2, V0.D2
	VZIP2	V18.D2, V16.D2, V4.D2
	VZIP1	V19.D2, V17.D2, V8.D2
	VZIP2	V19.D2, V17.D2, V12.D2
	VLD1.P	64(R2), [V16.B16, V17.B16, V18.B16, V19.B16]

	VZIP1	V22.D2, V20.D2, V1.D2
	VZIP2	V22.D2, V20.D2, V5.D2
	VZIP1	V23.D2, V21.D2, V9.D2
	VZIP2	V23.D2, V21.D2, V13.D2
	VLD1.P	64(R2), [V20.B16, V21.B16, V22.B16, V23.B16]
	VZIP1	V26.D2, V24.D2, V2.D2
	VZIP2	V26.D2, V24.D2, V6.D2
	VZIP1	V27.D2, V25.D2, V10.D2
	VZIP2	V27.D2, V25.D2, V14.D2
	VLD1.P	64(R2), [V24.B16, V25.B16, V26.B16, V27.B16]
	VZIP1	V30.D2, V28.D2, V3.D2
	VZIP2	V30.D2, V28.D2, V7.D2
	VZIP1	V31.D2, V29.D2, V11.D2
	VZIP2	V31.D2, V29.D2, V15.D2
	VLD1.P	64(R2), [V28.B16, V29.B16, V30.B16, V31.B16]

	// ---- XOR keystream into the loaded src and store to dst ----
	VEOR	V0.B16, V16.B16, V16.B16
	VEOR	V1.B16, V17.B16, V17.B16
	VEOR	V2.B16, V18.B16, V18.B16
	VEOR	V3.B16, V19.B16, V19.B16
	VST1.P	[V16.B16, V17.B16, V18.B16, V19.B16], 64(R1)
	VEOR	V4.B16, V20.B16, V20.B16
	VEOR	V5.B16, V21.B16, V21.B16
	VEOR	V6.B16, V22.B16, V22.B16
	VEOR	V7.B16, V23.B16, V23.B16
	VST1.P	[V20.B16, V21.B16, V22.B16, V23.B16], 64(R1)
	VEOR	V8.B16, V24.B16, V24.B16
	VEOR	V9.B16, V25.B16, V25.B16
	VEOR	V10.B16, V26.B16, V26.B16
	VEOR	V11.B16, V27.B16, V27.B16
	VST1.P	[V24.B16, V25.B16, V26.B16, V27.B16], 64(R1)
	VEOR	V12.B16, V28.B16, V28.B16
	VEOR	V13.B16, V29.B16, V29.B16
	VEOR	V14.B16, V30.B16, V30.B16
	VEOR	V15.B16, V31.B16, V31.B16
	VST1.P	[V28.B16, V29.B16, V30.B16, V31.B16], 64(R1)

	// Four blocks were consumed. The store to memory is required: the next
	// iteration re-reads the counter through R7.
	ADD	$4, R20
	MOVW	R20, (R7) // update counter

	// Continue while the src cursor is below the end address. These are
	// addresses, so use an unsigned condition.
	CMP	R2, R12
	BHI	loop

done:
	RET


DATA	·constants+0x00(SB)/4, $0x61707865
DATA	·constants+0x04(SB)/4, $0x3320646e
DATA	·constants+0x08(SB)/4, $0x79622d32
DATA	·constants+0x0c(SB)/4, $0x6b206574
GLOBL	·constants(SB), NOPTR|RODATA, $32

// First 16 bytes: per-lane counter offsets {0,1,2,3}, loaded into V30.
// Last 16 bytes: VTBL index vector loaded into V31.
DATA	·incRotMatrix+0x00(SB)/4, $0x00000000
DATA	·incRotMatrix+0x04(SB)/4, $0x00000001
DATA	·incRotMatrix+0x08(SB)/4, $0x00000002
DATA	·incRotMatrix+0x0c(SB)/4, $0x00000003
DATA	·incRotMatrix+0x10(SB)/4, $0x02010003
DATA	·incRotMatrix+0x14(SB)/4, $0x06050407
DATA	·incRotMatrix+0x18(SB)/4, $0x0A09080B
DATA	·incRotMatrix+0x1c(SB)/4, $0x0E0D0C0F
GLOBL	·incRotMatrix(SB), NOPTR|RODATA, $32