/*	$NetBSD: chacha_neon_64.S,v 1.7 2020/09/07 18:05:17 jakllsch Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <aarch64/asm.h>

RCSID("$NetBSD: chacha_neon_64.S,v 1.7 2020/09/07 18:05:17 jakllsch Exp $")

#define	ROUND(a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r) \
STEP(STEP0,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP1,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP2,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP3,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP4,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP5,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP6,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP7,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP8,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP9,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP10,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP11,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP12,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP13,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP14,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP15,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP16,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP17,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP18,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP19,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
/* end ROUND */

#define	STEP(f,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3,r) \
	f(a0,b0,c0,d0, t0, r); \
	f(a1,b1,c1,d1, t1, r); \
	f(a2,b2,c2,d2, t2, r); \
	f(a3,b3,c3,d3, t3, r); \
	/* end of STEP */

/*
 * Each step of the ChaCha quarterround, split up so we can interleave
 * the quarterrounds on independent rows/diagonals to maximize pipeline
 * efficiency.  Reference:
 *
 *	Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop
 *	Record of the State of the Art in Stream Ciphers -- SASC 2008.
 *	https://cr.yp.to/papers.html#chacha
 *
 *	a += b; d ^= a; d <<<= 16;
 *	c += d; b ^= c; b <<<= 12;
 *	a += b; d ^= a; d <<<= 8;
 *	c += d; b ^= c; b <<<= 7;
 *
 * The 32-bit rotations are implemented with:
 *	<<< 16	REV32 Vn.8h (swap halfwords within each word)
 *	<<< 12	SHL/SRI (shift left; shift right and insert -- the
 *		SHL/USHR/ORR variant is kept under #if 0)
 *	<<<  8	TBL (general byte permutation; the rot8 table below,
 *		passed in r)
 *	<<<  7	SHL/SRI
 */
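/*
 * For reference, a minimal C sketch of the scalar quarterround that
 * the STEP0..STEP19 macros below vectorize (illustrative only; the
 * names rol32/quarterround are not defined anywhere in this file):
 *
 *	#include <stdint.h>
 *
 *	static inline uint32_t
 *	rol32(uint32_t x, unsigned c)
 *	{
 *		return (x << c) | (x >> (32 - c));
 *	}
 *
 *	static inline void
 *	quarterround(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
 *	{
 *		*a += *b; *d ^= *a; *d = rol32(*d, 16);
 *		*c += *d; *b ^= *c; *b = rol32(*b, 12);
 *		*a += *b; *d ^= *a; *d = rol32(*d, 8);
 *		*c += *d; *b ^= *c; *b = rol32(*b, 7);
 *	}
 *
 * Below, each vector register holds the same state word of four
 * consecutive blocks, one block per 32-bit lane, so a single ROUND
 * performs the four column (or diagonal) quarterrounds of four blocks
 * at once.
 */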
#define	STEP0(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
#define	STEP1(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
#if 0
#define	STEP2(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #16
#define	STEP3(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 16)
#define	STEP4(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
#else
#define	STEP2(a,b,c,d, t, r)	rev32	d##.8h, d##.8h
#define	STEP3(a,b,c,d, t, r)	/* nothing */
#define	STEP4(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP5(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
#if 0
#define	STEP6(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
#define	STEP7(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #12
#define	STEP8(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 12)
#define	STEP9(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
#else
#define	STEP6(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
#define	STEP7(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #12
#define	STEP8(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 12)
#define	STEP9(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP10(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
#define	STEP11(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
#if 0
#define	STEP12(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #8
#define	STEP13(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 8)
#define	STEP14(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
#else
#define	STEP12(a,b,c,d, t, r)	tbl	d##.16b, {d##.16b}, r##.16b
#define	STEP13(a,b,c,d, t, r)	/* nothing */
#define	STEP14(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP15(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
#if 0
#define	STEP16(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
#define	STEP17(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #7
#define	STEP18(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 7)
#define	STEP19(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
#else
#define	STEP16(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
#define	STEP17(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #7
#define	STEP18(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 7)
#define	STEP19(a,b,c,d, t, r)	/* nothing */
#endif

#if defined(__AARCH64EB__)
#define	HTOLE32(x)	rev32	x, x
#define	LE32TOH(x)	rev32	x, x
#else
#define	LE32TOH(x)
#define	HTOLE32(x)
#endif

/*
 * chacha_stream256_neon(uint8_t s[256]@x0,
 *     uint32_t blkno@w1,
 *     const uint8_t nonce[12]@x2,
 *     const uint8_t key[32]@x3,
 *     const uint8_t const[16]@x4,
 *     unsigned nr@w5)
 */
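/*
 * Illustrative call (a sketch only; "sigma" is the standard ChaCha
 * constant and nr = 20 selects ChaCha20 -- none of these names are
 * defined in this file):
 *
 *	static const uint8_t sigma[16] = "expand 32-byte k";
 *	uint8_t buf[256];
 *
 *	chacha_stream256_neon(buf, blkno, nonce, key, sigma, 20);
 *
 * fills buf with the keystream of blocks blkno, blkno+1, blkno+2, and
 * blkno+3 (64 bytes each) for the given 32-byte key and 12-byte nonce.
 */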
ENTRY(chacha_stream256_neon)
	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
	mov	fp, sp

	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
	stp	d10, d11, [sp, #0x20]
	stp	d12, d13, [sp, #0x30]
	stp	d14, d15, [sp, #0x40]

	adrl	x9, v0123		/* x9 := &v0123 */
	mov	x10, x4			/* x10 := c */
	mov	x11, x3			/* x11 := k */
	add	x12, x3, #16		/* x12 := k + 16 (second half of key) */
	mov	x13, x2			/* x13 := nonce */

	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
	dup	v12.4s, w1		/* v12 := (blkno, blkno, blkno, blkno) */
	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */

	LE32TOH(v0.16b)
	LE32TOH(v1.16b)
	LE32TOH(v2.16b)
	LE32TOH(v3.16b)
	LE32TOH(v4.16b)
	LE32TOH(v5.16b)
	LE32TOH(v6.16b)
	LE32TOH(v7.16b)
	LE32TOH(v8.16b)
	LE32TOH(v9.16b)
	LE32TOH(v10.16b)
	LE32TOH(v11.16b)
	/* LE32TOH(v12.16b) -- blkno, already host order */
	LE32TOH(v13.16b)
	LE32TOH(v14.16b)
	LE32TOH(v15.16b)

	mov	v16.16b, v0.16b
	mov	v17.16b, v1.16b
	mov	v18.16b, v2.16b
	mov	v19.16b, v3.16b
	mov	v20.16b, v4.16b
	mov	v21.16b, v5.16b
	mov	v22.16b, v6.16b
	mov	v23.16b, v7.16b
	mov	v24.16b, v8.16b
	mov	v25.16b, v9.16b
	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
	mov	w8, v10.s[0]		/* v27-v31 needed as temporaries */
	mov	w9, v11.s[0]
	mov	w10, v13.s[0]
	mov	w11, v14.s[0]
	mov	w12, v15.s[0]

	_ALIGN_TEXT
1:	subs	w5, w5, #2
	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
	    v28,v29,v30,v31, v27)
	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
	    v28,v29,v30,v31, v27)
	b.ne	1b

	dup	v27.4s, w8
	dup	v28.4s, w9
	dup	v29.4s, w10
	dup	v30.4s, w11
	dup	v31.4s, w12

	add	v0.4s, v0.4s, v16.4s
	add	v1.4s, v1.4s, v17.4s
	add	v2.4s, v2.4s, v18.4s
	add	v3.4s, v3.4s, v19.4s
	add	v4.4s, v4.4s, v20.4s
	add	v5.4s, v5.4s, v21.4s
	add	v6.4s, v6.4s, v22.4s
	add	v7.4s, v7.4s, v23.4s
	add	v8.4s, v8.4s, v24.4s
	add	v9.4s, v9.4s, v25.4s
	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
	add	v11.4s, v11.4s, v28.4s
	add	v12.4s, v12.4s, v26.4s
	add	v13.4s, v13.4s, v29.4s
	add	v14.4s, v14.4s, v30.4s
	add	v15.4s, v15.4s, v31.4s

	HTOLE32(v0.16b)
	HTOLE32(v1.16b)
	HTOLE32(v2.16b)
	HTOLE32(v3.16b)
	HTOLE32(v4.16b)
	HTOLE32(v5.16b)
	HTOLE32(v6.16b)
	HTOLE32(v7.16b)
	HTOLE32(v8.16b)
	HTOLE32(v9.16b)
	HTOLE32(v10.16b)
	HTOLE32(v11.16b)
	HTOLE32(v12.16b)
	HTOLE32(v13.16b)
	HTOLE32(v14.16b)
	HTOLE32(v15.16b)

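	/*
	 * The sixteen ST4-into-lane stores below write the four
	 * 64-byte blocks out in order: lane i of v0-v15 is the state
	 * of block i.  In C terms this is roughly (a sketch: v[j][i]
	 * models lane i of register vj, little-endian host assumed --
	 * the HTOLE32 above covers the big-endian case):
	 *
	 *	uint32_t v[16][4];
	 *	for (unsigned i = 0; i < 4; i++)	// block = lane
	 *		for (unsigned j = 0; j < 16; j++)	// state word
	 *			memcpy(s + 64*i + 4*j, &v[j][i], 4);
	 */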
	st4	{ v0.s, v1.s, v2.s, v3.s}[0], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[0], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[0], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[0], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[1], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[1], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[1], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[1], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[2], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[2], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[2], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[2], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[3], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[3], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[3], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[3], [x0], #16

	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
	ldp	d10, d11, [sp, #0x20]
	ldp	d12, d13, [sp, #0x30]
	ldp	d14, d15, [sp, #0x40]

	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
	ret
END(chacha_stream256_neon)

/*
 * chacha_stream_xor256_neon(uint8_t s[256]@x0, const uint8_t p[256]@x1,
 *     uint32_t blkno@w2,
 *     const uint8_t nonce[12]@x3,
 *     const uint8_t key[32]@x4,
 *     const uint8_t const[16]@x5,
 *     unsigned nr@w6)
 */
ENTRY(chacha_stream_xor256_neon)
	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
	mov	fp, sp

	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
	stp	d10, d11, [sp, #0x20]
	stp	d12, d13, [sp, #0x30]
	stp	d14, d15, [sp, #0x40]

	adrl	x9, v0123		/* x9 := &v0123 */
	mov	x10, x5			/* x10 := c */
	mov	x11, x4			/* x11 := k */
	add	x12, x4, #16		/* x12 := k + 16 (second half of key) */
	mov	x13, x3			/* x13 := nonce */

	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
	dup	v12.4s, w2		/* v12 := (blkno, blkno, blkno, blkno) */
	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */

	LE32TOH(v0.16b)
	LE32TOH(v1.16b)
	LE32TOH(v2.16b)
	LE32TOH(v3.16b)
	LE32TOH(v4.16b)
	LE32TOH(v5.16b)
	LE32TOH(v6.16b)
	LE32TOH(v7.16b)
	LE32TOH(v8.16b)
	LE32TOH(v9.16b)
	LE32TOH(v10.16b)
	LE32TOH(v11.16b)
	/* LE32TOH(v12.16b) -- blkno, already host order */
	LE32TOH(v13.16b)
	LE32TOH(v14.16b)
	LE32TOH(v15.16b)

	mov	v16.16b, v0.16b
	mov	v17.16b, v1.16b
	mov	v18.16b, v2.16b
	mov	v19.16b, v3.16b
	mov	v20.16b, v4.16b
	mov	v21.16b, v5.16b
	mov	v22.16b, v6.16b
	mov	v23.16b, v7.16b
	mov	v24.16b, v8.16b
	mov	v25.16b, v9.16b
	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
	mov	w8, v10.s[0]		/* v27-v31 needed as temporaries */
	mov	w9, v11.s[0]
	mov	w10, v13.s[0]
	mov	w11, v14.s[0]
	mov	w12, v15.s[0]

	_ALIGN_TEXT
1:	subs	w6, w6, #2
	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
	    v28,v29,v30,v31, v27)
	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
	    v28,v29,v30,v31, v27)
	b.ne	1b

	dup	v27.4s, w8
	dup	v28.4s, w9
	dup	v29.4s, w10
	dup	v30.4s, w11
	dup	v31.4s, w12

	add	v0.4s, v0.4s, v16.4s
	add	v1.4s, v1.4s, v17.4s
	add	v2.4s, v2.4s, v18.4s
	add	v3.4s, v3.4s, v19.4s
	add	v4.4s, v4.4s, v20.4s
	add	v5.4s, v5.4s, v21.4s
	add	v6.4s, v6.4s, v22.4s
	add	v7.4s, v7.4s, v23.4s
	add	v8.4s, v8.4s, v24.4s
	add	v9.4s, v9.4s, v25.4s
	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
	add	v11.4s, v11.4s, v28.4s
	add	v12.4s, v12.4s, v26.4s
	add	v13.4s, v13.4s, v29.4s
	add	v14.4s, v14.4s, v30.4s
	add	v15.4s, v15.4s, v31.4s

	/*
	 * We could do these sixteen LD4-into-lane instructions instead
	 * by four LD1-into-register instructions, but we would need to
	 * permute the elements in v0-v15 to put them in the right
	 * order.  We can do that by a series of ZIP1/ZIP2 on 4s-sized
	 * elements, and then ZIP1/ZIP2 on 2d-sized elements, but the
	 * net cost of the thirty-two ZIP1/ZIP2 instructions seems to
	 * exceed the savings from using four LD1 instructions rather
	 * than sixteen LD4 instructions, even if we interleave the LD1
	 * instructions with the ZIPs.
	 */
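	/*
	 * In C terms, the LD4/EOR/ST4 sequence below amounts to (a
	 * sketch; "keystream" names the same 256 bytes that
	 * chacha_stream256_neon produces for these inputs):
	 *
	 *	for (unsigned k = 0; k < 256; k++)
	 *		s[k] = p[k] ^ keystream[k];
	 */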
	ld4	{v16.s,v17.s,v18.s,v19.s}[0], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[0], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[0], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[0], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[1], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[1], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[1], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[1], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[2], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[2], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[2], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[2], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[3], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[3], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[3], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[3], [x1], #16

	HTOLE32(v0.16b)
	HTOLE32(v1.16b)
	HTOLE32(v2.16b)
	HTOLE32(v3.16b)
	HTOLE32(v4.16b)
	HTOLE32(v5.16b)
	HTOLE32(v6.16b)
	HTOLE32(v7.16b)
	HTOLE32(v8.16b)
	HTOLE32(v9.16b)
	HTOLE32(v10.16b)
	HTOLE32(v11.16b)
	HTOLE32(v12.16b)
	HTOLE32(v13.16b)
	HTOLE32(v14.16b)
	HTOLE32(v15.16b)

	eor	v16.16b, v16.16b, v0.16b
	eor	v17.16b, v17.16b, v1.16b
	eor	v18.16b, v18.16b, v2.16b
	eor	v19.16b, v19.16b, v3.16b
	eor	v20.16b, v20.16b, v4.16b
	eor	v21.16b, v21.16b, v5.16b
	eor	v22.16b, v22.16b, v6.16b
	eor	v23.16b, v23.16b, v7.16b
	eor	v24.16b, v24.16b, v8.16b
	eor	v25.16b, v25.16b, v9.16b
	eor	v26.16b, v26.16b, v10.16b
	eor	v27.16b, v27.16b, v11.16b
	eor	v28.16b, v28.16b, v12.16b
	eor	v29.16b, v29.16b, v13.16b
	eor	v30.16b, v30.16b, v14.16b
	eor	v31.16b, v31.16b, v15.16b

	st4	{v16.s,v17.s,v18.s,v19.s}[0], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[0], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[0], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[0], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[1], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[1], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[1], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[1], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[2], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[2], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[2], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[2], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[3], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[3], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[3], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[3], [x0], #16

	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
	ldp	d10, d11, [sp, #0x20]
	ldp	d12, d13, [sp, #0x30]
	ldp	d14, d15, [sp, #0x40]

	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
	ret
END(chacha_stream_xor256_neon)

	.section .rodata
	.p2align 4

	.type	v0123,@object
v0123:
	.long	0, 1, 2, 3
END(v0123)

	/*
	 * Must be immediately after v0123 -- we load them in a single
	 * ld1 instruction.
	 */
	.type	rot8,@object
rot8:
	.long	0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
END(rot8)
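/*
 * Worked check of rot8 (not referenced by the code): within each
 * 32-bit lane the TBL indices are 3,0,1,2 (each subsequent lane adds
 * 4), so byte 0 of the result comes from byte 3 of the source, byte 1
 * from byte 0, byte 2 from byte 1, and byte 3 from byte 2.  That is
 * exactly
 *
 *	rol8(x) = (x << 8) | (x >> 24)
 *
 * applied to each 32-bit element, i.e. the d <<<= 8 step of the
 * quarterround.
 */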