/*	$NetBSD: chacha_neon_64.S,v 1.7 2020/09/07 18:05:17 jakllsch Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <aarch64/asm.h>

RCSID("$NetBSD: chacha_neon_64.S,v 1.7 2020/09/07 18:05:17 jakllsch Exp $")

#define	ROUND(a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r) \
STEP(STEP0,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP1,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP2,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP3,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP4,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP5,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP6,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP7,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP8,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP9,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP10,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP11,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP12,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP13,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP14,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP15,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP16,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP17,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP18,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
STEP(STEP19,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \
/* end ROUND */

#define	STEP(f,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3,r) \
	f(a0,b0,c0,d0, t0, r);	\
	f(a1,b1,c1,d1, t1, r);	\
	f(a2,b2,c2,d2, t2, r);	\
	f(a3,b3,c3,d3, t3, r);	\
	/* end of STEP */
/*
 * Each step of the ChaCha quarterround, split up so we can interleave
 * the quarterrounds on independent rows/diagonals to maximize pipeline
 * efficiency.  Reference:
 *
 *	Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop
 *	Record of the State of the Art in Stream Ciphers -- SASC 2008.
 *	https://cr.yp.to/papers.html#chacha
 *
 *	a += b; d ^= a; d <<<= 16;
 *	c += d; b ^= c; b <<<= 12;
 *	a += b; d ^= a; d <<<= 8;
 *	c += d; b ^= c; b <<<= 7;
 *
 * The rotations are implemented with:
 *	<<< 16		REV32 Vn.8h (swap the 16-bit halves of each word)
 *	<<< 12		SHL/SRI (shift left; shift right and insert)
 *	<<< 8		TBL (general byte permutation; rot8 below stored in r)
 *	<<< 7		SHL/SRI
 */
#define	STEP0(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
#define	STEP1(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
#if 0
#define	STEP2(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #16
#define	STEP3(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 16)
#define	STEP4(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
#else
#define	STEP2(a,b,c,d, t, r)	rev32	d##.8h, d##.8h
#define	STEP3(a,b,c,d, t, r)	/* nothing */
#define	STEP4(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP5(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
#if 0
#define	STEP6(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
#define	STEP7(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #12
#define	STEP8(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 12)
#define	STEP9(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
#else
#define	STEP6(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
#define	STEP7(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #12
#define	STEP8(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 12)
#define	STEP9(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP10(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
#define	STEP11(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
#if 0
#define	STEP12(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #8
#define	STEP13(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 8)
#define	STEP14(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
#else
#define	STEP12(a,b,c,d, t, r)	tbl	d##.16b, {d##.16b}, r##.16b
#define	STEP13(a,b,c,d, t, r)	/* nothing */
#define	STEP14(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP15(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
#if 0
#define	STEP16(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
#define	STEP17(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #7
#define	STEP18(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 7)
#define	STEP19(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
#else
#define	STEP16(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
#define	STEP17(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #7
#define	STEP18(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 7)
#define	STEP19(a,b,c,d, t, r)	/* nothing */
#endif
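
/*
 * For reference, a minimal C sketch of the quarterround that STEP0-19
 * compute (not part of the build; rotl32 and quarterround are
 * illustrative names, not NetBSD API).  Each STEPn above is one
 * operation of this sequence, applied to four independent
 * quarterrounds before moving on to the next operation:
 *
 *	static inline uint32_t
 *	rotl32(uint32_t x, int n)
 *	{
 *		return (x << n) | (x >> (32 - n));
 *	}
 *
 *	static inline void
 *	quarterround(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
 *	{
 *		*a += *b; *d ^= *a; *d = rotl32(*d, 16);
 *		*c += *d; *b ^= *c; *b = rotl32(*b, 12);
 *		*a += *b; *d ^= *a; *d = rotl32(*d, 8);
 *		*c += *d; *b ^= *c; *b = rotl32(*b, 7);
 *	}
 */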

#if defined(__AARCH64EB__)
#define	HTOLE32(x)	rev32	x, x
#define	LE32TOH(x)	rev32	x, x
#else
#define	LE32TOH(x)
#define	HTOLE32(x)
#endif

/*
 * chacha_stream256_neon(uint8_t s[256]@x0,
 *     uint32_t blkno@w1,
 *     const uint8_t nonce[12]@x2,
 *     const uint8_t key[32]@x3,
 *     const uint8_t const[16]@x4,
 *     unsigned nr@w5)
 */
ENTRY(chacha_stream256_neon)
	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
	mov	fp, sp

	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
	stp	d10, d11, [sp, #0x20]
	stp	d12, d13, [sp, #0x30]
	stp	d14, d15, [sp, #0x40]

	adrl	x9, v0123		/* x9 := &v0123 */
	mov	x10, x4			/* x10 := c */
	mov	x11, x3			/* x11 := k */
	add	x12, x3, #16		/* x12 := k + 16 */
	mov	x13, x2			/* x13 := nonce */

	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
	dup	v12.4s, w1		/* v12 := (blkno, blkno, blkno, blkno) */
	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */

	LE32TOH(v0.16b)
	LE32TOH(v1.16b)
	LE32TOH(v2.16b)
	LE32TOH(v3.16b)
	LE32TOH(v4.16b)
	LE32TOH(v5.16b)
	LE32TOH(v6.16b)
	LE32TOH(v7.16b)
	LE32TOH(v8.16b)
	LE32TOH(v9.16b)
	LE32TOH(v10.16b)
	LE32TOH(v11.16b)
	/* LE32TOH(v12.16b) -- blkno, already host order */
	LE32TOH(v13.16b)
	LE32TOH(v14.16b)
	LE32TOH(v15.16b)

	mov	v16.16b, v0.16b
	mov	v17.16b, v1.16b
	mov	v18.16b, v2.16b
	mov	v19.16b, v3.16b
	mov	v20.16b, v4.16b
	mov	v21.16b, v5.16b
	mov	v22.16b, v6.16b
	mov	v23.16b, v7.16b
	mov	v24.16b, v8.16b
	mov	v25.16b, v9.16b
	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
	mov	w8, v10.s[0]		/* v27-v31 needed as temporaries */
	mov	w9, v11.s[0]
	mov	w10, v13.s[0]
	mov	w11, v14.s[0]
	mov	w12, v15.s[0]
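
	/*
	 * Each vN.4s holds state word N for four consecutive blocks,
	 * one block per 32-bit lane.  Every loop iteration performs two
	 * of the nr rounds: the first ROUND is the column round,
	 * quarterrounding (v0,v4,v8,v12) et seq., and the second is the
	 * diagonal round, with the b/c/d rows rotated so that
	 * (v0,v5,v10,v15) et seq. form the quarterrounds.
	 */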
	_ALIGN_TEXT
1:	subs	w5, w5, #2
	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
	    v28,v29,v30,v31, v27)
	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
	    v28,v29,v30,v31, v27)
	b.ne	1b

	/* restore the state words parked in w8-w12 for the feedforward */
	dup	v27.4s, w8
	dup	v28.4s, w9
	dup	v29.4s, w10
	dup	v30.4s, w11
	dup	v31.4s, w12

	add	v0.4s, v0.4s, v16.4s
	add	v1.4s, v1.4s, v17.4s
	add	v2.4s, v2.4s, v18.4s
	add	v3.4s, v3.4s, v19.4s
	add	v4.4s, v4.4s, v20.4s
	add	v5.4s, v5.4s, v21.4s
	add	v6.4s, v6.4s, v22.4s
	add	v7.4s, v7.4s, v23.4s
	add	v8.4s, v8.4s, v24.4s
	add	v9.4s, v9.4s, v25.4s
	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
	add	v11.4s, v11.4s, v28.4s
	add	v12.4s, v12.4s, v26.4s
	add	v13.4s, v13.4s, v29.4s
	add	v14.4s, v14.4s, v30.4s
	add	v15.4s, v15.4s, v31.4s

	HTOLE32(v0.16b)
	HTOLE32(v1.16b)
	HTOLE32(v2.16b)
	HTOLE32(v3.16b)
	HTOLE32(v4.16b)
	HTOLE32(v5.16b)
	HTOLE32(v6.16b)
	HTOLE32(v7.16b)
	HTOLE32(v8.16b)
	HTOLE32(v9.16b)
	HTOLE32(v10.16b)
	HTOLE32(v11.16b)
	HTOLE32(v12.16b)
	HTOLE32(v13.16b)
	HTOLE32(v14.16b)
	HTOLE32(v15.16b)

	/*
	 * Transpose back to memory order: storing lane n of each group
	 * of four registers writes block n's sixteen words contiguously.
	 */
	st4	{ v0.s, v1.s, v2.s, v3.s}[0], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[0], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[0], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[0], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[1], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[1], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[1], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[1], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[2], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[2], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[2], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[2], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[3], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[3], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[3], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[3], [x0], #16

	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
	ldp	d10, d11, [sp, #0x20]
	ldp	d12, d13, [sp, #0x30]
	ldp	d14, d15, [sp, #0x40]

	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
	ret
END(chacha_stream256_neon)
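
/*
 * For reference when calling from C, plausible prototypes inferred
 * from the register annotations in the comments above -- the
 * authoritative declarations live in the corresponding header, not in
 * this file:
 *
 *	void chacha_stream256_neon(uint8_t s[256], uint32_t blkno,
 *	    const uint8_t nonce[12], const uint8_t key[32],
 *	    const uint8_t c[16], unsigned nr);
 *
 *	void chacha_stream_xor256_neon(uint8_t s[256],
 *	    const uint8_t p[256], uint32_t blkno,
 *	    const uint8_t nonce[12], const uint8_t key[32],
 *	    const uint8_t c[16], unsigned nr);
 */
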
/*
 * chacha_stream_xor256_neon(uint8_t s[256]@x0, const uint8_t p[256]@x1,
 *     uint32_t blkno@w2,
 *     const uint8_t nonce[12]@x3,
 *     const uint8_t key[32]@x4,
 *     const uint8_t const[16]@x5,
 *     unsigned nr@w6)
 */
ENTRY(chacha_stream_xor256_neon)
	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
	mov	fp, sp

	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
	stp	d10, d11, [sp, #0x20]
	stp	d12, d13, [sp, #0x30]
	stp	d14, d15, [sp, #0x40]

	adrl	x9, v0123		/* x9 := &v0123 */
	mov	x10, x5			/* x10 := c */
	mov	x11, x4			/* x11 := k */
	add	x12, x4, #16		/* x12 := k + 16 */
	mov	x13, x3			/* x13 := nonce */

	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
	dup	v12.4s, w2		/* v12 := (blkno, blkno, blkno, blkno) */
	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */

	LE32TOH(v0.16b)
	LE32TOH(v1.16b)
	LE32TOH(v2.16b)
	LE32TOH(v3.16b)
	LE32TOH(v4.16b)
	LE32TOH(v5.16b)
	LE32TOH(v6.16b)
	LE32TOH(v7.16b)
	LE32TOH(v8.16b)
	LE32TOH(v9.16b)
	LE32TOH(v10.16b)
	LE32TOH(v11.16b)
	/* LE32TOH(v12.16b) -- blkno, already host order */
	LE32TOH(v13.16b)
	LE32TOH(v14.16b)
	LE32TOH(v15.16b)

	mov	v16.16b, v0.16b
	mov	v17.16b, v1.16b
	mov	v18.16b, v2.16b
	mov	v19.16b, v3.16b
	mov	v20.16b, v4.16b
	mov	v21.16b, v5.16b
	mov	v22.16b, v6.16b
	mov	v23.16b, v7.16b
	mov	v24.16b, v8.16b
	mov	v25.16b, v9.16b
	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
	mov	w8, v10.s[0]		/* v27-v31 needed as temporaries */
	mov	w9, v11.s[0]
	mov	w10, v13.s[0]
	mov	w11, v14.s[0]
	mov	w12, v15.s[0]

	_ALIGN_TEXT
1:	subs	w6, w6, #2
	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
	    v28,v29,v30,v31, v27)
	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
	    v28,v29,v30,v31, v27)
	b.ne	1b
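
	/*
	 * Broadcast the five state words parked in w8-w12 (v27-v31 were
	 * needed as temporaries during the rounds) back into vector
	 * registers for the feedforward addition below.
	 */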
	dup	v27.4s, w8
	dup	v28.4s, w9
	dup	v29.4s, w10
	dup	v30.4s, w11
	dup	v31.4s, w12

	add	v0.4s, v0.4s, v16.4s
	add	v1.4s, v1.4s, v17.4s
	add	v2.4s, v2.4s, v18.4s
	add	v3.4s, v3.4s, v19.4s
	add	v4.4s, v4.4s, v20.4s
	add	v5.4s, v5.4s, v21.4s
	add	v6.4s, v6.4s, v22.4s
	add	v7.4s, v7.4s, v23.4s
	add	v8.4s, v8.4s, v24.4s
	add	v9.4s, v9.4s, v25.4s
	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
	add	v11.4s, v11.4s, v28.4s
	add	v12.4s, v12.4s, v26.4s
	add	v13.4s, v13.4s, v29.4s
	add	v14.4s, v14.4s, v30.4s
	add	v15.4s, v15.4s, v31.4s

	/*
	 * We could replace these sixteen LD4-into-lane instructions
	 * with four LD1-into-register instructions, but we would then
	 * need to permute the elements in v0-v15 into the right order.
	 * We can do that with a series of ZIP1/ZIP2 on 4s-sized
	 * elements followed by ZIP1/ZIP2 on 2d-sized elements, but the
	 * net cost of the thirty-two ZIP1/ZIP2 instructions seems to
	 * exceed the savings from issuing four LD1 instructions rather
	 * than sixteen LD4 instructions, even if we interleave the LD1
	 * instructions with the ZIPs.
	 */
	ld4	{v16.s,v17.s,v18.s,v19.s}[0], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[0], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[0], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[0], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[1], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[1], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[1], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[1], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[2], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[2], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[2], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[2], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[3], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[3], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[3], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[3], [x1], #16

	HTOLE32(v0.16b)
	HTOLE32(v1.16b)
	HTOLE32(v2.16b)
	HTOLE32(v3.16b)
	HTOLE32(v4.16b)
	HTOLE32(v5.16b)
	HTOLE32(v6.16b)
	HTOLE32(v7.16b)
	HTOLE32(v8.16b)
	HTOLE32(v9.16b)
	HTOLE32(v10.16b)
	HTOLE32(v11.16b)
	HTOLE32(v12.16b)
	HTOLE32(v13.16b)
	HTOLE32(v14.16b)
	HTOLE32(v15.16b)
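
	/* XOR the keystream in v0-v15 into the plaintext in v16-v31. */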
	eor	v16.16b, v16.16b, v0.16b
	eor	v17.16b, v17.16b, v1.16b
	eor	v18.16b, v18.16b, v2.16b
	eor	v19.16b, v19.16b, v3.16b
	eor	v20.16b, v20.16b, v4.16b
	eor	v21.16b, v21.16b, v5.16b
	eor	v22.16b, v22.16b, v6.16b
	eor	v23.16b, v23.16b, v7.16b
	eor	v24.16b, v24.16b, v8.16b
	eor	v25.16b, v25.16b, v9.16b
	eor	v26.16b, v26.16b, v10.16b
	eor	v27.16b, v27.16b, v11.16b
	eor	v28.16b, v28.16b, v12.16b
	eor	v29.16b, v29.16b, v13.16b
	eor	v30.16b, v30.16b, v14.16b
	eor	v31.16b, v31.16b, v15.16b

	st4	{v16.s,v17.s,v18.s,v19.s}[0], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[0], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[0], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[0], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[1], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[1], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[1], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[1], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[2], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[2], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[2], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[2], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[3], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[3], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[3], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[3], [x0], #16

	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
	ldp	d10, d11, [sp, #0x20]
	ldp	d12, d13, [sp, #0x30]
	ldp	d14, d15, [sp, #0x40]

	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
	ret
END(chacha_stream_xor256_neon)

	.section .rodata
	.p2align 4

	.type	v0123,@object
v0123:
	.long	0, 1, 2, 3
END(v0123)

	/*
	 * Must be immediately after v0123 -- we load them in a single
	 * ld1 instruction.
	 */
	.type	rot8,@object
rot8:
	.long	0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
END(rot8)
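
/*
 * A note on rot8, for reference: TBL replaces destination byte i with
 * source byte index[i].  On a little-endian build the first word's
 * index bytes are (3,0,1,2) -- 0x02010003 stored LSB first -- so each
 * 32-bit lane gets its bytes rotated left by 8 bits, i.e. x <<< 8; the
 * remaining three words do the same for their lanes.
 */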