/* sha256-armv8-aarch64-ce.S - ARM/CE accelerated SHA-256 transform function
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include "asm-common-aarch64.h"

/* Built only for little-endian AArch64 with an assembler/compiler that
 * accepts the Crypto Extension (SHA-2) instructions and when SHA-256
 * support is enabled. */
#if defined(__AARCH64EL__) && \
    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && defined(USE_SHA256)

.cpu generic+simd+crypto

.text


/* Constants */

/* SHA-256 round constants K[0..63] (FIPS 180-4), four per row. */
.align 4
gcry_sha256_aarch64_ce_K:
.LK:
  .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
  .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
  .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
  .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
  .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
  .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
  .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
  .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
  .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
  .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
  .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
  .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
  .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
  .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
  .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
  .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2


/* Register macros */

/* Saved input state: H[0..3] and H[4..7], loaded from/stored to *state. */
#define vH0123 v0
#define vH4567 v1

/* Working state. The same registers are referenced both as v<N> (vector
 * form, for mov/add) and q<N> (128-bit form, as required by the sha256h
 * and sha256h2 instruction operands). */
#define vABCD0 v2
#define qABCD0 q2
#define vABCD1 v3
#define qABCD1 q3
#define vEFGH  v4
#define qEFGH  q4

#define vT0 v5
#define vT1 v6

/* Message schedule: four words W[i..i+3] per register. */
#define vW0 v16
#define vW1 v17
#define vW2 v18
#define vW3 v19

/* Round-constant registers; after do_add() they hold K[i..i+3]+W[i..i+3],
 * which is what sha256h/sha256h2 consume. */
#define vK0 v20
#define vK1 v21
#define vK2 v22
#define vK3 v23


/* Round macros */

/* Empty placeholder macro — passed as a *_fn hook to do_rounds() when
 * that step should be skipped. */
#define _(...) /*_*/

/* Load next two K constant vectors from [x3], advancing the pointer. */
#define do_loadk(nk0, nk1) ld1 {nk0.16b-nk1.16b},[x3],#32;
#define do_add(a, b) add a.4s, a.4s, b.4s;
#define do_sha256su0(w0, w1) sha256su0 w0.4s, w1.4s;
#define do_sha256su1(w0, w2, w3) sha256su1 w0.4s, w2.4s, w3.4s;

/* Perform four SHA-256 rounds using the pre-added K+W values in vK<k>.
 * The *_fn hooks (each may be the empty `_` macro) interleave:
 *  - loadk_fn: fetch the next pair of constant vectors into vK<nk0>/vK<nk1>,
 *  - add_fn:   pre-add message words w2 into vK<nk0> for upcoming rounds,
 *  - su0_fn/su1_fn: message-schedule expansion of w0 from w1/w2/w3.
 * vABCD1 receives a copy of ABCD because sha256h overwrites qABCD0 while
 * sha256h2 still needs the pre-round value. */
#define do_rounds(k, nk0, nk1, w0, w1, w2, w3, loadk_fn, add_fn, su0_fn, su1_fn) \
        loadk_fn(  v##nk0, v##nk1  ); \
        su0_fn(    v##w0, v##w1    ); \
        mov vABCD1.16b, vABCD0.16b; \
        sha256h qABCD0, qEFGH, v##k.4s; \
        sha256h2 qEFGH, qABCD1, v##k.4s; \
        add_fn(    v##nk0, v##w2   ); \
        su1_fn(    v##w0, v##w2, v##w3  );


/* Other functional macros */

/* Wipe a vector register so no key/message material is left behind. */
#define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b;


/*
 * unsigned int
 * _gcry_sha256_transform_armv8_ce (u32 state[8], const void *input_data,
 *                                  size_t num_blks)
 */
.align 3
.globl _gcry_sha256_transform_armv8_ce
ELF(.type _gcry_sha256_transform_armv8_ce,%function;)
_gcry_sha256_transform_armv8_ce:
  /* input:
   *	x0: ctx, CTX (u32 state[8])
   *	x1: data (64*nblks bytes)
   *	x2: nblks
   */
  CFI_STARTPROC();

  cbz x2, .Ldo_nothing;             /* zero blocks: nothing to do */

  GET_DATA_POINTER(x3, .LK);        /* x3 = cursor into K table */
  mov x4, x3                        /* x4 = saved table base, to rewind
                                     * x3 at the start of each block */

  ld1 {vH0123.4s-vH4567.4s}, [x0]   /* load state */

  /* Load first block and byte-swap it: message words are big-endian,
   * this build is little-endian only (__AARCH64EL__ guard above). */
  ld1 {vW0.16b-vW1.16b}, [x1], #32
  do_loadk(vK0, vK1)
  ld1 {vW2.16b-vW3.16b}, [x1], #32
  mov vABCD0.16b, vH0123.16b
  mov vEFGH.16b, vH4567.16b

  rev32 vW0.16b, vW0.16b
  rev32 vW1.16b, vW1.16b
  rev32 vW2.16b, vW2.16b
  do_add(vK0, vW0)                  /* vK0 = K[0..3] + W[0..3] */
  rev32 vW3.16b, vW3.16b
  do_add(vK1, vW1)                  /* vK1 = K[4..7] + W[4..7] */

.Loop:
  /* One 64-byte block per iteration: 16 groups of 4 rounds, with
   * constant loads, K+W pre-adds and schedule expansion interleaved. */
  do_rounds(K0, K2, K3, W0, W1, W2, W3, do_loadk, do_add, do_sha256su0, do_sha256su1)
  sub x2,x2,#1
  do_rounds(K1, K3, _ , W1, W2, W3, W0, _       , do_add, do_sha256su0, do_sha256su1)
  do_rounds(K2, K0, K1, W2, W3, W0, W1, do_loadk, do_add, do_sha256su0, do_sha256su1)
  do_rounds(K3, K1, _ , W3, W0, W1, W2, _       , do_add, do_sha256su0, do_sha256su1)

  do_rounds(K0, K2, K3, W0, W1, W2, W3, do_loadk, do_add, do_sha256su0, do_sha256su1)
  do_rounds(K1, K3, _ , W1, W2, W3, W0, _       , do_add, do_sha256su0, do_sha256su1)
  do_rounds(K2, K0, K1, W2, W3, W0, W1, do_loadk, do_add, do_sha256su0, do_sha256su1)
  do_rounds(K3, K1, _ , W3, W0, W1, W2, _       , do_add, do_sha256su0, do_sha256su1)

  do_rounds(K0, K2, K3, W0, W1, W2, W3, do_loadk, do_add, do_sha256su0, do_sha256su1)
  do_rounds(K1, K3, _ , W1, W2, W3, W0, _       , do_add, do_sha256su0, do_sha256su1)
  do_rounds(K2, K0, K1, W2, W3, W0, W1, do_loadk, do_add, do_sha256su0, do_sha256su1)
  do_rounds(K3, K1, _ , W3, W0, W1, W2, _       , do_add, do_sha256su0, do_sha256su1)

  cbz x2, .Lend                     /* last block: no next block to fetch */

  /* Final 16 rounds of this block, interleaved with loading and
   * byte-swapping the next block's message words. */
  do_rounds(K0, K2, K3, W0, _ , W2, W3, do_loadk, do_add, _, _)
  ld1 {vW0.16b}, [x1], #16
  mov x3, x4                        /* rewind K cursor for next block */
  do_rounds(K1, K3, _ , W1, _ , W3, _ , _       , do_add, _, _)
  ld1 {vW1.16b}, [x1], #16
  rev32 vW0.16b, vW0.16b
  do_rounds(K2, K0, K1, W2, _ , W0, _ , do_loadk, do_add, _, _)
  rev32 vW1.16b, vW1.16b
  ld1 {vW2.16b}, [x1], #16
  do_rounds(K3, K1, _ , W3, _ , W1, _ , _       , do_add, _, _)
  ld1 {vW3.16b}, [x1], #16

  /* Feed-forward: add this block's working state into the hash state. */
  do_add(vH0123, vABCD0)
  do_add(vH4567, vEFGH)

  rev32 vW2.16b, vW2.16b
  mov vABCD0.16b, vH0123.16b        /* restart working state from H */
  rev32 vW3.16b, vW3.16b
  mov vEFGH.16b, vH4567.16b

  b .Loop

.Lend:

  /* Final 16 rounds of the last block — no next-block loads needed. */
  do_rounds(K0, K2, K3, W0, _ , W2, W3, do_loadk, do_add, _, _)
  do_rounds(K1, K3, _ , W1, _ , W3, _ , _       , do_add, _, _)
  do_rounds(K2, _ , _ , W2, _ , _ , _ , _       , _, _, _)
  do_rounds(K3, _ , _ , W3, _ , _ , _ , _       , _, _, _)

  /* Clear message/constant registers before returning. */
  CLEAR_REG(vW0)
  CLEAR_REG(vW1)
  CLEAR_REG(vW2)
  CLEAR_REG(vW3)
  CLEAR_REG(vK0)
  CLEAR_REG(vK1)
  CLEAR_REG(vK2)
  CLEAR_REG(vK3)

  /* Final feed-forward for the last block. */
  do_add(vH0123, vABCD0)
  do_add(vH4567, vEFGH)

  CLEAR_REG(vABCD0)
  CLEAR_REG(vABCD1)
  CLEAR_REG(vEFGH)

  st1 {vH0123.4s-vH4567.4s}, [x0] /* store state */

  CLEAR_REG(vH0123)
  CLEAR_REG(vH4567)

.Ldo_nothing:
  mov x0, #0                        /* return zero; NOTE(review): appears to
                                     * be the burn-stack depth convention used
                                     * by libgcrypt transform functions —
                                     * confirm against callers */
  ret
  CFI_ENDPROC();
ELF(.size _gcry_sha256_transform_armv8_ce,.-_gcry_sha256_transform_armv8_ce;)

#endif