/* sha1-armv8-aarch32-ce.S - ARM/CE accelerated SHA-1 transform function
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include <config.h>

/* Build only for little-endian ARM targets whose assembler accepts the
 * AArch32 Crypto Extension mnemonics and when SHA-1 support is enabled. */
#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
    defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO) && defined(USE_SHA1)

.syntax unified
.arch armv8-a
.fpu crypto-neon-fp-armv8
.arm

.text

/* GET_DATA_POINTER(reg, name, rtmp): load the address of 'name' into 'reg',
 * clobbering 'rtmp'.  The PIC variant resolves the address through the GOT;
 * the "-(3f+8)" bias exists because reading 'pc' in ARM state yields the
 * address of the current instruction plus 8. */
#ifdef __PIC__
# define GET_DATA_POINTER(reg, name, rtmp) \
		ldr reg, 1f; \
		ldr rtmp, 2f; \
		b 3f; \
	1:	.word _GLOBAL_OFFSET_TABLE_-(3f+8); \
	2:	.word name(GOT); \
	3:	add reg, pc, reg; \
		ldr reg, [reg, rtmp];
#else
# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
#endif


/* Constants */

/* The four SHA-1 round constants, one per group of 20 rounds.  They are
 * replicated into all four lanes of the table below so a single vector
 * add applies the constant to four message words at once. */
#define K1  0x5A827999
#define K2  0x6ED9EBA1
#define K3  0x8F1BBCDC
#define K4  0xCA62C1D6
.align 4
gcry_sha1_aarch32_ce_K_VEC:
.LK_VEC:
.LK1:	.long K1, K1, K1, K1
.LK2:	.long K2, K2, K2, K2
.LK3:	.long K3, K3, K3, K3
.LK4:	.long K4, K4, K4, K4


/* Register macros */

#define qH4    q0	/* hash word h4 in lane 0 (via sH4); other lanes zeroed */
#define sH4    s0
#define qH0123 q1	/* hash words h0,h1,h2,h3 */

#define qABCD  q2	/* working state a,b,c,d */
#define qE0    q3	/* working 'e' value, ping-ponged with qE1 */
#define qE1    q4

#define qT0    q5	/* precomputed w + K vectors, ping-ponged */
#define qT1    q6

#define qW0    q8	/* message schedule, four 32-bit words per register */
#define qW1    q9
#define qW2    q10
#define qW3    q11

#define qK1    q12	/* broadcast round constants loaded from .LK_VEC */
#define qK2    q13
#define qK3    q14
#define qK4    q15


/* Round macros */

/* '_' swallows its arguments; used to skip an optional step below. */
#define _(...) /*_*/
#define do_add(dst, src0, src1) vadd.u32 dst, src0, src1;
#define do_sha1su0(w0,w1,w2) sha1su0.32 w0,w1,w2;
#define do_sha1su1(w0,w3) sha1su1.32 w0,w3;

/* Perform four SHA-1 rounds with round function 'f' (c = choice,
 * p = parity, m = majority):
 *   sha1h    extracts the next 'e' value (rotated 'a') into e0,
 *   sha1<f>  runs four rounds on qABCD using e1 and t = w + K,
 * interleaved with schedule work for LATER rounds: add_fn precomputes
 * the next t (so 'k' is the constant for an upcoming round group, not
 * necessarily the current one), and sha1su0_fn/sha1su1_fn advance the
 * message schedule.  Pass '_' for a *_fn argument to skip that step at
 * the start/end of a block. */
#define do_rounds(f, e0, e1, t, k, w0, w1, w2, w3, add_fn, sha1su0_fn, sha1su1_fn) \
        sha1su1_fn( w3, w2 ); \
        sha1h.32 e0, qABCD; \
        sha1##f.32 qABCD, e1, t; \
        add_fn( t, w2, k ); \
        sha1su0_fn( w0, w1, w2 );


/* Other functional macros */

/* Wipe a register so no key/state material is left behind on return. */
#define CLEAR_REG(reg) veor reg, reg;


/*
 * unsigned int
 * _gcry_sha1_transform_armv8_ce (void *ctx, const unsigned char *data,
 *                                size_t nblks)
 *
 * Compress 'nblks' 64-byte blocks from 'data' into the SHA-1 state at
 * 'ctx' (h0..h3 at offset 0, h4 at offset 16).  Returns 0 in r0
 * (presumably the burn-stack depth per libgcrypt convention — confirm
 * against the C callers).  Clobbers r2, r4, lr and NEON registers;
 * q4-q7 are saved/restored per the AAPCS.
 */
.align 3
.globl _gcry_sha1_transform_armv8_ce
.type _gcry_sha1_transform_armv8_ce,%function;
_gcry_sha1_transform_armv8_ce:
  /* input:
   *	r0: ctx, CTX
   *	r1: data (64*nblks bytes)
   *	r2: nblks
   */

  cmp r2, #0;
  push {r4,lr};
  beq .Ldo_nothing;

  /* d8-d15 (q4-q7) are callee-saved VFP/NEON registers. */
  vpush {q4-q7};

  GET_DATA_POINTER(r4, .LK_VEC, lr);

  /* Zero qH4 first so only lane 0 holds h4 after the vldr below. */
  veor qH4, qH4
  vld1.32 {qH0123}, [r0]	/* load h0,h1,h2,h3 */

  vld1.32 {qK1-qK2}, [r4]!	/* load K1,K2 */
  vldr sH4, [r0, #16]	/* load h4 */
  vld1.32 {qK3-qK4}, [r4]	/* load K3,K4 */

  /* Load the first 64-byte block and copy the state into the
   * working registers. */
  vld1.8 {qW0-qW1}, [r1]!
  vmov qABCD, qH0123
  vld1.8 {qW2-qW3}, [r1]!

  /* Byte-swap the big-endian message words and precompute the first
   * two w + K1 vectors, so the loop body always has t ready. */
  vrev32.8 qW0, qW0
  vrev32.8 qW1, qW1
  vrev32.8 qW2, qW2
  do_add(qT0, qW0, qK1)
  vrev32.8 qW3, qW3
  do_add(qT1, qW1, qK1)

.Loop:
  /* 80 rounds per block, four per do_rounds: rounds 0-19 use 'choice',
   * 20-39 'parity', 40-59 'majority', 60-79 'parity'.  The 'k' argument
   * runs ahead of the round function because it feeds the t value
   * precomputed for later rounds.  The very first group takes its 'e'
   * input from qH4 and skips sha1su1 (nothing scheduled yet). */
  do_rounds(c, qE1, qH4, qT0, qK1, qW0, qW1, qW2, qW3, do_add, do_sha1su0, _)
  subs r2, r2, #1		/* decrement block count early; flags live to beq below */
  do_rounds(c, qE0, qE1, qT1, qK1, qW1, qW2, qW3, qW0, do_add, do_sha1su0, do_sha1su1)
  do_rounds(c, qE1, qE0, qT0, qK1, qW2, qW3, qW0, qW1, do_add, do_sha1su0, do_sha1su1)
  do_rounds(c, qE0, qE1, qT1, qK2, qW3, qW0, qW1, qW2, do_add, do_sha1su0, do_sha1su1)
  do_rounds(c, qE1, qE0, qT0, qK2, qW0, qW1, qW2, qW3, do_add, do_sha1su0, do_sha1su1)

  do_rounds(p, qE0, qE1, qT1, qK2, qW1, qW2, qW3, qW0, do_add, do_sha1su0, do_sha1su1)
  do_rounds(p, qE1, qE0, qT0, qK2, qW2, qW3, qW0, qW1, do_add, do_sha1su0, do_sha1su1)
  do_rounds(p, qE0, qE1, qT1, qK2, qW3, qW0, qW1, qW2, do_add, do_sha1su0, do_sha1su1)
  do_rounds(p, qE1, qE0, qT0, qK3, qW0, qW1, qW2, qW3, do_add, do_sha1su0, do_sha1su1)
  do_rounds(p, qE0, qE1, qT1, qK3, qW1, qW2, qW3, qW0, do_add, do_sha1su0, do_sha1su1)

  do_rounds(m, qE1, qE0, qT0, qK3, qW2, qW3, qW0, qW1, do_add, do_sha1su0, do_sha1su1)
  do_rounds(m, qE0, qE1, qT1, qK3, qW3, qW0, qW1, qW2, do_add, do_sha1su0, do_sha1su1)
  do_rounds(m, qE1, qE0, qT0, qK3, qW0, qW1, qW2, qW3, do_add, do_sha1su0, do_sha1su1)
  do_rounds(m, qE0, qE1, qT1, qK4, qW1, qW2, qW3, qW0, do_add, do_sha1su0, do_sha1su1)
  do_rounds(m, qE1, qE0, qT0, qK4, qW2, qW3, qW0, qW1, do_add, do_sha1su0, do_sha1su1)

  do_rounds(p, qE0, qE1, qT1, qK4, qW3, qW0, qW1, qW2, do_add, do_sha1su0, do_sha1su1)
  beq .Lend			/* last block: take the non-preloading tail */

  /* More blocks remain: overlap the final rounds of this block with
   * loading and byte-swapping the next block into qW0-qW3. */
  vld1.8 {qW0-qW1}, [r1]! /* preload */
  do_rounds(p, qE1, qE0, qT0, qK4, _ , _ , qW2, qW3, do_add, _, do_sha1su1)
  vrev32.8 qW0, qW0
  vld1.8 {qW2}, [r1]!
  vrev32.8 qW1, qW1
  do_rounds(p, qE0, qE1, qT1, qK4, _ , _ , qW3, _ , do_add, _, _)
  vld1.8 {qW3}, [r1]!
  vrev32.8 qW2, qW2
  do_rounds(p, qE1, qE0, qT0, _, _, _, _, _, _, _, _)
  vrev32.8 qW3, qW3
  do_rounds(p, qE0, qE1, qT1, _, _, _, _, _, _, _, _)

  /* Fold the working state back into the hash state, precompute the
   * first two w + K1 vectors of the next block, and loop. */
  do_add(qT0, qW0, qK1)
  vadd.u32 qH4, qE0
  vadd.u32 qABCD, qH0123
  do_add(qT1, qW1, qK1)

  vmov qH0123, qABCD

  b .Loop

.Lend:
  /* Final four round groups of the last block (no preload needed). */
  do_rounds(p, qE1, qE0, qT0, qK4, _ , _ , qW2, qW3, do_add, _, do_sha1su1)
  do_rounds(p, qE0, qE1, qT1, qK4, _ , _ , qW3, _ , do_add, _, _)
  do_rounds(p, qE1, qE0, qT0, _, _, _, _, _, _, _, _)
  do_rounds(p, qE0, qE1, qT1, _, _, _, _, _, _, _, _)

  /* Fold the working state into the hash state. */
  vadd.u32 qH4, qE0
  vadd.u32 qH0123, qABCD

  /* Wipe message and working-state registers before returning. */
  CLEAR_REG(qW0)
  CLEAR_REG(qW1)
  CLEAR_REG(qW2)
  CLEAR_REG(qW3)
  CLEAR_REG(qABCD)
  CLEAR_REG(qE1)
  CLEAR_REG(qE0)

  vstr sH4, [r0, #16]	/* store h4 */
  vst1.32 {qH0123}, [r0]	/* store h0,h1,h2,h3 */

  CLEAR_REG(qH0123)
  CLEAR_REG(qH4)
  vpop {q4-q7}

.Ldo_nothing:
  mov r0, #0
  pop {r4,pc}
.size _gcry_sha1_transform_armv8_ce,.-_gcry_sha1_transform_armv8_ce;

#endif