/* sha512-armv7-neon.S - ARM/NEON assembly implementation of SHA-512 transform
 *
 * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include <config.h>

#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
    defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_NEON)

.text

.syntax unified
.fpu neon
.arm

/* structure of SHA512_CONTEXT */
#define hd_a 0
#define hd_b ((hd_a) + 8)
#define hd_c ((hd_b) + 8)
#define hd_d ((hd_c) + 8)
#define hd_e ((hd_d) + 8)
#define hd_f ((hd_e) + 8)
#define hd_g ((hd_f) + 8)

/* register macros */
#define RK %r2

#define RA d0
#define RB d1
#define RC d2
#define RD d3
#define RE d4
#define RF d5
#define RG d6
#define RH d7

#define RT0 d8
#define RT1 d9
#define RT2 d10
#define RT3 d11
#define RT4 d12
#define RT5 d13
#define RT6 d14
#define RT7 d15

#define RT01q q4
#define RT23q q5
#define RT45q q6
#define RT67q q7

#define RW0 d16
#define RW1 d17
#define RW2 d18
#define RW3 d19
#define RW4 d20
#define RW5 d21
#define RW6 d22
#define RW7 d23
#define RW8 d24
#define RW9 d25
#define RW10 d26
#define RW11 d27
#define RW12 d28
#define RW13 d29
#define RW14 d30
#define RW15 d31

#define RW01q q8
#define RW23q q9
#define RW45q q10
#define RW67q q11
#define RW89q q12
#define RW1011q q13
#define RW1213q q14
#define RW1415q q15

/***********************************************************************
 * ARM assembly implementation of sha512 transform
 ***********************************************************************/
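/* For reference, the round macros below implement the standard SHA-512 round
 * and message schedule. A minimal scalar C sketch of the same math follows;
 * the helper and variable names here are illustrative only and are not taken
 * from sha512.c:
 *
 *   #include <stdint.h>
 *
 *   static uint64_t ror64 (uint64_t x, unsigned int n)
 *   { return (x >> n) | (x << (64 - n)); }
 *
 *   // One round t; the macros below do two of these per invocation.
 *   // Ch() and Maj() are realized with vbsl: e selects between f and g,
 *   // and a^b selects between c and b respectively.
 *   static void sha512_round (uint64_t s[8], uint64_t k, uint64_t w)
 *   {
 *     uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
 *     uint64_t e = s[4], f = s[5], g = s[6], h = s[7];
 *     uint64_t sum1 = ror64 (e, 14) ^ ror64 (e, 18) ^ ror64 (e, 41);
 *     uint64_t ch   = (e & f) ^ (~e & g);
 *     uint64_t sum0 = ror64 (a, 28) ^ ror64 (a, 34) ^ ror64 (a, 39);
 *     uint64_t maj  = (a & b) ^ (a & c) ^ (b & c);
 *     uint64_t t1   = h + sum1 + ch + k + w;
 *     s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
 *     s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + sum0 + maj;
 *   }
 *
 *   // Message schedule for t >= 16, with S0/S1 as in the macros below:
 *   //   w[t] = w[t-16] + (ror64(w[t-2],19) ^ ror64(w[t-2],61) ^ (w[t-2] >> 6))
 *   //        + w[t-7]  + (ror64(w[t-15],1) ^ ror64(w[t-15],8) ^ (w[t-15] >> 7));
 *
 * Instead of rotating the working variables as in the sketch, the assembly
 * renames the a..h arguments on each macro invocation.
 */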
#define rounds2_0_63(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, rw01q, rw2, rw23q, rw1415q, rw9, rw10, interleave_op, arg1) \
	/* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
	vshr.u64 RT2, re, #14; \
	vshl.u64 RT3, re, #64 - 14; \
	interleave_op(arg1); \
	vshr.u64 RT4, re, #18; \
	vshl.u64 RT5, re, #64 - 18; \
	vld1.64 {RT0}, [RK]!; \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, re, #41; \
	vshl.u64 RT5, re, #64 - 41; \
	vadd.u64 RT0, RT0, rw0; \
	veor.64 RT23q, RT23q, RT45q; \
	vmov.64 RT7, re; \
	veor.64 RT1, RT2, RT3; \
	vbsl.64 RT7, rf, rg; \
	\
	vadd.u64 RT1, RT1, rh; \
	vshr.u64 RT2, ra, #28; \
	vshl.u64 RT3, ra, #64 - 28; \
	vadd.u64 RT1, RT1, RT0; \
	vshr.u64 RT4, ra, #34; \
	vshl.u64 RT5, ra, #64 - 34; \
	vadd.u64 RT1, RT1, RT7; \
	\
	/* h = Sum0 (a) + Maj (a, b, c); */ \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, ra, #39; \
	vshl.u64 RT5, ra, #64 - 39; \
	veor.64 RT0, ra, rb; \
	veor.64 RT23q, RT23q, RT45q; \
	vbsl.64 RT0, rc, rb; \
	vadd.u64 rd, rd, RT1; /* d+=t1; */ \
	veor.64 rh, RT2, RT3; \
	\
	/* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \
	vshr.u64 RT2, rd, #14; \
	vshl.u64 RT3, rd, #64 - 14; \
	vadd.u64 rh, rh, RT0; \
	vshr.u64 RT4, rd, #18; \
	vshl.u64 RT5, rd, #64 - 18; \
	vadd.u64 rh, rh, RT1; /* h+=t1; */ \
	vld1.64 {RT0}, [RK]!; \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, rd, #41; \
	vshl.u64 RT5, rd, #64 - 41; \
	vadd.u64 RT0, RT0, rw1; \
	veor.64 RT23q, RT23q, RT45q; \
	vmov.64 RT7, rd; \
	veor.64 RT1, RT2, RT3; \
	vbsl.64 RT7, re, rf; \
	\
	vadd.u64 RT1, RT1, rg; \
	vshr.u64 RT2, rh, #28; \
	vshl.u64 RT3, rh, #64 - 28; \
	vadd.u64 RT1, RT1, RT0; \
	vshr.u64 RT4, rh, #34; \
	vshl.u64 RT5, rh, #64 - 34; \
	vadd.u64 RT1, RT1, RT7; \
	\
	/* g = Sum0 (h) + Maj (h, a, b); */ \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, rh, #39; \
	vshl.u64 RT5, rh, #64 - 39; \
	veor.64 RT0, rh, ra; \
	veor.64 RT23q, RT23q, RT45q; \
	vbsl.64 RT0, rb, ra; \
	vadd.u64 rc, rc, RT1; /* c+=t1; */ \
	veor.64 rg, RT2, RT3; \
	\
	/* w[0] += S1 (w[14]) + w[9] + S0 (w[1]); */ \
	/* w[1] += S1 (w[15]) + w[10] + S0 (w[2]); */ \
	\
	/**** S0(w[1:2]) */ \
	\
	/* w[0:1] += w[9:10] */ \
	/* RT23q = rw1:rw2 */ \
	vext.u64 RT23q, rw01q, rw23q, #1; \
	vadd.u64 rw0, rw9; \
	vadd.u64 rg, rg, RT0; \
	vadd.u64 rw1, rw10; \
	vadd.u64 rg, rg, RT1; /* g+=t1; */ \
	\
	vshr.u64 RT45q, RT23q, #1; \
	vshl.u64 RT67q, RT23q, #64 - 1; \
	vshr.u64 RT01q, RT23q, #8; \
	veor.u64 RT45q, RT45q, RT67q; \
	vshl.u64 RT67q, RT23q, #64 - 8; \
	veor.u64 RT45q, RT45q, RT01q; \
	vshr.u64 RT01q, RT23q, #7; \
	veor.u64 RT45q, RT45q, RT67q; \
	\
	/**** S1(w[14:15]) */ \
	vshr.u64 RT23q, rw1415q, #6; \
	veor.u64 RT01q, RT01q, RT45q; \
	vshr.u64 RT45q, rw1415q, #19; \
	vshl.u64 RT67q, rw1415q, #64 - 19; \
	veor.u64 RT23q, RT23q, RT45q; \
	vshr.u64 RT45q, rw1415q, #61; \
	veor.u64 RT23q, RT23q, RT67q; \
	vshl.u64 RT67q, rw1415q, #64 - 61; \
	veor.u64 RT23q, RT23q, RT45q; \
	vadd.u64 rw01q, RT01q; /* w[0:1] += S(w[1:2]) */ \
	veor.u64 RT01q, RT23q, RT67q;
#define vadd_RT01q(rw01q) \
	/* w[0:1] += S(w[14:15]) */ \
	vadd.u64 rw01q, RT01q;

#define dummy(_) /*_*/

#define rounds2_64_79(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, interleave_op1, arg1, interleave_op2, arg2) \
	/* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
	vshr.u64 RT2, re, #14; \
	vshl.u64 RT3, re, #64 - 14; \
	interleave_op1(arg1); \
	vshr.u64 RT4, re, #18; \
	vshl.u64 RT5, re, #64 - 18; \
	interleave_op2(arg2); \
	vld1.64 {RT0}, [RK]!; \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, re, #41; \
	vshl.u64 RT5, re, #64 - 41; \
	vadd.u64 RT0, RT0, rw0; \
	veor.64 RT23q, RT23q, RT45q; \
	vmov.64 RT7, re; \
	veor.64 RT1, RT2, RT3; \
	vbsl.64 RT7, rf, rg; \
	\
	vadd.u64 RT1, RT1, rh; \
	vshr.u64 RT2, ra, #28; \
	vshl.u64 RT3, ra, #64 - 28; \
	vadd.u64 RT1, RT1, RT0; \
	vshr.u64 RT4, ra, #34; \
	vshl.u64 RT5, ra, #64 - 34; \
	vadd.u64 RT1, RT1, RT7; \
	\
	/* h = Sum0 (a) + Maj (a, b, c); */ \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, ra, #39; \
	vshl.u64 RT5, ra, #64 - 39; \
	veor.64 RT0, ra, rb; \
	veor.64 RT23q, RT23q, RT45q; \
	vbsl.64 RT0, rc, rb; \
	vadd.u64 rd, rd, RT1; /* d+=t1; */ \
	veor.64 rh, RT2, RT3; \
	\
	/* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \
	vshr.u64 RT2, rd, #14; \
	vshl.u64 RT3, rd, #64 - 14; \
	vadd.u64 rh, rh, RT0; \
	vshr.u64 RT4, rd, #18; \
	vshl.u64 RT5, rd, #64 - 18; \
	vadd.u64 rh, rh, RT1; /* h+=t1; */ \
	vld1.64 {RT0}, [RK]!; \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, rd, #41; \
	vshl.u64 RT5, rd, #64 - 41; \
	vadd.u64 RT0, RT0, rw1; \
	veor.64 RT23q, RT23q, RT45q; \
	vmov.64 RT7, rd; \
	veor.64 RT1, RT2, RT3; \
	vbsl.64 RT7, re, rf; \
	\
	vadd.u64 RT1, RT1, rg; \
	vshr.u64 RT2, rh, #28; \
	vshl.u64 RT3, rh, #64 - 28; \
	vadd.u64 RT1, RT1, RT0; \
	vshr.u64 RT4, rh, #34; \
	vshl.u64 RT5, rh, #64 - 34; \
	vadd.u64 RT1, RT1, RT7; \
	\
	/* g = Sum0 (h) + Maj (h, a, b); */ \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, rh, #39; \
	vshl.u64 RT5, rh, #64 - 39; \
	veor.64 RT0, rh, ra; \
	veor.64 RT23q, RT23q, RT45q; \
	vbsl.64 RT0, rb, ra; \
	vadd.u64 rc, rc, RT1; /* c+=t1; */ \
	veor.64 rg, RT2, RT3;
#define vadd_rg_RT0(rg) \
	vadd.u64 rg, rg, RT0;
#define vadd_rg_RT1(rg) \
	vadd.u64 rg, rg, RT1; /* g+=t1; */

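/* The rounds2_0_63 macro above runs two rounds and, interleaved with them,
 * expands two message words: w[0:1] += w[9:10] + S0(w[1:2]) is done inside
 * the macro, while the S1(w[14:15]) term is left in RT01q and added by
 * vadd_RT01q through the interleave_op hook of the following invocation.
 * rounds2_64_79 runs two rounds without schedule expansion; there the second
 * round's g is completed (g += RT0 for Maj, g += RT1 for t1) by the
 * vadd_rg_RT0/vadd_rg_RT1 hooks of the next invocation instead.
 */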
.align 3
.globl _gcry_sha512_transform_armv7_neon
.type _gcry_sha512_transform_armv7_neon,%function;

_gcry_sha512_transform_armv7_neon:
	/* Input:
	 *	%r0: SHA512_CONTEXT
	 *	%r1: data
	 *	%r2: u64 k[] constants
	 *	%r3: nblks
	 */
	push {%lr};

	mov %lr, #0;

	/* Load context to d0-d7 */
	vld1.64 {RA-RD}, [%r0]!;
	vld1.64 {RE-RH}, [%r0];
	sub %r0, #(4*8);

	/* Load input to w[16], d16-d31 */
	/* NOTE: Assumes that on ARMv7 unaligned accesses are always allowed. */
	vld1.64 {RW0-RW3}, [%r1]!;
	vld1.64 {RW4-RW7}, [%r1]!;
	vld1.64 {RW8-RW11}, [%r1]!;
	vld1.64 {RW12-RW15}, [%r1]!;
#ifdef __ARMEL__
	/* byteswap */
	vrev64.8 RW01q, RW01q;
	vrev64.8 RW23q, RW23q;
	vrev64.8 RW45q, RW45q;
	vrev64.8 RW67q, RW67q;
	vrev64.8 RW89q, RW89q;
	vrev64.8 RW1011q, RW1011q;
	vrev64.8 RW1213q, RW1213q;
	vrev64.8 RW1415q, RW1415q;
#endif

	/* EABI says that d8-d15 must be preserved by callee. */
	vpush {RT0-RT7};

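/* Main loop: one pass per 128-byte block. Rounds 0..63 are run by
 * .Loop_rounds in iterations of 16 rounds (counted in %lr); rounds 64..79
 * follow below. When more blocks remain, those last rounds are interleaved
 * with loading and byte-swapping the next block's w[], the context is
 * updated, RK is rewound to the start of k[], and control returns to .Loop.
 * The final block is finished at .Lhandle_tail instead.
 */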
.Loop:
	rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2, RW23q, RW1415q, RW9, RW10, dummy, _);
	b .Lenter_rounds;

.Loop_rounds:
	rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2, RW23q, RW1415q, RW9, RW10, vadd_RT01q, RW1415q);
.Lenter_rounds:
	rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3, RW23q, RW4, RW45q, RW01q, RW11, RW12, vadd_RT01q, RW01q);
	rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, RW45q, RW6, RW67q, RW23q, RW13, RW14, vadd_RT01q, RW23q);
	rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, RW67q, RW8, RW89q, RW45q, RW15, RW0, vadd_RT01q, RW45q);
	rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, RW89q, RW10, RW1011q, RW67q, RW1, RW2, vadd_RT01q, RW67q);
	rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, RW1011q, RW12, RW1213q, RW89q, RW3, RW4, vadd_RT01q, RW89q);
	add %lr, #16;
	rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, RW1213q, RW14, RW1415q, RW1011q, RW5, RW6, vadd_RT01q, RW1011q);
	cmp %lr, #64;
	rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, RW1415q, RW0, RW01q, RW1213q, RW7, RW8, vadd_RT01q, RW1213q);
	bne .Loop_rounds;

	subs %r3, #1;

	rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, vadd_RT01q, RW1415q, dummy, _);
	rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3, vadd_rg_RT0, RG, vadd_rg_RT1, RG);
	beq .Lhandle_tail;
	vld1.64 {RW0-RW3}, [%r1]!;
	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, vadd_rg_RT0, RE, vadd_rg_RT1, RE);
	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, vadd_rg_RT0, RC, vadd_rg_RT1, RC);
#ifdef __ARMEL__
	vrev64.8 RW01q, RW01q;
	vrev64.8 RW23q, RW23q;
#endif
	vld1.64 {RW4-RW7}, [%r1]!;
	rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, vadd_rg_RT0, RA, vadd_rg_RT1, RA);
	rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, vadd_rg_RT0, RG, vadd_rg_RT1, RG);
#ifdef __ARMEL__
	vrev64.8 RW45q, RW45q;
	vrev64.8 RW67q, RW67q;
#endif
	vld1.64 {RW8-RW11}, [%r1]!;
	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, vadd_rg_RT0, RE, vadd_rg_RT1, RE);
	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, vadd_rg_RT0, RC, vadd_rg_RT1, RC);
#ifdef __ARMEL__
	vrev64.8 RW89q, RW89q;
	vrev64.8 RW1011q, RW1011q;
#endif
	vld1.64 {RW12-RW15}, [%r1]!;
	vadd_rg_RT0(RA);
	vadd_rg_RT1(RA);

	/* Load context */
	vld1.64 {RT0-RT3}, [%r0]!;
	vld1.64 {RT4-RT7}, [%r0];
	sub %r0, #(4*8);

#ifdef __ARMEL__
	vrev64.8 RW1213q, RW1213q;
	vrev64.8 RW1415q, RW1415q;
#endif

	vadd.u64 RA, RT0;
	vadd.u64 RB, RT1;
	vadd.u64 RC, RT2;
	vadd.u64 RD, RT3;
	vadd.u64 RE, RT4;
	vadd.u64 RF, RT5;
	vadd.u64 RG, RT6;
	vadd.u64 RH, RT7;

	/* Store the first half of context */
	vst1.64 {RA-RD}, [%r0]!;
	sub RK, $(8*80);
	vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
	mov %lr, #0;
	sub %r0, #(4*8);

	b .Loop;
.ltorg

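/* Last block: finish rounds 64..79 without loading further input, add the
 * working variables back into the context, then clear the used NEON
 * registers and %r0 before returning (d8-d15 are restored by the vpop).
 */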
.Lhandle_tail:
	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, vadd_rg_RT0, RE, vadd_rg_RT1, RE);
	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, vadd_rg_RT0, RC, vadd_rg_RT1, RC);
	rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, vadd_rg_RT0, RA, vadd_rg_RT1, RA);
	rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, vadd_rg_RT0, RG, vadd_rg_RT1, RG);
	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, vadd_rg_RT0, RE, vadd_rg_RT1, RE);
	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, vadd_rg_RT0, RC, vadd_rg_RT1, RC);

	/* Load context to d16-d23 */
	vld1.64 {RW0-RW3}, [%r0]!;
	vadd_rg_RT0(RA);
	vld1.64 {RW4-RW7}, [%r0];
	vadd_rg_RT1(RA);
	sub %r0, #(4*8);

	vadd.u64 RA, RW0;
	vadd.u64 RB, RW1;
	vadd.u64 RC, RW2;
	vadd.u64 RD, RW3;
	vadd.u64 RE, RW4;
	vadd.u64 RF, RW5;
	vadd.u64 RG, RW6;
	vadd.u64 RH, RW7;

	/* Store the first half of context */
	vst1.64 {RA-RD}, [%r0]!;

	/* Clear used registers */
	/* d16-d31 */
	veor.u64 RW01q, RW01q;
	veor.u64 RW23q, RW23q;
	veor.u64 RW45q, RW45q;
	veor.u64 RW67q, RW67q;
	vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
	veor.u64 RW89q, RW89q;
	veor.u64 RW1011q, RW1011q;
	veor.u64 RW1213q, RW1213q;
	veor.u64 RW1415q, RW1415q;
	/* d8-d15 */
	vpop {RT0-RT7};
	/* d0-d7 (q0-q3) */
	veor.u64 %q0, %q0;
	veor.u64 %q1, %q1;
	veor.u64 %q2, %q2;
	veor.u64 %q3, %q3;

	eor %r0, %r0;
	pop {%pc};
.size _gcry_sha512_transform_armv7_neon,.-_gcry_sha512_transform_armv7_neon;

#endif
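/* Caller interface, as documented in the Input comment above: %r0 points to
 * the eight 64-bit state words a..h, %r1 to the 128-byte input blocks, %r2 to
 * the 80 SHA-512 round constants, and %r3 holds the block count. A C
 * declaration matching that register assignment could look like the sketch
 * below; the exact parameter types used by the libgcrypt caller are an
 * assumption here, not taken from sha512.c:
 *
 *   void _gcry_sha512_transform_armv7_neon (uint64_t state[8],
 *                                           const unsigned char *data,
 *                                           const uint64_t k[80],
 *                                           size_t nblks);
 */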