/* cipher-gcm-armv8-aarch32-ce.S - ARM/CE accelerated GHASH
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include <config.h>

#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
    defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO)

.syntax unified
.arch armv8-a
.fpu crypto-neon-fp-armv8
.arm

.text

#ifdef __PIC__
# define GET_DATA_POINTER(reg, name, rtmp) \
        ldr reg, 1f; \
        ldr rtmp, 2f; \
        b 3f; \
    1:  .word _GLOBAL_OFFSET_TABLE_-(3f+8); \
    2:  .word name(GOT); \
    3:  add reg, pc, reg; \
        ldr reg, [reg, rtmp];
#else
# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
#endif


/* Constants */

.align 4
gcry_gcm_reduction_constant:
.Lrconst64:
        .quad 0xc200000000000000


/* Register macros */

#define rhash q0
#define rhash_l d0
#define rhash_h d1

#define rh1 q1
#define rh1_l d2
#define rh1_h d3

#define rbuf q2
#define rbuf_l d4
#define rbuf_h d5

#define rbuf1 q3
#define rbuf1_l d6
#define rbuf1_h d7

#define rbuf2 q4
#define rbuf2_l d8
#define rbuf2_h d9

#define rbuf3 q5
#define rbuf3_l d10
#define rbuf3_h d11

#define rh2 q6
#define rh2_l d12
#define rh2_h d13

#define rh3 q7
#define rh3_l d14
#define rh3_h d15

#define rh4 q8
#define rh4_l d16
#define rh4_h d17

#define rr2 q9
#define rr2_l d18
#define rr2_h d19

#define rr3 q10
#define rr3_l d20
#define rr3_h d21

#define rr0 q11
#define rr0_l d22
#define rr0_h d23

#define rr1 q12
#define rr1_l d24
#define rr1_h d25

#define rt0 q13
#define rt0_l d26
#define rt0_h d27

#define rt1 q14
#define rt1_l d28
#define rt1_h d29

#define rrconst q15
#define rrconst_l d30
#define rrconst_h d31

/* GHASH macros */

/* See "Gouvêa, C. P. L. & López, J. Implementing GCM on ARMv8. Topics in
 * Cryptology — CT-RSA 2015" for details.
 */

/* Input: 'a' and 'b', Output: 'r0:r1' (low 128-bits in r0, high in r1)
 * Note: 'r1' may be 'a' or 'b', 'r0' must not be either 'a' or 'b'.
 */
#define PMUL_128x128(r0, r1, a, b, t, interleave_op) \
        veor t##_h, b##_l, b##_h; \
        veor t##_l, a##_l, a##_h; \
        vmull.p64 r0, a##_l, b##_l; \
        vmull.p64 r1, a##_h, b##_h; \
        vmull.p64 t, t##_h, t##_l; \
        interleave_op; \
        veor t, r0; \
        veor t, r1; \
        veor r0##_h, t##_l; \
        veor r1##_l, t##_h;
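
/* For reference, a sketch of the math behind the macro above: PMUL_128x128
 * is one Karatsuba step over GF(2)[x].  With a = a_h·x^64 + a_l and
 * b = b_h·x^64 + b_l, the 128x128-bit carry-less product is
 *
 *   a·b = (a_h·b_h)·x^128
 *       + ((a_l^a_h)·(b_l^b_h) ^ a_h·b_h ^ a_l·b_l)·x^64
 *       + a_l·b_l
 *
 * so only three vmull.p64 are needed: r0 = a_l·b_l, r1 = a_h·b_h and
 * t = (a_l^a_h)·(b_l^b_h).  XORing r0 and r1 into t forms the middle term
 * (subtraction in GF(2) is XOR, so there are no carries); XORing t_l into
 * r0_h and t_h into r1_l then adds that term at bit offset 64 of the
 * 256-bit result r1:r0.
 */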

/* Input: 'aA' and 'bA', Output: 'r0A:r1A' (low 128-bits in r0A, high in r1A)
 * Note: 'r1A' may be 'aA' or 'bA', 'r0A' must not be either 'aA' or 'bA'.
 * Input: 'aB' and 'bB', Output: 'r0B:r1B' (low 128-bits in r0B, high in r1B)
 * Note: 'r1B' may be 'aB' or 'bB', 'r0B' must not be either 'aB' or 'bB'.
 */
#define PMUL_128x128_2(r0A, r1A, aA, bA, r0B, r1B, aB, bB, tA, tB, interleave_op) \
        veor tA##_h, bA##_l, bA##_h; \
        veor tA##_l, aA##_l, aA##_h; \
        veor tB##_h, bB##_l, bB##_h; \
        veor tB##_l, aB##_l, aB##_h; \
        vmull.p64 r0A, aA##_l, bA##_l; \
        vmull.p64 r1A, aA##_h, bA##_h; \
        vmull.p64 tA, tA##_h, tA##_l; \
        vmull.p64 r0B, aB##_l, bB##_l; \
        vmull.p64 r1B, aB##_h, bB##_h; \
        vmull.p64 tB, tB##_h, tB##_l; \
        interleave_op; \
        veor tA, r0A; \
        veor tA, r1A; \
        veor tB, r0B; \
        veor tB, r1B; \
        veor r0A##_h, tA##_l; \
        veor r1A##_l, tA##_h; \
        veor r0B##_h, tB##_l; \
        veor r1B##_l, tB##_h;

/* Input: 'r0:r1', Output: 'a' */
#define REDUCTION(a, r0, r1, rconst, t, interleave_op) \
        vmull.p64 t, r0##_l, rconst; \
        veor r0##_h, t##_l; \
        veor r1##_l, t##_h; \
        interleave_op; \
        vmull.p64 t, r0##_h, rconst; \
        veor r1, t; \
        veor a, r0, r1;
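
/* For reference, a sketch of the reduction step above: after multiplication
 * the 256-bit product sits in the register pair r1:r0 (r1 high) and must be
 * reduced modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1.  Operands
 * are kept bit-reflected (the vrev64.8/vext.8 sequences) and the hash subkey
 * is pre-rotated left by one bit (the "H <<< 1" of GCM_LSH_1 below), which
 * turns the reduction into two folds by .Lrconst64 = 0xc200000000000000:
 * each vmull.p64 by rconst multiplies one 64-bit lane of the low half r0 by
 * the constant and XORs the 128-bit product one lane higher, after which the
 * reduced result is simply a = r0 ^ r1.
 */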

#define _(...) __VA_ARGS__
#define __ _()

/* Other functional macros */

#define CLEAR_REG(reg) veor reg, reg;


/*
 * unsigned int _gcry_ghash_armv8_ce_pmull (void *gcm_key, byte *result,
 *                                          const byte *buf, size_t nblocks,
 *                                          void *gcm_table);
 */
.align 3
.globl _gcry_ghash_armv8_ce_pmull
.type _gcry_ghash_armv8_ce_pmull,%function;
_gcry_ghash_armv8_ce_pmull:
        /* input:
         *    r0: gcm_key
         *    r1: result/hash
         *    r2: buf
         *    r3: nblocks
         *    %st+0: gcm_table
         */
        push {r4-r6, lr}

        cmp r3, #0
        beq .Ldo_nothing

        GET_DATA_POINTER(r4, .Lrconst64, lr)

        vld1.64 {rhash}, [r1]
        vld1.64 {rh1}, [r0]

        vrev64.8 rhash, rhash /* byte-swap */
        vld1.64 {rrconst_h}, [r4]
        vext.8 rhash, rhash, rhash, #8

        cmp r3, #4
        blo .Less_than_4

        /* Bulk processing of 4 blocks per loop iteration. */

        ldr r5, [sp, #(4*4)]; /* 5th argument 'gcm_table', past the four pushed registers */
        add r6, r5, #32

        vpush {q4-q7}

        vld1.64 {rh2-rh3}, [r5]
        vld1.64 {rh4}, [r6]

        vld1.64 {rbuf-rbuf1}, [r2]!
        sub r3, r3, #4
        vld1.64 {rbuf2-rbuf3}, [r2]!

        cmp r3, #4
        vrev64.8 rbuf, rbuf /* byte-swap */
        vrev64.8 rbuf1, rbuf1 /* byte-swap */
        vrev64.8 rbuf2, rbuf2 /* byte-swap */
        vrev64.8 rbuf3, rbuf3 /* byte-swap */

        vext.8 rbuf, rbuf, rbuf, #8
        vext.8 rbuf1, rbuf1, rbuf1, #8
        vext.8 rbuf2, rbuf2, rbuf2, #8
        vext.8 rbuf3, rbuf3, rbuf3, #8
        veor rhash, rhash, rbuf /* in0 ^ hash */

        blo .Lend_4

.Loop_4:
        /* (in0 ^ hash) * H⁴ => rr2:rr3 */
        /* (in1) * H³ => rr0:rr1 */
        PMUL_128x128_2(rr0, rr1, rbuf1, rh3, rr2, rr3, rhash, rh4, rt1, rt0, __)

        vld1.64 {rbuf-rbuf1}, [r2]!
        sub r3, r3, #4
        veor rr0, rr0, rr2
        veor rr1, rr1, rr3

        /* (in2) * H² => rr2:rr3 */
        /* (in3) * H¹ => rhash:rbuf3 */
        PMUL_128x128_2(rr2, rr3, rbuf2, rh2, rhash, rbuf3, rbuf3, rh1, rt0, rt1,
                       _(vrev64.8 rbuf, rbuf))

        vld1.64 {rbuf2}, [r2]!

        vrev64.8 rbuf1, rbuf1
        veor rr0, rr0, rr2
        veor rr1, rr1, rr3

        cmp r3, #4
        vext.8 rbuf, rbuf, rbuf, #8
        vext.8 rbuf1, rbuf1, rbuf1, #8

        veor rr0, rr0, rhash
        veor rr1, rr1, rbuf3

        vld1.64 {rbuf3}, [r2]!

        REDUCTION(rhash, rr0, rr1, rrconst_h, rt1,
                  _(vrev64.8 rbuf2, rbuf2;
                    vrev64.8 rbuf3, rbuf3))

        vext.8 rbuf2, rbuf2, rbuf2, #8
        vext.8 rbuf3, rbuf3, rbuf3, #8
        veor rhash, rhash, rbuf /* in0 ^ hash */

        bhs .Loop_4

.Lend_4:
        /* (in0 ^ hash) * H⁴ => rr2:rr3 */
        /* (in1) * H³ => rr0:rr1 */
        PMUL_128x128_2(rr0, rr1, rbuf1, rh3, rr2, rr3, rhash, rh4, rt1, rt0, __)

        /* (in2) * H² => rhash:rbuf */
        /* (in3) * H¹ => rbuf1:rbuf2 */
        PMUL_128x128_2(rhash, rbuf, rbuf2, rh2, rbuf1, rbuf2, rbuf3, rh1, rt0, rt1,
                       _(veor rr0, rr0, rr2;
                         veor rr1, rr1, rr3))

        veor rr0, rr0, rhash
        veor rr1, rr1, rbuf

        veor rr0, rr0, rbuf1
        veor rr1, rr1, rbuf2

        REDUCTION(rhash, rr0, rr1, rrconst_h, rt1,
                  _(CLEAR_REG(rr2);
                    CLEAR_REG(rr3);
                    CLEAR_REG(rbuf1);
                    CLEAR_REG(rbuf2);
                    CLEAR_REG(rbuf3);
                    CLEAR_REG(rh2);
                    CLEAR_REG(rh3);
                    CLEAR_REG(rh4)))

        vpop {q4-q7}

        cmp r3, #0
        beq .Ldone

.Less_than_4:
        /* Handle remaining blocks. */

        vld1.64 {rbuf}, [r2]!
        subs r3, r3, #1

        vrev64.8 rbuf, rbuf /* byte-swap */
        vext.8 rbuf, rbuf, rbuf, #8

        veor rhash, rhash, rbuf

        beq .Lend

.Loop:
        vld1.64 {rbuf}, [r2]!
        subs r3, r3, #1
        PMUL_128x128(rr0, rr1, rhash, rh1, rt0, _(vrev64.8 rbuf, rbuf))
        REDUCTION(rhash, rr0, rr1, rrconst_h, rt0, _(vext.8 rbuf, rbuf, rbuf, #8))
        veor rhash, rhash, rbuf

        bne .Loop

.Lend:
        PMUL_128x128(rr0, rr1, rhash, rh1, rt0, _(CLEAR_REG(rbuf)))
        REDUCTION(rhash, rr0, rr1, rrconst_h, rt0, _(CLEAR_REG(rh1)))

.Ldone:
        CLEAR_REG(rr1)
        vrev64.8 rhash, rhash /* byte-swap */
        CLEAR_REG(rt0)
        CLEAR_REG(rr0)
        vext.8 rhash, rhash, rhash, #8
        CLEAR_REG(rt1)
        vst1.64 {rhash}, [r1]
        CLEAR_REG(rhash)

.Ldo_nothing:
        mov r0, #0
        pop {r4-r6, pc}
.size _gcry_ghash_armv8_ce_pmull,.-_gcry_ghash_armv8_ce_pmull;


/*
 * void _gcry_ghash_setup_armv8_ce_pmull (void *gcm_key, void *gcm_table);
 */
.align 3
.globl _gcry_ghash_setup_armv8_ce_pmull
.type _gcry_ghash_setup_armv8_ce_pmull,%function;
_gcry_ghash_setup_armv8_ce_pmull:
        /* input:
         *    r0: gcm_key
         *    r1: gcm_table
         */

        vpush {q4-q7}

        GET_DATA_POINTER(r2, .Lrconst64, r3)

        vld1.64 {rrconst_h}, [r2]

#define GCM_LSH_1(r_out, ia, ib, const_d, oa, ob, ma) \
        /* H <<< 1 */ \
        vshr.s64 ma, ib, #63; \
        vshr.u64 oa, ib, #63; \
        vshr.u64 ob, ia, #63; \
        vand ma, const_d; \
        vshl.u64 ib, ib, #1; \
        vshl.u64 ia, ia, #1; \
        vorr ob, ib; \
        vorr oa, ia; \
        veor ob, ma; \
        vst1.64 {oa, ob}, [r_out]
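
/* For reference, in C terms GCM_LSH_1 computes roughly the following on the
 * 128-bit value lo:hi (a sketch; 'lo'/'hi' are illustrative names for the
 * ia/ib input lanes):
 *
 *   u64 mask = (u64)((s64)hi >> 63) & 0xc200000000000000ULL;
 *   out_lo = (lo << 1) | (hi >> 63);   // bit 127 rotates into bit 0
 *   out_hi = ((hi << 1) | (lo >> 63)) ^ mask;
 *
 * i.e. a 128-bit rotate left by one, with the reduction constant XORed into
 * the high lane whenever the wrapped-around bit was set.
 */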

        vld1.64 {rhash}, [r0]
        vrev64.8 rhash, rhash /* byte-swap */
        vext.8 rhash, rhash, rhash, #8

        vmov rbuf1, rhash
        GCM_LSH_1(r0, rhash_l, rhash_h, rrconst_h, rh1_l, rh1_h, rt1_l) /* H<<<1 */

        /* H² */
        PMUL_128x128(rr0, rr1, rbuf1, rh1, rt0, __)
        REDUCTION(rh2, rr0, rr1, rrconst_h, rt0, __)
        vmov rhash, rh2
        GCM_LSH_1(r1, rh2_l, rh2_h, rrconst_h, rbuf1_l, rbuf1_h, rt1_l) /* H²<<<1 */
        add r1, r1, #16

        /* H³ */
        PMUL_128x128(rr0, rr1, rhash, rh1, rt1, __)
        REDUCTION(rh3, rr0, rr1, rrconst_h, rt1, __)

        /* H⁴ */
        PMUL_128x128(rr0, rr1, rhash, rbuf1, rt0, __)
        REDUCTION(rh4, rr0, rr1, rrconst_h, rt0, __)

        GCM_LSH_1(r1, rh3_l, rh3_h, rrconst_h, rt0_l, rt0_h, rt1_l) /* H³<<<1 */
        add r1, r1, #16
        GCM_LSH_1(r1, rh4_l, rh4_h, rrconst_h, rt0_l, rt0_h, rt1_l) /* H⁴<<<1 */

        CLEAR_REG(rt0)
        CLEAR_REG(rt1)
        CLEAR_REG(rr1)
        CLEAR_REG(rr0)
        CLEAR_REG(rh1)
        CLEAR_REG(rh2)
        CLEAR_REG(rh3)
        CLEAR_REG(rh4)
        CLEAR_REG(rhash)
        CLEAR_REG(rbuf1)
        CLEAR_REG(rrconst)
        vpop {q4-q7}
        bx lr
.size _gcry_ghash_setup_armv8_ce_pmull,.-_gcry_ghash_setup_armv8_ce_pmull;

#endif
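
/*
 * Usage sketch from the C side (illustrative only; the actual call sites and
 * key/table structures live in libgcrypt's generic GCM code):
 *
 *   //  'key' holds the 16-byte hash subkey H; setup rewrites it as H<<<1
 *   //  and fills 'table' with H²<<<1, H³<<<1 and H⁴<<<1 (16 bytes each,
 *   //  48 bytes total) for the 4-blocks-per-iteration bulk loop.
 *   _gcry_ghash_setup_armv8_ce_pmull (key, table);
 *
 *   //  'buf' must contain nblocks complete 16-byte blocks; 'result' is the
 *   //  running GHASH state, updated in place.  The unsigned int return
 *   //  value is a stack-burn hint and is always 0 here.
 *   _gcry_ghash_armv8_ce_pmull (key, result, buf, nblocks, table);
 */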