/* cipher-gcm-armv8-aarch64-ce.S - ARM/CE accelerated GHASH
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include "asm-common-aarch64.h"

/* Only built for little-endian AArch64 with an assembler that understands
 * the Crypto Extension (PMULL/PMULL2) instructions. */
#if defined(__AARCH64EL__) && \
    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)

.cpu generic+simd+crypto

.text


/* Constants */

/* GCM reduction constant. 0x87 is the low part of the GHASH field
 * polynomial x^128 + x^7 + x^2 + x + 1; it is loaded into both 64-bit
 * lanes of rrconst (via ld1r) for the PMULL-based reduction below. */
.align 4
gcry_gcm_reduction_constant:
.Lrconst:
  .quad 0x87


/* Register macros */

#define rhash v0   /* running GHASH accumulator (bit-reflected) */
#define rr0 v1     /* rr0..rr9: 256-bit products, low:high pairs */
#define rr1 v2
#define rbuf v3    /* rbuf..rbuf5: up to six input blocks */
#define rbuf1 v4
#define rbuf2 v5
#define rbuf3 v6
#define rbuf4 v7
#define rbuf5 v8
#define rr2 v9
#define rr3 v10
#define rr4 v11
#define rr5 v12
#define rr6 v13
#define rr7 v14
#define rr8 v15
#define rr9 v16

#define rrconst v18  /* reduction constant 0x87 in both lanes */
#define rh1 v19      /* rh1..rh6: powers H^1..H^6 of the hash key */
#define rh2 v20
#define rh3 v21
#define rh4 v22
#define rh5 v23
#define rh6 v24
#define t0 v25       /* t0..t5: scratch */
#define t1 v26
#define t2 v27
#define t3 v28
#define t4 v29
#define t5 v30
#define vZZ v31      /* always zero; used as shift-in operand for ext */

/* GHASH macros */

/* See "Gouvêa, C. P. L. & López, J. Implementing GCM on ARMv8. Topics in
 * Cryptology — CT-RSA 2015" for details.
 */

/* Input: 'a' and 'b', Output: 'r0:r1' (low 128-bits in r0, high in r1)
 *
 * Schoolbook 128x128->256-bit carry-less multiply built from four 64x64
 * PMULLs: r0 = a.lo*b.lo, r1 = a.hi*b.hi, and the two cross products
 * (computed against the lane-swapped T0) are summed and split across the
 * middle via ext with vZZ.  'interleave_op' lets the caller schedule an
 * unrelated instruction into the PMULL latency shadow. */
#define PMUL_128x128(r0, r1, a, b, T0, T1, interleave_op) \
  ext T0.16b, b.16b, b.16b, #8; \
  pmull r0.1q, a.1d, b.1d; \
  pmull2 r1.1q, a.2d, b.2d; \
  pmull T1.1q, a.1d, T0.1d; \
  pmull2 T0.1q, a.2d, T0.2d; \
  interleave_op; \
  eor T0.16b, T0.16b, T1.16b; \
  ext T1.16b, vZZ.16b, T0.16b, #8; \
  ext T0.16b, T0.16b, vZZ.16b, #8; \
  eor r0.16b, r0.16b, T1.16b; \
  eor r1.16b, r1.16b, T0.16b;

/* Input: 'aA' and 'bA', Output: 'r0A:r1A' (low 128-bits in r0A, high in r1A)
 * Input: 'aB' and 'bB', Output: 'r0B:r1B' (low 128-bits in r0B, high in r1B)
 * Input: 'aC' and 'bC', Output: 'r0C:r1C' (low 128-bits in r0C, high in r1C)
 *
 * Three independent PMUL_128x128 multiplies with their instructions
 * interleaved so the PMULL pipeline stays busy. */
#define PMUL_128x128_3(r0A, r1A, aA, bA, t0A, t1A, \
                       r0B, r1B, aB, bB, t0B, t1B, \
                       r0C, r1C, aC, bC, t0C, t1C, interleave_op) \
        ext t0A.16b, bA.16b, bA.16b, #8; \
        pmull r0A.1q, aA.1d, bA.1d; \
        pmull2 r1A.1q, aA.2d, bA.2d; \
          ext t0B.16b, bB.16b, bB.16b, #8; \
          pmull r0B.1q, aB.1d, bB.1d; \
          pmull2 r1B.1q, aB.2d, bB.2d; \
            ext t0C.16b, bC.16b, bC.16b, #8; \
            pmull r0C.1q, aC.1d, bC.1d; \
            pmull2 r1C.1q, aC.2d, bC.2d; \
        pmull t1A.1q, aA.1d, t0A.1d; \
        pmull2 t0A.1q, aA.2d, t0A.2d; \
          pmull t1B.1q, aB.1d, t0B.1d; \
          pmull2 t0B.1q, aB.2d, t0B.2d; \
            pmull t1C.1q, aC.1d, t0C.1d; \
            pmull2 t0C.1q, aC.2d, t0C.2d; \
        eor t0A.16b, t0A.16b, t1A.16b; \
          eor t0B.16b, t0B.16b, t1B.16b; \
            eor t0C.16b, t0C.16b, t1C.16b; \
              interleave_op; \
        ext t1A.16b, vZZ.16b, t0A.16b, #8; \
        ext t0A.16b, t0A.16b, vZZ.16b, #8; \
          ext t1B.16b, vZZ.16b, t0B.16b, #8; \
          ext t0B.16b, t0B.16b, vZZ.16b, #8; \
            ext t1C.16b, vZZ.16b, t0C.16b, #8; \
            ext t0C.16b, t0C.16b, vZZ.16b, #8; \
        eor r0A.16b, r0A.16b, t1A.16b; \
        eor r1A.16b, r1A.16b, t0A.16b; \
          eor r0B.16b, r0B.16b, t1B.16b; \
          eor r1B.16b, r1B.16b, t0B.16b; \
            eor r0C.16b, r0C.16b, t1C.16b; \
            eor r1C.16b, r1C.16b, t0C.16b; \

/* Input: 'r0:r1', Output: 'a'
 *
 * Fold the 256-bit product r0:r1 back into a 128-bit field element using
 * two multiplies by the reduction constant.  The three interleave hooks
 * let callers hide loads/rbits/clears inside the PMULL latency. */
#define REDUCTION(a, r0, r1, rconst, T0, T1, interleave_op1, interleave_op2, \
                  interleave_op3) \
        pmull2 T0.1q, r1.2d, rconst.2d; \
        interleave_op1; \
        ext T1.16b, T0.16b, vZZ.16b, #8; \
        ext T0.16b, vZZ.16b, T0.16b, #8; \
        interleave_op2; \
        eor r1.16b, r1.16b, T1.16b; \
        eor r0.16b, r0.16b, T0.16b; \
        pmull T0.1q, r1.1d, rconst.1d; \
        interleave_op3; \
        eor a.16b, r0.16b, T0.16b;

/* Other functional macros */

/* _(...) passes a (possibly multi-statement) instruction sequence into a
 * macro's interleave slot; '__' expands to nothing (empty slot). */
#define _(...) __VA_ARGS__
#define __ _()

/* xor-with-self: wipes a vector register (avoid leaking key/data). */
#define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b;

/* Save/restore d8-d15: the low 64 bits of v8-v15 are callee-saved under
 * AAPCS64, and the 6-block path below uses v8 and v10-v15. */
#define VPUSH_ABI \
  stp d8, d9, [sp, #-16]!; \
  CFI_ADJUST_CFA_OFFSET(16); \
  stp d10, d11, [sp, #-16]!; \
  CFI_ADJUST_CFA_OFFSET(16); \
  stp d12, d13, [sp, #-16]!; \
  CFI_ADJUST_CFA_OFFSET(16); \
  stp d14, d15, [sp, #-16]!; \
  CFI_ADJUST_CFA_OFFSET(16);

#define VPOP_ABI \
  ldp d14, d15, [sp], #16; \
  CFI_ADJUST_CFA_OFFSET(-16); \
  ldp d12, d13, [sp], #16; \
  CFI_ADJUST_CFA_OFFSET(-16); \
  ldp d10, d11, [sp], #16; \
  CFI_ADJUST_CFA_OFFSET(-16); \
  ldp d8, d9, [sp], #16; \
  CFI_ADJUST_CFA_OFFSET(-16);

/*
 * unsigned int _gcry_ghash_armv8_ce_pmull (void *gcm_key, byte *result,
 *                                          const byte *buf, size_t nblocks,
 *                                          void *gcm_table);
 *
 * Absorbs 'nblocks' 16-byte blocks from 'buf' into the GHASH state at
 * 'result'.  Works in the bit-reflected domain (rbit on load/store).
 * Blocks are consumed six at a time using H^1..H^6 (H^2..H^6 from
 * gcm_table), with a single combined reduction per 6 blocks; a 1-block
 * tail loop handles the remainder.  Returns 0.
 */
.align 3
.globl _gcry_ghash_armv8_ce_pmull
ELF(.type  _gcry_ghash_armv8_ce_pmull,%function;)
_gcry_ghash_armv8_ce_pmull:
  /* input:
   *    x0: gcm_key
   *    x1: result/hash
   *    x2: buf
   *    x3: nblocks
   *    x4: gcm_table
   */
  CFI_STARTPROC();

  cbz x3, .Ldo_nothing;

  GET_DATA_POINTER(x5, .Lrconst)

  eor vZZ.16b, vZZ.16b, vZZ.16b
  ld1 {rhash.16b}, [x1]
  ld1 {rh1.16b}, [x0]

  rbit rhash.16b, rhash.16b /* bit-swap */
  ld1r {rrconst.2d}, [x5]

  cmp x3, #6
  b.lo .Less_than_6

  /* 6-block bulk path: load H^2..H^6 from gcm_table. */
  add x6, x4, #64
  VPUSH_ABI

  ld1 {rh2.16b-rh5.16b}, [x4]
  ld1 {rh6.16b}, [x6]

  sub x3, x3, #6

  ld1 {rbuf.16b-rbuf2.16b}, [x2], #(3*16)
  ld1 {rbuf3.16b-rbuf5.16b}, [x2], #(3*16)

  rbit rbuf.16b, rbuf.16b /* bit-swap */
  rbit rbuf1.16b, rbuf1.16b /* bit-swap */
  rbit rbuf2.16b, rbuf2.16b /* bit-swap */
  rbit rbuf3.16b, rbuf3.16b /* bit-swap */
  rbit rbuf4.16b, rbuf4.16b /* bit-swap */
  rbit rbuf5.16b, rbuf5.16b /* bit-swap */
  eor rhash.16b, rhash.16b, rbuf.16b /* fold hash into first block */

  cmp x3, #6
  b.lo .Lend_6

.Loop_6:

  /* hash = ((in0^hash)*H⁶ ^ in1*H⁵ ^ in2*H⁴ ^ in3*H³ ^ in4*H² ^ in5*H¹)
   * mod P — all six products are summed before one reduction; the next
   * six blocks are loaded and bit-swapped inside the multiply latency. */

  /* (in1) * H⁵ => rr0:rr1 */
  /* (in2) * H⁴ => rr2:rr3 */
  /* (in0 ^ hash) * H⁶ => rr4:rr5 */
  PMUL_128x128_3(rr0, rr1, rbuf1, rh5, t0, t1,
                 rr2, rr3, rbuf2, rh4, t2, t3,
                 rr4, rr5, rhash, rh6, t4, t5,
                 _(sub x3, x3, #6))

  ld1 {rbuf.16b-rbuf2.16b}, [x2], #(3*16)
  cmp x3, #6

  eor rr0.16b, rr0.16b, rr2.16b
  eor rr1.16b, rr1.16b, rr3.16b

  /* (in3) * H³ => rr2:rr3 */
  /* (in4) * H² => rr6:rr7 */
  /* (in5) * H¹ => rr8:rr9 */
  PMUL_128x128_3(rr2, rr3, rbuf3, rh3, t0, t1,
                 rr6, rr7, rbuf4, rh2, t2, t3,
                 rr8, rr9, rbuf5, rh1, t4, t5,
                 _(eor rr0.16b, rr0.16b, rr4.16b;
                   eor rr1.16b, rr1.16b, rr5.16b))

  eor rr0.16b, rr0.16b, rr2.16b
  eor rr1.16b, rr1.16b, rr3.16b
  rbit rbuf.16b, rbuf.16b
  eor rr0.16b, rr0.16b, rr6.16b
  eor rr1.16b, rr1.16b, rr7.16b
  rbit rbuf1.16b, rbuf1.16b
  eor rr0.16b, rr0.16b, rr8.16b
  eor rr1.16b, rr1.16b, rr9.16b
  ld1 {rbuf3.16b-rbuf5.16b}, [x2], #(3*16)

  REDUCTION(rhash, rr0, rr1, rrconst, t0, t1,
            _(rbit rbuf2.16b, rbuf2.16b),
            _(rbit rbuf3.16b, rbuf3.16b),
            _(rbit rbuf4.16b, rbuf4.16b))

  rbit rbuf5.16b, rbuf5.16b
  eor rhash.16b, rhash.16b, rbuf.16b /* fold hash into next in0 */

  b.hs .Loop_6 /* x3 >= 6: another full batch pending */

.Lend_6:

  /* Final 6-block batch: same math as .Loop_6, but key powers and
   * intermediates are wiped via the interleave slots as they die. */

  /* (in1) * H⁵ => rr0:rr1 */
  /* (in0 ^ hash) * H⁶ => rr2:rr3 */
  /* (in2) * H⁴ => rr4:rr5 */
  PMUL_128x128_3(rr0, rr1, rbuf1, rh5, t0, t1,
                 rr2, rr3, rhash, rh6, t2, t3,
                 rr4, rr5, rbuf2, rh4, t4, t5,
                 __)
  eor rr0.16b, rr0.16b, rr2.16b
  eor rr1.16b, rr1.16b, rr3.16b
  eor rr0.16b, rr0.16b, rr4.16b
  eor rr1.16b, rr1.16b, rr5.16b

  /* (in3) * H³ => rhash:rbuf */
  /* (in4) * H² => rr6:rr7 */
  /* (in5) * H¹ => rr8:rr9 */
  PMUL_128x128_3(rhash, rbuf, rbuf3, rh3, t0, t1,
                 rr6, rr7, rbuf4, rh2, t2, t3,
                 rr8, rr9, rbuf5, rh1, t4, t5,
                 _(CLEAR_REG(rh4);
                   CLEAR_REG(rh5);
                   CLEAR_REG(rh6)))
  eor rr0.16b, rr0.16b, rhash.16b
  eor rr1.16b, rr1.16b, rbuf.16b
  eor rr0.16b, rr0.16b, rr6.16b
  eor rr1.16b, rr1.16b, rr7.16b
  eor rr0.16b, rr0.16b, rr8.16b
  eor rr1.16b, rr1.16b, rr9.16b

  REDUCTION(rhash, rr0, rr1, rrconst, t0, t1,
            _(CLEAR_REG(rh2);
              CLEAR_REG(rh3);
              CLEAR_REG(rr2);
              CLEAR_REG(rbuf2);
              CLEAR_REG(rbuf3)),
            _(CLEAR_REG(rr3);
              CLEAR_REG(rr4);
              CLEAR_REG(rr5);
              CLEAR_REG(rr6);
              CLEAR_REG(rr7)),
            _(CLEAR_REG(rr8);
              CLEAR_REG(rr9);
              CLEAR_REG(rbuf1);
              CLEAR_REG(rbuf2)))

  CLEAR_REG(rbuf4)
  CLEAR_REG(rbuf5)
  CLEAR_REG(t2)
  CLEAR_REG(t3)
  CLEAR_REG(t4)
  CLEAR_REG(t5)

  VPOP_ABI

  cbz x3, .Ldone

.Less_than_6:
  /* Handle remaining blocks one at a time with H¹ only. */

  ld1 {rbuf.16b}, [x2], #16
  sub x3, x3, #1

  rbit rbuf.16b, rbuf.16b /* bit-swap */

  eor rhash.16b, rhash.16b, rbuf.16b

  cbz x3, .Lend

.Loop:
  /* hash = (hash ^ in) * H¹ mod P; next block load/bit-swap and the
   * counter decrement are hidden inside the multiply/reduction. */
  PMUL_128x128(rr0, rr1, rh1, rhash, t0, t1, _(ld1 {rbuf.16b}, [x2], #16))
  REDUCTION(rhash, rr0, rr1, rrconst, t0, t1,
            _(sub x3, x3, #1),
            _(rbit rbuf.16b, rbuf.16b),
            __)
  eor rhash.16b, rhash.16b, rbuf.16b

  cbnz x3, .Loop

.Lend:
  /* Last block: wipe rbuf and rh1 via the interleave slots. */
  PMUL_128x128(rr0, rr1, rh1, rhash, t0, t1, _(CLEAR_REG(rbuf)))
  REDUCTION(rhash, rr0, rr1, rrconst, t0, t1, __, _(CLEAR_REG(rh1)), __)

.Ldone:
  /* Store updated hash (back in normal bit order) and wipe remaining
   * sensitive registers. */
  CLEAR_REG(rr1)
  CLEAR_REG(rr0)
  rbit rhash.16b, rhash.16b /* bit-swap */
  CLEAR_REG(t0)
  CLEAR_REG(t1)

  st1 {rhash.2d}, [x1]
  CLEAR_REG(rhash)

.Ldo_nothing:
  mov x0, #0 /* return value: 0 */
  ret
  CFI_ENDPROC()
ELF(.size _gcry_ghash_armv8_ce_pmull,.-_gcry_ghash_armv8_ce_pmull;)


/*
 * void _gcry_ghash_setup_armv8_ce_pmull (void *gcm_key, void *gcm_table);
 *
 * Converts the stored hash key H to the bit-reflected domain in place,
 * then precomputes H^2..H^6 into gcm_table for the 6-way bulk path of
 * _gcry_ghash_armv8_ce_pmull.
 */
.align 3
.globl _gcry_ghash_setup_armv8_ce_pmull
ELF(.type  _gcry_ghash_setup_armv8_ce_pmull,%function;)
_gcry_ghash_setup_armv8_ce_pmull:
  /* input:
   *	x0: gcm_key
   *	x1: gcm_table
   */
  CFI_STARTPROC()

  GET_DATA_POINTER(x2, .Lrconst)

  eor vZZ.16b, vZZ.16b, vZZ.16b

  /* H¹: bit-reflect the key once and store it back, so the hash
   * routine can use it directly. */
  ld1 {rh1.16b}, [x0]
  rbit rh1.16b, rh1.16b
  st1 {rh1.16b}, [x0]

  ld1r {rrconst.2d}, [x2]

  /* H² = H¹·H¹ */
  PMUL_128x128(rr0, rr1, rh1, rh1, t0, t1, __)
  REDUCTION(rh2, rr0, rr1, rrconst, t0, t1, __, __, __)

  /* H³ = H²·H¹ */
  PMUL_128x128(rr0, rr1, rh2, rh1, t0, t1, __)
  REDUCTION(rh3, rr0, rr1, rrconst, t0, t1, __, __, __)

  /* H⁴ = H²·H² */
  PMUL_128x128(rr0, rr1, rh2, rh2, t0, t1, __)
  REDUCTION(rh4, rr0, rr1, rrconst, t0, t1, __, __, __)

  /* H⁵ = H²·H³ */
  PMUL_128x128(rr0, rr1, rh2, rh3, t0, t1, __)
  REDUCTION(rh5, rr0, rr1, rrconst, t0, t1, __, __, __)

  /* H⁶ = H³·H³ */
  PMUL_128x128(rr0, rr1, rh3, rh3, t0, t1, __)
  REDUCTION(rh6, rr0, rr1, rrconst, t0, t1, __, __, __)

  /* Table layout matches the reader: H²..H⁴ at [x1], H⁵..H⁶ at [x1+48]. */
  st1 {rh2.16b-rh4.16b}, [x1], #(3*16)
  st1 {rh5.16b-rh6.16b}, [x1]

  ret
  CFI_ENDPROC()
ELF(.size _gcry_ghash_setup_armv8_ce_pmull,.-_gcry_ghash_setup_armv8_ce_pmull;)

#endif