# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
#
# AES-NI-CTR+GHASH stitch.
#
# February 2013
#
# The OpenSSL GCM implementation is organized in such a way that its
# performance is rather close to the sum of its streamed components, in
# this context parallelized AES-NI CTR and modulo-scheduled
# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation was
# observed to perform significantly better than the sum of the components
# on contemporary CPUs, the effort was deemed impossible to justify. This
# module is based on a combination of the Intel submissions [1] and [2],
# with a MOVBE twist suggested by Ilya Albrekht and Max Locktyukhin of
# Intel Corp., who verified that it reduces shuffle pressure with a
# notable relative improvement: 1.0 cycles per byte processed with a
# 128-bit key on Haswell, 0.74 on Broadwell and 0.63 on Skylake.
# [Mentioned results are raw profiled measurements for a favourable
# packet size, one divisible by 96. Applications using the EVP interface
# will observe a few percent worse performance.]
#
# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf

# Generated once from
# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl
# and modified for ICP. Modifications are kept to a bare minimum to ease
# later upstream merges.
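
# For reference, the aesni_gcm_encrypt()/aesni_gcm_decrypt() entry points
# below are assumed to mirror the upstream OpenSSL calling convention; the
# prototypes here are a hedged sketch, the authoritative ICP-side
# declarations live in the C callers, not in this file:
#
#	size_t aesni_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len,
#	    const void *key, uint8_t ivec[16], uint64_t *Xi);
#	size_t aesni_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len,
#	    const void *key, uint8_t ivec[16], uint64_t *Xi);
#
# That is, %rdi/%rsi carry the input/output buffers, %rdx the byte count,
# %rcx the expanded key schedule, %r8 the counter block and %r9 the GHASH
# state. Both return (in %rax) the number of bytes actually processed;
# decrypt bails out for inputs shorter than 0x60 bytes and encrypt for
# inputs shorter than 288 bytes, as checked at the top of each routine.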

#if defined(__x86_64__) && defined(HAVE_AVX) && \
    defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)

.extern gcm_avx_can_use_movbe

.text

#ifdef HAVE_MOVBE
.type	_aesni_ctr32_ghash_6x,@function
.align	32
_aesni_ctr32_ghash_6x:
	vmovdqu	32(%r11),%xmm2
	subq	$6,%rdx
	vpxor	%xmm4,%xmm4,%xmm4
	vmovdqu	0-128(%rcx),%xmm15
	vpaddb	%xmm2,%xmm1,%xmm10
	vpaddb	%xmm2,%xmm10,%xmm11
	vpaddb	%xmm2,%xmm11,%xmm12
	vpaddb	%xmm2,%xmm12,%xmm13
	vpaddb	%xmm2,%xmm13,%xmm14
	vpxor	%xmm15,%xmm1,%xmm9
	vmovdqu	%xmm4,16+8(%rsp)
	jmp	.Loop6x

.align	32
.Loop6x:
	addl	$100663296,%ebx
	jc	.Lhandle_ctr32
	vmovdqu	0-32(%r9),%xmm3
	vpaddb	%xmm2,%xmm14,%xmm1
	vpxor	%xmm15,%xmm10,%xmm10
	vpxor	%xmm15,%xmm11,%xmm11

.Lresume_ctr32:
	vmovdqu	%xmm1,(%r8)
	vpclmulqdq	$0x10,%xmm3,%xmm7,%xmm5
	vpxor	%xmm15,%xmm12,%xmm12
	vmovups	16-128(%rcx),%xmm2
	vpclmulqdq	$0x01,%xmm3,%xmm7,%xmm6
	xorq	%r12,%r12
	cmpq	%r14,%r15

	vaesenc	%xmm2,%xmm9,%xmm9
	vmovdqu	48+8(%rsp),%xmm0
	vpxor	%xmm15,%xmm13,%xmm13
	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm1
	vaesenc	%xmm2,%xmm10,%xmm10
	vpxor	%xmm15,%xmm14,%xmm14
	setnc	%r12b
	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
	vaesenc	%xmm2,%xmm11,%xmm11
	vmovdqu	16-32(%r9),%xmm3
	negq	%r12
	vaesenc	%xmm2,%xmm12,%xmm12
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm3,%xmm0,%xmm5
	vpxor	%xmm4,%xmm8,%xmm8
	vaesenc	%xmm2,%xmm13,%xmm13
	vpxor	%xmm5,%xmm1,%xmm4
	andq	$0x60,%r12
	vmovups	32-128(%rcx),%xmm15
	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm1
	vaesenc	%xmm2,%xmm14,%xmm14

	vpclmulqdq	$0x01,%xmm3,%xmm0,%xmm2
	leaq	(%r14,%r12,1),%r14
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	16+8(%rsp),%xmm8,%xmm8
	vpclmulqdq	$0x11,%xmm3,%xmm0,%xmm3
	vmovdqu	64+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm10,%xmm10
	movbeq	88(%r14),%r13
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	80(%r14),%r12
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,32+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,40+8(%rsp)
	vmovdqu	48-32(%r9),%xmm5
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	48-128(%rcx),%xmm15
	vpxor	%xmm1,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm5,%xmm0,%xmm1
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm2,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm5,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	%xmm3,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm5,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm11,%xmm11
	vpclmulqdq	$0x11,%xmm5,%xmm0,%xmm5
	vmovdqu	80+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	%xmm1,%xmm4,%xmm4
	vmovdqu	64-32(%r9),%xmm1
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	64-128(%rcx),%xmm15
	vpxor	%xmm2,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm1,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm3,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm1,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm10,%xmm10
	movbeq	72(%r14),%r13
	vpxor	%xmm5,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm1,%xmm0,%xmm5
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	64(%r14),%r12
	vpclmulqdq	$0x11,%xmm1,%xmm0,%xmm1
	vmovdqu	96+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,48+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,56+8(%rsp)
	vpxor	%xmm2,%xmm4,%xmm4
	vmovdqu	96-32(%r9),%xmm2
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	80-128(%rcx),%xmm15
	vpxor	%xmm3,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm5
	vaesenc	%xmm15,%xmm10,%xmm10
	movbeq	56(%r14),%r13
	vpxor	%xmm1,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm1
	vpxor	112+8(%rsp),%xmm8,%xmm8
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	48(%r14),%r12
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,64+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,72+8(%rsp)
	vpxor	%xmm3,%xmm4,%xmm4
	vmovdqu	112-32(%r9),%xmm3
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	96-128(%rcx),%xmm15
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm5
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm1,%xmm6,%xmm6
	vpclmulqdq	$0x01,%xmm3,%xmm8,%xmm1
	vaesenc	%xmm15,%xmm10,%xmm10
	movbeq	40(%r14),%r13
	vpxor	%xmm2,%xmm7,%xmm7
	vpclmulqdq	$0x00,%xmm3,%xmm8,%xmm2
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	32(%r14),%r12
	vpclmulqdq	$0x11,%xmm3,%xmm8,%xmm8
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,80+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,88+8(%rsp)
	vpxor	%xmm5,%xmm6,%xmm6
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	%xmm1,%xmm6,%xmm6

	vmovups	112-128(%rcx),%xmm15
	vpslldq	$8,%xmm6,%xmm5
	vpxor	%xmm2,%xmm4,%xmm4
	vmovdqu	16(%r11),%xmm3

	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm8,%xmm7,%xmm7
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	%xmm5,%xmm4,%xmm4
	movbeq	24(%r14),%r13
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	16(%r14),%r12
	vpalignr	$8,%xmm4,%xmm4,%xmm0
	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
	movq	%r13,96+8(%rsp)
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r12,104+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	vmovups	128-128(%rcx),%xmm1
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	144-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm10,%xmm10
	vpsrldq	$8,%xmm6,%xmm6
	vaesenc	%xmm1,%xmm11,%xmm11
	vpxor	%xmm6,%xmm7,%xmm7
	vaesenc	%xmm1,%xmm12,%xmm12
	vpxor	%xmm0,%xmm4,%xmm4
	movbeq	8(%r14),%r13
	vaesenc	%xmm1,%xmm13,%xmm13
	movbeq	0(%r14),%r12
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	160-128(%rcx),%xmm1
	cmpl	$12,%ebp	// ICP uses 10,12,14 not 9,11,13 for rounds.
	jb	.Lenc_tail

	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vaesenc	%xmm1,%xmm10,%xmm10
	vaesenc	%xmm1,%xmm11,%xmm11
	vaesenc	%xmm1,%xmm12,%xmm12
	vaesenc	%xmm1,%xmm13,%xmm13
	vmovups	176-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	192-128(%rcx),%xmm1
	cmpl	$14,%ebp	// ICP does not zero key schedule.
	jb	.Lenc_tail

	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vaesenc	%xmm1,%xmm10,%xmm10
	vaesenc	%xmm1,%xmm11,%xmm11
	vaesenc	%xmm1,%xmm12,%xmm12
	vaesenc	%xmm1,%xmm13,%xmm13
	vmovups	208-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	224-128(%rcx),%xmm1
	jmp	.Lenc_tail

.align	32
.Lhandle_ctr32:
	vmovdqu	(%r11),%xmm0
	vpshufb	%xmm0,%xmm1,%xmm6
	vmovdqu	48(%r11),%xmm5
	vpaddd	64(%r11),%xmm6,%xmm10
	vpaddd	%xmm5,%xmm6,%xmm11
	vmovdqu	0-32(%r9),%xmm3
	vpaddd	%xmm5,%xmm10,%xmm12
	vpshufb	%xmm0,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm11,%xmm13
	vpshufb	%xmm0,%xmm11,%xmm11
	vpxor	%xmm15,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm12,%xmm14
	vpshufb	%xmm0,%xmm12,%xmm12
	vpxor	%xmm15,%xmm11,%xmm11
	vpaddd	%xmm5,%xmm13,%xmm1
	vpshufb	%xmm0,%xmm13,%xmm13
	vpshufb	%xmm0,%xmm14,%xmm14
	vpshufb	%xmm0,%xmm1,%xmm1
	jmp	.Lresume_ctr32

.align	32
.Lenc_tail:
	vaesenc	%xmm15,%xmm9,%xmm9
	vmovdqu	%xmm7,16+8(%rsp)
	vpalignr	$8,%xmm4,%xmm4,%xmm8
	vaesenc	%xmm15,%xmm10,%xmm10
	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
	vpxor	0(%rdi),%xmm1,%xmm2
	vaesenc	%xmm15,%xmm11,%xmm11
	vpxor	16(%rdi),%xmm1,%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	vpxor	32(%rdi),%xmm1,%xmm5
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	48(%rdi),%xmm1,%xmm6
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	64(%rdi),%xmm1,%xmm7
	vpxor	80(%rdi),%xmm1,%xmm3
	vmovdqu	(%r8),%xmm1

	vaesenclast	%xmm2,%xmm9,%xmm9
	vmovdqu	32(%r11),%xmm2
	vaesenclast	%xmm0,%xmm10,%xmm10
	vpaddb	%xmm2,%xmm1,%xmm0
	movq	%r13,112+8(%rsp)
	leaq	96(%rdi),%rdi
	vaesenclast	%xmm5,%xmm11,%xmm11
	vpaddb	%xmm2,%xmm0,%xmm5
	movq	%r12,120+8(%rsp)
	leaq	96(%rsi),%rsi
	vmovdqu	0-128(%rcx),%xmm15
	vaesenclast	%xmm6,%xmm12,%xmm12
	vpaddb	%xmm2,%xmm5,%xmm6
	vaesenclast	%xmm7,%xmm13,%xmm13
	vpaddb	%xmm2,%xmm6,%xmm7
	vaesenclast	%xmm3,%xmm14,%xmm14
	vpaddb	%xmm2,%xmm7,%xmm3

	addq	$0x60,%r10
	subq	$0x6,%rdx
	jc	.L6x_done

	vmovups	%xmm9,-96(%rsi)
	vpxor	%xmm15,%xmm1,%xmm9
	vmovups	%xmm10,-80(%rsi)
	vmovdqa	%xmm0,%xmm10
	vmovups	%xmm11,-64(%rsi)
	vmovdqa	%xmm5,%xmm11
	vmovups	%xmm12,-48(%rsi)
	vmovdqa	%xmm6,%xmm12
	vmovups	%xmm13,-32(%rsi)
	vmovdqa	%xmm7,%xmm13
	vmovups	%xmm14,-16(%rsi)
	vmovdqa	%xmm3,%xmm14
	vmovdqu	32+8(%rsp),%xmm7
	jmp	.Loop6x

.L6x_done:
	vpxor	16+8(%rsp),%xmm8,%xmm8
	vpxor	%xmm4,%xmm8,%xmm8

	.byte	0xf3,0xc3
.size	_aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
#endif /* ifdef HAVE_MOVBE */

.type	_aesni_ctr32_ghash_no_movbe_6x,@function
.align	32
_aesni_ctr32_ghash_no_movbe_6x:
	vmovdqu	32(%r11),%xmm2
	subq	$6,%rdx
	vpxor	%xmm4,%xmm4,%xmm4
	vmovdqu	0-128(%rcx),%xmm15
	vpaddb	%xmm2,%xmm1,%xmm10
	vpaddb	%xmm2,%xmm10,%xmm11
	vpaddb	%xmm2,%xmm11,%xmm12
	vpaddb	%xmm2,%xmm12,%xmm13
	vpaddb	%xmm2,%xmm13,%xmm14
	vpxor	%xmm15,%xmm1,%xmm9
	vmovdqu	%xmm4,16+8(%rsp)
	jmp	.Loop6x_nmb

.align	32
.Loop6x_nmb:
	addl	$100663296,%ebx
	jc	.Lhandle_ctr32_nmb
	vmovdqu	0-32(%r9),%xmm3
	vpaddb	%xmm2,%xmm14,%xmm1
	vpxor	%xmm15,%xmm10,%xmm10
	vpxor	%xmm15,%xmm11,%xmm11

.Lresume_ctr32_nmb:
	vmovdqu	%xmm1,(%r8)
	vpclmulqdq	$0x10,%xmm3,%xmm7,%xmm5
	vpxor	%xmm15,%xmm12,%xmm12
	vmovups	16-128(%rcx),%xmm2
	vpclmulqdq	$0x01,%xmm3,%xmm7,%xmm6
	xorq	%r12,%r12
	cmpq	%r14,%r15

	vaesenc	%xmm2,%xmm9,%xmm9
	vmovdqu	48+8(%rsp),%xmm0
	vpxor	%xmm15,%xmm13,%xmm13
	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm1
	vaesenc	%xmm2,%xmm10,%xmm10
	vpxor	%xmm15,%xmm14,%xmm14
	setnc	%r12b
	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
	vaesenc	%xmm2,%xmm11,%xmm11
	vmovdqu	16-32(%r9),%xmm3
	negq	%r12
	vaesenc	%xmm2,%xmm12,%xmm12
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm3,%xmm0,%xmm5
	vpxor	%xmm4,%xmm8,%xmm8
	vaesenc	%xmm2,%xmm13,%xmm13
	vpxor	%xmm5,%xmm1,%xmm4
	andq	$0x60,%r12
	vmovups	32-128(%rcx),%xmm15
	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm1
	vaesenc	%xmm2,%xmm14,%xmm14

	vpclmulqdq	$0x01,%xmm3,%xmm0,%xmm2
	leaq	(%r14,%r12,1),%r14
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	16+8(%rsp),%xmm8,%xmm8
	vpclmulqdq	$0x11,%xmm3,%xmm0,%xmm3
	vmovdqu	64+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm10,%xmm10
	movq	88(%r14),%r13
	bswapq	%r13
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	80(%r14),%r12
	bswapq	%r12
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,32+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,40+8(%rsp)
	vmovdqu	48-32(%r9),%xmm5
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	48-128(%rcx),%xmm15
	vpxor	%xmm1,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm5,%xmm0,%xmm1
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm2,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm5,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	%xmm3,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm5,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm11,%xmm11
	vpclmulqdq	$0x11,%xmm5,%xmm0,%xmm5
	vmovdqu	80+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	%xmm1,%xmm4,%xmm4
	vmovdqu	64-32(%r9),%xmm1
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	64-128(%rcx),%xmm15
	vpxor	%xmm2,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm1,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm3,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm1,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm10,%xmm10
	movq	72(%r14),%r13
	bswapq	%r13
	vpxor	%xmm5,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm1,%xmm0,%xmm5
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	64(%r14),%r12
	bswapq	%r12
	vpclmulqdq	$0x11,%xmm1,%xmm0,%xmm1
	vmovdqu	96+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,48+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,56+8(%rsp)
	vpxor	%xmm2,%xmm4,%xmm4
	vmovdqu	96-32(%r9),%xmm2
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	80-128(%rcx),%xmm15
	vpxor	%xmm3,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm5
	vaesenc	%xmm15,%xmm10,%xmm10
	movq	56(%r14),%r13
	bswapq	%r13
	vpxor	%xmm1,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm1
	vpxor	112+8(%rsp),%xmm8,%xmm8
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	48(%r14),%r12
	bswapq	%r12
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,64+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,72+8(%rsp)
	vpxor	%xmm3,%xmm4,%xmm4
	vmovdqu	112-32(%r9),%xmm3
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	96-128(%rcx),%xmm15
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm5
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm1,%xmm6,%xmm6
	vpclmulqdq	$0x01,%xmm3,%xmm8,%xmm1
	vaesenc	%xmm15,%xmm10,%xmm10
	movq	40(%r14),%r13
	bswapq	%r13
	vpxor	%xmm2,%xmm7,%xmm7
	vpclmulqdq	$0x00,%xmm3,%xmm8,%xmm2
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	32(%r14),%r12
	bswapq	%r12
	vpclmulqdq	$0x11,%xmm3,%xmm8,%xmm8
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,80+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,88+8(%rsp)
	vpxor	%xmm5,%xmm6,%xmm6
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	%xmm1,%xmm6,%xmm6

	vmovups	112-128(%rcx),%xmm15
	vpslldq	$8,%xmm6,%xmm5
	vpxor	%xmm2,%xmm4,%xmm4
	vmovdqu	16(%r11),%xmm3

	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm8,%xmm7,%xmm7
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	%xmm5,%xmm4,%xmm4
	movq	24(%r14),%r13
	bswapq	%r13
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	16(%r14),%r12
	bswapq	%r12
	vpalignr	$8,%xmm4,%xmm4,%xmm0
	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
	movq	%r13,96+8(%rsp)
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r12,104+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	vmovups	128-128(%rcx),%xmm1
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	144-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm10,%xmm10
	vpsrldq	$8,%xmm6,%xmm6
	vaesenc	%xmm1,%xmm11,%xmm11
	vpxor	%xmm6,%xmm7,%xmm7
	vaesenc	%xmm1,%xmm12,%xmm12
	vpxor	%xmm0,%xmm4,%xmm4
	movq	8(%r14),%r13
	bswapq	%r13
	vaesenc	%xmm1,%xmm13,%xmm13
	movq	0(%r14),%r12
	bswapq	%r12
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	160-128(%rcx),%xmm1
	cmpl	$12,%ebp	// ICP uses 10,12,14 not 9,11,13 for rounds.
	jb	.Lenc_tail_nmb

	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vaesenc	%xmm1,%xmm10,%xmm10
	vaesenc	%xmm1,%xmm11,%xmm11
	vaesenc	%xmm1,%xmm12,%xmm12
	vaesenc	%xmm1,%xmm13,%xmm13
	vmovups	176-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	192-128(%rcx),%xmm1
	cmpl	$14,%ebp	// ICP does not zero key schedule.
	jb	.Lenc_tail_nmb

	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vaesenc	%xmm1,%xmm10,%xmm10
	vaesenc	%xmm1,%xmm11,%xmm11
	vaesenc	%xmm1,%xmm12,%xmm12
	vaesenc	%xmm1,%xmm13,%xmm13
	vmovups	208-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	224-128(%rcx),%xmm1
	jmp	.Lenc_tail_nmb

.align	32
.Lhandle_ctr32_nmb:
	vmovdqu	(%r11),%xmm0
	vpshufb	%xmm0,%xmm1,%xmm6
	vmovdqu	48(%r11),%xmm5
	vpaddd	64(%r11),%xmm6,%xmm10
	vpaddd	%xmm5,%xmm6,%xmm11
	vmovdqu	0-32(%r9),%xmm3
	vpaddd	%xmm5,%xmm10,%xmm12
	vpshufb	%xmm0,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm11,%xmm13
	vpshufb	%xmm0,%xmm11,%xmm11
	vpxor	%xmm15,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm12,%xmm14
	vpshufb	%xmm0,%xmm12,%xmm12
	vpxor	%xmm15,%xmm11,%xmm11
	vpaddd	%xmm5,%xmm13,%xmm1
	vpshufb	%xmm0,%xmm13,%xmm13
	vpshufb	%xmm0,%xmm14,%xmm14
	vpshufb	%xmm0,%xmm1,%xmm1
	jmp	.Lresume_ctr32_nmb

.align	32
.Lenc_tail_nmb:
	vaesenc	%xmm15,%xmm9,%xmm9
	vmovdqu	%xmm7,16+8(%rsp)
	vpalignr	$8,%xmm4,%xmm4,%xmm8
	vaesenc	%xmm15,%xmm10,%xmm10
	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
	vpxor	0(%rdi),%xmm1,%xmm2
	vaesenc	%xmm15,%xmm11,%xmm11
	vpxor	16(%rdi),%xmm1,%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	vpxor	32(%rdi),%xmm1,%xmm5
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	48(%rdi),%xmm1,%xmm6
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	64(%rdi),%xmm1,%xmm7
	vpxor	80(%rdi),%xmm1,%xmm3
	vmovdqu	(%r8),%xmm1

	vaesenclast	%xmm2,%xmm9,%xmm9
	vmovdqu	32(%r11),%xmm2
	vaesenclast	%xmm0,%xmm10,%xmm10
	vpaddb	%xmm2,%xmm1,%xmm0
	movq	%r13,112+8(%rsp)
	leaq	96(%rdi),%rdi
	vaesenclast	%xmm5,%xmm11,%xmm11
	vpaddb	%xmm2,%xmm0,%xmm5
	movq	%r12,120+8(%rsp)
	leaq	96(%rsi),%rsi
	vmovdqu	0-128(%rcx),%xmm15
	vaesenclast	%xmm6,%xmm12,%xmm12
	vpaddb	%xmm2,%xmm5,%xmm6
	vaesenclast	%xmm7,%xmm13,%xmm13
	vpaddb	%xmm2,%xmm6,%xmm7
	vaesenclast	%xmm3,%xmm14,%xmm14
	vpaddb	%xmm2,%xmm7,%xmm3

	addq	$0x60,%r10
	subq	$0x6,%rdx
	jc	.L6x_done_nmb

	vmovups	%xmm9,-96(%rsi)
	vpxor	%xmm15,%xmm1,%xmm9
	vmovups	%xmm10,-80(%rsi)
	vmovdqa	%xmm0,%xmm10
	vmovups	%xmm11,-64(%rsi)
	vmovdqa	%xmm5,%xmm11
	vmovups	%xmm12,-48(%rsi)
	vmovdqa	%xmm6,%xmm12
	vmovups	%xmm13,-32(%rsi)
	vmovdqa	%xmm7,%xmm13
	vmovups	%xmm14,-16(%rsi)
	vmovdqa	%xmm3,%xmm14
	vmovdqu	32+8(%rsp),%xmm7
	jmp	.Loop6x_nmb

.L6x_done_nmb:
	vpxor	16+8(%rsp),%xmm8,%xmm8
	vpxor	%xmm4,%xmm8,%xmm8

	.byte	0xf3,0xc3
.size	_aesni_ctr32_ghash_no_movbe_6x,.-_aesni_ctr32_ghash_no_movbe_6x

.globl	aesni_gcm_decrypt
.type	aesni_gcm_decrypt,@function
.align	32
aesni_gcm_decrypt:
.cfi_startproc
	xorq	%r10,%r10
	cmpq	$0x60,%rdx
	jb	.Lgcm_dec_abort

	leaq	(%rsp),%rax
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
	vzeroupper

	vmovdqu	(%r8),%xmm1
	addq	$-128,%rsp
	movl	12(%r8),%ebx
	leaq	.Lbswap_mask(%rip),%r11
	leaq	-128(%rcx),%r14
	movq	$0xf80,%r15
	vmovdqu	(%r9),%xmm8
	andq	$-128,%rsp
	vmovdqu	(%r11),%xmm0
	leaq	128(%rcx),%rcx
	leaq	32+32(%r9),%r9
	movl	504-128(%rcx),%ebp	// ICP has a larger offset for rounds.
	vpshufb	%xmm0,%xmm8,%xmm8

	andq	%r15,%r14
	andq	%rsp,%r15
	subq	%r14,%r15
	jc	.Ldec_no_key_aliasing
	cmpq	$768,%r15
	jnc	.Ldec_no_key_aliasing
	subq	%r15,%rsp
.Ldec_no_key_aliasing:

	vmovdqu	80(%rdi),%xmm7
	leaq	(%rdi),%r14
	vmovdqu	64(%rdi),%xmm4
	leaq	-192(%rdi,%rdx,1),%r15
	vmovdqu	48(%rdi),%xmm5
	shrq	$4,%rdx
	xorq	%r10,%r10
	vmovdqu	32(%rdi),%xmm6
	vpshufb	%xmm0,%xmm7,%xmm7
	vmovdqu	16(%rdi),%xmm2
	vpshufb	%xmm0,%xmm4,%xmm4
	vmovdqu	(%rdi),%xmm3
	vpshufb	%xmm0,%xmm5,%xmm5
	vmovdqu	%xmm4,48(%rsp)
	vpshufb	%xmm0,%xmm6,%xmm6
	vmovdqu	%xmm5,64(%rsp)
	vpshufb	%xmm0,%xmm2,%xmm2
	vmovdqu	%xmm6,80(%rsp)
	vpshufb	%xmm0,%xmm3,%xmm3
	vmovdqu	%xmm2,96(%rsp)
	vmovdqu	%xmm3,112(%rsp)

#ifdef HAVE_MOVBE
#ifdef _KERNEL
	testl	$1,gcm_avx_can_use_movbe(%rip)
#else
	testl	$1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
#endif
	jz	1f
	call	_aesni_ctr32_ghash_6x
	jmp	2f
1:
#endif
	call	_aesni_ctr32_ghash_no_movbe_6x
2:
	vmovups	%xmm9,-96(%rsi)
	vmovups	%xmm10,-80(%rsi)
	vmovups	%xmm11,-64(%rsi)
	vmovups	%xmm12,-48(%rsi)
	vmovups	%xmm13,-32(%rsi)
	vmovups	%xmm14,-16(%rsi)

	vpshufb	(%r11),%xmm8,%xmm8
	vmovdqu	%xmm8,-64(%r9)

	vzeroupper
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lgcm_dec_abort:
	movq	%r10,%rax
	.byte	0xf3,0xc3
.cfi_endproc
.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
.type	_aesni_ctr32_6x,@function
.align	32
_aesni_ctr32_6x:
	vmovdqu	0-128(%rcx),%xmm4
	vmovdqu	32(%r11),%xmm2
	leaq	-2(%rbp),%r13	// ICP uses 10,12,14 not 9,11,13 for rounds.
	vmovups	16-128(%rcx),%xmm15
	leaq	32-128(%rcx),%r12
	vpxor	%xmm4,%xmm1,%xmm9
	addl	$100663296,%ebx
	jc	.Lhandle_ctr32_2
	vpaddb	%xmm2,%xmm1,%xmm10
	vpaddb	%xmm2,%xmm10,%xmm11
	vpxor	%xmm4,%xmm10,%xmm10
	vpaddb	%xmm2,%xmm11,%xmm12
	vpxor	%xmm4,%xmm11,%xmm11
	vpaddb	%xmm2,%xmm12,%xmm13
	vpxor	%xmm4,%xmm12,%xmm12
	vpaddb	%xmm2,%xmm13,%xmm14
	vpxor	%xmm4,%xmm13,%xmm13
	vpaddb	%xmm2,%xmm14,%xmm1
	vpxor	%xmm4,%xmm14,%xmm14
	jmp	.Loop_ctr32

.align	16
.Loop_ctr32:
	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14
	vmovups	(%r12),%xmm15
	leaq	16(%r12),%r12
	decl	%r13d
	jnz	.Loop_ctr32

	vmovdqu	(%r12),%xmm3
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	0(%rdi),%xmm3,%xmm4
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	16(%rdi),%xmm3,%xmm5
	vaesenc	%xmm15,%xmm11,%xmm11
	vpxor	32(%rdi),%xmm3,%xmm6
	vaesenc	%xmm15,%xmm12,%xmm12
	vpxor	48(%rdi),%xmm3,%xmm8
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	64(%rdi),%xmm3,%xmm2
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	80(%rdi),%xmm3,%xmm3
	leaq	96(%rdi),%rdi

	vaesenclast	%xmm4,%xmm9,%xmm9
	vaesenclast	%xmm5,%xmm10,%xmm10
	vaesenclast	%xmm6,%xmm11,%xmm11
	vaesenclast	%xmm8,%xmm12,%xmm12
	vaesenclast	%xmm2,%xmm13,%xmm13
	vaesenclast	%xmm3,%xmm14,%xmm14
	vmovups	%xmm9,0(%rsi)
	vmovups	%xmm10,16(%rsi)
	vmovups	%xmm11,32(%rsi)
	vmovups	%xmm12,48(%rsi)
	vmovups	%xmm13,64(%rsi)
	vmovups	%xmm14,80(%rsi)
	leaq	96(%rsi),%rsi

	.byte	0xf3,0xc3
.align	32
.Lhandle_ctr32_2:
	vpshufb	%xmm0,%xmm1,%xmm6
	vmovdqu	48(%r11),%xmm5
	vpaddd	64(%r11),%xmm6,%xmm10
	vpaddd	%xmm5,%xmm6,%xmm11
	vpaddd	%xmm5,%xmm10,%xmm12
	vpshufb	%xmm0,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm11,%xmm13
	vpshufb	%xmm0,%xmm11,%xmm11
	vpxor	%xmm4,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm12,%xmm14
	vpshufb	%xmm0,%xmm12,%xmm12
	vpxor	%xmm4,%xmm11,%xmm11
	vpaddd	%xmm5,%xmm13,%xmm1
	vpshufb	%xmm0,%xmm13,%xmm13
	vpxor	%xmm4,%xmm12,%xmm12
	vpshufb	%xmm0,%xmm14,%xmm14
	vpxor	%xmm4,%xmm13,%xmm13
	vpshufb	%xmm0,%xmm1,%xmm1
	vpxor	%xmm4,%xmm14,%xmm14
	jmp	.Loop_ctr32
.size	_aesni_ctr32_6x,.-_aesni_ctr32_6x

.globl	aesni_gcm_encrypt
.type	aesni_gcm_encrypt,@function
.align	32
aesni_gcm_encrypt:
.cfi_startproc
	xorq	%r10,%r10
	cmpq	$288,%rdx
	jb	.Lgcm_enc_abort

	leaq	(%rsp),%rax
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
	vzeroupper

	vmovdqu	(%r8),%xmm1
	addq	$-128,%rsp
	movl	12(%r8),%ebx
	leaq	.Lbswap_mask(%rip),%r11
	leaq	-128(%rcx),%r14
	movq	$0xf80,%r15
	leaq	128(%rcx),%rcx
	vmovdqu	(%r11),%xmm0
	andq	$-128,%rsp
	movl	504-128(%rcx),%ebp	// ICP has a larger offset for rounds.

	andq	%r15,%r14
	andq	%rsp,%r15
	subq	%r14,%r15
	jc	.Lenc_no_key_aliasing
	cmpq	$768,%r15
	jnc	.Lenc_no_key_aliasing
	subq	%r15,%rsp
.Lenc_no_key_aliasing:

	leaq	(%rsi),%r14
	leaq	-192(%rsi,%rdx,1),%r15
	shrq	$4,%rdx

	call	_aesni_ctr32_6x
	vpshufb	%xmm0,%xmm9,%xmm8
	vpshufb	%xmm0,%xmm10,%xmm2
	vmovdqu	%xmm8,112(%rsp)
	vpshufb	%xmm0,%xmm11,%xmm4
	vmovdqu	%xmm2,96(%rsp)
	vpshufb	%xmm0,%xmm12,%xmm5
	vmovdqu	%xmm4,80(%rsp)
	vpshufb	%xmm0,%xmm13,%xmm6
	vmovdqu	%xmm5,64(%rsp)
	vpshufb	%xmm0,%xmm14,%xmm7
	vmovdqu	%xmm6,48(%rsp)

	call	_aesni_ctr32_6x

	vmovdqu	(%r9),%xmm8
	leaq	32+32(%r9),%r9
	subq	$12,%rdx
	movq	$192,%r10
	vpshufb	%xmm0,%xmm8,%xmm8

#ifdef HAVE_MOVBE
#ifdef _KERNEL
	testl	$1,gcm_avx_can_use_movbe(%rip)
#else
	testl	$1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
#endif
	jz	1f
	call	_aesni_ctr32_ghash_6x
	jmp	2f
1:
#endif
	call	_aesni_ctr32_ghash_no_movbe_6x
2:
	vmovdqu	32(%rsp),%xmm7
	vmovdqu	(%r11),%xmm0
	vmovdqu	0-32(%r9),%xmm3
	vpunpckhqdq	%xmm7,%xmm7,%xmm1
	vmovdqu	32-32(%r9),%xmm15
	vmovups	%xmm9,-96(%rsi)
	vpshufb	%xmm0,%xmm9,%xmm9
	vpxor	%xmm7,%xmm1,%xmm1
	vmovups	%xmm10,-80(%rsi)
	vpshufb	%xmm0,%xmm10,%xmm10
	vmovups	%xmm11,-64(%rsi)
	vpshufb	%xmm0,%xmm11,%xmm11
	vmovups	%xmm12,-48(%rsi)
	vpshufb	%xmm0,%xmm12,%xmm12
	vmovups	%xmm13,-32(%rsi)
	vpshufb	%xmm0,%xmm13,%xmm13
	vmovups	%xmm14,-16(%rsi)
	vpshufb	%xmm0,%xmm14,%xmm14
	vmovdqu	%xmm9,16(%rsp)
	vmovdqu	48(%rsp),%xmm6
	vmovdqu	16-32(%r9),%xmm0
	vpunpckhqdq	%xmm6,%xmm6,%xmm2
	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm5
	vpxor	%xmm6,%xmm2,%xmm2
	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1

	vmovdqu	64(%rsp),%xmm9
	vpclmulqdq	$0x00,%xmm0,%xmm6,%xmm4
	vmovdqu	48-32(%r9),%xmm3
	vpxor	%xmm5,%xmm4,%xmm4
	vpunpckhqdq	%xmm9,%xmm9,%xmm5
	vpclmulqdq	$0x11,%xmm0,%xmm6,%xmm6
	vpxor	%xmm9,%xmm5,%xmm5
	vpxor	%xmm7,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
	vmovdqu	80-32(%r9),%xmm15
	vpxor	%xmm1,%xmm2,%xmm2

	vmovdqu	80(%rsp),%xmm1
	vpclmulqdq	$0x00,%xmm3,%xmm9,%xmm7
	vmovdqu	64-32(%r9),%xmm0
	vpxor	%xmm4,%xmm7,%xmm7
	vpunpckhqdq	%xmm1,%xmm1,%xmm4
	vpclmulqdq	$0x11,%xmm3,%xmm9,%xmm9
	vpxor	%xmm1,%xmm4,%xmm4
	vpxor	%xmm6,%xmm9,%xmm9
	vpclmulqdq	$0x00,%xmm15,%xmm5,%xmm5
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	96(%rsp),%xmm2
	vpclmulqdq	$0x00,%xmm0,%xmm1,%xmm6
	vmovdqu	96-32(%r9),%xmm3
	vpxor	%xmm7,%xmm6,%xmm6
	vpunpckhqdq	%xmm2,%xmm2,%xmm7
	vpclmulqdq	$0x11,%xmm0,%xmm1,%xmm1
	vpxor	%xmm2,%xmm7,%xmm7
	vpxor	%xmm9,%xmm1,%xmm1
	vpclmulqdq	$0x10,%xmm15,%xmm4,%xmm4
	vmovdqu	128-32(%r9),%xmm15
	vpxor	%xmm5,%xmm4,%xmm4

	vpxor	112(%rsp),%xmm8,%xmm8
	vpclmulqdq	$0x00,%xmm3,%xmm2,%xmm5
	vmovdqu	112-32(%r9),%xmm0
	vpunpckhqdq	%xmm8,%xmm8,%xmm9
	vpxor	%xmm6,%xmm5,%xmm5
	vpclmulqdq	$0x11,%xmm3,%xmm2,%xmm2
	vpxor	%xmm8,%xmm9,%xmm9
	vpxor	%xmm1,%xmm2,%xmm2
	vpclmulqdq	$0x00,%xmm15,%xmm7,%xmm7
	vpxor	%xmm4,%xmm7,%xmm4

	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm6
	vmovdqu	0-32(%r9),%xmm3
	vpunpckhqdq	%xmm14,%xmm14,%xmm1
	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm8
	vpxor	%xmm14,%xmm1,%xmm1
	vpxor	%xmm5,%xmm6,%xmm5
	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm9
	vmovdqu	32-32(%r9),%xmm15
	vpxor	%xmm2,%xmm8,%xmm7
	vpxor	%xmm4,%xmm9,%xmm6

	vmovdqu	16-32(%r9),%xmm0
	vpxor	%xmm5,%xmm7,%xmm9
	vpclmulqdq	$0x00,%xmm3,%xmm14,%xmm4
	vpxor	%xmm9,%xmm6,%xmm6
	vpunpckhqdq	%xmm13,%xmm13,%xmm2
	vpclmulqdq	$0x11,%xmm3,%xmm14,%xmm14
	vpxor	%xmm13,%xmm2,%xmm2
	vpslldq	$8,%xmm6,%xmm9
	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1
	vpxor	%xmm9,%xmm5,%xmm8
	vpsrldq	$8,%xmm6,%xmm6
	vpxor	%xmm6,%xmm7,%xmm7

	vpclmulqdq	$0x00,%xmm0,%xmm13,%xmm5
	vmovdqu	48-32(%r9),%xmm3
	vpxor	%xmm4,%xmm5,%xmm5
	vpunpckhqdq	%xmm12,%xmm12,%xmm9
	vpclmulqdq	$0x11,%xmm0,%xmm13,%xmm13
	vpxor	%xmm12,%xmm9,%xmm9
	vpxor	%xmm14,%xmm13,%xmm13
	vpalignr	$8,%xmm8,%xmm8,%xmm14
	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
	vmovdqu	80-32(%r9),%xmm15
	vpxor	%xmm1,%xmm2,%xmm2

	vpclmulqdq	$0x00,%xmm3,%xmm12,%xmm4
	vmovdqu	64-32(%r9),%xmm0
	vpxor	%xmm5,%xmm4,%xmm4
	vpunpckhqdq	%xmm11,%xmm11,%xmm1
	vpclmulqdq	$0x11,%xmm3,%xmm12,%xmm12
	vpxor	%xmm11,%xmm1,%xmm1
	vpxor	%xmm13,%xmm12,%xmm12
	vxorps	16(%rsp),%xmm7,%xmm7
	vpclmulqdq	$0x00,%xmm15,%xmm9,%xmm9
	vpxor	%xmm2,%xmm9,%xmm9

	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
	vxorps	%xmm14,%xmm8,%xmm8

	vpclmulqdq	$0x00,%xmm0,%xmm11,%xmm5
	vmovdqu	96-32(%r9),%xmm3
	vpxor	%xmm4,%xmm5,%xmm5
	vpunpckhqdq	%xmm10,%xmm10,%xmm2
	vpclmulqdq	$0x11,%xmm0,%xmm11,%xmm11
	vpxor	%xmm10,%xmm2,%xmm2
	vpalignr	$8,%xmm8,%xmm8,%xmm14
	vpxor	%xmm12,%xmm11,%xmm11
	vpclmulqdq	$0x10,%xmm15,%xmm1,%xmm1
	vmovdqu	128-32(%r9),%xmm15
	vpxor	%xmm9,%xmm1,%xmm1

	vxorps	%xmm7,%xmm14,%xmm14
	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
	vxorps	%xmm14,%xmm8,%xmm8

	vpclmulqdq	$0x00,%xmm3,%xmm10,%xmm4
	vmovdqu	112-32(%r9),%xmm0
	vpxor	%xmm5,%xmm4,%xmm4
	vpunpckhqdq	%xmm8,%xmm8,%xmm9
	vpclmulqdq	$0x11,%xmm3,%xmm10,%xmm10
	vpxor	%xmm8,%xmm9,%xmm9
	vpxor	%xmm11,%xmm10,%xmm10
	vpclmulqdq	$0x00,%xmm15,%xmm2,%xmm2
	vpxor	%xmm1,%xmm2,%xmm2

	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm5
	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm7
	vpxor	%xmm4,%xmm5,%xmm5
	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm6
	vpxor	%xmm10,%xmm7,%xmm7
	vpxor	%xmm2,%xmm6,%xmm6

	vpxor	%xmm5,%xmm7,%xmm4
	vpxor	%xmm4,%xmm6,%xmm6
	vpslldq	$8,%xmm6,%xmm1
	vmovdqu	16(%r11),%xmm3
	vpsrldq	$8,%xmm6,%xmm6
	vpxor	%xmm1,%xmm5,%xmm8
	vpxor	%xmm6,%xmm7,%xmm7

	vpalignr	$8,%xmm8,%xmm8,%xmm2
	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
	vpxor	%xmm2,%xmm8,%xmm8

	vpalignr	$8,%xmm8,%xmm8,%xmm2
	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
	vpxor	%xmm7,%xmm2,%xmm2
	vpxor	%xmm2,%xmm8,%xmm8
	vpshufb	(%r11),%xmm8,%xmm8
	vmovdqu	%xmm8,-64(%r9)

	vzeroupper
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lgcm_enc_abort:
	movq	%r10,%rax
	.byte	0xf3,0xc3
.cfi_endproc
.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt

/* Some utility routines */

/*
 * clear all fpu registers
 * void clear_fpu_regs_avx(void);
 */
.globl	clear_fpu_regs_avx
.type	clear_fpu_regs_avx,@function
.align	32
clear_fpu_regs_avx:
	vzeroall
	ret
.size	clear_fpu_regs_avx,.-clear_fpu_regs_avx
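
/*
 * Illustrative sketch only, not part of this file: callers are expected
 * to bracket the FPU-touching helpers in this section (clear_fpu_regs_avx
 * above, gcm_xor_avx below) with the kernel FPU save/restore primitives,
 * e.g. the kfpu_begin()/kfpu_end() macros used elsewhere in the ICP. The
 * macro names are an assumption for this example, not symbols defined here.
 *
 *	uint8_t tag[16], block[16];
 *
 *	kfpu_begin();			// save FPU/SIMD state in-kernel
 *	gcm_xor_avx(block, tag);	// tag ^= block, unaligned 128-bit XOR
 *	clear_fpu_regs_avx();		// scrub key material from SIMD regs
 *	kfpu_end();			// restore FPU/SIMD state
 */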

/*
 * void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
 *
 * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
 * stores the result at `dst'. The XOR is performed using FPU registers,
 * so make sure FPU state is saved when running this in the kernel.
 */
.globl	gcm_xor_avx
.type	gcm_xor_avx,@function
.align	32
gcm_xor_avx:
	movdqu	(%rdi), %xmm0
	movdqu	(%rsi), %xmm1
	pxor	%xmm1, %xmm0
	movdqu	%xmm0, (%rsi)
	ret
.size	gcm_xor_avx,.-gcm_xor_avx

/*
 * Toggle a boolean_t value atomically and return the new value.
 * boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
 */
.globl	atomic_toggle_boolean_nv
.type	atomic_toggle_boolean_nv,@function
.align	32
atomic_toggle_boolean_nv:
	xorl	%eax, %eax
	lock
	xorl	$1, (%rdi)
	jz	1f
	movl	$1, %eax
1:
	ret
.size	atomic_toggle_boolean_nv,.-atomic_toggle_boolean_nv

.align	64
.Lbswap_mask:
.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lpoly:
.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.Lone_msb:
.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Ltwo_lsb:
.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.Lone_lsb:
.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.byte	65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	64

/* Mark the stack non-executable. */
#if defined(__linux__) && defined(__ELF__)
.section	.note.GNU-stack,"",%progbits
#endif

#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */