# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
#
# AES-NI-CTR+GHASH stitch.
#
# February 2013
#
# OpenSSL GCM implementation is organized in such way that its
# performance is rather close to the sum of its streamed components,
# in the context parallelized AES-NI CTR and modulo-scheduled
# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
# was observed to perform significantly better than the sum of the
# components on contemporary CPUs, the effort was deemed impossible to
# justify. This module is based on combination of Intel submissions,
# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
# Locktyukhin of Intel Corp. who verified that it reduces shuffles
# pressure with notable relative improvement, achieving 1.0 cycle per
# byte processed with 128-bit key on Haswell processor, 0.74 - on
# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
# measurements for favourable packet size, one divisible by 96.
# Applications using the EVP interface will observe a few percent
# worse performance.]
#
# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf

# Generated once from
# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl
# and modified for ICP. Modifications are kept to a bare minimum to ease later
# upstream merges.

#if defined(__x86_64__) && defined(HAVE_AVX) && \
    defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)

#define _ASM
#include <sys/asm_linkage.h>

/* Windows userland links with OpenSSL */
#if !defined (_WIN32) || defined (_KERNEL)

.extern gcm_avx_can_use_movbe

.text

#ifdef HAVE_MOVBE
.balign 32
FUNCTION(_aesni_ctr32_ghash_6x)
.cfi_startproc
	ENDBR
	vmovdqu 32(%r11),%xmm2
	subq $6,%rdx
	vpxor %xmm4,%xmm4,%xmm4
	vmovdqu 0-128(%rcx),%xmm15
	vpaddb %xmm2,%xmm1,%xmm10
	vpaddb %xmm2,%xmm10,%xmm11
	vpaddb %xmm2,%xmm11,%xmm12
	vpaddb %xmm2,%xmm12,%xmm13
	vpaddb %xmm2,%xmm13,%xmm14
	vpxor %xmm15,%xmm1,%xmm9
	vmovdqu %xmm4,16+8(%rsp)
	jmp .Loop6x

.balign 32
.Loop6x:
	addl $100663296,%ebx
	jc .Lhandle_ctr32
	vmovdqu 0-32(%r9),%xmm3
	vpaddb %xmm2,%xmm14,%xmm1
	vpxor %xmm15,%xmm10,%xmm10
	vpxor %xmm15,%xmm11,%xmm11

.Lresume_ctr32:
	vmovdqu %xmm1,(%r8)
	vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
	vpxor %xmm15,%xmm12,%xmm12
	vmovups 16-128(%rcx),%xmm2
	vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
	xorq %r12,%r12
	cmpq %r14,%r15

	vaesenc %xmm2,%xmm9,%xmm9
	vmovdqu 48+8(%rsp),%xmm0
	vpxor %xmm15,%xmm13,%xmm13
	vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
	vaesenc %xmm2,%xmm10,%xmm10
	vpxor %xmm15,%xmm14,%xmm14
	setnc %r12b
	vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
	vaesenc %xmm2,%xmm11,%xmm11
	vmovdqu 16-32(%r9),%xmm3
	negq %r12
	vaesenc %xmm2,%xmm12,%xmm12
	vpxor %xmm5,%xmm6,%xmm6
	vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
	vpxor %xmm4,%xmm8,%xmm8
	vaesenc %xmm2,%xmm13,%xmm13
	vpxor %xmm5,%xmm1,%xmm4
	andq $0x60,%r12
	vmovups 32-128(%rcx),%xmm15
	vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
	vaesenc %xmm2,%xmm14,%xmm14

	vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
	leaq (%r14,%r12,1),%r14
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor 16+8(%rsp),%xmm8,%xmm8
	vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
	vmovdqu 64+8(%rsp),%xmm0
	vaesenc %xmm15,%xmm10,%xmm10
	movbeq 88(%r14),%r13
	vaesenc %xmm15,%xmm11,%xmm11
	movbeq 80(%r14),%r12
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r13,32+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	movq %r12,40+8(%rsp)
	vmovdqu 48-32(%r9),%xmm5
	vaesenc %xmm15,%xmm14,%xmm14

	vmovups 48-128(%rcx),%xmm15
	vpxor %xmm1,%xmm6,%xmm6
	vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm2,%xmm6,%xmm6
	vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
	vaesenc %xmm15,%xmm10,%xmm10
	vpxor %xmm3,%xmm7,%xmm7
	vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
	vaesenc %xmm15,%xmm11,%xmm11
	vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
	vmovdqu 80+8(%rsp),%xmm0
	vaesenc %xmm15,%xmm12,%xmm12
	vaesenc %xmm15,%xmm13,%xmm13
	vpxor %xmm1,%xmm4,%xmm4
	vmovdqu 64-32(%r9),%xmm1
	vaesenc %xmm15,%xmm14,%xmm14

	vmovups 64-128(%rcx),%xmm15
	vpxor %xmm2,%xmm6,%xmm6
	vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm3,%xmm6,%xmm6
	vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
	vaesenc %xmm15,%xmm10,%xmm10
	movbeq 72(%r14),%r13
	vpxor %xmm5,%xmm7,%xmm7
	vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
	vaesenc %xmm15,%xmm11,%xmm11
	movbeq 64(%r14),%r12
	vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
	vmovdqu 96+8(%rsp),%xmm0
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r13,48+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	movq %r12,56+8(%rsp)
	vpxor %xmm2,%xmm4,%xmm4
	vmovdqu 96-32(%r9),%xmm2
	vaesenc %xmm15,%xmm14,%xmm14

	vmovups 80-128(%rcx),%xmm15
	vpxor %xmm3,%xmm6,%xmm6
	vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm5,%xmm6,%xmm6
	vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
	vaesenc %xmm15,%xmm10,%xmm10
	movbeq 56(%r14),%r13
	vpxor %xmm1,%xmm7,%xmm7
	vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
	vpxor 112+8(%rsp),%xmm8,%xmm8
	vaesenc %xmm15,%xmm11,%xmm11
	movbeq 48(%r14),%r12
	vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r13,64+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	movq %r12,72+8(%rsp)
	vpxor %xmm3,%xmm4,%xmm4
	vmovdqu 112-32(%r9),%xmm3
	vaesenc %xmm15,%xmm14,%xmm14

	vmovups 96-128(%rcx),%xmm15
	vpxor %xmm5,%xmm6,%xmm6
	vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm1,%xmm6,%xmm6
	vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
	vaesenc %xmm15,%xmm10,%xmm10
	movbeq 40(%r14),%r13
	vpxor %xmm2,%xmm7,%xmm7
	vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
	vaesenc %xmm15,%xmm11,%xmm11
	movbeq 32(%r14),%r12
	vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r13,80+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	movq %r12,88+8(%rsp)
	vpxor %xmm5,%xmm6,%xmm6
	vaesenc %xmm15,%xmm14,%xmm14
	vpxor %xmm1,%xmm6,%xmm6

	vmovups 112-128(%rcx),%xmm15
	vpslldq $8,%xmm6,%xmm5
	vpxor %xmm2,%xmm4,%xmm4
	vmovdqu 16(%r11),%xmm3

	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm8,%xmm7,%xmm7
	vaesenc %xmm15,%xmm10,%xmm10
	vpxor %xmm5,%xmm4,%xmm4
	movbeq 24(%r14),%r13
	vaesenc %xmm15,%xmm11,%xmm11
	movbeq 16(%r14),%r12
	vpalignr $8,%xmm4,%xmm4,%xmm0
	vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
	movq %r13,96+8(%rsp)
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r12,104+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	vmovups 128-128(%rcx),%xmm1
	vaesenc %xmm15,%xmm14,%xmm14

	vaesenc %xmm1,%xmm9,%xmm9
	vmovups 144-128(%rcx),%xmm15
	vaesenc %xmm1,%xmm10,%xmm10
	vpsrldq $8,%xmm6,%xmm6
	vaesenc %xmm1,%xmm11,%xmm11
	vpxor %xmm6,%xmm7,%xmm7
	vaesenc %xmm1,%xmm12,%xmm12
	vpxor %xmm0,%xmm4,%xmm4
	movbeq 8(%r14),%r13
	vaesenc %xmm1,%xmm13,%xmm13
	movbeq 0(%r14),%r12
	vaesenc %xmm1,%xmm14,%xmm14
	vmovups 160-128(%rcx),%xmm1
	cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds.
	jb .Lenc_tail

	vaesenc %xmm15,%xmm9,%xmm9
	vaesenc %xmm15,%xmm10,%xmm10
	vaesenc %xmm15,%xmm11,%xmm11
	vaesenc %xmm15,%xmm12,%xmm12
	vaesenc %xmm15,%xmm13,%xmm13
	vaesenc %xmm15,%xmm14,%xmm14

	vaesenc %xmm1,%xmm9,%xmm9
	vaesenc %xmm1,%xmm10,%xmm10
	vaesenc %xmm1,%xmm11,%xmm11
	vaesenc %xmm1,%xmm12,%xmm12
	vaesenc %xmm1,%xmm13,%xmm13
	vmovups 176-128(%rcx),%xmm15
	vaesenc %xmm1,%xmm14,%xmm14
	vmovups 192-128(%rcx),%xmm1
	cmpl $14,%ebp // ICP does not zero key schedule.
	jb .Lenc_tail

	vaesenc %xmm15,%xmm9,%xmm9
	vaesenc %xmm15,%xmm10,%xmm10
	vaesenc %xmm15,%xmm11,%xmm11
	vaesenc %xmm15,%xmm12,%xmm12
	vaesenc %xmm15,%xmm13,%xmm13
	vaesenc %xmm15,%xmm14,%xmm14

	vaesenc %xmm1,%xmm9,%xmm9
	vaesenc %xmm1,%xmm10,%xmm10
	vaesenc %xmm1,%xmm11,%xmm11
	vaesenc %xmm1,%xmm12,%xmm12
	vaesenc %xmm1,%xmm13,%xmm13
	vmovups 208-128(%rcx),%xmm15
	vaesenc %xmm1,%xmm14,%xmm14
	vmovups 224-128(%rcx),%xmm1
	jmp .Lenc_tail

.balign 32
.Lhandle_ctr32:
	vmovdqu (%r11),%xmm0
	vpshufb %xmm0,%xmm1,%xmm6
	vmovdqu 48(%r11),%xmm5
	vpaddd 64(%r11),%xmm6,%xmm10
	vpaddd %xmm5,%xmm6,%xmm11
	vmovdqu 0-32(%r9),%xmm3
	vpaddd %xmm5,%xmm10,%xmm12
	vpshufb %xmm0,%xmm10,%xmm10
	vpaddd %xmm5,%xmm11,%xmm13
	vpshufb %xmm0,%xmm11,%xmm11
	vpxor %xmm15,%xmm10,%xmm10
	vpaddd %xmm5,%xmm12,%xmm14
	vpshufb %xmm0,%xmm12,%xmm12
	vpxor %xmm15,%xmm11,%xmm11
	vpaddd %xmm5,%xmm13,%xmm1
	vpshufb %xmm0,%xmm13,%xmm13
	vpshufb %xmm0,%xmm14,%xmm14
	vpshufb %xmm0,%xmm1,%xmm1
	jmp .Lresume_ctr32

.balign 32
.Lenc_tail:
	vaesenc %xmm15,%xmm9,%xmm9
	vmovdqu %xmm7,16+8(%rsp)
	vpalignr $8,%xmm4,%xmm4,%xmm8
	vaesenc %xmm15,%xmm10,%xmm10
	vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
	vpxor 0(%rdi),%xmm1,%xmm2
	vaesenc %xmm15,%xmm11,%xmm11
	vpxor 16(%rdi),%xmm1,%xmm0
	vaesenc %xmm15,%xmm12,%xmm12
	vpxor 32(%rdi),%xmm1,%xmm5
	vaesenc %xmm15,%xmm13,%xmm13
	vpxor 48(%rdi),%xmm1,%xmm6
	vaesenc %xmm15,%xmm14,%xmm14
	vpxor 64(%rdi),%xmm1,%xmm7
	vpxor 80(%rdi),%xmm1,%xmm3
	vmovdqu (%r8),%xmm1

	vaesenclast %xmm2,%xmm9,%xmm9
	vmovdqu 32(%r11),%xmm2
	vaesenclast %xmm0,%xmm10,%xmm10
	vpaddb %xmm2,%xmm1,%xmm0
	movq %r13,112+8(%rsp)
	leaq 96(%rdi),%rdi
	vaesenclast %xmm5,%xmm11,%xmm11
	vpaddb %xmm2,%xmm0,%xmm5
	movq %r12,120+8(%rsp)
	leaq 96(%rsi),%rsi
	vmovdqu 0-128(%rcx),%xmm15
	vaesenclast %xmm6,%xmm12,%xmm12
	vpaddb %xmm2,%xmm5,%xmm6
	vaesenclast %xmm7,%xmm13,%xmm13
	vpaddb %xmm2,%xmm6,%xmm7
	vaesenclast %xmm3,%xmm14,%xmm14
	vpaddb %xmm2,%xmm7,%xmm3

	addq $0x60,%r10
	subq $0x6,%rdx
	jc .L6x_done

	vmovups %xmm9,-96(%rsi)
	vpxor %xmm15,%xmm1,%xmm9
	vmovups %xmm10,-80(%rsi)
	vmovdqa %xmm0,%xmm10
	vmovups %xmm11,-64(%rsi)
	vmovdqa %xmm5,%xmm11
	vmovups %xmm12,-48(%rsi)
	vmovdqa %xmm6,%xmm12
	vmovups %xmm13,-32(%rsi)
	vmovdqa %xmm7,%xmm13
	vmovups %xmm14,-16(%rsi)
	vmovdqa %xmm3,%xmm14
	vmovdqu 32+8(%rsp),%xmm7
	jmp .Loop6x

.L6x_done:
	vpxor 16+8(%rsp),%xmm8,%xmm8
	vpxor %xmm4,%xmm8,%xmm8

	RET
.cfi_endproc
SET_SIZE(_aesni_ctr32_ghash_6x)
#endif /* ifdef HAVE_MOVBE */

.balign 32
FUNCTION(_aesni_ctr32_ghash_no_movbe_6x)
.cfi_startproc
	ENDBR
	vmovdqu 32(%r11),%xmm2
	subq $6,%rdx
	vpxor %xmm4,%xmm4,%xmm4
	vmovdqu 0-128(%rcx),%xmm15
	vpaddb %xmm2,%xmm1,%xmm10
	vpaddb %xmm2,%xmm10,%xmm11
	vpaddb %xmm2,%xmm11,%xmm12
	vpaddb %xmm2,%xmm12,%xmm13
	vpaddb %xmm2,%xmm13,%xmm14
	vpxor %xmm15,%xmm1,%xmm9
	vmovdqu %xmm4,16+8(%rsp)
	jmp .Loop6x_nmb

.balign 32
.Loop6x_nmb:
	addl $100663296,%ebx
	jc .Lhandle_ctr32_nmb
	vmovdqu 0-32(%r9),%xmm3
	vpaddb %xmm2,%xmm14,%xmm1
	vpxor %xmm15,%xmm10,%xmm10
	vpxor %xmm15,%xmm11,%xmm11

.Lresume_ctr32_nmb:
	vmovdqu %xmm1,(%r8)
	vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
	vpxor %xmm15,%xmm12,%xmm12
	vmovups 16-128(%rcx),%xmm2
	vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
	xorq %r12,%r12
	cmpq %r14,%r15

	vaesenc %xmm2,%xmm9,%xmm9
	vmovdqu 48+8(%rsp),%xmm0
	vpxor %xmm15,%xmm13,%xmm13
	vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
	vaesenc %xmm2,%xmm10,%xmm10
	vpxor %xmm15,%xmm14,%xmm14
	setnc %r12b
	vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
	vaesenc %xmm2,%xmm11,%xmm11
	vmovdqu 16-32(%r9),%xmm3
	negq %r12
	vaesenc %xmm2,%xmm12,%xmm12
	vpxor %xmm5,%xmm6,%xmm6
	vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
	vpxor %xmm4,%xmm8,%xmm8
	vaesenc %xmm2,%xmm13,%xmm13
	vpxor %xmm5,%xmm1,%xmm4
	andq $0x60,%r12
	vmovups 32-128(%rcx),%xmm15
	vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
	vaesenc %xmm2,%xmm14,%xmm14

	vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
	leaq (%r14,%r12,1),%r14
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor 16+8(%rsp),%xmm8,%xmm8
	vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
	vmovdqu 64+8(%rsp),%xmm0
	vaesenc %xmm15,%xmm10,%xmm10
	movq 88(%r14),%r13
	bswapq %r13
	vaesenc %xmm15,%xmm11,%xmm11
	movq 80(%r14),%r12
	bswapq %r12
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r13,32+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	movq %r12,40+8(%rsp)
	vmovdqu 48-32(%r9),%xmm5
	vaesenc %xmm15,%xmm14,%xmm14

	vmovups 48-128(%rcx),%xmm15
	vpxor %xmm1,%xmm6,%xmm6
	vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm2,%xmm6,%xmm6
	vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
	vaesenc %xmm15,%xmm10,%xmm10
	vpxor %xmm3,%xmm7,%xmm7
	vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
	vaesenc %xmm15,%xmm11,%xmm11
	vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
	vmovdqu 80+8(%rsp),%xmm0
	vaesenc %xmm15,%xmm12,%xmm12
	vaesenc %xmm15,%xmm13,%xmm13
	vpxor %xmm1,%xmm4,%xmm4
	vmovdqu 64-32(%r9),%xmm1
	vaesenc %xmm15,%xmm14,%xmm14

	vmovups 64-128(%rcx),%xmm15
	vpxor %xmm2,%xmm6,%xmm6
	vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm3,%xmm6,%xmm6
	vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
	vaesenc %xmm15,%xmm10,%xmm10
	movq 72(%r14),%r13
	bswapq %r13
	vpxor %xmm5,%xmm7,%xmm7
	vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
	vaesenc %xmm15,%xmm11,%xmm11
	movq 64(%r14),%r12
	bswapq %r12
	vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
	vmovdqu 96+8(%rsp),%xmm0
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r13,48+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	movq %r12,56+8(%rsp)
	vpxor %xmm2,%xmm4,%xmm4
	vmovdqu 96-32(%r9),%xmm2
	vaesenc %xmm15,%xmm14,%xmm14

	vmovups 80-128(%rcx),%xmm15
	vpxor %xmm3,%xmm6,%xmm6
	vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm5,%xmm6,%xmm6
	vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
	vaesenc %xmm15,%xmm10,%xmm10
	movq 56(%r14),%r13
	bswapq %r13
	vpxor %xmm1,%xmm7,%xmm7
	vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
	vpxor 112+8(%rsp),%xmm8,%xmm8
	vaesenc %xmm15,%xmm11,%xmm11
	movq 48(%r14),%r12
	bswapq %r12
	vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r13,64+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	movq %r12,72+8(%rsp)
	vpxor %xmm3,%xmm4,%xmm4
	vmovdqu 112-32(%r9),%xmm3
	vaesenc %xmm15,%xmm14,%xmm14

	vmovups 96-128(%rcx),%xmm15
	vpxor %xmm5,%xmm6,%xmm6
	vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm1,%xmm6,%xmm6
	vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
	vaesenc %xmm15,%xmm10,%xmm10
	movq 40(%r14),%r13
	bswapq %r13
	vpxor %xmm2,%xmm7,%xmm7
	vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
	vaesenc %xmm15,%xmm11,%xmm11
	movq 32(%r14),%r12
	bswapq %r12
	vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r13,80+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	movq %r12,88+8(%rsp)
	vpxor %xmm5,%xmm6,%xmm6
	vaesenc %xmm15,%xmm14,%xmm14
	vpxor %xmm1,%xmm6,%xmm6

	vmovups 112-128(%rcx),%xmm15
	vpslldq $8,%xmm6,%xmm5
	vpxor %xmm2,%xmm4,%xmm4
	vmovdqu 16(%r11),%xmm3

	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm8,%xmm7,%xmm7
	vaesenc %xmm15,%xmm10,%xmm10
	vpxor %xmm5,%xmm4,%xmm4
	movq 24(%r14),%r13
	bswapq %r13
	vaesenc %xmm15,%xmm11,%xmm11
	movq 16(%r14),%r12
	bswapq %r12
	vpalignr $8,%xmm4,%xmm4,%xmm0
	vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
	movq %r13,96+8(%rsp)
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r12,104+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	vmovups 128-128(%rcx),%xmm1
	vaesenc %xmm15,%xmm14,%xmm14

	vaesenc %xmm1,%xmm9,%xmm9
	vmovups 144-128(%rcx),%xmm15
	vaesenc %xmm1,%xmm10,%xmm10
	vpsrldq $8,%xmm6,%xmm6
	vaesenc %xmm1,%xmm11,%xmm11
	vpxor %xmm6,%xmm7,%xmm7
	vaesenc %xmm1,%xmm12,%xmm12
	vpxor %xmm0,%xmm4,%xmm4
	movq 8(%r14),%r13
	bswapq %r13
	vaesenc %xmm1,%xmm13,%xmm13
	movq 0(%r14),%r12
	bswapq %r12
	vaesenc %xmm1,%xmm14,%xmm14
	vmovups 160-128(%rcx),%xmm1
	cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds.
	jb .Lenc_tail_nmb

	vaesenc %xmm15,%xmm9,%xmm9
	vaesenc %xmm15,%xmm10,%xmm10
	vaesenc %xmm15,%xmm11,%xmm11
	vaesenc %xmm15,%xmm12,%xmm12
	vaesenc %xmm15,%xmm13,%xmm13
	vaesenc %xmm15,%xmm14,%xmm14

	vaesenc %xmm1,%xmm9,%xmm9
	vaesenc %xmm1,%xmm10,%xmm10
	vaesenc %xmm1,%xmm11,%xmm11
	vaesenc %xmm1,%xmm12,%xmm12
	vaesenc %xmm1,%xmm13,%xmm13
	vmovups 176-128(%rcx),%xmm15
	vaesenc %xmm1,%xmm14,%xmm14
	vmovups 192-128(%rcx),%xmm1
	cmpl $14,%ebp // ICP does not zero key schedule.
	jb .Lenc_tail_nmb

	vaesenc %xmm15,%xmm9,%xmm9
	vaesenc %xmm15,%xmm10,%xmm10
	vaesenc %xmm15,%xmm11,%xmm11
	vaesenc %xmm15,%xmm12,%xmm12
	vaesenc %xmm15,%xmm13,%xmm13
	vaesenc %xmm15,%xmm14,%xmm14

	vaesenc %xmm1,%xmm9,%xmm9
	vaesenc %xmm1,%xmm10,%xmm10
	vaesenc %xmm1,%xmm11,%xmm11
	vaesenc %xmm1,%xmm12,%xmm12
	vaesenc %xmm1,%xmm13,%xmm13
	vmovups 208-128(%rcx),%xmm15
	vaesenc %xmm1,%xmm14,%xmm14
	vmovups 224-128(%rcx),%xmm1
	jmp .Lenc_tail_nmb

.balign 32
.Lhandle_ctr32_nmb:
	vmovdqu (%r11),%xmm0
	vpshufb %xmm0,%xmm1,%xmm6
	vmovdqu 48(%r11),%xmm5
	vpaddd 64(%r11),%xmm6,%xmm10
	vpaddd %xmm5,%xmm6,%xmm11
	vmovdqu 0-32(%r9),%xmm3
	vpaddd %xmm5,%xmm10,%xmm12
	vpshufb %xmm0,%xmm10,%xmm10
	vpaddd %xmm5,%xmm11,%xmm13
	vpshufb %xmm0,%xmm11,%xmm11
	vpxor %xmm15,%xmm10,%xmm10
	vpaddd %xmm5,%xmm12,%xmm14
	vpshufb %xmm0,%xmm12,%xmm12
	vpxor %xmm15,%xmm11,%xmm11
	vpaddd %xmm5,%xmm13,%xmm1
	vpshufb %xmm0,%xmm13,%xmm13
	vpshufb %xmm0,%xmm14,%xmm14
	vpshufb %xmm0,%xmm1,%xmm1
	jmp .Lresume_ctr32_nmb

.balign 32
.Lenc_tail_nmb:
	vaesenc %xmm15,%xmm9,%xmm9
	vmovdqu %xmm7,16+8(%rsp)
	vpalignr $8,%xmm4,%xmm4,%xmm8
	vaesenc %xmm15,%xmm10,%xmm10
	vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
	vpxor 0(%rdi),%xmm1,%xmm2
	vaesenc %xmm15,%xmm11,%xmm11
	vpxor 16(%rdi),%xmm1,%xmm0
	vaesenc %xmm15,%xmm12,%xmm12
	vpxor 32(%rdi),%xmm1,%xmm5
	vaesenc %xmm15,%xmm13,%xmm13
	vpxor 48(%rdi),%xmm1,%xmm6
	vaesenc %xmm15,%xmm14,%xmm14
	vpxor 64(%rdi),%xmm1,%xmm7
	vpxor 80(%rdi),%xmm1,%xmm3
	vmovdqu (%r8),%xmm1

	vaesenclast %xmm2,%xmm9,%xmm9
	vmovdqu 32(%r11),%xmm2
	vaesenclast %xmm0,%xmm10,%xmm10
	vpaddb %xmm2,%xmm1,%xmm0
	movq %r13,112+8(%rsp)
	leaq 96(%rdi),%rdi
	vaesenclast %xmm5,%xmm11,%xmm11
	vpaddb %xmm2,%xmm0,%xmm5
	movq %r12,120+8(%rsp)
	leaq 96(%rsi),%rsi
	vmovdqu 0-128(%rcx),%xmm15
	vaesenclast %xmm6,%xmm12,%xmm12
	vpaddb %xmm2,%xmm5,%xmm6
	vaesenclast %xmm7,%xmm13,%xmm13
	vpaddb %xmm2,%xmm6,%xmm7
	vaesenclast %xmm3,%xmm14,%xmm14
	vpaddb %xmm2,%xmm7,%xmm3

	addq $0x60,%r10
	subq $0x6,%rdx
	jc .L6x_done_nmb

	vmovups %xmm9,-96(%rsi)
	vpxor %xmm15,%xmm1,%xmm9
	vmovups %xmm10,-80(%rsi)
	vmovdqa %xmm0,%xmm10
	vmovups %xmm11,-64(%rsi)
	vmovdqa %xmm5,%xmm11
	vmovups %xmm12,-48(%rsi)
	vmovdqa %xmm6,%xmm12
	vmovups %xmm13,-32(%rsi)
	vmovdqa %xmm7,%xmm13
	vmovups %xmm14,-16(%rsi)
	vmovdqa %xmm3,%xmm14
	vmovdqu 32+8(%rsp),%xmm7
	jmp .Loop6x_nmb

.L6x_done_nmb:
	vpxor 16+8(%rsp),%xmm8,%xmm8
	vpxor %xmm4,%xmm8,%xmm8

	RET
.cfi_endproc
SET_SIZE(_aesni_ctr32_ghash_no_movbe_6x)

ENTRY_ALIGN(aesni_gcm_decrypt, 32)
.cfi_startproc
	ENDBR
	xorq %r10,%r10
	cmpq $0x60,%rdx
	jb .Lgcm_dec_abort

	leaq (%rsp),%rax
.cfi_def_cfa_register %rax
	pushq %rbx
.cfi_offset %rbx,-16
	pushq %rbp
.cfi_offset %rbp,-24
	pushq %r12
.cfi_offset %r12,-32
	pushq %r13
.cfi_offset %r13,-40
	pushq %r14
.cfi_offset %r14,-48
	pushq %r15
.cfi_offset %r15,-56
	pushq %r9
.cfi_offset %r9,-64
	vzeroupper

	vmovdqu (%r8),%xmm1
	addq $-128,%rsp
	movl 12(%r8),%ebx
	leaq .Lbswap_mask(%rip),%r11
	leaq -128(%rcx),%r14
	movq $0xf80,%r15
	vmovdqu (%r9),%xmm8
	andq $-128,%rsp
	vmovdqu (%r11),%xmm0
	leaq 128(%rcx),%rcx
	movq 32(%r9),%r9
	leaq 32(%r9),%r9
	movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds.
	vpshufb %xmm0,%xmm8,%xmm8

	andq %r15,%r14
	andq %rsp,%r15
	subq %r14,%r15
	jc .Ldec_no_key_aliasing
	cmpq $768,%r15
	jnc .Ldec_no_key_aliasing
	subq %r15,%rsp
.Ldec_no_key_aliasing:

	vmovdqu 80(%rdi),%xmm7
	leaq (%rdi),%r14
	vmovdqu 64(%rdi),%xmm4
	leaq -192(%rdi,%rdx,1),%r15
	vmovdqu 48(%rdi),%xmm5
	shrq $4,%rdx
	xorq %r10,%r10
	vmovdqu 32(%rdi),%xmm6
	vpshufb %xmm0,%xmm7,%xmm7
	vmovdqu 16(%rdi),%xmm2
	vpshufb %xmm0,%xmm4,%xmm4
	vmovdqu (%rdi),%xmm3
	vpshufb %xmm0,%xmm5,%xmm5
	vmovdqu %xmm4,48(%rsp)
	vpshufb %xmm0,%xmm6,%xmm6
	vmovdqu %xmm5,64(%rsp)
	vpshufb %xmm0,%xmm2,%xmm2
	vmovdqu %xmm6,80(%rsp)
	vpshufb %xmm0,%xmm3,%xmm3
	vmovdqu %xmm2,96(%rsp)
	vmovdqu %xmm3,112(%rsp)

#ifdef HAVE_MOVBE
#ifdef _KERNEL
	testl $1,gcm_avx_can_use_movbe(%rip)
#else
	testl $1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
#endif
	jz 1f
	call _aesni_ctr32_ghash_6x
	jmp 2f
1:
#endif
	call _aesni_ctr32_ghash_no_movbe_6x
2:
	vmovups %xmm9,-96(%rsi)
	vmovups %xmm10,-80(%rsi)
	vmovups %xmm11,-64(%rsi)
	vmovups %xmm12,-48(%rsi)
	vmovups %xmm13,-32(%rsi)
	vmovups %xmm14,-16(%rsi)

	vpshufb (%r11),%xmm8,%xmm8
	movq -56(%rax),%r9
.cfi_restore %r9
	vmovdqu %xmm8,(%r9)

	vzeroupper
	movq -48(%rax),%r15
.cfi_restore %r15
	movq -40(%rax),%r14
.cfi_restore %r14
	movq -32(%rax),%r13
.cfi_restore %r13
	movq -24(%rax),%r12
.cfi_restore %r12
	movq -16(%rax),%rbp
.cfi_restore %rbp
	movq -8(%rax),%rbx
.cfi_restore %rbx
	leaq (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lgcm_dec_abort:
	movq %r10,%rax
	RET
.cfi_endproc
SET_SIZE(aesni_gcm_decrypt)

.balign 32
FUNCTION(_aesni_ctr32_6x)
.cfi_startproc
	ENDBR
	vmovdqu 0-128(%rcx),%xmm4
	vmovdqu 32(%r11),%xmm2
	leaq -2(%rbp),%r13 // ICP uses 10,12,14 not 9,11,13 for rounds.
	vmovups 16-128(%rcx),%xmm15
	leaq 32-128(%rcx),%r12
	vpxor %xmm4,%xmm1,%xmm9
	addl $100663296,%ebx
	jc .Lhandle_ctr32_2
	vpaddb %xmm2,%xmm1,%xmm10
	vpaddb %xmm2,%xmm10,%xmm11
	vpxor %xmm4,%xmm10,%xmm10
	vpaddb %xmm2,%xmm11,%xmm12
	vpxor %xmm4,%xmm11,%xmm11
	vpaddb %xmm2,%xmm12,%xmm13
	vpxor %xmm4,%xmm12,%xmm12
	vpaddb %xmm2,%xmm13,%xmm14
	vpxor %xmm4,%xmm13,%xmm13
	vpaddb %xmm2,%xmm14,%xmm1
	vpxor %xmm4,%xmm14,%xmm14
	jmp .Loop_ctr32

.balign 16
.Loop_ctr32:
	vaesenc %xmm15,%xmm9,%xmm9
	vaesenc %xmm15,%xmm10,%xmm10
	vaesenc %xmm15,%xmm11,%xmm11
	vaesenc %xmm15,%xmm12,%xmm12
	vaesenc %xmm15,%xmm13,%xmm13
	vaesenc %xmm15,%xmm14,%xmm14
	vmovups (%r12),%xmm15
	leaq 16(%r12),%r12
	decl %r13d
	jnz .Loop_ctr32

	vmovdqu (%r12),%xmm3
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor 0(%rdi),%xmm3,%xmm4
	vaesenc %xmm15,%xmm10,%xmm10
	vpxor 16(%rdi),%xmm3,%xmm5
	vaesenc %xmm15,%xmm11,%xmm11
	vpxor 32(%rdi),%xmm3,%xmm6
	vaesenc %xmm15,%xmm12,%xmm12
	vpxor 48(%rdi),%xmm3,%xmm8
	vaesenc %xmm15,%xmm13,%xmm13
	vpxor 64(%rdi),%xmm3,%xmm2
	vaesenc %xmm15,%xmm14,%xmm14
	vpxor 80(%rdi),%xmm3,%xmm3
	leaq 96(%rdi),%rdi

	vaesenclast %xmm4,%xmm9,%xmm9
	vaesenclast %xmm5,%xmm10,%xmm10
	vaesenclast %xmm6,%xmm11,%xmm11
	vaesenclast %xmm8,%xmm12,%xmm12
	vaesenclast %xmm2,%xmm13,%xmm13
	vaesenclast %xmm3,%xmm14,%xmm14
	vmovups %xmm9,0(%rsi)
	vmovups %xmm10,16(%rsi)
	vmovups %xmm11,32(%rsi)
	vmovups %xmm12,48(%rsi)
	vmovups %xmm13,64(%rsi)
	vmovups %xmm14,80(%rsi)
	leaq 96(%rsi),%rsi

	RET
.balign 32
.Lhandle_ctr32_2:
	vpshufb %xmm0,%xmm1,%xmm6
	vmovdqu 48(%r11),%xmm5
	vpaddd 64(%r11),%xmm6,%xmm10
	vpaddd %xmm5,%xmm6,%xmm11
	vpaddd %xmm5,%xmm10,%xmm12
	vpshufb %xmm0,%xmm10,%xmm10
	vpaddd %xmm5,%xmm11,%xmm13
	vpshufb %xmm0,%xmm11,%xmm11
	vpxor %xmm4,%xmm10,%xmm10
	vpaddd %xmm5,%xmm12,%xmm14
	vpshufb %xmm0,%xmm12,%xmm12
	vpxor %xmm4,%xmm11,%xmm11
	vpaddd %xmm5,%xmm13,%xmm1
	vpshufb %xmm0,%xmm13,%xmm13
	vpxor %xmm4,%xmm12,%xmm12
	vpshufb %xmm0,%xmm14,%xmm14
	vpxor %xmm4,%xmm13,%xmm13
	vpshufb %xmm0,%xmm1,%xmm1
	vpxor %xmm4,%xmm14,%xmm14
	jmp .Loop_ctr32
.cfi_endproc
SET_SIZE(_aesni_ctr32_6x)

ENTRY_ALIGN(aesni_gcm_encrypt, 32)
.cfi_startproc
	ENDBR
	xorq %r10,%r10
	cmpq $288,%rdx
	jb .Lgcm_enc_abort

	leaq (%rsp),%rax
.cfi_def_cfa_register %rax
	pushq %rbx
.cfi_offset %rbx,-16
	pushq %rbp
.cfi_offset %rbp,-24
	pushq %r12
.cfi_offset %r12,-32
	pushq %r13
.cfi_offset %r13,-40
	pushq %r14
.cfi_offset %r14,-48
	pushq %r15
.cfi_offset %r15,-56
	pushq %r9
.cfi_offset %r9,-64
	vzeroupper

	vmovdqu (%r8),%xmm1
	addq $-128,%rsp
	movl 12(%r8),%ebx
	leaq .Lbswap_mask(%rip),%r11
	leaq -128(%rcx),%r14
	movq $0xf80,%r15
	leaq 128(%rcx),%rcx
	vmovdqu (%r11),%xmm0
	andq $-128,%rsp
	movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds.

	andq %r15,%r14
	andq %rsp,%r15
	subq %r14,%r15
	jc .Lenc_no_key_aliasing
	cmpq $768,%r15
	jnc .Lenc_no_key_aliasing
	subq %r15,%rsp
.Lenc_no_key_aliasing:

	leaq (%rsi),%r14
	leaq -192(%rsi,%rdx,1),%r15
	shrq $4,%rdx

	call _aesni_ctr32_6x
	vpshufb %xmm0,%xmm9,%xmm8
	vpshufb %xmm0,%xmm10,%xmm2
	vmovdqu %xmm8,112(%rsp)
	vpshufb %xmm0,%xmm11,%xmm4
	vmovdqu %xmm2,96(%rsp)
	vpshufb %xmm0,%xmm12,%xmm5
	vmovdqu %xmm4,80(%rsp)
	vpshufb %xmm0,%xmm13,%xmm6
	vmovdqu %xmm5,64(%rsp)
	vpshufb %xmm0,%xmm14,%xmm7
	vmovdqu %xmm6,48(%rsp)

	call _aesni_ctr32_6x

	vmovdqu (%r9),%xmm8
	movq 32(%r9),%r9
	leaq 32(%r9),%r9
	subq $12,%rdx
	movq $192,%r10
	vpshufb %xmm0,%xmm8,%xmm8

#ifdef HAVE_MOVBE
#ifdef _KERNEL
	testl $1,gcm_avx_can_use_movbe(%rip)
#else
	testl $1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
#endif
	jz 1f
	call _aesni_ctr32_ghash_6x
	jmp 2f
1:
#endif
	call _aesni_ctr32_ghash_no_movbe_6x
2:
	vmovdqu 32(%rsp),%xmm7
	vmovdqu (%r11),%xmm0
	vmovdqu 0-32(%r9),%xmm3
	vpunpckhqdq %xmm7,%xmm7,%xmm1
	vmovdqu 32-32(%r9),%xmm15
	vmovups %xmm9,-96(%rsi)
	vpshufb %xmm0,%xmm9,%xmm9
	vpxor %xmm7,%xmm1,%xmm1
	vmovups %xmm10,-80(%rsi)
	vpshufb %xmm0,%xmm10,%xmm10
	vmovups %xmm11,-64(%rsi)
	vpshufb %xmm0,%xmm11,%xmm11
	vmovups %xmm12,-48(%rsi)
	vpshufb %xmm0,%xmm12,%xmm12
	vmovups %xmm13,-32(%rsi)
	vpshufb %xmm0,%xmm13,%xmm13
	vmovups %xmm14,-16(%rsi)
	vpshufb %xmm0,%xmm14,%xmm14
	vmovdqu %xmm9,16(%rsp)
	vmovdqu 48(%rsp),%xmm6
	vmovdqu 16-32(%r9),%xmm0
	vpunpckhqdq %xmm6,%xmm6,%xmm2
	vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5
	vpxor %xmm6,%xmm2,%xmm2
	vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
	vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1

	vmovdqu 64(%rsp),%xmm9
	vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4
	vmovdqu 48-32(%r9),%xmm3
	vpxor %xmm5,%xmm4,%xmm4
	vpunpckhqdq %xmm9,%xmm9,%xmm5
	vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6
	vpxor %xmm9,%xmm5,%xmm5
	vpxor %xmm7,%xmm6,%xmm6
	vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
	vmovdqu 80-32(%r9),%xmm15
	vpxor %xmm1,%xmm2,%xmm2

	vmovdqu 80(%rsp),%xmm1
	vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7
	vmovdqu 64-32(%r9),%xmm0
	vpxor %xmm4,%xmm7,%xmm7
	vpunpckhqdq %xmm1,%xmm1,%xmm4
	vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9
	vpxor %xmm1,%xmm4,%xmm4
	vpxor %xmm6,%xmm9,%xmm9
	vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5
	vpxor %xmm2,%xmm5,%xmm5

	vmovdqu 96(%rsp),%xmm2
	vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6
	vmovdqu 96-32(%r9),%xmm3
	vpxor %xmm7,%xmm6,%xmm6
	vpunpckhqdq %xmm2,%xmm2,%xmm7
	vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1
	vpxor %xmm2,%xmm7,%xmm7
	vpxor %xmm9,%xmm1,%xmm1
	vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4
	vmovdqu 128-32(%r9),%xmm15
	vpxor %xmm5,%xmm4,%xmm4

	vpxor 112(%rsp),%xmm8,%xmm8
	vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5
	vmovdqu 112-32(%r9),%xmm0
	vpunpckhqdq %xmm8,%xmm8,%xmm9
	vpxor %xmm6,%xmm5,%xmm5
	vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2
	vpxor %xmm8,%xmm9,%xmm9
	vpxor %xmm1,%xmm2,%xmm2
	vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7
	vpxor %xmm4,%xmm7,%xmm4

	vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6
	vmovdqu 0-32(%r9),%xmm3
	vpunpckhqdq %xmm14,%xmm14,%xmm1
	vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8
	vpxor %xmm14,%xmm1,%xmm1
	vpxor %xmm5,%xmm6,%xmm5
	vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9
	vmovdqu 32-32(%r9),%xmm15
	vpxor %xmm2,%xmm8,%xmm7
	vpxor %xmm4,%xmm9,%xmm6

	vmovdqu 16-32(%r9),%xmm0
	vpxor %xmm5,%xmm7,%xmm9
	vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4
	vpxor %xmm9,%xmm6,%xmm6
	vpunpckhqdq %xmm13,%xmm13,%xmm2
	vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14
	vpxor %xmm13,%xmm2,%xmm2
	vpslldq $8,%xmm6,%xmm9
	vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
	vpxor %xmm9,%xmm5,%xmm8
	vpsrldq $8,%xmm6,%xmm6
	vpxor %xmm6,%xmm7,%xmm7

	vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5
	vmovdqu 48-32(%r9),%xmm3
	vpxor %xmm4,%xmm5,%xmm5
	vpunpckhqdq %xmm12,%xmm12,%xmm9
	vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13
	vpxor %xmm12,%xmm9,%xmm9
	vpxor %xmm14,%xmm13,%xmm13
	vpalignr $8,%xmm8,%xmm8,%xmm14
	vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
	vmovdqu 80-32(%r9),%xmm15
	vpxor %xmm1,%xmm2,%xmm2

	vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4
	vmovdqu 64-32(%r9),%xmm0
	vpxor %xmm5,%xmm4,%xmm4
	vpunpckhqdq %xmm11,%xmm11,%xmm1
	vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12
	vpxor %xmm11,%xmm1,%xmm1
	vpxor %xmm13,%xmm12,%xmm12
	vxorps 16(%rsp),%xmm7,%xmm7
	vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9
	vpxor %xmm2,%xmm9,%xmm9

	vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
	vxorps %xmm14,%xmm8,%xmm8

	vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5
	vmovdqu 96-32(%r9),%xmm3
	vpxor %xmm4,%xmm5,%xmm5
	vpunpckhqdq %xmm10,%xmm10,%xmm2
	vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11
	vpxor %xmm10,%xmm2,%xmm2
	vpalignr $8,%xmm8,%xmm8,%xmm14
	vpxor %xmm12,%xmm11,%xmm11
	vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1
	vmovdqu 128-32(%r9),%xmm15
	vpxor %xmm9,%xmm1,%xmm1

	vxorps %xmm7,%xmm14,%xmm14
	vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
	vxorps %xmm14,%xmm8,%xmm8

	vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4
	vmovdqu 112-32(%r9),%xmm0
	vpxor %xmm5,%xmm4,%xmm4
	vpunpckhqdq %xmm8,%xmm8,%xmm9
	vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10
	vpxor %xmm8,%xmm9,%xmm9
	vpxor %xmm11,%xmm10,%xmm10
	vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2
	vpxor %xmm1,%xmm2,%xmm2

	vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5
	vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7
	vpxor %xmm4,%xmm5,%xmm5
	vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6
	vpxor %xmm10,%xmm7,%xmm7
	vpxor %xmm2,%xmm6,%xmm6

	vpxor %xmm5,%xmm7,%xmm4
	vpxor %xmm4,%xmm6,%xmm6
	vpslldq $8,%xmm6,%xmm1
	vmovdqu 16(%r11),%xmm3
	vpsrldq $8,%xmm6,%xmm6
	vpxor %xmm1,%xmm5,%xmm8
	vpxor %xmm6,%xmm7,%xmm7

	vpalignr $8,%xmm8,%xmm8,%xmm2
	vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
	vpxor %xmm2,%xmm8,%xmm8

	vpalignr $8,%xmm8,%xmm8,%xmm2
	vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
	vpxor %xmm7,%xmm2,%xmm2
	vpxor %xmm2,%xmm8,%xmm8
	vpshufb (%r11),%xmm8,%xmm8
	movq -56(%rax),%r9
.cfi_restore %r9
	vmovdqu %xmm8,(%r9)

	vzeroupper
	movq -48(%rax),%r15
.cfi_restore %r15
	movq -40(%rax),%r14
.cfi_restore %r14
	movq -32(%rax),%r13
.cfi_restore %r13
	movq -24(%rax),%r12
.cfi_restore %r12
	movq -16(%rax),%rbp
.cfi_restore %rbp
	movq -8(%rax),%rbx
.cfi_restore %rbx
	leaq (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lgcm_enc_abort:
	movq %r10,%rax
	RET
.cfi_endproc
SET_SIZE(aesni_gcm_encrypt)

#endif /* !_WIN32 || _KERNEL */

/* Some utility routines */

/*
 * clear all fpu registers
 * void clear_fpu_regs_avx(void);
 */
ENTRY_ALIGN(clear_fpu_regs_avx, 32)
	vzeroall
	RET
SET_SIZE(clear_fpu_regs_avx)

/*
 * void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
 *
 * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
 * stores the result at `dst'. The XOR is performed using FPU registers,
 * so make sure FPU state is saved when running this in the kernel.
 */
ENTRY_ALIGN(gcm_xor_avx, 32)
	movdqu (%rdi), %xmm0
	movdqu (%rsi), %xmm1
	pxor %xmm1, %xmm0
	movdqu %xmm0, (%rsi)
	RET
SET_SIZE(gcm_xor_avx)

/*
 * Toggle a boolean_t value atomically and return the new value.
 * boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
 */
ENTRY_ALIGN(atomic_toggle_boolean_nv, 32)
	xorl %eax, %eax
	lock
	xorl $1, (%rdi)
	jz 1f
	movl $1, %eax
1:
	RET
SET_SIZE(atomic_toggle_boolean_nv)

SECTION_STATIC

.balign 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lpoly:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.Lone_msb:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Ltwo_lsb:
.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.Lone_lsb:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.balign 64

/* Mark the stack non-executable. */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */
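
/*
 * Illustrative only, never assembled: a minimal C sketch of how the utility
 * routines above are typically called, based on the prototypes documented in
 * their comments. The function example() and the local names a, b and flag
 * are hypothetical; boolean_t/B_FALSE are assumed to come from the usual
 * sys headers. Kernel callers must additionally bracket gcm_xor_avx() and
 * clear_fpu_regs_avx() with the platform's FPU save/restore calls, since
 * both touch SIMD state.
 *
 *	extern void clear_fpu_regs_avx(void);
 *	extern void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
 *	extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
 *
 *	static void
 *	example(void)
 *	{
 *		uint8_t a[16] = { 0 }, b[16] = { 0 };
 *		volatile boolean_t flag = B_FALSE;
 *
 *		gcm_xor_avx(a, b);			// b ^= a, one 128-bit block
 *		clear_fpu_regs_avx();			// scrub the SIMD registers
 *		(void) atomic_toggle_boolean_nv(&flag);	// flag becomes B_TRUE
 *	}
 */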