# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
#
# AES-NI-CTR+GHASH stitch.
#
# February 2013
#
# OpenSSL GCM implementation is organized in such way that its
# performance is rather close to the sum of its streamed components,
# in the context parallelized AES-NI CTR and modulo-scheduled
# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
# was observed to perform significantly better than the sum of the
# components on contemporary CPUs, the effort was deemed impossible to
# justify. This module is based on combination of Intel submissions,
# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
# Locktyukhin of Intel Corp. who verified that it reduces shuffles
# pressure with notable relative improvement, achieving 1.0 cycle per
# byte processed with 128-bit key on Haswell processor, 0.74 - on
# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
# measurements for favourable packet size, one divisible by 96.
# Applications using the EVP interface will observe a few percent
# worse performance.]
#
# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf

# Generated once from
# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl
# and modified for ICP. Modifications are kept to a bare minimum to ease later
# upstream merges.
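
/*
 * Note on calling conventions: this is a reader's summary inferred from the
 * code below, not an authoritative interface description.  Following the
 * SysV AMD64 ABI, aesni_gcm_encrypt/aesni_gcm_decrypt appear to take the
 * source buffer in %rdi, the destination buffer in %rsi, the byte count in
 * %rdx, the expanded AES key schedule in %rcx (with the round count read
 * from a fixed offset inside that structure, see the "504-128(%rcx)" loads),
 * the counter block in %r8 and the GHASH state in %r9, from which a pointer
 * to the precomputed hash-key table is loaded at offset 32.  Both entry
 * points return the number of bytes actually processed in %rax.
 */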

#if defined(__x86_64__) && defined(HAVE_AVX) && \
    defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)

.extern gcm_avx_can_use_movbe

.text

#ifdef HAVE_MOVBE
.type _aesni_ctr32_ghash_6x,@function
.align 32
_aesni_ctr32_ghash_6x:
.cfi_startproc
	vmovdqu 32(%r11),%xmm2
	subq $6,%rdx
	vpxor %xmm4,%xmm4,%xmm4
	vmovdqu 0-128(%rcx),%xmm15
	vpaddb %xmm2,%xmm1,%xmm10
	vpaddb %xmm2,%xmm10,%xmm11
	vpaddb %xmm2,%xmm11,%xmm12
	vpaddb %xmm2,%xmm12,%xmm13
	vpaddb %xmm2,%xmm13,%xmm14
	vpxor %xmm15,%xmm1,%xmm9
	vmovdqu %xmm4,16+8(%rsp)
	jmp .Loop6x

.align 32
.Loop6x:
	addl $100663296,%ebx
	jc .Lhandle_ctr32
	vmovdqu 0-32(%r9),%xmm3
	vpaddb %xmm2,%xmm14,%xmm1
	vpxor %xmm15,%xmm10,%xmm10
	vpxor %xmm15,%xmm11,%xmm11

.Lresume_ctr32:
	vmovdqu %xmm1,(%r8)
	vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
	vpxor %xmm15,%xmm12,%xmm12
	vmovups 16-128(%rcx),%xmm2
	vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
	xorq %r12,%r12
	cmpq %r14,%r15

	vaesenc %xmm2,%xmm9,%xmm9
	vmovdqu 48+8(%rsp),%xmm0
	vpxor %xmm15,%xmm13,%xmm13
	vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
	vaesenc %xmm2,%xmm10,%xmm10
	vpxor %xmm15,%xmm14,%xmm14
	setnc %r12b
	vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
	vaesenc %xmm2,%xmm11,%xmm11
	vmovdqu 16-32(%r9),%xmm3
	negq %r12
	vaesenc %xmm2,%xmm12,%xmm12
	vpxor %xmm5,%xmm6,%xmm6
	vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
	vpxor %xmm4,%xmm8,%xmm8
	vaesenc %xmm2,%xmm13,%xmm13
	vpxor %xmm5,%xmm1,%xmm4
	andq $0x60,%r12
	vmovups 32-128(%rcx),%xmm15
	vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
	vaesenc %xmm2,%xmm14,%xmm14

	vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
	leaq (%r14,%r12,1),%r14
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor 16+8(%rsp),%xmm8,%xmm8
	vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
	vmovdqu 64+8(%rsp),%xmm0
	vaesenc %xmm15,%xmm10,%xmm10
	movbeq 88(%r14),%r13
	vaesenc %xmm15,%xmm11,%xmm11
	movbeq 80(%r14),%r12
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r13,32+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	movq %r12,40+8(%rsp)
	vmovdqu 48-32(%r9),%xmm5
	vaesenc %xmm15,%xmm14,%xmm14

	vmovups 48-128(%rcx),%xmm15
	vpxor %xmm1,%xmm6,%xmm6
	vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm2,%xmm6,%xmm6
	vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
	vaesenc %xmm15,%xmm10,%xmm10
	vpxor %xmm3,%xmm7,%xmm7
	vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
	vaesenc %xmm15,%xmm11,%xmm11
	vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
	vmovdqu 80+8(%rsp),%xmm0
	vaesenc %xmm15,%xmm12,%xmm12
	vaesenc %xmm15,%xmm13,%xmm13
	vpxor %xmm1,%xmm4,%xmm4
	vmovdqu 64-32(%r9),%xmm1
	vaesenc %xmm15,%xmm14,%xmm14

	vmovups 64-128(%rcx),%xmm15
	vpxor %xmm2,%xmm6,%xmm6
	vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm3,%xmm6,%xmm6
	vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
	vaesenc %xmm15,%xmm10,%xmm10
	movbeq 72(%r14),%r13
	vpxor %xmm5,%xmm7,%xmm7
	vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
	vaesenc %xmm15,%xmm11,%xmm11
	movbeq 64(%r14),%r12
	vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
	vmovdqu 96+8(%rsp),%xmm0
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r13,48+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	movq %r12,56+8(%rsp)
	vpxor %xmm2,%xmm4,%xmm4
	vmovdqu 96-32(%r9),%xmm2
	vaesenc %xmm15,%xmm14,%xmm14

	vmovups 80-128(%rcx),%xmm15
	vpxor %xmm3,%xmm6,%xmm6
	vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm5,%xmm6,%xmm6
	vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
	vaesenc %xmm15,%xmm10,%xmm10
	movbeq 56(%r14),%r13
	vpxor %xmm1,%xmm7,%xmm7
	vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
	vpxor 112+8(%rsp),%xmm8,%xmm8
	vaesenc %xmm15,%xmm11,%xmm11
	movbeq 48(%r14),%r12
	vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r13,64+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	movq %r12,72+8(%rsp)
	vpxor %xmm3,%xmm4,%xmm4
	vmovdqu 112-32(%r9),%xmm3
	vaesenc %xmm15,%xmm14,%xmm14

	vmovups 96-128(%rcx),%xmm15
	vpxor %xmm5,%xmm6,%xmm6
	vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm1,%xmm6,%xmm6
	vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
	vaesenc %xmm15,%xmm10,%xmm10
	movbeq 40(%r14),%r13
	vpxor %xmm2,%xmm7,%xmm7
	vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
	vaesenc %xmm15,%xmm11,%xmm11
	movbeq 32(%r14),%r12
	vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r13,80+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	movq %r12,88+8(%rsp)
	vpxor %xmm5,%xmm6,%xmm6
	vaesenc %xmm15,%xmm14,%xmm14
	vpxor %xmm1,%xmm6,%xmm6

	vmovups 112-128(%rcx),%xmm15
	vpslldq $8,%xmm6,%xmm5
	vpxor %xmm2,%xmm4,%xmm4
	vmovdqu 16(%r11),%xmm3

	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm8,%xmm7,%xmm7
	vaesenc %xmm15,%xmm10,%xmm10
	vpxor %xmm5,%xmm4,%xmm4
	movbeq 24(%r14),%r13
	vaesenc %xmm15,%xmm11,%xmm11
	movbeq 16(%r14),%r12
	vpalignr $8,%xmm4,%xmm4,%xmm0
	vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
	movq %r13,96+8(%rsp)
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r12,104+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	vmovups 128-128(%rcx),%xmm1
	vaesenc %xmm15,%xmm14,%xmm14

	vaesenc %xmm1,%xmm9,%xmm9
	vmovups 144-128(%rcx),%xmm15
	vaesenc %xmm1,%xmm10,%xmm10
	vpsrldq $8,%xmm6,%xmm6
	vaesenc %xmm1,%xmm11,%xmm11
	vpxor %xmm6,%xmm7,%xmm7
	vaesenc %xmm1,%xmm12,%xmm12
	vpxor %xmm0,%xmm4,%xmm4
	movbeq 8(%r14),%r13
	vaesenc %xmm1,%xmm13,%xmm13
	movbeq 0(%r14),%r12
	vaesenc %xmm1,%xmm14,%xmm14
	vmovups 160-128(%rcx),%xmm1
	cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds.
	jb .Lenc_tail

	vaesenc %xmm15,%xmm9,%xmm9
	vaesenc %xmm15,%xmm10,%xmm10
	vaesenc %xmm15,%xmm11,%xmm11
	vaesenc %xmm15,%xmm12,%xmm12
	vaesenc %xmm15,%xmm13,%xmm13
	vaesenc %xmm15,%xmm14,%xmm14

	vaesenc %xmm1,%xmm9,%xmm9
	vaesenc %xmm1,%xmm10,%xmm10
	vaesenc %xmm1,%xmm11,%xmm11
	vaesenc %xmm1,%xmm12,%xmm12
	vaesenc %xmm1,%xmm13,%xmm13
	vmovups 176-128(%rcx),%xmm15
	vaesenc %xmm1,%xmm14,%xmm14
	vmovups 192-128(%rcx),%xmm1
	cmpl $14,%ebp // ICP does not zero key schedule.
	jb .Lenc_tail

	vaesenc %xmm15,%xmm9,%xmm9
	vaesenc %xmm15,%xmm10,%xmm10
	vaesenc %xmm15,%xmm11,%xmm11
	vaesenc %xmm15,%xmm12,%xmm12
	vaesenc %xmm15,%xmm13,%xmm13
	vaesenc %xmm15,%xmm14,%xmm14

	vaesenc %xmm1,%xmm9,%xmm9
	vaesenc %xmm1,%xmm10,%xmm10
	vaesenc %xmm1,%xmm11,%xmm11
	vaesenc %xmm1,%xmm12,%xmm12
	vaesenc %xmm1,%xmm13,%xmm13
	vmovups 208-128(%rcx),%xmm15
	vaesenc %xmm1,%xmm14,%xmm14
	vmovups 224-128(%rcx),%xmm1
	jmp .Lenc_tail

.align 32
.Lhandle_ctr32:
	vmovdqu (%r11),%xmm0
	vpshufb %xmm0,%xmm1,%xmm6
	vmovdqu 48(%r11),%xmm5
	vpaddd 64(%r11),%xmm6,%xmm10
	vpaddd %xmm5,%xmm6,%xmm11
	vmovdqu 0-32(%r9),%xmm3
	vpaddd %xmm5,%xmm10,%xmm12
	vpshufb %xmm0,%xmm10,%xmm10
	vpaddd %xmm5,%xmm11,%xmm13
	vpshufb %xmm0,%xmm11,%xmm11
	vpxor %xmm15,%xmm10,%xmm10
	vpaddd %xmm5,%xmm12,%xmm14
	vpshufb %xmm0,%xmm12,%xmm12
	vpxor %xmm15,%xmm11,%xmm11
	vpaddd %xmm5,%xmm13,%xmm1
	vpshufb %xmm0,%xmm13,%xmm13
	vpshufb %xmm0,%xmm14,%xmm14
	vpshufb %xmm0,%xmm1,%xmm1
	jmp .Lresume_ctr32

.align 32
.Lenc_tail:
	vaesenc %xmm15,%xmm9,%xmm9
	vmovdqu %xmm7,16+8(%rsp)
	vpalignr $8,%xmm4,%xmm4,%xmm8
	vaesenc %xmm15,%xmm10,%xmm10
	vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
	vpxor 0(%rdi),%xmm1,%xmm2
	vaesenc %xmm15,%xmm11,%xmm11
	vpxor 16(%rdi),%xmm1,%xmm0
	vaesenc %xmm15,%xmm12,%xmm12
	vpxor 32(%rdi),%xmm1,%xmm5
	vaesenc %xmm15,%xmm13,%xmm13
	vpxor 48(%rdi),%xmm1,%xmm6
	vaesenc %xmm15,%xmm14,%xmm14
	vpxor 64(%rdi),%xmm1,%xmm7
	vpxor 80(%rdi),%xmm1,%xmm3
	vmovdqu (%r8),%xmm1

	vaesenclast %xmm2,%xmm9,%xmm9
	vmovdqu 32(%r11),%xmm2
	vaesenclast %xmm0,%xmm10,%xmm10
	vpaddb %xmm2,%xmm1,%xmm0
	movq %r13,112+8(%rsp)
	leaq 96(%rdi),%rdi
	vaesenclast %xmm5,%xmm11,%xmm11
	vpaddb %xmm2,%xmm0,%xmm5
	movq %r12,120+8(%rsp)
	leaq 96(%rsi),%rsi
	vmovdqu 0-128(%rcx),%xmm15
	vaesenclast %xmm6,%xmm12,%xmm12
	vpaddb %xmm2,%xmm5,%xmm6
	vaesenclast %xmm7,%xmm13,%xmm13
	vpaddb %xmm2,%xmm6,%xmm7
	vaesenclast %xmm3,%xmm14,%xmm14
	vpaddb %xmm2,%xmm7,%xmm3

	addq $0x60,%r10
	subq $0x6,%rdx
	jc .L6x_done

	vmovups %xmm9,-96(%rsi)
	vpxor %xmm15,%xmm1,%xmm9
	vmovups %xmm10,-80(%rsi)
	vmovdqa %xmm0,%xmm10
	vmovups %xmm11,-64(%rsi)
	vmovdqa %xmm5,%xmm11
	vmovups %xmm12,-48(%rsi)
	vmovdqa %xmm6,%xmm12
	vmovups %xmm13,-32(%rsi)
	vmovdqa %xmm7,%xmm13
	vmovups %xmm14,-16(%rsi)
	vmovdqa %xmm3,%xmm14
	vmovdqu 32+8(%rsp),%xmm7
	jmp .Loop6x

.L6x_done:
	vpxor 16+8(%rsp),%xmm8,%xmm8
	vpxor %xmm4,%xmm8,%xmm8

	.byte 0xf3,0xc3
.cfi_endproc
.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
#endif /* ifdef HAVE_MOVBE */

.type _aesni_ctr32_ghash_no_movbe_6x,@function
.align 32
_aesni_ctr32_ghash_no_movbe_6x:
.cfi_startproc
	vmovdqu 32(%r11),%xmm2
	subq $6,%rdx
	vpxor %xmm4,%xmm4,%xmm4
	vmovdqu 0-128(%rcx),%xmm15
	vpaddb %xmm2,%xmm1,%xmm10
	vpaddb %xmm2,%xmm10,%xmm11
	vpaddb %xmm2,%xmm11,%xmm12
	vpaddb %xmm2,%xmm12,%xmm13
	vpaddb %xmm2,%xmm13,%xmm14
	vpxor %xmm15,%xmm1,%xmm9
	vmovdqu %xmm4,16+8(%rsp)
	jmp .Loop6x_nmb

.align 32
.Loop6x_nmb:
	addl $100663296,%ebx
	jc .Lhandle_ctr32_nmb
	vmovdqu 0-32(%r9),%xmm3
	vpaddb %xmm2,%xmm14,%xmm1
	vpxor %xmm15,%xmm10,%xmm10
	vpxor %xmm15,%xmm11,%xmm11

.Lresume_ctr32_nmb:
	vmovdqu %xmm1,(%r8)
	vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
	vpxor %xmm15,%xmm12,%xmm12
	vmovups 16-128(%rcx),%xmm2
	vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
	xorq %r12,%r12
	cmpq %r14,%r15

	vaesenc %xmm2,%xmm9,%xmm9
	vmovdqu 48+8(%rsp),%xmm0
	vpxor %xmm15,%xmm13,%xmm13
	vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
	vaesenc %xmm2,%xmm10,%xmm10
	vpxor %xmm15,%xmm14,%xmm14
	setnc %r12b
	vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
	vaesenc %xmm2,%xmm11,%xmm11
	vmovdqu 16-32(%r9),%xmm3
	negq %r12
	vaesenc %xmm2,%xmm12,%xmm12
	vpxor %xmm5,%xmm6,%xmm6
	vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
	vpxor %xmm4,%xmm8,%xmm8
	vaesenc %xmm2,%xmm13,%xmm13
	vpxor %xmm5,%xmm1,%xmm4
	andq $0x60,%r12
	vmovups 32-128(%rcx),%xmm15
	vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
	vaesenc %xmm2,%xmm14,%xmm14

	vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
	leaq (%r14,%r12,1),%r14
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor 16+8(%rsp),%xmm8,%xmm8
	vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
	vmovdqu 64+8(%rsp),%xmm0
	vaesenc %xmm15,%xmm10,%xmm10
	movq 88(%r14),%r13
	bswapq %r13
	vaesenc %xmm15,%xmm11,%xmm11
	movq 80(%r14),%r12
	bswapq %r12
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r13,32+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	movq %r12,40+8(%rsp)
	vmovdqu 48-32(%r9),%xmm5
	vaesenc %xmm15,%xmm14,%xmm14

	vmovups 48-128(%rcx),%xmm15
	vpxor %xmm1,%xmm6,%xmm6
	vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm2,%xmm6,%xmm6
	vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
	vaesenc %xmm15,%xmm10,%xmm10
	vpxor %xmm3,%xmm7,%xmm7
	vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
	vaesenc %xmm15,%xmm11,%xmm11
	vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
	vmovdqu 80+8(%rsp),%xmm0
	vaesenc %xmm15,%xmm12,%xmm12
	vaesenc %xmm15,%xmm13,%xmm13
	vpxor %xmm1,%xmm4,%xmm4
	vmovdqu 64-32(%r9),%xmm1
	vaesenc %xmm15,%xmm14,%xmm14

	vmovups 64-128(%rcx),%xmm15
	vpxor %xmm2,%xmm6,%xmm6
	vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm3,%xmm6,%xmm6
	vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
	vaesenc %xmm15,%xmm10,%xmm10
	movq 72(%r14),%r13
	bswapq %r13
	vpxor %xmm5,%xmm7,%xmm7
	vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
	vaesenc %xmm15,%xmm11,%xmm11
	movq 64(%r14),%r12
	bswapq %r12
	vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
	vmovdqu 96+8(%rsp),%xmm0
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r13,48+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	movq %r12,56+8(%rsp)
	vpxor %xmm2,%xmm4,%xmm4
	vmovdqu 96-32(%r9),%xmm2
	vaesenc %xmm15,%xmm14,%xmm14

	vmovups 80-128(%rcx),%xmm15
	vpxor %xmm3,%xmm6,%xmm6
	vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm5,%xmm6,%xmm6
	vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
	vaesenc %xmm15,%xmm10,%xmm10
	movq 56(%r14),%r13
	bswapq %r13
	vpxor %xmm1,%xmm7,%xmm7
	vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
	vpxor 112+8(%rsp),%xmm8,%xmm8
	vaesenc %xmm15,%xmm11,%xmm11
	movq 48(%r14),%r12
	bswapq %r12
	vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r13,64+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	movq %r12,72+8(%rsp)
	vpxor %xmm3,%xmm4,%xmm4
	vmovdqu 112-32(%r9),%xmm3
	vaesenc %xmm15,%xmm14,%xmm14

	vmovups 96-128(%rcx),%xmm15
	vpxor %xmm5,%xmm6,%xmm6
	vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm1,%xmm6,%xmm6
	vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
	vaesenc %xmm15,%xmm10,%xmm10
	movq 40(%r14),%r13
	bswapq %r13
	vpxor %xmm2,%xmm7,%xmm7
	vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
	vaesenc %xmm15,%xmm11,%xmm11
	movq 32(%r14),%r12
	bswapq %r12
	vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r13,80+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	movq %r12,88+8(%rsp)
	vpxor %xmm5,%xmm6,%xmm6
	vaesenc %xmm15,%xmm14,%xmm14
	vpxor %xmm1,%xmm6,%xmm6

	vmovups 112-128(%rcx),%xmm15
	vpslldq $8,%xmm6,%xmm5
	vpxor %xmm2,%xmm4,%xmm4
	vmovdqu 16(%r11),%xmm3

	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm8,%xmm7,%xmm7
	vaesenc %xmm15,%xmm10,%xmm10
	vpxor %xmm5,%xmm4,%xmm4
	movq 24(%r14),%r13
	bswapq %r13
	vaesenc %xmm15,%xmm11,%xmm11
	movq 16(%r14),%r12
	bswapq %r12
	vpalignr $8,%xmm4,%xmm4,%xmm0
	vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
	movq %r13,96+8(%rsp)
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r12,104+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	vmovups 128-128(%rcx),%xmm1
	vaesenc %xmm15,%xmm14,%xmm14

	vaesenc %xmm1,%xmm9,%xmm9
	vmovups 144-128(%rcx),%xmm15
	vaesenc %xmm1,%xmm10,%xmm10
	vpsrldq $8,%xmm6,%xmm6
	vaesenc %xmm1,%xmm11,%xmm11
	vpxor %xmm6,%xmm7,%xmm7
	vaesenc %xmm1,%xmm12,%xmm12
	vpxor %xmm0,%xmm4,%xmm4
	movq 8(%r14),%r13
	bswapq %r13
	vaesenc %xmm1,%xmm13,%xmm13
	movq 0(%r14),%r12
	bswapq %r12
	vaesenc %xmm1,%xmm14,%xmm14
	vmovups 160-128(%rcx),%xmm1
	cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds.
	jb .Lenc_tail_nmb

	vaesenc %xmm15,%xmm9,%xmm9
	vaesenc %xmm15,%xmm10,%xmm10
	vaesenc %xmm15,%xmm11,%xmm11
	vaesenc %xmm15,%xmm12,%xmm12
	vaesenc %xmm15,%xmm13,%xmm13
	vaesenc %xmm15,%xmm14,%xmm14

	vaesenc %xmm1,%xmm9,%xmm9
	vaesenc %xmm1,%xmm10,%xmm10
	vaesenc %xmm1,%xmm11,%xmm11
	vaesenc %xmm1,%xmm12,%xmm12
	vaesenc %xmm1,%xmm13,%xmm13
	vmovups 176-128(%rcx),%xmm15
	vaesenc %xmm1,%xmm14,%xmm14
	vmovups 192-128(%rcx),%xmm1
	cmpl $14,%ebp // ICP does not zero key schedule.
	jb .Lenc_tail_nmb

	vaesenc %xmm15,%xmm9,%xmm9
	vaesenc %xmm15,%xmm10,%xmm10
	vaesenc %xmm15,%xmm11,%xmm11
	vaesenc %xmm15,%xmm12,%xmm12
	vaesenc %xmm15,%xmm13,%xmm13
	vaesenc %xmm15,%xmm14,%xmm14

	vaesenc %xmm1,%xmm9,%xmm9
	vaesenc %xmm1,%xmm10,%xmm10
	vaesenc %xmm1,%xmm11,%xmm11
	vaesenc %xmm1,%xmm12,%xmm12
	vaesenc %xmm1,%xmm13,%xmm13
	vmovups 208-128(%rcx),%xmm15
	vaesenc %xmm1,%xmm14,%xmm14
	vmovups 224-128(%rcx),%xmm1
	jmp .Lenc_tail_nmb

.align 32
.Lhandle_ctr32_nmb:
	vmovdqu (%r11),%xmm0
	vpshufb %xmm0,%xmm1,%xmm6
	vmovdqu 48(%r11),%xmm5
	vpaddd 64(%r11),%xmm6,%xmm10
	vpaddd %xmm5,%xmm6,%xmm11
	vmovdqu 0-32(%r9),%xmm3
	vpaddd %xmm5,%xmm10,%xmm12
	vpshufb %xmm0,%xmm10,%xmm10
	vpaddd %xmm5,%xmm11,%xmm13
	vpshufb %xmm0,%xmm11,%xmm11
	vpxor %xmm15,%xmm10,%xmm10
	vpaddd %xmm5,%xmm12,%xmm14
	vpshufb %xmm0,%xmm12,%xmm12
	vpxor %xmm15,%xmm11,%xmm11
	vpaddd %xmm5,%xmm13,%xmm1
	vpshufb %xmm0,%xmm13,%xmm13
	vpshufb %xmm0,%xmm14,%xmm14
	vpshufb %xmm0,%xmm1,%xmm1
	jmp .Lresume_ctr32_nmb

.align 32
.Lenc_tail_nmb:
	vaesenc %xmm15,%xmm9,%xmm9
	vmovdqu %xmm7,16+8(%rsp)
	vpalignr $8,%xmm4,%xmm4,%xmm8
	vaesenc %xmm15,%xmm10,%xmm10
	vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
	vpxor 0(%rdi),%xmm1,%xmm2
	vaesenc %xmm15,%xmm11,%xmm11
	vpxor 16(%rdi),%xmm1,%xmm0
	vaesenc %xmm15,%xmm12,%xmm12
	vpxor 32(%rdi),%xmm1,%xmm5
	vaesenc %xmm15,%xmm13,%xmm13
	vpxor 48(%rdi),%xmm1,%xmm6
	vaesenc %xmm15,%xmm14,%xmm14
	vpxor 64(%rdi),%xmm1,%xmm7
	vpxor 80(%rdi),%xmm1,%xmm3
	vmovdqu (%r8),%xmm1

	vaesenclast %xmm2,%xmm9,%xmm9
	vmovdqu 32(%r11),%xmm2
	vaesenclast %xmm0,%xmm10,%xmm10
	vpaddb %xmm2,%xmm1,%xmm0
	movq %r13,112+8(%rsp)
	leaq 96(%rdi),%rdi
	vaesenclast %xmm5,%xmm11,%xmm11
	vpaddb %xmm2,%xmm0,%xmm5
	movq %r12,120+8(%rsp)
	leaq 96(%rsi),%rsi
	vmovdqu 0-128(%rcx),%xmm15
	vaesenclast %xmm6,%xmm12,%xmm12
	vpaddb %xmm2,%xmm5,%xmm6
	vaesenclast %xmm7,%xmm13,%xmm13
	vpaddb %xmm2,%xmm6,%xmm7
	vaesenclast %xmm3,%xmm14,%xmm14
	vpaddb %xmm2,%xmm7,%xmm3

	addq $0x60,%r10
	subq $0x6,%rdx
	jc .L6x_done_nmb

	vmovups %xmm9,-96(%rsi)
	vpxor %xmm15,%xmm1,%xmm9
	vmovups %xmm10,-80(%rsi)
	vmovdqa %xmm0,%xmm10
	vmovups %xmm11,-64(%rsi)
	vmovdqa %xmm5,%xmm11
	vmovups %xmm12,-48(%rsi)
	vmovdqa %xmm6,%xmm12
	vmovups %xmm13,-32(%rsi)
	vmovdqa %xmm7,%xmm13
	vmovups %xmm14,-16(%rsi)
	vmovdqa %xmm3,%xmm14
	vmovdqu 32+8(%rsp),%xmm7
	jmp .Loop6x_nmb

.L6x_done_nmb:
	vpxor 16+8(%rsp),%xmm8,%xmm8
	vpxor %xmm4,%xmm8,%xmm8

	.byte 0xf3,0xc3
.cfi_endproc
.size _aesni_ctr32_ghash_no_movbe_6x,.-_aesni_ctr32_ghash_no_movbe_6x

.globl aesni_gcm_decrypt
.type aesni_gcm_decrypt,@function
.align 32
aesni_gcm_decrypt:
.cfi_startproc
	xorq %r10,%r10
	cmpq $0x60,%rdx
	jb .Lgcm_dec_abort

	leaq (%rsp),%rax
.cfi_def_cfa_register %rax
	pushq %rbx
.cfi_offset %rbx,-16
	pushq %rbp
.cfi_offset %rbp,-24
	pushq %r12
.cfi_offset %r12,-32
	pushq %r13
.cfi_offset %r13,-40
	pushq %r14
.cfi_offset %r14,-48
	pushq %r15
.cfi_offset %r15,-56
	pushq %r9
.cfi_offset %r9,-64
	vzeroupper

	vmovdqu (%r8),%xmm1
	addq $-128,%rsp
	movl 12(%r8),%ebx
	leaq .Lbswap_mask(%rip),%r11
	leaq -128(%rcx),%r14
	movq $0xf80,%r15
	vmovdqu (%r9),%xmm8
	andq $-128,%rsp
	vmovdqu (%r11),%xmm0
	leaq 128(%rcx),%rcx
	movq 32(%r9),%r9
	leaq 32(%r9),%r9
	movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds.
	vpshufb %xmm0,%xmm8,%xmm8

	andq %r15,%r14
	andq %rsp,%r15
	subq %r14,%r15
	jc .Ldec_no_key_aliasing
	cmpq $768,%r15
	jnc .Ldec_no_key_aliasing
	subq %r15,%rsp
.Ldec_no_key_aliasing:

	vmovdqu 80(%rdi),%xmm7
	leaq (%rdi),%r14
	vmovdqu 64(%rdi),%xmm4
	leaq -192(%rdi,%rdx,1),%r15
	vmovdqu 48(%rdi),%xmm5
	shrq $4,%rdx
	xorq %r10,%r10
	vmovdqu 32(%rdi),%xmm6
	vpshufb %xmm0,%xmm7,%xmm7
	vmovdqu 16(%rdi),%xmm2
	vpshufb %xmm0,%xmm4,%xmm4
	vmovdqu (%rdi),%xmm3
	vpshufb %xmm0,%xmm5,%xmm5
	vmovdqu %xmm4,48(%rsp)
	vpshufb %xmm0,%xmm6,%xmm6
	vmovdqu %xmm5,64(%rsp)
	vpshufb %xmm0,%xmm2,%xmm2
	vmovdqu %xmm6,80(%rsp)
	vpshufb %xmm0,%xmm3,%xmm3
	vmovdqu %xmm2,96(%rsp)
	vmovdqu %xmm3,112(%rsp)

#ifdef HAVE_MOVBE
#ifdef _KERNEL
	testl $1,gcm_avx_can_use_movbe(%rip)
#else
	testl $1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
#endif
	jz 1f
	call _aesni_ctr32_ghash_6x
	jmp 2f
1:
#endif
	call _aesni_ctr32_ghash_no_movbe_6x
2:
	vmovups %xmm9,-96(%rsi)
	vmovups %xmm10,-80(%rsi)
	vmovups %xmm11,-64(%rsi)
	vmovups %xmm12,-48(%rsi)
	vmovups %xmm13,-32(%rsi)
	vmovups %xmm14,-16(%rsi)

	vpshufb (%r11),%xmm8,%xmm8
	movq -56(%rax),%r9
.cfi_restore %r9
	vmovdqu %xmm8,(%r9)

	vzeroupper
	movq -48(%rax),%r15
.cfi_restore %r15
	movq -40(%rax),%r14
.cfi_restore %r14
	movq -32(%rax),%r13
.cfi_restore %r13
	movq -24(%rax),%r12
.cfi_restore %r12
	movq -16(%rax),%rbp
.cfi_restore %rbp
	movq -8(%rax),%rbx
.cfi_restore %rbx
	leaq (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lgcm_dec_abort:
	movq %r10,%rax
	.byte 0xf3,0xc3
.cfi_endproc
.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
.type _aesni_ctr32_6x,@function
.align 32
_aesni_ctr32_6x:
.cfi_startproc
	vmovdqu 0-128(%rcx),%xmm4
	vmovdqu 32(%r11),%xmm2
	leaq -2(%rbp),%r13 // ICP uses 10,12,14 not 9,11,13 for rounds.
	vmovups 16-128(%rcx),%xmm15
	leaq 32-128(%rcx),%r12
	vpxor %xmm4,%xmm1,%xmm9
	addl $100663296,%ebx
	jc .Lhandle_ctr32_2
	vpaddb %xmm2,%xmm1,%xmm10
	vpaddb %xmm2,%xmm10,%xmm11
	vpxor %xmm4,%xmm10,%xmm10
	vpaddb %xmm2,%xmm11,%xmm12
	vpxor %xmm4,%xmm11,%xmm11
	vpaddb %xmm2,%xmm12,%xmm13
	vpxor %xmm4,%xmm12,%xmm12
	vpaddb %xmm2,%xmm13,%xmm14
	vpxor %xmm4,%xmm13,%xmm13
	vpaddb %xmm2,%xmm14,%xmm1
	vpxor %xmm4,%xmm14,%xmm14
	jmp .Loop_ctr32

.align 16
.Loop_ctr32:
	vaesenc %xmm15,%xmm9,%xmm9
	vaesenc %xmm15,%xmm10,%xmm10
	vaesenc %xmm15,%xmm11,%xmm11
	vaesenc %xmm15,%xmm12,%xmm12
	vaesenc %xmm15,%xmm13,%xmm13
	vaesenc %xmm15,%xmm14,%xmm14
	vmovups (%r12),%xmm15
	leaq 16(%r12),%r12
	decl %r13d
	jnz .Loop_ctr32

	vmovdqu (%r12),%xmm3
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor 0(%rdi),%xmm3,%xmm4
	vaesenc %xmm15,%xmm10,%xmm10
	vpxor 16(%rdi),%xmm3,%xmm5
	vaesenc %xmm15,%xmm11,%xmm11
	vpxor 32(%rdi),%xmm3,%xmm6
	vaesenc %xmm15,%xmm12,%xmm12
	vpxor 48(%rdi),%xmm3,%xmm8
	vaesenc %xmm15,%xmm13,%xmm13
	vpxor 64(%rdi),%xmm3,%xmm2
	vaesenc %xmm15,%xmm14,%xmm14
	vpxor 80(%rdi),%xmm3,%xmm3
	leaq 96(%rdi),%rdi

	vaesenclast %xmm4,%xmm9,%xmm9
	vaesenclast %xmm5,%xmm10,%xmm10
	vaesenclast %xmm6,%xmm11,%xmm11
	vaesenclast %xmm8,%xmm12,%xmm12
	vaesenclast %xmm2,%xmm13,%xmm13
	vaesenclast %xmm3,%xmm14,%xmm14
	vmovups %xmm9,0(%rsi)
	vmovups %xmm10,16(%rsi)
	vmovups %xmm11,32(%rsi)
	vmovups %xmm12,48(%rsi)
	vmovups %xmm13,64(%rsi)
	vmovups %xmm14,80(%rsi)
	leaq 96(%rsi),%rsi

	.byte 0xf3,0xc3
.align 32
.Lhandle_ctr32_2:
	vpshufb %xmm0,%xmm1,%xmm6
	vmovdqu 48(%r11),%xmm5
	vpaddd 64(%r11),%xmm6,%xmm10
	vpaddd %xmm5,%xmm6,%xmm11
	vpaddd %xmm5,%xmm10,%xmm12
	vpshufb %xmm0,%xmm10,%xmm10
	vpaddd %xmm5,%xmm11,%xmm13
	vpshufb %xmm0,%xmm11,%xmm11
	vpxor %xmm4,%xmm10,%xmm10
	vpaddd %xmm5,%xmm12,%xmm14
	vpshufb %xmm0,%xmm12,%xmm12
	vpxor %xmm4,%xmm11,%xmm11
	vpaddd %xmm5,%xmm13,%xmm1
	vpshufb %xmm0,%xmm13,%xmm13
	vpxor %xmm4,%xmm12,%xmm12
	vpshufb %xmm0,%xmm14,%xmm14
	vpxor %xmm4,%xmm13,%xmm13
	vpshufb %xmm0,%xmm1,%xmm1
	vpxor %xmm4,%xmm14,%xmm14
	jmp .Loop_ctr32
.cfi_endproc
.size _aesni_ctr32_6x,.-_aesni_ctr32_6x

.globl aesni_gcm_encrypt
.type aesni_gcm_encrypt,@function
.align 32
aesni_gcm_encrypt:
.cfi_startproc
	xorq %r10,%r10
	cmpq $288,%rdx
	jb .Lgcm_enc_abort

	leaq (%rsp),%rax
.cfi_def_cfa_register %rax
	pushq %rbx
.cfi_offset %rbx,-16
	pushq %rbp
.cfi_offset %rbp,-24
	pushq %r12
.cfi_offset %r12,-32
	pushq %r13
.cfi_offset %r13,-40
	pushq %r14
.cfi_offset %r14,-48
	pushq %r15
.cfi_offset %r15,-56
	pushq %r9
.cfi_offset %r9,-64
	vzeroupper

	vmovdqu (%r8),%xmm1
	addq $-128,%rsp
	movl 12(%r8),%ebx
	leaq .Lbswap_mask(%rip),%r11
	leaq -128(%rcx),%r14
	movq $0xf80,%r15
	leaq 128(%rcx),%rcx
	vmovdqu (%r11),%xmm0
	andq $-128,%rsp
	movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds.

	andq %r15,%r14
	andq %rsp,%r15
	subq %r14,%r15
	jc .Lenc_no_key_aliasing
	cmpq $768,%r15
	jnc .Lenc_no_key_aliasing
	subq %r15,%rsp
.Lenc_no_key_aliasing:

	leaq (%rsi),%r14
	leaq -192(%rsi,%rdx,1),%r15
	shrq $4,%rdx

	call _aesni_ctr32_6x
	vpshufb %xmm0,%xmm9,%xmm8
	vpshufb %xmm0,%xmm10,%xmm2
	vmovdqu %xmm8,112(%rsp)
	vpshufb %xmm0,%xmm11,%xmm4
	vmovdqu %xmm2,96(%rsp)
	vpshufb %xmm0,%xmm12,%xmm5
	vmovdqu %xmm4,80(%rsp)
	vpshufb %xmm0,%xmm13,%xmm6
	vmovdqu %xmm5,64(%rsp)
	vpshufb %xmm0,%xmm14,%xmm7
	vmovdqu %xmm6,48(%rsp)

	call _aesni_ctr32_6x

	vmovdqu (%r9),%xmm8
	movq 32(%r9),%r9
	leaq 32(%r9),%r9
	subq $12,%rdx
	movq $192,%r10
	vpshufb %xmm0,%xmm8,%xmm8

#ifdef HAVE_MOVBE
#ifdef _KERNEL
	testl $1,gcm_avx_can_use_movbe(%rip)
#else
	testl $1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
#endif
	jz 1f
	call _aesni_ctr32_ghash_6x
	jmp 2f
1:
#endif
	call _aesni_ctr32_ghash_no_movbe_6x
2:
	vmovdqu 32(%rsp),%xmm7
	vmovdqu (%r11),%xmm0
	vmovdqu 0-32(%r9),%xmm3
	vpunpckhqdq %xmm7,%xmm7,%xmm1
	vmovdqu 32-32(%r9),%xmm15
	vmovups %xmm9,-96(%rsi)
	vpshufb %xmm0,%xmm9,%xmm9
	vpxor %xmm7,%xmm1,%xmm1
	vmovups %xmm10,-80(%rsi)
	vpshufb %xmm0,%xmm10,%xmm10
	vmovups %xmm11,-64(%rsi)
	vpshufb %xmm0,%xmm11,%xmm11
	vmovups %xmm12,-48(%rsi)
	vpshufb %xmm0,%xmm12,%xmm12
	vmovups %xmm13,-32(%rsi)
	vpshufb %xmm0,%xmm13,%xmm13
	vmovups %xmm14,-16(%rsi)
	vpshufb %xmm0,%xmm14,%xmm14
	vmovdqu %xmm9,16(%rsp)
	vmovdqu 48(%rsp),%xmm6
	vmovdqu 16-32(%r9),%xmm0
	vpunpckhqdq %xmm6,%xmm6,%xmm2
	vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5
	vpxor %xmm6,%xmm2,%xmm2
	vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
	vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1

	vmovdqu 64(%rsp),%xmm9
	vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4
	vmovdqu 48-32(%r9),%xmm3
	vpxor %xmm5,%xmm4,%xmm4
	vpunpckhqdq %xmm9,%xmm9,%xmm5
	vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6
	vpxor %xmm9,%xmm5,%xmm5
	vpxor %xmm7,%xmm6,%xmm6
	vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
	vmovdqu 80-32(%r9),%xmm15
	vpxor %xmm1,%xmm2,%xmm2

	vmovdqu 80(%rsp),%xmm1
	vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7
	vmovdqu 64-32(%r9),%xmm0
	vpxor %xmm4,%xmm7,%xmm7
	vpunpckhqdq %xmm1,%xmm1,%xmm4
	vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9
	vpxor %xmm1,%xmm4,%xmm4
	vpxor %xmm6,%xmm9,%xmm9
	vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5
	vpxor %xmm2,%xmm5,%xmm5

	vmovdqu 96(%rsp),%xmm2
	vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6
	vmovdqu 96-32(%r9),%xmm3
	vpxor %xmm7,%xmm6,%xmm6
	vpunpckhqdq %xmm2,%xmm2,%xmm7
	vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1
	vpxor %xmm2,%xmm7,%xmm7
	vpxor %xmm9,%xmm1,%xmm1
	vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4
	vmovdqu 128-32(%r9),%xmm15
	vpxor %xmm5,%xmm4,%xmm4

	vpxor 112(%rsp),%xmm8,%xmm8
	vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5
	vmovdqu 112-32(%r9),%xmm0
	vpunpckhqdq %xmm8,%xmm8,%xmm9
	vpxor %xmm6,%xmm5,%xmm5
	vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2
	vpxor %xmm8,%xmm9,%xmm9
	vpxor %xmm1,%xmm2,%xmm2
	vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7
	vpxor %xmm4,%xmm7,%xmm4

	vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6
	vmovdqu 0-32(%r9),%xmm3
	vpunpckhqdq %xmm14,%xmm14,%xmm1
	vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8
	vpxor %xmm14,%xmm1,%xmm1
	vpxor %xmm5,%xmm6,%xmm5
	vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9
	vmovdqu 32-32(%r9),%xmm15
	vpxor %xmm2,%xmm8,%xmm7
	vpxor %xmm4,%xmm9,%xmm6

	vmovdqu 16-32(%r9),%xmm0
	vpxor %xmm5,%xmm7,%xmm9
	vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4
	vpxor %xmm9,%xmm6,%xmm6
	vpunpckhqdq %xmm13,%xmm13,%xmm2
	vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14
	vpxor %xmm13,%xmm2,%xmm2
	vpslldq $8,%xmm6,%xmm9
	vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
	vpxor %xmm9,%xmm5,%xmm8
	vpsrldq $8,%xmm6,%xmm6
	vpxor %xmm6,%xmm7,%xmm7

	vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5
	vmovdqu 48-32(%r9),%xmm3
	vpxor %xmm4,%xmm5,%xmm5
	vpunpckhqdq %xmm12,%xmm12,%xmm9
	vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13
	vpxor %xmm12,%xmm9,%xmm9
	vpxor %xmm14,%xmm13,%xmm13
	vpalignr $8,%xmm8,%xmm8,%xmm14
	vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
	vmovdqu 80-32(%r9),%xmm15
	vpxor %xmm1,%xmm2,%xmm2

	vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4
	vmovdqu 64-32(%r9),%xmm0
	vpxor %xmm5,%xmm4,%xmm4
	vpunpckhqdq %xmm11,%xmm11,%xmm1
	vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12
	vpxor %xmm11,%xmm1,%xmm1
	vpxor %xmm13,%xmm12,%xmm12
	vxorps 16(%rsp),%xmm7,%xmm7
	vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9
	vpxor %xmm2,%xmm9,%xmm9

	vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
	vxorps %xmm14,%xmm8,%xmm8

	vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5
	vmovdqu 96-32(%r9),%xmm3
	vpxor %xmm4,%xmm5,%xmm5
	vpunpckhqdq %xmm10,%xmm10,%xmm2
	vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11
	vpxor %xmm10,%xmm2,%xmm2
	vpalignr $8,%xmm8,%xmm8,%xmm14
	vpxor %xmm12,%xmm11,%xmm11
	vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1
	vmovdqu 128-32(%r9),%xmm15
	vpxor %xmm9,%xmm1,%xmm1

	vxorps %xmm7,%xmm14,%xmm14
	vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
	vxorps %xmm14,%xmm8,%xmm8

	vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4
	vmovdqu 112-32(%r9),%xmm0
	vpxor %xmm5,%xmm4,%xmm4
	vpunpckhqdq %xmm8,%xmm8,%xmm9
	vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10
	vpxor %xmm8,%xmm9,%xmm9
	vpxor %xmm11,%xmm10,%xmm10
	vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2
	vpxor %xmm1,%xmm2,%xmm2

	vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5
	vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7
	vpxor %xmm4,%xmm5,%xmm5
	vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6
	vpxor %xmm10,%xmm7,%xmm7
	vpxor %xmm2,%xmm6,%xmm6

	vpxor %xmm5,%xmm7,%xmm4
	vpxor %xmm4,%xmm6,%xmm6
	vpslldq $8,%xmm6,%xmm1
	vmovdqu 16(%r11),%xmm3
	vpsrldq $8,%xmm6,%xmm6
	vpxor %xmm1,%xmm5,%xmm8
	vpxor %xmm6,%xmm7,%xmm7

	vpalignr $8,%xmm8,%xmm8,%xmm2
	vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
	vpxor %xmm2,%xmm8,%xmm8

	vpalignr $8,%xmm8,%xmm8,%xmm2
	vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
	vpxor %xmm7,%xmm2,%xmm2
	vpxor %xmm2,%xmm8,%xmm8
	vpshufb (%r11),%xmm8,%xmm8
	movq -56(%rax),%r9
.cfi_restore %r9
	vmovdqu %xmm8,(%r9)

	vzeroupper
	movq -48(%rax),%r15
.cfi_restore %r15
	movq -40(%rax),%r14
.cfi_restore %r14
	movq -32(%rax),%r13
.cfi_restore %r13
	movq -24(%rax),%r12
.cfi_restore %r12
	movq -16(%rax),%rbp
.cfi_restore %rbp
	movq -8(%rax),%rbx
.cfi_restore %rbx
	leaq (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lgcm_enc_abort:
	movq %r10,%rax
	.byte 0xf3,0xc3
.cfi_endproc
.size aesni_gcm_encrypt,.-aesni_gcm_encrypt

/* Some utility routines */

/*
 * clear all fpu registers
 * void clear_fpu_regs_avx(void);
 */
.globl clear_fpu_regs_avx
.type clear_fpu_regs_avx,@function
.align 32
clear_fpu_regs_avx:
	vzeroall
	RET
.size clear_fpu_regs_avx,.-clear_fpu_regs_avx

/*
 * void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
 *
 * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
 * stores the result at `dst'. The XOR is performed using FPU registers,
 * so make sure FPU state is saved when running this in the kernel.
 */
.globl gcm_xor_avx
.type gcm_xor_avx,@function
.align 32
gcm_xor_avx:
	movdqu (%rdi), %xmm0
	movdqu (%rsi), %xmm1
	pxor %xmm1, %xmm0
	movdqu %xmm0, (%rsi)
	RET
.size gcm_xor_avx,.-gcm_xor_avx

/*
 * Toggle a boolean_t value atomically and return the new value.
 * boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
 */
.globl atomic_toggle_boolean_nv
.type atomic_toggle_boolean_nv,@function
.align 32
atomic_toggle_boolean_nv:
	xorl %eax, %eax
	lock
	xorl $1, (%rdi)
	jz 1f
	movl $1, %eax
1:
	RET
.size atomic_toggle_boolean_nv,.-atomic_toggle_boolean_nv

.align 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lpoly:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.Lone_msb:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Ltwo_lsb:
.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.Lone_lsb:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64

/* Mark the stack non-executable. */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */
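
/*
 * Example (C caller's view) of the utility routines above, a sketch based
 * only on the prototypes documented in their comments.  The FPU save and
 * restore calls (kfpu_begin/kfpu_end here) are an assumption standing in
 * for whatever mechanism the surrounding kernel code actually uses:
 *
 *	uint8_t tag[16], block[16];
 *
 *	kfpu_begin();
 *	gcm_xor_avx(block, tag);	// tag ^= block, via %xmm registers
 *	clear_fpu_regs_avx();		// scrub sensitive data from FPU regs
 *	kfpu_end();
 */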