/*
 * This file contains the core of a bitslice DES implementation for x86-64/SSE2.
 * It is part of John the Ripper password cracker,
 * Copyright (c) 2000-2001,2005,2006,2008,2011,2012,2015,2019 by Solar Designer
 * Copyright (c) 2015,2017 by magnum
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted. (This is a heavily cut-down "BSD license".)
 *
 * Gate counts per S-box: 49 44 46 33 48 46 46 41
 * Average: 44.125
 *
 * The Boolean expressions corresponding to DES S-boxes have been generated
 * by Roman Rusakov <roman_rus at openwall.com> for use in Openwall's
 * John the Ripper password cracker: http://www.openwall.com/john/
 * Being mathematical formulas, they are not copyrighted and are free for reuse
 * by anyone.
 *
 * The x86-64/SSE2 code for the S-boxes was generated by Solar Designer using a
 * Perl script. The script performed various optimizations, including the
 * x86-64 specific optimization of preferring registers 0-7 over 8-15 to reduce
 * the number of instruction prefixes (and thus code size). The instruction
 * scheduling has been tuned for Core 2.
 *
 * The effort has been sponsored by Rapid7: http://www.rapid7.com
 *
 * Addition of single DES encryption with no salt by Deepika Dutta Mishra
 * <dipikadutta at gmail.com> in 2013, no rights reserved.
 *
 * ...with changes in the jumbo patch, by Alain Espinosa (starting with a
 * comment further down this file).
 *
 * Various tweaks & fixes and support for Win64 and Linux-X32 ABIs as well
 * as CPU detection additions by magnum 2010-2015.
 */

#include "arch.h"

#if defined (_WIN64) || defined (__CYGWIN64__)
/*
 * MS use a different x64 calling convention than everyone else:
 * Arguments: RCX, RDX, R8, R9 then stack right-to-left.
 * Volatile: RAX, RCX, RDX, R8, R9, R10, R11, XMM0:XMM5
 * Non-volatile: RBX, RBP, RSI, RDI, R12:R15, XMM6:XMM15
 * Return: RAX.
 */
#define ARG1 %rdi
/*
 * Adapt the MS ABI to the System V register usage assumed by the body of this
 * file: save the registers that are non-volatile under MS but volatile under
 * System V (xmm6-xmm15, rdi, rsi), then copy the first two MS argument
 * registers (rcx, rdx) into their System V counterparts (rdi, rsi).
 * The extra 8 bytes in the frame restore 16-byte stack alignment (on entry
 * rsp % 16 == 8 because of the return address), which the aligned movapd
 * stores below require.  EPILOGUE must mirror this exactly.
 */
#define PROLOGUE \
	subq $(8+10*16), %rsp; \
	movapd %xmm6, 0*16(%rsp); \
	movapd %xmm7, 1*16(%rsp); \
	movapd %xmm8, 2*16(%rsp); \
	movapd %xmm9, 3*16(%rsp); \
	movapd %xmm10, 4*16(%rsp); \
	movapd %xmm11, 5*16(%rsp); \
	movapd %xmm12, 6*16(%rsp); \
	movapd %xmm13, 7*16(%rsp); \
	movapd %xmm14, 8*16(%rsp); \
	movapd %xmm15, 9*16(%rsp); \
	push %rdi; \
	push %rsi; \
	movq %rcx, %rdi; \
	movq %rdx, %rsi

/* Undo PROLOGUE: pop rsi/rdi first so the movapd offsets match the stores. */
#define EPILOGUE \
	pop %rsi; \
	pop %rdi; \
	movapd 0*16(%rsp), %xmm6; \
	movapd 1*16(%rsp), %xmm7; \
	movapd 2*16(%rsp), %xmm8; \
	movapd 3*16(%rsp), %xmm9; \
	movapd 4*16(%rsp), %xmm10; \
	movapd 5*16(%rsp), %xmm11; \
	movapd 6*16(%rsp), %xmm12; \
	movapd 7*16(%rsp), %xmm13; \
	movapd 8*16(%rsp), %xmm14; \
	movapd 9*16(%rsp), %xmm15; \
	addq $(8+10*16), %rsp
#else
/*
 * System V AMD64 ABI (followed by everybody else including linux-X32):
 * Arguments: RDI, RSI, RDX, RCX, R8, R9 then stack right-to-left.
 * Volatile: RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, XMM0:XMM15
 * Non-volatile: RBX, RBP, R12:R15
 * Return: RAX.
 */
#define ARG1 %rdi
/* Nothing to save: the code below only clobbers registers that are volatile here. */
#define PROLOGUE
#define EPILOGUE
#endif

/*
 * Throughout this file ARG1 is *pcount so it's 32-bit for X32, although
 * using (%rdi) works fine too without any warnings.
 */
#ifdef __ILP32__
#undef ARG1
#define ARG1 %edi
#endif

/*
 * DO_ALIGN(log) aligns to a 2^log byte boundary.  With ALIGN_LOG the
 * assembler's .align takes the exponent directly; otherwise .align is
 * assumed to take a byte count, hence the 1 << log.
 */
#ifdef ALIGN_LOG
#define DO_ALIGN(log) .align log
#else
#define DO_ALIGN(log) .align 1 << log
#endif

#if DES_BS_ASM

/* Prepend an underscore on platforms whose ABI mangles C symbol names that way. */
#ifdef UNDERSCORES
#define DES_bs_all _DES_bs_all
#define DES_bs_init_asm _DES_bs_init_asm
#define DES_bs_crypt _DES_bs_crypt
#define DES_bs_crypt_25 _DES_bs_crypt_25
#define DES_bs_crypt_LM _DES_bs_crypt_LM
#define DES_bs_crypt_plain _DES_bs_crypt_plain
#define DES_bs_P _DES_bs_P
#endif

#ifdef __sun
/* Sun's assembler doesn't recognize .space */
#define DO_SPACE(size) .zero size
#else
/* Mac OS X assembler doesn't recognize .zero */
#define DO_SPACE(size) .space size
#endif

/* Sun's assembler can't multiply, but at least it can add... */
/* nptr(n) = n * sizeof(pointer): 4 bytes on X32, 8 bytes otherwise. */
#ifdef __ILP32__
#define nptr(n) n+n+n+n
#else
#define nptr(n) n+n+n+n+n+n+n+n
#endif
/* nvec(n) = n * 16, i.e. n SSE2 vectors. */
#define nvec(n) n+n+n+n+n+n+n+n+n+n+n+n+n+n+n+n

#ifdef BSD
.data
#else
.bss
#endif

/*
 * The DES_bs_all context, laid out here with nptr()/nvec() so the offsets are
 * ABI-independent.  NOTE(review): this layout must stay in sync with the C
 * declaration of DES_bs_all - verify any change against the C side.
 */
.globl DES_bs_all
DO_ALIGN(6)
DES_bs_all:
DES_bs_all_KSp:
/* 0x300 (= 16 rounds * 48) pointer-sized key-schedule slots */
DO_SPACE(nptr(0x300))
/* KS_p and KS_v label the same storage: it is used either as pointers
 * (DES_bs_crypt_LM path, via k()) or as expanded key vectors
 * (DES_bs_crypt/DES_bs_crypt_25 path, via K()). */
DES_bs_all_KS_p:
DES_bs_all_KS_v:
DO_SPACE(nvec(0x300))
/* 96 pointers into B[], selected per salt; dereferenced by xor_E */
DES_bs_all_E:
DO_SPACE(nptr(96))
/* 56 key bit-vectors, written by DES_bs_finalize_keys (7 per 8 iterations) */
DES_bs_all_K:
DO_SPACE(nvec(56))
/* 64 bit-vectors holding the bitsliced data block */
DES_bs_all_B:
DO_SPACE(nvec(64))
/* 16 scratch vectors: tmp_at(0) is pnot, tmp_at(8..15) are mask01..mask80 */
DES_bs_all_tmp:
DO_SPACE(nvec(16))
/* raw key material prior to finalization */
DES_bs_all_xkeys:
DO_SPACE(nvec(64))
DES_bs_all_pxkeys:
DO_SPACE(nptr(128))
/* flag tested by DES_bs_crypt*: non-zero forces DES_bs_finalize_keys */
DES_bs_all_keys_changed:
DO_SPACE(4)
DES_bs_all_salt:
DO_SPACE(4)
DES_bs_all_Ens:
DO_SPACE(nptr(48))

.globl DES_bs_P
DO_ALIGN(6)
DES_bs_P:
DO_SPACE(nvec(64))

/* RIP-relative accessors for the arrays above (PIC-safe). */
#define E(i) DES_bs_all_E+nptr(i)(%rip)
#define B(i) DES_bs_all_B+nvec(i)(%rip)
#define tmp_at(i) DES_bs_all_tmp+nvec(i)(%rip)
#define P(i) DES_bs_P+nvec(i)(%rip)

/* All-ones vector, set up by DES_bs_init_asm; "pxor pnot,x" complements x. */
#define pnot tmp_at(0)

/* The six S-box input bit-vectors live in xmm0-xmm5. */
#define a1 %xmm0
#define a2 %xmm1
#define a3 %xmm2
#define a4 %xmm3
#define a5 %xmm4
#define a6 %xmm5

/*
 * S1-S8 below: DES S-boxes as machine-generated Boolean circuits (gate
 * counts in the header comment).  Each takes its six inputs in a1-a6
 * (%xmm0-%xmm5) and XOR-accumulates the four outputs into out1..out4:
 * "pxor outN,reg" folds in the previous value, the final "movdqa reg,outN"
 * stores the result.  May clobber all of %xmm0-%xmm15.  The instruction
 * order is part of the (Core 2-tuned) scheduling - do not hand-edit.
 */
#define S1(out1, out2, out3, out4) \
	movdqa %xmm4,%xmm7; \
	movdqa %xmm5,%xmm10; \
	pandn %xmm0,%xmm4; \
	movdqa %xmm2,%xmm13; \
	movdqa %xmm4,%xmm14; \
	por %xmm2,%xmm10; \
	movdqa %xmm5,%xmm11; \
	pxor %xmm0,%xmm13; \
	pxor %xmm7,%xmm11; \
	pxor %xmm3,%xmm14; \
	movdqa %xmm13,%xmm12; \
	movdqa %xmm11,%xmm15; \
	pand %xmm10,%xmm13; \
	movdqa %xmm14,%xmm9; \
	movdqa %xmm13,%xmm8; \
	pxor %xmm2,%xmm15; \
	pxor %xmm3,%xmm8; \
	pandn %xmm11,%xmm12; \
	pandn %xmm8,%xmm9; \
	por %xmm5,%xmm13; \
	por %xmm0,%xmm5; \
	pandn %xmm7,%xmm8; \
	pandn %xmm14,%xmm15; \
	movdqa %xmm5,%xmm6; \
	pxor %xmm15,%xmm13; \
	movdqa %xmm9,%xmm15; \
	por %xmm13,%xmm6; \
	pandn %xmm3,%xmm5; \
	movdqa %xmm8,%xmm3; \
	pandn %xmm13,%xmm15; \
	pxor %xmm6,%xmm8; \
	pxor %xmm3,%xmm5; \
	pand %xmm10,%xmm13; \
	pandn %xmm2,%xmm4; \
	movdqa %xmm6,%xmm2; \
	pxor %xmm10,%xmm6; \
	pxor %xmm14,%xmm2; \
	pandn %xmm2,%xmm4; \
	movdqa %xmm4,%xmm2; \
	pxor pnot,%xmm2; \
	pxor %xmm11,%xmm4; \
	pxor %xmm2,%xmm13; \
	movdqa %xmm1,%xmm2; \
	por %xmm3,%xmm4; \
	pandn %xmm8,%xmm2; \
	por %xmm7,%xmm14; \
	pxor %xmm10,%xmm4; \
	por %xmm1,%xmm9; \
	pxor %xmm13,%xmm2; \
	pxor %xmm0,%xmm4; \
	movdqa %xmm1,%xmm0; \
	pxor %xmm4,%xmm13; \
	pxor %xmm13,%xmm9; \
	por %xmm12,%xmm5; \
	pxor out1,%xmm9; \
	por %xmm5,%xmm6; \
	por %xmm11,%xmm13; \
	pxor %xmm4,%xmm6; \
	movdqa %xmm9,out1; \
	por %xmm15,%xmm0; \
	pxor %xmm6,%xmm13; \
	pxor out3,%xmm2; \
	pxor out2,%xmm13; \
	pand %xmm15,%xmm4; \
	pandn %xmm14,%xmm6; \
	pxor %xmm0,%xmm13; \
	pxor %xmm6,%xmm4; \
	movdqa %xmm2,out3; \
	por %xmm1,%xmm4; \
	pxor %xmm5,%xmm4; \
	movdqa %xmm13,out2; \
	pxor out4,%xmm4; \
	movdqa %xmm4,out4

/* DES S-box 2 (44 gates); same contract as S1. */
#define S2(out1, out2, out3, out4) \
	movdqa %xmm4,%xmm13; \
	movdqa %xmm5,%xmm6; \
	pxor %xmm1,%xmm13; \
	movdqa %xmm5,%xmm8; \
	pandn %xmm0,%xmm6; \
	movdqa %xmm13,%xmm7; \
	pandn %xmm4,%xmm6; \
	movdqa %xmm5,%xmm9; \
	movdqa %xmm6,%xmm14; \
	pandn %xmm13,%xmm8; \
	pand %xmm0,%xmm7; \
	pxor pnot,%xmm0; \
	por %xmm1,%xmm14; \
	movdqa %xmm8,%xmm12; \
	pxor %xmm4,%xmm7; \
	pand %xmm2,%xmm9; \
	pxor %xmm5,%xmm13; \
	pxor %xmm8,%xmm6; \
	movdqa %xmm9,%xmm10; \
	pand %xmm14,%xmm6; \
	pandn %xmm6,%xmm10; \
	pand %xmm2,%xmm6; \
	pandn %xmm7,%xmm12; \
	pxor %xmm6,%xmm0; \
	movdqa %xmm9,%xmm5; \
	pandn %xmm3,%xmm10; \
	pandn %xmm13,%xmm5; \
	movdqa %xmm5,%xmm11; \
	pandn %xmm1,%xmm5; \
	pxor %xmm0,%xmm11; \
	pxor %xmm13,%xmm2; \
	por %xmm3,%xmm12; \
	pxor %xmm5,%xmm7; \
	movdqa %xmm7,%xmm1; \
	pxor out2,%xmm10; \
	pandn %xmm0,%xmm1; \
	movdqa %xmm3,%xmm0; \
	pxor %xmm2,%xmm1; \
	pandn %xmm14,%xmm0; \
	pxor %xmm11,%xmm14; \
	pxor %xmm5,%xmm6; \
	pxor %xmm1,%xmm0; \
	por %xmm6,%xmm2; \
	por %xmm14,%xmm9; \
	pxor %xmm1,%xmm6; \
	pxor %xmm11,%xmm10; \
	pand %xmm9,%xmm6; \
	pxor out3,%xmm2; \
	pxor %xmm4,%xmm6; \
	pandn %xmm6,%xmm8; \
	pxor %xmm9,%xmm2; \
	pxor %xmm11,%xmm8; \
	por %xmm8,%xmm3; \
	por %xmm13,%xmm14; \
	pxor %xmm3,%xmm2; \
	pandn %xmm8,%xmm7; \
	movdqa %xmm2,out3; \
	pxor out4,%xmm7; \
	movdqa %xmm10,out2; \
	pxor %xmm14,%xmm7; \
	pxor out1,%xmm0; \
	pxor %xmm12,%xmm7; \
	movdqa %xmm0,out1; \
	movdqa %xmm7,out4

/* DES S-box 3 (46 gates); same contract as S1. */
#define S3(out1, out2, out3, out4) \
	movdqa %xmm1,%xmm6; \
	movdqa %xmm5,%xmm13; \
	pandn %xmm0,%xmm6; \
	movdqa %xmm5,%xmm8; \
	pxor %xmm2,%xmm13; \
	movdqa %xmm0,%xmm11; \
	por %xmm13,%xmm6; \
	movdqa %xmm13,%xmm9; \
	pxor %xmm3,%xmm8; \
	movdqa %xmm3,%xmm15; \
	pandn %xmm8,%xmm11; \
	pxor %xmm1,%xmm9; \
	movdqa %xmm11,%xmm10; \
	movdqa %xmm4,%xmm12; \
	movdqa %xmm5,%xmm14; \
	pxor %xmm6,%xmm10; \
	pandn %xmm9,%xmm14; \
	movdqa %xmm10,%xmm7; \
	pxor %xmm14,%xmm6; \
	movdqa %xmm6,%xmm14; \
	pand %xmm5,%xmm7; \
	pand %xmm3,%xmm5; \
	pandn %xmm10,%xmm14; \
	por %xmm3,%xmm7; \
	pandn %xmm10,%xmm12; \
	pand %xmm0,%xmm7; \
	pxor out4,%xmm12; \
	pxor %xmm9,%xmm7; \
	pand %xmm13,%xmm8; \
	pxor %xmm0,%xmm15; \
	pxor %xmm7,%xmm12; \
	pxor %xmm15,%xmm6; \
	pand %xmm3,%xmm13; \
	por %xmm2,%xmm6; \
	por %xmm11,%xmm15; \
	pandn %xmm6,%xmm8; \
	movdqa %xmm15,%xmm6; \
	pand %xmm4,%xmm8; \
	pandn %xmm7,%xmm6; \
	movdqa %xmm1,%xmm7; \
	pandn %xmm10,%xmm1; \
	pandn %xmm5,%xmm7; \
	por %xmm9,%xmm5; \
	pxor %xmm7,%xmm6; \
	movdqa %xmm2,%xmm7; \
	pxor %xmm9,%xmm15; \
	pandn %xmm6,%xmm7; \
	pandn %xmm1,%xmm2; \
	pandn %xmm5,%xmm7; \
	pxor pnot,%xmm15; \
	pxor out2,%xmm7; \
	pxor %xmm2,%xmm15; \
	pandn %xmm4,%xmm14; \
	movdqa %xmm12,out4; \
	por %xmm15,%xmm9; \
	pxor %xmm0,%xmm7; \
	pandn %xmm9,%xmm13; \
	por %xmm11,%xmm1; \
	pxor %xmm8,%xmm7; \
	pxor %xmm1,%xmm13; \
	por %xmm4,%xmm6; \
	pxor out1,%xmm14; \
	pxor %xmm13,%xmm6; \
	pxor %xmm15,%xmm14; \
	pxor out3,%xmm6; \
	movdqa %xmm7,out2; \
	movdqa %xmm14,out1; \
	movdqa %xmm6,out3

/* DES S-box 4 (33 gates, the smallest); same contract as S1. */
#define S4(out1, out2, out3, out4) \
	movdqa %xmm3,%xmm7; \
	movdqa %xmm1,%xmm8; \
	pxor %xmm2,%xmm0; \
	pxor %xmm4,%xmm2; \
	por %xmm1,%xmm3; \
	pandn %xmm2,%xmm1; \
	pxor %xmm4,%xmm3; \
	movdqa %xmm1,%xmm10; \
	pxor %xmm7,%xmm1; \
	pandn %xmm2,%xmm3; \
	movdqa %xmm1,%xmm11; \
	movdqa %xmm3,%xmm6; \
	pxor %xmm8,%xmm7; \
	por %xmm0,%xmm1; \
	pandn %xmm1,%xmm3; \
	movdqa %xmm3,%xmm1; \
	movdqa %xmm5,%xmm12; \
	pxor %xmm8,%xmm3; \
	pand %xmm3,%xmm11; \
	movdqa %xmm11,%xmm9; \
	por %xmm4,%xmm10; \
	pxor %xmm3,%xmm0; \
	pandn %xmm2,%xmm11; \
	pandn %xmm0,%xmm11; \
	pxor %xmm0,%xmm10; \
	movdqa %xmm7,%xmm0; \
	pxor %xmm11,%xmm6; \
	movdqa %xmm6,%xmm4; \
	pandn %xmm5,%xmm6; \
	pandn %xmm10,%xmm7; \
	pxor out1,%xmm6; \
	pandn %xmm4,%xmm5; \
	pxor %xmm1,%xmm7; \
	pxor %xmm7,%xmm6; \
	pxor pnot,%xmm7; \
	pxor %xmm7,%xmm5; \
	pxor %xmm4,%xmm7; \
	pxor out2,%xmm5; \
	movdqa %xmm5,out2; \
	pandn %xmm7,%xmm0; \
	movdqa %xmm12,%xmm7; \
	por %xmm9,%xmm0; \
	movdqa %xmm6,out1; \
	pxor %xmm10,%xmm0; \
	por %xmm3,%xmm12; \
	pxor %xmm0,%xmm12; \
	pxor out4,%xmm0; \
	pand %xmm7,%xmm3; \
	pxor out3,%xmm12; \
	movdqa %xmm12,out3; \
	pxor %xmm3,%xmm0; \
	movdqa %xmm0,out4

/* DES S-box 5 (48 gates); same contract as S1. */
#define S5(out1, out2, out3, out4) \
	movdqa %xmm2,%xmm6; \
	por %xmm0,%xmm2; \
	movdqa %xmm5,%xmm7; \
	pandn %xmm2,%xmm5; \
	movdqa %xmm3,%xmm14; \
	pandn %xmm5,%xmm3; \
	pxor %xmm0,%xmm5; \
	pxor %xmm6,%xmm3; \
	movdqa %xmm5,%xmm15; \
	pxor %xmm6,%xmm5; \
	movdqa %xmm3,%xmm10; \
	pand %xmm4,%xmm3; \
	movdqa %xmm5,%xmm8; \
	por %xmm0,%xmm5; \
	pxor %xmm14,%xmm3; \
	pxor %xmm5,%xmm3; \
	movdqa %xmm5,%xmm12; \
	por %xmm14,%xmm8; \
	pxor %xmm0,%xmm2; \
	pxor %xmm3,%xmm7; \
	pand %xmm14,%xmm12; \
	movdqa %xmm7,%xmm9; \
	por %xmm15,%xmm7; \
	pxor %xmm15,%xmm12; \
	pandn %xmm7,%xmm0; \
	pand %xmm4,%xmm7; \
	pxor %xmm8,%xmm4; \
	pxor %xmm7,%xmm12; \
	movdqa %xmm0,%xmm6; \
	pxor %xmm4,%xmm0; \
	pxor %xmm10,%xmm6; \
	movdqa %xmm1,%xmm13; \
	pandn %xmm4,%xmm6; \
	pand %xmm10,%xmm5; \
	pxor pnot,%xmm6; \
	por %xmm12,%xmm0; \
	pandn %xmm6,%xmm13; \
	movdqa %xmm7,%xmm6; \
	pandn %xmm10,%xmm7; \
	pxor %xmm8,%xmm10; \
	pandn %xmm0,%xmm7; \
	pxor %xmm13,%xmm3; \
	pand %xmm7,%xmm9; \
	movdqa %xmm7,%xmm0; \
	pxor %xmm4,%xmm9; \
	pandn %xmm8,%xmm0; \
	pand %xmm1,%xmm8; \
	por %xmm1,%xmm0; \
	pand %xmm9,%xmm14; \
	pxor %xmm2,%xmm7; \
	por %xmm9,%xmm5; \
	pxor %xmm14,%xmm7; \
	pxor %xmm6,%xmm5; \
	pxor %xmm7,%xmm0; \
	pxor %xmm15,%xmm9; \
	pandn %xmm10,%xmm7; \
	pand %xmm1,%xmm5; \
	pxor %xmm9,%xmm7; \
	pxor %xmm12,%xmm5; \
	pxor %xmm8,%xmm7; \
	pxor out3,%xmm3; \
	pxor out4,%xmm5; \
	pxor out1,%xmm0; \
	pxor out2,%xmm7; \
	movdqa %xmm3,out3; \
	movdqa %xmm5,out4; \
	movdqa %xmm0,out1; \
	movdqa %xmm7,out2

/* DES S-box 6 (46 gates); same contract as S1, additionally spills two
 * inputs to tmp_at(1)/tmp_at(2) because the circuit needs more live values
 * than registers. */
#define S6(out1, out2, out3, out4) \
	movdqa %xmm5,%xmm8; \
	por %xmm1,%xmm5; \
	movdqa %xmm4,%xmm7; \
	movdqa %xmm4,tmp_at(2); \
	movdqa %xmm2,%xmm11; \
	pxor %xmm1,%xmm4; \
	pand %xmm0,%xmm5; \
	movdqa %xmm3,%xmm15; \
	pxor %xmm5,%xmm4; \
	movdqa %xmm4,%xmm9; \
	pxor %xmm0,%xmm11; \
	pxor %xmm8,%xmm4; \
	movdqa %xmm0,tmp_at(1); \
	movdqa %xmm4,%xmm12; \
	pand %xmm0,%xmm4; \
	movdqa %xmm11,%xmm0; \
	pandn %xmm7,%xmm12; \
	movdqa %xmm4,%xmm10; \
	pxor %xmm1,%xmm4; \
	por %xmm1,%xmm11; \
	por %xmm4,%xmm0; \
	movdqa %xmm0,%xmm6; \
	por %xmm12,%xmm4; \
	pxor %xmm9,%xmm0; \
	pxor %xmm1,%xmm6; \
	movdqa %xmm4,%xmm14; \
	movdqa %xmm6,%xmm7; \
	pandn %xmm8,%xmm6; \
	pxor %xmm8,%xmm10; \
	pxor %xmm2,%xmm6; \
	pand %xmm0,%xmm2; \
	movdqa %xmm3,%xmm1; \
	pandn %xmm2,%xmm8; \
	movdqa %xmm2,%xmm13; \
	pxor %xmm8,%xmm14; \
	pxor %xmm11,%xmm7; \
	pand %xmm14,%xmm15; \
	pandn tmp_at(2),%xmm2; \
	pxor %xmm0,%xmm15; \
	por tmp_at(1),%xmm0; \
	pxor out4,%xmm15; \
	pand %xmm4,%xmm0; \
	por %xmm6,%xmm2; \
	pxor %xmm6,%xmm0; \
	pxor pnot,%xmm7; \
	pandn %xmm0,%xmm8; \
	pxor %xmm9,%xmm0; \
	por %xmm3,%xmm12; \
	pandn tmp_at(2),%xmm0; \
	por %xmm2,%xmm5; \
	pxor %xmm7,%xmm0; \
	pxor tmp_at(1),%xmm6; \
	pandn %xmm0,%xmm1; \
	pandn %xmm2,%xmm3; \
	pand %xmm10,%xmm6; \
	pxor %xmm3,%xmm7; \
	pxor out3,%xmm8; \
	pxor %xmm6,%xmm7; \
	pxor %xmm12,%xmm8; \
	pxor %xmm1,%xmm5; \
	pxor %xmm13,%xmm7; \
	pxor %xmm11,%xmm14; \
	pxor out2,%xmm5; \
	pxor %xmm14,%xmm5; \
	movdqa %xmm15,out4; \
	pxor out1,%xmm7; \
	movdqa %xmm8,out3; \
	movdqa %xmm5,out2; \
	movdqa %xmm7,out1

/* DES S-box 7 (46 gates); same contract as S1. */
#define S7(out1, out2, out3, out4) \
	movdqa %xmm4,%xmm14; \
	pxor %xmm3,%xmm4; \
	movdqa %xmm3,%xmm11; \
	movdqa %xmm4,%xmm12; \
	pand %xmm4,%xmm11; \
	pxor %xmm2,%xmm4; \
	movdqa %xmm11,%xmm6; \
	movdqa %xmm4,%xmm7; \
	movdqa %xmm11,%xmm15; \
	pand %xmm5,%xmm6; \
	pxor %xmm1,%xmm11; \
	movdqa %xmm7,%xmm13; \
	pand %xmm5,%xmm4; \
	movdqa %xmm11,%xmm10; \
	pxor %xmm5,%xmm12; \
	pxor %xmm2,%xmm6; \
	movdqa %xmm6,%xmm8; \
	por %xmm10,%xmm6; \
	pand %xmm4,%xmm11; \
	pandn %xmm0,%xmm11; \
	pxor %xmm12,%xmm6; \
	pxor %xmm4,%xmm8; \
	pandn %xmm14,%xmm7; \
	movdqa %xmm7,%xmm9; \
	pxor %xmm6,%xmm11; \
	pxor %xmm12,%xmm4; \
	por %xmm10,%xmm7; \
	pxor %xmm8,%xmm7; \
	pandn %xmm3,%xmm4; \
	pxor %xmm14,%xmm8; \
	pandn %xmm10,%xmm4; \
	pxor %xmm4,%xmm8; \
	pandn %xmm13,%xmm12; \
	pand %xmm8,%xmm2; \
	por %xmm15,%xmm6; \
	por %xmm2,%xmm6; \
	pxor %xmm12,%xmm6; \
	movdqa %xmm0,%xmm3; \
	pandn %xmm6,%xmm0; \
	movdqa %xmm6,%xmm4; \
	por %xmm8,%xmm6; \
	pand %xmm5,%xmm6; \
	pxor %xmm7,%xmm0; \
	por %xmm14,%xmm2; \
	pand %xmm6,%xmm1; \
	pxor %xmm4,%xmm7; \
	pxor %xmm6,%xmm2; \
	pxor %xmm7,%xmm1; \
	pxor %xmm14,%xmm7; \
	movdqa %xmm3,%xmm5; \
	por %xmm2,%xmm7; \
	pxor out1,%xmm0; \
	pand %xmm7,%xmm3; \
	pxor pnot,%xmm4; \
	pxor %xmm6,%xmm7; \
	por %xmm9,%xmm7; \
	pxor out4,%xmm11; \
	pxor %xmm3,%xmm8; \
	pxor %xmm4,%xmm7; \
	pandn %xmm7,%xmm5; \
	movdqa %xmm11,out4; \
	pxor out2,%xmm1; \
	movdqa %xmm0,out1; \
	pxor %xmm5,%xmm1; \
	pxor out3,%xmm8; \
	movdqa %xmm8,out3; \
	movdqa %xmm1,out2

/* DES S-box 8 (41 gates); same contract as S1. */
#define S8(out1, out2, out3, out4) \
	movdqa %xmm1,%xmm13; \
	pandn %xmm2,%xmm1; \
	movdqa %xmm2,%xmm11; \
	movdqa %xmm2,%xmm8; \
	pandn %xmm4,%xmm2; \
	movdqa %xmm1,%xmm6; \
	pxor %xmm3,%xmm2; \
	pandn %xmm13,%xmm11; \
	movdqa %xmm2,%xmm9; \
	pand %xmm0,%xmm2; \
	movdqa %xmm9,%xmm7; \
	pandn %xmm2,%xmm1; \
	pandn %xmm13,%xmm9; \
	pxor %xmm4,%xmm11; \
	movdqa %xmm9,%xmm12; \
	por %xmm0,%xmm9; \
	movdqa %xmm11,%xmm10; \
	pand %xmm9,%xmm11; \
	pxor pnot,%xmm7; \
	por %xmm11,%xmm2; \
	pxor %xmm11,%xmm7; \
	pandn %xmm8,%xmm9; \
	movdqa %xmm5,%xmm15; \
	pxor %xmm9,%xmm7; \
	por %xmm1,%xmm15; \
	pxor %xmm7,%xmm6; \
	pxor %xmm6,%xmm15; \
	pxor %xmm0,%xmm6; \
	movdqa %xmm6,%xmm14; \
	pxor %xmm13,%xmm7; \
	pand %xmm4,%xmm6; \
	pxor out2,%xmm15; \
	pxor %xmm7,%xmm6; \
	pxor %xmm6,%xmm12; \
	movdqa %xmm15,out2; \
	pxor %xmm2,%xmm6; \
	pxor %xmm4,%xmm14; \
	por %xmm13,%xmm6; \
	pand %xmm5,%xmm2; \
	por %xmm3,%xmm7; \
	pxor %xmm12,%xmm10; \
	pxor %xmm10,%xmm7; \
	pxor %xmm14,%xmm6; \
	pxor %xmm6,%xmm2; \
	pxor %xmm7,%xmm0; \
	pandn %xmm10,%xmm3; \
	pand %xmm5,%xmm0; \
	pand %xmm3,%xmm6; \
	pxor out3,%xmm2; \
	pxor %xmm6,%xmm7; \
	pxor %xmm1,%xmm7; \
	pxor out4,%xmm0; \
	movdqa %xmm2,out3; \
	por %xmm7,%xmm5; \
	pxor out1,%xmm5; \
	pxor %xmm12,%xmm0; \
	pxor %xmm12,%xmm5; \
	movdqa %xmm0,out4; \
	movdqa %xmm5,out1

/* All-zeroes register (aliases a6/%xmm5); callers must "pxor zero,zero"
 * before using the clear-block macros below. */
#define zero %xmm5

/* Store the zeroed vector into B(i)..B(i+7). */
#define DES_bs_clear_block_8(i) \
	movdqa zero,B(i); \
	movdqa zero,B(i + 1); \
	movdqa zero,B(i + 2); \
	movdqa zero,B(i + 3); \
	movdqa zero,B(i + 4); \
	movdqa zero,B(i + 5); \
	movdqa zero,B(i + 6); \
	movdqa zero,B(i + 7)

/* Zero the whole 64-vector bitsliced block B[]. */
#define DES_bs_clear_block \
	DES_bs_clear_block_8(0); \
	DES_bs_clear_block_8(8); \
	DES_bs_clear_block_8(16); \
	DES_bs_clear_block_8(24); \
	DES_bs_clear_block_8(32); \
	DES_bs_clear_block_8(40); \
	DES_bs_clear_block_8(48); \
	DES_bs_clear_block_8(56)

/* k_ptr walks the key schedule; K(i) indexes it as vectors, k(i) as pointers. */
#define k_ptr %rdx
#define K(i) nvec(i)(k_ptr)
#define k(i) nptr(i)(k_ptr)

/* Scratch pointer registers; the "p" variants are the 32-bit views used for
 * pointer-sized loads on X32 (where pointers are 4 bytes). */
#define tmp1 %rcx
#define tmp2 %rsi
#ifdef __ILP32__
#define tmp1p %ecx
#define tmp2p %esi
#else
#define tmp1p tmp1
#define tmp2p tmp2
#endif

/*
 * Load six consecutive key vectors K(i)..K(i+5) into a1..a6 and XOR each
 * with the data vector addressed by the corresponding E[] pointer (the
 * salt-dependent expansion selection).  Loads and XORs are interleaved in
 * pairs to hide load latency.
 */
#define xor_E(i) \
	mov E(i),tmp1p; \
	movdqa K(i),a1; \
	mov E(i + 1),tmp2p; \
	movdqa K(i + 1),a2; \
	pxor (tmp1),a1; \
	pxor (tmp2),a2; \
	mov E(i + 2),tmp1p; \
	movdqa K(i + 2),a3; \
	mov E(i + 3),tmp2p; \
	movdqa K(i + 3),a4; \
	pxor (tmp1),a5; \
	pxor (tmp2),a6

/* This variant's last two loads/XORs use indices i+4/i+5 directly: */
#undef xor_E
#define xor_E(i) \
	mov E(i),tmp1p; \
	movdqa K(i),a1; \
	mov E(i + 1),tmp2p; \
	movdqa K(i + 1),a2; \
	pxor (tmp1),a1; \
	pxor (tmp2),a2; \
	mov E(i + 2),tmp1p; \
	movdqa K(i + 2),a3; \
	mov E(i + 3),tmp2p; \
	movdqa K(i + 3),a4; \
	pxor (tmp1),a3; \
	pxor (tmp2),a4; \
	mov E(i + 4),tmp1p; \
	movdqa K(i + 4),a5; \
	mov E(i + 5),tmp2p; \
	movdqa K(i + 5),a6; \
	pxor (tmp1),a5; \
	pxor (tmp2),a6

/*
 * a1..a6 = B(bN) ^ K(kN): no expansion indirection, B indices are given
 * explicitly by the caller (used where E[] would be the identity).
 */
#define xor_B(b1, k1, b2, k2, b3, k3, b4, k4, b5, k5, b6, k6) \
	movdqa B(b1),a1; \
	movdqa B(b2),a2; \
	pxor K(k1),a1; \
	movdqa B(b3),a3; \
	pxor K(k2),a2; \
	movdqa B(b4),a4; \
	pxor K(k3),a3; \
	movdqa B(b5),a5; \
	pxor K(k4),a4; \
	movdqa B(b6),a6; \
	pxor K(k5),a5; \
	pxor K(k6),a6

/*
 * Like xor_B, but each key vector is fetched through a pointer slot k(kN)
 * (the KS_p form of the key schedule).  Split into prefix/suffix so that
 * the "special" variant can load a6 differently.
 * NOTE(review): the k(k1)/k(k2)/k(k6) loads use the full 64-bit tmp1/tmp2
 * while k(k3)/k(k4) use the 32-bit tmp1p/tmp2p forms; under __ILP32__ the
 * pointer slots are only 4 bytes, so the 64-bit loads look suspicious -
 * confirm whether this path is reachable on X32.
 */
#define xor_B_KS_p_prefix(b1, k1, b2, k2, b3, k3, b4, k4, k6) \
	mov k(k1),tmp1; \
	mov k(k2),tmp2; \
	movdqa B(b1),a1; \
	movdqa B(b2),a2; \
	pxor (tmp1),a1; \
	mov k(k3),tmp1p; \
	pxor (tmp2),a2; \
	mov k(k4),tmp2p; \
	movdqa B(b3),a3; \
	movdqa B(b4),a4; \
	pxor (tmp1),a3; \
	mov k(k6),tmp1; \
	pxor (tmp2),a4

/* Completes the prefix: a5 = B(b5) ^ *k(k5), a6 ^= *k(k6) (tmp1 set above). */
#define xor_B_KS_p_suffix(b5, k5) \
	mov k(k5),tmp2; \
	movdqa B(b5),a5; \
	pxor (tmp1),a6; \
	pxor (tmp2),a5

/* Full six-input form: a6 starts from B(b6). */
#define xor_B_KS_p(b1, k1, b2, k2, b3, k3, b4, k4, b5, k5, b6, k6) \
	xor_B_KS_p_prefix(b1, k1, b2, k2, b3, k3, b4, k4, k6); \
	movdqa B(b6),a6; \
	xor_B_KS_p_suffix(b5, k5)

/* Variant for call sites where a6 is already loaded by the caller. */
#define xor_B_KS_p_special(b1, k1, b2, k2, b3, k3, b4, k4, b5, k5, k6) \
	xor_B_KS_p_prefix(b1, k1, b2, k2, b3, k3, b4, k4, k6); \
	xor_B_KS_p_suffix(b5, k5)

/* Per-byte bit masks (0x01, 0x02, ... 0x80 repeated across the vector),
 * initialized by DES_bs_init_asm and used by the key-finalization code. */
#define mask01 tmp_at(8)
#define mask02 tmp_at(9)
#define mask04 tmp_at(10)
#define mask08 tmp_at(11)
#define mask10 tmp_at(12)
#define mask20 tmp_at(13)
#define mask40 tmp_at(14)
#define mask80 tmp_at(15)

831#define v_ptr %rax 832#define V(i) nvec(i)(v_ptr) 833 834#if 1 835#define SHLB1(reg) paddb reg,reg 836#else 837#define SHLB1(reg) psllq $1,reg 838#endif 839 840#define FINALIZE_NEXT_KEY_BITS_0_6 \ 841 movdqa V(0),%xmm0; \ 842 movdqa V(1),%xmm1; \ 843 movdqa V(2),%xmm2; \ 844 movdqa V(3),%xmm3; \ 845 pand %xmm7,%xmm0; \ 846 pand %xmm7,%xmm1; \ 847 pand %xmm7,%xmm2; \ 848 pand %xmm7,%xmm3; \ 849 SHLB1(%xmm1); \ 850 psllq $2,%xmm2; \ 851 psllq $3,%xmm3; \ 852 por %xmm0,%xmm1; \ 853 por %xmm2,%xmm3; \ 854 movdqa V(4),%xmm4; \ 855 movdqa V(5),%xmm5; \ 856 por %xmm1,%xmm3; \ 857 pand %xmm7,%xmm4; \ 858 pand %xmm7,%xmm5; \ 859 movdqa V(6),%xmm6; \ 860 movdqa V(7),%xmm0; \ 861 psllq $4,%xmm4; \ 862 pand %xmm7,%xmm6; \ 863 pand %xmm7,%xmm0; \ 864 psllq $5,%xmm5; \ 865 psllq $6,%xmm6; \ 866 psllq $7,%xmm0; \ 867 por %xmm4,%xmm5; \ 868 por %xmm6,%xmm3; \ 869 por %xmm5,%xmm0; \ 870 movdqa V(1),%xmm1; \ 871 por %xmm3,%xmm0; \ 872 movdqa V(2),%xmm2; \ 873 movdqa %xmm0,K(0); \ 874\ 875 movdqa V(0),%xmm0; \ 876 movdqa V(3),%xmm3; \ 877 pand %xmm8,%xmm1; \ 878 pand %xmm8,%xmm2; \ 879 pand %xmm8,%xmm0; \ 880 pand %xmm8,%xmm3; \ 881 psrlq $1,%xmm0; \ 882 SHLB1(%xmm2); \ 883 psllq $2,%xmm3; \ 884 por %xmm0,%xmm1; \ 885 por %xmm2,%xmm3; \ 886 movdqa V(4),%xmm4; \ 887 movdqa V(5),%xmm5; \ 888 por %xmm1,%xmm3; \ 889 pand %xmm8,%xmm4; \ 890 pand %xmm8,%xmm5; \ 891 movdqa V(6),%xmm6; \ 892 movdqa V(7),%xmm0; \ 893 psllq $3,%xmm4; \ 894 pand %xmm8,%xmm6; \ 895 pand %xmm8,%xmm0; \ 896 psllq $4,%xmm5; \ 897 psllq $5,%xmm6; \ 898 psllq $6,%xmm0; \ 899 por %xmm4,%xmm5; \ 900 por %xmm6,%xmm3; \ 901 por %xmm5,%xmm0; \ 902 movdqa V(1),%xmm1; \ 903 por %xmm3,%xmm0; \ 904 movdqa V(2),%xmm2; \ 905 movdqa %xmm0,K(1); \ 906\ 907 movdqa V(0),%xmm0; \ 908 movdqa V(3),%xmm3; \ 909 pand %xmm9,%xmm1; \ 910 pand %xmm9,%xmm2; \ 911 pand %xmm9,%xmm0; \ 912 pand %xmm9,%xmm3; \ 913 psrlq $1,%xmm1; \ 914 psrlq $2,%xmm0; \ 915 SHLB1(%xmm3); \ 916 por %xmm0,%xmm1; \ 917 por %xmm2,%xmm3; \ 918 movdqa V(4),%xmm4; 
\ 919 movdqa V(5),%xmm5; \ 920 por %xmm1,%xmm3; \ 921 pand %xmm9,%xmm4; \ 922 pand %xmm9,%xmm5; \ 923 movdqa V(6),%xmm6; \ 924 movdqa V(7),%xmm0; \ 925 psllq $2,%xmm4; \ 926 pand %xmm9,%xmm6; \ 927 pand %xmm9,%xmm0; \ 928 psllq $3,%xmm5; \ 929 psllq $4,%xmm6; \ 930 psllq $5,%xmm0; \ 931 por %xmm4,%xmm5; \ 932 por %xmm6,%xmm3; \ 933 por %xmm5,%xmm0; \ 934 movdqa V(1),%xmm1; \ 935 por %xmm3,%xmm0; \ 936 movdqa V(2),%xmm2; \ 937 movdqa %xmm0,K(2); \ 938\ 939 movdqa V(0),%xmm0; \ 940 movdqa V(3),%xmm3; \ 941 pand %xmm10,%xmm1; \ 942 pand %xmm10,%xmm2; \ 943 pand %xmm10,%xmm0; \ 944 pand %xmm10,%xmm3; \ 945 psrlq $2,%xmm1; \ 946 psrlq $3,%xmm0; \ 947 psrlq $1,%xmm2; \ 948 por %xmm0,%xmm1; \ 949 por %xmm2,%xmm3; \ 950 movdqa V(4),%xmm4; \ 951 movdqa V(5),%xmm5; \ 952 por %xmm1,%xmm3; \ 953 pand %xmm10,%xmm4; \ 954 pand %xmm10,%xmm5; \ 955 movdqa V(6),%xmm6; \ 956 movdqa V(7),%xmm0; \ 957 SHLB1(%xmm4); \ 958 pand %xmm10,%xmm6; \ 959 pand %xmm10,%xmm0; \ 960 psllq $2,%xmm5; \ 961 psllq $3,%xmm6; \ 962 psllq $4,%xmm0; \ 963 por %xmm4,%xmm5; \ 964 por %xmm6,%xmm3; \ 965 por %xmm5,%xmm0; \ 966 movdqa V(1),%xmm1; \ 967 por %xmm3,%xmm0; \ 968 movdqa V(2),%xmm2; \ 969 movdqa %xmm0,K(3); \ 970\ 971 movdqa V(0),%xmm0; \ 972 movdqa V(3),%xmm3; \ 973 pand %xmm11,%xmm1; \ 974 pand %xmm11,%xmm2; \ 975 pand %xmm11,%xmm0; \ 976 pand %xmm11,%xmm3; \ 977 psrlq $3,%xmm1; \ 978 psrlq $4,%xmm0; \ 979 psrlq $2,%xmm2; \ 980 psrlq $1,%xmm3; \ 981 por %xmm0,%xmm1; \ 982 por %xmm2,%xmm3; \ 983 movdqa V(4),%xmm4; \ 984 movdqa V(5),%xmm5; \ 985 por %xmm1,%xmm3; \ 986 pand %xmm11,%xmm4; \ 987 pand %xmm11,%xmm5; \ 988 movdqa V(6),%xmm6; \ 989 movdqa V(7),%xmm0; \ 990 pand %xmm11,%xmm6; \ 991 pand %xmm11,%xmm0; \ 992 SHLB1(%xmm5); \ 993 psllq $2,%xmm6; \ 994 psllq $3,%xmm0; \ 995 por %xmm4,%xmm5; \ 996 por %xmm6,%xmm3; \ 997 por %xmm5,%xmm0; \ 998 movdqa V(1),%xmm1; \ 999 por %xmm3,%xmm0; \ 1000 movdqa V(2),%xmm2; \ 1001 movdqa %xmm0,K(4); \ 1002\ 1003 movdqa V(0),%xmm0; \ 1004 movdqa V(3),%xmm3; \ 
1005 pand %xmm12,%xmm1; \ 1006 pand %xmm12,%xmm2; \ 1007 pand %xmm12,%xmm0; \ 1008 pand %xmm12,%xmm3; \ 1009 psrlq $4,%xmm1; \ 1010 psrlq $5,%xmm0; \ 1011 psrlq $3,%xmm2; \ 1012 psrlq $2,%xmm3; \ 1013 por %xmm0,%xmm1; \ 1014 por %xmm2,%xmm3; \ 1015 movdqa V(4),%xmm4; \ 1016 movdqa V(5),%xmm5; \ 1017 por %xmm1,%xmm3; \ 1018 pand %xmm12,%xmm4; \ 1019 pand %xmm12,%xmm5; \ 1020 movdqa V(6),%xmm6; \ 1021 movdqa V(7),%xmm0; \ 1022 psrlq $1,%xmm4; \ 1023 pand %xmm12,%xmm6; \ 1024 pand %xmm12,%xmm0; \ 1025 SHLB1(%xmm6); \ 1026 psllq $2,%xmm0; \ 1027 por %xmm4,%xmm5; \ 1028 por %xmm6,%xmm3; \ 1029 por %xmm5,%xmm0; \ 1030 movdqa V(1),%xmm1; \ 1031 por %xmm3,%xmm0; \ 1032 movdqa V(2),%xmm2; \ 1033 movdqa %xmm0,K(5); \ 1034\ 1035 movdqa V(0),%xmm0; \ 1036 movdqa V(3),%xmm3; \ 1037 pand %xmm13,%xmm1; \ 1038 pand %xmm13,%xmm2; \ 1039 pand %xmm13,%xmm0; \ 1040 pand %xmm13,%xmm3; \ 1041 psrlq $5,%xmm1; \ 1042 psrlq $6,%xmm0; \ 1043 psrlq $4,%xmm2; \ 1044 psrlq $3,%xmm3; \ 1045 por %xmm0,%xmm1; \ 1046 por %xmm2,%xmm3; \ 1047 movdqa V(4),%xmm4; \ 1048 movdqa V(5),%xmm5; \ 1049 por %xmm1,%xmm3; \ 1050 pand %xmm13,%xmm4; \ 1051 pand %xmm13,%xmm5; \ 1052 movdqa V(6),%xmm6; \ 1053 movdqa V(7),%xmm0; \ 1054 psrlq $2,%xmm4; \ 1055 pand %xmm13,%xmm6; \ 1056 pand %xmm13,%xmm0; \ 1057 psrlq $1,%xmm5; \ 1058 SHLB1(%xmm0); \ 1059 por %xmm4,%xmm5; \ 1060 por %xmm6,%xmm3; \ 1061 por %xmm5,%xmm0; \ 1062 por %xmm3,%xmm0; \ 1063 movdqa %xmm0,K(6) 1064 1065.text 1066 1067DO_ALIGN(6) 1068.globl DES_bs_init_asm 1069DES_bs_init_asm: 1070 pcmpeqd %xmm0,%xmm0 1071 movdqa %xmm0,pnot 1072 paddb %xmm0,%xmm0 1073 pxor pnot,%xmm0 1074 movdqa %xmm0,mask01 1075 SHLB1(%xmm0) 1076 movdqa %xmm0,mask02 1077 SHLB1(%xmm0) 1078 movdqa %xmm0,mask04 1079 SHLB1(%xmm0) 1080 movdqa %xmm0,mask08 1081 SHLB1(%xmm0) 1082 movdqa %xmm0,mask10 1083 SHLB1(%xmm0) 1084 movdqa %xmm0,mask20 1085 SHLB1(%xmm0) 1086 movdqa %xmm0,mask40 1087 SHLB1(%xmm0) 1088 movdqa %xmm0,mask80 1089 ret 1090 1091#define iterations %edi 1092#define 
rounds_and_swapped %eax 1093 1094DO_ALIGN(6) 1095.globl DES_bs_crypt 1096DES_bs_crypt: 1097 PROLOGUE 1098 cmpl $0,DES_bs_all_keys_changed(%rip) 1099 jz DES_bs_crypt_body 1100 pushq %rdi 1101 call DES_bs_finalize_keys 1102 popq %rdi 1103DES_bs_crypt_body: 1104 pxor zero,zero 1105 leaq DES_bs_all_KS_v(%rip),k_ptr 1106 DES_bs_clear_block 1107 movl $8,rounds_and_swapped 1108DES_bs_crypt_start: 1109 xor_E(0) 1110 S1(B(40), B(48), B(54), B(62)) 1111 xor_E(6) 1112 S2(B(44), B(59), B(33), B(49)) 1113 xor_E(12) 1114 S3(B(55), B(47), B(61), B(37)) 1115 xor_E(18) 1116 S4(B(57), B(51), B(41), B(32)) 1117 xor_E(24) 1118 S5(B(39), B(45), B(56), B(34)) 1119 xor_E(30) 1120 S6(B(35), B(60), B(42), B(50)) 1121 xor_E(36) 1122 S7(B(63), B(43), B(53), B(38)) 1123 xor_E(42) 1124 S8(B(36), B(58), B(46), B(52)) 1125 cmpl $0x100,rounds_and_swapped 1126 je DES_bs_crypt_next 1127DES_bs_crypt_swap: 1128 xor_E(48) 1129 S1(B(8), B(16), B(22), B(30)) 1130 xor_E(54) 1131 S2(B(12), B(27), B(1), B(17)) 1132 xor_E(60) 1133 S3(B(23), B(15), B(29), B(5)) 1134 xor_E(66) 1135 S4(B(25), B(19), B(9), B(0)) 1136 xor_E(72) 1137 S5(B(7), B(13), B(24), B(2)) 1138 xor_E(78) 1139 S6(B(3), B(28), B(10), B(18)) 1140 xor_E(84) 1141 S7(B(31), B(11), B(21), B(6)) 1142 xor_E(90) 1143 addq $nvec(96),k_ptr 1144 S8(B(4), B(26), B(14), B(20)) 1145 subl $1,rounds_and_swapped 1146 jnz DES_bs_crypt_start 1147 subq $nvec(0x300+48),k_ptr 1148 movl $0x108,rounds_and_swapped 1149 subl $1,iterations 1150 jnz DES_bs_crypt_swap 1151 EPILOGUE 1152 ret 1153DES_bs_crypt_next: 1154 subq $nvec(0x300-48),k_ptr 1155 movl $8,rounds_and_swapped 1156 subl $1,iterations 1157 jnz DES_bs_crypt_start 1158 EPILOGUE 1159 ret 1160 1161DO_ALIGN(6) 1162.globl DES_bs_crypt_25 1163DES_bs_crypt_25: 1164 PROLOGUE 1165 cmpl $0,DES_bs_all_keys_changed(%rip) 1166 jnz DES_bs_finalize_keys_25 1167DES_bs_crypt_25_body: 1168 pxor zero,zero 1169 leaq DES_bs_all_KS_v(%rip),k_ptr 1170 DES_bs_clear_block 1171 movl $8,rounds_and_swapped 1172 movl $25,iterations 
/*
 * Continuation of DES_bs_crypt_25 (the function entry and PROLOGUE are
 * earlier in this file, above this chunk).  Each pass below applies all
 * eight S-boxes once = one DES round; the "start" and "swap" passes
 * together make one round pair using key-schedule/E-expansion indices
 * 0..47 and 48..95 respectively.
 *
 * rounds_and_swapped (register alias defined earlier in the file) appears
 * to pack a round-pair counter in the low bits with 0x100 used as a flag
 * value: when it reaches exactly 0x100 the code takes the "next" path
 * (rewind key pointer, reload counter with 8) instead of falling through
 * -- presumably this implements the block swap handling of the
 * 25-iteration crypt(3) loop.  TODO confirm against the macro definitions
 * above this chunk.
 */
DES_bs_crypt_25_start:
	xor_E(0)
	S1(B(40), B(48), B(54), B(62))
	xor_E(6)
	S2(B(44), B(59), B(33), B(49))
	xor_B(7, 12, 8, 13, 9, 14, 10, 15, 11, 16, 12, 17)
	S3(B(55), B(47), B(61), B(37))
	xor_B(11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 23)
	S4(B(57), B(51), B(41), B(32))
	xor_E(24)
	S5(B(39), B(45), B(56), B(34))
	xor_E(30)
	S6(B(35), B(60), B(42), B(50))
	xor_B(23, 36, 24, 37, 25, 38, 26, 39, 27, 40, 28, 41)
	S7(B(63), B(43), B(53), B(38))
	xor_B(27, 42, 28, 43, 29, 44, 30, 45, 31, 46, 0, 47)
	S8(B(36), B(58), B(46), B(52))
/* 0x100 means: counter exhausted, but one more special pass needed */
	cmpl $0x100,rounds_and_swapped
	je DES_bs_crypt_25_next
/* Second round of the pair: same S-box order, key indices 48..95,
 * destination bits taken from the other half-block (B(0..31)). */
DES_bs_crypt_25_swap:
	xor_E(48)
	S1(B(8), B(16), B(22), B(30))
	xor_E(54)
	S2(B(12), B(27), B(1), B(17))
	xor_B(39, 60, 40, 61, 41, 62, 42, 63, 43, 64, 44, 65)
	S3(B(23), B(15), B(29), B(5))
	xor_B(43, 66, 44, 67, 45, 68, 46, 69, 47, 70, 48, 71)
	S4(B(25), B(19), B(9), B(0))
	xor_E(72)
	S5(B(7), B(13), B(24), B(2))
	xor_E(78)
	S6(B(3), B(28), B(10), B(18))
	xor_B(55, 84, 56, 85, 57, 86, 58, 87, 59, 88, 60, 89)
	S7(B(31), B(11), B(21), B(6))
	xor_B(59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 32, 95)
	S8(B(4), B(26), B(14), B(20))
/* Advance to the next round pair's 96 key-schedule entries */
	addq $nvec(96),k_ptr
	subl $1,rounds_and_swapped
	jnz DES_bs_crypt_25_start
/* All 16 rounds done: rewind the key pointer (16 * 48 = 0x300 entries,
 * plus 48 consumed by the special pass) and start the next of the 25
 * outer iterations with the swap flag set (0x108 = 0x100 | 8). */
	subq $nvec(0x300+48),k_ptr
	movl $0x108,rounds_and_swapped
	subl $1,iterations
	jnz DES_bs_crypt_25_swap
	EPILOGUE
	ret
/* Taken when rounds_and_swapped == 0x100: rewind and run 8 more round
 * pairs without the swap, then re-enter at _start. */
DES_bs_crypt_25_next:
	subq $nvec(0x300-48),k_ptr
	movl $8,rounds_and_swapped
	subl $1,iterations
	jmp DES_bs_crypt_25_start

/*
 * DES_bs_finalize_keys_25: tail-calls into DES_bs_crypt_25_body after key
 * finalization.  It pushes the address of DES_bs_crypt_25_body so that the
 * "ret" at the end of DES_bs_finalize_keys (below) transfers control there
 * instead of returning to the caller.
 */
DES_bs_finalize_keys_25:
	leaq DES_bs_crypt_25_body(%rip),tmp1
	pushq tmp1
/*
 * DES_bs_finalize_keys: convert the per-candidate raw keys
 * (DES_bs_all_xkeys) into the bitsliced K[] array, then expand K[] into
 * the full key schedule (DES_bs_all_KS_v) by dereferencing the pointer
 * table at DES_bs_all_KSp.
 * xmm7..xmm13 hold the bit-extraction masks for key bits 0..6 used by the
 * FINALIZE_NEXT_KEY_BITS_0_6 macro (defined above this chunk).
 */
DES_bs_finalize_keys:
	movdqa mask01,%xmm7
	movdqa mask02,%xmm8
	leaq DES_bs_all_xkeys(%rip),v_ptr
	movdqa mask04,%xmm9
	movdqa mask08,%xmm10
	leaq DES_bs_all_K(%rip),k_ptr
	movl $8,iterations		/* 8 key bytes */
	movdqa mask10,%xmm11
	movdqa mask20,%xmm12
	movl $0,DES_bs_all_keys_changed(%rip)	/* mark keys as now final */
	movdqa mask40,%xmm13
/* Per key byte: split bits 0..6 out into 7 bitsliced K vectors */
DES_bs_finalize_keys_main_loop:
	FINALIZE_NEXT_KEY_BITS_0_6
	addq $nvec(7),k_ptr		/* 7 K vectors produced per byte */
	addq $nvec(8),v_ptr
	subl $1,iterations
	jnz DES_bs_finalize_keys_main_loop
/* Key schedule expansion: DES_bs_all_KSp is a table of pointers into K[];
 * copy the pointed-to vectors into the flat DES_bs_all_KS_v array.
 * 0x60 * 8 = 768 = 16 rounds * 48 subkey bits.
 * NOTE(review): spelled "KSp" here but "KS_p" in the LM/plain code below
 * -- presumably two distinct symbols/macros; confirm in the declarations
 * above this chunk. */
	leaq DES_bs_all_KSp(%rip),k_ptr
	leaq DES_bs_all_KS_v(%rip),v_ptr
	movl $0x60,iterations
/* Copy 8 vectors per iteration, software-pipelined: load the next pair of
 * pointers while the previous pair's vectors are stored. */
DES_bs_finalize_keys_expand_loop:
	mov k(0),tmp1p
	mov k(1),tmp2p
	movdqa (tmp1),%xmm0
	movdqa (tmp2),%xmm1
	mov k(2),tmp1p
	mov k(3),tmp2p
	movdqa %xmm0,V(0)
	movdqa %xmm1,V(1)
	movdqa (tmp1),%xmm0
	movdqa (tmp2),%xmm1
	mov k(4),tmp1p
	mov k(5),tmp2p
	movdqa %xmm0,V(2)
	movdqa %xmm1,V(3)
	movdqa (tmp1),%xmm0
	movdqa (tmp2),%xmm1
	mov k(6),tmp1p
	mov k(7),tmp2p
	movdqa %xmm0,V(4)
	movdqa %xmm1,V(5)
	movdqa (tmp1),%xmm0
	movdqa (tmp2),%xmm1
	addq $nptr(8),k_ptr		/* 8 pointers consumed */
	movdqa %xmm0,V(6)
	movdqa %xmm1,V(7)
	addq $nvec(8),v_ptr		/* 8 vectors produced */
	subl $1,iterations
	jnz DES_bs_finalize_keys_expand_loop
	ret

#define ones %xmm1

#define rounds %eax

/*
 * DES_bs_crypt_LM(int *pcount, ...): LM-hash computation.
 * Reads *pcount into r8d at entry and returns that value in eax on exit
 * (via the xchg in the loop tail further below).
 * Unlike the crypt(3) path, LM keys use all 8 bits per byte, so after the
 * shared bits-0..6 finalization an explicit "bit 7" extraction pass is
 * done with mask80 (xmm14), and only 7 key bytes are processed.
 */
DO_ALIGN(6)
.globl DES_bs_crypt_LM
DES_bs_crypt_LM:
	PROLOGUE
	movl (ARG1),%r8d		/* r8d = *pcount, preserved to return */
	movdqa mask01,%xmm7
	movdqa mask02,%xmm8
	leaq DES_bs_all_xkeys(%rip),v_ptr
	movdqa mask04,%xmm9
	movdqa mask08,%xmm10
	leaq DES_bs_all_K(%rip),k_ptr
	movdqa mask10,%xmm11
	movdqa mask20,%xmm12
	movl $7,iterations		/* LM: 7 key bytes */
	movdqa mask40,%xmm13
	movdqa mask80,%xmm14		/* mask for key bit 7 */
DES_bs_finalize_keys_LM_loop:
	FINALIZE_NEXT_KEY_BITS_0_6
# bit 7
/* Gather bit 7 of each of the 8 candidate key bytes V(0..7) into one
 * bitsliced vector K(7): mask the bit out of each vector, shift each into
 * a distinct bit position (7..1, V(7) needs no shift after masking), and
 * OR them all together. */
	movdqa V(0),%xmm0
	movdqa V(1),%xmm1
	movdqa V(2),%xmm2
	movdqa V(3),%xmm3
	pand %xmm14,%xmm0
	pand %xmm14,%xmm1
	pand %xmm14,%xmm2
	pand %xmm14,%xmm3
	psrlq $7,%xmm0
	psrlq $6,%xmm1
	psrlq $5,%xmm2
	psrlq $4,%xmm3
	por %xmm0,%xmm1
	por %xmm2,%xmm3
	movdqa V(4),%xmm4
	movdqa V(5),%xmm5
	por %xmm1,%xmm3
	pand %xmm14,%xmm4
	pand %xmm14,%xmm5
	movdqa V(6),%xmm6
	movdqa V(7),%xmm0
	psrlq $3,%xmm4
	pand %xmm14,%xmm6
	pand %xmm14,%xmm0
	psrlq $2,%xmm5
	psrlq $1,%xmm6
	por %xmm4,%xmm5
	por %xmm6,%xmm3
	por %xmm5,%xmm0
	addq $nvec(8),v_ptr
	por %xmm3,%xmm0
	movdqa %xmm0,K(7)
	addq $nvec(8),k_ptr
	subl $1,iterations
	jnz DES_bs_finalize_keys_LM_loop

/* Initialize the bitsliced block B(0..63) to the fixed LM plaintext
 * constant: each B(i) is all-zeroes or all-ones.  Presumably this is the
 * bitsliced form of the LM magic string "KGS!@#$%" -- TODO confirm
 * against the C implementation. */
	pxor zero,zero
	pcmpeqd ones,ones
	leaq DES_bs_all_KS_p(%rip),k_ptr
	movdqa zero,B(0)
	movdqa zero,B(1)
	movdqa zero,B(2)
	movdqa zero,B(3)
	movdqa zero,B(4)
	movdqa zero,B(5)
	movdqa zero,B(6)
	movdqa zero,B(7)
	movdqa ones,B(8)
	movdqa ones,B(9)
	movdqa ones,B(10)
	movdqa zero,B(11)
	movdqa ones,B(12)
	movdqa zero,B(13)
	movdqa zero,B(14)
	movdqa zero,B(15)
	movdqa zero,B(16)
	movdqa zero,B(17)
	movdqa zero,B(18)
	movdqa zero,B(19)
	movdqa zero,B(20)
	movdqa zero,B(21)
	movdqa zero,B(22)
	movdqa ones,B(23)
	movdqa zero,B(24)
	movdqa zero,B(25)
	movdqa ones,B(26)
	movdqa zero,B(27)
	movdqa zero,B(28)
	movdqa ones,B(29)
	movdqa ones,B(30)
	movdqa ones,B(31)
	movdqa zero,B(32)
	movdqa zero,B(33)
	movdqa zero,B(34)
	movdqa ones,B(35)
	movdqa zero,B(36)
	movdqa ones,B(37)
	movdqa ones,B(38)
	movdqa ones,B(39)
	movdqa zero,B(40)
	movdqa zero,B(41)
	movdqa zero,B(42)
	movdqa zero,B(43)
	movdqa zero,B(44)
	movdqa ones,B(45)
	movdqa zero,B(46)
	movdqa zero,B(47)
	movdqa ones,B(48)
	movdqa ones,B(49)
	movdqa zero,B(50)
	movdqa zero,B(51)
	movdqa zero,B(52)
	movdqa zero,B(53)
	movdqa ones,B(54)
	movdqa zero,B(55)
	movdqa ones,B(56)
	movdqa zero,B(57)
	movdqa ones,B(58)
	movdqa zero,B(59)
	movdqa ones,B(60)
	movdqa ones,B(61)
	movdqa ones,B(62)
	movdqa ones,B(63)
	movl $8,rounds			/* 8 passes x 2 rounds = 16 DES rounds */
/*
 * Main loop of DES_bs_crypt_LM (function head is just above): each pass
 * applies the 8 S-boxes twice (two DES rounds) using key-schedule-pointer
 * indices 0..47 and 48..95; 8 passes = the full 16 rounds of one DES
 * encryption.  The *_special variants at the round-pair boundaries take
 * one fewer index pair (their difference is defined with the macros above
 * this chunk).
 */
DES_bs_crypt_LM_loop:
	xor_B_KS_p_special(31, 0, 0, 1, 1, 2, 2, 3, 3, 4, 5)
	S1(B(40), B(48), B(54), B(62))
	xor_B_KS_p(3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11)
	S2(B(44), B(59), B(33), B(49))
	xor_B_KS_p(7, 12, 8, 13, 9, 14, 10, 15, 11, 16, 12, 17)
	S3(B(55), B(47), B(61), B(37))
	xor_B_KS_p(11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 23)
	S4(B(57), B(51), B(41), B(32))
	xor_B_KS_p(15, 24, 16, 25, 17, 26, 18, 27, 19, 28, 20, 29)
	S5(B(39), B(45), B(56), B(34))
	xor_B_KS_p(19, 30, 20, 31, 21, 32, 22, 33, 23, 34, 24, 35)
	S6(B(35), B(60), B(42), B(50))
	xor_B_KS_p(23, 36, 24, 37, 25, 38, 26, 39, 27, 40, 28, 41)
	S7(B(63), B(43), B(53), B(38))
	xor_B_KS_p(27, 42, 28, 43, 29, 44, 30, 45, 31, 46, 0, 47)
	S8(B(36), B(58), B(46), B(52))
/* Second round of the pair: key indices 48..95, other half-block */
	xor_B_KS_p_special(63, 48, 32, 49, 33, 50, 34, 51, 35, 52, 53)
	S1(B(8), B(16), B(22), B(30))
	xor_B_KS_p(35, 54, 36, 55, 37, 56, 38, 57, 39, 58, 40, 59)
	S2(B(12), B(27), B(1), B(17))
	xor_B_KS_p(39, 60, 40, 61, 41, 62, 42, 63, 43, 64, 44, 65)
	S3(B(23), B(15), B(29), B(5))
	xor_B_KS_p(43, 66, 44, 67, 45, 68, 46, 69, 47, 70, 48, 71)
	S4(B(25), B(19), B(9), B(0))
	xor_B_KS_p(47, 72, 48, 73, 49, 74, 50, 75, 51, 76, 52, 77)
	S5(B(7), B(13), B(24), B(2))
	xor_B_KS_p(51, 78, 52, 79, 53, 80, 54, 81, 55, 82, 56, 83)
	S6(B(3), B(28), B(10), B(18))
	xor_B_KS_p(55, 84, 56, 85, 57, 86, 58, 87, 59, 88, 60, 89)
	S7(B(31), B(11), B(21), B(6))
	xor_B_KS_p(59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 32, 95)
	addq $nptr(96),k_ptr		/* 96 key-schedule pointers per pass */
	S8(B(4), B(26), B(14), B(20))
	subl $1,rounds
	jnz DES_bs_crypt_LM_loop
	xchgq %r8,%rax			/* return the count read at entry */
	EPILOGUE
	ret

#define rounds %eax

/*
 * DES_bs_crypt_plain: single DES encryption of an externally supplied
 * plaintext (no salt).  Finalizes keys (bits 0..6 only, 8 key bytes),
 * copies the bitsliced plaintext P(0..63) into the block B(0..63), then
 * runs 8 passes of 2 rounds each = 16 DES rounds, exactly like the LM
 * loop above but with no *_special steps and no count to return.
 */
DO_ALIGN(6)
.globl DES_bs_crypt_plain
DES_bs_crypt_plain:
	PROLOGUE
	movdqa mask01,%xmm7
	movdqa mask02,%xmm8
	leaq DES_bs_all_xkeys(%rip),v_ptr
	movdqa mask04,%xmm9
	movdqa mask08,%xmm10
	leaq DES_bs_all_K(%rip),k_ptr
	movdqa mask10,%xmm11
	movdqa mask20,%xmm12
	movl $8,iterations		/* 8 key bytes, 7 bits each */
	movdqa mask40,%xmm13
DES_bs_finalize_keys_plain_loop:
	FINALIZE_NEXT_KEY_BITS_0_6
	addq $nvec(7),k_ptr
	addq $nvec(8),v_ptr
	subl $1,iterations
	jnz DES_bs_finalize_keys_plain_loop
	leaq DES_bs_all_KS_p(%rip),k_ptr
	leaq DES_bs_all_KS_v(%rip),v_ptr

/* Load the bitsliced plaintext: B(i) = P(i) for i = 0..63 */
	movdqa P(0),%xmm4
	movdqa %xmm4,B(0)
	movdqa P(1),%xmm4
	movdqa %xmm4,B(1)
	movdqa P(2),%xmm4
	movdqa %xmm4,B(2)
	movdqa P(3),%xmm4
	movdqa %xmm4,B(3)
	movdqa P(4),%xmm4
	movdqa %xmm4,B(4)
	movdqa P(5),%xmm4
	movdqa %xmm4,B(5)
	movdqa P(6),%xmm4
	movdqa %xmm4,B(6)
	movdqa P(7),%xmm4
	movdqa %xmm4,B(7)
	movdqa P(8),%xmm4
	movdqa %xmm4,B(8)
	movdqa P(9),%xmm4
	movdqa %xmm4,B(9)
	movdqa P(10),%xmm4
	movdqa %xmm4,B(10)
	movdqa P(11),%xmm4
	movdqa %xmm4,B(11)
	movdqa P(12),%xmm4
	movdqa %xmm4,B(12)
	movdqa P(13),%xmm4
	movdqa %xmm4,B(13)
	movdqa P(14),%xmm4
	movdqa %xmm4,B(14)
	movdqa P(15),%xmm4
	movdqa %xmm4,B(15)
	movdqa P(16),%xmm4
	movdqa %xmm4,B(16)
	movdqa P(17),%xmm4
	movdqa %xmm4,B(17)
	movdqa P(18),%xmm4
	movdqa %xmm4,B(18)
	movdqa P(19),%xmm4
	movdqa %xmm4,B(19)
	movdqa P(20),%xmm4
	movdqa %xmm4,B(20)
	movdqa P(21),%xmm4
	movdqa %xmm4,B(21)
	movdqa P(22),%xmm4
	movdqa %xmm4,B(22)
	movdqa P(23),%xmm4
	movdqa %xmm4,B(23)
	movdqa P(24),%xmm4
	movdqa %xmm4,B(24)
	movdqa P(25),%xmm4
	movdqa %xmm4,B(25)
	movdqa P(26),%xmm4
	movdqa %xmm4,B(26)
	movdqa P(27),%xmm4
	movdqa %xmm4,B(27)
	movdqa P(28),%xmm4
	movdqa %xmm4,B(28)
	movdqa P(29),%xmm4
	movdqa %xmm4,B(29)
	movdqa P(30),%xmm4
	movdqa %xmm4,B(30)
	movdqa P(31),%xmm4
	movdqa %xmm4,B(31)
	movdqa P(32),%xmm4
	movdqa %xmm4,B(32)
	movdqa P(33),%xmm4
	movdqa %xmm4,B(33)
	movdqa P(34),%xmm4
	movdqa %xmm4,B(34)
	movdqa P(35),%xmm4
	movdqa %xmm4,B(35)
	movdqa P(36),%xmm4
	movdqa %xmm4,B(36)
	movdqa P(37),%xmm4
	movdqa %xmm4,B(37)
	movdqa P(38),%xmm4
	movdqa %xmm4,B(38)
	movdqa P(39),%xmm4
	movdqa %xmm4,B(39)
	movdqa P(40),%xmm4
	movdqa %xmm4,B(40)
	movdqa P(41),%xmm4
	movdqa %xmm4,B(41)
	movdqa P(42),%xmm4
	movdqa %xmm4,B(42)
	movdqa P(43),%xmm4
	movdqa %xmm4,B(43)
	movdqa P(44),%xmm4
	movdqa %xmm4,B(44)
	movdqa P(45),%xmm4
	movdqa %xmm4,B(45)
	movdqa P(46),%xmm4
	movdqa %xmm4,B(46)
	movdqa P(47),%xmm4
	movdqa %xmm4,B(47)
	movdqa P(48),%xmm4
	movdqa %xmm4,B(48)
	movdqa P(49),%xmm4
	movdqa %xmm4,B(49)
	movdqa P(50),%xmm4
	movdqa %xmm4,B(50)
	movdqa P(51),%xmm4
	movdqa %xmm4,B(51)
	movdqa P(52),%xmm4
	movdqa %xmm4,B(52)
	movdqa P(53),%xmm4
	movdqa %xmm4,B(53)
	movdqa P(54),%xmm4
	movdqa %xmm4,B(54)
	movdqa P(55),%xmm4
	movdqa %xmm4,B(55)
	movdqa P(56),%xmm4
	movdqa %xmm4,B(56)
	movdqa P(57),%xmm4
	movdqa %xmm4,B(57)
	movdqa P(58),%xmm4
	movdqa %xmm4,B(58)
	movdqa P(59),%xmm4
	movdqa %xmm4,B(59)
	movdqa P(60),%xmm4
	movdqa %xmm4,B(60)
	movdqa P(61),%xmm4
	movdqa %xmm4,B(61)
	movdqa P(62),%xmm4
	movdqa %xmm4,B(62)
	movdqa P(63),%xmm4
	movdqa %xmm4,B(63)
	movl $8,rounds			/* 8 passes x 2 rounds = 16 DES rounds */
DES_bs_crypt_plain_loop:
	xor_B_KS_p(31, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5)
	S1(B(40), B(48), B(54), B(62))
	xor_B_KS_p(3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11)
	S2(B(44), B(59), B(33), B(49))
	xor_B_KS_p(7, 12, 8, 13, 9, 14, 10, 15, 11, 16, 12, 17)
	S3(B(55), B(47), B(61), B(37))
	xor_B_KS_p(11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 23)
	S4(B(57), B(51), B(41), B(32))
	xor_B_KS_p(15, 24, 16, 25, 17, 26, 18, 27, 19, 28, 20, 29)
	S5(B(39), B(45), B(56), B(34))
	xor_B_KS_p(19, 30, 20, 31, 21, 32, 22, 33, 23, 34, 24, 35)
	S6(B(35), B(60), B(42), B(50))
	xor_B_KS_p(23, 36, 24, 37, 25, 38, 26, 39, 27, 40, 28, 41)
	S7(B(63), B(43), B(53), B(38))
	xor_B_KS_p(27, 42, 28, 43, 29, 44, 30, 45, 31, 46, 0, 47)
	S8(B(36), B(58), B(46), B(52))
/* Second round of the pair: key indices 48..95, other half-block */
	xor_B_KS_p(63, 48, 32, 49, 33, 50, 34, 51, 35, 52, 36, 53)
	S1(B(8), B(16), B(22), B(30))
	xor_B_KS_p(35, 54, 36, 55, 37, 56, 38, 57, 39, 58, 40, 59)
	S2(B(12), B(27), B(1), B(17))
	xor_B_KS_p(39, 60, 40, 61, 41, 62, 42, 63, 43, 64, 44, 65)
	S3(B(23), B(15), B(29), B(5))
	xor_B_KS_p(43, 66, 44, 67, 45, 68, 46, 69, 47, 70, 48, 71)
	S4(B(25), B(19), B(9), B(0))
	xor_B_KS_p(47, 72, 48, 73, 49, 74, 50, 75, 51, 76, 52, 77)
	S5(B(7), B(13), B(24), B(2))
	xor_B_KS_p(51, 78, 52, 79, 53, 80, 54, 81, 55, 82, 56, 83)
	S6(B(3), B(28), B(10), B(18))
	xor_B_KS_p(55, 84, 56, 85, 57, 86, 58, 87, 59, 88, 60, 89)
	S7(B(31), B(11), B(21), B(6))
	xor_B_KS_p(59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 32, 95)
	addq $nptr(96),k_ptr		/* 96 key-schedule pointers per pass */
	S8(B(4), B(26), B(14), B(20))
	subl $1,rounds
	jnz DES_bs_crypt_plain_loop
	EPILOGUE
	ret
#endif

#if CPU_REQ
/*
 * CPU detection.
 */

/*
 * CPUID leaf 1 ECX feature masks.  Where a requirement implies lower-tier
 * features, the mask combines all of their bits so a single and/cmp pair
 * verifies every one of them at once.
 */
/* Leaf 1 */
#define CF_SSSE3 $0x00000200 /* SSSE3 */
#define CF_SSE4_1 $0x00080200 /* SSE4.1 + SSSE3 */
#define CF_SSE4_2 $0x00180200 /* SSE4.2 + SSE4.1 + SSSE3 */
#define CF_AVX $0x1C000000 /* AVX + XSAVE + OSXSAVE */

/* Extended features (CPUID leaf 0x80000001 ECX) */
#define CX_XOP $0x00000800

/* Leaf 7 (sub-leaf 0) EBX feature masks */
#define C7_AVX2 $0x00000020 /* AVX2 */
#define C7_AVX512F $0x00010000
#define C7_AVX512BW $0x40010000 /* AVX512BW + AVX512F */

.text

/* Mach-O and some other targets prepend an underscore to C symbols */
#ifdef UNDERSCORES
#define CPU_req_name _CPU_req_name
#define CPU_detect _CPU_detect
#endif
/* Human-readable name of the CPU feature set this binary was built for
 * (CPU_NAME is supplied by the build) */
.globl CPU_req_name
CPU_req_name:
	.asciz CPU_NAME

/*
 * int CPU_detect(void)
 * Returns 1 in eax if the running CPU (and OS, for AVX state) supports
 * every feature this binary requires (per the CPU_REQ_* build macros),
 * 0 otherwise.  Only the checks required by the build are assembled in.
 * rbx is saved/restored because cpuid clobbers it and it is callee-saved
 * in both ABIs described at the top of this file.
 */
.globl CPU_detect
CPU_detect:
	pushq %rbx

/* First, leaf 1 checks */
	movl $1,%eax
	cpuid
#if CPU_REQ_AVX2 || CPU_REQ_AVX || CPU_REQ_XOP
	andl CF_AVX,%ecx
	cmpl CF_AVX,%ecx		/* all of AVX+XSAVE+OSXSAVE present? */
	jne CPU_detect_fail
#elif CPU_REQ_SSE4_2
	andl CF_SSE4_2,%ecx
	cmpl CF_SSE4_2,%ecx
	jne CPU_detect_fail
#elif CPU_REQ_SSE4_1
	andl CF_SSE4_1,%ecx
	cmpl CF_SSE4_1,%ecx
	jne CPU_detect_fail
#elif CPU_REQ_SSSE3
	andl CF_SSSE3,%ecx
	cmpl CF_SSSE3,%ecx
	jne CPU_detect_fail
#endif

#if CPU_REQ_AVX2 || CPU_REQ_AVX || CPU_REQ_XOP
/* Check that xmm and ymm state is preserved on a context switch:
 * XGETBV with ecx=0 reads XCR0; bits 1 and 2 (0x6) are the OS-enabled
 * SSE and AVX state components. */
	xorl %ecx,%ecx
	xgetbv
	andb $0x6,%al
	cmpb $0x6,%al
	jne CPU_detect_fail
#endif

/* Extended feature tests (if required) */
#if CPU_REQ_XOP
	movl $0x80000000,%eax
	cpuid				/* eax = highest extended leaf */
	movl $0x80000001,%edx
	cmpl %edx,%eax
	jl CPU_detect_fail		/* leaf 0x80000001 not available */
	xchgl %edx,%eax
	cpuid
	testl CX_XOP,%ecx
	jz CPU_detect_fail
#endif

/* Finally, leaf 7 tests (if required) */
#if CPU_REQ_AVX2 || CPU_REQ_AVX512F || CPU_REQ_AVX512BW
	xorl %eax,%eax
	cpuid				/* eax = highest basic leaf */
	movl $7,%edx
	cmpl %edx,%eax
	jl CPU_detect_fail		/* leaf 7 not available */
	xchgl %edx,%eax
	xorl %ecx,%ecx			/* sub-leaf 0 */
	cpuid
#if CPU_REQ_AVX512BW
	andl C7_AVX512BW,%ebx
	cmpl C7_AVX512BW,%ebx
	jne CPU_detect_fail
#elif CPU_REQ_AVX512F
	andl C7_AVX512F,%ebx
	cmpl C7_AVX512F,%ebx
	jne CPU_detect_fail
#elif CPU_REQ_AVX2
	andl C7_AVX2,%ebx
	cmpl C7_AVX2,%ebx
	jne CPU_detect_fail
#endif
#endif

/* If we reached here all is fine and we return 1 */
	movl $1,%eax
	popq %rbx
	ret

/* Return 0 */
CPU_detect_fail:
	xorl %eax,%eax
	popq %rbx
	ret
#endif

/* Mark the stack non-executable on Linux/ELF */
#if defined(__ELF__) && defined(__linux__)
.section .note.GNU-stack,"",@progbits
#endif