1/*
2 * This file contains the core of a bitslice DES implementation for x86-64/SSE2.
3 * It is part of John the Ripper password cracker,
4 * Copyright (c) 2000-2001,2005,2006,2008,2011,2012,2015,2019 by Solar Designer
5 * Copyright (c) 2015,2017 by magnum
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted.  (This is a heavily cut-down "BSD license".)
8 *
9 * Gate counts per S-box: 49 44 46 33 48 46 46 41
10 * Average: 44.125
11 *
12 * The Boolean expressions corresponding to DES S-boxes have been generated
13 * by Roman Rusakov <roman_rus at openwall.com> for use in Openwall's
14 * John the Ripper password cracker: http://www.openwall.com/john/
15 * Being mathematical formulas, they are not copyrighted and are free for reuse
16 * by anyone.
17 *
18 * The x86-64/SSE2 code for the S-boxes was generated by Solar Designer using a
19 * Perl script.  The script performed various optimizations, including the
20 * x86-64 specific optimization of preferring registers 0-7 over 8-15 to reduce
21 * the number of instruction prefixes (and thus code size).  The instruction
22 * scheduling has been tuned for Core 2.
23 *
24 * The effort has been sponsored by Rapid7: http://www.rapid7.com
25 *
26 * Addition of single DES encryption with no salt by Deepika Dutta Mishra
27 * <dipikadutta at gmail.com> in 2013, no rights reserved.
28 *
29 * ...with changes in the jumbo patch, by Alain Espinosa (starting with a
30 * comment further down this file).
31 *
32 * Various tweaks & fixes and support for Win64 and Linux-X32 ABIs as well
33 * as CPU detection additions by magnum 2010-2015.
34 */
35
36#include "arch.h"
37
#if defined (_WIN64) || defined (__CYGWIN64__)
/*
 * MS uses a different x64 calling convention than everyone else:
 * Arguments: RCX, RDX, R8, R9 then stack right-to-left.
 * Volatile: RAX, RCX, RDX, R8, R9, R10, R11, XMM0:XMM5
 * Non-volatile: RBX, RBP, RSI, RDI, R12:R15, XMM6:XMM15
 * Return: RAX.
 *
 * The rest of this file is written against the System V register usage,
 * so PROLOGUE/EPILOGUE bridge the two ABIs: they save/restore the
 * non-volatile XMM6:XMM15 (freely clobbered below) and RDI/RSI, and move
 * the first two arguments from RCX/RDX into RDI/RSI.  The extra 8 bytes
 * in the frame size keep RSP 16-byte aligned (on function entry
 * RSP % 16 == 8 after the call pushed the return address), which the
 * aligned movapd stores below require.
 */
#define ARG1				%rdi
#define PROLOGUE \
	subq $(8+10*16), %rsp; \
	movapd %xmm6, 0*16(%rsp); \
	movapd %xmm7, 1*16(%rsp); \
	movapd %xmm8, 2*16(%rsp); \
	movapd %xmm9, 3*16(%rsp); \
	movapd %xmm10, 4*16(%rsp); \
	movapd %xmm11, 5*16(%rsp); \
	movapd %xmm12, 6*16(%rsp); \
	movapd %xmm13, 7*16(%rsp); \
	movapd %xmm14, 8*16(%rsp); \
	movapd %xmm15, 9*16(%rsp); \
	push %rdi; \
	push %rsi; \
	movq %rcx, %rdi; \
	movq %rdx, %rsi

/*
 * Note: the two pops below bring RSP back to the XMM save area before the
 * movapd loads, so the 0*16..9*16 offsets match those in PROLOGUE.
 */
#define EPILOGUE \
	pop %rsi; \
	pop %rdi; \
	movapd 0*16(%rsp), %xmm6; \
	movapd 1*16(%rsp), %xmm7; \
	movapd 2*16(%rsp), %xmm8; \
	movapd 3*16(%rsp), %xmm9; \
	movapd 4*16(%rsp), %xmm10; \
	movapd 5*16(%rsp), %xmm11; \
	movapd 6*16(%rsp), %xmm12; \
	movapd 7*16(%rsp), %xmm13; \
	movapd 8*16(%rsp), %xmm14; \
	movapd 9*16(%rsp), %xmm15; \
	addq $(8+10*16), %rsp
#else
/*
 * System V AMD64 ABI (followed by everybody else including linux-X32):
 * Arguments: RDI, RSI, RDX, RCX, R8, R9 then stack right-to-left.
 * Volatile: RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, XMM0:XMM15
 * Non-volatile: RBX, RBP, R12:R15
 * Return: RAX.
 *
 * Nothing to adapt here: all XMM registers are volatile, so no
 * save/restore is needed.
 */
#define ARG1				%rdi
#define PROLOGUE
#define EPILOGUE
#endif

/*
 * Throughout this file ARG1 is *pcount so it's 32-bit for X32, although
 * using (%rdi) works fine too without any warnings.
 */
#ifdef __ILP32__
#undef ARG1
#define ARG1				%edi
#endif

/*
 * DO_ALIGN(log) aligns to 2^log bytes.  When ALIGN_LOG is defined the
 * assembler's .align directive takes a log2 argument; otherwise it takes
 * a byte count, so compute 1 << log.
 */
#ifdef ALIGN_LOG
#define DO_ALIGN(log)			.align log
#else
#define DO_ALIGN(log)			.align 1 << log
#endif
105
#if DES_BS_ASM

/*
 * On platforms whose C symbols carry a leading underscore, mangle our
 * exported names to match so the C code can link against them.
 */
#ifdef UNDERSCORES
#define DES_bs_all			_DES_bs_all
#define DES_bs_init_asm			_DES_bs_init_asm
#define DES_bs_crypt			_DES_bs_crypt
#define DES_bs_crypt_25			_DES_bs_crypt_25
#define DES_bs_crypt_LM			_DES_bs_crypt_LM
#define DES_bs_crypt_plain		_DES_bs_crypt_plain
#define DES_bs_P			_DES_bs_P
#endif

/* DO_SPACE(size) reserves size bytes of (zero-initialized) storage. */
#ifdef __sun
/* Sun's assembler doesn't recognize .space */
#define DO_SPACE(size)			.zero size
#else
/* Mac OS X assembler doesn't recognize .zero */
#define DO_SPACE(size)			.space size
#endif

/* Sun's assembler can't multiply, but at least it can add... */
/*
 * nptr(n) scales by the pointer size (4 under ILP32/X32, 8 otherwise);
 * nvec(n) scales by 16, the size of one 128-bit SSE2 vector.
 */
#ifdef __ILP32__
#define nptr(n)				n+n+n+n
#else
#define nptr(n)				n+n+n+n+n+n+n+n
#endif
#define nvec(n)				n+n+n+n+n+n+n+n+n+n+n+n+n+n+n+n

#ifdef BSD
.data
#else
.bss
#endif

/*
 * Layout of the DES_bs_all context.  NOTE(review): this presumably must
 * mirror the C-side DES_bs_all structure field-for-field -- confirm the
 * offsets against the C declaration before changing anything here.
 * KSp:  0x300 key-schedule pointers; KS_p/KS_v: 0x300 expanded key
 * schedule entries (pointers and vectors share the same space);
 * E: E-expansion pointers; K: 56 key bit vectors; B: 64 bitslice data
 * vectors; tmp: scratch vectors (tmp slot 0 = pnot, slots 8-15 = the
 * bit masks, see DES_bs_init_asm); xkeys/pxkeys: raw key material and
 * pointers to it; then the keys_changed flag, salt, and Ens pointers.
 */
.globl DES_bs_all
DO_ALIGN(6)
DES_bs_all:
DES_bs_all_KSp:
DO_SPACE(nptr(0x300))
DES_bs_all_KS_p:
DES_bs_all_KS_v:
DO_SPACE(nvec(0x300))
DES_bs_all_E:
DO_SPACE(nptr(96))
DES_bs_all_K:
DO_SPACE(nvec(56))
DES_bs_all_B:
DO_SPACE(nvec(64))
DES_bs_all_tmp:
DO_SPACE(nvec(16))
DES_bs_all_xkeys:
DO_SPACE(nvec(64))
DES_bs_all_pxkeys:
DO_SPACE(nptr(128))
DES_bs_all_keys_changed:
DO_SPACE(4)
DES_bs_all_salt:
DO_SPACE(4)
DES_bs_all_Ens:
DO_SPACE(nptr(48))

.globl DES_bs_P
DO_ALIGN(6)
DES_bs_P:
DO_SPACE(nvec(64))

/* RIP-relative accessors for the arrays above (i = element index). */
#define E(i)				DES_bs_all_E+nptr(i)(%rip)
#define B(i)				DES_bs_all_B+nvec(i)(%rip)
#define tmp_at(i)			DES_bs_all_tmp+nvec(i)(%rip)
#define P(i)				DES_bs_P+nvec(i)(%rip)

/* All-ones vector constant (set up by DES_bs_init_asm); pxor with it == NOT. */
#define pnot				tmp_at(0)

/* The six S-box inputs always arrive in %xmm0..%xmm5. */
#define a1				%xmm0
#define a2				%xmm1
#define a3				%xmm2
#define a4				%xmm3
#define a5				%xmm4
#define a6				%xmm5
185
/*
 * S-box 1 (bitslice): inputs a1..a6 = %xmm0..%xmm5 (clobbered); the four
 * results are XOR-accumulated into the out1..out4 memory operands.
 * Uses %xmm6..%xmm15 as scratch and reads the pnot (all-ones) constant.
 * Machine-generated from Roman Rusakov's expressions (see file header);
 * the instruction order is tuned for Core 2 -- do not reorder.
 */
#define S1(out1, out2, out3, out4) \
	movdqa %xmm4,%xmm7; \
	movdqa %xmm5,%xmm10; \
	pandn %xmm0,%xmm4; \
	movdqa %xmm2,%xmm13; \
	movdqa %xmm4,%xmm14; \
	por %xmm2,%xmm10; \
	movdqa %xmm5,%xmm11; \
	pxor %xmm0,%xmm13; \
	pxor %xmm7,%xmm11; \
	pxor %xmm3,%xmm14; \
	movdqa %xmm13,%xmm12; \
	movdqa %xmm11,%xmm15; \
	pand %xmm10,%xmm13; \
	movdqa %xmm14,%xmm9; \
	movdqa %xmm13,%xmm8; \
	pxor %xmm2,%xmm15; \
	pxor %xmm3,%xmm8; \
	pandn %xmm11,%xmm12; \
	pandn %xmm8,%xmm9; \
	por %xmm5,%xmm13; \
	por %xmm0,%xmm5; \
	pandn %xmm7,%xmm8; \
	pandn %xmm14,%xmm15; \
	movdqa %xmm5,%xmm6; \
	pxor %xmm15,%xmm13; \
	movdqa %xmm9,%xmm15; \
	por %xmm13,%xmm6; \
	pandn %xmm3,%xmm5; \
	movdqa %xmm8,%xmm3; \
	pandn %xmm13,%xmm15; \
	pxor %xmm6,%xmm8; \
	pxor %xmm3,%xmm5; \
	pand %xmm10,%xmm13; \
	pandn %xmm2,%xmm4; \
	movdqa %xmm6,%xmm2; \
	pxor %xmm10,%xmm6; \
	pxor %xmm14,%xmm2; \
	pandn %xmm2,%xmm4; \
	movdqa %xmm4,%xmm2; \
	pxor pnot,%xmm2; \
	pxor %xmm11,%xmm4; \
	pxor %xmm2,%xmm13; \
	movdqa %xmm1,%xmm2; \
	por %xmm3,%xmm4; \
	pandn %xmm8,%xmm2; \
	por %xmm7,%xmm14; \
	pxor %xmm10,%xmm4; \
	por %xmm1,%xmm9; \
	pxor %xmm13,%xmm2; \
	pxor %xmm0,%xmm4; \
	movdqa %xmm1,%xmm0; \
	pxor %xmm4,%xmm13; \
	pxor %xmm13,%xmm9; \
	por %xmm12,%xmm5; \
	pxor out1,%xmm9; \
	por %xmm5,%xmm6; \
	por %xmm11,%xmm13; \
	pxor %xmm4,%xmm6; \
	movdqa %xmm9,out1; \
	por %xmm15,%xmm0; \
	pxor %xmm6,%xmm13; \
	pxor out3,%xmm2; \
	pxor out2,%xmm13; \
	pand %xmm15,%xmm4; \
	pandn %xmm14,%xmm6; \
	pxor %xmm0,%xmm13; \
	pxor %xmm6,%xmm4; \
	movdqa %xmm2,out3; \
	por %xmm1,%xmm4; \
	pxor %xmm5,%xmm4; \
	movdqa %xmm13,out2; \
	pxor out4,%xmm4; \
	movdqa %xmm4,out4
260
/*
 * S-box 2 (bitslice): inputs a1..a6 = %xmm0..%xmm5 (clobbered); results
 * XOR-accumulated into out1..out4.  Scratch: %xmm6..%xmm14; reads pnot.
 * Machine-generated, Core 2-scheduled -- do not reorder.
 */
#define S2(out1, out2, out3, out4) \
	movdqa %xmm4,%xmm13; \
	movdqa %xmm5,%xmm6; \
	pxor %xmm1,%xmm13; \
	movdqa %xmm5,%xmm8; \
	pandn %xmm0,%xmm6; \
	movdqa %xmm13,%xmm7; \
	pandn %xmm4,%xmm6; \
	movdqa %xmm5,%xmm9; \
	movdqa %xmm6,%xmm14; \
	pandn %xmm13,%xmm8; \
	pand %xmm0,%xmm7; \
	pxor pnot,%xmm0; \
	por %xmm1,%xmm14; \
	movdqa %xmm8,%xmm12; \
	pxor %xmm4,%xmm7; \
	pand %xmm2,%xmm9; \
	pxor %xmm5,%xmm13; \
	pxor %xmm8,%xmm6; \
	movdqa %xmm9,%xmm10; \
	pand %xmm14,%xmm6; \
	pandn %xmm6,%xmm10; \
	pand %xmm2,%xmm6; \
	pandn %xmm7,%xmm12; \
	pxor %xmm6,%xmm0; \
	movdqa %xmm9,%xmm5; \
	pandn %xmm3,%xmm10; \
	pandn %xmm13,%xmm5; \
	movdqa %xmm5,%xmm11; \
	pandn %xmm1,%xmm5; \
	pxor %xmm0,%xmm11; \
	pxor %xmm13,%xmm2; \
	por %xmm3,%xmm12; \
	pxor %xmm5,%xmm7; \
	movdqa %xmm7,%xmm1; \
	pxor out2,%xmm10; \
	pandn %xmm0,%xmm1; \
	movdqa %xmm3,%xmm0; \
	pxor %xmm2,%xmm1; \
	pandn %xmm14,%xmm0; \
	pxor %xmm11,%xmm14; \
	pxor %xmm5,%xmm6; \
	pxor %xmm1,%xmm0; \
	por %xmm6,%xmm2; \
	por %xmm14,%xmm9; \
	pxor %xmm1,%xmm6; \
	pxor %xmm11,%xmm10; \
	pand %xmm9,%xmm6; \
	pxor out3,%xmm2; \
	pxor %xmm4,%xmm6; \
	pandn %xmm6,%xmm8; \
	pxor %xmm9,%xmm2; \
	pxor %xmm11,%xmm8; \
	por %xmm8,%xmm3; \
	por %xmm13,%xmm14; \
	pxor %xmm3,%xmm2; \
	pandn %xmm8,%xmm7; \
	movdqa %xmm2,out3; \
	pxor out4,%xmm7; \
	movdqa %xmm10,out2; \
	pxor %xmm14,%xmm7; \
	pxor out1,%xmm0; \
	pxor %xmm12,%xmm7; \
	movdqa %xmm0,out1; \
	movdqa %xmm7,out4
326
/*
 * S-box 3 (bitslice): inputs a1..a6 = %xmm0..%xmm5 (clobbered); results
 * XOR-accumulated into out1..out4.  Scratch: %xmm6..%xmm15; reads pnot.
 * Machine-generated, Core 2-scheduled -- do not reorder.
 */
#define S3(out1, out2, out3, out4) \
	movdqa %xmm1,%xmm6; \
	movdqa %xmm5,%xmm13; \
	pandn %xmm0,%xmm6; \
	movdqa %xmm5,%xmm8; \
	pxor %xmm2,%xmm13; \
	movdqa %xmm0,%xmm11; \
	por %xmm13,%xmm6; \
	movdqa %xmm13,%xmm9; \
	pxor %xmm3,%xmm8; \
	movdqa %xmm3,%xmm15; \
	pandn %xmm8,%xmm11; \
	pxor %xmm1,%xmm9; \
	movdqa %xmm11,%xmm10; \
	movdqa %xmm4,%xmm12; \
	movdqa %xmm5,%xmm14; \
	pxor %xmm6,%xmm10; \
	pandn %xmm9,%xmm14; \
	movdqa %xmm10,%xmm7; \
	pxor %xmm14,%xmm6; \
	movdqa %xmm6,%xmm14; \
	pand %xmm5,%xmm7; \
	pand %xmm3,%xmm5; \
	pandn %xmm10,%xmm14; \
	por %xmm3,%xmm7; \
	pandn %xmm10,%xmm12; \
	pand %xmm0,%xmm7; \
	pxor out4,%xmm12; \
	pxor %xmm9,%xmm7; \
	pand %xmm13,%xmm8; \
	pxor %xmm0,%xmm15; \
	pxor %xmm7,%xmm12; \
	pxor %xmm15,%xmm6; \
	pand %xmm3,%xmm13; \
	por %xmm2,%xmm6; \
	por %xmm11,%xmm15; \
	pandn %xmm6,%xmm8; \
	movdqa %xmm15,%xmm6; \
	pand %xmm4,%xmm8; \
	pandn %xmm7,%xmm6; \
	movdqa %xmm1,%xmm7; \
	pandn %xmm10,%xmm1; \
	pandn %xmm5,%xmm7; \
	por %xmm9,%xmm5; \
	pxor %xmm7,%xmm6; \
	movdqa %xmm2,%xmm7; \
	pxor %xmm9,%xmm15; \
	pandn %xmm6,%xmm7; \
	pandn %xmm1,%xmm2; \
	pandn %xmm5,%xmm7; \
	pxor pnot,%xmm15; \
	pxor out2,%xmm7; \
	pxor %xmm2,%xmm15; \
	pandn %xmm4,%xmm14; \
	movdqa %xmm12,out4; \
	por %xmm15,%xmm9; \
	pxor %xmm0,%xmm7; \
	pandn %xmm9,%xmm13; \
	por %xmm11,%xmm1; \
	pxor %xmm8,%xmm7; \
	pxor %xmm1,%xmm13; \
	por %xmm4,%xmm6; \
	pxor out1,%xmm14; \
	pxor %xmm13,%xmm6; \
	pxor %xmm15,%xmm14; \
	pxor out3,%xmm6; \
	movdqa %xmm7,out2; \
	movdqa %xmm14,out1; \
	movdqa %xmm6,out3
396
/*
 * S-box 4 (bitslice, smallest gate count of the eight): inputs a1..a6 =
 * %xmm0..%xmm5 (clobbered); results XOR-accumulated into out1..out4.
 * Scratch: %xmm6..%xmm12; reads pnot.
 * Machine-generated, Core 2-scheduled -- do not reorder.
 */
#define S4(out1, out2, out3, out4) \
	movdqa %xmm3,%xmm7; \
	movdqa %xmm1,%xmm8; \
	pxor %xmm2,%xmm0; \
	pxor %xmm4,%xmm2; \
	por %xmm1,%xmm3; \
	pandn %xmm2,%xmm1; \
	pxor %xmm4,%xmm3; \
	movdqa %xmm1,%xmm10; \
	pxor %xmm7,%xmm1; \
	pandn %xmm2,%xmm3; \
	movdqa %xmm1,%xmm11; \
	movdqa %xmm3,%xmm6; \
	pxor %xmm8,%xmm7; \
	por %xmm0,%xmm1; \
	pandn %xmm1,%xmm3; \
	movdqa %xmm3,%xmm1; \
	movdqa %xmm5,%xmm12; \
	pxor %xmm8,%xmm3; \
	pand %xmm3,%xmm11; \
	movdqa %xmm11,%xmm9; \
	por %xmm4,%xmm10; \
	pxor %xmm3,%xmm0; \
	pandn %xmm2,%xmm11; \
	pandn %xmm0,%xmm11; \
	pxor %xmm0,%xmm10; \
	movdqa %xmm7,%xmm0; \
	pxor %xmm11,%xmm6; \
	movdqa %xmm6,%xmm4; \
	pandn %xmm5,%xmm6; \
	pandn %xmm10,%xmm7; \
	pxor out1,%xmm6; \
	pandn %xmm4,%xmm5; \
	pxor %xmm1,%xmm7; \
	pxor %xmm7,%xmm6; \
	pxor pnot,%xmm7; \
	pxor %xmm7,%xmm5; \
	pxor %xmm4,%xmm7; \
	pxor out2,%xmm5; \
	movdqa %xmm5,out2; \
	pandn %xmm7,%xmm0; \
	movdqa %xmm12,%xmm7; \
	por %xmm9,%xmm0; \
	movdqa %xmm6,out1; \
	pxor %xmm10,%xmm0; \
	por %xmm3,%xmm12; \
	pxor %xmm0,%xmm12; \
	pxor out4,%xmm0; \
	pand %xmm7,%xmm3; \
	pxor out3,%xmm12; \
	movdqa %xmm12,out3; \
	pxor %xmm3,%xmm0; \
	movdqa %xmm0,out4
450
/*
 * S-box 5 (bitslice): inputs a1..a6 = %xmm0..%xmm5 (clobbered); results
 * XOR-accumulated into out1..out4.  Scratch: %xmm6..%xmm15; reads pnot.
 * Machine-generated, Core 2-scheduled -- do not reorder.
 */
#define S5(out1, out2, out3, out4) \
	movdqa %xmm2,%xmm6; \
	por %xmm0,%xmm2; \
	movdqa %xmm5,%xmm7; \
	pandn %xmm2,%xmm5; \
	movdqa %xmm3,%xmm14; \
	pandn %xmm5,%xmm3; \
	pxor %xmm0,%xmm5; \
	pxor %xmm6,%xmm3; \
	movdqa %xmm5,%xmm15; \
	pxor %xmm6,%xmm5; \
	movdqa %xmm3,%xmm10; \
	pand %xmm4,%xmm3; \
	movdqa %xmm5,%xmm8; \
	por %xmm0,%xmm5; \
	pxor %xmm14,%xmm3; \
	pxor %xmm5,%xmm3; \
	movdqa %xmm5,%xmm12; \
	por %xmm14,%xmm8; \
	pxor %xmm0,%xmm2; \
	pxor %xmm3,%xmm7; \
	pand %xmm14,%xmm12; \
	movdqa %xmm7,%xmm9; \
	por %xmm15,%xmm7; \
	pxor %xmm15,%xmm12; \
	pandn %xmm7,%xmm0; \
	pand %xmm4,%xmm7; \
	pxor %xmm8,%xmm4; \
	pxor %xmm7,%xmm12; \
	movdqa %xmm0,%xmm6; \
	pxor %xmm4,%xmm0; \
	pxor %xmm10,%xmm6; \
	movdqa %xmm1,%xmm13; \
	pandn %xmm4,%xmm6; \
	pand %xmm10,%xmm5; \
	pxor pnot,%xmm6; \
	por %xmm12,%xmm0; \
	pandn %xmm6,%xmm13; \
	movdqa %xmm7,%xmm6; \
	pandn %xmm10,%xmm7; \
	pxor %xmm8,%xmm10; \
	pandn %xmm0,%xmm7; \
	pxor %xmm13,%xmm3; \
	pand %xmm7,%xmm9; \
	movdqa %xmm7,%xmm0; \
	pxor %xmm4,%xmm9; \
	pandn %xmm8,%xmm0; \
	pand %xmm1,%xmm8; \
	por %xmm1,%xmm0; \
	pand %xmm9,%xmm14; \
	pxor %xmm2,%xmm7; \
	por %xmm9,%xmm5; \
	pxor %xmm14,%xmm7; \
	pxor %xmm6,%xmm5; \
	pxor %xmm7,%xmm0; \
	pxor %xmm15,%xmm9; \
	pandn %xmm10,%xmm7; \
	pand %xmm1,%xmm5; \
	pxor %xmm9,%xmm7; \
	pxor %xmm12,%xmm5; \
	pxor %xmm8,%xmm7; \
	pxor out3,%xmm3; \
	pxor out4,%xmm5; \
	pxor out1,%xmm0; \
	pxor out2,%xmm7; \
	movdqa %xmm3,out3; \
	movdqa %xmm5,out4; \
	movdqa %xmm0,out1; \
	movdqa %xmm7,out2
520
/*
 * S-box 6 (bitslice): inputs a1..a6 = %xmm0..%xmm5 (clobbered); results
 * XOR-accumulated into out1..out4.  Scratch: %xmm6..%xmm15; reads pnot.
 * Runs out of registers and additionally spills the original a5 and a1
 * to tmp_at(2) and tmp_at(1) respectively, reloading them later.
 * Machine-generated, Core 2-scheduled -- do not reorder.
 */
#define S6(out1, out2, out3, out4) \
	movdqa %xmm5,%xmm8; \
	por %xmm1,%xmm5; \
	movdqa %xmm4,%xmm7; \
	movdqa %xmm4,tmp_at(2); \
	movdqa %xmm2,%xmm11; \
	pxor %xmm1,%xmm4; \
	pand %xmm0,%xmm5; \
	movdqa %xmm3,%xmm15; \
	pxor %xmm5,%xmm4; \
	movdqa %xmm4,%xmm9; \
	pxor %xmm0,%xmm11; \
	pxor %xmm8,%xmm4; \
	movdqa %xmm0,tmp_at(1); \
	movdqa %xmm4,%xmm12; \
	pand %xmm0,%xmm4; \
	movdqa %xmm11,%xmm0; \
	pandn %xmm7,%xmm12; \
	movdqa %xmm4,%xmm10; \
	pxor %xmm1,%xmm4; \
	por %xmm1,%xmm11; \
	por %xmm4,%xmm0; \
	movdqa %xmm0,%xmm6; \
	por %xmm12,%xmm4; \
	pxor %xmm9,%xmm0; \
	pxor %xmm1,%xmm6; \
	movdqa %xmm4,%xmm14; \
	movdqa %xmm6,%xmm7; \
	pandn %xmm8,%xmm6; \
	pxor %xmm8,%xmm10; \
	pxor %xmm2,%xmm6; \
	pand %xmm0,%xmm2; \
	movdqa %xmm3,%xmm1; \
	pandn %xmm2,%xmm8; \
	movdqa %xmm2,%xmm13; \
	pxor %xmm8,%xmm14; \
	pxor %xmm11,%xmm7; \
	pand %xmm14,%xmm15; \
	pandn tmp_at(2),%xmm2; \
	pxor %xmm0,%xmm15; \
	por tmp_at(1),%xmm0; \
	pxor out4,%xmm15; \
	pand %xmm4,%xmm0; \
	por %xmm6,%xmm2; \
	pxor %xmm6,%xmm0; \
	pxor pnot,%xmm7; \
	pandn %xmm0,%xmm8; \
	pxor %xmm9,%xmm0; \
	por %xmm3,%xmm12; \
	pandn tmp_at(2),%xmm0; \
	por %xmm2,%xmm5; \
	pxor %xmm7,%xmm0; \
	pxor tmp_at(1),%xmm6; \
	pandn %xmm0,%xmm1; \
	pandn %xmm2,%xmm3; \
	pand %xmm10,%xmm6; \
	pxor %xmm3,%xmm7; \
	pxor out3,%xmm8; \
	pxor %xmm6,%xmm7; \
	pxor %xmm12,%xmm8; \
	pxor %xmm1,%xmm5; \
	pxor %xmm13,%xmm7; \
	pxor %xmm11,%xmm14; \
	pxor out2,%xmm5; \
	pxor %xmm14,%xmm5; \
	movdqa %xmm15,out4; \
	pxor out1,%xmm7; \
	movdqa %xmm8,out3; \
	movdqa %xmm5,out2; \
	movdqa %xmm7,out1
591
/*
 * S-box 7 (bitslice): inputs a1..a6 = %xmm0..%xmm5 (clobbered); results
 * XOR-accumulated into out1..out4.  Scratch: %xmm6..%xmm15; reads pnot.
 * Machine-generated, Core 2-scheduled -- do not reorder.
 */
#define S7(out1, out2, out3, out4) \
	movdqa %xmm4,%xmm14; \
	pxor %xmm3,%xmm4; \
	movdqa %xmm3,%xmm11; \
	movdqa %xmm4,%xmm12; \
	pand %xmm4,%xmm11; \
	pxor %xmm2,%xmm4; \
	movdqa %xmm11,%xmm6; \
	movdqa %xmm4,%xmm7; \
	movdqa %xmm11,%xmm15; \
	pand %xmm5,%xmm6; \
	pxor %xmm1,%xmm11; \
	movdqa %xmm7,%xmm13; \
	pand %xmm5,%xmm4; \
	movdqa %xmm11,%xmm10; \
	pxor %xmm5,%xmm12; \
	pxor %xmm2,%xmm6; \
	movdqa %xmm6,%xmm8; \
	por %xmm10,%xmm6; \
	pand %xmm4,%xmm11; \
	pandn %xmm0,%xmm11; \
	pxor %xmm12,%xmm6; \
	pxor %xmm4,%xmm8; \
	pandn %xmm14,%xmm7; \
	movdqa %xmm7,%xmm9; \
	pxor %xmm6,%xmm11; \
	pxor %xmm12,%xmm4; \
	por %xmm10,%xmm7; \
	pxor %xmm8,%xmm7; \
	pandn %xmm3,%xmm4; \
	pxor %xmm14,%xmm8; \
	pandn %xmm10,%xmm4; \
	pxor %xmm4,%xmm8; \
	pandn %xmm13,%xmm12; \
	pand %xmm8,%xmm2; \
	por %xmm15,%xmm6; \
	por %xmm2,%xmm6; \
	pxor %xmm12,%xmm6; \
	movdqa %xmm0,%xmm3; \
	pandn %xmm6,%xmm0; \
	movdqa %xmm6,%xmm4; \
	por %xmm8,%xmm6; \
	pand %xmm5,%xmm6; \
	pxor %xmm7,%xmm0; \
	por %xmm14,%xmm2; \
	pand %xmm6,%xmm1; \
	pxor %xmm4,%xmm7; \
	pxor %xmm6,%xmm2; \
	pxor %xmm7,%xmm1; \
	pxor %xmm14,%xmm7; \
	movdqa %xmm3,%xmm5; \
	por %xmm2,%xmm7; \
	pxor out1,%xmm0; \
	pand %xmm7,%xmm3; \
	pxor pnot,%xmm4; \
	pxor %xmm6,%xmm7; \
	por %xmm9,%xmm7; \
	pxor out4,%xmm11; \
	pxor %xmm3,%xmm8; \
	pxor %xmm4,%xmm7; \
	pandn %xmm7,%xmm5; \
	movdqa %xmm11,out4; \
	pxor out2,%xmm1; \
	movdqa %xmm0,out1; \
	pxor %xmm5,%xmm1; \
	pxor out3,%xmm8; \
	movdqa %xmm8,out3; \
	movdqa %xmm1,out2
660
/*
 * S-box 8 (bitslice): inputs a1..a6 = %xmm0..%xmm5 (clobbered); results
 * XOR-accumulated into out1..out4.  Scratch: %xmm6..%xmm15; reads pnot.
 * Machine-generated, Core 2-scheduled -- do not reorder.
 */
#define S8(out1, out2, out3, out4) \
	movdqa %xmm1,%xmm13; \
	pandn %xmm2,%xmm1; \
	movdqa %xmm2,%xmm11; \
	movdqa %xmm2,%xmm8; \
	pandn %xmm4,%xmm2; \
	movdqa %xmm1,%xmm6; \
	pxor %xmm3,%xmm2; \
	pandn %xmm13,%xmm11; \
	movdqa %xmm2,%xmm9; \
	pand %xmm0,%xmm2; \
	movdqa %xmm9,%xmm7; \
	pandn %xmm2,%xmm1; \
	pandn %xmm13,%xmm9; \
	pxor %xmm4,%xmm11; \
	movdqa %xmm9,%xmm12; \
	por %xmm0,%xmm9; \
	movdqa %xmm11,%xmm10; \
	pand %xmm9,%xmm11; \
	pxor pnot,%xmm7; \
	por %xmm11,%xmm2; \
	pxor %xmm11,%xmm7; \
	pandn %xmm8,%xmm9; \
	movdqa %xmm5,%xmm15; \
	pxor %xmm9,%xmm7; \
	por %xmm1,%xmm15; \
	pxor %xmm7,%xmm6; \
	pxor %xmm6,%xmm15; \
	pxor %xmm0,%xmm6; \
	movdqa %xmm6,%xmm14; \
	pxor %xmm13,%xmm7; \
	pand %xmm4,%xmm6; \
	pxor out2,%xmm15; \
	pxor %xmm7,%xmm6; \
	pxor %xmm6,%xmm12; \
	movdqa %xmm15,out2; \
	pxor %xmm2,%xmm6; \
	pxor %xmm4,%xmm14; \
	por %xmm13,%xmm6; \
	pand %xmm5,%xmm2; \
	por %xmm3,%xmm7; \
	pxor %xmm12,%xmm10; \
	pxor %xmm10,%xmm7; \
	pxor %xmm14,%xmm6; \
	pxor %xmm6,%xmm2; \
	pxor %xmm7,%xmm0; \
	pandn %xmm10,%xmm3; \
	pand %xmm5,%xmm0; \
	pand %xmm3,%xmm6; \
	pxor out3,%xmm2; \
	pxor %xmm6,%xmm7; \
	pxor %xmm1,%xmm7; \
	pxor out4,%xmm0; \
	movdqa %xmm2,out3; \
	por %xmm7,%xmm5; \
	pxor out1,%xmm5; \
	pxor %xmm12,%xmm0; \
	pxor %xmm12,%xmm5; \
	movdqa %xmm0,out4; \
	movdqa %xmm5,out1
721
/*
 * "zero" must actually contain all-zero bits when the clear macros below
 * run; callers establish that with "pxor zero,zero" first.
 */
#define zero				%xmm5

/* Store the zero vector into the 8 consecutive B() slots starting at i. */
#define DES_bs_clear_block_8(i) \
	movdqa zero,B(i); \
	movdqa zero,B(i + 1); \
	movdqa zero,B(i + 2); \
	movdqa zero,B(i + 3); \
	movdqa zero,B(i + 4); \
	movdqa zero,B(i + 5); \
	movdqa zero,B(i + 6); \
	movdqa zero,B(i + 7)

/* Zero the whole bitslice data block B(0)..B(63). */
#define DES_bs_clear_block \
	DES_bs_clear_block_8(0); \
	DES_bs_clear_block_8(8); \
	DES_bs_clear_block_8(16); \
	DES_bs_clear_block_8(24); \
	DES_bs_clear_block_8(32); \
	DES_bs_clear_block_8(40); \
	DES_bs_clear_block_8(48); \
	DES_bs_clear_block_8(56)

/*
 * k_ptr walks the key schedule; K(i) indexes it as 16-byte vectors,
 * k(i) indexes it as pointers.
 */
#define k_ptr				%rdx
#define K(i)				nvec(i)(k_ptr)
#define k(i)				nptr(i)(k_ptr)

/*
 * Scratch pointer registers.  Under ILP32/X32 pointers are loaded through
 * the 32-bit views (tmp1p/tmp2p): writing a 32-bit register zero-extends
 * into the full 64-bit register, so (tmp1)/(tmp2) stay valid addresses.
 */
#define tmp1				%rcx
#define tmp2				%rsi
#ifdef __ILP32__
#define tmp1p				%ecx
#define tmp2p				%esi
#else
#define tmp1p				tmp1
#define tmp2p				tmp2
#endif
757
/*
 * xor_E(i): load the six E-expansion pointers E(i)..E(i+5) and XOR the
 * vectors they point at with the key vectors K(i)..K(i+5), leaving the
 * six S-box inputs in a1..a6.  Loads and XORs are interleaved in pairs
 * to hide latency.
 */
#define xor_E(i) \
	mov E(i),tmp1p; \
	movdqa K(i),a1; \
	mov E(i + 1),tmp2p; \
	movdqa K(i + 1),a2; \
	pxor (tmp1),a1; \
	pxor (tmp2),a2; \
	mov E(i + 2),tmp1p; \
	movdqa K(i + 2),a3; \
	mov E(i + 3),tmp2p; \
	movdqa K(i + 3),a4; \
	pxor (tmp1),a3; \
	pxor (tmp2),a4; \
	mov E(i + 4),tmp1p; \
	movdqa K(i + 4),a5; \
	mov E(i + 5),tmp2p; \
	movdqa K(i + 5),a6; \
	pxor (tmp1),a5; \
	pxor (tmp2),a6

/*
 * xor_B: same as xor_E but with the six B() indices given directly
 * (no pointer indirection): aN = B(bN) ^ K(kN).
 */
#define xor_B(b1, k1, b2, k2, b3, k3, b4, k4, b5, k5, b6, k6) \
	movdqa B(b1),a1; \
	movdqa B(b2),a2; \
	pxor K(k1),a1; \
	movdqa B(b3),a3; \
	pxor K(k2),a2; \
	movdqa B(b4),a4; \
	pxor K(k3),a3; \
	movdqa B(b5),a5; \
	pxor K(k4),a4; \
	movdqa B(b6),a6; \
	pxor K(k5),a5; \
	pxor K(k6),a6
791
/*
 * xor_B_KS_p family: like xor_B, but the keys come from the pointer-form
 * schedule k(kN), so each key vector is reached through one indirection.
 * The prefix computes a1..a4 and leaves the k(k6) pointer in tmp1; the
 * suffix finishes a6 (using that tmp1) and a5.  The "_special" variant
 * skips loading B(b6) into a6 -- the caller must have a6 set up already.
 */
#define xor_B_KS_p_prefix(b1, k1, b2, k2, b3, k3, b4, k4, k6) \
	mov k(k1),tmp1; \
	mov k(k2),tmp2; \
	movdqa B(b1),a1; \
	movdqa B(b2),a2; \
	pxor (tmp1),a1; \
	mov k(k3),tmp1p; \
	pxor (tmp2),a2; \
	mov k(k4),tmp2p; \
	movdqa B(b3),a3; \
	movdqa B(b4),a4; \
	pxor (tmp1),a3; \
	mov k(k6),tmp1; \
	pxor (tmp2),a4

#define xor_B_KS_p_suffix(b5, k5) \
	mov k(k5),tmp2; \
	movdqa B(b5),a5; \
	pxor (tmp1),a6; \
	pxor (tmp2),a5

#define xor_B_KS_p(b1, k1, b2, k2, b3, k3, b4, k4, b5, k5, b6, k6) \
	xor_B_KS_p_prefix(b1, k1, b2, k2, b3, k3, b4, k4, k6); \
	movdqa B(b6),a6; \
	xor_B_KS_p_suffix(b5, k5)

#define xor_B_KS_p_special(b1, k1, b2, k2, b3, k3, b4, k4, b5, k5, k6) \
	xor_B_KS_p_prefix(b1, k1, b2, k2, b3, k3, b4, k4, k6); \
	xor_B_KS_p_suffix(b5, k5)

/*
 * Per-byte single-bit masks 0x01..0x80, living in tmp vector slots 8..15;
 * initialized by DES_bs_init_asm.
 */
#define mask01				tmp_at(8)
#define mask02				tmp_at(9)
#define mask04				tmp_at(10)
#define mask08				tmp_at(11)
#define mask10				tmp_at(12)
#define mask20				tmp_at(13)
#define mask40				tmp_at(14)
#define mask80				tmp_at(15)

/* v_ptr walks a vector array; V(i) is its i-th 16-byte element. */
#define v_ptr				%rax
#define V(i)				nvec(i)(v_ptr)

/*
 * Shift every byte left by one bit.  paddb reg,reg doubles each byte,
 * which equals a 1-bit left shift within each byte; in this file SHLB1
 * is only ever applied to values masked to a single bit below bit 7 per
 * byte, so no cross-byte carry can occur and it matches psllq $1.
 */
#if 1
#define SHLB1(reg)			paddb reg,reg
#else
#define SHLB1(reg)			psllq $1,reg
#endif
839
/*
 * FINALIZE_NEXT_KEY_BITS_0_6: transpose key bits 0..6 out of the eight
 * source vectors V(0)..V(7) into the seven key vectors K(0)..K(6).
 * For each output bit b (0..6): mask bit b of every V(i) with the
 * matching per-byte mask -- the caller preloads %xmm7 = mask01 through
 * %xmm13 = mask40 -- shift that bit into bit position i of its byte
 * (psrlq/SHLB1/psllq by |i - b|, all shifts < 8 so bits never cross a
 * byte boundary), then OR the eight contributions together and store
 * the result to K(b).  So byte lanes of K(b) collect bit b of
 * V(0)..V(7) in bit positions 0..7.
 * Clobbers %xmm0..%xmm6; %xmm7..%xmm13 (the masks) are preserved.
 * The caller advances k_ptr and v_ptr afterwards.
 */
#define FINALIZE_NEXT_KEY_BITS_0_6 \
	movdqa V(0),%xmm0; \
	movdqa V(1),%xmm1; \
	movdqa V(2),%xmm2; \
	movdqa V(3),%xmm3; \
	pand %xmm7,%xmm0; \
	pand %xmm7,%xmm1; \
	pand %xmm7,%xmm2; \
	pand %xmm7,%xmm3; \
	SHLB1(%xmm1); \
	psllq $2,%xmm2; \
	psllq $3,%xmm3; \
	por %xmm0,%xmm1; \
	por %xmm2,%xmm3; \
	movdqa V(4),%xmm4; \
	movdqa V(5),%xmm5; \
	por %xmm1,%xmm3; \
	pand %xmm7,%xmm4; \
	pand %xmm7,%xmm5; \
	movdqa V(6),%xmm6; \
	movdqa V(7),%xmm0; \
	psllq $4,%xmm4; \
	pand %xmm7,%xmm6; \
	pand %xmm7,%xmm0; \
	psllq $5,%xmm5; \
	psllq $6,%xmm6; \
	psllq $7,%xmm0; \
	por %xmm4,%xmm5; \
	por %xmm6,%xmm3; \
	por %xmm5,%xmm0; \
	movdqa V(1),%xmm1; \
	por %xmm3,%xmm0; \
	movdqa V(2),%xmm2; \
	movdqa %xmm0,K(0); \
\
	movdqa V(0),%xmm0; \
	movdqa V(3),%xmm3; \
	pand %xmm8,%xmm1; \
	pand %xmm8,%xmm2; \
	pand %xmm8,%xmm0; \
	pand %xmm8,%xmm3; \
	psrlq $1,%xmm0; \
	SHLB1(%xmm2); \
	psllq $2,%xmm3; \
	por %xmm0,%xmm1; \
	por %xmm2,%xmm3; \
	movdqa V(4),%xmm4; \
	movdqa V(5),%xmm5; \
	por %xmm1,%xmm3; \
	pand %xmm8,%xmm4; \
	pand %xmm8,%xmm5; \
	movdqa V(6),%xmm6; \
	movdqa V(7),%xmm0; \
	psllq $3,%xmm4; \
	pand %xmm8,%xmm6; \
	pand %xmm8,%xmm0; \
	psllq $4,%xmm5; \
	psllq $5,%xmm6; \
	psllq $6,%xmm0; \
	por %xmm4,%xmm5; \
	por %xmm6,%xmm3; \
	por %xmm5,%xmm0; \
	movdqa V(1),%xmm1; \
	por %xmm3,%xmm0; \
	movdqa V(2),%xmm2; \
	movdqa %xmm0,K(1); \
\
	movdqa V(0),%xmm0; \
	movdqa V(3),%xmm3; \
	pand %xmm9,%xmm1; \
	pand %xmm9,%xmm2; \
	pand %xmm9,%xmm0; \
	pand %xmm9,%xmm3; \
	psrlq $1,%xmm1; \
	psrlq $2,%xmm0; \
	SHLB1(%xmm3); \
	por %xmm0,%xmm1; \
	por %xmm2,%xmm3; \
	movdqa V(4),%xmm4; \
	movdqa V(5),%xmm5; \
	por %xmm1,%xmm3; \
	pand %xmm9,%xmm4; \
	pand %xmm9,%xmm5; \
	movdqa V(6),%xmm6; \
	movdqa V(7),%xmm0; \
	psllq $2,%xmm4; \
	pand %xmm9,%xmm6; \
	pand %xmm9,%xmm0; \
	psllq $3,%xmm5; \
	psllq $4,%xmm6; \
	psllq $5,%xmm0; \
	por %xmm4,%xmm5; \
	por %xmm6,%xmm3; \
	por %xmm5,%xmm0; \
	movdqa V(1),%xmm1; \
	por %xmm3,%xmm0; \
	movdqa V(2),%xmm2; \
	movdqa %xmm0,K(2); \
\
	movdqa V(0),%xmm0; \
	movdqa V(3),%xmm3; \
	pand %xmm10,%xmm1; \
	pand %xmm10,%xmm2; \
	pand %xmm10,%xmm0; \
	pand %xmm10,%xmm3; \
	psrlq $2,%xmm1; \
	psrlq $3,%xmm0; \
	psrlq $1,%xmm2; \
	por %xmm0,%xmm1; \
	por %xmm2,%xmm3; \
	movdqa V(4),%xmm4; \
	movdqa V(5),%xmm5; \
	por %xmm1,%xmm3; \
	pand %xmm10,%xmm4; \
	pand %xmm10,%xmm5; \
	movdqa V(6),%xmm6; \
	movdqa V(7),%xmm0; \
	SHLB1(%xmm4); \
	pand %xmm10,%xmm6; \
	pand %xmm10,%xmm0; \
	psllq $2,%xmm5; \
	psllq $3,%xmm6; \
	psllq $4,%xmm0; \
	por %xmm4,%xmm5; \
	por %xmm6,%xmm3; \
	por %xmm5,%xmm0; \
	movdqa V(1),%xmm1; \
	por %xmm3,%xmm0; \
	movdqa V(2),%xmm2; \
	movdqa %xmm0,K(3); \
\
	movdqa V(0),%xmm0; \
	movdqa V(3),%xmm3; \
	pand %xmm11,%xmm1; \
	pand %xmm11,%xmm2; \
	pand %xmm11,%xmm0; \
	pand %xmm11,%xmm3; \
	psrlq $3,%xmm1; \
	psrlq $4,%xmm0; \
	psrlq $2,%xmm2; \
	psrlq $1,%xmm3; \
	por %xmm0,%xmm1; \
	por %xmm2,%xmm3; \
	movdqa V(4),%xmm4; \
	movdqa V(5),%xmm5; \
	por %xmm1,%xmm3; \
	pand %xmm11,%xmm4; \
	pand %xmm11,%xmm5; \
	movdqa V(6),%xmm6; \
	movdqa V(7),%xmm0; \
	pand %xmm11,%xmm6; \
	pand %xmm11,%xmm0; \
	SHLB1(%xmm5); \
	psllq $2,%xmm6; \
	psllq $3,%xmm0; \
	por %xmm4,%xmm5; \
	por %xmm6,%xmm3; \
	por %xmm5,%xmm0; \
	movdqa V(1),%xmm1; \
	por %xmm3,%xmm0; \
	movdqa V(2),%xmm2; \
	movdqa %xmm0,K(4); \
\
	movdqa V(0),%xmm0; \
	movdqa V(3),%xmm3; \
	pand %xmm12,%xmm1; \
	pand %xmm12,%xmm2; \
	pand %xmm12,%xmm0; \
	pand %xmm12,%xmm3; \
	psrlq $4,%xmm1; \
	psrlq $5,%xmm0; \
	psrlq $3,%xmm2; \
	psrlq $2,%xmm3; \
	por %xmm0,%xmm1; \
	por %xmm2,%xmm3; \
	movdqa V(4),%xmm4; \
	movdqa V(5),%xmm5; \
	por %xmm1,%xmm3; \
	pand %xmm12,%xmm4; \
	pand %xmm12,%xmm5; \
	movdqa V(6),%xmm6; \
	movdqa V(7),%xmm0; \
	psrlq $1,%xmm4; \
	pand %xmm12,%xmm6; \
	pand %xmm12,%xmm0; \
	SHLB1(%xmm6); \
	psllq $2,%xmm0; \
	por %xmm4,%xmm5; \
	por %xmm6,%xmm3; \
	por %xmm5,%xmm0; \
	movdqa V(1),%xmm1; \
	por %xmm3,%xmm0; \
	movdqa V(2),%xmm2; \
	movdqa %xmm0,K(5); \
\
	movdqa V(0),%xmm0; \
	movdqa V(3),%xmm3; \
	pand %xmm13,%xmm1; \
	pand %xmm13,%xmm2; \
	pand %xmm13,%xmm0; \
	pand %xmm13,%xmm3; \
	psrlq $5,%xmm1; \
	psrlq $6,%xmm0; \
	psrlq $4,%xmm2; \
	psrlq $3,%xmm3; \
	por %xmm0,%xmm1; \
	por %xmm2,%xmm3; \
	movdqa V(4),%xmm4; \
	movdqa V(5),%xmm5; \
	por %xmm1,%xmm3; \
	pand %xmm13,%xmm4; \
	pand %xmm13,%xmm5; \
	movdqa V(6),%xmm6; \
	movdqa V(7),%xmm0; \
	psrlq $2,%xmm4; \
	pand %xmm13,%xmm6; \
	pand %xmm13,%xmm0; \
	psrlq $1,%xmm5; \
	SHLB1(%xmm0); \
	por %xmm4,%xmm5; \
	por %xmm6,%xmm3; \
	por %xmm5,%xmm0; \
	por %xmm3,%xmm0; \
	movdqa %xmm0,K(6)
1064
.text

DO_ALIGN(6)
.globl DES_bs_init_asm
/*
 * DES_bs_init_asm(): one-time setup of the vector constants used by the
 * rest of this file.  pcmpeqd of a register with itself yields all ones,
 * which becomes pnot; paddb then doubles each byte (0xFF -> 0xFE) and
 * the pxor with pnot flips it to 0x01 per byte, giving mask01.  Repeated
 * SHLB1 doubling then produces mask02 .. mask80 (0x02, 0x04, ... 0x80
 * in every byte).  Clobbers %xmm0 only; takes no arguments.
 */
DES_bs_init_asm:
	pcmpeqd %xmm0,%xmm0
	movdqa %xmm0,pnot
	paddb %xmm0,%xmm0
	pxor pnot,%xmm0
	movdqa %xmm0,mask01
	SHLB1(%xmm0)
	movdqa %xmm0,mask02
	SHLB1(%xmm0)
	movdqa %xmm0,mask04
	SHLB1(%xmm0)
	movdqa %xmm0,mask08
	SHLB1(%xmm0)
	movdqa %xmm0,mask10
	SHLB1(%xmm0)
	movdqa %xmm0,mask20
	SHLB1(%xmm0)
	movdqa %xmm0,mask40
	SHLB1(%xmm0)
	movdqa %xmm0,mask80
	ret
1090
/*
 * Loop counters.  "iterations" lives in %edi, i.e. the low half of the
 * first-argument register; the caller-supplied count is consumed
 * directly from there.  NOTE(review): the comment near the top of this
 * file says ARG1 is *pcount -- for this function the value appears to be
 * passed by value instead; confirm against the C prototype.
 */
#define iterations			%edi
#define rounds_and_swapped		%eax

DO_ALIGN(6)
.globl DES_bs_crypt
/*
 * DES_bs_crypt: run "iterations" full 16-round DES encryptions over the
 * bitslice block B, with the salt applied through the E() pointers.
 * If the keys changed since the last call, the expanded schedule is
 * refreshed first via DES_bs_finalize_keys (%rdi preserved around the
 * call).  B is zeroed, then each 16-round pass is executed as 8 pairs
 * of rounds: the _start half writes B(32..63), the _swap half writes
 * B(0..31), advancing k_ptr by 96 vectors per pair.
 * rounds_and_swapped doubles as state: it counts down from 8, and the
 * values 0x100/0x108 written below mark whether the halves are currently
 * swapped -- NOTE(review): inferred from the compare with $0x100; see
 * the matching logic in DES_bs_crypt_25.
 */
DES_bs_crypt:
	PROLOGUE
	cmpl $0,DES_bs_all_keys_changed(%rip)
	jz DES_bs_crypt_body
	pushq %rdi
	call DES_bs_finalize_keys
	popq %rdi
DES_bs_crypt_body:
	pxor zero,zero
	leaq DES_bs_all_KS_v(%rip),k_ptr
	DES_bs_clear_block
	movl $8,rounds_and_swapped
DES_bs_crypt_start:
	xor_E(0)
	S1(B(40), B(48), B(54), B(62))
	xor_E(6)
	S2(B(44), B(59), B(33), B(49))
	xor_E(12)
	S3(B(55), B(47), B(61), B(37))
	xor_E(18)
	S4(B(57), B(51), B(41), B(32))
	xor_E(24)
	S5(B(39), B(45), B(56), B(34))
	xor_E(30)
	S6(B(35), B(60), B(42), B(50))
	xor_E(36)
	S7(B(63), B(43), B(53), B(38))
	xor_E(42)
	S8(B(36), B(58), B(46), B(52))
	cmpl $0x100,rounds_and_swapped
	je DES_bs_crypt_next
DES_bs_crypt_swap:
	xor_E(48)
	S1(B(8), B(16), B(22), B(30))
	xor_E(54)
	S2(B(12), B(27), B(1), B(17))
	xor_E(60)
	S3(B(23), B(15), B(29), B(5))
	xor_E(66)
	S4(B(25), B(19), B(9), B(0))
	xor_E(72)
	S5(B(7), B(13), B(24), B(2))
	xor_E(78)
	S6(B(3), B(28), B(10), B(18))
	xor_E(84)
	S7(B(31), B(11), B(21), B(6))
	xor_E(90)
	addq $nvec(96),k_ptr
	S8(B(4), B(26), B(14), B(20))
	subl $1,rounds_and_swapped
	jnz DES_bs_crypt_start
	/* Finished 16 rounds: rewind k_ptr to the schedule start. */
	subq $nvec(0x300+48),k_ptr
	movl $0x108,rounds_and_swapped
	subl $1,iterations
	jnz DES_bs_crypt_swap
	EPILOGUE
	ret
DES_bs_crypt_next:
	subq $nvec(0x300-48),k_ptr
	movl $8,rounds_and_swapped
	subl $1,iterations
	jnz DES_bs_crypt_start
	EPILOGUE
	ret
1160
DO_ALIGN(6)
.globl DES_bs_crypt_25
/*
 * DES_bs_crypt_25: same structure as DES_bs_crypt, but with the
 * iteration count hard-wired to 25 (movl $25 below; traditional
 * DES-based crypt(3) iterates DES 25 times).  If the keys changed, it
 * enters through DES_bs_finalize_keys_25, whose pushed return address
 * brings control back to DES_bs_crypt_25_body.
 * Some rounds use xor_B with fixed B indices instead of going through
 * the E() pointers -- NOTE(review): presumably those E entries are not
 * affected by the salt, so the indirection can be skipped; confirm
 * against the C-side E-expansion setup.
 * The _next path below ends in an unconditional jmp after decrementing
 * iterations: with an odd count of 25 the loop always terminates via
 * the _swap path, so no zero test is needed here.
 */
DES_bs_crypt_25:
	PROLOGUE
	cmpl $0,DES_bs_all_keys_changed(%rip)
	jnz DES_bs_finalize_keys_25
DES_bs_crypt_25_body:
	pxor zero,zero
	leaq DES_bs_all_KS_v(%rip),k_ptr
	DES_bs_clear_block
	movl $8,rounds_and_swapped
	movl $25,iterations
DES_bs_crypt_25_start:
	xor_E(0)
	S1(B(40), B(48), B(54), B(62))
	xor_E(6)
	S2(B(44), B(59), B(33), B(49))
	xor_B(7, 12, 8, 13, 9, 14, 10, 15, 11, 16, 12, 17)
	S3(B(55), B(47), B(61), B(37))
	xor_B(11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 23)
	S4(B(57), B(51), B(41), B(32))
	xor_E(24)
	S5(B(39), B(45), B(56), B(34))
	xor_E(30)
	S6(B(35), B(60), B(42), B(50))
	xor_B(23, 36, 24, 37, 25, 38, 26, 39, 27, 40, 28, 41)
	S7(B(63), B(43), B(53), B(38))
	xor_B(27, 42, 28, 43, 29, 44, 30, 45, 31, 46, 0, 47)
	S8(B(36), B(58), B(46), B(52))
	cmpl $0x100,rounds_and_swapped
	je DES_bs_crypt_25_next
DES_bs_crypt_25_swap:
	xor_E(48)
	S1(B(8), B(16), B(22), B(30))
	xor_E(54)
	S2(B(12), B(27), B(1), B(17))
	xor_B(39, 60, 40, 61, 41, 62, 42, 63, 43, 64, 44, 65)
	S3(B(23), B(15), B(29), B(5))
	xor_B(43, 66, 44, 67, 45, 68, 46, 69, 47, 70, 48, 71)
	S4(B(25), B(19), B(9), B(0))
	xor_E(72)
	S5(B(7), B(13), B(24), B(2))
	xor_E(78)
	S6(B(3), B(28), B(10), B(18))
	xor_B(55, 84, 56, 85, 57, 86, 58, 87, 59, 88, 60, 89)
	S7(B(31), B(11), B(21), B(6))
	xor_B(59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 32, 95)
	S8(B(4), B(26), B(14), B(20))
	addq $nvec(96),k_ptr
	subl $1,rounds_and_swapped
	jnz DES_bs_crypt_25_start
	subq $nvec(0x300+48),k_ptr
	movl $0x108,rounds_and_swapped
	subl $1,iterations
	jnz DES_bs_crypt_25_swap
	EPILOGUE
	ret
DES_bs_crypt_25_next:
	subq $nvec(0x300-48),k_ptr
	movl $8,rounds_and_swapped
	subl $1,iterations
	jmp DES_bs_crypt_25_start
1223
/*
 * DES_bs_finalize_keys_25: trampoline entry used by DES_bs_crypt_25.
 * It pushes the address of DES_bs_crypt_25_body so that the "ret" at
 * the end of DES_bs_finalize_keys returns straight into the crypt loop.
 */
DES_bs_finalize_keys_25:
	leaq DES_bs_crypt_25_body(%rip),tmp1
	pushq tmp1
/*
 * DES_bs_finalize_keys: rebuild the expanded key schedule after a key
 * change.  Phase 1 preloads the bit masks into %xmm7..%xmm13 and runs
 * FINALIZE_NEXT_KEY_BITS_0_6 eight times, transposing 8 groups of 7 key
 * bits from the 64 xkeys vectors into the 56 vectors of DES_bs_all_K
 * (k_ptr advances by 7 vectors and v_ptr by 8 per group), clearing the
 * keys_changed flag along the way.  Phase 2 materializes the schedule:
 * 0x60 (96) times it reads 8 pointers from DES_bs_all_KSp, dereferences
 * each, and copies the pointed-to vectors into DES_bs_all_KS_v --
 * 96 * 8 = 0x300 vectors, matching the KS_v reservation.
 * Clobbers %rax, %rcx, %rsi, %rdx, %edi and %xmm0..%xmm13.
 */
DES_bs_finalize_keys:
	movdqa mask01,%xmm7
	movdqa mask02,%xmm8
	leaq DES_bs_all_xkeys(%rip),v_ptr
	movdqa mask04,%xmm9
	movdqa mask08,%xmm10
	leaq DES_bs_all_K(%rip),k_ptr
	movl $8,iterations
	movdqa mask10,%xmm11
	movdqa mask20,%xmm12
	movl $0,DES_bs_all_keys_changed(%rip)
	movdqa mask40,%xmm13
DES_bs_finalize_keys_main_loop:
	FINALIZE_NEXT_KEY_BITS_0_6
	addq $nvec(7),k_ptr
	addq $nvec(8),v_ptr
	subl $1,iterations
	jnz DES_bs_finalize_keys_main_loop
	leaq DES_bs_all_KSp(%rip),k_ptr
	leaq DES_bs_all_KS_v(%rip),v_ptr
	movl $0x60,iterations
DES_bs_finalize_keys_expand_loop:
	mov k(0),tmp1p
	mov k(1),tmp2p
	movdqa (tmp1),%xmm0
	movdqa (tmp2),%xmm1
	mov k(2),tmp1p
	mov k(3),tmp2p
	movdqa %xmm0,V(0)
	movdqa %xmm1,V(1)
	movdqa (tmp1),%xmm0
	movdqa (tmp2),%xmm1
	mov k(4),tmp1p
	mov k(5),tmp2p
	movdqa %xmm0,V(2)
	movdqa %xmm1,V(3)
	movdqa (tmp1),%xmm0
	movdqa (tmp2),%xmm1
	mov k(6),tmp1p
	mov k(7),tmp2p
	movdqa %xmm0,V(4)
	movdqa %xmm1,V(5)
	movdqa (tmp1),%xmm0
	movdqa (tmp2),%xmm1
	addq $nptr(8),k_ptr
	movdqa %xmm0,V(6)
	movdqa %xmm1,V(7)
	addq $nvec(8),v_ptr
	subl $1,iterations
	jnz DES_bs_finalize_keys_expand_loop
	ret
1278
#define ones				%xmm1

#define rounds				%eax

DO_ALIGN(6)
.globl DES_bs_crypt_LM
/*
 * DES_bs_crypt_LM: compute LM hashes -- a single (16-round) DES
 * encryption of a fixed plaintext constant under the current keys,
 * with no salt.  Reads *(int *)ARG1 at entry and returns that value in
 * %rax (staged in %r8 across the computation, which %r8 survives).
 * NOTE(review): presumably the keys count passed by the C caller --
 * confirm against the C-side prototype.
 */
DES_bs_crypt_LM:
	PROLOGUE
	movl (ARG1),%r8d		/* value to return at the end */
	movdqa mask01,%xmm7		/* per-bit masks for the key transpose */
	movdqa mask02,%xmm8
	leaq DES_bs_all_xkeys(%rip),v_ptr
	movdqa mask04,%xmm9
	movdqa mask08,%xmm10
	leaq DES_bs_all_K(%rip),k_ptr
	movdqa mask10,%xmm11
	movdqa mask20,%xmm12
	movl $7,iterations		/* LM keys are 7 characters */
	movdqa mask40,%xmm13
	movdqa mask80,%xmm14		/* LM uses all 8 bits of each key byte */
DES_bs_finalize_keys_LM_loop:
	FINALIZE_NEXT_KEY_BITS_0_6
# bit 7
/*
 * Extract bit 7 of the key vectors V(0..7): mask with the 0x80 mask,
 * shift each into its own bit position (7 down to 0) and OR-merge the
 * results into the single vector K(7).
 */
	movdqa V(0),%xmm0
	movdqa V(1),%xmm1
	movdqa V(2),%xmm2
	movdqa V(3),%xmm3
	pand %xmm14,%xmm0
	pand %xmm14,%xmm1
	pand %xmm14,%xmm2
	pand %xmm14,%xmm3
	psrlq $7,%xmm0
	psrlq $6,%xmm1
	psrlq $5,%xmm2
	psrlq $4,%xmm3
	por %xmm0,%xmm1
	por %xmm2,%xmm3
	movdqa V(4),%xmm4
	movdqa V(5),%xmm5
	por %xmm1,%xmm3
	pand %xmm14,%xmm4
	pand %xmm14,%xmm5
	movdqa V(6),%xmm6
	movdqa V(7),%xmm0
	psrlq $3,%xmm4
	pand %xmm14,%xmm6
	pand %xmm14,%xmm0
	psrlq $2,%xmm5
	psrlq $1,%xmm6		/* V(7)'s bit stays in place (shift by 0) */
	por %xmm4,%xmm5
	por %xmm6,%xmm3
	por %xmm5,%xmm0
	addq $nvec(8),v_ptr
	por %xmm3,%xmm0
	movdqa %xmm0,K(7)
	addq $nvec(8),k_ptr		/* 8 bit vectors (bits 0-7) per key byte */
	subl $1,iterations
	jnz DES_bs_finalize_keys_LM_loop

/*
 * Initialize the block B(0..63) with the bitslice form of the fixed LM
 * plaintext: each bit position is a vector of all-zeroes or all-ones.
 * NOTE(review): this pattern encodes the (permuted) LM magic constant
 * "KGS!@#$%" -- confirm against the non-SIMD implementation.
 */
	pxor zero,zero
	pcmpeqd ones,ones		/* all-ones vector */
	leaq DES_bs_all_KS_p(%rip),k_ptr
	movdqa zero,B(0)
	movdqa zero,B(1)
	movdqa zero,B(2)
	movdqa zero,B(3)
	movdqa zero,B(4)
	movdqa zero,B(5)
	movdqa zero,B(6)
	movdqa zero,B(7)
	movdqa ones,B(8)
	movdqa ones,B(9)
	movdqa ones,B(10)
	movdqa zero,B(11)
	movdqa ones,B(12)
	movdqa zero,B(13)
	movdqa zero,B(14)
	movdqa zero,B(15)
	movdqa zero,B(16)
	movdqa zero,B(17)
	movdqa zero,B(18)
	movdqa zero,B(19)
	movdqa zero,B(20)
	movdqa zero,B(21)
	movdqa zero,B(22)
	movdqa ones,B(23)
	movdqa zero,B(24)
	movdqa zero,B(25)
	movdqa ones,B(26)
	movdqa zero,B(27)
	movdqa zero,B(28)
	movdqa ones,B(29)
	movdqa ones,B(30)
	movdqa ones,B(31)
	movdqa zero,B(32)
	movdqa zero,B(33)
	movdqa zero,B(34)
	movdqa ones,B(35)
	movdqa zero,B(36)
	movdqa ones,B(37)
	movdqa ones,B(38)
	movdqa ones,B(39)
	movdqa zero,B(40)
	movdqa zero,B(41)
	movdqa zero,B(42)
	movdqa zero,B(43)
	movdqa zero,B(44)
	movdqa ones,B(45)
	movdqa zero,B(46)
	movdqa zero,B(47)
	movdqa ones,B(48)
	movdqa ones,B(49)
	movdqa zero,B(50)
	movdqa zero,B(51)
	movdqa zero,B(52)
	movdqa zero,B(53)
	movdqa ones,B(54)
	movdqa zero,B(55)
	movdqa ones,B(56)
	movdqa zero,B(57)
	movdqa ones,B(58)
	movdqa zero,B(59)
	movdqa ones,B(60)
	movdqa ones,B(61)
	movdqa ones,B(62)
	movdqa ones,B(63)
	movl $8,rounds			/* 8 loop iterations x 2 DES rounds = 16 */
DES_bs_crypt_LM_loop:
/*
 * One pair of DES rounds.  xor_B_KS_p() XORs subkey bits (fetched via
 * the precomputed pointer schedule KS_p) into the expanded block bits
 * ahead of each bitslice S-box; the _special variants differ per the
 * macro definitions earlier in this file.
 */
	xor_B_KS_p_special(31, 0, 0, 1, 1, 2, 2, 3, 3, 4, 5)
	S1(B(40), B(48), B(54), B(62))
	xor_B_KS_p(3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11)
	S2(B(44), B(59), B(33), B(49))
	xor_B_KS_p(7, 12, 8, 13, 9, 14, 10, 15, 11, 16, 12, 17)
	S3(B(55), B(47), B(61), B(37))
	xor_B_KS_p(11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 23)
	S4(B(57), B(51), B(41), B(32))
	xor_B_KS_p(15, 24, 16, 25, 17, 26, 18, 27, 19, 28, 20, 29)
	S5(B(39), B(45), B(56), B(34))
	xor_B_KS_p(19, 30, 20, 31, 21, 32, 22, 33, 23, 34, 24, 35)
	S6(B(35), B(60), B(42), B(50))
	xor_B_KS_p(23, 36, 24, 37, 25, 38, 26, 39, 27, 40, 28, 41)
	S7(B(63), B(43), B(53), B(38))
	xor_B_KS_p(27, 42, 28, 43, 29, 44, 30, 45, 31, 46, 0, 47)
	S8(B(36), B(58), B(46), B(52))
	xor_B_KS_p_special(63, 48, 32, 49, 33, 50, 34, 51, 35, 52, 53)
	S1(B(8), B(16), B(22), B(30))
	xor_B_KS_p(35, 54, 36, 55, 37, 56, 38, 57, 39, 58, 40, 59)
	S2(B(12), B(27), B(1), B(17))
	xor_B_KS_p(39, 60, 40, 61, 41, 62, 42, 63, 43, 64, 44, 65)
	S3(B(23), B(15), B(29), B(5))
	xor_B_KS_p(43, 66, 44, 67, 45, 68, 46, 69, 47, 70, 48, 71)
	S4(B(25), B(19), B(9), B(0))
	xor_B_KS_p(47, 72, 48, 73, 49, 74, 50, 75, 51, 76, 52, 77)
	S5(B(7), B(13), B(24), B(2))
	xor_B_KS_p(51, 78, 52, 79, 53, 80, 54, 81, 55, 82, 56, 83)
	S6(B(3), B(28), B(10), B(18))
	xor_B_KS_p(55, 84, 56, 85, 57, 86, 58, 87, 59, 88, 60, 89)
	S7(B(31), B(11), B(21), B(6))
	xor_B_KS_p(59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 32, 95)
	addq $nptr(96),k_ptr		/* 96 subkey-bit pointers per 2 rounds */
	S8(B(4), B(26), B(14), B(20))
	subl $1,rounds
	jnz DES_bs_crypt_LM_loop
	xchgq %r8,%rax			/* return the value read from (ARG1) */
	EPILOGUE
	ret
1445
/* Round-pair loop counter (identical redefinition is benign) */
#define rounds				%eax

DO_ALIGN(6)
.globl DES_bs_crypt_plain
/*
 * DES_bs_crypt_plain: single (16-round) DES encryption with no salt
 * (fixed E expansion) of the caller-provided bitslice plaintext in
 * P(0..63).  Keys are finalized from DES_bs_all_xkeys first (bits 0-6
 * of each of 8 key bytes).  The ciphertext is left in B(0..63).
 */
DES_bs_crypt_plain:
	PROLOGUE
	movdqa mask01,%xmm7		/* per-bit masks for the key transpose */
	movdqa mask02,%xmm8
	leaq DES_bs_all_xkeys(%rip),v_ptr
	movdqa mask04,%xmm9
	movdqa mask08,%xmm10
	leaq DES_bs_all_K(%rip),k_ptr
	movdqa mask10,%xmm11
	movdqa mask20,%xmm12
	movl $8,iterations		/* 8 key bytes, 7 bits each */
	movdqa mask40,%xmm13
DES_bs_finalize_keys_plain_loop:
	FINALIZE_NEXT_KEY_BITS_0_6
	addq $nvec(7),k_ptr		/* 7 bit vectors produced per key byte */
	addq $nvec(8),v_ptr
	subl $1,iterations
	jnz DES_bs_finalize_keys_plain_loop
	leaq DES_bs_all_KS_p(%rip),k_ptr
	leaq DES_bs_all_KS_v(%rip),v_ptr

/*
 * Copy the 64 plaintext bit vectors P(0..63) into the working block
 * B(0..63), staged through xmm4 (movdqa assumes both arrays are
 * 16-byte aligned).
 */
	movdqa P(0),%xmm4
	movdqa %xmm4,B(0)
	movdqa P(1),%xmm4
	movdqa %xmm4,B(1)
	movdqa P(2),%xmm4
	movdqa %xmm4,B(2)
	movdqa P(3),%xmm4
	movdqa %xmm4,B(3)
	movdqa P(4),%xmm4
	movdqa %xmm4,B(4)
	movdqa P(5),%xmm4
	movdqa %xmm4,B(5)
	movdqa P(6),%xmm4
	movdqa %xmm4,B(6)
	movdqa P(7),%xmm4
	movdqa %xmm4,B(7)
	movdqa P(8),%xmm4
	movdqa %xmm4,B(8)
	movdqa P(9),%xmm4
	movdqa %xmm4,B(9)
	movdqa P(10),%xmm4
	movdqa %xmm4,B(10)
	movdqa P(11),%xmm4
	movdqa %xmm4,B(11)
	movdqa P(12),%xmm4
	movdqa %xmm4,B(12)
	movdqa P(13),%xmm4
	movdqa %xmm4,B(13)
	movdqa P(14),%xmm4
	movdqa %xmm4,B(14)
	movdqa P(15),%xmm4
	movdqa %xmm4,B(15)
	movdqa P(16),%xmm4
	movdqa %xmm4,B(16)
	movdqa P(17),%xmm4
	movdqa %xmm4,B(17)
	movdqa P(18),%xmm4
	movdqa %xmm4,B(18)
	movdqa P(19),%xmm4
	movdqa %xmm4,B(19)
	movdqa P(20),%xmm4
	movdqa %xmm4,B(20)
	movdqa P(21),%xmm4
	movdqa %xmm4,B(21)
	movdqa P(22),%xmm4
	movdqa %xmm4,B(22)
	movdqa P(23),%xmm4
	movdqa %xmm4,B(23)
	movdqa P(24),%xmm4
	movdqa %xmm4,B(24)
	movdqa P(25),%xmm4
	movdqa %xmm4,B(25)
	movdqa P(26),%xmm4
	movdqa %xmm4,B(26)
	movdqa P(27),%xmm4
	movdqa %xmm4,B(27)
	movdqa P(28),%xmm4
	movdqa %xmm4,B(28)
	movdqa P(29),%xmm4
	movdqa %xmm4,B(29)
	movdqa P(30),%xmm4
	movdqa %xmm4,B(30)
	movdqa P(31),%xmm4
	movdqa %xmm4,B(31)
	movdqa P(32),%xmm4
	movdqa %xmm4,B(32)
	movdqa P(33),%xmm4
	movdqa %xmm4,B(33)
	movdqa P(34),%xmm4
	movdqa %xmm4,B(34)
	movdqa P(35),%xmm4
	movdqa %xmm4,B(35)
	movdqa P(36),%xmm4
	movdqa %xmm4,B(36)
	movdqa P(37),%xmm4
	movdqa %xmm4,B(37)
	movdqa P(38),%xmm4
	movdqa %xmm4,B(38)
	movdqa P(39),%xmm4
	movdqa %xmm4,B(39)
	movdqa P(40),%xmm4
	movdqa %xmm4,B(40)
	movdqa P(41),%xmm4
	movdqa %xmm4,B(41)
	movdqa P(42),%xmm4
	movdqa %xmm4,B(42)
	movdqa P(43),%xmm4
	movdqa %xmm4,B(43)
	movdqa P(44),%xmm4
	movdqa %xmm4,B(44)
	movdqa P(45),%xmm4
	movdqa %xmm4,B(45)
	movdqa P(46),%xmm4
	movdqa %xmm4,B(46)
	movdqa P(47),%xmm4
	movdqa %xmm4,B(47)
	movdqa P(48),%xmm4
	movdqa %xmm4,B(48)
	movdqa P(49),%xmm4
	movdqa %xmm4,B(49)
	movdqa P(50),%xmm4
	movdqa %xmm4,B(50)
	movdqa P(51),%xmm4
	movdqa %xmm4,B(51)
	movdqa P(52),%xmm4
	movdqa %xmm4,B(52)
	movdqa P(53),%xmm4
	movdqa %xmm4,B(53)
	movdqa P(54),%xmm4
	movdqa %xmm4,B(54)
	movdqa P(55),%xmm4
	movdqa %xmm4,B(55)
	movdqa P(56),%xmm4
	movdqa %xmm4,B(56)
	movdqa P(57),%xmm4
	movdqa %xmm4,B(57)
	movdqa P(58),%xmm4
	movdqa %xmm4,B(58)
	movdqa P(59),%xmm4
	movdqa %xmm4,B(59)
	movdqa P(60),%xmm4
	movdqa %xmm4,B(60)
	movdqa P(61),%xmm4
	movdqa %xmm4,B(61)
	movdqa P(62),%xmm4
	movdqa %xmm4,B(62)
	movdqa P(63),%xmm4
	movdqa %xmm4,B(63)
	movl $8,rounds			/* 8 loop iterations x 2 DES rounds = 16 */
DES_bs_crypt_plain_loop:
/*
 * One pair of DES rounds; xor_B_KS_p() XORs subkey bits (via the
 * precomputed pointer schedule KS_p) into the expanded block bits ahead
 * of each bitslice S-box.  Unlike the LM variant, no _special forms are
 * needed here.
 */
	xor_B_KS_p(31, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5)
	S1(B(40), B(48), B(54), B(62))
	xor_B_KS_p(3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11)
	S2(B(44), B(59), B(33), B(49))
	xor_B_KS_p(7, 12, 8, 13, 9, 14, 10, 15, 11, 16, 12, 17)
	S3(B(55), B(47), B(61), B(37))
	xor_B_KS_p(11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 23)
	S4(B(57), B(51), B(41), B(32))
	xor_B_KS_p(15, 24, 16, 25, 17, 26, 18, 27, 19, 28, 20, 29)
	S5(B(39), B(45), B(56), B(34))
	xor_B_KS_p(19, 30, 20, 31, 21, 32, 22, 33, 23, 34, 24, 35)
	S6(B(35), B(60), B(42), B(50))
	xor_B_KS_p(23, 36, 24, 37, 25, 38, 26, 39, 27, 40, 28, 41)
	S7(B(63), B(43), B(53), B(38))
	xor_B_KS_p(27, 42, 28, 43, 29, 44, 30, 45, 31, 46, 0, 47)
	S8(B(36), B(58), B(46), B(52))
	xor_B_KS_p(63, 48, 32, 49, 33, 50, 34, 51, 35, 52, 36, 53)
	S1(B(8), B(16), B(22), B(30))
	xor_B_KS_p(35, 54, 36, 55, 37, 56, 38, 57, 39, 58, 40, 59)
	S2(B(12), B(27), B(1), B(17))
	xor_B_KS_p(39, 60, 40, 61, 41, 62, 42, 63, 43, 64, 44, 65)
	S3(B(23), B(15), B(29), B(5))
	xor_B_KS_p(43, 66, 44, 67, 45, 68, 46, 69, 47, 70, 48, 71)
	S4(B(25), B(19), B(9), B(0))
	xor_B_KS_p(47, 72, 48, 73, 49, 74, 50, 75, 51, 76, 52, 77)
	S5(B(7), B(13), B(24), B(2))
	xor_B_KS_p(51, 78, 52, 79, 53, 80, 54, 81, 55, 82, 56, 83)
	S6(B(3), B(28), B(10), B(18))
	xor_B_KS_p(55, 84, 56, 85, 57, 86, 58, 87, 59, 88, 60, 89)
	S7(B(31), B(11), B(21), B(6))
	xor_B_KS_p(59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 32, 95)
	addq $nptr(96),k_ptr		/* 96 subkey-bit pointers per 2 rounds */
	S8(B(4), B(26), B(14), B(20))
	subl $1,rounds
	jnz DES_bs_crypt_plain_loop
	EPILOGUE
	ret
1638#endif
1639
#if CPU_REQ
/*
 * CPU detection.
 */

/* Leaf 1 (CPUID eax=1, feature bits returned in %ecx) */
#define CF_SSSE3                $0x00000200 /* SSSE3 */
#define CF_SSE4_1               $0x00080200 /* SSE4.1 + SSSE3 */
#define CF_SSE4_2               $0x00180200 /* SSE4.2 + SSE4.1 + SSSE3 */
#define CF_AVX                  $0x1C000000 /* AVX + XSAVE + OSXSAVE */

/* Extended features (CPUID eax=0x80000001, feature bits in %ecx) */
#define CX_XOP                  $0x00000800 /* AMD XOP */

/* Leaf 7 (CPUID eax=7, ecx=0, feature bits in %ebx) */
#define C7_AVX2                 $0x00000020 /* AVX2 */
#define C7_AVX512F              $0x00010000 /* AVX512F */
#define C7_AVX512BW             $0x40010000 /* AVX512BW + AVX512F */

.text

/* Some ABIs mangle C symbols with a leading underscore */
#ifdef UNDERSCORES
#define CPU_req_name _CPU_req_name
#define CPU_detect _CPU_detect
#endif
/*
 * Exported NUL-terminated name of the required CPU feature set.
 * NOTE(review): CPU_NAME is presumably defined via arch.h / build
 * flags -- confirm there.
 */
.globl CPU_req_name
CPU_req_name:
	.asciz CPU_NAME
1668
.globl CPU_detect
/*
 * int CPU_detect(void)
 *
 * Verify that the CPU (and, for AVX-class features, the OS) supports
 * everything this build requires.  Returns 1 in %eax on success, 0 on
 * failure.  %rbx is saved/restored (cpuid clobbers it and it is
 * callee-saved in both SysV and Win64 ABIs); %rcx and %rdx are
 * clobbered, which is fine in both ABIs.
 */
CPU_detect:
	pushq %rbx			/* cpuid clobbers callee-saved %rbx */

/* First, leaf 1 checks */
	movl $1,%eax
	cpuid
#if CPU_REQ_AVX2 || CPU_REQ_AVX || CPU_REQ_XOP
	andl CF_AVX,%ecx		/* need AVX + XSAVE + OSXSAVE */
	cmpl CF_AVX,%ecx
	jne CPU_detect_fail
#elif CPU_REQ_SSE4_2
	andl CF_SSE4_2,%ecx
	cmpl CF_SSE4_2,%ecx
	jne CPU_detect_fail
#elif CPU_REQ_SSE4_1
	andl CF_SSE4_1,%ecx
	cmpl CF_SSE4_1,%ecx
	jne CPU_detect_fail
#elif CPU_REQ_SSSE3
	andl CF_SSSE3,%ecx
	cmpl CF_SSSE3,%ecx
	jne CPU_detect_fail
#endif

#if CPU_REQ_AVX2 || CPU_REQ_AVX || CPU_REQ_XOP
/* Check that xmm and ymm state is preserved on a context switch */
	xorl %ecx,%ecx			/* XCR0 */
	xgetbv
	andb $0x6,%al			/* XCR0 bits 1-2: SSE and AVX state */
	cmpb $0x6,%al
	jne CPU_detect_fail
#endif

/* Extended feature tests (if required) */
#if CPU_REQ_XOP
	movl $0x80000000,%eax
	cpuid				/* %eax = highest extended leaf */
/*
 * Leaf numbers are unsigned, so compare unsigned (jb, not jl): a CPU
 * without extended leaves can return a small value here, which a signed
 * comparison against 0x80000001 would wrongly accept.
 */
	movl $0x80000001,%edx
	cmpl %edx,%eax
	jb CPU_detect_fail
	xchgl %edx,%eax
	cpuid
	testl CX_XOP,%ecx
	jz CPU_detect_fail
#endif

/* Finally, leaf 7 tests (if required) */
#if CPU_REQ_AVX2 || CPU_REQ_AVX512F || CPU_REQ_AVX512BW
	xorl %eax,%eax
	cpuid				/* %eax = highest basic leaf */
	movl $7,%edx
	cmpl %edx,%eax
	jb CPU_detect_fail		/* unsigned compare, as above */
	xchgl %edx,%eax
	xorl %ecx,%ecx			/* leaf 7, subleaf 0 */
	cpuid
#if CPU_REQ_AVX512BW
	andl C7_AVX512BW,%ebx
	cmpl C7_AVX512BW,%ebx
	jne CPU_detect_fail
#elif CPU_REQ_AVX512F
	andl C7_AVX512F,%ebx
	cmpl C7_AVX512F,%ebx
	jne CPU_detect_fail
#elif CPU_REQ_AVX2
	andl C7_AVX2,%ebx
	cmpl C7_AVX2,%ebx
	jne CPU_detect_fail
#endif
#endif

/* If we reached here all is fine and we return 1 */
	movl $1,%eax
	popq %rbx
	ret

/* Return 0 */
CPU_detect_fail:
	xorl %eax,%eax
	popq %rbx
	ret
1751#endif
1752
1753#if defined(__ELF__) && defined(__linux__)
1754.section .note.GNU-stack,"",@progbits
1755#endif
1756