/*
 * This file contains the core of a bitslice DES implementation for x86/SSE2.
 * It is part of John the Ripper password cracker,
 * Copyright (c) 2000-2001,2005,2006,2008,2011,2012,2015 by Solar Designer
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted.  (This is a heavily cut-down "BSD license".)
 *
 * Gate counts per S-box: 49 44 46 33 48 46 46 41
 * Average: 44.125
 *
 * The Boolean expressions corresponding to DES S-boxes have been generated
 * by Roman Rusakov <roman_rus at openwall.com> for use in Openwall's
 * John the Ripper password cracker: http://www.openwall.com/john/
 * Being mathematical formulas, they are not copyrighted and are free for
 * reuse by anyone.
 *
 * The x86/SSE2 code for the S-boxes was generated by Solar Designer using a
 * Perl script and then hand-optimized - originally for MMX, then converted
 * to SSE2.  Instruction scheduling has not yet been re-done for
 * SSE2-capable CPUs; doing so may provide a further speedup.
 *
 * The effort has been sponsored by Rapid7: http://www.rapid7.com
 *
 * ...with changes in the jumbo patch, by Alain Espinosa (starting with a
 * comment further down this file) and magnum.
 *
 * Addition of single DES encryption with no salt by Deepika Dutta Mishra
 * <dipikadutta at gmail.com> in 2012, no rights reserved.
 */

#include "arch.h"

/*
 * Some broken systems don't offer section alignments larger than 4 bytes,
 * while for the SSE code we need at least a 16-byte alignment.  ALIGN_FIX
 * is here to work around this issue when we happen to get bad addresses.
 */
#ifndef ALIGN_FIX
#ifdef ALIGN_LOG
#define DO_ALIGN(log)			.align log
#else
#define DO_ALIGN(log)			.align 1 << log
#endif
#else
#ifdef ALIGN_LOG
#define DO_ALIGN(log)			.align log; .space ALIGN_FIX
#else
#define DO_ALIGN(log)			.align 1 << log; .space ALIGN_FIX
#endif
#endif
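
/*
 * Note: with ALIGN_LOG defined, .align is assumed to take the alignment as
 * a power of two (so DO_ALIGN(6) requests 2^6 = 64 bytes); otherwise .align
 * takes a byte count and the shift computes it.  Which form applies depends
 * on the assembler and target.
 */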

#if DES_BS_ASM

#ifdef UNDERSCORES
#define DES_bs_all			_DES_bs_all
#define DES_bs_init_asm			_DES_bs_init_asm
#define DES_bs_crypt			_DES_bs_crypt
#define DES_bs_crypt_25			_DES_bs_crypt_25
#define DES_bs_crypt_LM			_DES_bs_crypt_LM
#define DES_bs_crypt_plain		_DES_bs_crypt_plain
#define DES_bs_P			_DES_bs_P
#endif

#ifdef __sun
/* Sun's assembler doesn't recognize .space */
#define DO_SPACE(size)			.zero size
#else
/* Mac OS X assembler doesn't recognize .zero */
#define DO_SPACE(size)			.space size
#endif

/* Sun's assembler can't multiply, but at least it can add... */
#define nptr(n)				n+n+n+n
#define nvec(n)				n+n+n+n+n+n+n+n+n+n+n+n+n+n+n+n
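
/* nptr(n) is n * 4, the size of n 32-bit pointers; nvec(n) is n * 16, the
   size of n 128-bit SSE2 vectors. */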

#ifdef BSD
.data
#else
.bss
#endif

.globl DES_bs_P
DO_ALIGN(6)
DES_bs_P:
DO_SPACE(nvec(64))

.globl DES_bs_all
DO_ALIGN(6)
DES_bs_all:
DES_bs_all_KSp:
DO_SPACE(nptr(0x300))
DES_bs_all_KS_p:
DES_bs_all_KS_v:
DO_SPACE(nvec(0x300))
DES_bs_all_E:
DO_SPACE(nptr(96))
DES_bs_all_K:
DO_SPACE(nvec(56))
DES_bs_all_B:
DO_SPACE(nvec(64))
DES_bs_all_tmp:
DO_SPACE(nvec(16))
DES_bs_all_xkeys:
DO_SPACE(nvec(64))
DES_bs_all_pxkeys:
DO_SPACE(nptr(128))
DES_bs_all_keys_changed:
DO_SPACE(4)
DES_bs_all_salt:
DO_SPACE(4)
DES_bs_all_Ens:
DO_SPACE(nptr(48))

#define E(i)				DES_bs_all_E+nptr(i)
#define B(i)				DES_bs_all_B+nvec(i)
#define tmp_at(i)			DES_bs_all_tmp+nvec(i)
#define P(i)				DES_bs_P+nvec(i)
#define pnot				tmp_at(0)

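/*
 * The S-box macros below expect their six inputs in %xmm0..%xmm5 (a1..a6)
 * and XOR their four results into the memory operands out1..out4.  tmp_at()
 * slots serve as spill space, and pnot (set up by DES_bs_init_asm) holds
 * all-ones for computing complements with pxor.
 */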
#define S1(out1, out2, out3, out4) \
	movdqa %xmm0,tmp_at(1); \
	movdqa %xmm5,%xmm7; \
	movdqa %xmm4,tmp_at(4); \
	movdqa %xmm2,%xmm6; \
	movdqa %xmm1,tmp_at(2); \
	por %xmm2,%xmm7; \
	movdqa %xmm3,tmp_at(3); \
	pxor %xmm0,%xmm6; \
	movdqa %xmm7,tmp_at(5); \
	movdqa %xmm6,%xmm1; \
	pandn %xmm0,%xmm4; \
	pand %xmm7,%xmm1; \
	movdqa %xmm1,%xmm7; \
	por %xmm5,%xmm7; \
	pxor %xmm3,%xmm1; \
	pxor %xmm4,%xmm3; \
	movdqa %xmm1,tmp_at(6); \
	movdqa %xmm3,%xmm1; \
	pandn tmp_at(6),%xmm3; \
	movdqa %xmm3,tmp_at(7); \
	movdqa %xmm5,%xmm3; \
	por %xmm0,%xmm5; \
	pxor tmp_at(4),%xmm3; \
	movdqa %xmm3,tmp_at(8); \
	movdqa %xmm5,%xmm0; \
	pandn %xmm3,%xmm6; \
	pxor %xmm2,%xmm3; \
	pandn %xmm2,%xmm4; \
	pandn %xmm1,%xmm3; \
	pxor %xmm3,%xmm7; \
	movdqa tmp_at(7),%xmm3; \
	pandn tmp_at(3),%xmm5; \
	por %xmm7,%xmm0; \
	pandn %xmm7,%xmm3; \
	movdqa %xmm3,tmp_at(9); \
	pand tmp_at(5),%xmm7; \
	movdqa tmp_at(6),%xmm3; \
	movdqa %xmm0,%xmm2; \
	pxor %xmm1,%xmm2; \
	pandn tmp_at(4),%xmm3; \
	pandn %xmm2,%xmm4; \
	movdqa tmp_at(2),%xmm2; \
	pxor %xmm4,%xmm7; \
	pxor tmp_at(8),%xmm4; \
	pxor %xmm3,%xmm5; \
	por %xmm3,%xmm4; \
	pxor tmp_at(1),%xmm4; \
	pxor %xmm0,%xmm3; \
	pandn %xmm3,%xmm2; \
	pxor tmp_at(5),%xmm0; \
	movdqa tmp_at(7),%xmm3; \
	por tmp_at(2),%xmm3; \
	pxor pnot,%xmm7; \
	pxor out1,%xmm3; \
	pxor %xmm7,%xmm2; \
	pxor tmp_at(5),%xmm4; \
	pxor out3,%xmm2; \
	pxor %xmm4,%xmm7; \
	pxor %xmm7,%xmm3; \
	movdqa %xmm3,out1; \
	por %xmm6,%xmm5; \
	por tmp_at(8),%xmm7; \
	por %xmm5,%xmm0; \
	pxor out2,%xmm7; \
	pxor %xmm4,%xmm0; \
	pxor %xmm0,%xmm7; \
	por tmp_at(4),%xmm1; \
	movdqa tmp_at(2),%xmm3; \
	pand tmp_at(9),%xmm4; \
	pandn %xmm1,%xmm0; \
	pxor %xmm0,%xmm4; \
	por tmp_at(9),%xmm3; \
	por tmp_at(2),%xmm4; \
	movdqa %xmm2,out3; \
	pxor %xmm3,%xmm7; \
	pxor %xmm5,%xmm4; \
	pxor out4,%xmm4; \
	movdqa %xmm7,out2; \
	movdqa %xmm4,out4

#define S2(out1, out2, out3, out4) \
	movdqa %xmm2,tmp_at(2); \
	movdqa %xmm1,tmp_at(1); \
	movdqa %xmm5,%xmm2; \
	movdqa %xmm4,tmp_at(4); \
	pandn %xmm0,%xmm2; \
	movdqa %xmm3,tmp_at(3); \
	pandn %xmm4,%xmm2; \
	movdqa %xmm0,%xmm6; \
	movdqa %xmm2,%xmm7; \
	pxor pnot,%xmm0; \
	por %xmm1,%xmm7; \
	pxor %xmm4,%xmm1; \
	movdqa %xmm7,tmp_at(5); \
	pand %xmm1,%xmm6; \
	movdqa %xmm5,%xmm7; \
	pxor %xmm4,%xmm6; \
	pandn %xmm1,%xmm7; \
	movdqa %xmm3,%xmm4; \
	pxor %xmm7,%xmm2; \
	pandn %xmm6,%xmm7; \
	pxor %xmm5,%xmm1; \
	movdqa %xmm7,tmp_at(7); \
	movdqa %xmm5,%xmm7; \
	pand tmp_at(2),%xmm5; \
	pand tmp_at(5),%xmm2; \
	movdqa %xmm5,tmp_at(8); \
	pandn %xmm2,%xmm5; \
	pand tmp_at(2),%xmm2; \
	movdqa tmp_at(8),%xmm7; \
	pandn tmp_at(3),%xmm5; \
	pandn %xmm1,%xmm7; \
	pxor %xmm2,%xmm0; \
	movdqa %xmm7,%xmm3; \
	pxor %xmm0,%xmm3; \
	pxor out2,%xmm5; \
	pandn tmp_at(1),%xmm7; \
	pxor %xmm6,%xmm7; \
	pxor %xmm3,%xmm5; \
	movdqa %xmm7,%xmm6; \
	movdqa %xmm5,out2; \
	movdqa tmp_at(7),%xmm5; \
	pandn tmp_at(5),%xmm4; \
	pandn %xmm0,%xmm6; \
	pxor tmp_at(5),%xmm3; \
	movdqa %xmm1,%xmm0; \
	pxor %xmm4,%xmm6; \
	pxor tmp_at(2),%xmm0; \
	pxor %xmm0,%xmm6; \
	movdqa %xmm0,%xmm4; \
	pxor out1,%xmm6; \
	pandn tmp_at(1),%xmm0; \
	pxor tmp_at(4),%xmm2; \
	pxor %xmm3,%xmm0; \
	movdqa %xmm6,out1; \
	por %xmm1,%xmm3; \
	por tmp_at(8),%xmm0; \
	pxor %xmm4,%xmm0; \
	movdqa %xmm0,%xmm4; \
	pandn tmp_at(2),%xmm0; \
	movdqa tmp_at(3),%xmm6; \
	pxor tmp_at(7),%xmm0; \
	por %xmm7,%xmm0; \
	por %xmm6,%xmm5; \
	pxor %xmm0,%xmm2; \
	pandn %xmm2,%xmm7; \
	por %xmm2,%xmm6; \
	pxor out4,%xmm7; \
	pxor %xmm4,%xmm6; \
	pxor out3,%xmm6; \
	pxor %xmm5,%xmm7; \
	pxor %xmm3,%xmm7; \
	movdqa %xmm6,out3; \
	movdqa %xmm7,out4

#define S3(out1, out2, out3, out4) \
	movdqa %xmm0,tmp_at(1); \
	movdqa %xmm1,tmp_at(2); \
	movdqa %xmm0,%xmm7; \
	pandn %xmm0,%xmm1; \
	movdqa %xmm2,tmp_at(3); \
	movdqa %xmm5,%xmm0; \
	pxor %xmm2,%xmm0; \
	movdqa %xmm4,tmp_at(4); \
	movdqa %xmm5,%xmm2; \
	por %xmm0,%xmm1; \
	pxor %xmm3,%xmm2; \
	movdqa %xmm0,%xmm4; \
	movdqa %xmm5,%xmm6; \
	pandn %xmm2,%xmm7; \
	pxor tmp_at(2),%xmm4; \
	movdqa %xmm7,tmp_at(5); \
	pxor %xmm1,%xmm7; \
	pandn %xmm4,%xmm6; \
	movdqa %xmm7,tmp_at(6); \
	pxor %xmm6,%xmm1; \
	pand %xmm0,%xmm2; \
	movdqa %xmm1,%xmm6; \
	movdqa %xmm3,%xmm0; \
	pandn %xmm7,%xmm6; \
	pand %xmm5,%xmm7; \
	pand %xmm3,%xmm5; \
	por %xmm3,%xmm7; \
	pand tmp_at(1),%xmm7; \
	movdqa tmp_at(4),%xmm3; \
	pandn tmp_at(6),%xmm3; \
	pxor %xmm4,%xmm7; \
	pxor tmp_at(1),%xmm0; \
	movdqa %xmm7,tmp_at(7); \
	pxor %xmm3,%xmm7; \
	movdqa tmp_at(2),%xmm3; \
	pxor out4,%xmm7; \
	pxor %xmm0,%xmm1; \
	movdqa %xmm7,out4; \
	movdqa tmp_at(3),%xmm7; \
	por tmp_at(3),%xmm1; \
	pandn %xmm1,%xmm2; \
	por tmp_at(5),%xmm0; \
	movdqa %xmm0,%xmm1; \
	pandn %xmm5,%xmm3; \
	pandn tmp_at(7),%xmm1; \
	por %xmm4,%xmm5; \
	pxor %xmm3,%xmm1; \
	por tmp_at(2),%xmm7; \
	movdqa tmp_at(3),%xmm3; \
	pandn %xmm1,%xmm3; \
	pxor %xmm4,%xmm0; \
	pandn %xmm5,%xmm3; \
	movdqa tmp_at(4),%xmm5; \
	pxor tmp_at(1),%xmm3; \
	pand %xmm2,%xmm5; \
	pxor pnot,%xmm0; \
	pxor %xmm5,%xmm3; \
	movdqa %xmm7,%xmm5; \
	pxor out2,%xmm3; \
	pandn tmp_at(4),%xmm6; \
	pandn tmp_at(6),%xmm7; \
	pxor %xmm0,%xmm6; \
	movdqa %xmm3,out2; \
	pxor tmp_at(1),%xmm2; \
	por tmp_at(4),%xmm1; \
	por %xmm2,%xmm0; \
	pxor tmp_at(6),%xmm5; \
	pxor %xmm1,%xmm0; \
	pxor out1,%xmm6; \
	pxor out3,%xmm5; \
	pxor tmp_at(7),%xmm0; \
	pxor %xmm7,%xmm6; \
	pxor %xmm5,%xmm0; \
	movdqa %xmm6,out1; \
	movdqa %xmm0,out3

#define S4(out1, out2, out3, out4) \
	movdqa %xmm1,%xmm7; \
	pxor %xmm2,%xmm0; \
	por %xmm3,%xmm1; \
	pxor %xmm4,%xmm2; \
	movdqa %xmm5,tmp_at(2); \
	pxor %xmm4,%xmm1; \
	movdqa %xmm7,%xmm6; \
	movdqa %xmm7,%xmm5; \
	pandn %xmm2,%xmm7; \
	pandn %xmm2,%xmm1; \
	por %xmm7,%xmm4; \
	pxor %xmm3,%xmm7; \
	movdqa %xmm7,%xmm6; \
	por %xmm0,%xmm7; \
	pxor %xmm5,%xmm3; \
	movdqa %xmm1,tmp_at(3); \
	pandn %xmm7,%xmm1; \
	movdqa %xmm1,%xmm7; \
	pxor %xmm5,%xmm1; \
	pand %xmm1,%xmm6; \
	movdqa %xmm6,%xmm5; \
	pxor %xmm1,%xmm0; \
	pandn %xmm2,%xmm6; \
	pandn %xmm0,%xmm6; \
	pxor %xmm0,%xmm4; \
	movdqa %xmm3,%xmm0; \
	pandn %xmm4,%xmm3; \
	movdqa tmp_at(2),%xmm2; \
	pxor %xmm7,%xmm3; \
	pxor tmp_at(3),%xmm6; \
	movdqa %xmm6,%xmm7; \
	pandn %xmm2,%xmm6; \
	pxor out1,%xmm6; \
	pandn %xmm7,%xmm2; \
	pxor out2,%xmm2; \
	pxor %xmm3,%xmm6; \
	pxor pnot,%xmm3; \
	pxor %xmm3,%xmm2; \
	pxor %xmm7,%xmm3; \
	movdqa %xmm6,out1; \
	pandn %xmm3,%xmm0; \
	por %xmm5,%xmm0; \
	movdqa %xmm2,out2; \
	movdqa tmp_at(2),%xmm3; \
	por %xmm1,%xmm3; \
	pand tmp_at(2),%xmm1; \
	pxor %xmm4,%xmm0; \
	pxor %xmm0,%xmm3; \
	pxor out3,%xmm3; \
	pxor %xmm1,%xmm0; \
	movdqa %xmm3,out3; \
	pxor out4,%xmm0; \
	movdqa %xmm0,out4

#define S5(out1, out2, out3, out4) \
	movdqa %xmm2,tmp_at(3); \
	movdqa %xmm0,tmp_at(1); \
	por %xmm0,%xmm2; \
	movdqa %xmm5,%xmm6; \
	movdqa %xmm2,tmp_at(4); \
	pandn %xmm2,%xmm5; \
	movdqa %xmm2,%xmm7; \
	movdqa %xmm5,%xmm2; \
	pxor %xmm0,%xmm5; \
	movdqa %xmm3,%xmm7; \
	movdqa %xmm5,tmp_at(5); \
	pxor tmp_at(3),%xmm5; \
	movdqa %xmm1,tmp_at(2); \
	por %xmm5,%xmm0; \
	por %xmm3,%xmm5; \
	pandn %xmm2,%xmm3; \
	pxor tmp_at(3),%xmm3; \
	movdqa %xmm3,tmp_at(6); \
	movdqa %xmm0,%xmm1; \
	pand %xmm4,%xmm3; \
	pxor %xmm0,%xmm3; \
	pand %xmm7,%xmm0; \
	pxor %xmm7,%xmm3; \
	movdqa %xmm3,tmp_at(3); \
	pxor %xmm3,%xmm6; \
	movdqa %xmm6,%xmm2; \
	por tmp_at(5),%xmm6; \
	movdqa %xmm6,%xmm3; \
	pand %xmm4,%xmm6; \
	movdqa %xmm6,tmp_at(7); \
	pxor tmp_at(5),%xmm6; \
	pxor %xmm6,%xmm0; \
	movdqa tmp_at(1),%xmm6; \
	movdqa %xmm0,tmp_at(8); \
	pandn %xmm3,%xmm6; \
	movdqa tmp_at(2),%xmm0; \
	movdqa %xmm6,%xmm3; \
	pxor tmp_at(6),%xmm6; \
	pxor %xmm5,%xmm4; \
	pandn %xmm4,%xmm6; \
	pxor pnot,%xmm6; \
	pandn %xmm6,%xmm0; \
	pxor tmp_at(3),%xmm0; \
	movdqa tmp_at(7),%xmm6; \
	pandn tmp_at(6),%xmm6; \
	pxor out3,%xmm0; \
	pxor %xmm4,%xmm3; \
	movdqa %xmm0,out3; \
	por tmp_at(8),%xmm3; \
	movdqa tmp_at(6),%xmm0; \
	pandn %xmm3,%xmm6; \
	pand tmp_at(6),%xmm1; \
	pand %xmm6,%xmm2; \
	movdqa %xmm6,%xmm3; \
	pandn %xmm5,%xmm6; \
	pxor %xmm4,%xmm2; \
	por %xmm2,%xmm1; \
	pxor tmp_at(4),%xmm3; \
	pxor tmp_at(7),%xmm1; \
	pand %xmm2,%xmm7; \
	pand tmp_at(2),%xmm1; \
	pxor tmp_at(1),%xmm7; \
	pxor tmp_at(8),%xmm1; \
	pxor %xmm7,%xmm3; \
	por tmp_at(2),%xmm6; \
	pxor out4,%xmm1; \
	movdqa %xmm1,out4; \
	pxor %xmm5,%xmm0; \
	pxor tmp_at(5),%xmm2; \
	pxor %xmm3,%xmm6; \
	pandn %xmm0,%xmm3; \
	pand tmp_at(2),%xmm5; \
	pxor %xmm2,%xmm3; \
	pxor out2,%xmm5; \
	pxor %xmm5,%xmm3; \
	pxor out1,%xmm6; \
	movdqa %xmm3,out2; \
	movdqa %xmm6,out1

#define S6(out1, out2, out3, out4) \
	movdqa %xmm4,tmp_at(2); \
	pxor %xmm1,%xmm4; \
	movdqa %xmm5,tmp_at(3); \
	por %xmm1,%xmm5; \
	movdqa %xmm2,%xmm7; \
	pand %xmm0,%xmm5; \
	pxor %xmm0,%xmm2; \
	movdqa %xmm0,tmp_at(1); \
	pxor %xmm5,%xmm4; \
	movdqa %xmm4,tmp_at(4); \
	pxor tmp_at(3),%xmm4; \
	movdqa %xmm4,%xmm6; \
	pandn tmp_at(2),%xmm4; \
	pand %xmm0,%xmm6; \
	movdqa %xmm6,tmp_at(5); \
	pxor %xmm1,%xmm6; \
	movdqa %xmm6,tmp_at(6); \
	por %xmm2,%xmm6; \
	movdqa %xmm6,tmp_at(7); \
	pxor tmp_at(4),%xmm6; \
	movdqa %xmm6,%xmm0; \
	pand %xmm7,%xmm6; \
	movdqa %xmm6,tmp_at(8); \
	movdqa tmp_at(3),%xmm6; \
	por %xmm1,%xmm2; \
	pandn tmp_at(8),%xmm6; \
	movdqa %xmm6,tmp_at(9); \
	movdqa tmp_at(6),%xmm6; \
	por %xmm4,%xmm6; \
	movdqa %xmm6,tmp_at(6); \
	pxor tmp_at(9),%xmm6; \
	movdqa %xmm6,tmp_at(10); \
	pand %xmm3,%xmm6; \
	pxor out4,%xmm6; \
	pxor %xmm0,%xmm6; \
	por tmp_at(1),%xmm0; \
	movdqa %xmm6,out4; \
	movdqa tmp_at(7),%xmm6; \
	pxor %xmm1,%xmm6; \
	movdqa %xmm3,%xmm1; \
	movdqa %xmm6,tmp_at(7); \
	pandn tmp_at(3),%xmm6; \
	pxor %xmm7,%xmm6; \
	movdqa tmp_at(8),%xmm7; \
	movdqa %xmm6,tmp_at(12); \
	pandn tmp_at(2),%xmm7; \
	pand tmp_at(6),%xmm0; \
	por %xmm6,%xmm7; \
	pxor %xmm6,%xmm0; \
	movdqa tmp_at(9),%xmm6; \
	por %xmm3,%xmm4; \
	pandn %xmm0,%xmm6; \
	por %xmm7,%xmm5; \
	pxor %xmm4,%xmm6; \
	pxor tmp_at(4),%xmm0; \
	pxor out3,%xmm6; \
	pxor %xmm2,%xmm5; \
	movdqa %xmm6,out3; \
	movdqa tmp_at(5),%xmm6; \
	pandn tmp_at(2),%xmm0; \
	pxor pnot,%xmm2; \
	pxor tmp_at(7),%xmm2; \
	pxor tmp_at(3),%xmm6; \
	pxor out2,%xmm5; \
	movdqa tmp_at(12),%xmm4; \
	pxor %xmm2,%xmm0; \
	pxor tmp_at(1),%xmm4; \
	pxor tmp_at(10),%xmm5; \
	pand %xmm6,%xmm4; \
	pandn %xmm0,%xmm3; \
	pxor out1,%xmm4; \
	pandn %xmm7,%xmm1; \
	pxor tmp_at(8),%xmm4; \
	pxor %xmm2,%xmm1; \
	pxor %xmm3,%xmm5; \
	movdqa %xmm5,out2; \
	pxor %xmm1,%xmm4; \
	movdqa %xmm4,out1

#define S7(out1, out2, out3, out4) \
	movdqa %xmm0,tmp_at(1); \
	movdqa %xmm4,tmp_at(3); \
	movdqa %xmm4,%xmm0; \
	pxor %xmm3,%xmm4; \
	movdqa %xmm5,tmp_at(4); \
	movdqa %xmm4,%xmm7; \
	movdqa %xmm3,tmp_at(2); \
	pxor %xmm2,%xmm4; \
	movdqa %xmm4,tmp_at(5); \
	pand %xmm5,%xmm4; \
	movdqa %xmm7,%xmm5; \
	pxor tmp_at(4),%xmm5; \
	pand %xmm3,%xmm7; \
	movdqa %xmm7,tmp_at(6); \
	movdqa %xmm7,%xmm6; \
	pxor %xmm1,%xmm7; \
	pand tmp_at(4),%xmm6; \
	pxor %xmm2,%xmm6; \
	movdqa %xmm7,tmp_at(7); \
	movdqa tmp_at(1),%xmm3; \
	movdqa %xmm6,%xmm0; \
	por %xmm7,%xmm6; \
	pand %xmm4,%xmm7; \
	pxor %xmm5,%xmm6; \
	pandn %xmm3,%xmm7; \
	pxor %xmm4,%xmm0; \
	pxor out4,%xmm7; \
	pxor %xmm5,%xmm4; \
	pxor %xmm6,%xmm7; \
	movdqa %xmm7,out4; \
	pandn tmp_at(2),%xmm4; \
	por tmp_at(6),%xmm6; \
	movdqa tmp_at(5),%xmm7; \
	pandn tmp_at(3),%xmm7; \
	pandn tmp_at(7),%xmm4; \
	movdqa %xmm7,tmp_at(9); \
	por tmp_at(7),%xmm7; \
	pandn tmp_at(5),%xmm5; \
	pxor %xmm0,%xmm7; \
	pxor tmp_at(3),%xmm0; \
	pxor %xmm4,%xmm0; \
	movdqa tmp_at(1),%xmm4; \
	pand %xmm0,%xmm2; \
	por %xmm2,%xmm6; \
	pxor %xmm5,%xmm6; \
	pandn %xmm6,%xmm3; \
	movdqa %xmm6,%xmm5; \
	pxor %xmm7,%xmm3; \
	pxor %xmm6,%xmm7; \
	por %xmm0,%xmm6; \
	pxor out1,%xmm3; \
	pand tmp_at(4),%xmm6; \
	pxor pnot,%xmm5; \
	pand %xmm6,%xmm1; \
	pxor out3,%xmm0; \
	pxor %xmm7,%xmm1; \
	movdqa %xmm3,out1; \
	movdqa %xmm4,%xmm3; \
	pxor tmp_at(3),%xmm7; \
	por %xmm1,%xmm2; \
	pxor %xmm6,%xmm2; \
	por %xmm2,%xmm7; \
	pand %xmm7,%xmm4; \
	pxor %xmm6,%xmm7; \
	por tmp_at(9),%xmm7; \
	pxor %xmm5,%xmm7; \
	pxor out2,%xmm1; \
	pandn %xmm7,%xmm3; \
	pxor %xmm4,%xmm0; \
	movdqa %xmm0,out3; \
	pxor %xmm3,%xmm1; \
	movdqa %xmm1,out2

#define S8(out1, out2, out3, out4) \
	movdqa %xmm2,%xmm7; \
	movdqa %xmm1,tmp_at(1); \
	pandn %xmm2,%xmm1; \
	movdqa %xmm2,tmp_at(2); \
	pandn %xmm4,%xmm2; \
	movdqa %xmm3,tmp_at(3); \
	pxor %xmm3,%xmm2; \
	movdqa %xmm4,tmp_at(4); \
	movdqa %xmm1,%xmm3; \
	movdqa %xmm5,tmp_at(5); \
	movdqa %xmm2,%xmm4; \
	movdqa %xmm2,%xmm5; \
	pandn tmp_at(1),%xmm4; \
	pand %xmm0,%xmm2; \
	pandn tmp_at(1),%xmm7; \
	pandn %xmm2,%xmm1; \
	pxor tmp_at(4),%xmm7; \
	movdqa %xmm4,%xmm6; \
	por %xmm0,%xmm4; \
	movdqa %xmm7,tmp_at(6); \
	pand %xmm4,%xmm7; \
	pxor pnot,%xmm5; \
	por %xmm7,%xmm2; \
	pxor %xmm7,%xmm5; \
	pandn tmp_at(2),%xmm4; \
	movdqa tmp_at(5),%xmm7; \
	pxor %xmm4,%xmm5; \
	por %xmm1,%xmm7; \
	pxor %xmm5,%xmm3; \
	pxor %xmm3,%xmm7; \
	pxor %xmm0,%xmm3; \
	pxor out2,%xmm7; \
	movdqa %xmm7,out2; \
	pxor tmp_at(1),%xmm5; \
	movdqa %xmm3,%xmm4; \
	pand tmp_at(4),%xmm3; \
	pxor %xmm5,%xmm3; \
	por tmp_at(3),%xmm5; \
	pxor %xmm3,%xmm6; \
	pxor tmp_at(6),%xmm5; \
	pxor %xmm2,%xmm3; \
	pxor %xmm6,%xmm5; \
	por tmp_at(1),%xmm3; \
	pxor %xmm5,%xmm0; \
	pxor %xmm4,%xmm3; \
	por tmp_at(3),%xmm4; \
	pxor tmp_at(4),%xmm3; \
	pand tmp_at(5),%xmm2; \
	pandn %xmm3,%xmm4; \
	pand tmp_at(5),%xmm0; \
	pxor %xmm6,%xmm0; \
	por %xmm1,%xmm4; \
	pxor out4,%xmm0; \
	pxor %xmm4,%xmm5; \
	pxor out3,%xmm2; \
	por tmp_at(5),%xmm5; \
	pxor out1,%xmm5; \
	pxor %xmm3,%xmm2; \
	pxor %xmm6,%xmm5; \
	movdqa %xmm0,out4; \
	movdqa %xmm2,out3; \
	movdqa %xmm5,out1

#define a1				%xmm0
#define a2				%xmm1
#define a3				%xmm2
#define a4				%xmm3
#define a5				%xmm4
#define a6				%xmm5

#define zero				%xmm5

#define DES_bs_clear_block_8(i) \
	movdqa zero,B(i); \
	movdqa zero,B(i + 1); \
	movdqa zero,B(i + 2); \
	movdqa zero,B(i + 3); \
	movdqa zero,B(i + 4); \
	movdqa zero,B(i + 5); \
	movdqa zero,B(i + 6); \
	movdqa zero,B(i + 7)

#define DES_bs_clear_block \
	DES_bs_clear_block_8(0); \
	DES_bs_clear_block_8(8); \
	DES_bs_clear_block_8(16); \
	DES_bs_clear_block_8(24); \
	DES_bs_clear_block_8(32); \
	DES_bs_clear_block_8(40); \
	DES_bs_clear_block_8(48); \
	DES_bs_clear_block_8(56)
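
/* DES_bs_clear_block zeroes all 64 B() vectors; callers must first clear
   the "zero" register themselves (pxor zero,zero). */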

#define k_ptr				%edx
#define K(i)				nvec(i)(k_ptr)
#define k(i)				nptr(i)(k_ptr)

#define tmp1				%ecx
#define tmp2				%esi

#define xor_E(i) \
	movl E(i),tmp1; \
	movdqa K(i),a1; \
	movl E(i + 1),tmp2; \
	movdqa K(i + 1),a2; \
	pxor (tmp1),a1; \
	pxor (tmp2),a2; \
	movl E(i + 2),tmp1; \
	movdqa K(i + 2),a3; \
	movl E(i + 3),tmp2; \
	movdqa K(i + 3),a4; \
	pxor (tmp1),a3; \
	pxor (tmp2),a4; \
	movl E(i + 4),tmp1; \
	movdqa K(i + 4),a5; \
	movl E(i + 5),tmp2; \
	movdqa K(i + 5),a6; \
	pxor (tmp1),a5; \
	pxor (tmp2),a6
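
/* xor_E(i): form the six S-box inputs a1..a6 as K(i+j) XOR the expanded
   block bit vector addressed by the salt-dependent pointer at E(i+j). */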

#define xor_B(b1, k1, b2, k2, b3, k3, b4, k4, b5, k5, b6, k6) \
	movdqa B(b1),a1; \
	movdqa B(b2),a2; \
	pxor K(k1),a1; \
	movdqa B(b3),a3; \
	pxor K(k2),a2; \
	movdqa B(b4),a4; \
	pxor K(k3),a3; \
	movdqa B(b5),a5; \
	pxor K(k4),a4; \
	movdqa B(b6),a6; \
	pxor K(k5),a5; \
	pxor K(k6),a6
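
/* xor_B: like xor_E, but with fixed B() indices; usable where the expansion
   does not depend on the salt. */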

#define xor_B_KS_p_prefix(b1, k1, b2, k2, b3, k3, b4, k4, k6) \
	movl k(k1),tmp1; \
	movl k(k2),tmp2; \
	movdqa B(b1),a1; \
	movdqa B(b2),a2; \
	pxor (tmp1),a1; \
	movl k(k3),tmp1; \
	pxor (tmp2),a2; \
	movl k(k4),tmp2; \
	movdqa B(b3),a3; \
	movdqa B(b4),a4; \
	pxor (tmp1),a3; \
	movl k(k6),tmp1; \
	pxor (tmp2),a4

#define xor_B_KS_p_suffix(b5, k5) \
	movl k(k5),tmp2; \
	movdqa B(b5),a5; \
	pxor (tmp1),a6; \
	pxor (tmp2),a5

#define xor_B_KS_p(b1, k1, b2, k2, b3, k3, b4, k4, b5, k5, b6, k6) \
	xor_B_KS_p_prefix(b1, k1, b2, k2, b3, k3, b4, k4, k6); \
	movdqa B(b6),a6; \
	xor_B_KS_p_suffix(b5, k5)

#define xor_B_KS_p_special(b1, k1, b2, k2, b3, k3, b4, k4, b5, k5, k6) \
	xor_B_KS_p_prefix(b1, k1, b2, k2, b3, k3, b4, k4, k6); \
	xor_B_KS_p_suffix(b5, k5)
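
/* xor_B_KS_p: like xor_B, but the key bit vectors are reached via pointers
   from the KS_p key schedule.  The _special variant skips loading B(b6):
   at its call sites, a6 already holds that block vector, left there by the
   preceding S-box or block setup code. */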

#define mask01				tmp_at(15)

#define v_ptr				%eax
#define V(i)				nvec(i)(v_ptr)

#if 1
#define SHLB1(reg)			paddb reg,reg
#else
#define SHLB1(reg)			psllq $1,reg
#endif
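
/* SHLB1 shifts each byte left by one: SSE2 has no byte-granularity shift,
   so paddb reg,reg (doubling every byte) is used.  psllq $1 works here too,
   since the masked bits in these uses never cross byte boundaries. */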

#define FINALIZE_NEXT_KEY_BITS_0_6 \
	movdqa mask01,%xmm7; \
\
	movdqa V(0),%xmm0; \
	movdqa V(1),%xmm1; \
	movdqa V(2),%xmm2; \
	movdqa V(3),%xmm3; \
	pand %xmm7,%xmm0; \
	pand %xmm7,%xmm1; \
	pand %xmm7,%xmm2; \
	pand %xmm7,%xmm3; \
	SHLB1(%xmm1); \
	psllq $2,%xmm2; \
	psllq $3,%xmm3; \
	por %xmm0,%xmm1; \
	por %xmm2,%xmm3; \
	movdqa V(4),%xmm4; \
	movdqa V(5),%xmm5; \
	por %xmm1,%xmm3; \
	pand %xmm7,%xmm4; \
	pand %xmm7,%xmm5; \
	movdqa V(6),%xmm6; \
	movdqa V(7),%xmm0; \
	psllq $4,%xmm4; \
	pand %xmm7,%xmm6; \
	pand %xmm7,%xmm0; \
	psllq $5,%xmm5; \
	psllq $6,%xmm6; \
	psllq $7,%xmm0; \
	por %xmm4,%xmm5; \
	por %xmm6,%xmm3; \
	por %xmm5,%xmm0; \
	movdqa V(1),%xmm1; \
	por %xmm3,%xmm0; \
	SHLB1(%xmm7); \
	movdqa %xmm0,K(0); \
\
	movdqa V(0),%xmm0; \
	movdqa V(2),%xmm2; \
	movdqa V(3),%xmm3; \
	pand %xmm7,%xmm0; \
	pand %xmm7,%xmm1; \
	pand %xmm7,%xmm2; \
	pand %xmm7,%xmm3; \
	psrlq $1,%xmm0; \
	SHLB1(%xmm2); \
	psllq $2,%xmm3; \
	por %xmm0,%xmm1; \
	por %xmm2,%xmm3; \
	movdqa V(4),%xmm4; \
	movdqa V(5),%xmm5; \
	por %xmm1,%xmm3; \
	pand %xmm7,%xmm4; \
	pand %xmm7,%xmm5; \
	movdqa V(6),%xmm6; \
	movdqa V(7),%xmm0; \
	psllq $3,%xmm4; \
	pand %xmm7,%xmm6; \
	pand %xmm7,%xmm0; \
	psllq $4,%xmm5; \
	psllq $5,%xmm6; \
	psllq $6,%xmm0; \
	por %xmm4,%xmm5; \
	por %xmm6,%xmm3; \
	por %xmm5,%xmm0; \
	movdqa V(1),%xmm1; \
	por %xmm3,%xmm0; \
	SHLB1(%xmm7); \
	movdqa %xmm0,K(1); \
\
	movdqa V(0),%xmm0; \
	movdqa V(2),%xmm2; \
	movdqa V(3),%xmm3; \
	pand %xmm7,%xmm0; \
	pand %xmm7,%xmm1; \
	pand %xmm7,%xmm2; \
	pand %xmm7,%xmm3; \
	psrlq $2,%xmm0; \
	psrlq $1,%xmm1; \
	SHLB1(%xmm3); \
	por %xmm0,%xmm1; \
	por %xmm2,%xmm3; \
	movdqa V(4),%xmm4; \
	movdqa V(5),%xmm5; \
	por %xmm1,%xmm3; \
	pand %xmm7,%xmm4; \
	pand %xmm7,%xmm5; \
	movdqa V(6),%xmm6; \
	movdqa V(7),%xmm0; \
	psllq $2,%xmm4; \
	pand %xmm7,%xmm6; \
	pand %xmm7,%xmm0; \
	psllq $3,%xmm5; \
	psllq $4,%xmm6; \
	psllq $5,%xmm0; \
	por %xmm4,%xmm5; \
	por %xmm6,%xmm3; \
	por %xmm5,%xmm0; \
	movdqa V(1),%xmm1; \
	por %xmm3,%xmm0; \
	SHLB1(%xmm7); \
	movdqa %xmm0,K(2); \
\
	movdqa V(0),%xmm0; \
	movdqa V(2),%xmm2; \
	movdqa V(3),%xmm3; \
	pand %xmm7,%xmm0; \
	pand %xmm7,%xmm1; \
	pand %xmm7,%xmm2; \
	pand %xmm7,%xmm3; \
	psrlq $3,%xmm0; \
	psrlq $2,%xmm1; \
	psrlq $1,%xmm2; \
	por %xmm0,%xmm1; \
	por %xmm2,%xmm3; \
	movdqa V(4),%xmm4; \
	movdqa V(5),%xmm5; \
	por %xmm1,%xmm3; \
	pand %xmm7,%xmm4; \
	pand %xmm7,%xmm5; \
	movdqa V(6),%xmm6; \
	movdqa V(7),%xmm0; \
	SHLB1(%xmm4); \
	pand %xmm7,%xmm6; \
	pand %xmm7,%xmm0; \
	psllq $2,%xmm5; \
	psllq $3,%xmm6; \
	psllq $4,%xmm0; \
	por %xmm4,%xmm5; \
	por %xmm6,%xmm3; \
	por %xmm5,%xmm0; \
	movdqa V(1),%xmm1; \
	por %xmm3,%xmm0; \
	SHLB1(%xmm7); \
	movdqa %xmm0,K(3); \
\
	movdqa V(0),%xmm0; \
	movdqa V(2),%xmm2; \
	movdqa V(3),%xmm3; \
	pand %xmm7,%xmm0; \
	pand %xmm7,%xmm1; \
	pand %xmm7,%xmm2; \
	pand %xmm7,%xmm3; \
	psrlq $4,%xmm0; \
	psrlq $3,%xmm1; \
	psrlq $2,%xmm2; \
	psrlq $1,%xmm3; \
	por %xmm0,%xmm1; \
	por %xmm2,%xmm3; \
	movdqa V(4),%xmm4; \
	movdqa V(5),%xmm5; \
	por %xmm1,%xmm3; \
	pand %xmm7,%xmm4; \
	pand %xmm7,%xmm5; \
	movdqa V(6),%xmm6; \
	movdqa V(7),%xmm0; \
	pand %xmm7,%xmm6; \
	pand %xmm7,%xmm0; \
	SHLB1(%xmm5); \
	psllq $2,%xmm6; \
	psllq $3,%xmm0; \
	por %xmm4,%xmm5; \
	por %xmm6,%xmm3; \
	por %xmm5,%xmm0; \
	movdqa V(1),%xmm1; \
	por %xmm3,%xmm0; \
	SHLB1(%xmm7); \
	movdqa %xmm0,K(4); \
\
	movdqa V(0),%xmm0; \
	movdqa V(2),%xmm2; \
	movdqa V(3),%xmm3; \
	pand %xmm7,%xmm0; \
	pand %xmm7,%xmm1; \
	pand %xmm7,%xmm2; \
	pand %xmm7,%xmm3; \
	psrlq $5,%xmm0; \
	psrlq $4,%xmm1; \
	psrlq $3,%xmm2; \
	psrlq $2,%xmm3; \
	por %xmm0,%xmm1; \
	por %xmm2,%xmm3; \
	movdqa V(4),%xmm4; \
	movdqa V(5),%xmm5; \
	por %xmm1,%xmm3; \
	pand %xmm7,%xmm4; \
	pand %xmm7,%xmm5; \
	movdqa V(6),%xmm6; \
	movdqa V(7),%xmm0; \
	psrlq $1,%xmm4; \
	pand %xmm7,%xmm6; \
	pand %xmm7,%xmm0; \
	SHLB1(%xmm6); \
	psllq $2,%xmm0; \
	por %xmm4,%xmm5; \
	por %xmm6,%xmm3; \
	por %xmm5,%xmm0; \
	movdqa V(1),%xmm1; \
	por %xmm3,%xmm0; \
	SHLB1(%xmm7); \
	movdqa %xmm0,K(5); \
\
	movdqa V(0),%xmm0; \
	movdqa V(2),%xmm2; \
	movdqa V(3),%xmm3; \
	pand %xmm7,%xmm0; \
	pand %xmm7,%xmm1; \
	pand %xmm7,%xmm2; \
	pand %xmm7,%xmm3; \
	psrlq $6,%xmm0; \
	psrlq $5,%xmm1; \
	psrlq $4,%xmm2; \
	psrlq $3,%xmm3; \
	por %xmm0,%xmm1; \
	por %xmm2,%xmm3; \
	movdqa V(4),%xmm4; \
	movdqa V(5),%xmm5; \
	por %xmm1,%xmm3; \
	pand %xmm7,%xmm4; \
	pand %xmm7,%xmm5; \
	movdqa V(6),%xmm6; \
	movdqa V(7),%xmm0; \
	psrlq $2,%xmm4; \
	pand %xmm7,%xmm6; \
	pand %xmm7,%xmm0; \
	psrlq $1,%xmm5; \
	SHLB1(%xmm0); \
	por %xmm4,%xmm5; \
	por %xmm6,%xmm3; \
	por %xmm5,%xmm0; \
	por %xmm3,%xmm0; \
	movdqa %xmm0,K(6)
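
/*
 * FINALIZE_NEXT_KEY_BITS_0_6 transposes key material: from eight xkeys
 * vectors V(0..7), each holding one byte per key, it builds seven vectors
 * K(0..6), where K(b) collects bit b of the corresponding byte of every
 * V(i) (mask01, shifted left once per pass, selects the bit; the shifts
 * move it to lane i).  Bit 7 is unused for regular DES keys;
 * DES_bs_crypt_LM handles it separately.
 */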

.text

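/*
 * DES_bs_init_asm sets up the two constant vectors kept in the tmp area:
 * pnot = all ones (for complementing via pxor) and mask01 = 0x01 in every
 * byte (for the key bit transpose above).
 */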
DO_ALIGN(6)
.globl DES_bs_init_asm
DES_bs_init_asm:
	pcmpeqd %xmm0,%xmm0
	movdqa %xmm0,pnot
	paddb %xmm0,%xmm0
	pxor pnot,%xmm0
	movdqa %xmm0,mask01
	ret

#define rounds_and_swapped		%ebp
#define iterations			%eax

DO_ALIGN(6)
.globl DES_bs_crypt
DES_bs_crypt:
	cmpl $0,DES_bs_all_keys_changed
	jz DES_bs_crypt_body
	call DES_bs_finalize_keys
DES_bs_crypt_body:
	movl 4(%esp),iterations
	pxor zero,zero
	pushl %ebp
	pushl %esi
	movl $DES_bs_all_KS_v,k_ptr
	DES_bs_clear_block
	movl $8,rounds_and_swapped
DES_bs_crypt_start:
	xor_E(0)
	S1(B(40), B(48), B(54), B(62))
	xor_E(6)
	S2(B(44), B(59), B(33), B(49))
	xor_E(12)
	S3(B(55), B(47), B(61), B(37))
	xor_E(18)
	S4(B(57), B(51), B(41), B(32))
	xor_E(24)
	S5(B(39), B(45), B(56), B(34))
	xor_E(30)
	S6(B(35), B(60), B(42), B(50))
	xor_E(36)
	S7(B(63), B(43), B(53), B(38))
	xor_E(42)
	S8(B(36), B(58), B(46), B(52))
	cmpl $0x100,rounds_and_swapped
	je DES_bs_crypt_next
DES_bs_crypt_swap:
	xor_E(48)
	S1(B(8), B(16), B(22), B(30))
	xor_E(54)
	S2(B(12), B(27), B(1), B(17))
	xor_E(60)
	S3(B(23), B(15), B(29), B(5))
	xor_E(66)
	S4(B(25), B(19), B(9), B(0))
	xor_E(72)
	S5(B(7), B(13), B(24), B(2))
	xor_E(78)
	S6(B(3), B(28), B(10), B(18))
	xor_E(84)
	S7(B(31), B(11), B(21), B(6))
	xor_E(90)
	addl $nvec(96),k_ptr
	S8(B(4), B(26), B(14), B(20))
	decl rounds_and_swapped
	jnz DES_bs_crypt_start
	subl $nvec(0x300+48),k_ptr
	movl $0x108,rounds_and_swapped
	decl iterations
	jnz DES_bs_crypt_swap
	popl %esi
	popl %ebp
	ret
DES_bs_crypt_next:
	subl $nvec(0x300-48),k_ptr
	movl $8,rounds_and_swapped
	decl iterations
	jnz DES_bs_crypt_start
	popl %esi
	popl %ebp
	ret

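/*
 * DES_bs_crypt_25 is DES_bs_crypt specialized for 25 iterations, as used by
 * the traditional DES-based crypt(3).  S-boxes 3, 4, 7 and 8 take their
 * inputs via xor_B with fixed indices, since those E positions are not
 * affected by the salt.
 */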
DO_ALIGN(6)
.globl DES_bs_crypt_25
DES_bs_crypt_25:
	cmpl $0,DES_bs_all_keys_changed
	jnz DES_bs_finalize_keys_25
DES_bs_crypt_25_body:
	pxor zero,zero
	pushl %ebp
	pushl %esi
	movl $DES_bs_all_KS_v,k_ptr
	DES_bs_clear_block
	movl $8,rounds_and_swapped
	movl $25,iterations
DES_bs_crypt_25_start:
	xor_E(0)
	S1(B(40), B(48), B(54), B(62))
	xor_E(6)
	S2(B(44), B(59), B(33), B(49))
	xor_B(7, 12, 8, 13, 9, 14, 10, 15, 11, 16, 12, 17)
	S3(B(55), B(47), B(61), B(37))
	xor_B(11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 23)
	S4(B(57), B(51), B(41), B(32))
	xor_E(24)
	S5(B(39), B(45), B(56), B(34))
	xor_E(30)
	S6(B(35), B(60), B(42), B(50))
	xor_B(23, 36, 24, 37, 25, 38, 26, 39, 27, 40, 28, 41)
	S7(B(63), B(43), B(53), B(38))
	xor_B(27, 42, 28, 43, 29, 44, 30, 45, 31, 46, 0, 47)
	S8(B(36), B(58), B(46), B(52))
	cmpl $0x100,rounds_and_swapped
	je DES_bs_crypt_25_next
DES_bs_crypt_25_swap:
	xor_E(48)
	S1(B(8), B(16), B(22), B(30))
	xor_E(54)
	S2(B(12), B(27), B(1), B(17))
	xor_B(39, 60, 40, 61, 41, 62, 42, 63, 43, 64, 44, 65)
	S3(B(23), B(15), B(29), B(5))
	xor_B(43, 66, 44, 67, 45, 68, 46, 69, 47, 70, 48, 71)
	S4(B(25), B(19), B(9), B(0))
	xor_E(72)
	S5(B(7), B(13), B(24), B(2))
	xor_E(78)
	S6(B(3), B(28), B(10), B(18))
	xor_B(55, 84, 56, 85, 57, 86, 58, 87, 59, 88, 60, 89)
	S7(B(31), B(11), B(21), B(6))
	xor_B(59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 32, 95)
	S8(B(4), B(26), B(14), B(20))
	addl $nvec(96),k_ptr
	decl rounds_and_swapped
	jnz DES_bs_crypt_25_start
	subl $nvec(0x300+48),k_ptr
	movl $0x108,rounds_and_swapped
	decl iterations
	jnz DES_bs_crypt_25_swap
	popl %esi
	popl %ebp
	ret
DES_bs_crypt_25_next:
	subl $nvec(0x300-48),k_ptr
	movl $8,rounds_and_swapped
	decl iterations
	jmp DES_bs_crypt_25_start

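/*
 * DES_bs_finalize_keys_25 pushes DES_bs_crypt_25_body as a return address,
 * so the fall-through DES_bs_finalize_keys routine "returns" straight into
 * the main loop.  finalize_keys first transposes xkeys into K(0..55), then
 * expands the pointer-based schedule KSp into the vector copy KS_v.
 */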
DES_bs_finalize_keys_25:
	pushl $DES_bs_crypt_25_body
DES_bs_finalize_keys:
	movl $DES_bs_all_xkeys,v_ptr
	movl $DES_bs_all_K,k_ptr
	movl $0,DES_bs_all_keys_changed
DES_bs_finalize_keys_main_loop:
	FINALIZE_NEXT_KEY_BITS_0_6
	addl $nvec(7),k_ptr
	addl $nvec(8),v_ptr
	cmpl $DES_bs_all_K+nvec(56),k_ptr
	jb DES_bs_finalize_keys_main_loop
	pushl %esi
	movl $DES_bs_all_KSp,k_ptr
	movl $DES_bs_all_KS_v,v_ptr
DES_bs_finalize_keys_expand_loop:
	movl k(0),tmp1
	movl k(1),tmp2
	movdqa (tmp1),%xmm0
	movdqa (tmp2),%xmm1
	movl k(2),tmp1
	movl k(3),tmp2
	movdqa %xmm0,V(0)
	movdqa %xmm1,V(1)
	movdqa (tmp1),%xmm0
	movdqa (tmp2),%xmm1
	movl k(4),tmp1
	movl k(5),tmp2
	movdqa %xmm0,V(2)
	movdqa %xmm1,V(3)
	movdqa (tmp1),%xmm0
	movdqa (tmp2),%xmm1
	movl k(6),tmp1
	movl k(7),tmp2
	movdqa %xmm0,V(4)
	movdqa %xmm1,V(5)
	movdqa (tmp1),%xmm0
	movdqa (tmp2),%xmm1
	addl $nptr(8),k_ptr
	movdqa %xmm0,V(6)
	movdqa %xmm1,V(7)
	addl $nvec(8),v_ptr
	cmpl $DES_bs_all_KSp+nptr(0x300),k_ptr
	jb DES_bs_finalize_keys_expand_loop
	popl %esi
	ret

#define ones				%xmm1

#define rounds				%eax

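/*
 * DES_bs_crypt_LM computes LM hashes: the password is the DES key and the
 * plaintext is a fixed constant (the bitsliced pattern stored into B()
 * below; in LM this is the well-known "KGS!@#$%" string), encrypted once
 * (16 rounds) with no salt.  Key bit 7 is finalized here as well, since LM
 * keys use all eight bits of each key byte.
 */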
DO_ALIGN(6)
.globl DES_bs_crypt_LM
DES_bs_crypt_LM:
	movl $DES_bs_all_xkeys,v_ptr
	movl $DES_bs_all_K,k_ptr
DES_bs_finalize_keys_LM_loop:
	FINALIZE_NEXT_KEY_BITS_0_6
# bit 7
	SHLB1(%xmm7)
	movdqa V(0),%xmm0
	movdqa V(1),%xmm1
	movdqa V(2),%xmm2
	movdqa V(3),%xmm3
	pand %xmm7,%xmm0
	pand %xmm7,%xmm1
	pand %xmm7,%xmm2
	pand %xmm7,%xmm3
	psrlq $7,%xmm0
	psrlq $6,%xmm1
	psrlq $5,%xmm2
	psrlq $4,%xmm3
	por %xmm0,%xmm1
	por %xmm2,%xmm3
	movdqa V(4),%xmm4
	movdqa V(5),%xmm5
	por %xmm1,%xmm3
	pand %xmm7,%xmm4
	pand %xmm7,%xmm5
	movdqa V(6),%xmm6
	movdqa V(7),%xmm0
	psrlq $3,%xmm4
	pand %xmm7,%xmm6
	pand %xmm7,%xmm0
	psrlq $2,%xmm5
	psrlq $1,%xmm6
	por %xmm4,%xmm5
	por %xmm6,%xmm3
	por %xmm5,%xmm0
	addl $nvec(8),v_ptr
	por %xmm3,%xmm0
	movdqa %xmm0,K(7)
	addl $nvec(8),k_ptr
	cmpl $DES_bs_all_K+nvec(56),k_ptr
	jb DES_bs_finalize_keys_LM_loop

	pxor zero,zero
	pushl %esi
	pcmpeqd ones,ones
	movl $DES_bs_all_KS_p,k_ptr
	movdqa zero,B(0)
	movdqa zero,B(1)
	movdqa zero,B(2)
	movdqa zero,B(3)
	movdqa zero,B(4)
	movdqa zero,B(5)
	movdqa zero,B(6)
	movdqa zero,B(7)
	movdqa ones,B(8)
	movdqa ones,B(9)
	movdqa ones,B(10)
	movdqa zero,B(11)
	movdqa ones,B(12)
	movdqa zero,B(13)
	movdqa zero,B(14)
	movdqa zero,B(15)
	movdqa zero,B(16)
	movdqa zero,B(17)
	movdqa zero,B(18)
	movdqa zero,B(19)
	movdqa zero,B(20)
	movdqa zero,B(21)
	movdqa zero,B(22)
	movdqa ones,B(23)
	movdqa zero,B(24)
	movdqa zero,B(25)
	movdqa ones,B(26)
	movdqa zero,B(27)
	movdqa zero,B(28)
	movdqa ones,B(29)
	movdqa ones,B(30)
	movdqa ones,B(31)
	movdqa zero,B(32)
	movdqa zero,B(33)
	movdqa zero,B(34)
	movdqa ones,B(35)
	movdqa zero,B(36)
	movdqa ones,B(37)
	movdqa ones,B(38)
	movdqa ones,B(39)
	movdqa zero,B(40)
	movdqa zero,B(41)
	movdqa zero,B(42)
	movdqa zero,B(43)
	movdqa zero,B(44)
	movdqa ones,B(45)
	movdqa zero,B(46)
	movdqa zero,B(47)
	movdqa ones,B(48)
	movdqa ones,B(49)
	movdqa zero,B(50)
	movdqa zero,B(51)
	movdqa zero,B(52)
	movdqa zero,B(53)
	movdqa ones,B(54)
	movdqa zero,B(55)
	movdqa ones,B(56)
	movdqa zero,B(57)
	movdqa ones,B(58)
	movdqa zero,B(59)
	movdqa ones,B(60)
	movdqa ones,B(61)
	movdqa ones,B(62)
	movdqa ones,B(63)
	movl $8,rounds
DES_bs_crypt_LM_loop:
	xor_B_KS_p_special(31, 0, 0, 1, 1, 2, 2, 3, 3, 4, 5)
	S1(B(40), B(48), B(54), B(62))
	xor_B_KS_p(3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11)
	S2(B(44), B(59), B(33), B(49))
	xor_B_KS_p(7, 12, 8, 13, 9, 14, 10, 15, 11, 16, 12, 17)
	S3(B(55), B(47), B(61), B(37))
	xor_B_KS_p(11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 23)
	S4(B(57), B(51), B(41), B(32))
	xor_B_KS_p(15, 24, 16, 25, 17, 26, 18, 27, 19, 28, 20, 29)
	S5(B(39), B(45), B(56), B(34))
	xor_B_KS_p(19, 30, 20, 31, 21, 32, 22, 33, 23, 34, 24, 35)
	S6(B(35), B(60), B(42), B(50))
	xor_B_KS_p(23, 36, 24, 37, 25, 38, 26, 39, 27, 40, 28, 41)
	S7(B(63), B(43), B(53), B(38))
	xor_B_KS_p(27, 42, 28, 43, 29, 44, 30, 45, 31, 46, 0, 47)
	S8(B(36), B(58), B(46), B(52))
	xor_B_KS_p_special(63, 48, 32, 49, 33, 50, 34, 51, 35, 52, 53)
	S1(B(8), B(16), B(22), B(30))
	xor_B_KS_p(35, 54, 36, 55, 37, 56, 38, 57, 39, 58, 40, 59)
	S2(B(12), B(27), B(1), B(17))
	xor_B_KS_p(39, 60, 40, 61, 41, 62, 42, 63, 43, 64, 44, 65)
	S3(B(23), B(15), B(29), B(5))
	xor_B_KS_p(43, 66, 44, 67, 45, 68, 46, 69, 47, 70, 48, 71)
	S4(B(25), B(19), B(9), B(0))
	xor_B_KS_p(47, 72, 48, 73, 49, 74, 50, 75, 51, 76, 52, 77)
	S5(B(7), B(13), B(24), B(2))
	xor_B_KS_p(51, 78, 52, 79, 53, 80, 54, 81, 55, 82, 56, 83)
	S6(B(3), B(28), B(10), B(18))
	xor_B_KS_p(55, 84, 56, 85, 57, 86, 58, 87, 59, 88, 60, 89)
	S7(B(31), B(11), B(21), B(6))
	xor_B_KS_p(59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 32, 95)
	addl $nptr(96),k_ptr
	S8(B(4), B(26), B(14), B(20))
	decl rounds
	jnz DES_bs_crypt_LM_loop
	movl 8(%esp),%eax
	popl %esi
	movl (%eax),%eax
	ret

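/*
 * DES_bs_crypt_plain (the single DES encryption with no salt mentioned in
 * the header comment) copies the bitsliced plaintext from DES_bs_P into the
 * B() block and then runs one 16-round DES encryption using the
 * pointer-based key schedule KS_p.
 */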
#define rounds				%eax

DO_ALIGN(6)
.globl DES_bs_crypt_plain
DES_bs_crypt_plain:
	movl $DES_bs_all_xkeys,v_ptr
	movl $DES_bs_all_K,k_ptr
	movdqa P(0),%xmm4
	movdqa %xmm4,B(0)
	movdqa P(1),%xmm4
	movdqa %xmm4,B(1)
	movdqa P(2),%xmm4
	movdqa %xmm4,B(2)
	movdqa P(3),%xmm4
	movdqa %xmm4,B(3)
	movdqa P(4),%xmm4
	movdqa %xmm4,B(4)
	movdqa P(5),%xmm4
	movdqa %xmm4,B(5)
	movdqa P(6),%xmm4
	movdqa %xmm4,B(6)
	movdqa P(7),%xmm4
	movdqa %xmm4,B(7)
	movdqa P(8),%xmm4
	movdqa %xmm4,B(8)
	movdqa P(9),%xmm4
	movdqa %xmm4,B(9)
	movdqa P(10),%xmm4
	movdqa %xmm4,B(10)
	movdqa P(11),%xmm4
	movdqa %xmm4,B(11)
	movdqa P(12),%xmm4
	movdqa %xmm4,B(12)
	movdqa P(13),%xmm4
	movdqa %xmm4,B(13)
	movdqa P(14),%xmm4
	movdqa %xmm4,B(14)
	movdqa P(15),%xmm4
	movdqa %xmm4,B(15)
	movdqa P(16),%xmm4
	movdqa %xmm4,B(16)
	movdqa P(17),%xmm4
	movdqa %xmm4,B(17)
	movdqa P(18),%xmm4
	movdqa %xmm4,B(18)
	movdqa P(19),%xmm4
	movdqa %xmm4,B(19)
	movdqa P(20),%xmm4
	movdqa %xmm4,B(20)
	movdqa P(21),%xmm4
	movdqa %xmm4,B(21)
	movdqa P(22),%xmm4
	movdqa %xmm4,B(22)
	movdqa P(23),%xmm4
	movdqa %xmm4,B(23)
	movdqa P(24),%xmm4
	movdqa %xmm4,B(24)
	movdqa P(25),%xmm4
	movdqa %xmm4,B(25)
	movdqa P(26),%xmm4
	movdqa %xmm4,B(26)
	movdqa P(27),%xmm4
	movdqa %xmm4,B(27)
	movdqa P(28),%xmm4
	movdqa %xmm4,B(28)
	movdqa P(29),%xmm4
	movdqa %xmm4,B(29)
	movdqa P(30),%xmm4
	movdqa %xmm4,B(30)
	movdqa P(31),%xmm4
	movdqa %xmm4,B(31)
	movdqa P(32),%xmm4
	movdqa %xmm4,B(32)
	movdqa P(33),%xmm4
	movdqa %xmm4,B(33)
	movdqa P(34),%xmm4
	movdqa %xmm4,B(34)
	movdqa P(35),%xmm4
	movdqa %xmm4,B(35)
	movdqa P(36),%xmm4
	movdqa %xmm4,B(36)
	movdqa P(37),%xmm4
	movdqa %xmm4,B(37)
	movdqa P(38),%xmm4
	movdqa %xmm4,B(38)
	movdqa P(39),%xmm4
	movdqa %xmm4,B(39)
	movdqa P(40),%xmm4
	movdqa %xmm4,B(40)
	movdqa P(41),%xmm4
	movdqa %xmm4,B(41)
	movdqa P(42),%xmm4
	movdqa %xmm4,B(42)
	movdqa P(43),%xmm4
	movdqa %xmm4,B(43)
	movdqa P(44),%xmm4
	movdqa %xmm4,B(44)
	movdqa P(45),%xmm4
	movdqa %xmm4,B(45)
	movdqa P(46),%xmm4
	movdqa %xmm4,B(46)
	movdqa P(47),%xmm4
	movdqa %xmm4,B(47)
	movdqa P(48),%xmm4
	movdqa %xmm4,B(48)
	movdqa P(49),%xmm4
	movdqa %xmm4,B(49)
	movdqa P(50),%xmm4
	movdqa %xmm4,B(50)
	movdqa P(51),%xmm4
	movdqa %xmm4,B(51)
	movdqa P(52),%xmm4
	movdqa %xmm4,B(52)
	movdqa P(53),%xmm4
	movdqa %xmm4,B(53)
	movdqa P(54),%xmm4
	movdqa %xmm4,B(54)
	movdqa P(55),%xmm4
	movdqa %xmm4,B(55)
	movdqa P(56),%xmm4
	movdqa %xmm4,B(56)
	movdqa P(57),%xmm4
	movdqa %xmm4,B(57)
	movdqa P(58),%xmm4
	movdqa %xmm4,B(58)
	movdqa P(59),%xmm4
	movdqa %xmm4,B(59)
	movdqa P(60),%xmm4
	movdqa %xmm4,B(60)
	movdqa P(61),%xmm4
	movdqa %xmm4,B(61)
	movdqa P(62),%xmm4
	movdqa %xmm4,B(62)
	movdqa P(63),%xmm4
	movdqa %xmm4,B(63)

DES_bs_finalize_keys_plain_loop:
	FINALIZE_NEXT_KEY_BITS_0_6
	addl $nvec(7),k_ptr
	addl $nvec(8),v_ptr
	cmpl $DES_bs_all_K+nvec(56),k_ptr
	jb DES_bs_finalize_keys_plain_loop
	pushl %esi
	movl $DES_bs_all_KS_p,k_ptr
	movl $DES_bs_all_KS_v,v_ptr
	movl $8,rounds
DES_bs_crypt_plain_loop:
	xor_B_KS_p(31, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5)
	S1(B(40), B(48), B(54), B(62))
	xor_B_KS_p(3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11)
	S2(B(44), B(59), B(33), B(49))
	xor_B_KS_p(7, 12, 8, 13, 9, 14, 10, 15, 11, 16, 12, 17)
	S3(B(55), B(47), B(61), B(37))
	xor_B_KS_p(11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 23)
	S4(B(57), B(51), B(41), B(32))
	xor_B_KS_p(15, 24, 16, 25, 17, 26, 18, 27, 19, 28, 20, 29)
	S5(B(39), B(45), B(56), B(34))
	xor_B_KS_p(19, 30, 20, 31, 21, 32, 22, 33, 23, 34, 24, 35)
	S6(B(35), B(60), B(42), B(50))
	xor_B_KS_p(23, 36, 24, 37, 25, 38, 26, 39, 27, 40, 28, 41)
	S7(B(63), B(43), B(53), B(38))
	xor_B_KS_p(27, 42, 28, 43, 29, 44, 30, 45, 31, 46, 0, 47)
	S8(B(36), B(58), B(46), B(52))
	xor_B_KS_p(63, 48, 32, 49, 33, 50, 34, 51, 35, 52, 36, 53)
	S1(B(8), B(16), B(22), B(30))
	xor_B_KS_p(35, 54, 36, 55, 37, 56, 38, 57, 39, 58, 40, 59)
	S2(B(12), B(27), B(1), B(17))
	xor_B_KS_p(39, 60, 40, 61, 41, 62, 42, 63, 43, 64, 44, 65)
	S3(B(23), B(15), B(29), B(5))
	xor_B_KS_p(43, 66, 44, 67, 45, 68, 46, 69, 47, 70, 48, 71)
	S4(B(25), B(19), B(9), B(0))
	xor_B_KS_p(47, 72, 48, 73, 49, 74, 50, 75, 51, 76, 52, 77)
	S5(B(7), B(13), B(24), B(2))
	xor_B_KS_p(51, 78, 52, 79, 53, 80, 54, 81, 55, 82, 56, 83)
	S6(B(3), B(28), B(10), B(18))
	xor_B_KS_p(55, 84, 56, 85, 57, 86, 58, 87, 59, 88, 60, 89)
	S7(B(31), B(11), B(21), B(6))
	xor_B_KS_p(59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 32, 95)
	addl $nptr(96),k_ptr
	S8(B(4), B(26), B(14), B(20))
	decl rounds
	jnz DES_bs_crypt_plain_loop
	popl %esi
	ret


#endif

#if defined(__ELF__) && defined(__linux__)
.section .note.GNU-stack,"",@progbits
#endif
