; This file is generated from a similarly-named Perl script in the BoringSSL
; source tree. Do not edit by hand.

; Pull in the symbol-prefix definitions when the build defines BORINGSSL_PREFIX.
%ifdef BORINGSSL_PREFIX
%include "boringssl_prefix_symbols_nasm.inc"
%endif
; Declare the code section in the form required by the selected output format.
%ifidn __OUTPUT_FORMAT__,obj
section	code	use32 class=code align=64
%elifidn __OUTPUT_FORMAT__,win32
%ifdef __YASM_VERSION_ID__
%if __YASM_VERSION_ID__ < 01010000h
%error yasm version 1.1.0 or later needed.
%endif
; Yasm automatically includes .00 and complains about redefining it.
; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html
%else
; Assemblers that do not define __YASM_VERSION_ID__ get @feat.00 defined here.
$@feat.00 equ 1
%endif
section	.text	code align=64
%else
section	.text	code
%endif
;-----------------------------------------------------------------------
; _GFp_ChaCha20_ctr32 -- XOR a buffer with a ChaCha20 keystream (32-bit x86).
;
; Stack arguments, addressed relative to esp after the four pushes below:
;   [20+esp] arg0  destination buffer
;   [24+esp] arg1  source buffer
;   [28+esp] arg2  byte count; the function returns at once when it is 0
;   [32+esp] arg3  pointer to 8 dwords of key
;   [36+esp] arg4  pointer to 4 dwords; the first is incremented once per
;                  64-byte block, the other three are used unchanged
; NOTE(review): presumably the C prototype is
;   void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t len,
;                       const uint32_t key[8], const uint32_t counter[4])
; -- confirm against the C declaration.
;
; ebp, ebx, esi and edi are saved and restored; eax, ecx, edx and the flags
; are clobbered.
;
; Dispatch: when bit 24 of dword 0 and bit 9 of dword 1 of _GFp_ia32cap_P
; are both set, control jumps to L$ssse3_shortcut with eax holding the
; run-time address of L$pic_point.  Otherwise the integer-only code below
; runs.  NOTE(review): the label name suggests bit 9 is the SSSE3 flag and
; bit 24 looks like FXSR in OpenSSL's capability layout -- confirm.
;
; Frame of the integer path (after "sub esp,132"):
;   [  0.. 60+esp]  working state words x0..x15 (word i at 4*i)
;   [ 80..108+esp]  key words (initial x4..x11)
;   [112..124+esp]  counter block (initial x12..x15); [112] is the running
;                   block counter
;   [128+esp]       spilled double-round counter
;   [152+esp]       arg0 (destination), updated per block
;   [156+esp]       arg1 (source), updated per block
;   [160+esp]       arg2 (bytes remaining), updated per block
;-----------------------------------------------------------------------
global	_GFp_ChaCha20_ctr32
align	16
_GFp_ChaCha20_ctr32:
L$_GFp_ChaCha20_ctr32_begin:
	push	ebp
	push	ebx
	push	esi
	push	edi
	; Nothing to do for a zero byte count.
	xor	eax,eax
	cmp	eax,DWORD [28+esp]
	je	NEAR L$000no_data
	; call/pop leaves the address of L$pic_point in eax; the SSSE3 path
	; uses it to locate L$ssse3_data.
	call	L$pic_point
L$pic_point:
	pop	eax
	lea	ebp,[_GFp_ia32cap_P]
	test	DWORD [ebp],16777216		; bit 24 of capability dword 0
	jz	NEAR L$001x86
	test	DWORD [4+ebp],512		; bit 9 of capability dword 1
	jz	NEAR L$001x86
	jmp	NEAR L$ssse3_shortcut
L$001x86:
	mov	esi,DWORD [32+esp]		; esi = key pointer
	mov	edi,DWORD [36+esp]		; edi = counter-block pointer
	sub	esp,132
	; Copy the 8 key words to [80..108+esp].
	mov	eax,DWORD [esi]
	mov	ebx,DWORD [4+esi]
	mov	ecx,DWORD [8+esi]
	mov	edx,DWORD [12+esi]
	mov	DWORD [80+esp],eax
	mov	DWORD [84+esp],ebx
	mov	DWORD [88+esp],ecx
	mov	DWORD [92+esp],edx
	mov	eax,DWORD [16+esi]
	mov	ebx,DWORD [20+esi]
	mov	ecx,DWORD [24+esi]
	mov	edx,DWORD [28+esi]
	mov	DWORD [96+esp],eax
	mov	DWORD [100+esp],ebx
	mov	DWORD [104+esp],ecx
	mov	DWORD [108+esp],edx
	; Copy the 4 counter-block words to [112..124+esp].
	mov	eax,DWORD [edi]
	mov	ebx,DWORD [4+edi]
	mov	ecx,DWORD [8+edi]
	mov	edx,DWORD [12+edi]
	; Pre-decrement the block counter: L$002entry adds 1 before every block.
	sub	eax,1
	mov	DWORD [112+esp],eax
	mov	DWORD [116+esp],ebx
	mov	DWORD [120+esp],ecx
	mov	DWORD [124+esp],edx
	jmp	NEAR L$002entry
align	16
L$003outer_loop:
	; Write the advanced source pointer, destination pointer and remaining
	; byte count back to the argument slots.
	mov	DWORD [156+esp],ebx
	mov	DWORD [152+esp],eax
	mov	DWORD [160+esp],ecx
L$002entry:
	; Initialise the working state for one block.  The four immediates are
	; the little-endian dwords of the ASCII text "expand 32-byte k".
	; Kept in registers for the loop: eax=x0, ebp=x4, ecx=x8, esi=x9, edx=x12.
	mov	eax,1634760805
	mov	DWORD [4+esp],857760878
	mov	DWORD [8+esp],2036477234
	mov	DWORD [12+esp],1797285236
	mov	ebx,DWORD [84+esp]
	mov	ebp,DWORD [88+esp]
	mov	ecx,DWORD [104+esp]
	mov	esi,DWORD [108+esp]
	mov	edx,DWORD [116+esp]
	mov	edi,DWORD [120+esp]
	mov	DWORD [20+esp],ebx
	mov	DWORD [24+esp],ebp
	mov	DWORD [40+esp],ecx
	mov	DWORD [44+esp],esi
	mov	DWORD [52+esp],edx
	mov	DWORD [56+esp],edi
	mov	ebx,DWORD [92+esp]
	mov	edi,DWORD [124+esp]
	mov	edx,DWORD [112+esp]
	mov	ebp,DWORD [80+esp]
	mov	ecx,DWORD [96+esp]
	mov	esi,DWORD [100+esp]
	add	edx,1				; advance the block counter
	mov	DWORD [28+esp],ebx
	mov	DWORD [60+esp],edi
	mov	DWORD [112+esp],edx
	mov	ebx,10				; 10 iterations of 8 quarter-rounds
	jmp	NEAR L$004loop
align	16
L$004loop:
	; Each quarter-round is add/xor/rol with rotate counts 16, 12, 8, 7.
	; Loads and stores for the neighbouring quarter-rounds are interleaved.
	; Quarter-round on x0, x4, x8, x12 (ebx doubles as the x4 temporary,
	; so the loop counter is spilled first).
	add	eax,ebp
	mov	DWORD [128+esp],ebx
	mov	ebx,ebp
	xor	edx,eax
	rol	edx,16
	add	ecx,edx
	xor	ebx,ecx
	mov	edi,DWORD [52+esp]
	rol	ebx,12
	mov	ebp,DWORD [20+esp]
	add	eax,ebx
	xor	edx,eax
	mov	DWORD [esp],eax
	rol	edx,8
	mov	eax,DWORD [4+esp]
	add	ecx,edx
	mov	DWORD [48+esp],edx
	xor	ebx,ecx
	; Quarter-round on x1, x5, x9, x13.
	add	eax,ebp
	rol	ebx,7
	xor	edi,eax
	mov	DWORD [32+esp],ecx
	rol	edi,16
	mov	DWORD [16+esp],ebx
	add	esi,edi
	mov	ecx,DWORD [40+esp]
	xor	ebp,esi
	mov	edx,DWORD [56+esp]
	rol	ebp,12
	mov	ebx,DWORD [24+esp]
	add	eax,ebp
	xor	edi,eax
	mov	DWORD [4+esp],eax
	rol	edi,8
	mov	eax,DWORD [8+esp]
	add	esi,edi
	mov	DWORD [52+esp],edi
	xor	ebp,esi
	; Quarter-round on x2, x6, x10, x14.
	add	eax,ebx
	rol	ebp,7
	xor	edx,eax
	mov	DWORD [36+esp],esi
	rol	edx,16
	mov	DWORD [20+esp],ebp
	add	ecx,edx
	mov	esi,DWORD [44+esp]
	xor	ebx,ecx
	mov	edi,DWORD [60+esp]
	rol	ebx,12
	mov	ebp,DWORD [28+esp]
	add	eax,ebx
	xor	edx,eax
	mov	DWORD [8+esp],eax
	rol	edx,8
	mov	eax,DWORD [12+esp]
	add	ecx,edx
	mov	DWORD [56+esp],edx
	xor	ebx,ecx
	; Quarter-round on x3, x7, x11, x15.
	add	eax,ebp
	rol	ebx,7
	xor	edi,eax
	rol	edi,16
	mov	DWORD [24+esp],ebx
	add	esi,edi
	xor	ebp,esi
	rol	ebp,12
	mov	ebx,DWORD [20+esp]
	add	eax,ebp
	xor	edi,eax
	mov	DWORD [12+esp],eax
	rol	edi,8
	mov	eax,DWORD [esp]
	add	esi,edi
	mov	edx,edi
	xor	ebp,esi
	; Quarter-round on x0, x5, x10, x15.
	add	eax,ebx
	rol	ebp,7
	xor	edx,eax
	rol	edx,16
	mov	DWORD [28+esp],ebp
	add	ecx,edx
	xor	ebx,ecx
	mov	edi,DWORD [48+esp]
	rol	ebx,12
	mov	ebp,DWORD [24+esp]
	add	eax,ebx
	xor	edx,eax
	mov	DWORD [esp],eax
	rol	edx,8
	mov	eax,DWORD [4+esp]
	add	ecx,edx
	mov	DWORD [60+esp],edx
	xor	ebx,ecx
	; Quarter-round on x1, x6, x11, x12.
	add	eax,ebp
	rol	ebx,7
	xor	edi,eax
	mov	DWORD [40+esp],ecx
	rol	edi,16
	mov	DWORD [20+esp],ebx
	add	esi,edi
	mov	ecx,DWORD [32+esp]
	xor	ebp,esi
	mov	edx,DWORD [52+esp]
	rol	ebp,12
	mov	ebx,DWORD [28+esp]
	add	eax,ebp
	xor	edi,eax
	mov	DWORD [4+esp],eax
	rol	edi,8
	mov	eax,DWORD [8+esp]
	add	esi,edi
	mov	DWORD [48+esp],edi
	xor	ebp,esi
	; Quarter-round on x2, x7, x8, x13.
	add	eax,ebx
	rol	ebp,7
	xor	edx,eax
	mov	DWORD [44+esp],esi
	rol	edx,16
	mov	DWORD [24+esp],ebp
	add	ecx,edx
	mov	esi,DWORD [36+esp]
	xor	ebx,ecx
	mov	edi,DWORD [56+esp]
	rol	ebx,12
	mov	ebp,DWORD [16+esp]
	add	eax,ebx
	xor	edx,eax
	mov	DWORD [8+esp],eax
	rol	edx,8
	mov	eax,DWORD [12+esp]
	add	ecx,edx
	mov	DWORD [52+esp],edx
	xor	ebx,ecx
	; Quarter-round on x3, x4, x9, x14; the registers for the next
	; iteration (edx=x12, eax=x0) and the loop counter are reloaded.
	add	eax,ebp
	rol	ebx,7
	xor	edi,eax
	rol	edi,16
	mov	DWORD [28+esp],ebx
	add	esi,edi
	xor	ebp,esi
	mov	edx,DWORD [48+esp]
	rol	ebp,12
	mov	ebx,DWORD [128+esp]
	add	eax,ebp
	xor	edi,eax
	mov	DWORD [12+esp],eax
	rol	edi,8
	mov	eax,DWORD [esp]
	add	esi,edi
	mov	DWORD [56+esp],edi
	xor	ebp,esi
	rol	ebp,7
	dec	ebx
	jnz	NEAR L$004loop
	; Add the initial state back in.  In registers here:
	; eax=x0, ebp=x4, ecx=x8, esi=x9, edx=x12, edi=x14.
	mov	ebx,DWORD [160+esp]		; ebx = bytes remaining
	add	eax,1634760805
	add	ebp,DWORD [80+esp]
	add	ecx,DWORD [96+esp]
	add	esi,DWORD [100+esp]
	cmp	ebx,64
	jb	NEAR L$005tail			; fewer than 64 bytes left
	; Full block: XOR 64 source bytes with the keystream and store them.
	mov	ebx,DWORD [156+esp]		; ebx = source pointer
	add	edx,DWORD [112+esp]
	add	edi,DWORD [120+esp]
	xor	eax,DWORD [ebx]
	xor	ebp,DWORD [16+ebx]
	mov	DWORD [esp],eax			; park word 0; eax is needed as a pointer
	mov	eax,DWORD [152+esp]		; eax = destination pointer
	xor	ecx,DWORD [32+ebx]
	xor	esi,DWORD [36+ebx]
	xor	edx,DWORD [48+ebx]
	xor	edi,DWORD [56+ebx]
	mov	DWORD [16+eax],ebp
	mov	DWORD [32+eax],ecx
	mov	DWORD [36+eax],esi
	mov	DWORD [48+eax],edx
	mov	DWORD [56+eax],edi
	; Words x1, x2, x3, x5, x6.
	mov	ebp,DWORD [4+esp]
	mov	ecx,DWORD [8+esp]
	mov	esi,DWORD [12+esp]
	mov	edx,DWORD [20+esp]
	mov	edi,DWORD [24+esp]
	add	ebp,857760878
	add	ecx,2036477234
	add	esi,1797285236
	add	edx,DWORD [84+esp]
	add	edi,DWORD [88+esp]
	xor	ebp,DWORD [4+ebx]
	xor	ecx,DWORD [8+ebx]
	xor	esi,DWORD [12+ebx]
	xor	edx,DWORD [20+ebx]
	xor	edi,DWORD [24+ebx]
	mov	DWORD [4+eax],ebp
	mov	DWORD [8+eax],ecx
	mov	DWORD [12+eax],esi
	mov	DWORD [20+eax],edx
	mov	DWORD [24+eax],edi
	; Words x7, x10, x11, x13, x15.
	mov	ebp,DWORD [28+esp]
	mov	ecx,DWORD [40+esp]
	mov	esi,DWORD [44+esp]
	mov	edx,DWORD [52+esp]
	mov	edi,DWORD [60+esp]
	add	ebp,DWORD [92+esp]
	add	ecx,DWORD [104+esp]
	add	esi,DWORD [108+esp]
	add	edx,DWORD [116+esp]
	add	edi,DWORD [124+esp]
	xor	ebp,DWORD [28+ebx]
	xor	ecx,DWORD [40+ebx]
	xor	esi,DWORD [44+ebx]
	xor	edx,DWORD [52+ebx]
	xor	edi,DWORD [60+ebx]
	lea	ebx,[64+ebx]			; advance the source pointer
	mov	DWORD [28+eax],ebp
	mov	ebp,DWORD [esp]			; reload the parked word 0
	mov	DWORD [40+eax],ecx
	mov	ecx,DWORD [160+esp]
	mov	DWORD [44+eax],esi
	mov	DWORD [52+eax],edx
	mov	DWORD [60+eax],edi
	mov	DWORD [eax],ebp
	lea	eax,[64+eax]			; advance the destination pointer
	sub	ecx,64
	jnz	NEAR L$003outer_loop
	jmp	NEAR L$006done
L$005tail:
	; Partial final block: finish the keystream block in [0..60+esp], then
	; XOR it into the output one byte at a time.  ebx still holds the
	; number of bytes remaining (1..63).
	add	edx,DWORD [112+esp]
	add	edi,DWORD [120+esp]
	mov	DWORD [esp],eax
	mov	DWORD [16+esp],ebp
	mov	DWORD [32+esp],ecx
	mov	DWORD [36+esp],esi
	mov	DWORD [48+esp],edx
	mov	DWORD [56+esp],edi
	mov	ebp,DWORD [4+esp]
	mov	ecx,DWORD [8+esp]
	mov	esi,DWORD [12+esp]
	mov	edx,DWORD [20+esp]
	mov	edi,DWORD [24+esp]
	add	ebp,857760878
	add	ecx,2036477234
	add	esi,1797285236
	add	edx,DWORD [84+esp]
	add	edi,DWORD [88+esp]
	mov	DWORD [4+esp],ebp
	mov	DWORD [8+esp],ecx
	mov	DWORD [12+esp],esi
	mov	DWORD [20+esp],edx
	mov	DWORD [24+esp],edi
	mov	ebp,DWORD [28+esp]
	mov	ecx,DWORD [40+esp]
	mov	esi,DWORD [44+esp]
	mov	edx,DWORD [52+esp]
	mov	edi,DWORD [60+esp]
	add	ebp,DWORD [92+esp]
	add	ecx,DWORD [104+esp]
	add	esi,DWORD [108+esp]
	add	edx,DWORD [116+esp]
	add	edi,DWORD [124+esp]
	mov	DWORD [28+esp],ebp
	mov	ebp,DWORD [156+esp]		; ebp = source pointer
	mov	DWORD [40+esp],ecx
	mov	ecx,DWORD [152+esp]		; ecx = destination pointer
	mov	DWORD [44+esp],esi
	xor	esi,esi				; esi = byte index
	mov	DWORD [52+esp],edx
	mov	DWORD [60+esp],edi
	xor	eax,eax
	xor	edx,edx
L$007tail_loop:
	mov	al,BYTE [ebp*1+esi]		; source byte
	mov	dl,BYTE [esi*1+esp]		; keystream byte
	lea	esi,[1+esi]
	xor	al,dl
	mov	BYTE [esi*1+ecx-1],al		; -1 compensates for the increment above
	dec	ebx
	jnz	NEAR L$007tail_loop
L$006done:
	add	esp,132
L$000no_data:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
;-----------------------------------------------------------------------
; __ChaCha20_ssse3 -- SSE2/SSSE3 ChaCha20 path (file-local, not declared
; global).  _GFp_ChaCha20_ctr32 enters at L$ssse3_shortcut after it has
; already pushed ebp/ebx/esi/edi.
;
; On entry at L$ssse3_shortcut:
;   eax      = run-time address of L$pic_point (used to locate L$ssse3_data)
;   [20+esp] = destination, [24+esp] = source, [28+esp] = byte count,
;   [32+esp] = key pointer (32 bytes read), [36+esp] = counter block (16 bytes)
;
; Register roles after the loads below:
;   edi = destination, esi = source, ecx = bytes remaining,
;   edx = key pointer, ebx = counter-block pointer, eax = &L$ssse3_data
;
; Frame (esp rounded down to a multiple of 64, at least 524 bytes):
;   [512+esp] caller's esp, restored at L$011done
;   [516+esp] key pointer, [520+esp] counter-block pointer (4-block path)
; 4-block path: ebp = esp+384 addresses the broadcast initial state,
;   ebx = esp+128 addresses the working state; state word i of four
;   parallel blocks lives at offset 16*i-128 from either base.
; 1-block path: [0..63+esp] holds the initial state / keystream block.
;
; Inputs of 256 bytes or more run through the 4-block loop; anything left
; over (or a shorter input) is handled 64 bytes at a time by the 1-block
; loop, with a byte loop for a final partial block.
; xmm0-xmm7 are clobbered in addition to eax, ecx and edx.
;-----------------------------------------------------------------------
align	16
__ChaCha20_ssse3:
	push	ebp
	push	ebx
	push	esi
	push	edi
L$ssse3_shortcut:
	mov	edi,DWORD [20+esp]
	mov	esi,DWORD [24+esp]
	mov	ecx,DWORD [28+esp]
	mov	edx,DWORD [32+esp]
	mov	ebx,DWORD [36+esp]
	mov	ebp,esp				; remember the caller's esp
	sub	esp,524
	and	esp,-64				; 64-byte align the frame for movdqa
	mov	DWORD [512+esp],ebp
	lea	eax,[(L$ssse3_data-L$pic_point)+eax]	; eax = &L$ssse3_data
	movdqu	xmm3,[ebx]			; xmm3 = counter block
	cmp	ecx,256
	jb	NEAR L$0081x			; under 256 bytes: 1-block path only
	mov	DWORD [516+esp],edx
	mov	DWORD [520+esp],ebx
	sub	ecx,256
	lea	ebp,[384+esp]
	; Broadcast every initial state word across the four lanes.
	movdqu	xmm7,[edx]
	pshufd	xmm0,xmm3,0
	pshufd	xmm1,xmm3,85
	pshufd	xmm2,xmm3,170
	pshufd	xmm3,xmm3,255
	paddd	xmm0,[48+eax]			; lane counters = counter + {0,1,2,3}
	pshufd	xmm4,xmm7,0
	pshufd	xmm5,xmm7,85
	psubd	xmm0,[64+eax]			; minus 4: the outer loop adds 4 first
	pshufd	xmm6,xmm7,170
	pshufd	xmm7,xmm7,255
	movdqa	[64+ebp],xmm0
	movdqa	[80+ebp],xmm1
	movdqa	[96+ebp],xmm2
	movdqa	[112+ebp],xmm3
	movdqu	xmm3,[16+edx]
	movdqa	[ebp-64],xmm4
	movdqa	[ebp-48],xmm5
	movdqa	[ebp-32],xmm6
	movdqa	[ebp-16],xmm7
	movdqa	xmm7,[32+eax]			; the four constant words
	lea	ebx,[128+esp]
	pshufd	xmm0,xmm3,0
	pshufd	xmm1,xmm3,85
	pshufd	xmm2,xmm3,170
	pshufd	xmm3,xmm3,255
	pshufd	xmm4,xmm7,0
	pshufd	xmm5,xmm7,85
	pshufd	xmm6,xmm7,170
	pshufd	xmm7,xmm7,255
	movdqa	[ebp],xmm0
	movdqa	[16+ebp],xmm1
	movdqa	[32+ebp],xmm2
	movdqa	[48+ebp],xmm3
	movdqa	[ebp-128],xmm4
	movdqa	[ebp-112],xmm5
	movdqa	[ebp-96],xmm6
	movdqa	[ebp-80],xmm7
	; Bias both pointers by +128 so the loads/stores below use -128..+64.
	lea	esi,[128+esi]
	lea	edi,[128+edi]
	jmp	NEAR L$009outer_loop
align	16
L$009outer_loop:
	; Copy the initial state (ebp) into the working state (ebx).  Words
	; 0, 4, 8, 9 and 12 stay in xmm0, xmm3, xmm4, xmm5 and xmm6 instead.
	movdqa	xmm1,[ebp-112]
	movdqa	xmm2,[ebp-96]
	movdqa	xmm3,[ebp-80]
	movdqa	xmm5,[ebp-48]
	movdqa	xmm6,[ebp-32]
	movdqa	xmm7,[ebp-16]
	movdqa	[ebx-112],xmm1
	movdqa	[ebx-96],xmm2
	movdqa	[ebx-80],xmm3
	movdqa	[ebx-48],xmm5
	movdqa	[ebx-32],xmm6
	movdqa	[ebx-16],xmm7
	movdqa	xmm2,[32+ebp]
	movdqa	xmm3,[48+ebp]
	movdqa	xmm4,[64+ebp]
	movdqa	xmm5,[80+ebp]
	movdqa	xmm6,[96+ebp]
	movdqa	xmm7,[112+ebp]
	paddd	xmm4,[64+eax]			; advance all four lane counters by 4
	movdqa	[32+ebx],xmm2
	movdqa	[48+ebx],xmm3
	movdqa	[64+ebx],xmm4
	movdqa	[80+ebx],xmm5
	movdqa	[96+ebx],xmm6
	movdqa	[112+ebx],xmm7
	movdqa	[64+ebp],xmm4
	movdqa	xmm0,[ebp-128]
	movdqa	xmm6,xmm4
	movdqa	xmm3,[ebp-64]
	movdqa	xmm4,[ebp]
	movdqa	xmm5,[16+ebp]
	mov	edx,10				; 10 iterations of 8 quarter-rounds
	nop
align	16
L$010loop:
	; Rotations by 16 and 8 are byte shuffles through the masks at [eax]
	; and [16+eax]; rotations by 12 and 7 are pslld/psrld/por.  Loads and
	; stores for the neighbouring quarter-rounds are interleaved.
	; Quarter-round on x0, x4, x8, x12.
	paddd	xmm0,xmm3
	movdqa	xmm2,xmm3
	pxor	xmm6,xmm0
	pshufb	xmm6,[eax]
	paddd	xmm4,xmm6
	pxor	xmm2,xmm4
	movdqa	xmm3,[ebx-48]
	movdqa	xmm1,xmm2
	pslld	xmm2,12
	psrld	xmm1,20
	por	xmm2,xmm1
	movdqa	xmm1,[ebx-112]
	paddd	xmm0,xmm2
	movdqa	xmm7,[80+ebx]
	pxor	xmm6,xmm0
	movdqa	[ebx-128],xmm0
	pshufb	xmm6,[16+eax]
	paddd	xmm4,xmm6
	movdqa	[64+ebx],xmm6
	pxor	xmm2,xmm4
	; Quarter-round on x1, x5, x9, x13.
	paddd	xmm1,xmm3
	movdqa	xmm0,xmm2
	pslld	xmm2,7
	psrld	xmm0,25
	pxor	xmm7,xmm1
	por	xmm2,xmm0
	movdqa	[ebx],xmm4
	pshufb	xmm7,[eax]
	movdqa	[ebx-64],xmm2
	paddd	xmm5,xmm7
	movdqa	xmm4,[32+ebx]
	pxor	xmm3,xmm5
	movdqa	xmm2,[ebx-32]
	movdqa	xmm0,xmm3
	pslld	xmm3,12
	psrld	xmm0,20
	por	xmm3,xmm0
	movdqa	xmm0,[ebx-96]
	paddd	xmm1,xmm3
	movdqa	xmm6,[96+ebx]
	pxor	xmm7,xmm1
	movdqa	[ebx-112],xmm1
	pshufb	xmm7,[16+eax]
	paddd	xmm5,xmm7
	movdqa	[80+ebx],xmm7
	pxor	xmm3,xmm5
	; Quarter-round on x2, x6, x10, x14.
	paddd	xmm0,xmm2
	movdqa	xmm1,xmm3
	pslld	xmm3,7
	psrld	xmm1,25
	pxor	xmm6,xmm0
	por	xmm3,xmm1
	movdqa	[16+ebx],xmm5
	pshufb	xmm6,[eax]
	movdqa	[ebx-48],xmm3
	paddd	xmm4,xmm6
	movdqa	xmm5,[48+ebx]
	pxor	xmm2,xmm4
	movdqa	xmm3,[ebx-16]
	movdqa	xmm1,xmm2
	pslld	xmm2,12
	psrld	xmm1,20
	por	xmm2,xmm1
	movdqa	xmm1,[ebx-80]
	paddd	xmm0,xmm2
	movdqa	xmm7,[112+ebx]
	pxor	xmm6,xmm0
	movdqa	[ebx-96],xmm0
	pshufb	xmm6,[16+eax]
	paddd	xmm4,xmm6
	movdqa	[96+ebx],xmm6
	pxor	xmm2,xmm4
	; Quarter-round on x3, x7, x11, x15.
	paddd	xmm1,xmm3
	movdqa	xmm0,xmm2
	pslld	xmm2,7
	psrld	xmm0,25
	pxor	xmm7,xmm1
	por	xmm2,xmm0
	pshufb	xmm7,[eax]
	movdqa	[ebx-32],xmm2
	paddd	xmm5,xmm7
	pxor	xmm3,xmm5
	movdqa	xmm2,[ebx-48]
	movdqa	xmm0,xmm3
	pslld	xmm3,12
	psrld	xmm0,20
	por	xmm3,xmm0
	movdqa	xmm0,[ebx-128]
	paddd	xmm1,xmm3
	pxor	xmm7,xmm1
	movdqa	[ebx-80],xmm1
	pshufb	xmm7,[16+eax]
	paddd	xmm5,xmm7
	movdqa	xmm6,xmm7
	pxor	xmm3,xmm5
	; Quarter-round on x0, x5, x10, x15.
	paddd	xmm0,xmm2
	movdqa	xmm1,xmm3
	pslld	xmm3,7
	psrld	xmm1,25
	pxor	xmm6,xmm0
	por	xmm3,xmm1
	pshufb	xmm6,[eax]
	movdqa	[ebx-16],xmm3
	paddd	xmm4,xmm6
	pxor	xmm2,xmm4
	movdqa	xmm3,[ebx-32]
	movdqa	xmm1,xmm2
	pslld	xmm2,12
	psrld	xmm1,20
	por	xmm2,xmm1
	movdqa	xmm1,[ebx-112]
	paddd	xmm0,xmm2
	movdqa	xmm7,[64+ebx]
	pxor	xmm6,xmm0
	movdqa	[ebx-128],xmm0
	pshufb	xmm6,[16+eax]
	paddd	xmm4,xmm6
	movdqa	[112+ebx],xmm6
	pxor	xmm2,xmm4
	; Quarter-round on x1, x6, x11, x12.
	paddd	xmm1,xmm3
	movdqa	xmm0,xmm2
	pslld	xmm2,7
	psrld	xmm0,25
	pxor	xmm7,xmm1
	por	xmm2,xmm0
	movdqa	[32+ebx],xmm4
	pshufb	xmm7,[eax]
	movdqa	[ebx-48],xmm2
	paddd	xmm5,xmm7
	movdqa	xmm4,[ebx]
	pxor	xmm3,xmm5
	movdqa	xmm2,[ebx-16]
	movdqa	xmm0,xmm3
	pslld	xmm3,12
	psrld	xmm0,20
	por	xmm3,xmm0
	movdqa	xmm0,[ebx-96]
	paddd	xmm1,xmm3
	movdqa	xmm6,[80+ebx]
	pxor	xmm7,xmm1
	movdqa	[ebx-112],xmm1
	pshufb	xmm7,[16+eax]
	paddd	xmm5,xmm7
	movdqa	[64+ebx],xmm7
	pxor	xmm3,xmm5
	; Quarter-round on x2, x7, x8, x13.
	paddd	xmm0,xmm2
	movdqa	xmm1,xmm3
	pslld	xmm3,7
	psrld	xmm1,25
	pxor	xmm6,xmm0
	por	xmm3,xmm1
	movdqa	[48+ebx],xmm5
	pshufb	xmm6,[eax]
	movdqa	[ebx-32],xmm3
	paddd	xmm4,xmm6
	movdqa	xmm5,[16+ebx]
	pxor	xmm2,xmm4
	movdqa	xmm3,[ebx-64]
	movdqa	xmm1,xmm2
	pslld	xmm2,12
	psrld	xmm1,20
	por	xmm2,xmm1
	movdqa	xmm1,[ebx-80]
	paddd	xmm0,xmm2
	movdqa	xmm7,[96+ebx]
	pxor	xmm6,xmm0
	movdqa	[ebx-96],xmm0
	pshufb	xmm6,[16+eax]
	paddd	xmm4,xmm6
	movdqa	[80+ebx],xmm6
	pxor	xmm2,xmm4
	; Quarter-round on x3, x4, x9, x14; x0 and x12 are reloaded into
	; xmm0 and xmm6 for the next iteration.
	paddd	xmm1,xmm3
	movdqa	xmm0,xmm2
	pslld	xmm2,7
	psrld	xmm0,25
	pxor	xmm7,xmm1
	por	xmm2,xmm0
	pshufb	xmm7,[eax]
	movdqa	[ebx-16],xmm2
	paddd	xmm5,xmm7
	pxor	xmm3,xmm5
	movdqa	xmm0,xmm3
	pslld	xmm3,12
	psrld	xmm0,20
	por	xmm3,xmm0
	movdqa	xmm0,[ebx-128]
	paddd	xmm1,xmm3
	movdqa	xmm6,[64+ebx]
	pxor	xmm7,xmm1
	movdqa	[ebx-80],xmm1
	pshufb	xmm7,[16+eax]
	paddd	xmm5,xmm7
	movdqa	[96+ebx],xmm7
	pxor	xmm3,xmm5
	movdqa	xmm1,xmm3
	pslld	xmm3,7
	psrld	xmm1,25
	por	xmm3,xmm1
	dec	edx
	jnz	NEAR L$010loop
	; Flush the words still held in registers (x4, x8, x9, x12, x14).
	movdqa	[ebx-64],xmm3
	movdqa	[ebx],xmm4
	movdqa	[16+ebx],xmm5
	movdqa	[64+ebx],xmm6
	movdqa	[96+ebx],xmm7
	; Output, four state words at a time: add the initial state, transpose
	; the 4x4 dword matrix so each register holds 16 consecutive bytes of
	; one block, XOR with the source and store.  The four blocks sit 64
	; bytes apart, hence the -128/-64/0/+64 offsets from the biased
	; pointers.
	; Words x0..x3.
	movdqa	xmm1,[ebx-112]
	movdqa	xmm2,[ebx-96]
	movdqa	xmm3,[ebx-80]
	paddd	xmm0,[ebp-128]
	paddd	xmm1,[ebp-112]
	paddd	xmm2,[ebp-96]
	paddd	xmm3,[ebp-80]
	movdqa	xmm6,xmm0
	punpckldq	xmm0,xmm1
	movdqa	xmm7,xmm2
	punpckldq	xmm2,xmm3
	punpckhdq	xmm6,xmm1
	punpckhdq	xmm7,xmm3
	movdqa	xmm1,xmm0
	punpcklqdq	xmm0,xmm2
	movdqa	xmm3,xmm6
	punpcklqdq	xmm6,xmm7
	punpckhqdq	xmm1,xmm2
	punpckhqdq	xmm3,xmm7
	movdqu	xmm4,[esi-128]
	movdqu	xmm5,[esi-64]
	movdqu	xmm2,[esi]
	movdqu	xmm7,[64+esi]
	lea	esi,[16+esi]
	pxor	xmm4,xmm0
	movdqa	xmm0,[ebx-64]
	pxor	xmm5,xmm1
	movdqa	xmm1,[ebx-48]
	pxor	xmm6,xmm2
	movdqa	xmm2,[ebx-32]
	pxor	xmm7,xmm3
	movdqa	xmm3,[ebx-16]
	movdqu	[edi-128],xmm4
	movdqu	[edi-64],xmm5
	movdqu	[edi],xmm6
	movdqu	[64+edi],xmm7
	lea	edi,[16+edi]
	; Words x4..x7.
	paddd	xmm0,[ebp-64]
	paddd	xmm1,[ebp-48]
	paddd	xmm2,[ebp-32]
	paddd	xmm3,[ebp-16]
	movdqa	xmm6,xmm0
	punpckldq	xmm0,xmm1
	movdqa	xmm7,xmm2
	punpckldq	xmm2,xmm3
	punpckhdq	xmm6,xmm1
	punpckhdq	xmm7,xmm3
	movdqa	xmm1,xmm0
	punpcklqdq	xmm0,xmm2
	movdqa	xmm3,xmm6
	punpcklqdq	xmm6,xmm7
	punpckhqdq	xmm1,xmm2
	punpckhqdq	xmm3,xmm7
	movdqu	xmm4,[esi-128]
	movdqu	xmm5,[esi-64]
	movdqu	xmm2,[esi]
	movdqu	xmm7,[64+esi]
	lea	esi,[16+esi]
	pxor	xmm4,xmm0
	movdqa	xmm0,[ebx]
	pxor	xmm5,xmm1
	movdqa	xmm1,[16+ebx]
	pxor	xmm6,xmm2
	movdqa	xmm2,[32+ebx]
	pxor	xmm7,xmm3
	movdqa	xmm3,[48+ebx]
	movdqu	[edi-128],xmm4
	movdqu	[edi-64],xmm5
	movdqu	[edi],xmm6
	movdqu	[64+edi],xmm7
	lea	edi,[16+edi]
	; Words x8..x11.
	paddd	xmm0,[ebp]
	paddd	xmm1,[16+ebp]
	paddd	xmm2,[32+ebp]
	paddd	xmm3,[48+ebp]
	movdqa	xmm6,xmm0
	punpckldq	xmm0,xmm1
	movdqa	xmm7,xmm2
	punpckldq	xmm2,xmm3
	punpckhdq	xmm6,xmm1
	punpckhdq	xmm7,xmm3
	movdqa	xmm1,xmm0
	punpcklqdq	xmm0,xmm2
	movdqa	xmm3,xmm6
	punpcklqdq	xmm6,xmm7
	punpckhqdq	xmm1,xmm2
	punpckhqdq	xmm3,xmm7
	movdqu	xmm4,[esi-128]
	movdqu	xmm5,[esi-64]
	movdqu	xmm2,[esi]
	movdqu	xmm7,[64+esi]
	lea	esi,[16+esi]
	pxor	xmm4,xmm0
	movdqa	xmm0,[64+ebx]
	pxor	xmm5,xmm1
	movdqa	xmm1,[80+ebx]
	pxor	xmm6,xmm2
	movdqa	xmm2,[96+ebx]
	pxor	xmm7,xmm3
	movdqa	xmm3,[112+ebx]
	movdqu	[edi-128],xmm4
	movdqu	[edi-64],xmm5
	movdqu	[edi],xmm6
	movdqu	[64+edi],xmm7
	lea	edi,[16+edi]
	; Words x12..x15.
	paddd	xmm0,[64+ebp]
	paddd	xmm1,[80+ebp]
	paddd	xmm2,[96+ebp]
	paddd	xmm3,[112+ebp]
	movdqa	xmm6,xmm0
	punpckldq	xmm0,xmm1
	movdqa	xmm7,xmm2
	punpckldq	xmm2,xmm3
	punpckhdq	xmm6,xmm1
	punpckhdq	xmm7,xmm3
	movdqa	xmm1,xmm0
	punpcklqdq	xmm0,xmm2
	movdqa	xmm3,xmm6
	punpcklqdq	xmm6,xmm7
	punpckhqdq	xmm1,xmm2
	punpckhqdq	xmm3,xmm7
	movdqu	xmm4,[esi-128]
	movdqu	xmm5,[esi-64]
	movdqu	xmm2,[esi]
	movdqu	xmm7,[64+esi]
	lea	esi,[208+esi]			; 3*16 + 208 = 256 bytes per iteration
	pxor	xmm4,xmm0
	pxor	xmm5,xmm1
	pxor	xmm6,xmm2
	pxor	xmm7,xmm3
	movdqu	[edi-128],xmm4
	movdqu	[edi-64],xmm5
	movdqu	[edi],xmm6
	movdqu	[64+edi],xmm7
	lea	edi,[208+edi]
	sub	ecx,256
	jnc	NEAR L$009outer_loop		; loop while 256 or more bytes remained
	add	ecx,256				; undo the final subtraction
	jz	NEAR L$011done
	; Fewer than 256 bytes remain: hand over to the 1-block path.
	mov	ebx,DWORD [520+esp]
	lea	esi,[esi-128]			; remove the +128 pointer bias
	mov	edx,DWORD [516+esp]
	lea	edi,[edi-128]
	; Counter block for the next block: lane-0 counter of the last 4-block
	; group plus 4, merged with words 1..3 of the caller's counter block.
	movd	xmm2,DWORD [64+ebp]
	movdqu	xmm3,[ebx]
	paddd	xmm2,[96+eax]
	pand	xmm3,[112+eax]
	por	xmm3,xmm2
L$0081x:
	; 1-block path.  xmm3 = counter block, edx = key pointer.  The initial
	; state goes to [0..63+esp]; xmm6/xmm7 hold the rotate-by-16 and
	; rotate-by-8 shuffle masks.
	movdqa	xmm0,[32+eax]
	movdqu	xmm1,[edx]
	movdqu	xmm2,[16+edx]
	movdqa	xmm6,[eax]
	movdqa	xmm7,[16+eax]
	mov	DWORD [48+esp],ebp		; overwritten by the xmm3 store below
	movdqa	[esp],xmm0
	movdqa	[16+esp],xmm1
	movdqa	[32+esp],xmm2
	movdqa	[48+esp],xmm3
	mov	edx,10
	jmp	NEAR L$012loop1x
align	16
L$013outer1x:
	; Next block: reload the initial state and add {1,0,0,0} to the
	; counter row.
	movdqa	xmm3,[80+eax]
	movdqa	xmm0,[esp]
	movdqa	xmm1,[16+esp]
	movdqa	xmm2,[32+esp]
	paddd	xmm3,[48+esp]
	mov	edx,10
	movdqa	[48+esp],xmm3
	jmp	NEAR L$012loop1x
align	16
L$012loop1x:
	; xmm0..xmm3 hold state rows 0..3.  One iteration applies the
	; quarter-round to all four columns, rotates rows 1..3 with pshufd,
	; applies it again, then rotates the rows back.
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
db	102,15,56,0,222				; pshufb xmm3,xmm6 (rotate left 16)
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,20
	pslld	xmm4,12
	por	xmm1,xmm4
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
db	102,15,56,0,223				; pshufb xmm3,xmm7 (rotate left 8)
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,25
	pslld	xmm4,7
	por	xmm1,xmm4
	pshufd	xmm2,xmm2,78
	pshufd	xmm1,xmm1,57
	pshufd	xmm3,xmm3,147
	nop
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
db	102,15,56,0,222				; pshufb xmm3,xmm6 (rotate left 16)
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,20
	pslld	xmm4,12
	por	xmm1,xmm4
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
db	102,15,56,0,223				; pshufb xmm3,xmm7 (rotate left 8)
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,25
	pslld	xmm4,7
	por	xmm1,xmm4
	pshufd	xmm2,xmm2,78
	pshufd	xmm1,xmm1,147
	pshufd	xmm3,xmm3,57
	dec	edx
	jnz	NEAR L$012loop1x
	; Add the initial state to obtain the keystream block.
	paddd	xmm0,[esp]
	paddd	xmm1,[16+esp]
	paddd	xmm2,[32+esp]
	paddd	xmm3,[48+esp]
	cmp	ecx,64
	jb	NEAR L$014tail
	; Full 64-byte block: XOR with the source and store.
	movdqu	xmm4,[esi]
	movdqu	xmm5,[16+esi]
	pxor	xmm0,xmm4
	movdqu	xmm4,[32+esi]
	pxor	xmm1,xmm5
	movdqu	xmm5,[48+esi]
	pxor	xmm2,xmm4
	pxor	xmm3,xmm5
	lea	esi,[64+esi]
	movdqu	[edi],xmm0
	movdqu	[16+edi],xmm1
	movdqu	[32+edi],xmm2
	movdqu	[48+edi],xmm3
	lea	edi,[64+edi]
	sub	ecx,64
	jnz	NEAR L$013outer1x
	jmp	NEAR L$011done
L$014tail:
	; Final partial block: spill the keystream to [0..63+esp] and XOR the
	; remaining ecx bytes one at a time.
	movdqa	[esp],xmm0
	movdqa	[16+esp],xmm1
	movdqa	[32+esp],xmm2
	movdqa	[48+esp],xmm3
	xor	eax,eax
	xor	edx,edx
	xor	ebp,ebp				; ebp = byte index
L$015tail_loop:
	mov	al,BYTE [ebp*1+esp]		; keystream byte
	mov	dl,BYTE [ebp*1+esi]		; source byte
	lea	ebp,[1+ebp]
	xor	al,dl
	mov	BYTE [ebp*1+edi-1],al		; -1 compensates for the increment above
	dec	ecx
	jnz	NEAR L$015tail_loop
L$011done:
	mov	esp,DWORD [512+esp]		; drop the aligned frame
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
; Constant table for the SSSE3 path, addressed through eax.  Each row is
; 16 bytes and is read with aligned loads, hence the 64-byte alignment.
align	64
L$ssse3_data:
; +0: pshufb mask rotating every dword left by 16 bits.
db	2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
; +16: pshufb mask rotating every dword left by 8 bits.
db	3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
; +32: the four constant state words (ASCII "expand 32-byte k").
dd	1634760805,857760878,2036477234,1797285236
; +48: per-lane counter offsets for the 4-block path.
dd	0,1,2,3
; +64: counter step applied to every lane per 4-block iteration.
dd	4,4,4,4
; +80: counter increment for the 1-block path.
dd	1,0,0,0
; +96: added to the lane-0 counter when leaving the 4-block path.
dd	4,0,0,0
; +112: mask that clears word 0 and keeps words 1..3 of the counter block.
dd	0,-1,-1,-1
align	64
; NUL-terminated ASCII: "ChaCha20 for x86, CRYPTOGAMS by <appro@openssl.org>"
db	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
db	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
db	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
db	114,103,62,0
; 16-byte common symbol; _GFp_ChaCha20_ctr32 tests its first two dwords to
; choose between the integer and SSSE3 code paths.
segment	.bss
common	_GFp_ia32cap_P 16