; This file is generated from a similarly-named Perl script in the BoringSSL
; source tree. Do not edit by hand.

; Optional symbol prefixing: only pulled in when the build defines
; BORINGSSL_PREFIX.
%ifdef BORINGSSL_PREFIX
%include "boringssl_prefix_symbols_nasm.inc"
%endif
; Pick the code section declaration that matches the NASM output format.
%ifidn __OUTPUT_FORMAT__,obj
section	code	use32 class=code align=64
%elifidn __OUTPUT_FORMAT__,win32
; NOTE(review): @feat.00 = 1 is the NASM-documented way to mark a win32
; object as SAFESEH-compatible -- confirm against the NASM manual.
$@feat.00 equ 1
section	.text	code align=64
%else
section	.text	code
%endif
; ---------------------------------------------------------------------------
; _GFp_ChaCha20_ctr32 -- ChaCha20 keystream XOR for 32-bit x86.
;
; Arguments are read from the stack; the function ends in a plain `ret`, so
; the caller removes them. Offsets below are relative to esp after the four
; register pushes:
;   [20+esp]  destination pointer (written)
;   [24+esp]  source pointer (XORed with the keystream)
;   [28+esp]  byte count; zero returns immediately
;   [32+esp]  pointer to 8 key dwords
;   [36+esp]  pointer to 4 dwords; dword 0 is the 32-bit block counter
; ebp, ebx, esi and edi are saved and restored; eax, ecx, edx are clobbered.
; The dst/src/len argument slots on the stack are overwritten while looping.
;
; Dispatch: when _GFp_ia32cap_P has bit 24 of dword 0 and bit 9 of dword 1
; set (presumably FXSR and SSSE3 in the OpenSSL ia32cap layout -- confirm),
; control jumps to L$ssse3_shortcut with eax = run-time address of
; L$pic_point. Otherwise the scalar code below runs.
;
; Scalar frame after `sub esp,132`:
;   [0..60+esp]      working state x0..x15, x[i] at 4*i
;   [80..108+esp]    key dwords 0..7
;   [112..124+esp]   counter block dwords 0..3
;   [128+esp]        spilled double-round counter
;   [152+esp]        dst, [156+esp] src, [160+esp] remaining length
;                    (these are the caller's argument slots)
; ---------------------------------------------------------------------------
global	_GFp_ChaCha20_ctr32
align	16
_GFp_ChaCha20_ctr32:
L$_GFp_ChaCha20_ctr32_begin:
	push	ebp
	push	ebx
	push	esi
	push	edi
	xor	eax,eax
	cmp	eax,DWORD [28+esp]		; len == 0: nothing to do
	je	NEAR L$000no_data
	; call/pop yields the run-time address of L$pic_point in eax; the SSSE3
	; path uses it to locate L$ssse3_data.
	call	L$pic_point
L$pic_point:
	pop	eax
	lea	ebp,[_GFp_ia32cap_P]
	test	DWORD [ebp],16777216		; bit 24 of capability dword 0
	jz	NEAR L$001x86
	test	DWORD [4+ebp],512		; bit 9 of capability dword 1
	jz	NEAR L$001x86
	jmp	NEAR L$ssse3_shortcut
L$001x86:
	mov	esi,DWORD [32+esp]		; esi = key pointer
	mov	edi,DWORD [36+esp]		; edi = counter block pointer
	sub	esp,132
	; Copy key dwords 0..7 to [80..108+esp].
	mov	eax,DWORD [esi]
	mov	ebx,DWORD [4+esi]
	mov	ecx,DWORD [8+esi]
	mov	edx,DWORD [12+esi]
	mov	DWORD [80+esp],eax
	mov	DWORD [84+esp],ebx
	mov	DWORD [88+esp],ecx
	mov	DWORD [92+esp],edx
	mov	eax,DWORD [16+esi]
	mov	ebx,DWORD [20+esi]
	mov	ecx,DWORD [24+esi]
	mov	edx,DWORD [28+esi]
	mov	DWORD [96+esp],eax
	mov	DWORD [100+esp],ebx
	mov	DWORD [104+esp],ecx
	mov	DWORD [108+esp],edx
	; Copy the counter block to [112..124+esp]. The counter is stored
	; pre-decremented because L$002entry increments it before every block.
	mov	eax,DWORD [edi]
	mov	ebx,DWORD [4+edi]
	mov	ecx,DWORD [8+edi]
	mov	edx,DWORD [12+edi]
	sub	eax,1
	mov	DWORD [112+esp],eax
	mov	DWORD [116+esp],ebx
	mov	DWORD [120+esp],ecx
	mov	DWORD [124+esp],edx
	jmp	NEAR L$002entry
align	16
L$003outer_loop:
	; Reached after a full 64-byte block: ebx = src, eax = dst, ecx = bytes
	; left. Store them back into the argument slots.
	mov	DWORD [156+esp],ebx
	mov	DWORD [152+esp],eax
	mov	DWORD [160+esp],ecx
L$002entry:
	; Build the initial state for this block. x0, x4, x8, x9 and x12 start
	; in eax, ebp, ecx, esi and edx; the remaining words live at [4*i+esp].
	; The four immediates are the ChaCha constant words x0..x3.
	mov	eax,1634760805
	mov	DWORD [4+esp],857760878
	mov	DWORD [8+esp],2036477234
	mov	DWORD [12+esp],1797285236
	mov	ebx,DWORD [84+esp]
	mov	ebp,DWORD [88+esp]
	mov	ecx,DWORD [104+esp]
	mov	esi,DWORD [108+esp]
	mov	edx,DWORD [116+esp]
	mov	edi,DWORD [120+esp]
	mov	DWORD [20+esp],ebx
	mov	DWORD [24+esp],ebp
	mov	DWORD [40+esp],ecx
	mov	DWORD [44+esp],esi
	mov	DWORD [52+esp],edx
	mov	DWORD [56+esp],edi
	mov	ebx,DWORD [92+esp]
	mov	edi,DWORD [124+esp]
	mov	edx,DWORD [112+esp]
	mov	ebp,DWORD [80+esp]
	mov	ecx,DWORD [96+esp]
	mov	esi,DWORD [100+esp]
	add	edx,1				; advance the block counter
	mov	DWORD [28+esp],ebx
	mov	DWORD [60+esp],edi
	mov	DWORD [112+esp],edx
	mov	ebx,10				; 10 double rounds
	jmp	NEAR L$004loop
align	16
L$004loop:
	; One double round: four column quarter-rounds followed by four
	; diagonal quarter-rounds, software-pipelined so that the loads and
	; stores of neighbouring quarter-rounds overlap. Each quarter-round
	; uses rotations by 16, 12, 8 and 7. The "begin QR" comments mark
	; where each quarter-round's first add is issued.
	add	eax,ebp				; begin QR(x0,x4,x8,x12)
	mov	DWORD [128+esp],ebx		; spill the round counter
	mov	ebx,ebp
	xor	edx,eax
	rol	edx,16
	add	ecx,edx
	xor	ebx,ecx
	mov	edi,DWORD [52+esp]
	rol	ebx,12
	mov	ebp,DWORD [20+esp]
	add	eax,ebx
	xor	edx,eax
	mov	DWORD [esp],eax
	rol	edx,8
	mov	eax,DWORD [4+esp]
	add	ecx,edx
	mov	DWORD [48+esp],edx
	xor	ebx,ecx
	add	eax,ebp				; begin QR(x1,x5,x9,x13)
	rol	ebx,7
	xor	edi,eax
	mov	DWORD [32+esp],ecx
	rol	edi,16
	mov	DWORD [16+esp],ebx
	add	esi,edi
	mov	ecx,DWORD [40+esp]
	xor	ebp,esi
	mov	edx,DWORD [56+esp]
	rol	ebp,12
	mov	ebx,DWORD [24+esp]
	add	eax,ebp
	xor	edi,eax
	mov	DWORD [4+esp],eax
	rol	edi,8
	mov	eax,DWORD [8+esp]
	add	esi,edi
	mov	DWORD [52+esp],edi
	xor	ebp,esi
	add	eax,ebx				; begin QR(x2,x6,x10,x14)
	rol	ebp,7
	xor	edx,eax
	mov	DWORD [36+esp],esi
	rol	edx,16
	mov	DWORD [20+esp],ebp
	add	ecx,edx
	mov	esi,DWORD [44+esp]
	xor	ebx,ecx
	mov	edi,DWORD [60+esp]
	rol	ebx,12
	mov	ebp,DWORD [28+esp]
	add	eax,ebx
	xor	edx,eax
	mov	DWORD [8+esp],eax
	rol	edx,8
	mov	eax,DWORD [12+esp]
	add	ecx,edx
	mov	DWORD [56+esp],edx
	xor	ebx,ecx
	add	eax,ebp				; begin QR(x3,x7,x11,x15)
	rol	ebx,7
	xor	edi,eax
	rol	edi,16
	mov	DWORD [24+esp],ebx
	add	esi,edi
	xor	ebp,esi
	rol	ebp,12
	mov	ebx,DWORD [20+esp]
	add	eax,ebp
	xor	edi,eax
	mov	DWORD [12+esp],eax
	rol	edi,8
	mov	eax,DWORD [esp]
	add	esi,edi
	mov	edx,edi				; x15 stays in a register for the next QR
	xor	ebp,esi
	add	eax,ebx				; begin QR(x0,x5,x10,x15)
	rol	ebp,7
	xor	edx,eax
	rol	edx,16
	mov	DWORD [28+esp],ebp
	add	ecx,edx
	xor	ebx,ecx
	mov	edi,DWORD [48+esp]
	rol	ebx,12
	mov	ebp,DWORD [24+esp]
	add	eax,ebx
	xor	edx,eax
	mov	DWORD [esp],eax
	rol	edx,8
	mov	eax,DWORD [4+esp]
	add	ecx,edx
	mov	DWORD [60+esp],edx
	xor	ebx,ecx
	add	eax,ebp				; begin QR(x1,x6,x11,x12)
	rol	ebx,7
	xor	edi,eax
	mov	DWORD [40+esp],ecx
	rol	edi,16
	mov	DWORD [20+esp],ebx
	add	esi,edi
	mov	ecx,DWORD [32+esp]
	xor	ebp,esi
	mov	edx,DWORD [52+esp]
	rol	ebp,12
	mov	ebx,DWORD [28+esp]
	add	eax,ebp
	xor	edi,eax
	mov	DWORD [4+esp],eax
	rol	edi,8
	mov	eax,DWORD [8+esp]
	add	esi,edi
	mov	DWORD [48+esp],edi
	xor	ebp,esi
	add	eax,ebx				; begin QR(x2,x7,x8,x13)
	rol	ebp,7
	xor	edx,eax
	mov	DWORD [44+esp],esi
	rol	edx,16
	mov	DWORD [24+esp],ebp
	add	ecx,edx
	mov	esi,DWORD [36+esp]
	xor	ebx,ecx
	mov	edi,DWORD [56+esp]
	rol	ebx,12
	mov	ebp,DWORD [16+esp]
	add	eax,ebx
	xor	edx,eax
	mov	DWORD [8+esp],eax
	rol	edx,8
	mov	eax,DWORD [12+esp]
	add	ecx,edx
	mov	DWORD [52+esp],edx
	xor	ebx,ecx
	add	eax,ebp				; begin QR(x3,x4,x9,x14)
	rol	ebx,7
	xor	edi,eax
	rol	edi,16
	mov	DWORD [28+esp],ebx
	add	esi,edi
	xor	ebp,esi
	mov	edx,DWORD [48+esp]
	rol	ebp,12
	mov	ebx,DWORD [128+esp]		; reload the round counter
	add	eax,ebp
	xor	edi,eax
	mov	DWORD [12+esp],eax
	rol	edi,8
	mov	eax,DWORD [esp]
	add	esi,edi
	mov	DWORD [56+esp],edi
	xor	ebp,esi
	rol	ebp,7
	dec	ebx
	jnz	NEAR L$004loop
	; Rounds done. Live registers: eax=x0, ebp=x4, ecx=x8, esi=x9, edx=x12,
	; edi=x14. Add the input state back in.
	mov	ebx,DWORD [160+esp]		; ebx = bytes remaining
	add	eax,1634760805
	add	ebp,DWORD [80+esp]
	add	ecx,DWORD [96+esp]
	add	esi,DWORD [100+esp]
	cmp	ebx,64
	jb	NEAR L$005tail			; fewer than 64 bytes left
	; Full block: XOR 64 source bytes with the keystream, five words at a
	; time, and store to the destination.
	mov	ebx,DWORD [156+esp]		; ebx = src
	add	edx,DWORD [112+esp]
	add	edi,DWORD [120+esp]
	xor	eax,DWORD [ebx]
	xor	ebp,DWORD [16+ebx]
	mov	DWORD [esp],eax			; park word 0; eax is needed for dst
	mov	eax,DWORD [152+esp]		; eax = dst
	xor	ecx,DWORD [32+ebx]
	xor	esi,DWORD [36+ebx]
	xor	edx,DWORD [48+ebx]
	xor	edi,DWORD [56+ebx]
	mov	DWORD [16+eax],ebp
	mov	DWORD [32+eax],ecx
	mov	DWORD [36+eax],esi
	mov	DWORD [48+eax],edx
	mov	DWORD [56+eax],edi
	mov	ebp,DWORD [4+esp]
	mov	ecx,DWORD [8+esp]
	mov	esi,DWORD [12+esp]
	mov	edx,DWORD [20+esp]
	mov	edi,DWORD [24+esp]
	add	ebp,857760878
	add	ecx,2036477234
	add	esi,1797285236
	add	edx,DWORD [84+esp]
	add	edi,DWORD [88+esp]
	xor	ebp,DWORD [4+ebx]
	xor	ecx,DWORD [8+ebx]
	xor	esi,DWORD [12+ebx]
	xor	edx,DWORD [20+ebx]
	xor	edi,DWORD [24+ebx]
	mov	DWORD [4+eax],ebp
	mov	DWORD [8+eax],ecx
	mov	DWORD [12+eax],esi
	mov	DWORD [20+eax],edx
	mov	DWORD [24+eax],edi
	mov	ebp,DWORD [28+esp]
	mov	ecx,DWORD [40+esp]
	mov	esi,DWORD [44+esp]
	mov	edx,DWORD [52+esp]
	mov	edi,DWORD [60+esp]
	add	ebp,DWORD [92+esp]
	add	ecx,DWORD [104+esp]
	add	esi,DWORD [108+esp]
	add	edx,DWORD [116+esp]
	add	edi,DWORD [124+esp]
	xor	ebp,DWORD [28+ebx]
	xor	ecx,DWORD [40+ebx]
	xor	esi,DWORD [44+ebx]
	xor	edx,DWORD [52+ebx]
	xor	edi,DWORD [60+ebx]
	lea	ebx,[64+ebx]			; src += 64
	mov	DWORD [28+eax],ebp
	mov	ebp,DWORD [esp]			; recover parked word 0
	mov	DWORD [40+eax],ecx
	mov	ecx,DWORD [160+esp]
	mov	DWORD [44+eax],esi
	mov	DWORD [52+eax],edx
	mov	DWORD [60+eax],edi
	mov	DWORD [eax],ebp
	lea	eax,[64+eax]			; dst += 64
	sub	ecx,64
	jnz	NEAR L$003outer_loop
	jmp	NEAR L$006done
L$005tail:
	; Partial final block: materialise all 16 keystream words at
	; [0..60+esp], then XOR byte by byte. ebx = bytes remaining (1..63).
	add	edx,DWORD [112+esp]
	add	edi,DWORD [120+esp]
	mov	DWORD [esp],eax
	mov	DWORD [16+esp],ebp
	mov	DWORD [32+esp],ecx
	mov	DWORD [36+esp],esi
	mov	DWORD [48+esp],edx
	mov	DWORD [56+esp],edi
	mov	ebp,DWORD [4+esp]
	mov	ecx,DWORD [8+esp]
	mov	esi,DWORD [12+esp]
	mov	edx,DWORD [20+esp]
	mov	edi,DWORD [24+esp]
	add	ebp,857760878
	add	ecx,2036477234
	add	esi,1797285236
	add	edx,DWORD [84+esp]
	add	edi,DWORD [88+esp]
	mov	DWORD [4+esp],ebp
	mov	DWORD [8+esp],ecx
	mov	DWORD [12+esp],esi
	mov	DWORD [20+esp],edx
	mov	DWORD [24+esp],edi
	mov	ebp,DWORD [28+esp]
	mov	ecx,DWORD [40+esp]
	mov	esi,DWORD [44+esp]
	mov	edx,DWORD [52+esp]
	mov	edi,DWORD [60+esp]
	add	ebp,DWORD [92+esp]
	add	ecx,DWORD [104+esp]
	add	esi,DWORD [108+esp]
	add	edx,DWORD [116+esp]
	add	edi,DWORD [124+esp]
	mov	DWORD [28+esp],ebp
	mov	ebp,DWORD [156+esp]		; ebp = src
	mov	DWORD [40+esp],ecx
	mov	ecx,DWORD [152+esp]		; ecx = dst
	mov	DWORD [44+esp],esi
	xor	esi,esi				; esi = byte index
	mov	DWORD [52+esp],edx
	mov	DWORD [60+esp],edi
	xor	eax,eax
	xor	edx,edx
L$007tail_loop:
	mov	al,BYTE [ebp*1+esi]		; source byte
	mov	dl,BYTE [esi*1+esp]		; keystream byte
	lea	esi,[1+esi]
	xor	al,dl
	mov	BYTE [esi*1+ecx-1],al		; esi was already advanced, hence -1
	dec	ebx
	jnz	NEAR L$007tail_loop
L$006done:
	add	esp,132
L$000no_data:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
; ---------------------------------------------------------------------------
; __ChaCha20_ssse3 -- SSSE3 code path. It reads the same five stack
; arguments as _GFp_ChaCha20_ctr32 (dst, src, len, key pointer, counter
; pointer at [20..36+esp] after the four pushes).
;
; L$ssse3_shortcut is the entry used by _GFp_ChaCha20_ctr32, which arrives
; with the four registers already pushed and eax = run-time address of
; L$pic_point; eax is rebased to L$ssse3_data below and stays there.
; NOTE(review): the __ChaCha20_ssse3 label itself does not establish eax and
; is not declared global in this file; no caller of that label is visible.
;
; Frame after `sub esp,524 / and esp,-64` (esp is 64-byte aligned):
;   [0..255+esp]     working state, addressed as [ebx-128 .. ebx+127]
;   [256..511+esp]   input state, one dword per 16-byte slot replicated
;                    across four lanes, addressed as [ebp-128 .. ebp+127]
;   [512+esp]        caller's esp
;   [516+esp]        key pointer
;   [520+esp]        counter block pointer
; While at least 256 bytes remain, four blocks are produced per iteration
; of L$009outer_loop. Anything shorter goes through the one-block code at
; L$0081x, which keeps its input state at [0..63+esp].
; ebp, ebx, esi, edi are restored on exit; eax, ecx, edx and xmm0-xmm7 are
; clobbered.
; ---------------------------------------------------------------------------
align	16
__ChaCha20_ssse3:
	push	ebp
	push	ebx
	push	esi
	push	edi
L$ssse3_shortcut:
	mov	edi,DWORD [20+esp]		; edi = dst
	mov	esi,DWORD [24+esp]		; esi = src
	mov	ecx,DWORD [28+esp]		; ecx = len
	mov	edx,DWORD [32+esp]		; edx = key pointer
	mov	ebx,DWORD [36+esp]		; ebx = counter block pointer
	mov	ebp,esp
	sub	esp,524
	and	esp,-64
	mov	DWORD [512+esp],ebp		; remember the caller's esp
	lea	eax,[(L$ssse3_data-L$pic_point)+eax]	; eax = &L$ssse3_data
	movdqu	xmm3,[ebx]			; xmm3 = counter block
	cmp	ecx,256
	jb	NEAR L$0081x
	mov	DWORD [516+esp],edx
	mov	DWORD [520+esp],ebx
	sub	ecx,256
	lea	ebp,[384+esp]
	; Replicate each input dword across four lanes. Lane i of the counter
	; word gets counter+i; 4 is subtracted up front because the outer loop
	; adds 4 before every batch.
	movdqu	xmm7,[edx]
	pshufd	xmm0,xmm3,0
	pshufd	xmm1,xmm3,85
	pshufd	xmm2,xmm3,170
	pshufd	xmm3,xmm3,255
	paddd	xmm0,[48+eax]			; + {0,1,2,3}
	pshufd	xmm4,xmm7,0
	pshufd	xmm5,xmm7,85
	psubd	xmm0,[64+eax]			; - {4,4,4,4}
	pshufd	xmm6,xmm7,170
	pshufd	xmm7,xmm7,255
	movdqa	[64+ebp],xmm0
	movdqa	[80+ebp],xmm1
	movdqa	[96+ebp],xmm2
	movdqa	[112+ebp],xmm3
	movdqu	xmm3,[16+edx]
	movdqa	[ebp-64],xmm4
	movdqa	[ebp-48],xmm5
	movdqa	[ebp-32],xmm6
	movdqa	[ebp-16],xmm7
	movdqa	xmm7,[32+eax]			; constant words
	lea	ebx,[128+esp]
	pshufd	xmm0,xmm3,0
	pshufd	xmm1,xmm3,85
	pshufd	xmm2,xmm3,170
	pshufd	xmm3,xmm3,255
	pshufd	xmm4,xmm7,0
	pshufd	xmm5,xmm7,85
	pshufd	xmm6,xmm7,170
	pshufd	xmm7,xmm7,255
	movdqa	[ebp],xmm0
	movdqa	[16+ebp],xmm1
	movdqa	[32+ebp],xmm2
	movdqa	[48+ebp],xmm3
	movdqa	[ebp-128],xmm4
	movdqa	[ebp-112],xmm5
	movdqa	[ebp-96],xmm6
	movdqa	[ebp-80],xmm7
	; Bias the data pointers by 128 so the loop can use short signed
	; displacements (-128 .. +64).
	lea	esi,[128+esi]
	lea	edi,[128+edi]
	jmp	NEAR L$009outer_loop
align	16
L$009outer_loop:
	; Copy the input state (ebp-relative) into the working state
	; (ebx-relative). x0, x4, x8, x9 and x12 are not copied: they start in
	; xmm0, xmm3, xmm4, xmm5 and xmm6.
	movdqa	xmm1,[ebp-112]
	movdqa	xmm2,[ebp-96]
	movdqa	xmm3,[ebp-80]
	movdqa	xmm5,[ebp-48]
	movdqa	xmm6,[ebp-32]
	movdqa	xmm7,[ebp-16]
	movdqa	[ebx-112],xmm1
	movdqa	[ebx-96],xmm2
	movdqa	[ebx-80],xmm3
	movdqa	[ebx-48],xmm5
	movdqa	[ebx-32],xmm6
	movdqa	[ebx-16],xmm7
	movdqa	xmm2,[32+ebp]
	movdqa	xmm3,[48+ebp]
	movdqa	xmm4,[64+ebp]
	movdqa	xmm5,[80+ebp]
	movdqa	xmm6,[96+ebp]
	movdqa	xmm7,[112+ebp]
	paddd	xmm4,[64+eax]			; counters += 4 for this batch
	movdqa	[32+ebx],xmm2
	movdqa	[48+ebx],xmm3
	movdqa	[64+ebx],xmm4
	movdqa	[80+ebx],xmm5
	movdqa	[96+ebx],xmm6
	movdqa	[112+ebx],xmm7
	movdqa	[64+ebp],xmm4			; persist the advanced counters
	movdqa	xmm0,[ebp-128]
	movdqa	xmm6,xmm4
	movdqa	xmm3,[ebp-64]
	movdqa	xmm4,[ebp]
	movdqa	xmm5,[16+ebp]
	mov	edx,10				; 10 double rounds
	nop
align	16
L$010loop:
	; One double round over four blocks at once. Rotations by 16 and 8 use
	; pshufb with the masks at [eax] and [16+eax]; rotations by 12 and 7
	; use a shift-left / shift-right / or triple. Quarter-rounds are
	; interleaved; the "begin QR" comments mark each one's first add.
	paddd	xmm0,xmm3			; begin QR(x0,x4,x8,x12)
	movdqa	xmm2,xmm3
	pxor	xmm6,xmm0
	pshufb	xmm6,[eax]
	paddd	xmm4,xmm6
	pxor	xmm2,xmm4
	movdqa	xmm3,[ebx-48]
	movdqa	xmm1,xmm2
	pslld	xmm2,12
	psrld	xmm1,20
	por	xmm2,xmm1
	movdqa	xmm1,[ebx-112]
	paddd	xmm0,xmm2
	movdqa	xmm7,[80+ebx]
	pxor	xmm6,xmm0
	movdqa	[ebx-128],xmm0
	pshufb	xmm6,[16+eax]
	paddd	xmm4,xmm6
	movdqa	[64+ebx],xmm6
	pxor	xmm2,xmm4
	paddd	xmm1,xmm3			; begin QR(x1,x5,x9,x13)
	movdqa	xmm0,xmm2
	pslld	xmm2,7
	psrld	xmm0,25
	pxor	xmm7,xmm1
	por	xmm2,xmm0
	movdqa	[ebx],xmm4
	pshufb	xmm7,[eax]
	movdqa	[ebx-64],xmm2
	paddd	xmm5,xmm7
	movdqa	xmm4,[32+ebx]
	pxor	xmm3,xmm5
	movdqa	xmm2,[ebx-32]
	movdqa	xmm0,xmm3
	pslld	xmm3,12
	psrld	xmm0,20
	por	xmm3,xmm0
	movdqa	xmm0,[ebx-96]
	paddd	xmm1,xmm3
	movdqa	xmm6,[96+ebx]
	pxor	xmm7,xmm1
	movdqa	[ebx-112],xmm1
	pshufb	xmm7,[16+eax]
	paddd	xmm5,xmm7
	movdqa	[80+ebx],xmm7
	pxor	xmm3,xmm5
	paddd	xmm0,xmm2			; begin QR(x2,x6,x10,x14)
	movdqa	xmm1,xmm3
	pslld	xmm3,7
	psrld	xmm1,25
	pxor	xmm6,xmm0
	por	xmm3,xmm1
	movdqa	[16+ebx],xmm5
	pshufb	xmm6,[eax]
	movdqa	[ebx-48],xmm3
	paddd	xmm4,xmm6
	movdqa	xmm5,[48+ebx]
	pxor	xmm2,xmm4
	movdqa	xmm3,[ebx-16]
	movdqa	xmm1,xmm2
	pslld	xmm2,12
	psrld	xmm1,20
	por	xmm2,xmm1
	movdqa	xmm1,[ebx-80]
	paddd	xmm0,xmm2
	movdqa	xmm7,[112+ebx]
	pxor	xmm6,xmm0
	movdqa	[ebx-96],xmm0
	pshufb	xmm6,[16+eax]
	paddd	xmm4,xmm6
	movdqa	[96+ebx],xmm6
	pxor	xmm2,xmm4
	paddd	xmm1,xmm3			; begin QR(x3,x7,x11,x15)
	movdqa	xmm0,xmm2
	pslld	xmm2,7
	psrld	xmm0,25
	pxor	xmm7,xmm1
	por	xmm2,xmm0
	pshufb	xmm7,[eax]
	movdqa	[ebx-32],xmm2
	paddd	xmm5,xmm7
	pxor	xmm3,xmm5
	movdqa	xmm2,[ebx-48]
	movdqa	xmm0,xmm3
	pslld	xmm3,12
	psrld	xmm0,20
	por	xmm3,xmm0
	movdqa	xmm0,[ebx-128]
	paddd	xmm1,xmm3
	pxor	xmm7,xmm1
	movdqa	[ebx-80],xmm1
	pshufb	xmm7,[16+eax]
	paddd	xmm5,xmm7
	movdqa	xmm6,xmm7			; x15 stays in a register for the next QR
	pxor	xmm3,xmm5
	paddd	xmm0,xmm2			; begin QR(x0,x5,x10,x15)
	movdqa	xmm1,xmm3
	pslld	xmm3,7
	psrld	xmm1,25
	pxor	xmm6,xmm0
	por	xmm3,xmm1
	pshufb	xmm6,[eax]
	movdqa	[ebx-16],xmm3
	paddd	xmm4,xmm6
	pxor	xmm2,xmm4
	movdqa	xmm3,[ebx-32]
	movdqa	xmm1,xmm2
	pslld	xmm2,12
	psrld	xmm1,20
	por	xmm2,xmm1
	movdqa	xmm1,[ebx-112]
	paddd	xmm0,xmm2
	movdqa	xmm7,[64+ebx]
	pxor	xmm6,xmm0
	movdqa	[ebx-128],xmm0
	pshufb	xmm6,[16+eax]
	paddd	xmm4,xmm6
	movdqa	[112+ebx],xmm6
	pxor	xmm2,xmm4
	paddd	xmm1,xmm3			; begin QR(x1,x6,x11,x12)
	movdqa	xmm0,xmm2
	pslld	xmm2,7
	psrld	xmm0,25
	pxor	xmm7,xmm1
	por	xmm2,xmm0
	movdqa	[32+ebx],xmm4
	pshufb	xmm7,[eax]
	movdqa	[ebx-48],xmm2
	paddd	xmm5,xmm7
	movdqa	xmm4,[ebx]
	pxor	xmm3,xmm5
	movdqa	xmm2,[ebx-16]
	movdqa	xmm0,xmm3
	pslld	xmm3,12
	psrld	xmm0,20
	por	xmm3,xmm0
	movdqa	xmm0,[ebx-96]
	paddd	xmm1,xmm3
	movdqa	xmm6,[80+ebx]
	pxor	xmm7,xmm1
	movdqa	[ebx-112],xmm1
	pshufb	xmm7,[16+eax]
	paddd	xmm5,xmm7
	movdqa	[64+ebx],xmm7
	pxor	xmm3,xmm5
	paddd	xmm0,xmm2			; begin QR(x2,x7,x8,x13)
	movdqa	xmm1,xmm3
	pslld	xmm3,7
	psrld	xmm1,25
	pxor	xmm6,xmm0
	por	xmm3,xmm1
	movdqa	[48+ebx],xmm5
	pshufb	xmm6,[eax]
	movdqa	[ebx-32],xmm3
	paddd	xmm4,xmm6
	movdqa	xmm5,[16+ebx]
	pxor	xmm2,xmm4
	movdqa	xmm3,[ebx-64]
	movdqa	xmm1,xmm2
	pslld	xmm2,12
	psrld	xmm1,20
	por	xmm2,xmm1
	movdqa	xmm1,[ebx-80]
	paddd	xmm0,xmm2
	movdqa	xmm7,[96+ebx]
	pxor	xmm6,xmm0
	movdqa	[ebx-96],xmm0
	pshufb	xmm6,[16+eax]
	paddd	xmm4,xmm6
	movdqa	[80+ebx],xmm6
	pxor	xmm2,xmm4
	paddd	xmm1,xmm3			; begin QR(x3,x4,x9,x14)
	movdqa	xmm0,xmm2
	pslld	xmm2,7
	psrld	xmm0,25
	pxor	xmm7,xmm1
	por	xmm2,xmm0
	pshufb	xmm7,[eax]
	movdqa	[ebx-16],xmm2
	paddd	xmm5,xmm7
	pxor	xmm3,xmm5
	movdqa	xmm0,xmm3
	pslld	xmm3,12
	psrld	xmm0,20
	por	xmm3,xmm0
	movdqa	xmm0,[ebx-128]
	paddd	xmm1,xmm3
	movdqa	xmm6,[64+ebx]
	pxor	xmm7,xmm1
	movdqa	[ebx-80],xmm1
	pshufb	xmm7,[16+eax]
	paddd	xmm5,xmm7
	movdqa	[96+ebx],xmm7
	pxor	xmm3,xmm5
	movdqa	xmm1,xmm3
	pslld	xmm3,7
	psrld	xmm1,25
	por	xmm3,xmm1
	dec	edx
	jnz	NEAR L$010loop
	; Flush the words still held in registers (x4, x8, x9, x12, x14) so the
	; whole working state is in memory.
	movdqa	[ebx-64],xmm3
	movdqa	[ebx],xmm4
	movdqa	[16+ebx],xmm5
	movdqa	[64+ebx],xmm6
	movdqa	[96+ebx],xmm7
	; Output, four state words at a time: add the input state, transpose
	; the 4x4 dword matrix (lane-per-block -> 16 contiguous bytes of one
	; block), XOR with the source and store. The four blocks are 64 bytes
	; apart, hence the -128/-64/0/+64 displacements.
	; Words x0..x3:
	movdqa	xmm1,[ebx-112]
	movdqa	xmm2,[ebx-96]
	movdqa	xmm3,[ebx-80]
	paddd	xmm0,[ebp-128]
	paddd	xmm1,[ebp-112]
	paddd	xmm2,[ebp-96]
	paddd	xmm3,[ebp-80]
	movdqa	xmm6,xmm0
	punpckldq	xmm0,xmm1
	movdqa	xmm7,xmm2
	punpckldq	xmm2,xmm3
	punpckhdq	xmm6,xmm1
	punpckhdq	xmm7,xmm3
	movdqa	xmm1,xmm0
	punpcklqdq	xmm0,xmm2
	movdqa	xmm3,xmm6
	punpcklqdq	xmm6,xmm7
	punpckhqdq	xmm1,xmm2
	punpckhqdq	xmm3,xmm7
	movdqu	xmm4,[esi-128]
	movdqu	xmm5,[esi-64]
	movdqu	xmm2,[esi]
	movdqu	xmm7,[64+esi]
	lea	esi,[16+esi]
	pxor	xmm4,xmm0
	movdqa	xmm0,[ebx-64]
	pxor	xmm5,xmm1
	movdqa	xmm1,[ebx-48]
	pxor	xmm6,xmm2
	movdqa	xmm2,[ebx-32]
	pxor	xmm7,xmm3
	movdqa	xmm3,[ebx-16]
	movdqu	[edi-128],xmm4
	movdqu	[edi-64],xmm5
	movdqu	[edi],xmm6
	movdqu	[64+edi],xmm7
	lea	edi,[16+edi]
	; Words x4..x7:
	paddd	xmm0,[ebp-64]
	paddd	xmm1,[ebp-48]
	paddd	xmm2,[ebp-32]
	paddd	xmm3,[ebp-16]
	movdqa	xmm6,xmm0
	punpckldq	xmm0,xmm1
	movdqa	xmm7,xmm2
	punpckldq	xmm2,xmm3
	punpckhdq	xmm6,xmm1
	punpckhdq	xmm7,xmm3
	movdqa	xmm1,xmm0
	punpcklqdq	xmm0,xmm2
	movdqa	xmm3,xmm6
	punpcklqdq	xmm6,xmm7
	punpckhqdq	xmm1,xmm2
	punpckhqdq	xmm3,xmm7
	movdqu	xmm4,[esi-128]
	movdqu	xmm5,[esi-64]
	movdqu	xmm2,[esi]
	movdqu	xmm7,[64+esi]
	lea	esi,[16+esi]
	pxor	xmm4,xmm0
	movdqa	xmm0,[ebx]
	pxor	xmm5,xmm1
	movdqa	xmm1,[16+ebx]
	pxor	xmm6,xmm2
	movdqa	xmm2,[32+ebx]
	pxor	xmm7,xmm3
	movdqa	xmm3,[48+ebx]
	movdqu	[edi-128],xmm4
	movdqu	[edi-64],xmm5
	movdqu	[edi],xmm6
	movdqu	[64+edi],xmm7
	lea	edi,[16+edi]
	; Words x8..x11:
	paddd	xmm0,[ebp]
	paddd	xmm1,[16+ebp]
	paddd	xmm2,[32+ebp]
	paddd	xmm3,[48+ebp]
	movdqa	xmm6,xmm0
	punpckldq	xmm0,xmm1
	movdqa	xmm7,xmm2
	punpckldq	xmm2,xmm3
	punpckhdq	xmm6,xmm1
	punpckhdq	xmm7,xmm3
	movdqa	xmm1,xmm0
	punpcklqdq	xmm0,xmm2
	movdqa	xmm3,xmm6
	punpcklqdq	xmm6,xmm7
	punpckhqdq	xmm1,xmm2
	punpckhqdq	xmm3,xmm7
	movdqu	xmm4,[esi-128]
	movdqu	xmm5,[esi-64]
	movdqu	xmm2,[esi]
	movdqu	xmm7,[64+esi]
	lea	esi,[16+esi]
	pxor	xmm4,xmm0
	movdqa	xmm0,[64+ebx]
	pxor	xmm5,xmm1
	movdqa	xmm1,[80+ebx]
	pxor	xmm6,xmm2
	movdqa	xmm2,[96+ebx]
	pxor	xmm7,xmm3
	movdqa	xmm3,[112+ebx]
	movdqu	[edi-128],xmm4
	movdqu	[edi-64],xmm5
	movdqu	[edi],xmm6
	movdqu	[64+edi],xmm7
	lea	edi,[16+edi]
	; Words x12..x15:
	paddd	xmm0,[64+ebp]
	paddd	xmm1,[80+ebp]
	paddd	xmm2,[96+ebp]
	paddd	xmm3,[112+ebp]
	movdqa	xmm6,xmm0
	punpckldq	xmm0,xmm1
	movdqa	xmm7,xmm2
	punpckldq	xmm2,xmm3
	punpckhdq	xmm6,xmm1
	punpckhdq	xmm7,xmm3
	movdqa	xmm1,xmm0
	punpcklqdq	xmm0,xmm2
	movdqa	xmm3,xmm6
	punpcklqdq	xmm6,xmm7
	punpckhqdq	xmm1,xmm2
	punpckhqdq	xmm3,xmm7
	movdqu	xmm4,[esi-128]
	movdqu	xmm5,[esi-64]
	movdqu	xmm2,[esi]
	movdqu	xmm7,[64+esi]
	lea	esi,[208+esi]			; 3*16 + 208 = 256 bytes per batch
	pxor	xmm4,xmm0
	pxor	xmm5,xmm1
	pxor	xmm6,xmm2
	pxor	xmm7,xmm3
	movdqu	[edi-128],xmm4
	movdqu	[edi-64],xmm5
	movdqu	[edi],xmm6
	movdqu	[64+edi],xmm7
	lea	edi,[208+edi]
	sub	ecx,256
	jnc	NEAR L$009outer_loop		; another full 256-byte batch remains
	add	ecx,256				; undo the bias: ecx = leftover bytes
	jz	NEAR L$011done
	; Fewer than 256 bytes left: fall into the one-block code. Undo the
	; +128 pointer bias and rebuild the counter block in xmm3 as
	; (lane-0 counter of the last batch + 4, original dwords 1..3).
	mov	ebx,DWORD [520+esp]
	lea	esi,[esi-128]
	mov	edx,DWORD [516+esp]
	lea	edi,[edi-128]
	movd	xmm2,DWORD [64+ebp]
	movdqu	xmm3,[ebx]
	paddd	xmm2,[96+eax]			; + {4,0,0,0}
	pand	xmm3,[112+eax]			; & {0,-1,-1,-1}: drop the old counter dword
	por	xmm3,xmm2
L$0081x:
	; One block at a time. xmm0..xmm3 hold the four state rows; xmm6/xmm7
	; hold the rotate-by-16 / rotate-by-8 pshufb masks. Entered with
	; edx = key pointer and xmm3 = counter block.
	movdqa	xmm0,[32+eax]
	movdqu	xmm1,[edx]
	movdqu	xmm2,[16+edx]
	movdqa	xmm6,[eax]
	movdqa	xmm7,[16+eax]
	mov	DWORD [48+esp],ebp		; overwritten by the movdqa to [48+esp] below
	movdqa	[esp],xmm0
	movdqa	[16+esp],xmm1
	movdqa	[32+esp],xmm2
	movdqa	[48+esp],xmm3
	mov	edx,10
	jmp	NEAR L$012loop1x
align	16
L$013outer1x:
	; Next block: reload the input rows and bump the counter by one.
	movdqa	xmm3,[80+eax]			; {1,0,0,0}
	movdqa	xmm0,[esp]
	movdqa	xmm1,[16+esp]
	movdqa	xmm2,[32+esp]
	paddd	xmm3,[48+esp]
	mov	edx,10
	movdqa	[48+esp],xmm3
	jmp	NEAR L$012loop1x
align	16
L$012loop1x:
	; Column round on the four rows.
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
db	102,15,56,0,222				; pshufb xmm3,xmm6
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,20
	pslld	xmm4,12
	por	xmm1,xmm4
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
db	102,15,56,0,223				; pshufb xmm3,xmm7
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,25
	pslld	xmm4,7
	por	xmm1,xmm4
	; Rotate rows 1..3 so the diagonals line up as columns.
	pshufd	xmm2,xmm2,78
	pshufd	xmm1,xmm1,57
	pshufd	xmm3,xmm3,147
	nop
	; Diagonal round (same operations on the rotated rows).
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
db	102,15,56,0,222				; pshufb xmm3,xmm6
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,20
	pslld	xmm4,12
	por	xmm1,xmm4
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
db	102,15,56,0,223				; pshufb xmm3,xmm7
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,25
	pslld	xmm4,7
	por	xmm1,xmm4
	; Rotate the rows back.
	pshufd	xmm2,xmm2,78
	pshufd	xmm1,xmm1,147
	pshufd	xmm3,xmm3,57
	dec	edx
	jnz	NEAR L$012loop1x
	; Add the input state to obtain the keystream block.
	paddd	xmm0,[esp]
	paddd	xmm1,[16+esp]
	paddd	xmm2,[32+esp]
	paddd	xmm3,[48+esp]
	cmp	ecx,64
	jb	NEAR L$014tail
	movdqu	xmm4,[esi]
	movdqu	xmm5,[16+esi]
	pxor	xmm0,xmm4
	movdqu	xmm4,[32+esi]
	pxor	xmm1,xmm5
	movdqu	xmm5,[48+esi]
	pxor	xmm2,xmm4
	pxor	xmm3,xmm5
	lea	esi,[64+esi]
	movdqu	[edi],xmm0
	movdqu	[16+edi],xmm1
	movdqu	[32+edi],xmm2
	movdqu	[48+edi],xmm3
	lea	edi,[64+edi]
	sub	ecx,64
	jnz	NEAR L$013outer1x
	jmp	NEAR L$011done
L$014tail:
	; Final partial block: spill the keystream to [0..63+esp] and XOR
	; byte by byte. ecx = bytes remaining (1..63), ebp = byte index.
	movdqa	[esp],xmm0
	movdqa	[16+esp],xmm1
	movdqa	[32+esp],xmm2
	movdqa	[48+esp],xmm3
	xor	eax,eax
	xor	edx,edx
	xor	ebp,ebp
L$015tail_loop:
	mov	al,BYTE [ebp*1+esp]		; keystream byte
	mov	dl,BYTE [ebp*1+esi]		; source byte
	lea	ebp,[1+ebp]
	xor	al,dl
	mov	BYTE [ebp*1+edi-1],al		; ebp was already advanced, hence -1
	dec	ecx
	jnz	NEAR L$015tail_loop
L$011done:
	mov	esp,DWORD [512+esp]		; restore the caller's esp
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
; Constant table used by the SSSE3 path; addressed through eax, which that
; code sets to L$ssse3_data. Offsets:
;     0  pshufb mask rotating every dword left by 16 bits
;    16  pshufb mask rotating every dword left by 8 bits
;    32  the four ChaCha constant words
;    48  {0,1,2,3}     per-lane counter offsets
;    64  {4,4,4,4}     counter step for a four-block batch
;    80  {1,0,0,0}     counter step for the one-block loop
;    96  {4,0,0,0}     counter fix-up when leaving the four-block loop
;   112  {0,-1,-1,-1}  mask that clears dword 0 of the counter block
align	64
L$ssse3_data:
db	2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
db	3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
dd	1634760805,857760878,2036477234,1797285236
dd	0,1,2,3
dd	4,4,4,4
dd	1,0,0,0
dd	4,0,0,0
dd	0,-1,-1,-1
align	64
; ASCII, NUL-terminated: "ChaCha20 for x86, CRYPTOGAMS by <appro@openssl.org>"
db	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
db	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
db	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
db	114,103,62,0
; _GFp_ia32cap_P is declared as a 16-byte common symbol. This file only
; reads its first two dwords, to choose between the scalar and SSSE3 paths.
segment	.bss
common	_GFp_ia32cap_P 16