1; This file is generated from a similarly-named Perl script in the BoringSSL
2; source tree. Do not edit by hand.
3
; Optional symbol prefixing: the include is only pulled in when the build
; defines BORINGSSL_PREFIX.
4%ifdef BORINGSSL_PREFIX
5%include "boringssl_prefix_symbols_nasm.inc"
6%endif
; Pick the code-section declaration that matches the assembler output format.
7%ifidn __OUTPUT_FORMAT__,obj
8section	code	use32 class=code align=64
9%elifidn __OUTPUT_FORMAT__,win32
10%ifdef __YASM_VERSION_ID__
11%if __YASM_VERSION_ID__ < 01010000h
12%error yasm version 1.1.0 or later needed.
13%endif
14; Yasm automatically includes .00 and complains about redefining it.
15; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html
16%else
; Not Yasm: define the @feat.00 symbol explicitly (Yasm supplies it itself,
; per the note above).
17$@feat.00 equ 1
18%endif
19section	.text	code align=64
20%else
21section	.text	code
22%endif
; The extern below is commented out; _GFp_ia32cap_P is declared as a common
; symbol at the end of this file instead.
23;extern	_GFp_ia32cap_P
24align	64
; _GFp_poly1305_init_asm
; Arguments are read from the stack after four register pushes:
;   [20+esp] -> edi : context block
;   [24+esp] -> esi : key pointer (may be null)
;   [28+esp] -> ebp : two-dword table that receives function addresses
; Zeroes the first 24 bytes of the context. With a non-null key it also stores
; the addresses of __poly1305_blocks_sse2 / __poly1305_emit_sse2 into the table
; and writes the masked 16-byte key at context+24..39.
; Returns eax = 0 when the key pointer is null, 1 otherwise.
; Saves and restores ebp, ebx, esi, edi.
25global	_GFp_poly1305_init_asm
26align	16
27_GFp_poly1305_init_asm:
28L$_GFp_poly1305_init_asm_begin:
29	push	ebp
30	push	ebx
31	push	esi
32	push	edi
33	mov	edi,DWORD [20+esp]	; edi = context
34	mov	esi,DWORD [24+esp]	; esi = key pointer
35	mov	ebp,DWORD [28+esp]	; ebp = function table
36	xor	eax,eax
; Clear context dwords 0..5 (accumulator words and the dword at +20 that the
; SSE2 routines test as a representation flag).
37	mov	DWORD [edi],eax
38	mov	DWORD [4+edi],eax
39	mov	DWORD [8+edi],eax
40	mov	DWORD [12+edi],eax
41	mov	DWORD [16+edi],eax
42	mov	DWORD [20+edi],eax
43	cmp	esi,0
44	je	NEAR L$000nokey	; null key: return with eax still 0
; call/pop yields the run-time address of L$001pic_point in ebx so that the
; code addresses below can be formed position-independently.
45	call	L$001pic_point
46L$001pic_point:
47	pop	ebx
; These two addresses are overwritten unconditionally a few lines below.
48	lea	eax,[(_GFp_poly1305_blocks-L$001pic_point)+ebx]
49	lea	edx,[(_GFp_poly1305_emit-L$001pic_point)+ebx]
50	lea	edi,[_GFp_ia32cap_P]
51	mov	ecx,DWORD [edi]
52	and	ecx,83886080	; 0x05000000: bits 24 and 26 of the first capability dword
; NOTE(review): no conditional branch consumes the result of this compare;
; the SSE2 entry points are selected regardless of the capability bits.
; Presumably the non-SSE2 fallback was removed in the generator -- confirm.
53	cmp	ecx,83886080
54	lea	eax,[(__poly1305_blocks_sse2-L$001pic_point)+ebx]
55	lea	edx,[(__poly1305_emit_sse2-L$001pic_point)+ebx]
56	mov	edi,DWORD [20+esp]	; reload context (edi was used for the capability lookup)
57	mov	DWORD [ebp],eax	; table[0] = blocks routine
58	mov	DWORD [4+ebp],edx	; table[1] = emit routine
; Load the first four key dwords and mask them:
; 268435455 = 0x0FFFFFFF, 268435452 = 0x0FFFFFFC (matches the Poly1305 "r" clamp).
59	mov	eax,DWORD [esi]
60	mov	ebx,DWORD [4+esi]
61	mov	ecx,DWORD [8+esi]
62	mov	edx,DWORD [12+esi]
63	and	eax,268435455
64	and	ebx,268435452
65	and	ecx,268435452
66	and	edx,268435452
67	mov	DWORD [24+edi],eax
68	mov	DWORD [28+edi],ebx
69	mov	DWORD [32+edi],ecx
70	mov	DWORD [36+edi],edx
71	mov	eax,1	; return value on the keyed path
72L$000nokey:
73	pop	edi
74	pop	esi
75	pop	ebx
76	pop	ebp
77	ret
; _GFp_poly1305_blocks -- integer-only block loop.
; Stack arguments (after four register pushes):
;   [20+esp] -> edi : context (accumulator words at +0..+16, key words at +24..+36)
;   [24+esp] -> esi : input pointer
;   [28+esp] -> ecx : input length in bytes
;   [32+esp]        : dword added into the top accumulator word for every block
;                     (read as [96+esp] inside the loop, after `sub esp,64`)
; L$enter_blocks is also a jump target from __poly1305_blocks_sse2, which arrives
; with edi/esi/ecx already loaded and the same four registers pushed.
; Saves and restores ebp, ebx, esi, edi; uses a 64-byte scratch frame.
78global	_GFp_poly1305_blocks
79align	16
80_GFp_poly1305_blocks:
81L$_GFp_poly1305_blocks_begin:
82	push	ebp
83	push	ebx
84	push	esi
85	push	edi
86	mov	edi,DWORD [20+esp]
87	mov	esi,DWORD [24+esp]
88	mov	ecx,DWORD [28+esp]
89L$enter_blocks:
; Round the length down to a whole number of 16-byte blocks.
; FIX: the mask was -15, which keeps bit 0 of the length. An odd length would
; pass the zero check and produce an end pointer that the 16-byte stride never
; hits, so the `jne` loop exit below would not fire and the loop would run past
; the input. -16 matches the mask used by __poly1305_blocks_sse2.
; NOTE(review): this file is generated from a Perl script; mirror the change
; there so it is not lost on regeneration.
90	and	ecx,-16
91	jz	NEAR L$002nodata
92	sub	esp,64
93	mov	eax,DWORD [24+edi]	; r0
94	mov	ebx,DWORD [28+edi]	; r1
95	lea	ebp,[ecx*1+esi]	; ebp = end of input
96	mov	ecx,DWORD [32+edi]	; r2
97	mov	edx,DWORD [36+edi]	; r3
98	mov	DWORD [92+esp],ebp	; end pointer replaces the length argument slot (64+28)
99	mov	ebp,esi	; ebp = running input pointer
; Frame layout: [36..48+esp] = r0..r3, [52..60+esp] = r1..r3 each plus itself >> 2.
100	mov	DWORD [36+esp],eax
101	mov	eax,ebx
102	shr	eax,2
103	mov	DWORD [40+esp],ebx
104	add	eax,ebx
105	mov	ebx,ecx
106	shr	ebx,2
107	mov	DWORD [44+esp],ecx
108	add	ebx,ecx
109	mov	ecx,edx
110	shr	ecx,2
111	mov	DWORD [48+esp],edx
112	add	ecx,edx
113	mov	DWORD [52+esp],eax
114	mov	DWORD [56+esp],ebx
115	mov	DWORD [60+esp],ecx
; Load the five accumulator words: eax, ebx, ecx, esi, edi.
116	mov	eax,DWORD [edi]
117	mov	ebx,DWORD [4+edi]
118	mov	ecx,DWORD [8+edi]
119	mov	esi,DWORD [12+edi]
120	mov	edi,DWORD [16+edi]
121	jmp	NEAR L$003loop
122align	32
123L$003loop:
; Add the next 16 input bytes (and the fourth stack argument into the top word)
; to the accumulator with a single carry chain; lea advances ebp without
; disturbing the carry flag.
124	add	eax,DWORD [ebp]
125	adc	ebx,DWORD [4+ebp]
126	adc	ecx,DWORD [8+ebp]
127	adc	esi,DWORD [12+ebp]
128	lea	ebp,[16+ebp]
129	adc	edi,DWORD [96+esp]
; Spill words 0, 3 and 4 to [esp], [12+esp], [16+esp]; words 1 and 2 stay in ebx/ecx.
130	mov	DWORD [esp],eax
131	mov	DWORD [12+esp],esi
; Column 0: word0*[36] + word1*[60] + word2*[56] + word3*[52] -> esi:edi,
; low dword saved at [20+esp].
132	mul	DWORD [36+esp]
133	mov	DWORD [16+esp],edi
134	mov	edi,eax
135	mov	eax,ebx
136	mov	esi,edx
137	mul	DWORD [60+esp]
138	add	edi,eax
139	mov	eax,ecx
140	adc	esi,edx
141	mul	DWORD [56+esp]
142	add	edi,eax
143	mov	eax,DWORD [12+esp]
144	adc	esi,edx
145	mul	DWORD [52+esp]
146	add	edi,eax
147	mov	eax,DWORD [esp]
148	adc	esi,edx
; Column 1: carry-in + word0*[40] + word1*[36] + word2*[60] + word3*[56]
; + low32(word4*[52]) -> edi:esi, low dword saved at [24+esp].
149	mul	DWORD [40+esp]
150	mov	DWORD [20+esp],edi
151	xor	edi,edi
152	add	esi,eax
153	mov	eax,ebx
154	adc	edi,edx
155	mul	DWORD [36+esp]
156	add	esi,eax
157	mov	eax,ecx
158	adc	edi,edx
159	mul	DWORD [60+esp]
160	add	esi,eax
161	mov	eax,DWORD [12+esp]
162	adc	edi,edx
163	mul	DWORD [56+esp]
164	add	esi,eax
165	mov	eax,DWORD [16+esp]
166	adc	edi,edx
167	imul	eax,DWORD [52+esp]
168	add	esi,eax
169	mov	eax,DWORD [esp]
170	adc	edi,0
; Column 2: carry-in + word0*[44] + word1*[40] + word2*[36] + word3*[60]
; + low32(word4*[56]) -> esi:edi, low dword saved at [28+esp].
171	mul	DWORD [44+esp]
172	mov	DWORD [24+esp],esi
173	xor	esi,esi
174	add	edi,eax
175	mov	eax,ebx
176	adc	esi,edx
177	mul	DWORD [40+esp]
178	add	edi,eax
179	mov	eax,ecx
180	adc	esi,edx
181	mul	DWORD [36+esp]
182	add	edi,eax
183	mov	eax,DWORD [12+esp]
184	adc	esi,edx
185	mul	DWORD [60+esp]
186	add	edi,eax
187	mov	eax,DWORD [16+esp]
188	adc	esi,edx
189	imul	eax,DWORD [56+esp]
190	add	edi,eax
191	mov	eax,DWORD [esp]
192	adc	esi,0
; Column 3: carry-in + word0*[48] + word1*[44] + word2*[40] + word3*[36]
; + low32(word4*[60]) -> edi:esi; the low dword stays in esi.
193	mul	DWORD [48+esp]
194	mov	DWORD [28+esp],edi
195	xor	edi,edi
196	add	esi,eax
197	mov	eax,ebx
198	adc	edi,edx
199	mul	DWORD [44+esp]
200	add	esi,eax
201	mov	eax,ecx
202	adc	edi,edx
203	mul	DWORD [40+esp]
204	add	esi,eax
205	mov	eax,DWORD [12+esp]
206	adc	edi,edx
207	mul	DWORD [36+esp]
208	add	esi,eax
209	mov	ecx,DWORD [16+esp]
210	adc	edi,edx
211	mov	edx,ecx
212	imul	ecx,DWORD [60+esp]
213	add	esi,ecx
214	mov	eax,DWORD [20+esp]
215	adc	edi,0
; Top word: edx = low32(word4*[36]) + carry-in.
216	imul	edx,DWORD [36+esp]
217	add	edx,edi
218	mov	ebx,DWORD [24+esp]
219	mov	ecx,DWORD [28+esp]
; Fold the top word: keep its low two bits in edi, and add (top >> 2) * 5
; back into the bottom of the accumulator with carry propagation.
220	mov	edi,edx
221	shr	edx,2
222	and	edi,3
223	lea	edx,[edx*4+edx]
224	add	eax,edx
225	adc	ebx,0
226	adc	ecx,0
227	adc	esi,0
228	adc	edi,0
229	cmp	ebp,DWORD [92+esp]	; reached the end pointer?
230	jne	NEAR L$003loop
231	mov	edx,DWORD [84+esp]	; context argument (64+20)
232	add	esp,64
; Write the five accumulator words back to the context.
233	mov	DWORD [edx],eax
234	mov	DWORD [4+edx],ebx
235	mov	DWORD [8+edx],ecx
236	mov	DWORD [12+edx],esi
237	mov	DWORD [16+edx],edi
238L$002nodata:
239	pop	edi
240	pop	esi
241	pop	ebx
242	pop	ebp
243	ret
; _GFp_poly1305_emit -- produce the 16-byte output from the accumulator.
; Stack arguments (after four register pushes):
;   [20+esp] -> ebp : context (five accumulator words at +0..+16)
;   [24+esp] -> edi : 16-byte output buffer
;   [28+esp]        : pointer to four dwords added to the result before storing
; L$enter_emit is also a jump target from __poly1305_emit_sse2, which arrives
; with ebp = context and the same four registers pushed.
; Saves and restores ebp, ebx, esi, edi.
244global	_GFp_poly1305_emit
245align	16
246_GFp_poly1305_emit:
247L$_GFp_poly1305_emit_begin:
248	push	ebp
249	push	ebx
250	push	esi
251	push	edi
252	mov	ebp,DWORD [20+esp]
253L$enter_emit:
254	mov	edi,DWORD [24+esp]
255	mov	eax,DWORD [ebp]
256	mov	ebx,DWORD [4+ebp]
257	mov	ecx,DWORD [8+ebp]
258	mov	edx,DWORD [12+ebp]
259	mov	esi,DWORD [16+ebp]
; Compute accumulator + 5 across all five words.
260	add	eax,5
261	adc	ebx,0
262	adc	ecx,0
263	adc	edx,0
264	adc	esi,0
; esi = -(top >> 2): zero when the sum did not reach bit 2 of the top word,
; all-ones when the shifted value is 1. It is used as a select mask below.
265	shr	esi,2
266	neg	esi
267	and	eax,esi
268	and	ebx,esi
269	and	ecx,esi
270	and	edx,esi
; Park (accumulator + 5) & mask in the output buffer.
271	mov	DWORD [edi],eax
272	mov	DWORD [4+edi],ebx
273	mov	DWORD [8+edi],ecx
274	mov	DWORD [12+edi],edx
275	not	esi
; Reload the original accumulator and keep it where the mask was zero;
; OR-ing with the parked value completes a branch-free select.
276	mov	eax,DWORD [ebp]
277	mov	ebx,DWORD [4+ebp]
278	mov	ecx,DWORD [8+ebp]
279	mov	edx,DWORD [12+ebp]
280	mov	ebp,DWORD [28+esp]	; ebp = pointer to the four addend dwords
281	and	eax,esi
282	and	ebx,esi
283	and	ecx,esi
284	and	edx,esi
285	or	eax,DWORD [edi]
286	or	ebx,DWORD [4+edi]
287	or	ecx,DWORD [8+edi]
288	or	edx,DWORD [12+edi]
; Add the four addend dwords with carry (the final carry-out is discarded)
; and store the 16-byte result.
289	add	eax,DWORD [ebp]
290	adc	ebx,DWORD [4+ebp]
291	adc	ecx,DWORD [8+ebp]
292	adc	edx,DWORD [12+ebp]
293	mov	DWORD [edi],eax
294	mov	DWORD [4+edi],ebx
295	mov	DWORD [8+edi],ecx
296	mov	DWORD [12+edi],edx
297	pop	edi
298	pop	esi
299	pop	ebx
300	pop	ebp
301	ret
; __poly1305_init_sse2 -- internal helper, called from __poly1305_blocks_sse2.
; Register inputs (no stack arguments are read):
;   edi = context; the 16 bytes at context+24 are the source value
;   ebx = address of L$const_sse2 ([64+ebx] is the 0x3FFFFFF limb mask)
; Splits the 16-byte value at context+24 into five 26-bit limbs, runs the
; multiply/reduce body at L$004square twice, and stores the resulting table at
; context+48..+176 (five limb vectors followed by four "limb * 5" vectors).
; NOTE(review): the label name suggests each pass squares the current value,
; i.e. the table holds successive powers of the key -- confirm against the
; generator script.
; On return esp and edi are restored; ebp, ecx, edx and xmm0-xmm7 are
; overwritten (xmm7 is left holding the limb mask).
302align	32
303align	16
304__poly1305_init_sse2:
305	movdqu	xmm4,[24+edi]
306	lea	edi,[48+edi]	; edi -> table area (undone before ret)
; Build a 16-byte aligned scratch frame; ebp remembers the caller's esp.
307	mov	ebp,esp
308	sub	esp,224
309	and	esp,-16
310	movq	xmm7,[64+ebx]	; limb mask 0x3FFFFFF (low qword only)
; Split into 26-bit limbs: xmm0..xmm3 via shifts + mask, xmm4 = bytes 13 and up.
311	movdqa	xmm0,xmm4
312	movdqa	xmm1,xmm4
313	movdqa	xmm2,xmm4
314	pand	xmm0,xmm7
315	psrlq	xmm1,26
316	psrldq	xmm2,6
317	pand	xmm1,xmm7
318	movdqa	xmm3,xmm2
319	psrlq	xmm2,4
320	psrlq	xmm3,30
321	pand	xmm2,xmm7
322	pand	xmm3,xmm7
323	psrldq	xmm4,13
324	lea	edx,[144+esp]	; edx -> broadcast copies of the limbs
325	mov	ecx,2	; pass counter
326L$004square:
; Save the current limbs at [esp+0..64] ...
327	movdqa	[esp],xmm0
328	movdqa	[16+esp],xmm1
329	movdqa	[32+esp],xmm2
330	movdqa	[48+esp],xmm3
331	movdqa	[64+esp],xmm4
; ... and limbs 1..4 multiplied by 5 (x*4 + x) at [esp+80..128].
332	movdqa	xmm6,xmm1
333	movdqa	xmm5,xmm2
334	pslld	xmm6,2
335	pslld	xmm5,2
336	paddd	xmm6,xmm1
337	paddd	xmm5,xmm2
338	movdqa	[80+esp],xmm6
339	movdqa	[96+esp],xmm5
340	movdqa	xmm6,xmm3
341	movdqa	xmm5,xmm4
342	pslld	xmm6,2
343	pslld	xmm5,2
344	paddd	xmm6,xmm3
345	paddd	xmm5,xmm4
346	movdqa	[112+esp],xmm6
347	movdqa	[128+esp],xmm5
; Duplicate the low qword of each limb into both halves (pshufd 68 = 0x44)
; and park the copies at [edx+0..64] as the second multiplicand.
348	pshufd	xmm6,xmm0,68
349	movdqa	xmm5,xmm1
350	pshufd	xmm1,xmm1,68
351	pshufd	xmm2,xmm2,68
352	pshufd	xmm3,xmm3,68
353	pshufd	xmm4,xmm4,68
354	movdqa	[edx],xmm6
355	movdqa	[16+edx],xmm1
356	movdqa	[32+edx],xmm2
357	movdqa	[48+edx],xmm3
358	movdqa	[64+edx],xmm4
; 5x5 limb multiplication; partial products accumulate into xmm0..xmm4.
; Terms taken from [esp+80..128] use the pre-multiplied-by-5 limbs.
359	pmuludq	xmm4,xmm0
360	pmuludq	xmm3,xmm0
361	pmuludq	xmm2,xmm0
362	pmuludq	xmm1,xmm0
363	pmuludq	xmm0,xmm6
364	movdqa	xmm6,xmm5
365	pmuludq	xmm5,[48+edx]
366	movdqa	xmm7,xmm6
367	pmuludq	xmm6,[32+edx]
368	paddq	xmm4,xmm5
369	movdqa	xmm5,xmm7
370	pmuludq	xmm7,[16+edx]
371	paddq	xmm3,xmm6
372	movdqa	xmm6,[80+esp]
373	pmuludq	xmm5,[edx]
374	paddq	xmm2,xmm7
375	pmuludq	xmm6,[64+edx]
376	movdqa	xmm7,[32+esp]
377	paddq	xmm1,xmm5
378	movdqa	xmm5,xmm7
379	pmuludq	xmm7,[32+edx]
380	paddq	xmm0,xmm6
381	movdqa	xmm6,xmm5
382	pmuludq	xmm5,[16+edx]
383	paddq	xmm4,xmm7
384	movdqa	xmm7,[96+esp]
385	pmuludq	xmm6,[edx]
386	paddq	xmm3,xmm5
387	movdqa	xmm5,xmm7
388	pmuludq	xmm7,[64+edx]
389	paddq	xmm2,xmm6
390	pmuludq	xmm5,[48+edx]
391	movdqa	xmm6,[48+esp]
392	paddq	xmm1,xmm7
393	movdqa	xmm7,xmm6
394	pmuludq	xmm6,[16+edx]
395	paddq	xmm0,xmm5
396	movdqa	xmm5,[112+esp]
397	pmuludq	xmm7,[edx]
398	paddq	xmm4,xmm6
399	movdqa	xmm6,xmm5
400	pmuludq	xmm5,[64+edx]
401	paddq	xmm3,xmm7
402	movdqa	xmm7,xmm6
403	pmuludq	xmm6,[48+edx]
404	paddq	xmm2,xmm5
405	pmuludq	xmm7,[32+edx]
406	movdqa	xmm5,[64+esp]
407	paddq	xmm1,xmm6
408	movdqa	xmm6,[128+esp]
409	pmuludq	xmm5,[edx]
410	paddq	xmm0,xmm7
411	movdqa	xmm7,xmm6
412	pmuludq	xmm6,[64+edx]
413	paddq	xmm4,xmm5
414	movdqa	xmm5,xmm7
415	pmuludq	xmm7,[16+edx]
416	paddq	xmm3,xmm6
417	movdqa	xmm6,xmm5
418	pmuludq	xmm5,[32+edx]
419	paddq	xmm0,xmm7
420	pmuludq	xmm6,[48+edx]
421	movdqa	xmm7,[64+ebx]	; reload the full-width limb mask
422	paddq	xmm1,xmm5
423	paddq	xmm2,xmm6
; Carry propagation back to 26-bit limbs. The carry out of limb 4 is added to
; limb 0 once directly and once shifted left by 2, i.e. multiplied by 5.
424	movdqa	xmm5,xmm3
425	pand	xmm3,xmm7
426	psrlq	xmm5,26
427	paddq	xmm5,xmm4
428	movdqa	xmm6,xmm0
429	pand	xmm0,xmm7
430	psrlq	xmm6,26
431	movdqa	xmm4,xmm5
432	paddq	xmm6,xmm1
433	psrlq	xmm5,26
434	pand	xmm4,xmm7
435	movdqa	xmm1,xmm6
436	psrlq	xmm6,26
437	paddd	xmm0,xmm5
438	psllq	xmm5,2
439	paddq	xmm6,xmm2
440	paddq	xmm5,xmm0
441	pand	xmm1,xmm7
442	movdqa	xmm2,xmm6
443	psrlq	xmm6,26
444	pand	xmm2,xmm7
445	paddd	xmm6,xmm3
446	movdqa	xmm0,xmm5
447	psrlq	xmm5,26
448	movdqa	xmm3,xmm6
449	psrlq	xmm6,26
450	pand	xmm0,xmm7
451	paddd	xmm1,xmm5
452	pand	xmm3,xmm7
453	paddd	xmm4,xmm6
454	dec	ecx
455	jz	NEAR L$005square_break	; second pass finished
; After the first pass: pair the new limbs (low qword) with the saved input
; limbs (high qword) and run the body once more.
456	punpcklqdq	xmm0,[esp]
457	punpcklqdq	xmm1,[16+esp]
458	punpcklqdq	xmm2,[32+esp]
459	punpcklqdq	xmm3,[48+esp]
460	punpcklqdq	xmm4,[64+esp]
461	jmp	NEAR L$004square
462L$005square_break:
; Interleave the second-pass results (moved to the upper dword of each qword)
; with the values saved at the start of that pass, then reorder the dwords
; (pshufd 141 = 0x8D) into the table layout.
463	psllq	xmm0,32
464	psllq	xmm1,32
465	psllq	xmm2,32
466	psllq	xmm3,32
467	psllq	xmm4,32
468	por	xmm0,[esp]
469	por	xmm1,[16+esp]
470	por	xmm2,[32+esp]
471	por	xmm3,[48+esp]
472	por	xmm4,[64+esp]
473	pshufd	xmm0,xmm0,141
474	pshufd	xmm1,xmm1,141
475	pshufd	xmm2,xmm2,141
476	pshufd	xmm3,xmm3,141
477	pshufd	xmm4,xmm4,141
; Table entries 0..4: the five limb vectors.
478	movdqu	[edi],xmm0
479	movdqu	[16+edi],xmm1
480	movdqu	[32+edi],xmm2
481	movdqu	[48+edi],xmm3
482	movdqu	[64+edi],xmm4
; Table entries 5..8: limbs 1..4 multiplied by 5.
483	movdqa	xmm6,xmm1
484	movdqa	xmm5,xmm2
485	pslld	xmm6,2
486	pslld	xmm5,2
487	paddd	xmm6,xmm1
488	paddd	xmm5,xmm2
489	movdqu	[80+edi],xmm6
490	movdqu	[96+edi],xmm5
491	movdqa	xmm6,xmm3
492	movdqa	xmm5,xmm4
493	pslld	xmm6,2
494	pslld	xmm5,2
495	paddd	xmm6,xmm3
496	paddd	xmm5,xmm4
497	movdqu	[112+edi],xmm6
498	movdqu	[128+edi],xmm5
499	mov	esp,ebp	; drop the scratch frame
500	lea	edi,[edi-48]	; edi = context again
501	ret
; __poly1305_blocks_sse2 -- SSE2 block routine (its address is stored into the
; function table by _GFp_poly1305_init_asm).
; Stack arguments (after four register pushes):
;   [20+esp] -> edi : context
;   [24+esp] -> esi : input pointer
;   [28+esp] -> ecx : input length in bytes (rounded down to a multiple of 16)
;   [32+esp]        : dword that is shifted left by 24 and added into limb 4
; Context layout used here: +0..+16 accumulator, +20 representation flag
; (0 = 32-bit words, non-zero = five 26-bit limbs), +48.. table written by
; __poly1305_init_sse2.
; Dispatch: nothing to do for a zero length; inputs shorter than 64 bytes with
; flag == 0 are handed to the integer loop at L$enter_blocks; everything else
; runs the SSE2 path, converting the accumulator to 26-bit limbs on first use.
; Saves and restores ebp, ebx, esi, edi; overwrites xmm0-xmm7.
502align	32
503align	16
504__poly1305_blocks_sse2:
505	push	ebp
506	push	ebx
507	push	esi
508	push	edi
509	mov	edi,DWORD [20+esp]
510	mov	esi,DWORD [24+esp]
511	mov	ecx,DWORD [28+esp]
512	mov	eax,DWORD [20+edi]	; eax = representation flag
513	and	ecx,-16	; whole 16-byte blocks only
514	jz	NEAR L$006nodata
515	cmp	ecx,64
516	jae	NEAR L$007enter_sse2
517	test	eax,eax
518	jz	NEAR L$enter_blocks	; short input, still in 32-bit form: integer loop
519align	16
520L$007enter_sse2:
; call/pop to form the run-time address of L$const_sse2 in ebx.
521	call	L$008pic_point
522L$008pic_point:
523	pop	ebx
524	lea	ebx,[(L$const_sse2-L$008pic_point)+ebx]
525	test	eax,eax
526	jnz	NEAR L$009base2_26
; First SSE2 use: build the table, then convert the accumulator from 32-bit
; words to five 26-bit limbs using overlapping loads at byte offsets 0/3/6/9/13.
527	call	__poly1305_init_sse2
528	mov	eax,DWORD [edi]
529	mov	ecx,DWORD [3+edi]
530	mov	edx,DWORD [6+edi]
531	mov	esi,DWORD [9+edi]
532	mov	ebp,DWORD [13+edi]
533	mov	DWORD [20+edi],1	; mark the accumulator as being in 26-bit limb form
534	shr	ecx,2
535	and	eax,67108863
536	shr	edx,4
537	and	ecx,67108863
538	shr	esi,6
539	and	edx,67108863
540	movd	xmm0,eax
541	movd	xmm1,ecx
542	movd	xmm2,edx
543	movd	xmm3,esi
544	movd	xmm4,ebp
; esi and ecx were used as scratch above; reload the input pointer and length.
545	mov	esi,DWORD [24+esp]
546	mov	ecx,DWORD [28+esp]
; FIX: the reload above fetches the raw length and used to bypass the
; rounding applied at function entry, so a length that is not a multiple of
; 16 reached the block accounting below with stray low bits on this path
; only. Re-apply the mask; no flags are live here.
; NOTE(review): this file is generated from a Perl script; mirror the change
; there so it is not lost on regeneration.
	and	ecx,-16
547	jmp	NEAR L$010base2_32
548align	16
549L$009base2_26:
; Accumulator already stored as five 26-bit limbs: load them directly.
550	movd	xmm0,DWORD [edi]
551	movd	xmm1,DWORD [4+edi]
552	movd	xmm2,DWORD [8+edi]
553	movd	xmm3,DWORD [12+edi]
554	movd	xmm4,DWORD [16+edi]
555	movdqa	xmm7,[64+ebx]	; limb mask (the other path inherits it from __poly1305_init_sse2)
556L$010base2_32:
557	mov	eax,DWORD [32+esp]	; fourth stack argument
; 528-byte, 16-byte aligned scratch frame; ebp remembers the caller's esp.
558	mov	ebp,esp
559	sub	esp,528
560	and	esp,-16
561	lea	edi,[48+edi]	; edi -> table
562	shl	eax,24	; position the argument for addition into limb 4
563	test	ecx,31
564	jz	NEAR L$011even
; Odd number of blocks: absorb one block on its own first. Split it into
; 26-bit limbs, add to the accumulator, then multiply by the dword entries
; at [12+edi], [28+edi], ... of the table and reduce.
565	movdqu	xmm6,[esi]
566	lea	esi,[16+esi]
567	movdqa	xmm5,xmm6
568	pand	xmm6,xmm7
569	paddd	xmm0,xmm6
570	movdqa	xmm6,xmm5
571	psrlq	xmm5,26
572	psrldq	xmm6,6
573	pand	xmm5,xmm7
574	paddd	xmm1,xmm5
575	movdqa	xmm5,xmm6
576	psrlq	xmm6,4
577	pand	xmm6,xmm7
578	paddd	xmm2,xmm6
579	movdqa	xmm6,xmm5
580	psrlq	xmm5,30
581	pand	xmm5,xmm7
582	psrldq	xmm6,7
583	paddd	xmm3,xmm5
584	movd	xmm5,eax
585	paddd	xmm4,xmm6
586	movd	xmm6,DWORD [12+edi]
587	paddd	xmm4,xmm5
588	movdqa	[esp],xmm0
589	movdqa	[16+esp],xmm1
590	movdqa	[32+esp],xmm2
591	movdqa	[48+esp],xmm3
592	movdqa	[64+esp],xmm4
593	pmuludq	xmm0,xmm6
594	pmuludq	xmm1,xmm6
595	pmuludq	xmm2,xmm6
596	movd	xmm5,DWORD [28+edi]
597	pmuludq	xmm3,xmm6
598	pmuludq	xmm4,xmm6
599	movdqa	xmm6,xmm5
600	pmuludq	xmm5,[48+esp]
601	movdqa	xmm7,xmm6
602	pmuludq	xmm6,[32+esp]
603	paddq	xmm4,xmm5
604	movdqa	xmm5,xmm7
605	pmuludq	xmm7,[16+esp]
606	paddq	xmm3,xmm6
607	movd	xmm6,DWORD [92+edi]
608	pmuludq	xmm5,[esp]
609	paddq	xmm2,xmm7
610	pmuludq	xmm6,[64+esp]
611	movd	xmm7,DWORD [44+edi]
612	paddq	xmm1,xmm5
613	movdqa	xmm5,xmm7
614	pmuludq	xmm7,[32+esp]
615	paddq	xmm0,xmm6
616	movdqa	xmm6,xmm5
617	pmuludq	xmm5,[16+esp]
618	paddq	xmm4,xmm7
619	movd	xmm7,DWORD [108+edi]
620	pmuludq	xmm6,[esp]
621	paddq	xmm3,xmm5
622	movdqa	xmm5,xmm7
623	pmuludq	xmm7,[64+esp]
624	paddq	xmm2,xmm6
625	pmuludq	xmm5,[48+esp]
626	movd	xmm6,DWORD [60+edi]
627	paddq	xmm1,xmm7
628	movdqa	xmm7,xmm6
629	pmuludq	xmm6,[16+esp]
630	paddq	xmm0,xmm5
631	movd	xmm5,DWORD [124+edi]
632	pmuludq	xmm7,[esp]
633	paddq	xmm4,xmm6
634	movdqa	xmm6,xmm5
635	pmuludq	xmm5,[64+esp]
636	paddq	xmm3,xmm7
637	movdqa	xmm7,xmm6
638	pmuludq	xmm6,[48+esp]
639	paddq	xmm2,xmm5
640	pmuludq	xmm7,[32+esp]
641	movd	xmm5,DWORD [76+edi]
642	paddq	xmm1,xmm6
643	movd	xmm6,DWORD [140+edi]
644	pmuludq	xmm5,[esp]
645	paddq	xmm0,xmm7
646	movdqa	xmm7,xmm6
647	pmuludq	xmm6,[64+esp]
648	paddq	xmm4,xmm5
649	movdqa	xmm5,xmm7
650	pmuludq	xmm7,[16+esp]
651	paddq	xmm3,xmm6
652	movdqa	xmm6,xmm5
653	pmuludq	xmm5,[32+esp]
654	paddq	xmm0,xmm7
655	pmuludq	xmm6,[48+esp]
656	movdqa	xmm7,[64+ebx]
657	paddq	xmm1,xmm5
658	paddq	xmm2,xmm6
; Carry propagation back to 26-bit limbs (carry out of limb 4 re-enters
; limb 0 multiplied by 5).
659	movdqa	xmm5,xmm3
660	pand	xmm3,xmm7
661	psrlq	xmm5,26
662	paddq	xmm5,xmm4
663	movdqa	xmm6,xmm0
664	pand	xmm0,xmm7
665	psrlq	xmm6,26
666	movdqa	xmm4,xmm5
667	paddq	xmm6,xmm1
668	psrlq	xmm5,26
669	pand	xmm4,xmm7
670	movdqa	xmm1,xmm6
671	psrlq	xmm6,26
672	paddd	xmm0,xmm5
673	psllq	xmm5,2
674	paddq	xmm6,xmm2
675	paddq	xmm5,xmm0
676	pand	xmm1,xmm7
677	movdqa	xmm2,xmm6
678	psrlq	xmm6,26
679	pand	xmm2,xmm7
680	paddd	xmm6,xmm3
681	movdqa	xmm0,xmm5
682	psrlq	xmm5,26
683	movdqa	xmm3,xmm6
684	psrlq	xmm6,26
685	pand	xmm0,xmm7
686	paddd	xmm1,xmm5
687	pand	xmm3,xmm7
688	paddd	xmm4,xmm6
689	sub	ecx,16
690	jz	NEAR L$012done	; that was the only block
691L$011even:
; Even number of blocks remain. Expand the nine table vectors into two
; broadcast sets on the stack: pshufd 68 (low qword twice) at [edx+0..128]
; and pshufd 238 (high qword twice) at [edx-144..-16].
; The flags from `sub ecx,64` must survive until the cmovb and the jbe
; below; only flag-neutral instructions sit in between.
692	lea	edx,[384+esp]
693	lea	eax,[esi-32]
694	sub	ecx,64
695	movdqu	xmm5,[edi]
696	pshufd	xmm6,xmm5,68
697	cmovb	esi,eax	; fewer than 64 bytes left: back esi up by 32
698	pshufd	xmm5,xmm5,238
699	movdqa	[edx],xmm6
700	lea	eax,[160+esp]	; eax -> scratch slots for the second input pair
701	movdqu	xmm6,[16+edi]
702	movdqa	[edx-144],xmm5
703	pshufd	xmm5,xmm6,68
704	pshufd	xmm6,xmm6,238
705	movdqa	[16+edx],xmm5
706	movdqu	xmm5,[32+edi]
707	movdqa	[edx-128],xmm6
708	pshufd	xmm6,xmm5,68
709	pshufd	xmm5,xmm5,238
710	movdqa	[32+edx],xmm6
711	movdqu	xmm6,[48+edi]
712	movdqa	[edx-112],xmm5
713	pshufd	xmm5,xmm6,68
714	pshufd	xmm6,xmm6,238
715	movdqa	[48+edx],xmm5
716	movdqu	xmm5,[64+edi]
717	movdqa	[edx-96],xmm6
718	pshufd	xmm6,xmm5,68
719	pshufd	xmm5,xmm5,238
720	movdqa	[64+edx],xmm6
721	movdqu	xmm6,[80+edi]
722	movdqa	[edx-80],xmm5
723	pshufd	xmm5,xmm6,68
724	pshufd	xmm6,xmm6,238
725	movdqa	[80+edx],xmm5
726	movdqu	xmm5,[96+edi]
727	movdqa	[edx-64],xmm6
728	pshufd	xmm6,xmm5,68
729	pshufd	xmm5,xmm5,238
730	movdqa	[96+edx],xmm6
731	movdqu	xmm6,[112+edi]
732	movdqa	[edx-48],xmm5
733	pshufd	xmm5,xmm6,68
734	pshufd	xmm6,xmm6,238
735	movdqa	[112+edx],xmm5
736	movdqu	xmm5,[128+edi]
737	movdqa	[edx-32],xmm6
738	pshufd	xmm6,xmm5,68
739	pshufd	xmm5,xmm5,238
740	movdqa	[128+edx],xmm6
741	movdqa	[edx-16],xmm5
; Load two input blocks and split them, lane-interleaved, into five 26-bit
; limb vectors (xmm5, xmm6, xmm2, xmm3, xmm4); [ebx] ORs 1<<24 into limb 4.
; The running accumulator limbs are parked at [esp+80..144] meanwhile.
742	movdqu	xmm5,[32+esi]
743	movdqu	xmm6,[48+esi]
744	lea	esi,[32+esi]
745	movdqa	[112+esp],xmm2
746	movdqa	[128+esp],xmm3
747	movdqa	[144+esp],xmm4
748	movdqa	xmm2,xmm5
749	movdqa	xmm3,xmm6
750	psrldq	xmm2,6
751	psrldq	xmm3,6
752	movdqa	xmm4,xmm5
753	punpcklqdq	xmm2,xmm3
754	punpckhqdq	xmm4,xmm6
755	punpcklqdq	xmm5,xmm6
756	movdqa	xmm3,xmm2
757	psrlq	xmm2,4
758	psrlq	xmm3,30
759	movdqa	xmm6,xmm5
760	psrlq	xmm4,40
761	psrlq	xmm6,26
762	pand	xmm5,xmm7
763	pand	xmm6,xmm7
764	pand	xmm2,xmm7
765	pand	xmm3,xmm7
766	por	xmm4,[ebx]
767	movdqa	[80+esp],xmm0
768	movdqa	[96+esp],xmm1
769	jbe	NEAR L$013skip_loop	; 64 bytes or fewer remained before the `sub`
770	jmp	NEAR L$014loop
771align	32
772L$014loop:
; Main loop: 64 bytes (four blocks) per iteration.
; First half: multiply the input pair already split in xmm5/xmm6/xmm2/xmm3/
; xmm4 by the high-half table set at [edx-144..-16].
773	movdqa	xmm7,[edx-144]
774	movdqa	[16+eax],xmm6
775	movdqa	[32+eax],xmm2
776	movdqa	[48+eax],xmm3
777	movdqa	[64+eax],xmm4
778	movdqa	xmm1,xmm5
779	pmuludq	xmm5,xmm7
780	movdqa	xmm0,xmm6
781	pmuludq	xmm6,xmm7
782	pmuludq	xmm2,xmm7
783	pmuludq	xmm3,xmm7
784	pmuludq	xmm4,xmm7
785	pmuludq	xmm0,[edx-16]
786	movdqa	xmm7,xmm1
787	pmuludq	xmm1,[edx-128]
788	paddq	xmm0,xmm5
789	movdqa	xmm5,xmm7
790	pmuludq	xmm7,[edx-112]
791	paddq	xmm1,xmm6
792	movdqa	xmm6,xmm5
793	pmuludq	xmm5,[edx-96]
794	paddq	xmm2,xmm7
795	movdqa	xmm7,[16+eax]
796	pmuludq	xmm6,[edx-80]
797	paddq	xmm3,xmm5
798	movdqa	xmm5,xmm7
799	pmuludq	xmm7,[edx-128]
800	paddq	xmm4,xmm6
801	movdqa	xmm6,xmm5
802	pmuludq	xmm5,[edx-112]
803	paddq	xmm2,xmm7
804	movdqa	xmm7,[32+eax]
805	pmuludq	xmm6,[edx-96]
806	paddq	xmm3,xmm5
807	movdqa	xmm5,xmm7
808	pmuludq	xmm7,[edx-32]
809	paddq	xmm4,xmm6
810	movdqa	xmm6,xmm5
811	pmuludq	xmm5,[edx-16]
812	paddq	xmm0,xmm7
813	movdqa	xmm7,xmm6
814	pmuludq	xmm6,[edx-128]
815	paddq	xmm1,xmm5
816	movdqa	xmm5,[48+eax]
817	pmuludq	xmm7,[edx-112]
818	paddq	xmm3,xmm6
819	movdqa	xmm6,xmm5
820	pmuludq	xmm5,[edx-48]
821	paddq	xmm4,xmm7
822	movdqa	xmm7,xmm6
823	pmuludq	xmm6,[edx-32]
824	paddq	xmm0,xmm5
825	movdqa	xmm5,xmm7
826	pmuludq	xmm7,[edx-16]
827	paddq	xmm1,xmm6
828	movdqa	xmm6,[64+eax]
829	pmuludq	xmm5,[edx-128]
830	paddq	xmm2,xmm7
831	movdqa	xmm7,xmm6
832	pmuludq	xmm6,[edx-16]
833	paddq	xmm4,xmm5
834	movdqa	xmm5,xmm7
835	pmuludq	xmm7,[edx-64]
836	paddq	xmm3,xmm6
837	movdqa	xmm6,xmm5
838	pmuludq	xmm5,[edx-48]
839	paddq	xmm0,xmm7
840	movdqa	xmm7,[64+ebx]
841	pmuludq	xmm6,[edx-32]
842	paddq	xmm1,xmm5
843	paddq	xmm2,xmm6
; Load and split the other input pair (32 bytes behind esi), add the parked
; accumulator limbs from [esp+80..144] to it, and keep the partial sums of
; the first half at [esp+16..64].
844	movdqu	xmm5,[esi-32]
845	movdqu	xmm6,[esi-16]
846	lea	esi,[32+esi]
847	movdqa	[32+esp],xmm2
848	movdqa	[48+esp],xmm3
849	movdqa	[64+esp],xmm4
850	movdqa	xmm2,xmm5
851	movdqa	xmm3,xmm6
852	psrldq	xmm2,6
853	psrldq	xmm3,6
854	movdqa	xmm4,xmm5
855	punpcklqdq	xmm2,xmm3
856	punpckhqdq	xmm4,xmm6
857	punpcklqdq	xmm5,xmm6
858	movdqa	xmm3,xmm2
859	psrlq	xmm2,4
860	psrlq	xmm3,30
861	movdqa	xmm6,xmm5
862	psrlq	xmm4,40
863	psrlq	xmm6,26
864	pand	xmm5,xmm7
865	pand	xmm6,xmm7
866	pand	xmm2,xmm7
867	pand	xmm3,xmm7
868	por	xmm4,[ebx]
869	lea	eax,[esi-32]
; Flags from this `sub` drive both the cmovb below and the `ja` at the
; bottom of the loop; nothing in between modifies them.
870	sub	ecx,64
871	paddd	xmm5,[80+esp]
872	paddd	xmm6,[96+esp]
873	paddd	xmm2,[112+esp]
874	paddd	xmm3,[128+esp]
875	paddd	xmm4,[144+esp]
876	cmovb	esi,eax
877	lea	eax,[160+esp]
; Second half: multiply by the low-half table set at [edx+0..128] and add
; the partial sums of the first half.
878	movdqa	xmm7,[edx]
879	movdqa	[16+esp],xmm1
880	movdqa	[16+eax],xmm6
881	movdqa	[32+eax],xmm2
882	movdqa	[48+eax],xmm3
883	movdqa	[64+eax],xmm4
884	movdqa	xmm1,xmm5
885	pmuludq	xmm5,xmm7
886	paddq	xmm5,xmm0
887	movdqa	xmm0,xmm6
888	pmuludq	xmm6,xmm7
889	pmuludq	xmm2,xmm7
890	pmuludq	xmm3,xmm7
891	pmuludq	xmm4,xmm7
892	paddq	xmm6,[16+esp]
893	paddq	xmm2,[32+esp]
894	paddq	xmm3,[48+esp]
895	paddq	xmm4,[64+esp]
896	pmuludq	xmm0,[128+edx]
897	movdqa	xmm7,xmm1
898	pmuludq	xmm1,[16+edx]
899	paddq	xmm0,xmm5
900	movdqa	xmm5,xmm7
901	pmuludq	xmm7,[32+edx]
902	paddq	xmm1,xmm6
903	movdqa	xmm6,xmm5
904	pmuludq	xmm5,[48+edx]
905	paddq	xmm2,xmm7
906	movdqa	xmm7,[16+eax]
907	pmuludq	xmm6,[64+edx]
908	paddq	xmm3,xmm5
909	movdqa	xmm5,xmm7
910	pmuludq	xmm7,[16+edx]
911	paddq	xmm4,xmm6
912	movdqa	xmm6,xmm5
913	pmuludq	xmm5,[32+edx]
914	paddq	xmm2,xmm7
915	movdqa	xmm7,[32+eax]
916	pmuludq	xmm6,[48+edx]
917	paddq	xmm3,xmm5
918	movdqa	xmm5,xmm7
919	pmuludq	xmm7,[112+edx]
920	paddq	xmm4,xmm6
921	movdqa	xmm6,xmm5
922	pmuludq	xmm5,[128+edx]
923	paddq	xmm0,xmm7
924	movdqa	xmm7,xmm6
925	pmuludq	xmm6,[16+edx]
926	paddq	xmm1,xmm5
927	movdqa	xmm5,[48+eax]
928	pmuludq	xmm7,[32+edx]
929	paddq	xmm3,xmm6
930	movdqa	xmm6,xmm5
931	pmuludq	xmm5,[96+edx]
932	paddq	xmm4,xmm7
933	movdqa	xmm7,xmm6
934	pmuludq	xmm6,[112+edx]
935	paddq	xmm0,xmm5
936	movdqa	xmm5,xmm7
937	pmuludq	xmm7,[128+edx]
938	paddq	xmm1,xmm6
939	movdqa	xmm6,[64+eax]
940	pmuludq	xmm5,[16+edx]
941	paddq	xmm2,xmm7
942	movdqa	xmm7,xmm6
943	pmuludq	xmm6,[128+edx]
944	paddq	xmm4,xmm5
945	movdqa	xmm5,xmm7
946	pmuludq	xmm7,[80+edx]
947	paddq	xmm3,xmm6
948	movdqa	xmm6,xmm5
949	pmuludq	xmm5,[96+edx]
950	paddq	xmm0,xmm7
951	movdqa	xmm7,[64+ebx]
952	pmuludq	xmm6,[112+edx]
953	paddq	xmm1,xmm5
954	paddq	xmm2,xmm6
; Carry propagation back to 26-bit limbs.
955	movdqa	xmm5,xmm3
956	pand	xmm3,xmm7
957	psrlq	xmm5,26
958	paddq	xmm5,xmm4
959	movdqa	xmm6,xmm0
960	pand	xmm0,xmm7
961	psrlq	xmm6,26
962	movdqa	xmm4,xmm5
963	paddq	xmm6,xmm1
964	psrlq	xmm5,26
965	pand	xmm4,xmm7
966	movdqa	xmm1,xmm6
967	psrlq	xmm6,26
968	paddd	xmm0,xmm5
969	psllq	xmm5,2
970	paddq	xmm6,xmm2
971	paddq	xmm5,xmm0
972	pand	xmm1,xmm7
973	movdqa	xmm2,xmm6
974	psrlq	xmm6,26
975	pand	xmm2,xmm7
976	paddd	xmm6,xmm3
977	movdqa	xmm0,xmm5
978	psrlq	xmm5,26
979	movdqa	xmm3,xmm6
980	psrlq	xmm6,26
981	pand	xmm0,xmm7
982	paddd	xmm1,xmm5
983	pand	xmm3,xmm7
984	paddd	xmm4,xmm6
; Pre-load and split the next input pair for the following iteration (or
; for the tail code), parking the accumulator limbs at [esp+80..144] again.
985	movdqu	xmm5,[32+esi]
986	movdqu	xmm6,[48+esi]
987	lea	esi,[32+esi]
988	movdqa	[112+esp],xmm2
989	movdqa	[128+esp],xmm3
990	movdqa	[144+esp],xmm4
991	movdqa	xmm2,xmm5
992	movdqa	xmm3,xmm6
993	psrldq	xmm2,6
994	psrldq	xmm3,6
995	movdqa	xmm4,xmm5
996	punpcklqdq	xmm2,xmm3
997	punpckhqdq	xmm4,xmm6
998	punpcklqdq	xmm5,xmm6
999	movdqa	xmm3,xmm2
1000	psrlq	xmm2,4
1001	psrlq	xmm3,30
1002	movdqa	xmm6,xmm5
1003	psrlq	xmm4,40
1004	psrlq	xmm6,26
1005	pand	xmm5,xmm7
1006	pand	xmm6,xmm7
1007	pand	xmm2,xmm7
1008	pand	xmm3,xmm7
1009	por	xmm4,[ebx]
1010	movdqa	[80+esp],xmm0
1011	movdqa	[96+esp],xmm1
1012	ja	NEAR L$014loop	; more than 64 bytes were left before the last `sub ecx,64`
1013L$013skip_loop:
; Tail. After `add ecx,32`: zero means exactly two blocks remain (the parked
; accumulator is added to this pair before multiplying); non-zero means four
; blocks remain (long tail, which handles a second pair further down).
; The jz at the end of this multiply still tests the flags of this `add`.
1014	pshufd	xmm7,[edx-144],16
1015	add	ecx,32
1016	jnz	NEAR L$015long_tail
1017	paddd	xmm5,xmm0
1018	paddd	xmm6,xmm1
1019	paddd	xmm2,[112+esp]
1020	paddd	xmm3,[128+esp]
1021	paddd	xmm4,[144+esp]
1022L$015long_tail:
1023	movdqa	[eax],xmm5
1024	movdqa	[16+eax],xmm6
1025	movdqa	[32+eax],xmm2
1026	movdqa	[48+eax],xmm3
1027	movdqa	[64+eax],xmm4
1028	pmuludq	xmm5,xmm7
1029	pmuludq	xmm6,xmm7
1030	pmuludq	xmm2,xmm7
1031	movdqa	xmm0,xmm5
1032	pshufd	xmm5,[edx-128],16
1033	pmuludq	xmm3,xmm7
1034	movdqa	xmm1,xmm6
1035	pmuludq	xmm4,xmm7
1036	movdqa	xmm6,xmm5
1037	pmuludq	xmm5,[48+eax]
1038	movdqa	xmm7,xmm6
1039	pmuludq	xmm6,[32+eax]
1040	paddq	xmm4,xmm5
1041	movdqa	xmm5,xmm7
1042	pmuludq	xmm7,[16+eax]
1043	paddq	xmm3,xmm6
1044	pshufd	xmm6,[edx-64],16
1045	pmuludq	xmm5,[eax]
1046	paddq	xmm2,xmm7
1047	pmuludq	xmm6,[64+eax]
1048	pshufd	xmm7,[edx-112],16
1049	paddq	xmm1,xmm5
1050	movdqa	xmm5,xmm7
1051	pmuludq	xmm7,[32+eax]
1052	paddq	xmm0,xmm6
1053	movdqa	xmm6,xmm5
1054	pmuludq	xmm5,[16+eax]
1055	paddq	xmm4,xmm7
1056	pshufd	xmm7,[edx-48],16
1057	pmuludq	xmm6,[eax]
1058	paddq	xmm3,xmm5
1059	movdqa	xmm5,xmm7
1060	pmuludq	xmm7,[64+eax]
1061	paddq	xmm2,xmm6
1062	pmuludq	xmm5,[48+eax]
1063	pshufd	xmm6,[edx-96],16
1064	paddq	xmm1,xmm7
1065	movdqa	xmm7,xmm6
1066	pmuludq	xmm6,[16+eax]
1067	paddq	xmm0,xmm5
1068	pshufd	xmm5,[edx-32],16
1069	pmuludq	xmm7,[eax]
1070	paddq	xmm4,xmm6
1071	movdqa	xmm6,xmm5
1072	pmuludq	xmm5,[64+eax]
1073	paddq	xmm3,xmm7
1074	movdqa	xmm7,xmm6
1075	pmuludq	xmm6,[48+eax]
1076	paddq	xmm2,xmm5
1077	pmuludq	xmm7,[32+eax]
1078	pshufd	xmm5,[edx-80],16
1079	paddq	xmm1,xmm6
1080	pshufd	xmm6,[edx-16],16
1081	pmuludq	xmm5,[eax]
1082	paddq	xmm0,xmm7
1083	movdqa	xmm7,xmm6
1084	pmuludq	xmm6,[64+eax]
1085	paddq	xmm4,xmm5
1086	movdqa	xmm5,xmm7
1087	pmuludq	xmm7,[16+eax]
1088	paddq	xmm3,xmm6
1089	movdqa	xmm6,xmm5
1090	pmuludq	xmm5,[32+eax]
1091	paddq	xmm0,xmm7
1092	pmuludq	xmm6,[48+eax]
1093	movdqa	xmm7,[64+ebx]
1094	paddq	xmm1,xmm5
1095	paddq	xmm2,xmm6
1096	jz	NEAR L$016short_tail	; ZF still from `add ecx,32` above
; Long tail only: load and split the other pair (32 bytes behind esi), add
; the parked accumulator, multiply by the low-half table set ([edx+0..128])
; and accumulate onto the products computed above.
1097	movdqu	xmm5,[esi-32]
1098	movdqu	xmm6,[esi-16]
1099	lea	esi,[32+esi]
1100	movdqa	[32+esp],xmm2
1101	movdqa	[48+esp],xmm3
1102	movdqa	[64+esp],xmm4
1103	movdqa	xmm2,xmm5
1104	movdqa	xmm3,xmm6
1105	psrldq	xmm2,6
1106	psrldq	xmm3,6
1107	movdqa	xmm4,xmm5
1108	punpcklqdq	xmm2,xmm3
1109	punpckhqdq	xmm4,xmm6
1110	punpcklqdq	xmm5,xmm6
1111	movdqa	xmm3,xmm2
1112	psrlq	xmm2,4
1113	psrlq	xmm3,30
1114	movdqa	xmm6,xmm5
1115	psrlq	xmm4,40
1116	psrlq	xmm6,26
1117	pand	xmm5,xmm7
1118	pand	xmm6,xmm7
1119	pand	xmm2,xmm7
1120	pand	xmm3,xmm7
1121	por	xmm4,[ebx]
1122	pshufd	xmm7,[edx],16
1123	paddd	xmm5,[80+esp]
1124	paddd	xmm6,[96+esp]
1125	paddd	xmm2,[112+esp]
1126	paddd	xmm3,[128+esp]
1127	paddd	xmm4,[144+esp]
1128	movdqa	[esp],xmm5
1129	pmuludq	xmm5,xmm7
1130	movdqa	[16+esp],xmm6
1131	pmuludq	xmm6,xmm7
1132	paddq	xmm0,xmm5
1133	movdqa	xmm5,xmm2
1134	pmuludq	xmm2,xmm7
1135	paddq	xmm1,xmm6
1136	movdqa	xmm6,xmm3
1137	pmuludq	xmm3,xmm7
1138	paddq	xmm2,[32+esp]
1139	movdqa	[32+esp],xmm5
1140	pshufd	xmm5,[16+edx],16
1141	paddq	xmm3,[48+esp]
1142	movdqa	[48+esp],xmm6
1143	movdqa	xmm6,xmm4
1144	pmuludq	xmm4,xmm7
1145	paddq	xmm4,[64+esp]
1146	movdqa	[64+esp],xmm6
1147	movdqa	xmm6,xmm5
1148	pmuludq	xmm5,[48+esp]
1149	movdqa	xmm7,xmm6
1150	pmuludq	xmm6,[32+esp]
1151	paddq	xmm4,xmm5
1152	movdqa	xmm5,xmm7
1153	pmuludq	xmm7,[16+esp]
1154	paddq	xmm3,xmm6
1155	pshufd	xmm6,[80+edx],16
1156	pmuludq	xmm5,[esp]
1157	paddq	xmm2,xmm7
1158	pmuludq	xmm6,[64+esp]
1159	pshufd	xmm7,[32+edx],16
1160	paddq	xmm1,xmm5
1161	movdqa	xmm5,xmm7
1162	pmuludq	xmm7,[32+esp]
1163	paddq	xmm0,xmm6
1164	movdqa	xmm6,xmm5
1165	pmuludq	xmm5,[16+esp]
1166	paddq	xmm4,xmm7
1167	pshufd	xmm7,[96+edx],16
1168	pmuludq	xmm6,[esp]
1169	paddq	xmm3,xmm5
1170	movdqa	xmm5,xmm7
1171	pmuludq	xmm7,[64+esp]
1172	paddq	xmm2,xmm6
1173	pmuludq	xmm5,[48+esp]
1174	pshufd	xmm6,[48+edx],16
1175	paddq	xmm1,xmm7
1176	movdqa	xmm7,xmm6
1177	pmuludq	xmm6,[16+esp]
1178	paddq	xmm0,xmm5
1179	pshufd	xmm5,[112+edx],16
1180	pmuludq	xmm7,[esp]
1181	paddq	xmm4,xmm6
1182	movdqa	xmm6,xmm5
1183	pmuludq	xmm5,[64+esp]
1184	paddq	xmm3,xmm7
1185	movdqa	xmm7,xmm6
1186	pmuludq	xmm6,[48+esp]
1187	paddq	xmm2,xmm5
1188	pmuludq	xmm7,[32+esp]
1189	pshufd	xmm5,[64+edx],16
1190	paddq	xmm1,xmm6
1191	pshufd	xmm6,[128+edx],16
1192	pmuludq	xmm5,[esp]
1193	paddq	xmm0,xmm7
1194	movdqa	xmm7,xmm6
1195	pmuludq	xmm6,[64+esp]
1196	paddq	xmm4,xmm5
1197	movdqa	xmm5,xmm7
1198	pmuludq	xmm7,[16+esp]
1199	paddq	xmm3,xmm6
1200	movdqa	xmm6,xmm5
1201	pmuludq	xmm5,[32+esp]
1202	paddq	xmm0,xmm7
1203	pmuludq	xmm6,[48+esp]
1204	movdqa	xmm7,[64+ebx]
1205	paddq	xmm1,xmm5
1206	paddq	xmm2,xmm6
1207L$016short_tail:
; Horizontal add: fold the high qword of every limb vector onto the low
; qword (pshufd 78 swaps the two halves), then do one last carry pass.
1208	pshufd	xmm6,xmm4,78
1209	pshufd	xmm5,xmm3,78
1210	paddq	xmm4,xmm6
1211	paddq	xmm3,xmm5
1212	pshufd	xmm6,xmm0,78
1213	pshufd	xmm5,xmm1,78
1214	paddq	xmm0,xmm6
1215	paddq	xmm1,xmm5
1216	pshufd	xmm6,xmm2,78
1217	movdqa	xmm5,xmm3
1218	pand	xmm3,xmm7
1219	psrlq	xmm5,26
1220	paddq	xmm2,xmm6
1221	paddq	xmm5,xmm4
1222	movdqa	xmm6,xmm0
1223	pand	xmm0,xmm7
1224	psrlq	xmm6,26
1225	movdqa	xmm4,xmm5
1226	paddq	xmm6,xmm1
1227	psrlq	xmm5,26
1228	pand	xmm4,xmm7
1229	movdqa	xmm1,xmm6
1230	psrlq	xmm6,26
1231	paddd	xmm0,xmm5
1232	psllq	xmm5,2
1233	paddq	xmm6,xmm2
1234	paddq	xmm5,xmm0
1235	pand	xmm1,xmm7
1236	movdqa	xmm2,xmm6
1237	psrlq	xmm6,26
1238	pand	xmm2,xmm7
1239	paddd	xmm6,xmm3
1240	movdqa	xmm0,xmm5
1241	psrlq	xmm5,26
1242	movdqa	xmm3,xmm6
1243	psrlq	xmm6,26
1244	pand	xmm0,xmm7
1245	paddd	xmm1,xmm5
1246	pand	xmm3,xmm7
1247	paddd	xmm4,xmm6
1248L$012done:
; Store the five accumulator limbs back at context+0..+16 (edi is context+48).
1249	movd	DWORD [edi-48],xmm0
1250	movd	DWORD [edi-44],xmm1
1251	movd	DWORD [edi-40],xmm2
1252	movd	DWORD [edi-36],xmm3
1253	movd	DWORD [edi-32],xmm4
1254	mov	esp,ebp	; drop the aligned scratch frame
1255L$006nodata:
1256	pop	edi
1257	pop	esi
1258	pop	ebx
1259	pop	ebp
1260	ret
; __poly1305_emit_sse2 -- emit routine paired with __poly1305_blocks_sse2.
; Stack arguments (after four register pushes):
;   [20+esp] -> ebp : context
;   [24+esp] -> edi : 16-byte output buffer
;   [28+esp]        : pointer to four dwords added to the result before storing
; If the representation flag at context+20 is zero, control transfers to
; L$enter_emit in _GFp_poly1305_emit. Otherwise the five 26-bit limbs at
; context+0..+16 are repacked into 32-bit words, partially reduced, and then
; finished the same way as the integer emit routine.
; Saves and restores ebp, ebx, esi, edi; overwrites xmm0-xmm3.
1261align	32
1262align	16
1263__poly1305_emit_sse2:
1264	push	ebp
1265	push	ebx
1266	push	esi
1267	push	edi
1268	mov	ebp,DWORD [20+esp]
1269	cmp	DWORD [20+ebp],0
1270	je	NEAR L$enter_emit	; accumulator still in 32-bit form
1271	mov	eax,DWORD [ebp]	; limb 0
1272	mov	edi,DWORD [4+ebp]	; limb 1
1273	mov	ecx,DWORD [8+ebp]	; limb 2
1274	mov	edx,DWORD [12+ebp]	; limb 3
1275	mov	esi,DWORD [16+ebp]	; limb 4
; Repack 5 x 26-bit limbs into 32-bit words with one running carry:
;   word0 = limb0 + (limb1 << 26)
;   word1 = (limb1 >> 6)  + carry + (limb2 << 20)
;   word2 = (limb2 >> 12) + carry + (limb3 << 14)
;   word3 = (limb3 >> 18) + carry + (limb4 << 8)
;   top   = (limb4 >> 24) + carry
1276	mov	ebx,edi
1277	shl	edi,26
1278	shr	ebx,6
1279	add	eax,edi
1280	mov	edi,ecx
1281	adc	ebx,0
1282	shl	edi,20
1283	shr	ecx,12
1284	add	ebx,edi
1285	mov	edi,edx
1286	adc	ecx,0
1287	shl	edi,14
1288	shr	edx,18
1289	add	ecx,edi
1290	mov	edi,esi
1291	adc	edx,0
1292	shl	edi,8
1293	shr	esi,24
1294	add	edx,edi
1295	adc	esi,0
; Fold the top word: keep its low two bits in esi and add (top >> 2) * 5 to
; the bottom word, propagating the carry upward.
1296	mov	edi,esi
1297	and	esi,3
1298	shr	edi,2
1299	lea	ebp,[edi*4+edi]
1300	mov	edi,DWORD [24+esp]	; edi = output buffer
1301	add	eax,ebp
1302	mov	ebp,DWORD [28+esp]	; ebp = pointer to the four addend dwords
1303	adc	ebx,0
1304	adc	ecx,0
1305	adc	edx,0
1306	adc	esi,0
; Stash the unmodified words in xmm0-xmm3 while computing value + 5 in place;
; movd does not disturb the carry chain.
1307	movd	xmm0,eax
1308	add	eax,5
1309	movd	xmm1,ebx
1310	adc	ebx,0
1311	movd	xmm2,ecx
1312	adc	ecx,0
1313	movd	xmm3,edx
1314	adc	edx,0
1315	adc	esi,0
; esi = -(top >> 2): used as a select mask between (value + 5) and value.
1316	shr	esi,2
1317	neg	esi
1318	and	eax,esi
1319	and	ebx,esi
1320	and	ecx,esi
1321	and	edx,esi
; Park the masked (value + 5) in the output buffer and fetch the stashed
; original words back into the integer registers.
1322	mov	DWORD [edi],eax
1323	movd	eax,xmm0
1324	mov	DWORD [4+edi],ebx
1325	movd	ebx,xmm1
1326	mov	DWORD [8+edi],ecx
1327	movd	ecx,xmm2
1328	mov	DWORD [12+edi],edx
1329	movd	edx,xmm3
1330	not	esi
; Branch-free select: (value & ~mask) | ((value + 5) & mask).
1331	and	eax,esi
1332	and	ebx,esi
1333	or	eax,DWORD [edi]
1334	and	ecx,esi
1335	or	ebx,DWORD [4+edi]
1336	and	edx,esi
1337	or	ecx,DWORD [8+edi]
1338	or	edx,DWORD [12+edi]
; Add the four addend dwords with carry (final carry-out discarded) and
; store the 16-byte result; the interleaved mov stores leave CF untouched.
1339	add	eax,DWORD [ebp]
1340	adc	ebx,DWORD [4+ebp]
1341	mov	DWORD [edi],eax
1342	adc	ecx,DWORD [8+ebp]
1343	mov	DWORD [4+edi],ebx
1344	adc	edx,DWORD [12+ebp]
1345	mov	DWORD [8+edi],ecx
1346	mov	DWORD [12+edi],edx
1347	pop	edi
1348	pop	esi
1349	pop	ebx
1350	pop	ebp
1351	ret
; Constant pool for the SSE2 routines, addressed relative to ebx.
;   +0  : 16777216 (1 << 24) in each qword lane; OR-ed into limb 4 via `por xmm4,[ebx]`
;   +32 : zeros
;   +64 : 67108863 (0x3FFFFFF) in each qword lane; the 26-bit limb mask at [64+ebx]
;   +96 : 0x0FFFFFFF, 0x0FFFFFFC x3 -- the same masks _GFp_poly1305_init_asm
;         applies with immediates; not referenced by the code in this file
1352align	64
1353L$const_sse2:
1354dd	16777216,0,16777216,0,16777216,0,16777216,0
1355dd	0,0,0,0,0,0,0,0
1356dd	67108863,0,67108863,0,67108863,0,67108863,0
1357dd	268435455,268435452,268435452,268435452
; ASCII, NUL-terminated: "Poly1305 for x86, CRYPTOGAMS by <appro@openssl.org>"
1358db	80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54
1359db	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
1360db	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
1361db	114,103,62,0
1362align	4
; 16-byte common symbol; _GFp_poly1305_init_asm reads its first dword.
1363segment	.bss
1364common	_GFp_ia32cap_P 16
1365