; Pick the section declaration that matches the object format NASM is
; emitting.  Every branch opens a code section; the obj and win32 branches
; additionally request 64-byte section alignment.
1%ifidn __OUTPUT_FORMAT__,obj
2section	code	use32 class=code align=64
3%elifidn __OUTPUT_FORMAT__,win32
; NOTE(review): an absolute symbol @feat.00 with bit 0 set is presumably the
; COFF "safe SEH" marker expected by the win32 linker -- confirm.
4$@feat.00 equ 1
5section	.text	code align=64
6%else
7section	.text	code
8%endif
;-----------------------------------------------------------------------
; _gcm_gmult_4bit_x86
;
; Integer-only routine that transforms one 16-byte block in place using
; a table of 16-byte entries, consuming the block one 4-bit nibble at a
; time from byte 15 down to byte 0.
;
; Stack arguments (at [104+esp] / [108+esp] once the prologue has run,
; i.e. the first and second argument above the return address):
;   arg1  pointer to the 16-byte block: read on entry, overwritten with
;         the result on exit
;   arg2  pointer to the table, indexed with (nibble * 16)
;   NOTE(review): presumably Xi and Htable of GHASH -- confirm against
;   the C prototype.
;
; Registers: ebp, ebx, esi, edi are saved and restored; eax, ecx, edx and
; the flags are modified.  Plain `ret`: the caller removes the arguments.
;
; Frame (84 bytes):
;   [esp+0  .. 15]  copy of the input block
;   [esp+16 .. 79]  16 dword constants, indexed by the nibble shifted out
;                   of the accumulator (same values as the odd-indexed
;                   dwords of L$rem_4bit)
;   [esp+80 .. 83]  not referenced
;
; Accumulator: 128 bits held as ebp:edx:ecx:ebx (ebp most significant),
; loaded from / XORed with table-entry offsets +4, +0, +12, +8.
;-----------------------------------------------------------------------
9global	_gcm_gmult_4bit_x86
10align	16
11_gcm_gmult_4bit_x86:
12L$_gcm_gmult_4bit_x86_begin:
13	push	ebp
14	push	ebx
15	push	esi
16	push	edi
17	sub	esp,84
	; edi = arg1 (block), esi = arg2 (table)
18	mov	edi,DWORD [104+esp]
19	mov	esi,DWORD [108+esp]
20	mov	ebp,DWORD [edi]
21	mov	edx,DWORD [4+edi]
22	mov	ecx,DWORD [8+edi]
23	mov	ebx,DWORD [12+edi]
	; build the 16-entry dword constant table at [esp+16]
24	mov	DWORD [16+esp],0
25	mov	DWORD [20+esp],471859200
26	mov	DWORD [24+esp],943718400
27	mov	DWORD [28+esp],610271232
28	mov	DWORD [32+esp],1887436800
29	mov	DWORD [36+esp],1822425088
30	mov	DWORD [40+esp],1220542464
31	mov	DWORD [44+esp],1423966208
32	mov	DWORD [48+esp],3774873600
33	mov	DWORD [52+esp],4246732800
34	mov	DWORD [56+esp],3644850176
35	mov	DWORD [60+esp],3311403008
36	mov	DWORD [64+esp],2441084928
37	mov	DWORD [68+esp],2376073216
38	mov	DWORD [72+esp],2847932416
39	mov	DWORD [76+esp],3051356160
	; keep a byte-addressable copy of the input block at [esp+0..15]
40	mov	DWORD [esp],ebp
41	mov	DWORD [4+esp],edx
42	mov	DWORD [8+esp],ecx
43	mov	DWORD [12+esp],ebx
	; ebx = (low nibble of byte 15) * 16 = offset of the first table entry
44	shr	ebx,20
45	and	ebx,240
	; accumulator = table[low nibble of byte 15]
46	mov	ebp,DWORD [4+ebx*1+esi]
47	mov	edx,DWORD [ebx*1+esi]
48	mov	ecx,DWORD [12+ebx*1+esi]
49	mov	ebx,DWORD [8+ebx*1+esi]
	; eax is cleared once; only al is written afterwards, so eax always
	; holds a zero-extended byte usable as an index
50	xor	eax,eax
	; edi = index of the byte currently being consumed (15 down to 0)
51	mov	edi,15
52	jmp	NEAR L$000x86_loop
53align	16
54L$000x86_loop:
	; first half: shift the accumulator right by 4, fold in the constant
	; for the nibble that fell out, then XOR table[high nibble of byte edi]
55	mov	al,bl
56	shrd	ebx,ecx,4
57	and	al,15
58	shrd	ecx,edx,4
59	shrd	edx,ebp,4
60	shr	ebp,4
61	xor	ebp,DWORD [16+eax*4+esp]
62	mov	al,BYTE [edi*1+esp]
63	and	al,240
64	xor	ebx,DWORD [8+eax*1+esi]
65	xor	ecx,DWORD [12+eax*1+esi]
66	xor	edx,DWORD [eax*1+esi]
67	xor	ebp,DWORD [4+eax*1+esi]
	; step to the next lower byte; leave once byte 0 has been consumed
68	dec	edi
69	js	NEAR L$001x86_break
	; second half: same shift/fold, then XOR table[low nibble of byte edi]
70	mov	al,bl
71	shrd	ebx,ecx,4
72	and	al,15
73	shrd	ecx,edx,4
74	shrd	edx,ebp,4
75	shr	ebp,4
76	xor	ebp,DWORD [16+eax*4+esp]
77	mov	al,BYTE [edi*1+esp]
78	shl	al,4
79	xor	ebx,DWORD [8+eax*1+esi]
80	xor	ecx,DWORD [12+eax*1+esi]
81	xor	edx,DWORD [eax*1+esi]
82	xor	ebp,DWORD [4+eax*1+esi]
83	jmp	NEAR L$000x86_loop
84align	16
85L$001x86_break:
	; byte-reverse each dword and write the result back through arg1
86	bswap	ebx
87	bswap	ecx
88	bswap	edx
89	bswap	ebp
90	mov	edi,DWORD [104+esp]
91	mov	DWORD [12+edi],ebx
92	mov	DWORD [8+edi],ecx
93	mov	DWORD [4+edi],edx
94	mov	DWORD [edi],ebp
95	add	esp,84
96	pop	edi
97	pop	esi
98	pop	ebx
99	pop	ebp
100	ret
;-----------------------------------------------------------------------
; _gcm_ghash_4bit_x86
;
; Integer-only routine that folds a run of 16-byte input blocks into a
; 16-byte state.  For each block the state is XORed with the block and
; then pushed through the same nibble-at-a-time table transform used by
; _gcm_gmult_4bit_x86.
;
; Stack arguments (at [104..116+esp] once the prologue has run):
;   arg1  pointer to the 16-byte state: read on entry, overwritten on exit
;   arg2  pointer to the table, indexed with (nibble * 16)
;   arg3  pointer to the input; its stack slot is reused as the running
;         input pointer
;   arg4  input length in bytes; its stack slot is overwritten with the
;         end pointer (arg3 + arg4)
;   The first block is processed before the pointer is compared with the
;   end pointer, so one block is always consumed.
;   NOTE(review): presumably Xi, Htable, inp, len of GHASH -- confirm
;   against the C prototype.
;
; Registers: ebp, ebx, esi, edi are saved and restored; eax, ecx, edx and
; the flags are modified.  Plain `ret`: the caller removes the arguments.
;
; Frame (84 bytes):
;   [esp+0  .. 15]  state XOR current input block
;   [esp+16 .. 79]  16 dword constants, indexed by the nibble shifted out
;   [esp+80 .. 83]  not referenced
;-----------------------------------------------------------------------
101global	_gcm_ghash_4bit_x86
102align	16
103_gcm_ghash_4bit_x86:
104L$_gcm_ghash_4bit_x86_begin:
105	push	ebp
106	push	ebx
107	push	esi
108	push	edi
109	sub	esp,84
	; ebx = arg1 (state), esi = arg2 (table), edi = arg3 (input), ecx = arg4
110	mov	ebx,DWORD [104+esp]
111	mov	esi,DWORD [108+esp]
112	mov	edi,DWORD [112+esp]
113	mov	ecx,DWORD [116+esp]
	; turn the length into an end pointer and park it in the arg4 slot
114	add	ecx,edi
115	mov	DWORD [116+esp],ecx
	; load the current state into ebp, edx, ecx, ebx (ebx = bytes 12..15)
116	mov	ebp,DWORD [ebx]
117	mov	edx,DWORD [4+ebx]
118	mov	ecx,DWORD [8+ebx]
119	mov	ebx,DWORD [12+ebx]
	; build the 16-entry dword constant table at [esp+16]
120	mov	DWORD [16+esp],0
121	mov	DWORD [20+esp],471859200
122	mov	DWORD [24+esp],943718400
123	mov	DWORD [28+esp],610271232
124	mov	DWORD [32+esp],1887436800
125	mov	DWORD [36+esp],1822425088
126	mov	DWORD [40+esp],1220542464
127	mov	DWORD [44+esp],1423966208
128	mov	DWORD [48+esp],3774873600
129	mov	DWORD [52+esp],4246732800
130	mov	DWORD [56+esp],3644850176
131	mov	DWORD [60+esp],3311403008
132	mov	DWORD [64+esp],2441084928
133	mov	DWORD [68+esp],2376073216
134	mov	DWORD [72+esp],2847932416
135	mov	DWORD [76+esp],3051356160
136align	16
137L$002x86_outer_loop:
	; state ^= input block at edi; keep a byte-addressable copy on the stack
138	xor	ebx,DWORD [12+edi]
139	xor	ecx,DWORD [8+edi]
140	xor	edx,DWORD [4+edi]
141	xor	ebp,DWORD [edi]
142	mov	DWORD [12+esp],ebx
143	mov	DWORD [8+esp],ecx
144	mov	DWORD [4+esp],edx
145	mov	DWORD [esp],ebp
	; ebx = (low nibble of byte 15) * 16 = offset of the first table entry
146	shr	ebx,20
147	and	ebx,240
	; accumulator ebp:edx:ecx:ebx = table[low nibble of byte 15]
148	mov	ebp,DWORD [4+ebx*1+esi]
149	mov	edx,DWORD [ebx*1+esi]
150	mov	ecx,DWORD [12+ebx*1+esi]
151	mov	ebx,DWORD [8+ebx*1+esi]
	; eax zero-extended index register; edi now counts bytes 15 down to 0
	; (the input pointer lives in the arg3 stack slot meanwhile)
152	xor	eax,eax
153	mov	edi,15
154	jmp	NEAR L$003x86_loop
155align	16
156L$003x86_loop:
	; first half: shift accumulator right by 4, fold in the constant for
	; the nibble shifted out, XOR table[high nibble of byte edi]
157	mov	al,bl
158	shrd	ebx,ecx,4
159	and	al,15
160	shrd	ecx,edx,4
161	shrd	edx,ebp,4
162	shr	ebp,4
163	xor	ebp,DWORD [16+eax*4+esp]
164	mov	al,BYTE [edi*1+esp]
165	and	al,240
166	xor	ebx,DWORD [8+eax*1+esi]
167	xor	ecx,DWORD [12+eax*1+esi]
168	xor	edx,DWORD [eax*1+esi]
169	xor	ebp,DWORD [4+eax*1+esi]
	; next lower byte; leave once byte 0 has been consumed
170	dec	edi
171	js	NEAR L$004x86_break
	; second half: same shift/fold, XOR table[low nibble of byte edi]
172	mov	al,bl
173	shrd	ebx,ecx,4
174	and	al,15
175	shrd	ecx,edx,4
176	shrd	edx,ebp,4
177	shr	ebp,4
178	xor	ebp,DWORD [16+eax*4+esp]
179	mov	al,BYTE [edi*1+esp]
180	shl	al,4
181	xor	ebx,DWORD [8+eax*1+esi]
182	xor	ecx,DWORD [12+eax*1+esi]
183	xor	edx,DWORD [eax*1+esi]
184	xor	ebp,DWORD [4+eax*1+esi]
185	jmp	NEAR L$003x86_loop
186align	16
187L$004x86_break:
	; byte-reverse each dword so the registers match memory order again
188	bswap	ebx
189	bswap	ecx
190	bswap	edx
191	bswap	ebp
	; advance the input pointer by one block; the mov between cmp and jb
	; does not touch the flags
192	mov	edi,DWORD [112+esp]
193	lea	edi,[16+edi]
194	cmp	edi,DWORD [116+esp]
195	mov	DWORD [112+esp],edi
196	jb	NEAR L$002x86_outer_loop
	; write the final state back through arg1
197	mov	edi,DWORD [104+esp]
198	mov	DWORD [12+edi],ebx
199	mov	DWORD [8+edi],ecx
200	mov	DWORD [4+edi],edx
201	mov	DWORD [edi],ebp
202	add	esp,84
203	pop	edi
204	pop	esi
205	pop	ebx
206	pop	ebp
207	ret
;-----------------------------------------------------------------------
; _gcm_gmult_4bit_mmx
;
; MMX variant of the single-block, nibble-at-a-time table transform.
; The 128-bit accumulator lives in mm1:mm0 (mm1 = qword at entry+0,
; mm0 = qword at entry+8) and the per-nibble constants come from
; L$rem_4bit, located position-independently with a call/pop pair.
;
; Stack arguments (at [20+esp] / [24+esp] after the four pushes):
;   arg1 -> edi  pointer to the 16-byte block: read on entry, overwritten
;                with the result on exit
;   arg2 -> esi  pointer to the table, indexed with (nibble * 16)
;   NOTE(review): presumably Xi and Htable of GHASH -- confirm against
;   the C prototype.
;
; Registers: ebp, ebx, esi, edi are saved and restored; eax, ecx, edx,
; flags and mm0-mm2 are modified.  `emms` is executed before returning.
;-----------------------------------------------------------------------
208global	_gcm_gmult_4bit_mmx
209align	16
210_gcm_gmult_4bit_mmx:
211L$_gcm_gmult_4bit_mmx_begin:
212	push	ebp
213	push	ebx
214	push	esi
215	push	edi
216	mov	edi,DWORD [20+esp]
217	mov	esi,DWORD [24+esp]
	; call/pop yields the address of the label; add the link-time distance
	; to reach L$rem_4bit without an absolute relocation
218	call	L$005pic_point
219L$005pic_point:
220	pop	eax
221	lea	eax,[(L$rem_4bit-L$005pic_point)+eax]
	; split byte 15: ecx = low nibble * 16, edx = high nibble * 16.
	; ecx is cleared once and only cl is written later, so ecx stays a
	; zero-extended byte.
222	movzx	ebx,BYTE [15+edi]
223	xor	ecx,ecx
224	mov	edx,ebx
225	mov	cl,dl
	; ebp = index of the next byte to fetch (14 down to 0)
226	mov	ebp,14
227	shl	cl,4
228	and	edx,240
	; accumulator = table[low nibble of byte 15]
229	movq	mm0,[8+ecx*1+esi]
230	movq	mm1,[ecx*1+esi]
	; ebx tracks the low dword of mm0; its low 4 bits are the nibble that
	; the next shift drops
231	movd	ebx,mm0
232	jmp	NEAR L$006mmx_loop
233align	16
234L$006mmx_loop:
	; shift mm1:mm0 right by 4 (mm2 carries mm1's low nibble into the top
	; of mm0), fold in L$rem_4bit[dropped nibble], XOR table[edx]
	; (high nibble of the previous byte); meanwhile fetch the next byte
235	psrlq	mm0,4
236	and	ebx,15
237	movq	mm2,mm1
238	psrlq	mm1,4
239	pxor	mm0,[8+edx*1+esi]
240	mov	cl,BYTE [ebp*1+edi]
241	psllq	mm2,60
242	pxor	mm1,[ebx*8+eax]
	; flags set by this dec are consumed by the js below; the movd/pxor/mov
	; in between leave them untouched
243	dec	ebp
244	movd	ebx,mm0
245	pxor	mm1,[edx*1+esi]
246	mov	edx,ecx
247	pxor	mm0,mm2
248	js	NEAR L$007mmx_break
	; same step for the low nibble of the byte just fetched (ecx)
249	shl	cl,4
250	and	ebx,15
251	psrlq	mm0,4
252	and	edx,240
253	movq	mm2,mm1
254	psrlq	mm1,4
255	pxor	mm0,[8+ecx*1+esi]
256	psllq	mm2,60
257	pxor	mm1,[ebx*8+eax]
258	movd	ebx,mm0
259	pxor	mm1,[ecx*1+esi]
260	pxor	mm0,mm2
261	jmp	NEAR L$006mmx_loop
262align	16
263L$007mmx_break:
	; byte 0 was the last byte fetched: process its low nibble (ecx) ...
264	shl	cl,4
265	and	ebx,15
266	psrlq	mm0,4
267	and	edx,240
268	movq	mm2,mm1
269	psrlq	mm1,4
270	pxor	mm0,[8+ecx*1+esi]
271	psllq	mm2,60
272	pxor	mm1,[ebx*8+eax]
273	movd	ebx,mm0
274	pxor	mm1,[ecx*1+esi]
275	pxor	mm0,mm2
	; ... and then its high nibble (edx)
276	psrlq	mm0,4
277	and	ebx,15
278	movq	mm2,mm1
279	psrlq	mm1,4
280	pxor	mm0,[8+edx*1+esi]
281	psllq	mm2,60
282	pxor	mm1,[ebx*8+eax]
	; ebx = low dword of mm0 (the pxor with mm2 below only affects the
	; top 4 bits of mm0)
283	movd	ebx,mm0
284	pxor	mm1,[edx*1+esi]
285	pxor	mm0,mm2
	; unpack mm1:mm0 into four dwords: ebp:edx from mm1, ecx:ebx from mm0
286	psrlq	mm0,32
287	movd	edx,mm1
288	psrlq	mm1,32
289	movd	ecx,mm0
290	movd	ebp,mm1
291	bswap	ebx
292	bswap	edx
293	bswap	ecx
294	bswap	ebp
	; leave MMX state before returning
295	emms
	; write the result back through arg1
296	mov	DWORD [12+edi],ebx
297	mov	DWORD [4+edi],edx
298	mov	DWORD [8+edi],ecx
299	mov	DWORD [edi],ebp
300	pop	edi
301	pop	esi
302	pop	ebx
303	pop	ebp
304	ret
;-----------------------------------------------------------------------
; _gcm_ghash_4bit_mmx
;
; MMX variant that folds a run of 16-byte input blocks into a 16-byte
; state.  It first expands the caller's 16-entry table into several
; stack-resident tables, then consumes each block one byte per step
; (byte 15 down to byte 0), using L$rem_8bit for the per-byte constants.
;
; Stack arguments (at [20..32+esp] after the four pushes):
;   arg1 -> eax  pointer to the 16-byte state: read, then overwritten
;   arg2 -> ebx  pointer to the table of sixteen 16-byte entries
;   arg3 -> ecx  pointer to the input
;   arg4 -> edx  input length in bytes, converted to an end pointer
;   The block loop runs before its first comparison and exits only when
;   the input pointer equals the end pointer exactly (jne), so the code
;   relies on the length being a non-zero multiple of 16.
;   NOTE(review): presumably Xi, Htable, inp, len of GHASH -- confirm
;   against the C prototype.
;
; NOTE(review): pinsrw/pshufw on MMX registers need more than baseline
; MMX (SSE-era integer extensions) -- confirm the caller's CPU gate.
;
; Registers: ebp, ebx, esi, edi are saved and restored; eax, ecx, edx,
; flags and mm0-mm7 are modified.  `emms` is executed before returning.
;
; Frame, relative to esp after alignment (esp+16 is 64-byte aligned).
; "entry i" is the 16-byte table entry at arg2 + 16*i:
;   [esp+0   .. 15 ]  byte i  = (low 4 bits of qword at entry+8) << 4
;   [esp+16  .. 143]  qword i = qword at entry+8
;   [esp+144 .. 271]  qword i = qword at entry+0
;   [esp+272 .. 399]  qword i = (qword at entry+8) >> 4, with the low 4
;                     bits of the qword at entry+0 moved into its top
;   [esp+400 .. 527]  qword i = (qword at entry+0) >> 4
;   [esp+528 .. 539]  bytes 0..11 of (state XOR current input block)
;   [esp+544]         arg1 (state pointer)
;   [esp+548]         input pointer, already advanced past current block
;   [esp+552]         end pointer
;   [esp+556]         esp as it was right after the four pushes
;-----------------------------------------------------------------------
305global	_gcm_ghash_4bit_mmx
306align	16
307_gcm_ghash_4bit_mmx:
308L$_gcm_ghash_4bit_mmx_begin:
309	push	ebp
310	push	ebx
311	push	esi
312	push	edi
313	mov	eax,DWORD [20+esp]
314	mov	ebx,DWORD [24+esp]
315	mov	ecx,DWORD [28+esp]
316	mov	edx,DWORD [32+esp]
	; remember the unaligned esp so the epilogue can restore it
317	mov	ebp,esp
	; position-independent address of L$rem_8bit -> esi
318	call	L$008pic_point
319L$008pic_point:
320	pop	esi
321	lea	esi,[(L$rem_8bit-L$008pic_point)+esi]
	; carve out the aligned frame described in the header
322	sub	esp,544
323	and	esp,-64
324	sub	esp,16
	; edx = end pointer
325	add	edx,ecx
326	mov	DWORD [544+esp],eax
327	mov	DWORD [552+esp],edx
328	mov	DWORD [556+esp],ebp
	; bias the three base pointers by 128 so that all sixteen entries are
	; reachable with displacements in the range -128..+120
329	add	ebx,128
330	lea	edi,[144+esp]
331	lea	ebp,[400+esp]
	; ---- table expansion ------------------------------------------------
	; For each of the 16 entries: store the nibble byte at [esp+i], copy
	; both qwords (via edi), and store both qwords of the entry shifted
	; right by 4 (via ebp).  The loads for later entries are interleaved
	; with the stores for earlier ones, rotating through mm0..mm7.
332	mov	edx,DWORD [ebx-120]
333	movq	mm0,[ebx-120]
334	movq	mm3,[ebx-128]
335	shl	edx,4
336	mov	BYTE [esp],dl
337	mov	edx,DWORD [ebx-104]
338	movq	mm2,[ebx-104]
339	movq	mm5,[ebx-112]
340	movq	[edi-128],mm0
341	psrlq	mm0,4
342	movq	[edi],mm3
343	movq	mm7,mm3
344	psrlq	mm3,4
345	shl	edx,4
346	mov	BYTE [1+esp],dl
347	mov	edx,DWORD [ebx-88]
348	movq	mm1,[ebx-88]
349	psllq	mm7,60
350	movq	mm4,[ebx-96]
351	por	mm0,mm7
352	movq	[edi-120],mm2
353	psrlq	mm2,4
354	movq	[8+edi],mm5
355	movq	mm6,mm5
356	movq	[ebp-128],mm0
357	psrlq	mm5,4
358	movq	[ebp],mm3
359	shl	edx,4
360	mov	BYTE [2+esp],dl
361	mov	edx,DWORD [ebx-72]
362	movq	mm0,[ebx-72]
363	psllq	mm6,60
364	movq	mm3,[ebx-80]
365	por	mm2,mm6
366	movq	[edi-112],mm1
367	psrlq	mm1,4
368	movq	[16+edi],mm4
369	movq	mm7,mm4
370	movq	[ebp-120],mm2
371	psrlq	mm4,4
372	movq	[8+ebp],mm5
373	shl	edx,4
374	mov	BYTE [3+esp],dl
375	mov	edx,DWORD [ebx-56]
376	movq	mm2,[ebx-56]
377	psllq	mm7,60
378	movq	mm5,[ebx-64]
379	por	mm1,mm7
380	movq	[edi-104],mm0
381	psrlq	mm0,4
382	movq	[24+edi],mm3
383	movq	mm6,mm3
384	movq	[ebp-112],mm1
385	psrlq	mm3,4
386	movq	[16+ebp],mm4
387	shl	edx,4
388	mov	BYTE [4+esp],dl
389	mov	edx,DWORD [ebx-40]
390	movq	mm1,[ebx-40]
391	psllq	mm6,60
392	movq	mm4,[ebx-48]
393	por	mm0,mm6
394	movq	[edi-96],mm2
395	psrlq	mm2,4
396	movq	[32+edi],mm5
397	movq	mm7,mm5
398	movq	[ebp-104],mm0
399	psrlq	mm5,4
400	movq	[24+ebp],mm3
401	shl	edx,4
402	mov	BYTE [5+esp],dl
403	mov	edx,DWORD [ebx-24]
404	movq	mm0,[ebx-24]
405	psllq	mm7,60
406	movq	mm3,[ebx-32]
407	por	mm2,mm7
408	movq	[edi-88],mm1
409	psrlq	mm1,4
410	movq	[40+edi],mm4
411	movq	mm6,mm4
412	movq	[ebp-96],mm2
413	psrlq	mm4,4
414	movq	[32+ebp],mm5
415	shl	edx,4
416	mov	BYTE [6+esp],dl
417	mov	edx,DWORD [ebx-8]
418	movq	mm2,[ebx-8]
419	psllq	mm6,60
420	movq	mm5,[ebx-16]
421	por	mm1,mm6
422	movq	[edi-80],mm0
423	psrlq	mm0,4
424	movq	[48+edi],mm3
425	movq	mm7,mm3
426	movq	[ebp-88],mm1
427	psrlq	mm3,4
428	movq	[40+ebp],mm4
429	shl	edx,4
430	mov	BYTE [7+esp],dl
431	mov	edx,DWORD [8+ebx]
432	movq	mm1,[8+ebx]
433	psllq	mm7,60
434	movq	mm4,[ebx]
435	por	mm0,mm7
436	movq	[edi-72],mm2
437	psrlq	mm2,4
438	movq	[56+edi],mm5
439	movq	mm6,mm5
440	movq	[ebp-80],mm0
441	psrlq	mm5,4
442	movq	[48+ebp],mm3
443	shl	edx,4
444	mov	BYTE [8+esp],dl
445	mov	edx,DWORD [24+ebx]
446	movq	mm0,[24+ebx]
447	psllq	mm6,60
448	movq	mm3,[16+ebx]
449	por	mm2,mm6
450	movq	[edi-64],mm1
451	psrlq	mm1,4
452	movq	[64+edi],mm4
453	movq	mm7,mm4
454	movq	[ebp-72],mm2
455	psrlq	mm4,4
456	movq	[56+ebp],mm5
457	shl	edx,4
458	mov	BYTE [9+esp],dl
459	mov	edx,DWORD [40+ebx]
460	movq	mm2,[40+ebx]
461	psllq	mm7,60
462	movq	mm5,[32+ebx]
463	por	mm1,mm7
464	movq	[edi-56],mm0
465	psrlq	mm0,4
466	movq	[72+edi],mm3
467	movq	mm6,mm3
468	movq	[ebp-64],mm1
469	psrlq	mm3,4
470	movq	[64+ebp],mm4
471	shl	edx,4
472	mov	BYTE [10+esp],dl
473	mov	edx,DWORD [56+ebx]
474	movq	mm1,[56+ebx]
475	psllq	mm6,60
476	movq	mm4,[48+ebx]
477	por	mm0,mm6
478	movq	[edi-48],mm2
479	psrlq	mm2,4
480	movq	[80+edi],mm5
481	movq	mm7,mm5
482	movq	[ebp-56],mm0
483	psrlq	mm5,4
484	movq	[72+ebp],mm3
485	shl	edx,4
486	mov	BYTE [11+esp],dl
487	mov	edx,DWORD [72+ebx]
488	movq	mm0,[72+ebx]
489	psllq	mm7,60
490	movq	mm3,[64+ebx]
491	por	mm2,mm7
492	movq	[edi-40],mm1
493	psrlq	mm1,4
494	movq	[88+edi],mm4
495	movq	mm6,mm4
496	movq	[ebp-48],mm2
497	psrlq	mm4,4
498	movq	[80+ebp],mm5
499	shl	edx,4
500	mov	BYTE [12+esp],dl
501	mov	edx,DWORD [88+ebx]
502	movq	mm2,[88+ebx]
503	psllq	mm6,60
504	movq	mm5,[80+ebx]
505	por	mm1,mm6
506	movq	[edi-32],mm0
507	psrlq	mm0,4
508	movq	[96+edi],mm3
509	movq	mm7,mm3
510	movq	[ebp-40],mm1
511	psrlq	mm3,4
512	movq	[88+ebp],mm4
513	shl	edx,4
514	mov	BYTE [13+esp],dl
515	mov	edx,DWORD [104+ebx]
516	movq	mm1,[104+ebx]
517	psllq	mm7,60
518	movq	mm4,[96+ebx]
519	por	mm0,mm7
520	movq	[edi-24],mm2
521	psrlq	mm2,4
522	movq	[104+edi],mm5
523	movq	mm6,mm5
524	movq	[ebp-32],mm0
525	psrlq	mm5,4
526	movq	[96+ebp],mm3
527	shl	edx,4
528	mov	BYTE [14+esp],dl
529	mov	edx,DWORD [120+ebx]
530	movq	mm0,[120+ebx]
531	psllq	mm6,60
532	movq	mm3,[112+ebx]
533	por	mm2,mm6
534	movq	[edi-16],mm1
535	psrlq	mm1,4
536	movq	[112+edi],mm4
537	movq	mm7,mm4
538	movq	[ebp-24],mm2
539	psrlq	mm4,4
540	movq	[104+ebp],mm5
541	shl	edx,4
542	mov	BYTE [15+esp],dl
543	psllq	mm7,60
544	por	mm1,mm7
545	movq	[edi-8],mm0
546	psrlq	mm0,4
547	movq	[120+edi],mm3
548	movq	mm6,mm3
549	movq	[ebp-16],mm1
550	psrlq	mm3,4
551	movq	[112+ebp],mm4
552	psllq	mm6,60
553	por	mm0,mm6
554	movq	[ebp-8],mm0
555	movq	[120+ebp],mm3
	; ---- load the state: mm6 = bytes 0..7, ebx = bytes 8..11,
	;      edx = bytes 12..15 -------------------------------------------
556	movq	mm6,[eax]
557	mov	ebx,DWORD [8+eax]
558	mov	edx,DWORD [12+eax]
559align	16
560L$009outer:
	; state ^= input block; advance the input pointer; spill bytes 0..11
	; of the XORed block (bytes 12..15 stay in edx)
561	xor	edx,DWORD [12+ecx]
562	xor	ebx,DWORD [8+ecx]
563	pxor	mm6,[ecx]
564	lea	ecx,[16+ecx]
565	mov	DWORD [536+esp],ebx
566	movq	[528+esp],mm6
567	mov	DWORD [548+esp],ecx
	; eax is cleared once per block; only al is written afterwards.
	; Each `rol edx,8` brings the next lower byte of the current dword
	; into dl.  A byte is split into low nibble (eax, via and al,15) and
	; high nibble (ebp or edi, alternating, via shr 4).
568	xor	eax,eax
	; dl = byte 15
569	rol	edx,8
570	mov	al,dl
571	mov	ebp,eax
572	and	al,15
573	shr	ebp,4
	; mm0..mm2 collect the L$rem_8bit words inserted by pinsrw below
574	pxor	mm0,mm0
575	rol	edx,8
576	pxor	mm1,mm1
577	pxor	mm2,mm2
	; accumulator mm6:mm7 starts as the unshifted entry for the low
	; nibble of byte 15 (mm7 = qword at entry+8, mm6 = qword at entry+0)
578	movq	mm7,[16+eax*8+esp]
579	movq	mm6,[144+eax*8+esp]
	; per-byte step pattern, repeated below: take the next byte into al,
	; shift the accumulator right by 8 (mm3 carries mm6's low byte into
	; the top of mm7), XOR the pre-shifted entry for the previous high
	; nibble and the unshifted entry for the new low nibble, and derive an
	; L$rem_8bit index from the byte that was shifted out
580	mov	al,dl
581	movd	ebx,mm7
582	psrlq	mm7,8
583	movq	mm3,mm6
584	mov	edi,eax
585	psrlq	mm6,8
586	pxor	mm7,[272+ebp*8+esp]
587	and	al,15
588	psllq	mm3,56
589	shr	edi,4
590	pxor	mm7,[16+eax*8+esp]
591	rol	edx,8
592	pxor	mm6,[144+eax*8+esp]
593	pxor	mm7,mm3
594	pxor	mm6,[400+ebp*8+esp]
595	xor	bl,BYTE [ebp*1+esp]
596	mov	al,dl
597	movd	ecx,mm7
598	movzx	ebx,bl
599	psrlq	mm7,8
600	movq	mm3,mm6
601	mov	ebp,eax
602	psrlq	mm6,8
603	pxor	mm7,[272+edi*8+esp]
604	and	al,15
605	psllq	mm3,56
606	shr	ebp,4
607	pinsrw	mm2,WORD [ebx*2+esi],2
608	pxor	mm7,[16+eax*8+esp]
609	rol	edx,8
610	pxor	mm6,[144+eax*8+esp]
611	pxor	mm7,mm3
612	pxor	mm6,[400+edi*8+esp]
613	xor	cl,BYTE [edi*1+esp]
614	mov	al,dl
	; bytes 12..15 are used up: reload edx with bytes 8..11
615	mov	edx,DWORD [536+esp]
616	movd	ebx,mm7
617	movzx	ecx,cl
618	psrlq	mm7,8
619	movq	mm3,mm6
620	mov	edi,eax
621	psrlq	mm6,8
622	pxor	mm7,[272+ebp*8+esp]
623	and	al,15
624	psllq	mm3,56
625	pxor	mm6,mm2
626	shr	edi,4
627	pinsrw	mm1,WORD [ecx*2+esi],2
628	pxor	mm7,[16+eax*8+esp]
629	rol	edx,8
630	pxor	mm6,[144+eax*8+esp]
631	pxor	mm7,mm3
632	pxor	mm6,[400+ebp*8+esp]
633	xor	bl,BYTE [ebp*1+esp]
634	mov	al,dl
635	movd	ecx,mm7
636	movzx	ebx,bl
637	psrlq	mm7,8
638	movq	mm3,mm6
639	mov	ebp,eax
640	psrlq	mm6,8
641	pxor	mm7,[272+edi*8+esp]
642	and	al,15
643	psllq	mm3,56
644	pxor	mm6,mm1
645	shr	ebp,4
646	pinsrw	mm0,WORD [ebx*2+esi],2
647	pxor	mm7,[16+eax*8+esp]
648	rol	edx,8
649	pxor	mm6,[144+eax*8+esp]
650	pxor	mm7,mm3
651	pxor	mm6,[400+edi*8+esp]
652	xor	cl,BYTE [edi*1+esp]
653	mov	al,dl
654	movd	ebx,mm7
655	movzx	ecx,cl
656	psrlq	mm7,8
657	movq	mm3,mm6
658	mov	edi,eax
659	psrlq	mm6,8
660	pxor	mm7,[272+ebp*8+esp]
661	and	al,15
662	psllq	mm3,56
663	pxor	mm6,mm0
664	shr	edi,4
665	pinsrw	mm2,WORD [ecx*2+esi],2
666	pxor	mm7,[16+eax*8+esp]
667	rol	edx,8
668	pxor	mm6,[144+eax*8+esp]
669	pxor	mm7,mm3
670	pxor	mm6,[400+ebp*8+esp]
671	xor	bl,BYTE [ebp*1+esp]
672	mov	al,dl
673	movd	ecx,mm7
674	movzx	ebx,bl
675	psrlq	mm7,8
676	movq	mm3,mm6
677	mov	ebp,eax
678	psrlq	mm6,8
679	pxor	mm7,[272+edi*8+esp]
680	and	al,15
681	psllq	mm3,56
682	pxor	mm6,mm2
683	shr	ebp,4
684	pinsrw	mm1,WORD [ebx*2+esi],2
685	pxor	mm7,[16+eax*8+esp]
686	rol	edx,8
687	pxor	mm6,[144+eax*8+esp]
688	pxor	mm7,mm3
689	pxor	mm6,[400+edi*8+esp]
690	xor	cl,BYTE [edi*1+esp]
691	mov	al,dl
	; bytes 8..11 are used up: reload edx with bytes 4..7
692	mov	edx,DWORD [532+esp]
693	movd	ebx,mm7
694	movzx	ecx,cl
695	psrlq	mm7,8
696	movq	mm3,mm6
697	mov	edi,eax
698	psrlq	mm6,8
699	pxor	mm7,[272+ebp*8+esp]
700	and	al,15
701	psllq	mm3,56
702	pxor	mm6,mm1
703	shr	edi,4
704	pinsrw	mm0,WORD [ecx*2+esi],2
705	pxor	mm7,[16+eax*8+esp]
706	rol	edx,8
707	pxor	mm6,[144+eax*8+esp]
708	pxor	mm7,mm3
709	pxor	mm6,[400+ebp*8+esp]
710	xor	bl,BYTE [ebp*1+esp]
711	mov	al,dl
712	movd	ecx,mm7
713	movzx	ebx,bl
714	psrlq	mm7,8
715	movq	mm3,mm6
716	mov	ebp,eax
717	psrlq	mm6,8
718	pxor	mm7,[272+edi*8+esp]
719	and	al,15
720	psllq	mm3,56
721	pxor	mm6,mm0
722	shr	ebp,4
723	pinsrw	mm2,WORD [ebx*2+esi],2
724	pxor	mm7,[16+eax*8+esp]
725	rol	edx,8
726	pxor	mm6,[144+eax*8+esp]
727	pxor	mm7,mm3
728	pxor	mm6,[400+edi*8+esp]
729	xor	cl,BYTE [edi*1+esp]
730	mov	al,dl
731	movd	ebx,mm7
732	movzx	ecx,cl
733	psrlq	mm7,8
734	movq	mm3,mm6
735	mov	edi,eax
736	psrlq	mm6,8
737	pxor	mm7,[272+ebp*8+esp]
738	and	al,15
739	psllq	mm3,56
740	pxor	mm6,mm2
741	shr	edi,4
742	pinsrw	mm1,WORD [ecx*2+esi],2
743	pxor	mm7,[16+eax*8+esp]
744	rol	edx,8
745	pxor	mm6,[144+eax*8+esp]
746	pxor	mm7,mm3
747	pxor	mm6,[400+ebp*8+esp]
748	xor	bl,BYTE [ebp*1+esp]
749	mov	al,dl
750	movd	ecx,mm7
751	movzx	ebx,bl
752	psrlq	mm7,8
753	movq	mm3,mm6
754	mov	ebp,eax
755	psrlq	mm6,8
756	pxor	mm7,[272+edi*8+esp]
757	and	al,15
758	psllq	mm3,56
759	pxor	mm6,mm1
760	shr	ebp,4
761	pinsrw	mm0,WORD [ebx*2+esi],2
762	pxor	mm7,[16+eax*8+esp]
763	rol	edx,8
764	pxor	mm6,[144+eax*8+esp]
765	pxor	mm7,mm3
766	pxor	mm6,[400+edi*8+esp]
767	xor	cl,BYTE [edi*1+esp]
768	mov	al,dl
	; bytes 4..7 are used up: reload edx with bytes 0..3
769	mov	edx,DWORD [528+esp]
770	movd	ebx,mm7
771	movzx	ecx,cl
772	psrlq	mm7,8
773	movq	mm3,mm6
774	mov	edi,eax
775	psrlq	mm6,8
776	pxor	mm7,[272+ebp*8+esp]
777	and	al,15
778	psllq	mm3,56
779	pxor	mm6,mm0
780	shr	edi,4
781	pinsrw	mm2,WORD [ecx*2+esi],2
782	pxor	mm7,[16+eax*8+esp]
783	rol	edx,8
784	pxor	mm6,[144+eax*8+esp]
785	pxor	mm7,mm3
786	pxor	mm6,[400+ebp*8+esp]
787	xor	bl,BYTE [ebp*1+esp]
788	mov	al,dl
789	movd	ecx,mm7
790	movzx	ebx,bl
791	psrlq	mm7,8
792	movq	mm3,mm6
793	mov	ebp,eax
794	psrlq	mm6,8
795	pxor	mm7,[272+edi*8+esp]
796	and	al,15
797	psllq	mm3,56
798	pxor	mm6,mm2
799	shr	ebp,4
800	pinsrw	mm1,WORD [ebx*2+esi],2
801	pxor	mm7,[16+eax*8+esp]
802	rol	edx,8
803	pxor	mm6,[144+eax*8+esp]
804	pxor	mm7,mm3
805	pxor	mm6,[400+edi*8+esp]
806	xor	cl,BYTE [edi*1+esp]
807	mov	al,dl
808	movd	ebx,mm7
809	movzx	ecx,cl
810	psrlq	mm7,8
811	movq	mm3,mm6
812	mov	edi,eax
813	psrlq	mm6,8
814	pxor	mm7,[272+ebp*8+esp]
815	and	al,15
816	psllq	mm3,56
817	pxor	mm6,mm1
818	shr	edi,4
819	pinsrw	mm0,WORD [ecx*2+esi],2
820	pxor	mm7,[16+eax*8+esp]
821	rol	edx,8
822	pxor	mm6,[144+eax*8+esp]
823	pxor	mm7,mm3
824	pxor	mm6,[400+ebp*8+esp]
825	xor	bl,BYTE [ebp*1+esp]
826	mov	al,dl
827	movd	ecx,mm7
828	movzx	ebx,bl
829	psrlq	mm7,8
830	movq	mm3,mm6
831	mov	ebp,eax
832	psrlq	mm6,8
833	pxor	mm7,[272+edi*8+esp]
834	and	al,15
835	psllq	mm3,56
836	pxor	mm6,mm0
837	shr	ebp,4
838	pinsrw	mm2,WORD [ebx*2+esi],2
839	pxor	mm7,[16+eax*8+esp]
840	rol	edx,8
841	pxor	mm6,[144+eax*8+esp]
842	pxor	mm7,mm3
843	pxor	mm6,[400+edi*8+esp]
844	xor	cl,BYTE [edi*1+esp]
	; al = byte 0, the last byte of the block
845	mov	al,dl
	; NOTE(review): the value loaded here is never read -- edx is
	; overwritten by `movd edx,mm7` below before any other use
846	mov	edx,DWORD [524+esp]
847	movd	ebx,mm7
848	movzx	ecx,cl
849	psrlq	mm7,8
850	movq	mm3,mm6
851	mov	edi,eax
852	psrlq	mm6,8
853	pxor	mm7,[272+ebp*8+esp]
854	and	al,15
855	psllq	mm3,56
856	pxor	mm6,mm2
857	shr	edi,4
858	pinsrw	mm1,WORD [ecx*2+esi],2
859	pxor	mm7,[16+eax*8+esp]
860	pxor	mm6,[144+eax*8+esp]
861	xor	bl,BYTE [ebp*1+esp]
862	pxor	mm7,mm3
863	pxor	mm6,[400+ebp*8+esp]
864	movzx	ebx,bl
	; final half-step: the high nibble of byte 0 (edi) is applied with a
	; 4-bit shift and the unshifted tables at esp+16 / esp+144
865	pxor	mm2,mm2
866	psllq	mm1,4
867	movd	ecx,mm7
868	psrlq	mm7,4
869	movq	mm3,mm6
870	psrlq	mm6,4
871	shl	ecx,4
872	pxor	mm7,[16+edi*8+esp]
873	psllq	mm3,60
874	movzx	ecx,cl
875	pxor	mm7,mm3
876	pxor	mm6,[144+edi*8+esp]
	; fold the outstanding L$rem_8bit words into mm6
877	pinsrw	mm0,WORD [ebx*2+esi],2
878	pxor	mm6,mm1
879	movd	edx,mm7
880	pinsrw	mm2,WORD [ecx*2+esi],3
881	psllq	mm0,12
882	pxor	mm6,mm0
883	psrlq	mm7,32
884	pxor	mm6,mm2
	; ecx = input pointer for the next block
885	mov	ecx,DWORD [548+esp]
886	movd	ebx,mm7
	; byte-reverse the results: mm6 by swapping the bytes inside each
	; word and then reversing the four words (pshufw 27); edx and ebx
	; with bswap.  Afterwards edx = bytes 12..15, ebx = bytes 8..11,
	; mm6 = bytes 0..7 of the new state.
887	movq	mm3,mm6
888	psllw	mm6,8
889	psrlw	mm3,8
890	por	mm6,mm3
891	bswap	edx
892	pshufw	mm6,mm6,27
893	bswap	ebx
	; more input?  (exact-equality test against the end pointer)
894	cmp	ecx,DWORD [552+esp]
895	jne	NEAR L$009outer
	; write the final state back through arg1
896	mov	eax,DWORD [544+esp]
897	mov	DWORD [12+eax],edx
898	mov	DWORD [8+eax],ebx
899	movq	[eax],mm6
	; drop the aligned frame and leave MMX state
900	mov	esp,DWORD [556+esp]
901	emms
902	pop	edi
903	pop	esi
904	pop	ebx
905	pop	ebp
906	ret
;-----------------------------------------------------------------------
; _gcm_init_clmul
;
; Builds the 48-byte table consumed by _gcm_gmult_clmul and
; _gcm_ghash_clmul from a 16-byte input value.
;
; Stack arguments (no registers are pushed, so they sit at [4+esp] and
; [8+esp]):
;   arg1 -> edx  output table, written with unaligned stores:
;                  [edx+0 ]  v  = input with its 64-bit halves swapped,
;                            shifted left by one bit and conditionally
;                            XORed with the constant at L$bswap+16
;                  [edx+16]  v multiplied by itself and reduced
;                  [edx+32]  low qword  = (high half XOR low half) of v,
;                            high qword = the same for the [edx+16] value
;   arg2 -> eax  16-byte input, read with an unaligned load
;   NOTE(review): presumably Htable and the hash key H of GHASH, with the
;   reduction being modulo the GHASH polynomial -- confirm.
;
; Registers: eax, ecx, edx, xmm0-xmm5 and flags are modified.
; The carry-less multiply (pclmulqdq) and palignr instructions are
; emitted as raw `db` bytes; each is decoded in a comment.
;-----------------------------------------------------------------------
907global	_gcm_init_clmul
908align	16
909_gcm_init_clmul:
910L$_gcm_init_clmul_begin:
911	mov	edx,DWORD [4+esp]
912	mov	eax,DWORD [8+esp]
	; position-independent address of L$bswap -> ecx
913	call	L$010pic
914L$010pic:
915	pop	ecx
916	lea	ecx,[(L$bswap-L$010pic)+ecx]
917	movdqu	xmm2,[eax]
	; swap the two 64-bit halves
918	pshufd	xmm2,xmm2,78
	; broadcast the top dword; its sign bit drives the conditional XOR
919	pshufd	xmm4,xmm2,255
	; 128-bit left shift by one: shift each qword, then carry bit 63 of
	; the low qword into bit 0 of the high qword
920	movdqa	xmm3,xmm2
921	psllq	xmm2,1
922	pxor	xmm5,xmm5
923	psrlq	xmm3,63
	; xmm5 = all-ones if the top bit of the pre-shift value was set
924	pcmpgtd	xmm5,xmm4
925	pslldq	xmm3,8
926	por	xmm2,xmm3
	; conditionally XOR in the second 16-byte constant stored at L$bswap
927	pand	xmm5,[16+ecx]
928	pxor	xmm2,xmm5
	; ---- multiply xmm2 by itself using three carry-less multiplies
	;      (low*low, high*high, and (hi^lo)*(hi^lo)) ----
929	movdqa	xmm0,xmm2
930	movdqa	xmm1,xmm0
931	pshufd	xmm3,xmm0,78
932	pshufd	xmm4,xmm2,78
933	pxor	xmm3,xmm0
934	pxor	xmm4,xmm2
	; pclmulqdq xmm0,xmm2,0x00
935db	102,15,58,68,194,0
	; pclmulqdq xmm1,xmm2,0x11
936db	102,15,58,68,202,17
	; pclmulqdq xmm3,xmm4,0x00
937db	102,15,58,68,220,0
	; middle term = xmm3 ^ low product ^ high product, then split across
	; the 256-bit result xmm1:xmm0
938	xorps	xmm3,xmm0
939	xorps	xmm3,xmm1
940	movdqa	xmm4,xmm3
941	psrldq	xmm3,8
942	pslldq	xmm4,8
943	pxor	xmm1,xmm3
944	pxor	xmm0,xmm4
	; ---- reduce xmm1:xmm0 to 128 bits with shift/XOR steps:
	;      first the left-shift phase (5, 1, 57) ... ----
945	movdqa	xmm4,xmm0
946	movdqa	xmm3,xmm0
947	psllq	xmm0,5
948	pxor	xmm3,xmm0
949	psllq	xmm0,1
950	pxor	xmm0,xmm3
951	psllq	xmm0,57
952	movdqa	xmm3,xmm0
953	pslldq	xmm0,8
954	psrldq	xmm3,8
955	pxor	xmm0,xmm4
956	pxor	xmm1,xmm3
	; ... then the right-shift phase (1, 5, 1); result ends up in xmm0
957	movdqa	xmm4,xmm0
958	psrlq	xmm0,1
959	pxor	xmm1,xmm4
960	pxor	xmm4,xmm0
961	psrlq	xmm0,5
962	pxor	xmm0,xmm4
963	psrlq	xmm0,1
964	pxor	xmm0,xmm1
	; xmm3 / xmm4 = (high half XOR low half) of xmm2 / xmm0, replicated in
	; both qwords
965	pshufd	xmm3,xmm2,78
966	pshufd	xmm4,xmm0,78
967	pxor	xmm3,xmm2
968	movdqu	[edx],xmm2
969	pxor	xmm4,xmm0
970	movdqu	[16+edx],xmm0
	; palignr xmm4,xmm3,8 : low qword <- high qword of xmm3,
	;                       high qword <- low qword of xmm4
971db	102,15,58,15,227,8
972	movdqu	[32+edx],xmm4
973	ret
;-----------------------------------------------------------------------
; _gcm_gmult_clmul
;
; Multiplies a 16-byte block in place by the value stored at the start
; of the table, using carry-less multiplication, and reduces the product
; back to 128 bits.
;
; Stack arguments (no registers are pushed):
;   arg1 -> eax  pointer to the 16-byte block: read (unaligned), byte-
;                reversed, multiplied, reversed back and stored
;   arg2 -> edx  pointer to the table; [edx+0] and [edx+32] are read
;   NOTE(review): presumably Xi and the Htable produced by
;   _gcm_init_clmul -- confirm against the C prototype.
;
; Registers: eax, ecx, edx, xmm0-xmm5 and flags are modified.
; pshufb and pclmulqdq are emitted as raw `db` bytes; each is decoded in
; a comment.
;-----------------------------------------------------------------------
974global	_gcm_gmult_clmul
975align	16
976_gcm_gmult_clmul:
977L$_gcm_gmult_clmul_begin:
978	mov	eax,DWORD [4+esp]
979	mov	edx,DWORD [8+esp]
	; position-independent address of L$bswap -> ecx
980	call	L$011pic
981L$011pic:
982	pop	ecx
983	lea	ecx,[(L$bswap-L$011pic)+ecx]
984	movdqu	xmm0,[eax]
	; xmm5 = byte-reversal shuffle mask (aligned load; L$bswap follows an
	; `align 64`)
985	movdqa	xmm5,[ecx]
986	movups	xmm2,[edx]
	; pshufb xmm0,xmm5 : reverse the 16 bytes of the block
987db	102,15,56,0,197
988	movups	xmm4,[32+edx]
	; three-multiply scheme: low*low, high*high, (hi^lo of block)*[edx+32]
989	movdqa	xmm1,xmm0
990	pshufd	xmm3,xmm0,78
991	pxor	xmm3,xmm0
	; pclmulqdq xmm0,xmm2,0x00
992db	102,15,58,68,194,0
	; pclmulqdq xmm1,xmm2,0x11
993db	102,15,58,68,202,17
	; pclmulqdq xmm3,xmm4,0x00
994db	102,15,58,68,220,0
	; middle term = xmm3 ^ low product ^ high product, split across the
	; 256-bit result xmm1:xmm0
995	xorps	xmm3,xmm0
996	xorps	xmm3,xmm1
997	movdqa	xmm4,xmm3
998	psrldq	xmm3,8
999	pslldq	xmm4,8
1000	pxor	xmm1,xmm3
1001	pxor	xmm0,xmm4
	; reduction, left-shift phase (5, 1, 57)
1002	movdqa	xmm4,xmm0
1003	movdqa	xmm3,xmm0
1004	psllq	xmm0,5
1005	pxor	xmm3,xmm0
1006	psllq	xmm0,1
1007	pxor	xmm0,xmm3
1008	psllq	xmm0,57
1009	movdqa	xmm3,xmm0
1010	pslldq	xmm0,8
1011	psrldq	xmm3,8
1012	pxor	xmm0,xmm4
1013	pxor	xmm1,xmm3
	; reduction, right-shift phase (1, 5, 1); result in xmm0
1014	movdqa	xmm4,xmm0
1015	psrlq	xmm0,1
1016	pxor	xmm1,xmm4
1017	pxor	xmm4,xmm0
1018	psrlq	xmm0,5
1019	pxor	xmm0,xmm4
1020	psrlq	xmm0,1
1021	pxor	xmm0,xmm1
	; pshufb xmm0,xmm5 : reverse the bytes back and store the result
1022db	102,15,56,0,197
1023	movdqu	[eax],xmm0
1024	ret
;-----------------------------------------------------------------------
; _gcm_ghash_clmul
;
; Folds a run of 16-byte input blocks into a 16-byte state using
; carry-less multiplication.  Blocks are consumed two at a time in the
; main loop (using the table values at [edx+0], [edx+16] and [edx+32]);
; a single trailing block is handled by the odd tail.
;
; Stack arguments (at [20..32+esp] after the four pushes):
;   arg1 -> eax  pointer to the 16-byte state: read, then overwritten
;   arg2 -> edx  pointer to the table written by _gcm_init_clmul
;   arg3 -> esi  pointer to the input
;   arg4 -> ebx  input length in bytes
;   Length handling: `sub ebx,16 / jz` recognises exactly one block;
;   every other value falls into the two-block path, and there is no
;   test for zero, so the code relies on the length being a non-zero
;   multiple of 16.
;   NOTE(review): presumably Xi, Htable, inp, len of GHASH -- confirm
;   against the C prototype.
;
; Registers: ebp, ebx, esi, edi are saved and restored (ebp and edi are
; not otherwise used); eax, ecx, edx, xmm0-xmm7 and flags are modified.
; pshufb and pclmulqdq are emitted as raw `db` bytes; each is decoded in
; a comment.
;-----------------------------------------------------------------------
1025global	_gcm_ghash_clmul
1026align	16
1027_gcm_ghash_clmul:
1028L$_gcm_ghash_clmul_begin:
1029	push	ebp
1030	push	ebx
1031	push	esi
1032	push	edi
1033	mov	eax,DWORD [20+esp]
1034	mov	edx,DWORD [24+esp]
1035	mov	esi,DWORD [28+esp]
1036	mov	ebx,DWORD [32+esp]
	; position-independent address of L$bswap -> ecx
1037	call	L$012pic
1038L$012pic:
1039	pop	ecx
1040	lea	ecx,[(L$bswap-L$012pic)+ecx]
	; xmm0 = state, xmm5 = byte-reversal mask, xmm2 = table[0]
1041	movdqu	xmm0,[eax]
1042	movdqa	xmm5,[ecx]
1043	movdqu	xmm2,[edx]
	; pshufb xmm0,xmm5 : byte-reverse the state
1044db	102,15,56,0,197
	; exactly one block -> odd tail
1045	sub	ebx,16
1046	jz	NEAR L$013odd_tail
	; load and byte-reverse the first two blocks
1047	movdqu	xmm3,[esi]
1048	movdqu	xmm6,[16+esi]
	; pshufb xmm3,xmm5
1049db	102,15,56,0,221
	; pshufb xmm6,xmm5
1050db	102,15,56,0,245
	; xmm5 is reused for table[32] (the hi^lo halves); the mask is
	; reloaded from [ecx] when needed again
1051	movdqu	xmm5,[32+edx]
	; state ^= first block; start multiplying the second block by table[0]
1052	pxor	xmm0,xmm3
1053	pshufd	xmm3,xmm6,78
1054	movdqa	xmm7,xmm6
1055	pxor	xmm3,xmm6
1056	lea	esi,[32+esi]
	; pclmulqdq xmm6,xmm2,0x00
1057db	102,15,58,68,242,0
	; pclmulqdq xmm7,xmm2,0x11
1058db	102,15,58,68,250,17
	; pclmulqdq xmm3,xmm5,0x00
1059db	102,15,58,68,221,0
	; xmm2 = table[16], the multiplier applied to the running state
1060	movups	xmm2,[16+edx]
1061	nop
	; two blocks consumed; enter the main loop only if more than two
	; further blocks' worth of length remains to be tested
1062	sub	ebx,32
1063	jbe	NEAR L$014even_tail
1064	jmp	NEAR L$015mod_loop
1065align	32
1066L$015mod_loop:
	; multiply the running state by table[16] and combine with the pending
	; products in xmm6/xmm7/xmm3, while loading, byte-reversing and
	; starting the multiplication of the next two blocks; the reduction
	; is interleaved with that work
1067	pshufd	xmm4,xmm0,78
1068	movdqa	xmm1,xmm0
1069	pxor	xmm4,xmm0
1070	nop
	; pclmulqdq xmm0,xmm2,0x00
1071db	102,15,58,68,194,0
	; pclmulqdq xmm1,xmm2,0x11
1072db	102,15,58,68,202,17
	; pclmulqdq xmm4,xmm5,0x10
1073db	102,15,58,68,229,16
1074	movups	xmm2,[edx]
1075	xorps	xmm0,xmm6
	; reload the byte-reversal mask
1076	movdqa	xmm5,[ecx]
1077	xorps	xmm1,xmm7
1078	movdqu	xmm7,[esi]
1079	pxor	xmm3,xmm0
1080	movdqu	xmm6,[16+esi]
1081	pxor	xmm3,xmm1
	; pshufb xmm7,xmm5
1082db	102,15,56,0,253
1083	pxor	xmm4,xmm3
1084	movdqa	xmm3,xmm4
1085	psrldq	xmm4,8
1086	pslldq	xmm3,8
1087	pxor	xmm1,xmm4
1088	pxor	xmm0,xmm3
	; pshufb xmm6,xmm5
1089db	102,15,56,0,245
	; fold the first of the two new blocks into the high half
1090	pxor	xmm1,xmm7
1091	movdqa	xmm7,xmm6
	; reduction, left-shift phase (5, 1, 57)
1092	movdqa	xmm4,xmm0
1093	movdqa	xmm3,xmm0
1094	psllq	xmm0,5
1095	pxor	xmm3,xmm0
1096	psllq	xmm0,1
1097	pxor	xmm0,xmm3
	; pclmulqdq xmm6,xmm2,0x00
1098db	102,15,58,68,242,0
1099	movups	xmm5,[32+edx]
1100	psllq	xmm0,57
1101	movdqa	xmm3,xmm0
1102	pslldq	xmm0,8
1103	psrldq	xmm3,8
1104	pxor	xmm0,xmm4
1105	pxor	xmm1,xmm3
1106	pshufd	xmm3,xmm7,78
	; reduction, right-shift phase (1, 5, 1)
1107	movdqa	xmm4,xmm0
1108	psrlq	xmm0,1
1109	pxor	xmm3,xmm7
1110	pxor	xmm1,xmm4
	; pclmulqdq xmm7,xmm2,0x11
1111db	102,15,58,68,250,17
1112	movups	xmm2,[16+edx]
1113	pxor	xmm4,xmm0
1114	psrlq	xmm0,5
1115	pxor	xmm0,xmm4
1116	psrlq	xmm0,1
1117	pxor	xmm0,xmm1
	; pclmulqdq xmm3,xmm5,0x00
1118db	102,15,58,68,221,0
1119	lea	esi,[32+esi]
1120	sub	ebx,32
1121	ja	NEAR L$015mod_loop
1122L$014even_tail:
	; finish the pending pair: multiply the state by table[16], combine
	; with the products already in xmm6/xmm7/xmm3, and reduce
1123	pshufd	xmm4,xmm0,78
1124	movdqa	xmm1,xmm0
1125	pxor	xmm4,xmm0
	; pclmulqdq xmm0,xmm2,0x00
1126db	102,15,58,68,194,0
	; pclmulqdq xmm1,xmm2,0x11
1127db	102,15,58,68,202,17
	; pclmulqdq xmm4,xmm5,0x10
1128db	102,15,58,68,229,16
	; reload the byte-reversal mask
1129	movdqa	xmm5,[ecx]
1130	xorps	xmm0,xmm6
1131	xorps	xmm1,xmm7
1132	pxor	xmm3,xmm0
1133	pxor	xmm3,xmm1
1134	pxor	xmm4,xmm3
1135	movdqa	xmm3,xmm4
1136	psrldq	xmm4,8
1137	pslldq	xmm3,8
1138	pxor	xmm1,xmm4
1139	pxor	xmm0,xmm3
	; reduction, left-shift phase (5, 1, 57)
1140	movdqa	xmm4,xmm0
1141	movdqa	xmm3,xmm0
1142	psllq	xmm0,5
1143	pxor	xmm3,xmm0
1144	psllq	xmm0,1
1145	pxor	xmm0,xmm3
1146	psllq	xmm0,57
1147	movdqa	xmm3,xmm0
1148	pslldq	xmm0,8
1149	psrldq	xmm3,8
1150	pxor	xmm0,xmm4
1151	pxor	xmm1,xmm3
	; reduction, right-shift phase (1, 5, 1)
1152	movdqa	xmm4,xmm0
1153	psrlq	xmm0,1
1154	pxor	xmm1,xmm4
1155	pxor	xmm4,xmm0
1156	psrlq	xmm0,5
1157	pxor	xmm0,xmm4
1158	psrlq	xmm0,1
1159	pxor	xmm0,xmm1
	; ebx == 0 here means exactly one block is left; otherwise finished
1160	test	ebx,ebx
1161	jnz	NEAR L$016done
1162	movups	xmm2,[edx]
1163L$013odd_tail:
	; single block: state = (state ^ block) * table[0], reduced
1164	movdqu	xmm3,[esi]
	; pshufb xmm3,xmm5
1165db	102,15,56,0,221
1166	pxor	xmm0,xmm3
1167	movdqa	xmm1,xmm0
1168	pshufd	xmm3,xmm0,78
1169	pshufd	xmm4,xmm2,78
1170	pxor	xmm3,xmm0
1171	pxor	xmm4,xmm2
	; pclmulqdq xmm0,xmm2,0x00
1172db	102,15,58,68,194,0
	; pclmulqdq xmm1,xmm2,0x11
1173db	102,15,58,68,202,17
	; pclmulqdq xmm3,xmm4,0x00
1174db	102,15,58,68,220,0
1175	xorps	xmm3,xmm0
1176	xorps	xmm3,xmm1
1177	movdqa	xmm4,xmm3
1178	psrldq	xmm3,8
1179	pslldq	xmm4,8
1180	pxor	xmm1,xmm3
1181	pxor	xmm0,xmm4
	; reduction, left-shift phase (5, 1, 57)
1182	movdqa	xmm4,xmm0
1183	movdqa	xmm3,xmm0
1184	psllq	xmm0,5
1185	pxor	xmm3,xmm0
1186	psllq	xmm0,1
1187	pxor	xmm0,xmm3
1188	psllq	xmm0,57
1189	movdqa	xmm3,xmm0
1190	pslldq	xmm0,8
1191	psrldq	xmm3,8
1192	pxor	xmm0,xmm4
1193	pxor	xmm1,xmm3
	; reduction, right-shift phase (1, 5, 1)
1194	movdqa	xmm4,xmm0
1195	psrlq	xmm0,1
1196	pxor	xmm1,xmm4
1197	pxor	xmm4,xmm0
1198	psrlq	xmm0,5
1199	pxor	xmm0,xmm4
1200	psrlq	xmm0,1
1201	pxor	xmm0,xmm1
1202L$016done:
	; pshufb xmm0,xmm5 : reverse the bytes back and store the new state
1203db	102,15,56,0,197
1204	movdqu	[eax],xmm0
1205	pop	edi
1206	pop	esi
1207	pop	ebx
1208	pop	ebp
1209	ret
; Two 16-byte constants used by the clmul routines, 64-byte aligned so the
; aligned `movdqa xmm5,[ecx]` loads in those routines are valid.
;   +0   indices 15..0: pshufb mask that reverses the bytes of an XMM register
;   +16  byte 0 = 1, byte 15 = 194 (0xC2): constant that _gcm_init_clmul
;        conditionally XORs into the shifted input via `pand xmm5,[16+ecx]`
1210align	64
1211L$bswap:
1212db	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
1213db	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194
; 256 16-bit constants (32 rows of 8), indexed by a byte value.
; Read by _gcm_ghash_4bit_mmx through `pinsrw mmN,WORD [reg*2+esi],k`.
; NOTE(review): presumably the reduction remainders for the 8 bits shifted
; out of the accumulator per step -- confirm against the generator script.
1214align	64
1215L$rem_8bit:
1216dw	0,450,900,582,1800,1738,1164,1358
1217dw	3600,4050,3476,3158,2328,2266,2716,2910
1218dw	7200,7650,8100,7782,6952,6890,6316,6510
1219dw	4656,5106,4532,4214,5432,5370,5820,6014
1220dw	14400,14722,15300,14854,16200,16010,15564,15630
1221dw	13904,14226,13780,13334,12632,12442,13020,13086
1222dw	9312,9634,10212,9766,9064,8874,8428,8494
1223dw	10864,11186,10740,10294,11640,11450,12028,12094
1224dw	28800,28994,29444,29382,30600,30282,29708,30158
1225dw	32400,32594,32020,31958,31128,30810,31260,31710
1226dw	27808,28002,28452,28390,27560,27242,26668,27118
1227dw	25264,25458,24884,24822,26040,25722,26172,26622
1228dw	18624,18690,19268,19078,20424,19978,19532,19854
1229dw	18128,18194,17748,17558,16856,16410,16988,17310
1230dw	21728,21794,22372,22182,21480,21034,20588,20910
1231dw	23280,23346,22900,22710,24056,23610,24188,24510
1232dw	57600,57538,57988,58182,58888,59338,58764,58446
1233dw	61200,61138,60564,60758,59416,59866,60316,59998
1234dw	64800,64738,65188,65382,64040,64490,63916,63598
1235dw	62256,62194,61620,61814,62520,62970,63420,63102
1236dw	55616,55426,56004,56070,56904,57226,56780,56334
1237dw	55120,54930,54484,54550,53336,53658,54236,53790
1238dw	50528,50338,50916,50982,49768,50090,49644,49198
1239dw	52080,51890,51444,51510,52344,52666,53244,52798
1240dw	37248,36930,37380,37830,38536,38730,38156,38094
1241dw	40848,40530,39956,40406,39064,39258,39708,39646
1242dw	36256,35938,36388,36838,35496,35690,35116,35054
1243dw	33712,33394,32820,33270,33976,34170,34620,34558
1244dw	43456,43010,43588,43910,44744,44810,44364,44174
1245dw	42960,42514,42068,42390,41176,41242,41820,41630
1246dw	46560,46114,46692,47014,45800,45866,45420,45230
1247dw	48112,47666,47220,47542,48376,48442,49020,48830
; 16 qword constants, each written as a (low dword = 0, high dword = value)
; pair and indexed by a 4-bit value.  Read by _gcm_gmult_4bit_mmx through
; `pxor mm1,[ebx*8+eax]`.  The high dwords are the same 16 values that the
; *_4bit_x86 routines store on their stack frames.
1248align	64
1249L$rem_4bit:
1250dd	0,0,0,471859200,0,943718400,0,610271232
1251dd	0,1887436800,0,1822425088,0,1220542464,0,1423966208
1252dd	0,3774873600,0,4246732800,0,3644850176,0,3311403008
1253dd	0,2441084928,0,2376073216,0,2847932416,0,3051356160
; NUL-terminated ASCII ident string:
; "GHASH for x86, CRYPTOGAMS by <appro@openssl.org>"
1254db	71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67
1255db	82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112
1256db	112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62
1257db	0
1258