/*
 * Poly1305 routines for 32-bit x86 using SSE2 (GAS, AT&T operand order).
 *
 * The file is run through the C preprocessor: SECTION_TEXT,
 * GLOBAL_HIDDEN_FN and FN_END are macros supplied by "x86.inc".
 */
#include "x86.inc"

SECTION_TEXT

/*
 * poly1305_block_size_sse2
 *
 * Takes no arguments and returns the constant 32 in %eax. No other
 * register and no memory is touched.
 * NOTE(review): presumably this is the number of message bytes the blocks
 * routine consumes per step (two 16-byte blocks) - confirm against callers.
 */
GLOBAL_HIDDEN_FN poly1305_block_size_sse2
    movl $32, %eax
    ret
FN_END poly1305_block_size_sse2

/*
 * poly1305_auth_sse2 - one-shot authenticator over a complete message.
 *
 * Stack arguments (addressed through %ebp once the frame is set up):
 *    8(%ebp)  mac    passed on as the last argument of finish_ext
 *   12(%ebp)  m      message pointer
 *   16(%ebp)  bytes  message length
 *   20(%ebp)  key    passed to init_ext
 *
 * Sequence: init_ext(state, key, bytes); blocks(state, m, bytes & ~31) when
 * that length is non-zero; finish_ext(state, m + done, bytes - done, mac).
 *
 * Frame: %esp is rounded down to a 64-byte boundary, three callee-saved
 * registers are pushed and 244 bytes reserved (12 + 244 = 256, so %esp stays
 * 64-byte aligned). The state lives at 64(%esp).
 * Preserves %ebp, %esi, %edi, %ebx.
 */
GLOBAL_HIDDEN_FN poly1305_auth_sse2
poly1305_auth_sse2_local:
    pushl %ebp
    movl %esp, %ebp
    andl $-64, %esp                 /* align the frame; the old %esp is kept in %ebp */
    pushl %esi
    pushl %edi
    pushl %ebx
    subl $244, %esp
    movl 16(%ebp), %esi             /* esi = bytes */
    lea 64(%esp), %eax              /* eax = state */
    movl %esi, %ecx                 /* ecx = bytes */
    movl 20(%ebp), %edx             /* edx = key */
    movl 12(%ebp), %edi             /* edi = m */
    call poly1305_init_ext_sse2_local
poly1305_auth_sse2_2:
    movl %esi, %ebx
    andl $-32, %ebx                 /* ebx = bytes rounded down to a multiple of 32 */
    je poly1305_auth_sse2_5         /* nothing for the bulk routine */
poly1305_auth_sse2_3:
    movl %edi, %edx                 /* edx = m */
    lea 64(%esp), %eax              /* eax = state */
    movl %ebx, %ecx                 /* ecx = bulk length */
    call poly1305_blocks_sse2_local
poly1305_auth_sse2_4:
    addl %ebx, %edi                 /* m += bulk length */
    subl %ebx, %esi                 /* bytes -= bulk length */
poly1305_auth_sse2_5:
    /* finish_ext takes its arguments on the stack, pushed right to left. */
    pushl 8(%ebp)                   /* mac */
    pushl %esi                      /* leftover byte count */
    pushl %edi                      /* pointer to the leftover bytes */
    lea 76(%esp), %eax              /* state: 64 plus the 12 bytes just pushed */
    pushl %eax
    call poly1305_finish_ext_sse2_local
poly1305_auth_sse2_6:
    addl $260, %esp                 /* 16 bytes of arguments + 244 bytes of locals */
    popl %ebx
    popl %edi
    popl %esi
    movl %ebp, %esp                 /* undo the alignment */
    popl %ebp
    ret
FN_END poly1305_auth_sse2

/*
 * poly1305_finish_ext_sse2 - absorb the trailing bytes, finalise, emit the tag.
 *
 * Stack arguments at entry (offset after the prologue in brackets):
 *    4(%esp) [80]  state
 *    8(%esp) [84]  m         pointer to the leftover bytes
 *   12(%esp) [88]  leftover  number of leftover bytes; the copy below only
 *                            looks at bits 0..4, i.e. values 0..31
 *   16(%esp) [92]  mac       receives 16 bytes
 *
 * Steps:
 *   1. leftover != 0: build a zero-padded 32-byte block at 16(%esp), append
 *      a 1 byte unless leftover == 16, OR 4 (leftover >= 16) or 8
 *      (leftover < 16) into the flags word at state+116, and run the blocks
 *      routine on it.
 *   2. If flag bit 0 is set: OR 16 (leftover == 0 or > 16) or 32
 *      (1..16) into the flags and call the blocks routine with m = NULL.
 *   3. Pack the five 26-bit limbs at state+0..16 into four 32-bit words, add
 *      the four words at state+100..112 with carry, clear state bytes 0..127
 *      and store the result to mac.
 *
 * movaps/movdqa are used on 16(%esp) and 32(%esp), so %esp must be 16-byte
 * aligned at the call site (4 + 16 + 60 = 80 bytes are consumed below it).
 * Preserves %esi, %edi, %ebx, %ebp; clobbers %eax, %ecx, %edx, %xmm0 and
 * whatever poly1305_blocks_sse2_local clobbers.
 */
GLOBAL_HIDDEN_FN poly1305_finish_ext_sse2
poly1305_finish_ext_sse2_local:
    pushl %esi
    pushl %edi
    pushl %ebx
    pushl %ebp
    subl $60, %esp
    movl 88(%esp), %ebp             /* ebp = leftover */
    testl %ebp, %ebp
    movl 80(%esp), %ebx             /* ebx = state (mov leaves the flags alone) */
    je poly1305_finish_ext_sse2_18  /* no leftover bytes */
poly1305_finish_ext_sse2_2:
    /* Zero the 32-byte scratch block. */
    pxor %xmm0, %xmm0
    movaps %xmm0, 16(%esp)
    movaps %xmm0, 32(%esp)
poly1305_finish_ext_sse2_3:
    /* edx = write cursor in the scratch block, ecx = m - scratch, so that
       (%edx,%ecx) always addresses the matching source byte. */
    movl 84(%esp), %ecx
    lea 16(%esp), %edx
    subl %edx, %ecx
    testl $16, %ebp
    je poly1305_finish_ext_sse2_5
poly1305_finish_ext_sse2_4:
    lea 32(%esp), %edx              /* cursor moves past the first 16 bytes */
    movdqu 16(%esp,%ecx), %xmm0     /* 16(%esp)+ecx == m: copy m[0..15] */
    movdqa %xmm0, 16(%esp)
poly1305_finish_ext_sse2_5:
    testl $8, %ebp
    je poly1305_finish_ext_sse2_7
poly1305_finish_ext_sse2_6:
    movl (%edx,%ecx), %esi
    movl 4(%edx,%ecx), %edi
    movl %esi, (%edx)
    movl %edi, 4(%edx)
    addl $8, %edx
poly1305_finish_ext_sse2_7:
    testl $4, %ebp
    je poly1305_finish_ext_sse2_9
poly1305_finish_ext_sse2_8:
    movl (%edx,%ecx), %esi
    movl %esi, (%edx)
    addl $4, %edx
poly1305_finish_ext_sse2_9:
    testl $2, %ebp
    je poly1305_finish_ext_sse2_11
poly1305_finish_ext_sse2_10:
    movzwl (%edx,%ecx), %esi
    movw %si, (%edx)
    addl $2, %edx
poly1305_finish_ext_sse2_11:
    testl $1, %ebp
    je poly1305_finish_ext_sse2_13
poly1305_finish_ext_sse2_12:
    movzbl (%edx,%ecx), %ecx        /* last use of the delta in ecx */
    movb %cl, (%edx)
poly1305_finish_ext_sse2_13:
    cmpl $16, %ebp
    je poly1305_finish_ext_sse2_16  /* exactly one full block: no 1 byte appended */
poly1305_finish_ext_sse2_14:
    movb $1, 16(%esp,%ebp)          /* append the 1 byte; flags from cmpl survive */
    jae poly1305_finish_ext_sse2_16 /* leftover > 16 */
poly1305_finish_ext_sse2_15:
    movl $8, %edx                   /* leftover < 16 */
    jmp poly1305_finish_ext_sse2_17
poly1305_finish_ext_sse2_16:
    movl $4, %edx                   /* leftover >= 16 */
poly1305_finish_ext_sse2_17:
    orl %edx, 116(%ebx)             /* state flags |= 4 or 8 */
    movl %ebx, %eax                 /* eax = state */
    movl $32, %ecx                  /* ecx = 32 bytes */
    lea 16(%esp), %edx              /* edx = padded scratch block */
    call poly1305_blocks_sse2_local
poly1305_finish_ext_sse2_18:
    movl 116(%ebx), %edx
    testb $1, %dl                   /* bit 0 clear: skip the final pass */
    je poly1305_finish_ext_sse2_24
poly1305_finish_ext_sse2_19:
    testl %ebp, %ebp
    je poly1305_finish_ext_sse2_21
poly1305_finish_ext_sse2_20:
    cmpl $16, %ebp
    jbe poly1305_finish_ext_sse2_22
poly1305_finish_ext_sse2_21:
    orl $16, %edx                   /* leftover == 0 or leftover > 16 */
    movl %edx, 116(%ebx)
    jmp poly1305_finish_ext_sse2_23
poly1305_finish_ext_sse2_22:
    orl $32, %edx                   /* 1 <= leftover <= 16 */
    movl %edx, 116(%ebx)
poly1305_finish_ext_sse2_23:
    movl %ebx, %eax                 /* eax = state */
    xorl %edx, %edx                 /* m = NULL selects the final reduction */
    movl $32, %ecx
    call poly1305_blocks_sse2_local
poly1305_finish_ext_sse2_24:
    /* Pack limbs h0..h4 (26 bits each, at state+0..16) into four words:
         esi = h0 | h1 << 26      ecx = h1 >> 6  | h2 << 20
         edx = h2 >> 12 | h3 << 14  ebp = h3 >> 18 | h4 << 8          */
    movl 8(%ebx), %edx
    movl %edx, %eax
    movl 4(%ebx), %ecx
    movl %ecx, %esi
    shrl $6, %ecx
    shll $20, %eax
    pxor %xmm0, %xmm0               /* zero source for wiping the state below */
    movl 12(%ebx), %ebp
    orl %eax, %ecx
    movl %ebp, %eax
    shrl $12, %edx
    shll $14, %eax
    orl %eax, %edx
    movl 16(%ebx), %eax
    shll $26, %esi
    shrl $18, %ebp
    shll $8, %eax
    movl 92(%esp), %edi             /* edi = mac */
    orl %eax, %ebp
    orl (%ebx), %esi
    /* 128-bit add of the words stored at state+100..112. */
    addl 100(%ebx), %esi
    adcl 104(%ebx), %ecx
    adcl 108(%ebx), %edx
    adcl 112(%ebx), %ebp
    /* Clear state bytes 0..127. */
    movdqu %xmm0, (%ebx)
    movdqu %xmm0, 16(%ebx)
    movdqu %xmm0, 32(%ebx)
    movdqu %xmm0, 48(%ebx)
    movdqu %xmm0, 64(%ebx)
    movdqu %xmm0, 80(%ebx)
    movdqu %xmm0, 96(%ebx)
    movdqu %xmm0, 112(%ebx)
    /* Write the 16-byte tag. */
    movl %esi, (%edi)
    movl %ecx, 4(%edi)
    movl %edx, 8(%edi)
    movl %ebp, 12(%edi)
    addl $60, %esp
    popl %ebp
    popl %ebx
    popl %edi
    popl %esi
    ret
FN_END poly1305_finish_ext_sse2


/*
 * poly1305_blocks_sse2 - absorb message data two 16-byte blocks at a time
 * (one block per 64-bit SIMD lane), or perform the final reduction.
 *
 * Public entry:   4(%esp) = state, 8(%esp) = m, 12(%esp) = bytes.
 * Internal entry (poly1305_blocks_sse2_local): %eax = state, %edx = m,
 * %ecx = bytes.
 *
 * State fields used here (byte offsets from %eax):
 *     0.. 39  accumulator H. While absorbing: five limbs x two lanes
 *             (limb pairs at 0, 8, 16, 24, 32). After the final reduction:
 *             five dword limbs at 0, 4, 8, 12, 16.
 *    40.. 59  five dword limbs, used only when flag 16 or 32 is set
 *    60.. 79  five dword limbs, the per-step multiplier for 32-byte steps
 *    80.. 99  five dword limbs, the multiplier for the 64-byte loop
 *   116       flags: 1 = H already holds data; 4 = high bit only on lane 0;
 *             8 = no high bit; 16 = multiply lanes by (60.., 40..);
 *             32 = multiply lanes by (40.., 1)
 *   NOTE(review): 40/60/80 presumably hold r, r^2 and r^4 as written by
 *   poly1305_init_ext_sse2 - the 64-byte loop is only correct when 80.. was
 *   actually filled in; confirm callers guarantee that.
 *
 * m == NULL (%edx == 0) selects the final path: after the multiply the two
 * lanes are summed, fully carried, reduced modulo 2^130 - 5 and stored as
 * five dword limbs at state+0..16.
 *
 * Limbs are 26 bits wide (mask 67108863 = 2^26 - 1); products use pmuludq,
 * and the factor 5 folds limbs above 2^130 back down.
 *
 * Stack: 544 bytes of locals accessed with movdqa/movaps, so %esp must be
 * 16-byte aligned at the call site (4 + 12 + 544 = 560 bytes below it).
 * Constant slots: 160 = 5, 256 = limb mask, 272 = high bit (1 << 24 in limb 4).
 * Multiplier slots: 176/224/208/192/240 = limbs 0..4 of the lane multiplier.
 * Several branches rely on integer flags set by cmpl/testl surviving a run
 * of SSE and mov instructions; do not insert flag-writing instructions there.
 *
 * Preserves %esi, %edi, %ebx. Clobbers %ecx, %edx, %xmm0-%xmm7, flags, and
 * (on the final path) %eax.
 */
GLOBAL_HIDDEN_FN poly1305_blocks_sse2
    movl 4(%esp), %eax
    movl 8(%esp), %edx
    movl 12(%esp), %ecx
poly1305_blocks_sse2_local:
    pushl %esi
    pushl %edi
    pushl %ebx
    subl $544, %esp
    movl $16777216, %ebx            /* 1 << 24: the 2^128 bit inside limb 4 */
    movl $67108863, %esi            /* 2^26 - 1 */
    movl $5, %edi
    movd %ebx, %xmm0
    movd %esi, %xmm2
    movd %edi, %xmm4
    movl 116(%eax), %ebx            /* ebx = flags */
    testb $4, %bl
    pshufd $68, %xmm0, %xmm1        /* replicate the low qword into both lanes */
    pshufd $68, %xmm2, %xmm3
    pshufd $68, %xmm4, %xmm5
    movdqa %xmm1, 272(%esp)         /* high bit */
    movdqa %xmm3, 256(%esp)         /* limb mask */
    movdqa %xmm5, 160(%esp)         /* 5 */
    je poly1305_blocks_sse2_3
poly1305_blocks_sse2_2:
    /* Flag 4: keep the high bit in lane 0 only. */
    movdqa 272(%esp), %xmm0
    psrldq $8, %xmm0
    movdqa %xmm0, 272(%esp)
poly1305_blocks_sse2_3:
    testb $8, %bl
    je poly1305_blocks_sse2_5
poly1305_blocks_sse2_4:
    /* Flag 8: no high bit on either lane. */
    pxor %xmm0, %xmm0
    movdqa %xmm0, 272(%esp)
poly1305_blocks_sse2_5:
    testb $1, %bl
    jne poly1305_blocks_sse2_7
poly1305_blocks_sse2_6:
    /* First call: H = first two blocks, split into 26-bit limbs.
       Result: xmm7 = H0, xmm6 = H1, 32(%esp) = H2, xmm1 = H3, xmm0 = H4. */
    movq 8(%edx), %xmm0
    orl $1, %ebx                    /* set the "started" flag */
    movq (%edx), %xmm1
    addl $-32, %ecx
    movhpd 24(%edx), %xmm0          /* xmm0 = high halves of blocks 0 and 1 */
    movdqa 256(%esp), %xmm4
    movaps %xmm0, %xmm2
    movhpd 16(%edx), %xmm1          /* xmm1 = low halves of blocks 0 and 1 */
    movdqa %xmm4, %xmm7
    pand %xmm1, %xmm7
    movaps %xmm1, %xmm6
    psrlq $52, %xmm1
    psllq $12, %xmm2
    por %xmm2, %xmm1
    movdqa %xmm4, %xmm3
    psrlq $26, %xmm6
    pand %xmm1, %xmm3
    psrlq $26, %xmm1
    psrlq $40, %xmm0
    movdqa %xmm3, 32(%esp)
    pand %xmm4, %xmm6
    por 272(%esp), %xmm0            /* add the high bit to limb 4 */
    pand %xmm4, %xmm1
    movl %ebx, 116(%eax)
    addl $32, %edx
    jmp poly1305_blocks_sse2_8
poly1305_blocks_sse2_7:
    /* Reload H from the state; only the low dword of each lane matters. */
    movdqu 16(%eax), %xmm0
    movdqu (%eax), %xmm6
    movdqu 32(%eax), %xmm2
    pshufd $80, %xmm0, %xmm1
    pshufd $80, %xmm6, %xmm7        /* H0 */
    pshufd $250, %xmm6, %xmm6       /* H1 */
    movdqa %xmm1, 32(%esp)          /* H2 */
    pshufd $250, %xmm0, %xmm1       /* H3 */
    pshufd $80, %xmm2, %xmm0        /* H4 */
poly1305_blocks_sse2_8:
    testb $48, %bl
    je poly1305_blocks_sse2_13      /* not finalising: same multiplier in both lanes */
poly1305_blocks_sse2_9:
    movdqu 40(%eax), %xmm2          /* limbs 0..3 at state+40 */
    movl 56(%eax), %esi             /* limb 4 at state+56 */
    testb $16, %bl
    je poly1305_blocks_sse2_11
poly1305_blocks_sse2_10:
    /* Flag 16: lane 0 multiplier = state+60.., lane 1 multiplier = state+40.. */
    movdqu 60(%eax), %xmm4
    movdqa %xmm4, %xmm5
    movd 76(%eax), %xmm3
    punpckldq %xmm2, %xmm5
    punpckhdq %xmm2, %xmm4
    movd %esi, %xmm2
    punpcklqdq %xmm2, %xmm3
    movdqa %xmm3, 240(%esp)
    jmp poly1305_blocks_sse2_12
poly1305_blocks_sse2_11:
    /* Flag 32 only: lane 0 multiplier = state+40.., lane 1 multiplier = 1. */
    movl $1, %ebx
    movdqa %xmm2, %xmm5
    movdqa %xmm2, %xmm4
    movd %esi, %xmm2
    movdqa %xmm2, 240(%esp)
    movd %ebx, %xmm3
    punpckldq %xmm3, %xmm5
    punpckhdq %xmm3, %xmm4
poly1305_blocks_sse2_12:
    /* Spread the interleaved limbs into one qword per lane. */
    pshufd $80, %xmm5, %xmm2
    pshufd $250, %xmm5, %xmm3
    pshufd $80, %xmm4, %xmm5
    pshufd $250, %xmm4, %xmm4
    movdqa %xmm2, 176(%esp)
    movdqa %xmm3, 224(%esp)
    movdqa %xmm5, 208(%esp)
    movdqa %xmm4, 192(%esp)
    jmp poly1305_blocks_sse2_14
poly1305_blocks_sse2_13:
    /* Broadcast the five limbs at state+60.. into both lanes. */
    movdqu 60(%eax), %xmm3
    movd 76(%eax), %xmm2
    pshufd $0, %xmm3, %xmm4
    movdqa %xmm4, 176(%esp)
    pshufd $85, %xmm3, %xmm5
    pshufd $170, %xmm3, %xmm4
    pshufd $255, %xmm3, %xmm3
    pshufd $0, %xmm2, %xmm2
    movdqa %xmm5, 224(%esp)
    movdqa %xmm4, 208(%esp)
    movdqa %xmm3, 192(%esp)
    movdqa %xmm2, 240(%esp)
poly1305_blocks_sse2_14:
    /* Pre-multiply limbs 1..4 of the multiplier by 5:
       xmm4 = 5*R1, xmm3 = 5*R2, xmm2 = 5*R3, xmm5 = 5*R4.
       The cmpl result is consumed by the jb below. */
    movdqa 160(%esp), %xmm2
    cmpl $64, %ecx
    movdqa 192(%esp), %xmm5
    pmuludq %xmm2, %xmm5
    movdqa %xmm5, 320(%esp)
    movdqa 224(%esp), %xmm4
    movdqa 208(%esp), %xmm3
    movdqa 240(%esp), %xmm5
    pmuludq %xmm2, %xmm4
    pmuludq %xmm2, %xmm3
    pmuludq %xmm2, %xmm5
    movdqa 320(%esp), %xmm2
    jb poly1305_blocks_sse2_18      /* fewer than 64 bytes left */
poly1305_blocks_sse2_15:
    /* Set up the 64-byte loop. Multiplier S from state+80..:
         (%esp) = S0, 128 = S1, 112 = S2, 144 = S3, 96 = S4,
         80 = 5*S1, 64 = 5*S2, 48 = 5*S3, 16 = 5*S4.
       5*R1..5*R4 are parked at 288/304/320/336; H at 352 (H0), 368 (H1),
       32 (H2), 384 (H3), xmm0 (H4). */
    movdqa %xmm3, 304(%esp)
    movdqu 80(%eax), %xmm3
    movaps %xmm0, 400(%esp)
    movd 96(%eax), %xmm0
    movdqa %xmm2, 320(%esp)
    pshufd $85, %xmm3, %xmm2
    movdqa %xmm2, 128(%esp)
    pshufd $0, %xmm0, %xmm2
    movdqa 160(%esp), %xmm0
    movdqa %xmm5, 336(%esp)
    pshufd $0, %xmm3, %xmm5
    movdqa %xmm5, (%esp)
    movdqa %xmm0, %xmm5
    pmuludq 128(%esp), %xmm5
    movdqa %xmm4, 288(%esp)
    pshufd $170, %xmm3, %xmm4
    movdqa %xmm5, 80(%esp)
    movdqa %xmm0, %xmm5
    movdqa %xmm4, 112(%esp)
    pshufd $255, %xmm3, %xmm3
    pmuludq %xmm4, %xmm5
    movdqa %xmm0, %xmm4
    pmuludq %xmm3, %xmm4
    pmuludq %xmm2, %xmm0
    movdqa %xmm3, 144(%esp)
    movdqa %xmm2, 96(%esp)
    movdqa %xmm5, 64(%esp)
    movdqa %xmm4, 48(%esp)
    movdqa %xmm0, 16(%esp)
    movaps 400(%esp), %xmm0
    movaps %xmm1, 384(%esp)
    movdqa %xmm6, 368(%esp)
    movdqa %xmm7, 352(%esp)
poly1305_blocks_sse2_16:
    /* Loop body, 64 bytes per pass:
         H = H*S + M[0..1]*R + M[2..3]
       where M[0..1] are the blocks at 0/16(%edx) and M[2..3] those at
       32/48(%edx). */
    /* Split M[0..1] into limbs: 416 = M0, 432 = M1, 448 = M2, xmm5 = M3,
       xmm6 = M4 (high bit OR-ed in further down). */
    movq 8(%edx), %xmm6
    addl $-64, %ecx
    movq (%edx), %xmm2
    movhpd 24(%edx), %xmm6
    movdqa 256(%esp), %xmm1
    movaps %xmm6, %xmm7
    movhpd 16(%edx), %xmm2
    movdqa %xmm1, %xmm4
    pand %xmm2, %xmm4
    movaps %xmm2, %xmm3
    movaps %xmm6, %xmm5
    psrlq $52, %xmm2
    psllq $12, %xmm7
    psrlq $26, %xmm3
    psrlq $14, %xmm5
    por %xmm7, %xmm2
    pand %xmm1, %xmm3
    pand %xmm1, %xmm5
    pand %xmm1, %xmm2
    psrlq $40, %xmm6
    /* Interleave the dwords of M[2..3]; they are added unreduced later. */
    movdqu 32(%edx), %xmm1
    movdqu 48(%edx), %xmm7
    movdqa %xmm4, 416(%esp)
    movdqa %xmm1, %xmm4
    punpckldq %xmm7, %xmm4
    addl $64, %edx
    punpckhdq %xmm7, %xmm1
    cmpl $64, %ecx                  /* loop condition; consumed by jae at the bottom */
    /* --- limb 0: H4*5S1 + H3*5S2 + H2*5S3 + H1*5S4 + H0*S0
                 + M4*5R1 + M3*5R2 + M2*5R3 + M1*5R4 + M0*R0 + dword 0 --- */
    movaps 384(%esp), %xmm7
    movaps %xmm0, 400(%esp)         /* park H4 */
    pmuludq 80(%esp), %xmm0
    pmuludq 64(%esp), %xmm7
    movdqa %xmm1, 480(%esp)         /* dwords 2,3 of M[2..3] */
    movdqa 32(%esp), %xmm1
    pmuludq 48(%esp), %xmm1
    paddq %xmm7, %xmm0
    movdqa 368(%esp), %xmm7
    pmuludq 16(%esp), %xmm7
    paddq %xmm1, %xmm0
    movdqa 352(%esp), %xmm1
    paddq %xmm7, %xmm0
    movdqa (%esp), %xmm7
    pmuludq %xmm7, %xmm1
    por 272(%esp), %xmm6            /* M4 |= high bit */
    paddq %xmm1, %xmm0
    movdqa 288(%esp), %xmm1
    pmuludq %xmm6, %xmm1
    paddq %xmm1, %xmm0
    movdqa 304(%esp), %xmm1
    pmuludq %xmm5, %xmm1
    paddq %xmm1, %xmm0
    movdqa 320(%esp), %xmm1
    pmuludq %xmm2, %xmm1
    movaps %xmm2, 448(%esp)
    movdqa 336(%esp), %xmm2
    paddq %xmm1, %xmm0
    movdqa %xmm2, %xmm1
    pmuludq %xmm3, %xmm1
    paddq %xmm1, %xmm0
    movdqa 176(%esp), %xmm1
    movdqa %xmm3, 432(%esp)
    movdqa %xmm1, %xmm3
    pmuludq 416(%esp), %xmm3
    pmuludq %xmm5, %xmm1            /* M3*R0, used for limb 3 */
    paddq %xmm3, %xmm0
    pxor %xmm3, %xmm3
    movdqa %xmm4, 464(%esp)         /* dwords 0,1 of M[2..3] */
    punpckldq %xmm3, %xmm4
    paddq %xmm4, %xmm0
    movdqa %xmm0, 496(%esp)         /* T0 */
    /* --- limb 3: H4*5S4 + H3*S0 + H2*S1 + H1*S2 + H0*S3
                 + M4*5R4 + M3*R0 + M2*R1 + M1*R2 + M0*R3 + dword 3 << 18 --- */
    movaps 400(%esp), %xmm4
    movaps 384(%esp), %xmm0
    pmuludq 16(%esp), %xmm4
    pmuludq %xmm7, %xmm0
    movdqa 32(%esp), %xmm7
    pmuludq 128(%esp), %xmm7
    paddq %xmm0, %xmm4
    movdqa 368(%esp), %xmm0
    pmuludq 112(%esp), %xmm0
    paddq %xmm7, %xmm4
    movdqa 352(%esp), %xmm7
    pmuludq 144(%esp), %xmm7
    paddq %xmm0, %xmm4
    movdqa %xmm2, %xmm0
    pmuludq %xmm6, %xmm0
    paddq %xmm7, %xmm4
    movdqa 224(%esp), %xmm7
    paddq %xmm0, %xmm4
    movaps 448(%esp), %xmm0
    pmuludq %xmm0, %xmm7
    pmuludq %xmm0, %xmm2            /* M2*5R4, used for limb 1 */
    paddq %xmm1, %xmm4
    movdqa 208(%esp), %xmm1
    pmuludq 432(%esp), %xmm1
    paddq %xmm7, %xmm4
    movdqa 192(%esp), %xmm7
    paddq %xmm1, %xmm4
    movdqa 416(%esp), %xmm1
    pmuludq %xmm1, %xmm7
    paddq %xmm7, %xmm4
    movdqa 480(%esp), %xmm7
    punpckhdq %xmm3, %xmm7
    psllq $18, %xmm7
    paddq %xmm7, %xmm4
    movdqa %xmm4, 512(%esp)         /* T3 */
    /* --- limb 1: H4*5S2 + H3*5S3 + H2*5S4 + H1*S0 + H0*S1
                 + M4*5R2 + M3*5R3 + M2*5R4 + M1*R0 + M0*R1
                 + dword 1 << 6 + (T0 >> 26) --- */
    movaps 400(%esp), %xmm4
    movaps 384(%esp), %xmm3
    pmuludq 64(%esp), %xmm4
    pmuludq 48(%esp), %xmm3
    movdqa 32(%esp), %xmm7
    pmuludq 16(%esp), %xmm7
    paddq %xmm3, %xmm4
    movdqa 368(%esp), %xmm3
    pmuludq (%esp), %xmm3
    paddq %xmm7, %xmm4
    movdqa 352(%esp), %xmm7
    paddq %xmm3, %xmm4
    movdqa 128(%esp), %xmm3
    pmuludq %xmm3, %xmm7
    paddq %xmm7, %xmm4
    movdqa 304(%esp), %xmm7
    pmuludq %xmm6, %xmm7
    paddq %xmm7, %xmm4
    movdqa 320(%esp), %xmm7
    pmuludq %xmm5, %xmm7
    paddq %xmm7, %xmm4
    movdqa 176(%esp), %xmm7
    movdqa %xmm7, %xmm0
    pmuludq 432(%esp), %xmm0
    paddq %xmm2, %xmm4
    movdqa 224(%esp), %xmm2
    paddq %xmm0, %xmm4
    movdqa %xmm2, %xmm0
    pmuludq %xmm1, %xmm0
    pmuludq %xmm5, %xmm2            /* M3*R1, used for limb 4 */
    pmuludq 336(%esp), %xmm5        /* M3*5R4, used for limb 2 */
    paddq %xmm0, %xmm4
    movdqa 464(%esp), %xmm0
    pxor %xmm1, %xmm1
    punpckhdq %xmm1, %xmm0
    psllq $6, %xmm0
    movdqa 496(%esp), %xmm1
    paddq %xmm0, %xmm4
    psrlq $26, %xmm1
    paddq %xmm1, %xmm4
    movdqa %xmm4, 528(%esp)         /* T1 */
    /* --- limb 4 (xmm0): H4*S0 + H3*S1 + H2*S2 + H1*S3 + H0*S4
                 + M4*R0 + M3*R1 + M2*R2 + M1*R3 + M0*R4
                 + high bit + (T3 >> 26)
           limb 2 (xmm4) is accumulated in parallel:
                 H4*5S3 + H3*5S4 + H2*S0 + H1*S1 + H0*S2
                 + M4*5R3 + M3*5R4 + M2*R0 + M1*R1 + M0*R2
                 + dword 2 << 12 + (T1 >> 26) --- */
    movaps 400(%esp), %xmm4
    movaps 384(%esp), %xmm1
    movaps %xmm4, %xmm0
    pmuludq (%esp), %xmm0
    pmuludq %xmm3, %xmm1
    pmuludq 48(%esp), %xmm4
    paddq %xmm1, %xmm0
    movdqa 32(%esp), %xmm3
    pmuludq 112(%esp), %xmm3
    paddq %xmm3, %xmm0
    movdqa 368(%esp), %xmm3
    movdqa %xmm3, %xmm1
    pmuludq 144(%esp), %xmm1
    pmuludq 128(%esp), %xmm3
    paddq %xmm1, %xmm0
    movdqa 352(%esp), %xmm1
    pmuludq 96(%esp), %xmm1
    paddq %xmm1, %xmm0
    movdqa %xmm7, %xmm1
    pmuludq %xmm6, %xmm1
    pmuludq 320(%esp), %xmm6
    paddq %xmm1, %xmm0
    movdqa 208(%esp), %xmm1
    pmuludq 448(%esp), %xmm1
    paddq %xmm2, %xmm0
    movdqa 192(%esp), %xmm2
    pmuludq 432(%esp), %xmm2
    paddq %xmm1, %xmm0
    movdqa 416(%esp), %xmm1
    paddq %xmm2, %xmm0
    movdqa 240(%esp), %xmm2
    pmuludq %xmm1, %xmm2
    pmuludq 208(%esp), %xmm1
    paddq %xmm2, %xmm0
    movdqa 512(%esp), %xmm2
    paddq 272(%esp), %xmm0
    psrlq $26, %xmm2
    paddq %xmm2, %xmm0              /* xmm0 = T4 */
    movaps 384(%esp), %xmm2
    pmuludq 16(%esp), %xmm2
    paddq %xmm2, %xmm4
    movdqa 32(%esp), %xmm2
    pmuludq (%esp), %xmm2
    paddq %xmm2, %xmm4
    paddq %xmm3, %xmm4
    movdqa 352(%esp), %xmm3
    pmuludq 112(%esp), %xmm3
    paddq %xmm3, %xmm4
    paddq %xmm6, %xmm4
    movaps 448(%esp), %xmm6
    movdqa %xmm0, %xmm3
    pmuludq %xmm7, %xmm6
    psrlq $26, %xmm3
    pmuludq 160(%esp), %xmm3        /* 5 * (T4 >> 26): wraps into limb 0 */
    paddq %xmm5, %xmm4
    movdqa 432(%esp), %xmm5
    pmuludq 224(%esp), %xmm5
    paddq %xmm6, %xmm4
    paddq %xmm5, %xmm4
    movdqa 480(%esp), %xmm7
    pxor %xmm5, %xmm5
    punpckldq %xmm5, %xmm7
    paddq %xmm1, %xmm4
    movdqa 528(%esp), %xmm2
    psllq $12, %xmm7
    movdqa %xmm2, %xmm1
    paddq %xmm7, %xmm4
    psrlq $26, %xmm1
    paddq %xmm1, %xmm4              /* xmm4 = T2 */
    /* Carry propagation; new H goes to 352 (H0), 368 (H1), 32 (H2),
       384 (H3) and xmm0 (H4). */
    movdqa 256(%esp), %xmm5
    movaps %xmm4, %xmm1
    movdqa 496(%esp), %xmm6
    psrlq $26, %xmm1
    movdqa 512(%esp), %xmm7
    pand %xmm5, %xmm6
    pand %xmm5, %xmm7
    pand %xmm5, %xmm2
    paddq %xmm3, %xmm6
    paddq %xmm1, %xmm7
    movdqa %xmm5, %xmm3
    movdqa %xmm5, %xmm1
    pand %xmm6, %xmm3
    psrlq $26, %xmm6
    pand %xmm4, %xmm1
    movdqa %xmm5, %xmm4
    paddq %xmm6, %xmm2
    pand %xmm7, %xmm4
    pand %xmm5, %xmm0
    psrlq $26, %xmm7
    movdqa %xmm3, 352(%esp)
    movdqa %xmm2, 368(%esp)
    movdqa %xmm1, 32(%esp)
    movaps %xmm4, 384(%esp)
    paddq %xmm7, %xmm0
    jae poly1305_blocks_sse2_16     /* at least 64 bytes still to go */
poly1305_blocks_sse2_17:
    /* Leave the loop: restore the register layout expected below. */
    movdqa 336(%esp), %xmm5
    movdqa 320(%esp), %xmm2
    movdqa 304(%esp), %xmm3
    movdqa 288(%esp), %xmm4
    movaps 384(%esp), %xmm1
    movdqa 368(%esp), %xmm6
    movdqa 352(%esp), %xmm7
poly1305_blocks_sse2_18:
    cmpl $32, %ecx
    jb poly1305_blocks_sse2_22      /* nothing left to multiply */
poly1305_blocks_sse2_19:
    /* H *= R (schoolbook product with the 5x wrap-around terms).
       In: xmm7 = H0, xmm6 = H1, 32(%esp) = H2, xmm1 = H3, xmm0 = H4,
           xmm4/xmm3/xmm2/xmm5 = 5*R1..5*R4.
       Out: xmm4 = t0, xmm3 = t1, xmm2 = t2, xmm5 = t3, xmm0 = t4.
       testl sets ZF for the je at the end of this section. */
    movaps %xmm1, 384(%esp)
    testl %edx, %edx
    pmuludq %xmm0, %xmm4
    pmuludq %xmm3, %xmm1
    pmuludq %xmm0, %xmm3
    paddq %xmm1, %xmm4
    movdqa 32(%esp), %xmm1
    pmuludq %xmm2, %xmm1
    paddq %xmm1, %xmm4
    movdqa %xmm6, %xmm1
    pmuludq %xmm5, %xmm1
    paddq %xmm1, %xmm4
    movdqa 176(%esp), %xmm1
    movdqa %xmm7, 352(%esp)
    pmuludq %xmm1, %xmm7
    paddq %xmm7, %xmm4
    movdqa %xmm4, 288(%esp)         /* t0 */
    movaps 384(%esp), %xmm4
    movaps %xmm4, %xmm7
    pmuludq %xmm2, %xmm7
    pmuludq %xmm0, %xmm2
    paddq %xmm7, %xmm3
    movdqa 32(%esp), %xmm7
    pmuludq %xmm5, %xmm7
    movdqa %xmm6, 368(%esp)
    pmuludq %xmm1, %xmm6
    paddq %xmm7, %xmm3
    paddq %xmm6, %xmm3
    movdqa 352(%esp), %xmm7
    movdqa 224(%esp), %xmm6
    pmuludq %xmm6, %xmm7
    paddq %xmm7, %xmm3
    movdqa %xmm3, 304(%esp)         /* t1 */
    movaps %xmm4, %xmm3
    pmuludq %xmm5, %xmm3
    pmuludq %xmm0, %xmm5
    pmuludq %xmm1, %xmm0
    paddq %xmm3, %xmm2
    movdqa 32(%esp), %xmm3
    movdqa %xmm3, %xmm7
    pmuludq %xmm1, %xmm7
    paddq %xmm7, %xmm2
    movdqa 368(%esp), %xmm7
    pmuludq %xmm6, %xmm7
    movdqa 352(%esp), %xmm6
    paddq %xmm7, %xmm2
    movdqa 208(%esp), %xmm7
    pmuludq %xmm7, %xmm6
    paddq %xmm6, %xmm2
    movdqa %xmm2, 320(%esp)         /* t2 */
    movaps %xmm4, %xmm2
    pmuludq %xmm1, %xmm2
    paddq %xmm2, %xmm5
    movdqa 224(%esp), %xmm6
    movdqa %xmm3, %xmm2
    pmuludq %xmm6, %xmm2
    pmuludq %xmm6, %xmm4
    pmuludq 208(%esp), %xmm3
    paddq %xmm2, %xmm5
    paddq %xmm4, %xmm0
    movdqa 368(%esp), %xmm2
    pmuludq %xmm7, %xmm2
    paddq %xmm3, %xmm0
    paddq %xmm2, %xmm5
    movdqa 192(%esp), %xmm2
    movdqa 368(%esp), %xmm4
    pmuludq %xmm2, %xmm4
    movdqa 352(%esp), %xmm7
    movdqa 352(%esp), %xmm1
    pmuludq 240(%esp), %xmm1
    pmuludq %xmm2, %xmm7
    paddq %xmm4, %xmm0
    paddq %xmm7, %xmm5              /* t3 */
    paddq %xmm1, %xmm0              /* t4 */
    movdqa 288(%esp), %xmm4
    movdqa 304(%esp), %xmm3
    movdqa 320(%esp), %xmm2
    je poly1305_blocks_sse2_21      /* m == NULL: no message to add */
poly1305_blocks_sse2_20:
    /* Add the next two blocks, unreduced: dword k of each block goes to
       limb k shifted left by 6*k bits; the high bit goes to limb 4. */
    movdqu (%edx), %xmm1
    movdqu 16(%edx), %xmm7
    movdqa %xmm1, %xmm6
    movaps %xmm0, 400(%esp)
    punpckldq %xmm7, %xmm6
    pxor %xmm0, %xmm0
    punpckhdq %xmm7, %xmm1
    movdqa %xmm6, %xmm7
    punpckhdq %xmm0, %xmm6
    psllq $6, %xmm6
    paddq %xmm6, %xmm3
    movdqa %xmm1, %xmm6
    punpckldq %xmm0, %xmm6
    punpckhdq %xmm0, %xmm1
    psllq $12, %xmm6
    punpckldq %xmm0, %xmm7
    psllq $18, %xmm1
    movaps 400(%esp), %xmm0
    paddq 272(%esp), %xmm0
    paddq %xmm7, %xmm4
    paddq %xmm6, %xmm2
    paddq %xmm1, %xmm5
poly1305_blocks_sse2_21:
    /* Carry propagation. Out: xmm7 = H0, xmm6 = H1, 32(%esp) = H2,
       xmm1 = H3, xmm0 = H4. */
    movdqa %xmm5, %xmm6
    movdqa %xmm4, %xmm7
    psrlq $26, %xmm6
    psrlq $26, %xmm7
    paddq %xmm6, %xmm0
    paddq %xmm7, %xmm3
    movaps %xmm0, %xmm7
    movdqa %xmm3, %xmm1
    psrlq $26, %xmm7
    psrlq $26, %xmm1
    pmuludq 160(%esp), %xmm7        /* 5 * carry out of limb 4 */
    paddq %xmm1, %xmm2
    movdqa 256(%esp), %xmm1
    movdqa %xmm2, %xmm6
    pand 256(%esp), %xmm5
    psrlq $26, %xmm6
    pand %xmm1, %xmm3
    pand 256(%esp), %xmm4
    paddq %xmm6, %xmm5
    paddq %xmm7, %xmm4
    movdqa %xmm3, %xmm6
    movdqa %xmm1, %xmm3
    movdqa %xmm1, %xmm7
    pand %xmm2, %xmm3
    movdqa %xmm1, %xmm2
    pand %xmm4, %xmm7
    psrlq $26, %xmm4
    pand %xmm5, %xmm1
    pand %xmm2, %xmm0
    psrlq $26, %xmm5
    paddq %xmm4, %xmm6
    paddq %xmm5, %xmm0
    movdqa %xmm3, 32(%esp)
poly1305_blocks_sse2_22:
    testl %edx, %edx
    je poly1305_blocks_sse2_24      /* m == NULL: final reduction */
poly1305_blocks_sse2_23:
    /* Store H back in two-lane form (low dword of each lane) and return. */
    pshufd $8, %xmm7, %xmm2
    pshufd $8, %xmm6, %xmm6
    pshufd $8, 32(%esp), %xmm3
    pshufd $8, %xmm1, %xmm1
    punpcklqdq %xmm6, %xmm2
    punpcklqdq %xmm1, %xmm3
    pshufd $8, %xmm0, %xmm0
    movdqu %xmm2, (%eax)
    movdqu %xmm3, 16(%eax)
    movq %xmm0, 32(%eax)
    addl $544, %esp
    popl %ebx
    popl %edi
    popl %esi
    ret
poly1305_blocks_sse2_24:
    /* Final reduction. Add lane 1 to lane 0 for every limb and move the
       sums to integer registers, carrying as we go:
         ecx = h0, ebx = h1, edi = h2, eax = h3, edx = h4.
       The state pointer is parked at (%esp). */
    movdqa %xmm7, %xmm2
    movdqa %xmm6, %xmm3
    psrldq $8, %xmm2
    paddq %xmm2, %xmm7
    psrldq $8, %xmm3
    movd %xmm7, %ecx
    paddq %xmm3, %xmm6
    movdqa 32(%esp), %xmm5
    movl %ecx, %esi
    movdqa %xmm5, %xmm4
    andl $67108863, %ecx
    movd %xmm6, %ebx
    movaps %xmm1, %xmm6
    psrldq $8, %xmm4
    paddq %xmm4, %xmm5
    shrl $26, %esi
    addl %esi, %ebx
    psrldq $8, %xmm6
    movd %xmm5, %edi
    paddq %xmm6, %xmm1
    movl %eax, (%esp)
    movl %ebx, %eax
    shrl $26, %eax
    andl $67108863, %ebx
    addl %eax, %edi
    movd %xmm1, %eax
    movaps %xmm0, %xmm1
    psrldq $8, %xmm1
    paddq %xmm1, %xmm0
    movl %edi, %edx
    andl $67108863, %edi
    shrl $26, %edx
    addl %edx, %eax
    movd %xmm0, %edx
    movl %eax, %esi
    shrl $26, %esi
    andl $67108863, %eax
    addl %esi, %edx
    movl %edx, %esi
    andl $67108863, %edx
    shrl $26, %esi
    lea (%esi,%esi,4), %esi         /* carry out of h4, times 5, back into h0 */
    addl %esi, %ecx
    /* Second carry pass; afterwards ecx = h0, ebx = h1, esi = h2,
       edi = h3, eax = h4. */
    movl %ecx, %esi
    andl $67108863, %ecx
    shrl $26, %esi
    addl %esi, %ebx
    movl %ebx, %esi
    andl $67108863, %ebx
    shrl $26, %esi
    addl %esi, %edi
    movl %edi, %esi
    shrl $26, %edi
    andl $67108863, %esi
    addl %edi, %eax
    movl %eax, %edi
    shrl $26, %eax
    andl $67108863, %edi
    addl %eax, %edx
    movl %edx, %eax
    shrl $26, %edx
    andl $67108863, %eax
    movl %eax, 8(%esp)              /* h4 */
    movl %edi, 4(%esp)              /* h3 */
    lea (%edx,%edx,4), %edx
    addl %edx, %ecx
    movl %ecx, %edx
    andl $67108863, %edx            /* edx = h0 */
    shrl $26, %ecx
    addl %ecx, %ebx                 /* ebx = h1 */
    /* g = h + 5 - 2^130, limb by limb; g0..g3 at 12..24(%esp), g4 at 28. */
    lea 5(%edx), %ecx
    movl %ecx, 12(%esp)
    shrl $26, %ecx
    addl %ebx, %ecx
    movl %ecx, 16(%esp)
    shrl $26, %ecx
    addl %esi, %ecx
    movl %ecx, 20(%esp)
    shrl $26, %ecx
    addl %edi, %ecx
    movl %ecx, 24(%esp)
    shrl $26, %ecx
    movl 12(%esp), %edi
    andl $67108863, %edi
    lea -67108864(%ecx,%eax), %eax  /* g4 = h4 + carry - 2^26 */
    movl %eax, 28(%esp)
    /* Branch-free select: eax = all ones when g4 is non-negative (take g),
       zero otherwise (keep h); ecx is the complement. */
    shrl $31, %eax
    decl %eax
    movl %eax, %ecx
    andl %eax, %edi
    notl %ecx
    andl %ecx, %edx
    andl %ecx, %ebx
    orl %edi, %edx
    andl %ecx, %esi
    movl (%esp), %edi               /* edi = state */
    movl %edx, (%edi)
    movl 16(%esp), %edx
    andl $67108863, %edx
    andl %eax, %edx
    orl %edx, %ebx
    movl %ebx, 4(%edi)
    movl 20(%esp), %ebx
    andl $67108863, %ebx
    andl %eax, %ebx
    movl 24(%esp), %edx
    orl %ebx, %esi
    andl $67108863, %edx
    movl %esi, 8(%edi)
    andl %eax, %edx
    movl 4(%esp), %esi
    andl %ecx, %esi
    orl %edx, %esi
    movl 28(%esp), %edx
    andl 8(%esp), %ecx
    andl %eax, %edx
    orl %edx, %ecx
    movl %esi, 12(%edi)
    movl %ecx, 16(%edi)
poly1305_blocks_sse2_25:
    addl $544, %esp
    popl %ebx
    popl %edi
    popl %esi
    ret
FN_END poly1305_blocks_sse2

/*
 * poly1305_init_ext_sse2 - initialise a state from a 32-byte key.
 *
 * Public entry:   4(%esp) = state, 8(%esp) = key, 12(%esp) = bytes.
 * Internal entry (poly1305_init_ext_sse2_local): %eax = state, %edx = key,
 * %ecx = bytes. A bytes value of 0 is replaced by 0xffffffff.
 *
 * What is written to the state (byte offsets):
 *     0.. 47  zeroed
 *    40.. 59  r: key[0..15] split into five 26-bit limbs and masked with
 *             67108863 / 67108611 / 67092735 / 66076671 / 1048575
 *             (the 26-bit limb masks with extra bits cleared; presumably
 *             the standard Poly1305 clamp)
 *    60.. 79  r^2 mod 2^130-5, only when bytes > 16
 *    80.. 99  r^4 mod 2^130-5, only when bytes >= 96
 *   100..115  key[16..31], copied verbatim
 *   116       flags = 0
 *
 * The loop at _3 squares the five limbs held in the frame in place; it runs
 * once or twice depending on bytes.
 *
 * Frame slots (after the prologue):
 *     0  key pointer, later state+80      4  state
 *     8  destination for the next power  12  bytes
 *    16  state+60                        28  iteration counter
 *    20/24/36/40/32  limbs 0..4 of the value being squared
 *    44..72  scratch for doubled / 5x limbs and partial results
 *
 * Preserves %esi, %edi, %ebx, %ebp; %eax still holds state on return.
 * Clobbers %ecx, %edx, %xmm0, flags.
 */
GLOBAL_HIDDEN_FN poly1305_init_ext_sse2
    movl 4(%esp), %eax
    movl 8(%esp), %edx
    movl 12(%esp), %ecx
poly1305_init_ext_sse2_local:
    pushl %esi
    pushl %edi
    pushl %ebx
    pushl %ebp
    subl $76, %esp
    movl %edx, %ebx                 /* ebx = key */
    movl $-1, %edx
    testl %ecx, %ecx                /* ZF consumed by cmove below */
    pxor %xmm0, %xmm0
    movdqu %xmm0, (%eax)            /* clear state bytes 0..47 */
    movdqu %xmm0, 16(%eax)
    movdqu %xmm0, 32(%eax)
    cmove %edx, %ecx                /* bytes == 0 -> 0xffffffff */
    /* Split key[0..15] into five limbs and mask them:
       esi = r0, edi = r1, edx = r2, ecx = r3, ebx = r4. */
    movl 4(%ebx), %edx
    movl %edx, %ebp
    movl (%ebx), %edi
    movl %edi, %esi
    shrl $26, %edi
    andl $67108863, %esi
    shll $6, %ebp
    movl %ecx, 12(%esp)             /* save bytes */
    orl %ebp, %edi
    movl 8(%ebx), %ecx
    movl %ecx, %ebp
    shrl $20, %edx
    andl $67108611, %edi
    shll $12, %ebp
    movl %ebx, (%esp)               /* save the key pointer */
    orl %ebp, %edx
    movl 12(%ebx), %ebx
    movl %ebx, %ebp
    shrl $14, %ecx
    andl $67092735, %edx
    shll $18, %ebp
    orl %ebp, %ecx
    movl (%esp), %ebp               /* ebp = key */
    andl $66076671, %ecx
    shrl $8, %ebx
    andl $1048575, %ebx
    movl %esi, 40(%eax)
    movl %edi, 44(%eax)
    movl %edx, 48(%eax)
    movl %ecx, 52(%eax)
    movl %ebx, 56(%eax)
    /* Copy key[16..31] to state+100..115, interleaved with frame setup. */
    movl %esi, 20(%esp)
    movl 16(%ebp), %esi
    movl %esi, 100(%eax)
    movl %edi, 24(%esp)
    movl 20(%ebp), %edi
    movl %edi, 104(%eax)
    movl 24(%ebp), %esi
    movl %esi, 108(%eax)
    lea 80(%eax), %esi              /* esi = destination of the second squaring */
    movl 28(%ebp), %ebp
    movl 12(%esp), %edi             /* edi = bytes */
    cmpl $16, %edi                  /* consumed by jbe below */
    movl %ebp, 112(%eax)
    lea 60(%eax), %ebp              /* ebp = destination of the first squaring */
    movl $0, 28(%esp)               /* iteration counter = 0 */
    movl %ebp, 16(%esp)
    jbe poly1305_init_ext_sse2_9    /* bytes <= 16: no powers needed */
poly1305_init_ext_sse2_2:
    movl %ebp, 8(%esp)
    movl %esi, (%esp)
    movl %ebx, 32(%esp)
    movl %ecx, 40(%esp)
    movl %edx, 36(%esp)
    movl %edi, 12(%esp)
    movl %eax, 4(%esp)
poly1305_init_ext_sse2_3:
    /* Square the limbs at 20/24/36/40/32(%esp) using 32x32->64 mull with
       64-bit accumulators (low:high in register pairs). */
    /* limb 0: a0*a0 + 5*a1*2*a4 + 2*a2*5*a3 -> ecx:ebx */
    movl 40(%esp), %ebp
    movl 20(%esp), %eax
    mull %eax
    movl 32(%esp), %ebx
    lea (%ebp,%ebp,4), %esi         /* 5*a3 */
    movl 24(%esp), %ebp
    movl %eax, %ecx
    movl %esi, 48(%esp)
    lea (%ebx,%ebx), %edi           /* 2*a4 */
    movl %edx, %ebx
    movl %edi, 44(%esp)
    lea (%ebp,%ebp,4), %eax         /* 5*a1 */
    mull %edi
    movl 36(%esp), %edi
    addl %eax, %ecx
    movl %edi, 60(%esp)             /* a2 */
    adcl %edx, %ebx
    lea (%edi,%edi), %eax           /* 2*a2 */
    mull %esi
    addl %eax, %ecx
    lea (%ebp,%ebp), %esi           /* 2*a1 */
    movl 20(%esp), %eax
    adcl %edx, %ebx
    /* limb 1: a0*2*a1 + 5*a2*2*a4 + a3*5*a3 + carry -> ebp:esi */
    movl 40(%esp), %ebp
    movl %esi, 56(%esp)
    lea (%eax,%eax), %edx           /* 2*a0 */
    movl %edx, 68(%esp)
    mull %esi
    movl %edx, %esi
    addl %ebp, %ebp                 /* 2*a3 */
    movl %ebp, 64(%esp)
    movl %eax, %ebp
    movl %ecx, 52(%esp)             /* low word of limb 0 */
    lea (%edi,%edi,4), %eax         /* 5*a2 */
    mull 44(%esp)
    addl %eax, %ebp
    movl 40(%esp), %eax
    movl 48(%esp), %edi
    adcl %edx, %esi
    mull %edi
    shll $6, %ebx                   /* carry out of limb 0 = (ebx:ecx) >> 26 */
    shrl $26, %ecx
    orl %ecx, %ebx
    addl %ebx, %eax
    adcl $0, %edx
    addl %eax, %ebp
    movl 24(%esp), %eax
    adcl %edx, %esi
    /* limb 2: a1*a1 + a2*2*a0 + 2*a4*5*a3 + carry -> ecx:ebx */
    mull %eax
    movl %eax, %ecx
    movl %edx, %ebx
    movl 60(%esp), %eax
    mull 68(%esp)
    addl %eax, %ecx
    movl 44(%esp), %eax
    adcl %edx, %ebx
    mull %edi
    movl %ebp, 72(%esp)             /* low word of limb 1 */
    shll $6, %esi
    shrl $26, %ebp
    orl %ebp, %esi
    addl %esi, %eax
    adcl $0, %edx
    addl %eax, %ecx
    movl %ecx, %edi
    adcl %edx, %ebx
    andl $67108863, %edi
    shll $6, %ebx
    shrl $26, %ecx
    /* limb 3: 2*a1*a2 + a0*2*a3 + a4*5*a4 + carry -> ebx(low):ebp */
    movl 56(%esp), %eax
    orl %ecx, %ebx
    movl 60(%esp), %ecx
    mull %ecx
    movl %edi, 36(%esp)             /* new limb 2 */
    movl %eax, %esi
    movl 20(%esp), %eax
    movl %edx, %ebp
    movl 64(%esp), %edi
    mull %edi
    addl %eax, %esi
    movl 32(%esp), %eax
    adcl %edx, %ebp
    lea (%eax,%eax,4), %edx
    mull %edx
    addl %eax, %esi
    movl %ecx, %eax
    adcl %edx, %ebp
    addl %esi, %ebx
    movl %ebx, %esi
    adcl $0, %ebp
    andl $67108863, %esi            /* esi = new limb 3 */
    /* limb 4: a2*a2 + 2*a3*a1 + 2*a0*a4 + carry -> ebp(low):ebx */
    mull %ecx
    shll $6, %ebp
    movl %eax, %ecx
    shrl $26, %ebx
    movl %edi, %eax
    orl %ebx, %ebp
    movl %edx, %ebx
    mull 24(%esp)
    addl %eax, %ecx
    movl 68(%esp), %eax
    adcl %edx, %ebx
    mull 32(%esp)
    addl %eax, %ecx
    movl 52(%esp), %edi
    adcl %edx, %ebx
    addl %ecx, %ebp
    movl %ebp, %ecx
    adcl $0, %ebx
    andl $67108863, %edi            /* limb 0 low 26 bits */
    shll $6, %ebx
    andl $67108863, %ecx            /* ecx = new limb 4 */
    shrl $26, %ebp
    orl %ebp, %ebx                  /* carry out of limb 4 */
    movl 72(%esp), %eax
    andl $67108863, %eax            /* limb 1 low 26 bits */
    movl %ecx, 32(%esp)
    movl 28(%esp), %ecx
    lea (%ebx,%ebx,4), %ebx         /* 5 * carry wraps into limb 0 */
    addl %ebx, %edi
    incl %ecx                       /* ++counter */
    movl %edi, %ebp
    shrl $26, %edi
    andl $67108863, %ebp            /* ebp = new limb 0 */
    movl %esi, 40(%esp)
    cmpl $2, %ecx                   /* consumed by jae below */
    movl %ebp, 20(%esp)
    movl %ecx, 28(%esp)
    lea (%eax,%edi), %edx           /* new limb 1 = low bits + carry from limb 0 */
    movl %edx, 24(%esp)
    jae poly1305_init_ext_sse2_8    /* second squaring finished */
poly1305_init_ext_sse2_4:
    /* NOTE(review): the counter was just incremented from a start of 0, so
       it is at least 1 here and the _5 path below looks unreachable. */
    cmpl $0, 28(%esp)
    jne poly1305_init_ext_sse2_6
poly1305_init_ext_sse2_5:
    movl 8(%esp), %esi
    movl 32(%esp), %eax
    movl 40(%esp), %edx
    movl 36(%esp), %ecx
    movl 24(%esp), %ebx
    movl 16(%esp), %edi
    movl %eax, 16(%esi)
    movl %edx, 12(%esi)
    movl %ecx, 8(%esi)
    movl %ebx, 4(%esi)
    movl %ebp, (%esi)
    movl %edi, 8(%esp)
    jmp poly1305_init_ext_sse2_3
poly1305_init_ext_sse2_6:
    cmpl $1, 28(%esp)
    jne poly1305_init_ext_sse2_3
poly1305_init_ext_sse2_7:
    /* After the first squaring: store the result at state+60 and point the
       destination at state+80 for a possible second squaring. */
    movl 8(%esp), %esi
    movl 32(%esp), %eax
    movl 40(%esp), %edx
    movl 36(%esp), %ecx
    movl 24(%esp), %ebx
    movl (%esp), %edi
    movl %eax, 16(%esi)
    movl %edx, 12(%esi)
    movl %ecx, 8(%esi)
    movl %ebx, 4(%esi)
    movl %ebp, (%esi)
    movl %edi, 8(%esp)
    cmpl $96, 12(%esp)
    jae poly1305_init_ext_sse2_3    /* bytes >= 96: square once more */
    jmp poly1305_init_ext_sse2_10
poly1305_init_ext_sse2_8:
    /* After the second squaring: store the result at state+80. */
    movl 8(%esp), %ebp
    movl %esi, %ecx
    movl 36(%esp), %edx
    movl %ecx, 12(%ebp)
    movl %edx, 8(%ebp)
    movl 32(%esp), %ebx
    movl 24(%esp), %edx
    movl 20(%esp), %ecx
    movl 4(%esp), %eax              /* eax = state again */
    movl %ebx, 16(%ebp)
    movl %edx, 4(%ebp)
    movl %ecx, (%ebp)
poly1305_init_ext_sse2_9:
    movl $0, 116(%eax)              /* flags = 0 */
    addl $76, %esp
    popl %ebp
    popl %ebx
    popl %edi
    popl %esi
    ret
poly1305_init_ext_sse2_10:
    movl 4(%esp), %eax              /* eax = state, then take the common exit */
    jmp poly1305_init_ext_sse2_9
FN_END poly1305_init_ext_sse2

