1; This file is generated from a similarly-named Perl script in the BoringSSL
2; source tree. Do not edit by hand.
3
4default	rel
5%define XMMWORD
6%define YMMWORD
7%define ZMMWORD
8section	.text code align=64
9
10
11EXTERN	GFp_ia32cap_P
12
13global	GFp_bn_mul_mont_gather5
14
15ALIGN	64
;-----------------------------------------------------------------------
; GFp_bn_mul_mont_gather5 (Win64 entry point)
;
; Incoming Win64 arguments are remapped right after entry:
;   rcx -> rdi, rdx -> rsi, r8 -> rdx, r9 -> rcx,
;   [rsp+40] -> r8, [rsp+48] -> r9 (only the low 32 bits are kept),
;   [rsp+56] is a 32-bit selector that gets broadcast into xmm5.
; NOTE(review): by name these are presumably (rp, ap, table, np, &n0, num,
; power) -- confirm against the C prototype.
;
; If (r9d & 7) == 0 control is handed to $L$mul4x_enter with r11d loaded
; from GFp_ia32cap_P+8; otherwise the one-word-at-a-time path below runs.
; That path builds a variable-size frame on the stack, always writes the
; r9-word result to [rdi], restores rbx/rbp/r12-r15/rdi/rsi and returns
; rax = 1.
;-----------------------------------------------------------------------
16GFp_bn_mul_mont_gather5:
17	mov	QWORD[8+rsp],rdi	;WIN64 prologue
18	mov	QWORD[16+rsp],rsi
19	mov	rax,rsp
20$L$SEH_begin_GFp_bn_mul_mont_gather5:
; Move the Win64 argument registers / stack slots into rdi,rsi,rdx,rcx,r8,r9.
21	mov	rdi,rcx
22	mov	rsi,rdx
23	mov	rdx,r8
24	mov	rcx,r9
25	mov	r8,QWORD[40+rsp]
26	mov	r9,QWORD[48+rsp]
27
28
29
; Writing r9d clears the upper 32 bits of r9.
30	mov	r9d,r9d
; rax keeps the entry-time rsp; it is stored in the frame at $L$mul_page_walk_done.
31	mov	rax,rsp
32
; Word counts that are not a multiple of 8 use the generic path below.
33	test	r9d,7
34	jnz	NEAR $L$mul_enter
35	lea	r11,[GFp_ia32cap_P]
36	mov	r11d,DWORD[8+r11]
37	jmp	NEAR $L$mul4x_enter
38
39ALIGN	16
40$L$mul_enter:
; xmm5 = 32-bit selector taken from the stack argument at [rsp+56].
41	movd	xmm5,DWORD[56+rsp]
; Save the non-volatile general-purpose registers used below.
42	push	rbx
43
44	push	rbp
45
46	push	r12
47
48	push	r13
49
50	push	r14
51
52	push	r15
53
54
; r10 = (rsp - 280 - 8*r9) & -1024 is the new frame base.
55	neg	r9
56	mov	r11,rsp
57	lea	r10,[((-280))+r9*8+rsp]
58	neg	r9
59	and	r10,-1024
60
61
62
63
64
65
66
67
68
; Lower rsp to r10 while reading one qword per 4096-byte step, so every
; page between the old and the new stack pointer is touched in order.
69	sub	r11,r10
70	and	r11,-4096
71	lea	rsp,[r11*1+r10]
72	mov	r11,QWORD[rsp]
73	cmp	rsp,r10
74	ja	NEAR $L$mul_page_walk
75	jmp	NEAR $L$mul_page_walk_done
76
77$L$mul_page_walk:
78	lea	rsp,[((-4096))+rsp]
79	mov	r11,QWORD[rsp]
80	cmp	rsp,r10
81	ja	NEAR $L$mul_page_walk
82$L$mul_page_walk_done:
83
84	lea	r10,[$L$inc]
; Entry-time rsp is parked at [rsp+8*r9+8]; the epilogue reloads it from there.
85	mov	QWORD[8+r9*8+rsp],rax
86
87$L$mul_body:
88
; r12 = rdx+128, so the 16 slots of one table row sit at [r12-128]..[r12+112].
89	lea	r12,[128+rdx]
; xmm0/xmm1 = the two 16-byte constants at $L$inc (defined outside this excerpt).
90	movdqa	xmm0,XMMWORD[r10]
91	movdqa	xmm1,XMMWORD[16+r10]
; r10+112 is the 16-byte aligned start of the mask area above the temporary.
92	lea	r10,[((24-112))+r9*8+rsp]
93	and	r10,-16
94
; Broadcast the selector to all four dword lanes, then produce 16 compare
; masks: each counter is the previous one plus xmm4 and is compared for
; equality with the selector. Masks are stored at [r10+112]..[r10+352].
95	pshufd	xmm5,xmm5,0
96	movdqa	xmm4,xmm1
97	movdqa	xmm2,xmm1
98	paddd	xmm1,xmm0
99	pcmpeqd	xmm0,xmm5
100DB	0x67	; raw prefix byte for the next instruction (presumably padding)
101	movdqa	xmm3,xmm4
102	paddd	xmm2,xmm1
103	pcmpeqd	xmm1,xmm5
104	movdqa	XMMWORD[112+r10],xmm0
105	movdqa	xmm0,xmm4
106
107	paddd	xmm3,xmm2
108	pcmpeqd	xmm2,xmm5
109	movdqa	XMMWORD[128+r10],xmm1
110	movdqa	xmm1,xmm4
111
112	paddd	xmm0,xmm3
113	pcmpeqd	xmm3,xmm5
114	movdqa	XMMWORD[144+r10],xmm2
115	movdqa	xmm2,xmm4
116
117	paddd	xmm1,xmm0
118	pcmpeqd	xmm0,xmm5
119	movdqa	XMMWORD[160+r10],xmm3
120	movdqa	xmm3,xmm4
121	paddd	xmm2,xmm1
122	pcmpeqd	xmm1,xmm5
123	movdqa	XMMWORD[176+r10],xmm0
124	movdqa	xmm0,xmm4
125
126	paddd	xmm3,xmm2
127	pcmpeqd	xmm2,xmm5
128	movdqa	XMMWORD[192+r10],xmm1
129	movdqa	xmm1,xmm4
130
131	paddd	xmm0,xmm3
132	pcmpeqd	xmm3,xmm5
133	movdqa	XMMWORD[208+r10],xmm2
134	movdqa	xmm2,xmm4
135
136	paddd	xmm1,xmm0
137	pcmpeqd	xmm0,xmm5
138	movdqa	XMMWORD[224+r10],xmm3
139	movdqa	xmm3,xmm4
140	paddd	xmm2,xmm1
141	pcmpeqd	xmm1,xmm5
142	movdqa	XMMWORD[240+r10],xmm0
143	movdqa	xmm0,xmm4
144
145	paddd	xmm3,xmm2
146	pcmpeqd	xmm2,xmm5
147	movdqa	XMMWORD[256+r10],xmm1
148	movdqa	xmm1,xmm4
149
150	paddd	xmm0,xmm3
151	pcmpeqd	xmm3,xmm5
152	movdqa	XMMWORD[272+r10],xmm2
153	movdqa	xmm2,xmm4
154
155	paddd	xmm1,xmm0
156	pcmpeqd	xmm0,xmm5
157	movdqa	XMMWORD[288+r10],xmm3
158	movdqa	xmm3,xmm4
159	paddd	xmm2,xmm1
160	pcmpeqd	xmm1,xmm5
161	movdqa	XMMWORD[304+r10],xmm0
162
163	paddd	xmm3,xmm2
164DB	0x67
165	pcmpeqd	xmm2,xmm5
166	movdqa	XMMWORD[320+r10],xmm1
167
168	pcmpeqd	xmm3,xmm5
169	movdqa	XMMWORD[336+r10],xmm2
; Gather of the first multiplier word: every one of the 16 slots is read,
; ANDed with its mask and ORed into xmm0/xmm1. The last four masks are
; still live in xmm0..xmm3; the other twelve are reloaded from the frame.
170	pand	xmm0,XMMWORD[64+r12]
171
172	pand	xmm1,XMMWORD[80+r12]
173	pand	xmm2,XMMWORD[96+r12]
174	movdqa	XMMWORD[352+r10],xmm3
175	pand	xmm3,XMMWORD[112+r12]
176	por	xmm0,xmm2
177	por	xmm1,xmm3
178	movdqa	xmm4,XMMWORD[((-128))+r12]
179	movdqa	xmm5,XMMWORD[((-112))+r12]
180	movdqa	xmm2,XMMWORD[((-96))+r12]
181	pand	xmm4,XMMWORD[112+r10]
182	movdqa	xmm3,XMMWORD[((-80))+r12]
183	pand	xmm5,XMMWORD[128+r10]
184	por	xmm0,xmm4
185	pand	xmm2,XMMWORD[144+r10]
186	por	xmm1,xmm5
187	pand	xmm3,XMMWORD[160+r10]
188	por	xmm0,xmm2
189	por	xmm1,xmm3
190	movdqa	xmm4,XMMWORD[((-64))+r12]
191	movdqa	xmm5,XMMWORD[((-48))+r12]
192	movdqa	xmm2,XMMWORD[((-32))+r12]
193	pand	xmm4,XMMWORD[176+r10]
194	movdqa	xmm3,XMMWORD[((-16))+r12]
195	pand	xmm5,XMMWORD[192+r10]
196	por	xmm0,xmm4
197	pand	xmm2,XMMWORD[208+r10]
198	por	xmm1,xmm5
199	pand	xmm3,XMMWORD[224+r10]
200	por	xmm0,xmm2
201	por	xmm1,xmm3
202	movdqa	xmm4,XMMWORD[r12]
203	movdqa	xmm5,XMMWORD[16+r12]
204	movdqa	xmm2,XMMWORD[32+r12]
205	pand	xmm4,XMMWORD[240+r10]
206	movdqa	xmm3,XMMWORD[48+r12]
207	pand	xmm5,XMMWORD[256+r10]
208	por	xmm0,xmm4
209	pand	xmm2,XMMWORD[272+r10]
210	por	xmm1,xmm5
211	pand	xmm3,XMMWORD[288+r10]
212	por	xmm0,xmm2
213	por	xmm1,xmm3
; Fold the two accumulators, then OR the high qword onto the low qword.
214	por	xmm0,xmm1
215	pshufd	xmm1,xmm0,0x4e
216	por	xmm0,xmm1
; Advance to the next 256-byte table row.
217	lea	r12,[256+r12]
218DB	102,72,15,126,195	; encodes movq rbx,xmm0 (rbx = gathered multiplier word)
219
; r8 held a pointer; from here on r8 is the qword it points to.
220	mov	r8,QWORD[r8]
221	mov	rax,QWORD[rsi]
222
; r14 = outer (row) counter, r15 = inner (word) counter.
223	xor	r14,r14
224	xor	r15,r15
225
; First pass: r10 = low word of [rsi][0]*rbx, rbp = r8 * r10 (low 64 bits),
; then [rcx] words times rbp are accumulated together with [rsi] words
; times rbx. NOTE(review): rbp is presumably the Montgomery reduction factor.
226	mov	rbp,r8
227	mul	rbx
228	mov	r10,rax
229	mov	rax,QWORD[rcx]
230
231	imul	rbp,r10
232	mov	r11,rdx
233
234	mul	rbp
235	add	r10,rax
236	mov	rax,QWORD[8+rsi]
237	adc	rdx,0
238	mov	r13,rdx
239
240	lea	r15,[1+r15]
241	jmp	NEAR $L$1st_enter
242
243ALIGN	16
244$L$1st:
245	add	r13,rax
246	mov	rax,QWORD[r15*8+rsi]
247	adc	rdx,0
248	add	r13,r11
249	mov	r11,r10
250	adc	rdx,0
; Store the finished word into the temporary vector at rsp.
251	mov	QWORD[((-16))+r15*8+rsp],r13
252	mov	r13,rdx
253
254$L$1st_enter:
255	mul	rbx
256	add	r11,rax
257	mov	rax,QWORD[r15*8+rcx]
258	adc	rdx,0
259	lea	r15,[1+r15]
260	mov	r10,rdx
261
262	mul	rbp
263	cmp	r15,r9
264	jne	NEAR $L$1st
265
266
; Tail of the first pass: flush the last two words plus the carry word
; into [rsp+8*r9-16], [rsp+8*r9-8] and [rsp+8*r9].
267	add	r13,rax
268	adc	rdx,0
269	add	r13,r11
270	adc	rdx,0
271	mov	QWORD[((-16))+r9*8+rsp],r13
272	mov	r13,rdx
273	mov	r11,r10
274
275	xor	rdx,rdx
276	add	r13,r11
277	adc	rdx,0
278	mov	QWORD[((-8))+r9*8+rsp],r13
279	mov	QWORD[r9*8+rsp],rdx
280
281	lea	r14,[1+r14]
282	jmp	NEAR $L$outer
283ALIGN	16
284$L$outer:
; rdx-128 is the first stored mask (same address as r10+112 above).
; Gather the next multiplier word exactly as before, reading all 16 slots.
285	lea	rdx,[((24+128))+r9*8+rsp]
286	and	rdx,-16
287	pxor	xmm4,xmm4
288	pxor	xmm5,xmm5
289	movdqa	xmm0,XMMWORD[((-128))+r12]
290	movdqa	xmm1,XMMWORD[((-112))+r12]
291	movdqa	xmm2,XMMWORD[((-96))+r12]
292	movdqa	xmm3,XMMWORD[((-80))+r12]
293	pand	xmm0,XMMWORD[((-128))+rdx]
294	pand	xmm1,XMMWORD[((-112))+rdx]
295	por	xmm4,xmm0
296	pand	xmm2,XMMWORD[((-96))+rdx]
297	por	xmm5,xmm1
298	pand	xmm3,XMMWORD[((-80))+rdx]
299	por	xmm4,xmm2
300	por	xmm5,xmm3
301	movdqa	xmm0,XMMWORD[((-64))+r12]
302	movdqa	xmm1,XMMWORD[((-48))+r12]
303	movdqa	xmm2,XMMWORD[((-32))+r12]
304	movdqa	xmm3,XMMWORD[((-16))+r12]
305	pand	xmm0,XMMWORD[((-64))+rdx]
306	pand	xmm1,XMMWORD[((-48))+rdx]
307	por	xmm4,xmm0
308	pand	xmm2,XMMWORD[((-32))+rdx]
309	por	xmm5,xmm1
310	pand	xmm3,XMMWORD[((-16))+rdx]
311	por	xmm4,xmm2
312	por	xmm5,xmm3
313	movdqa	xmm0,XMMWORD[r12]
314	movdqa	xmm1,XMMWORD[16+r12]
315	movdqa	xmm2,XMMWORD[32+r12]
316	movdqa	xmm3,XMMWORD[48+r12]
317	pand	xmm0,XMMWORD[rdx]
318	pand	xmm1,XMMWORD[16+rdx]
319	por	xmm4,xmm0
320	pand	xmm2,XMMWORD[32+rdx]
321	por	xmm5,xmm1
322	pand	xmm3,XMMWORD[48+rdx]
323	por	xmm4,xmm2
324	por	xmm5,xmm3
325	movdqa	xmm0,XMMWORD[64+r12]
326	movdqa	xmm1,XMMWORD[80+r12]
327	movdqa	xmm2,XMMWORD[96+r12]
328	movdqa	xmm3,XMMWORD[112+r12]
329	pand	xmm0,XMMWORD[64+rdx]
330	pand	xmm1,XMMWORD[80+rdx]
331	por	xmm4,xmm0
332	pand	xmm2,XMMWORD[96+rdx]
333	por	xmm5,xmm1
334	pand	xmm3,XMMWORD[112+rdx]
335	por	xmm4,xmm2
336	por	xmm5,xmm3
337	por	xmm4,xmm5
338	pshufd	xmm0,xmm4,0x4e
339	por	xmm0,xmm4
340	lea	r12,[256+r12]
341
342	mov	rax,QWORD[rsi]
343DB	102,72,15,126,195	; encodes movq rbx,xmm0
344
; Same multiply-accumulate as the first pass, except that the previous
; contents of the temporary at rsp are added in as well.
345	xor	r15,r15
346	mov	rbp,r8
347	mov	r10,QWORD[rsp]
348
349	mul	rbx
350	add	r10,rax
351	mov	rax,QWORD[rcx]
352	adc	rdx,0
353
354	imul	rbp,r10
355	mov	r11,rdx
356
357	mul	rbp
358	add	r10,rax
359	mov	rax,QWORD[8+rsi]
360	adc	rdx,0
361	mov	r10,QWORD[8+rsp]
362	mov	r13,rdx
363
364	lea	r15,[1+r15]
365	jmp	NEAR $L$inner_enter
366
367ALIGN	16
368$L$inner:
369	add	r13,rax
370	mov	rax,QWORD[r15*8+rsi]
371	adc	rdx,0
372	add	r13,r10
373	mov	r10,QWORD[r15*8+rsp]
374	adc	rdx,0
375	mov	QWORD[((-16))+r15*8+rsp],r13
376	mov	r13,rdx
377
378$L$inner_enter:
379	mul	rbx
380	add	r11,rax
381	mov	rax,QWORD[r15*8+rcx]
382	adc	rdx,0
383	add	r10,r11
384	mov	r11,rdx
385	adc	r11,0
386	lea	r15,[1+r15]
387
388	mul	rbp
389	cmp	r15,r9
390	jne	NEAR $L$inner
391
392	add	r13,rax
393	adc	rdx,0
394	add	r13,r10
; r10 = carry word left at [rsp+8*r9] by the previous pass.
395	mov	r10,QWORD[r9*8+rsp]
396	adc	rdx,0
397	mov	QWORD[((-16))+r9*8+rsp],r13
398	mov	r13,rdx
399
400	xor	rdx,rdx
401	add	r13,r11
402	adc	rdx,0
403	add	r13,r10
404	adc	rdx,0
405	mov	QWORD[((-8))+r9*8+rsp],r13
406	mov	QWORD[r9*8+rsp],rdx
407
; Repeat until r14 has counted r9 passes.
408	lea	r14,[1+r14]
409	cmp	r14,r9
410	jb	NEAR $L$outer
411
; Final step: [rdi] = temporary - [rcx], word by word with borrow.
; The xor clears CF for the first sbb; lea and dec inside the loop leave
; CF untouched, so the borrow is carried from one iteration to the next.
412	xor	r14,r14
413	mov	rax,QWORD[rsp]
414	lea	rsi,[rsp]
415	mov	r15,r9
416	jmp	NEAR $L$sub
417ALIGN	16
418$L$sub:	sbb	rax,QWORD[r14*8+rcx]
419	mov	QWORD[r14*8+rdi],rax
420	mov	rax,QWORD[8+r14*8+rsi]
421	lea	r14,[1+r14]
422	dec	r15
423	jnz	NEAR $L$sub
424
; rax = carry word minus the final borrow; rbx = NOT rax. The pair is
; used as complementary select masks in the copy loop.
425	sbb	rax,0
426	mov	rbx,-1
427	xor	rbx,rax
428	xor	r14,r14
429	mov	r15,r9
430
431$L$copy:
; Branch-free select per word: ([rdi] AND rbx) OR ([rsp] AND rax) -> [rdi].
; The temporary word is overwritten with the loop index afterwards.
432	mov	rcx,QWORD[r14*8+rdi]
433	mov	rdx,QWORD[r14*8+rsp]
434	and	rcx,rbx
435	and	rdx,rax
436	mov	QWORD[r14*8+rsp],r14
437	or	rdx,rcx
438	mov	QWORD[r14*8+rdi],rdx
439	lea	r14,[1+r14]
440	sub	r15,1
441	jnz	NEAR $L$copy
442
; rsi = entry-time rsp saved after the page walk.
443	mov	rsi,QWORD[8+r9*8+rsp]
444
; Return value.
445	mov	rax,1
446
; Reload the six pushed registers from just below the entry-time rsp.
447	mov	r15,QWORD[((-48))+rsi]
448
449	mov	r14,QWORD[((-40))+rsi]
450
451	mov	r13,QWORD[((-32))+rsi]
452
453	mov	r12,QWORD[((-24))+rsi]
454
455	mov	rbp,QWORD[((-16))+rsi]
456
457	mov	rbx,QWORD[((-8))+rsi]
458
459	lea	rsp,[rsi]
460
461$L$mul_epilogue:
462	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
463	mov	rsi,QWORD[16+rsp]
464	DB	0F3h,0C3h		;repret
465
466$L$SEH_end_GFp_bn_mul_mont_gather5:
467
468ALIGN	32
;-----------------------------------------------------------------------
; bn_mul4x_mont_gather5 (file-local; no `global` directive for it is
; visible in this excerpt)
;
; Same Win64 argument remapping as GFp_bn_mul_mont_gather5. Builds a
; 64-byte aligned frame below rsp, calls mul4x_internal, then restores
; rbx/rbp/r12-r15/rdi/rsi and returns rax = 1.
;
; $L$mul4x_enter is also a jump target from GFp_bn_mul_mont_gather5,
; which arrives with rax = entry-time rsp and r11d = DWORD[GFp_ia32cap_P+8].
; NOTE(review): the fall-through path from this function's own prologue
; reaches the `and r11d` test without any visible load of r11d -- confirm
; that this label is only ever entered via that jump.
;-----------------------------------------------------------------------
469bn_mul4x_mont_gather5:
470	mov	QWORD[8+rsp],rdi	;WIN64 prologue
471	mov	QWORD[16+rsp],rsi
472	mov	rax,rsp
473$L$SEH_begin_bn_mul4x_mont_gather5:
474	mov	rdi,rcx
475	mov	rsi,rdx
476	mov	rdx,r8
477	mov	rcx,r9
478	mov	r8,QWORD[40+rsp]
479	mov	r9,QWORD[48+rsp]
480
481
482
483DB	0x67	; raw prefix byte for the next instruction (presumably padding)
484	mov	rax,rsp
485
486$L$mul4x_enter:
; If all bits of 0x80108 are set in r11d, use $L$mulx4x_enter instead
; (that label is outside this excerpt).
; NOTE(review): presumably the BMI1/BMI2/ADX feature bits -- confirm
; against the layout of GFp_ia32cap_P.
487	and	r11d,0x80108
488	cmp	r11d,0x80108
489	je	NEAR $L$mulx4x_enter
; Save the non-volatile general-purpose registers.
490	push	rbx
491
492	push	rbp
493
494	push	r12
495
496	push	r13
497
498	push	r14
499
500	push	r15
501
502$L$mul4x_prologue:
503
504DB	0x67
; r9 = 8*r9d (byte length, upper half cleared), r10 = 3*r9, then r9 = -r9.
505	shl	r9d,3
506	lea	r10,[r9*2+r9]
507	neg	r9
508
509
510
511
512
513
514
515
516
517
; Frame placement. r11 = (rsp - 320 - 2*len - rdi) mod 4096 is the
; distance between the candidate frame and rdi within a 4 KiB page; the
; frame is shifted down depending on how that compares with 3*len.
; NOTE(review): presumably this keeps the frame from aliasing the buffer
; at rdi modulo 4096 -- confirm against the generator script's comments.
518	lea	r11,[((-320))+r9*2+rsp]
519	mov	rbp,rsp
520	sub	r11,rdi
521	and	r11,4095
522	cmp	r10,r11
523	jb	NEAR $L$mul4xsp_alt
524	sub	rbp,r11
525	lea	rbp,[((-320))+r9*2+rbp]
526	jmp	NEAR $L$mul4xsp_done
527
528ALIGN	32
529$L$mul4xsp_alt:
; r11 = max(r11 - (4096 - 320 - 2*len), 0) via cmovc on the borrow.
530	lea	r10,[((4096-320))+r9*2]
531	lea	rbp,[((-320))+r9*2+rbp]
532	sub	r11,r10
533	mov	r10,0
534	cmovc	r11,r10
535	sub	rbp,r11
536$L$mul4xsp_done:
; Align the frame base to 64 bytes, then lower rsp to it while reading
; one qword per 4096-byte step.
537	and	rbp,-64
538	mov	r11,rsp
539	sub	r11,rbp
540	and	r11,-4096
541	lea	rsp,[rbp*1+r11]
542	mov	r10,QWORD[rsp]
543	cmp	rsp,rbp
544	ja	NEAR $L$mul4x_page_walk
545	jmp	NEAR $L$mul4x_page_walk_done
546
547$L$mul4x_page_walk:
548	lea	rsp,[((-4096))+rsp]
549	mov	r10,QWORD[rsp]
550	cmp	rsp,rbp
551	ja	NEAR $L$mul4x_page_walk
552$L$mul4x_page_walk_done:
553
; r9 back to the positive byte length.
554	neg	r9
555
; Entry-time rsp is kept at [rsp+40]; mul4x_internal receives it in rax.
556	mov	QWORD[40+rsp],rax
557
558$L$mul4x_body:
559
560	call	mul4x_internal
561
562	mov	rsi,QWORD[40+rsp]
563
; Return value.
564	mov	rax,1
565
; Reload the six pushed registers from just below the entry-time rsp.
566	mov	r15,QWORD[((-48))+rsi]
567
568	mov	r14,QWORD[((-40))+rsi]
569
570	mov	r13,QWORD[((-32))+rsi]
571
572	mov	r12,QWORD[((-24))+rsi]
573
574	mov	rbp,QWORD[((-16))+rsi]
575
576	mov	rbx,QWORD[((-8))+rsi]
577
578	lea	rsp,[rsi]
579
580$L$mul4x_epilogue:
581	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
582	mov	rsi,QWORD[16+rsp]
583	DB	0F3h,0C3h		;repret
584
585$L$SEH_end_bn_mul4x_mont_gather5:
586
587
588ALIGN	32
;-----------------------------------------------------------------------
; mul4x_internal (file-local helper, reached with `call`)
;
; Register inputs as used by the code below:
;   rax = entry-time rsp of the public caller (selector read from [rax+56])
;   rdx = table base, r9 = byte length, rsi / rcx = input word vectors,
;   r8  = pointer to a qword (dereferenced below), rdi = saved for later.
; Frame offsets are written as (N+8)+rsp: rsp here is 8 bytes below the
; caller's frame base because of the pushed return address.
;
; Processes four words per loop iteration, one gathered multiplier word
; per outer pass, and finishes with a jump to $L$sqr4x_sub_entry (outside
; this excerpt); there is no `ret` in this routine itself.
; NOTE(review): [rsi] and [rcx] are presumably the multiplicand and the
; modulus -- confirm against callers.
;-----------------------------------------------------------------------
589mul4x_internal:
590
; r13 = rdx + 128 + 32*r9: table end marker, compared with r12 after each pass.
591	shl	r9,5
592	movd	xmm5,DWORD[56+rax]
593	lea	rax,[$L$inc]
594	lea	r13,[128+r9*1+rdx]
595	shr	r9,5
596	movdqa	xmm0,XMMWORD[rax]
597	movdqa	xmm1,XMMWORD[16+rax]
; r10+112 = start of the mask area, located after the r9-byte temporary.
598	lea	r10,[((88-112))+r9*1+rsp]
599	lea	r12,[128+rdx]
600
; Broadcast the selector and build 16 equality masks at [r10+112]..[r10+352];
; each counter is the previous one plus xmm4.
601	pshufd	xmm5,xmm5,0
602	movdqa	xmm4,xmm1
603DB	0x67,0x67	; raw prefix bytes for the next instruction (presumably padding)
604	movdqa	xmm2,xmm1
605	paddd	xmm1,xmm0
606	pcmpeqd	xmm0,xmm5
607DB	0x67
608	movdqa	xmm3,xmm4
609	paddd	xmm2,xmm1
610	pcmpeqd	xmm1,xmm5
611	movdqa	XMMWORD[112+r10],xmm0
612	movdqa	xmm0,xmm4
613
614	paddd	xmm3,xmm2
615	pcmpeqd	xmm2,xmm5
616	movdqa	XMMWORD[128+r10],xmm1
617	movdqa	xmm1,xmm4
618
619	paddd	xmm0,xmm3
620	pcmpeqd	xmm3,xmm5
621	movdqa	XMMWORD[144+r10],xmm2
622	movdqa	xmm2,xmm4
623
624	paddd	xmm1,xmm0
625	pcmpeqd	xmm0,xmm5
626	movdqa	XMMWORD[160+r10],xmm3
627	movdqa	xmm3,xmm4
628	paddd	xmm2,xmm1
629	pcmpeqd	xmm1,xmm5
630	movdqa	XMMWORD[176+r10],xmm0
631	movdqa	xmm0,xmm4
632
633	paddd	xmm3,xmm2
634	pcmpeqd	xmm2,xmm5
635	movdqa	XMMWORD[192+r10],xmm1
636	movdqa	xmm1,xmm4
637
638	paddd	xmm0,xmm3
639	pcmpeqd	xmm3,xmm5
640	movdqa	XMMWORD[208+r10],xmm2
641	movdqa	xmm2,xmm4
642
643	paddd	xmm1,xmm0
644	pcmpeqd	xmm0,xmm5
645	movdqa	XMMWORD[224+r10],xmm3
646	movdqa	xmm3,xmm4
647	paddd	xmm2,xmm1
648	pcmpeqd	xmm1,xmm5
649	movdqa	XMMWORD[240+r10],xmm0
650	movdqa	xmm0,xmm4
651
652	paddd	xmm3,xmm2
653	pcmpeqd	xmm2,xmm5
654	movdqa	XMMWORD[256+r10],xmm1
655	movdqa	xmm1,xmm4
656
657	paddd	xmm0,xmm3
658	pcmpeqd	xmm3,xmm5
659	movdqa	XMMWORD[272+r10],xmm2
660	movdqa	xmm2,xmm4
661
662	paddd	xmm1,xmm0
663	pcmpeqd	xmm0,xmm5
664	movdqa	XMMWORD[288+r10],xmm3
665	movdqa	xmm3,xmm4
666	paddd	xmm2,xmm1
667	pcmpeqd	xmm1,xmm5
668	movdqa	XMMWORD[304+r10],xmm0
669
670	paddd	xmm3,xmm2
671DB	0x67
672	pcmpeqd	xmm2,xmm5
673	movdqa	XMMWORD[320+r10],xmm1
674
675	pcmpeqd	xmm3,xmm5
676	movdqa	XMMWORD[336+r10],xmm2
; Gather the first multiplier word: all 16 slots at [r12-128]..[r12+112]
; are read, masked and ORed together.
677	pand	xmm0,XMMWORD[64+r12]
678
679	pand	xmm1,XMMWORD[80+r12]
680	pand	xmm2,XMMWORD[96+r12]
681	movdqa	XMMWORD[352+r10],xmm3
682	pand	xmm3,XMMWORD[112+r12]
683	por	xmm0,xmm2
684	por	xmm1,xmm3
685	movdqa	xmm4,XMMWORD[((-128))+r12]
686	movdqa	xmm5,XMMWORD[((-112))+r12]
687	movdqa	xmm2,XMMWORD[((-96))+r12]
688	pand	xmm4,XMMWORD[112+r10]
689	movdqa	xmm3,XMMWORD[((-80))+r12]
690	pand	xmm5,XMMWORD[128+r10]
691	por	xmm0,xmm4
692	pand	xmm2,XMMWORD[144+r10]
693	por	xmm1,xmm5
694	pand	xmm3,XMMWORD[160+r10]
695	por	xmm0,xmm2
696	por	xmm1,xmm3
697	movdqa	xmm4,XMMWORD[((-64))+r12]
698	movdqa	xmm5,XMMWORD[((-48))+r12]
699	movdqa	xmm2,XMMWORD[((-32))+r12]
700	pand	xmm4,XMMWORD[176+r10]
701	movdqa	xmm3,XMMWORD[((-16))+r12]
702	pand	xmm5,XMMWORD[192+r10]
703	por	xmm0,xmm4
704	pand	xmm2,XMMWORD[208+r10]
705	por	xmm1,xmm5
706	pand	xmm3,XMMWORD[224+r10]
707	por	xmm0,xmm2
708	por	xmm1,xmm3
709	movdqa	xmm4,XMMWORD[r12]
710	movdqa	xmm5,XMMWORD[16+r12]
711	movdqa	xmm2,XMMWORD[32+r12]
712	pand	xmm4,XMMWORD[240+r10]
713	movdqa	xmm3,XMMWORD[48+r12]
714	pand	xmm5,XMMWORD[256+r10]
715	por	xmm0,xmm4
716	pand	xmm2,XMMWORD[272+r10]
717	por	xmm1,xmm5
718	pand	xmm3,XMMWORD[288+r10]
719	por	xmm0,xmm2
720	por	xmm1,xmm3
721	por	xmm0,xmm1
722	pshufd	xmm1,xmm0,0x4e
723	por	xmm0,xmm1
724	lea	r12,[256+r12]
725DB	102,72,15,126,195	; encodes movq rbx,xmm0 (rbx = gathered multiplier word)
726
; Spill the table end marker and rdi into the frame; both registers are reused.
727	mov	QWORD[((16+8))+rsp],r13
728	mov	QWORD[((56+8))+rsp],rdi
729
; r8 = qword pointed to by r8. rsi is moved to the end of its vector and
; r9 negated, so [rsi+r9+k] / [rsi+r15+k] index it with negative offsets.
730	mov	r8,QWORD[r8]
731	mov	rax,QWORD[rsi]
732	lea	rsi,[r9*1+rsi]
733	neg	r9
734
; rbp = r8 * low word of ([rsi][0] * rbx); r14 = temporary vector at rsp+72.
735	mov	rbp,r8
736	mul	rbx
737	mov	r10,rax
738	mov	rax,QWORD[rcx]
739
740	imul	rbp,r10
741	lea	r14,[((64+8))+rsp]
742	mov	r11,rdx
743
744	mul	rbp
745	add	r10,rax
746	mov	rax,QWORD[8+r9*1+rsi]
747	adc	rdx,0
748	mov	rdi,rdx
749
750	mul	rbx
751	add	r11,rax
752	mov	rax,QWORD[8+rcx]
753	adc	rdx,0
754	mov	r10,rdx
755
756	mul	rbp
757	add	rdi,rax
758	mov	rax,QWORD[16+r9*1+rsi]
759	adc	rdx,0
760	add	rdi,r11
; r15 = negative byte offset of the next 4-word group; the loops below
; run until it reaches zero.
761	lea	r15,[32+r9]
762	lea	rcx,[32+rcx]
763	adc	rdx,0
764	mov	QWORD[r14],rdi
765	mov	r13,rdx
766	jmp	NEAR $L$1st4x
767
768ALIGN	32
769$L$1st4x:
; First pass, four words per iteration: alternate `mul rbx` ([rsi] word)
; and `mul rbp` ([rcx] word), carrying through r10/r11 and rdi/r13.
770	mul	rbx
771	add	r10,rax
772	mov	rax,QWORD[((-16))+rcx]
773	lea	r14,[32+r14]
774	adc	rdx,0
775	mov	r11,rdx
776
777	mul	rbp
778	add	r13,rax
779	mov	rax,QWORD[((-8))+r15*1+rsi]
780	adc	rdx,0
781	add	r13,r10
782	adc	rdx,0
783	mov	QWORD[((-24))+r14],r13
784	mov	rdi,rdx
785
786	mul	rbx
787	add	r11,rax
788	mov	rax,QWORD[((-8))+rcx]
789	adc	rdx,0
790	mov	r10,rdx
791
792	mul	rbp
793	add	rdi,rax
794	mov	rax,QWORD[r15*1+rsi]
795	adc	rdx,0
796	add	rdi,r11
797	adc	rdx,0
798	mov	QWORD[((-16))+r14],rdi
799	mov	r13,rdx
800
801	mul	rbx
802	add	r10,rax
803	mov	rax,QWORD[rcx]
804	adc	rdx,0
805	mov	r11,rdx
806
807	mul	rbp
808	add	r13,rax
809	mov	rax,QWORD[8+r15*1+rsi]
810	adc	rdx,0
811	add	r13,r10
812	adc	rdx,0
813	mov	QWORD[((-8))+r14],r13
814	mov	rdi,rdx
815
816	mul	rbx
817	add	r11,rax
818	mov	rax,QWORD[8+rcx]
819	adc	rdx,0
820	mov	r10,rdx
821
822	mul	rbp
823	add	rdi,rax
824	mov	rax,QWORD[16+r15*1+rsi]
825	adc	rdx,0
826	add	rdi,r11
827	lea	rcx,[32+rcx]
828	adc	rdx,0
829	mov	QWORD[r14],rdi
830	mov	r13,rdx
831
832	add	r15,32
833	jnz	NEAR $L$1st4x
834
; Tail of the first pass (last two words), then reload rax with word 0
; of the [rsi] vector for the next pass.
835	mul	rbx
836	add	r10,rax
837	mov	rax,QWORD[((-16))+rcx]
838	lea	r14,[32+r14]
839	adc	rdx,0
840	mov	r11,rdx
841
842	mul	rbp
843	add	r13,rax
844	mov	rax,QWORD[((-8))+rsi]
845	adc	rdx,0
846	add	r13,r10
847	adc	rdx,0
848	mov	QWORD[((-24))+r14],r13
849	mov	rdi,rdx
850
851	mul	rbx
852	add	r11,rax
853	mov	rax,QWORD[((-8))+rcx]
854	adc	rdx,0
855	mov	r10,rdx
856
857	mul	rbp
858	add	rdi,rax
859	mov	rax,QWORD[r9*1+rsi]
860	adc	rdx,0
861	add	rdi,r11
862	adc	rdx,0
863	mov	QWORD[((-16))+r14],rdi
864	mov	r13,rdx
865
; r9 is negative here, so this rewinds rcx to the start of its vector.
866	lea	rcx,[r9*1+rcx]
867
; rdi = carry out of the top word (0 or 1).
868	xor	rdi,rdi
869	add	r13,r10
870	adc	rdi,0
871	mov	QWORD[((-8))+r14],r13
872
873	jmp	NEAR $L$outer4x
874
875ALIGN	32
876$L$outer4x:
; rdx-128 = first stored mask. Gather the next multiplier word, again
; touching all 16 slots of the current table row.
877	lea	rdx,[((16+128))+r14]
878	pxor	xmm4,xmm4
879	pxor	xmm5,xmm5
880	movdqa	xmm0,XMMWORD[((-128))+r12]
881	movdqa	xmm1,XMMWORD[((-112))+r12]
882	movdqa	xmm2,XMMWORD[((-96))+r12]
883	movdqa	xmm3,XMMWORD[((-80))+r12]
884	pand	xmm0,XMMWORD[((-128))+rdx]
885	pand	xmm1,XMMWORD[((-112))+rdx]
886	por	xmm4,xmm0
887	pand	xmm2,XMMWORD[((-96))+rdx]
888	por	xmm5,xmm1
889	pand	xmm3,XMMWORD[((-80))+rdx]
890	por	xmm4,xmm2
891	por	xmm5,xmm3
892	movdqa	xmm0,XMMWORD[((-64))+r12]
893	movdqa	xmm1,XMMWORD[((-48))+r12]
894	movdqa	xmm2,XMMWORD[((-32))+r12]
895	movdqa	xmm3,XMMWORD[((-16))+r12]
896	pand	xmm0,XMMWORD[((-64))+rdx]
897	pand	xmm1,XMMWORD[((-48))+rdx]
898	por	xmm4,xmm0
899	pand	xmm2,XMMWORD[((-32))+rdx]
900	por	xmm5,xmm1
901	pand	xmm3,XMMWORD[((-16))+rdx]
902	por	xmm4,xmm2
903	por	xmm5,xmm3
904	movdqa	xmm0,XMMWORD[r12]
905	movdqa	xmm1,XMMWORD[16+r12]
906	movdqa	xmm2,XMMWORD[32+r12]
907	movdqa	xmm3,XMMWORD[48+r12]
908	pand	xmm0,XMMWORD[rdx]
909	pand	xmm1,XMMWORD[16+rdx]
910	por	xmm4,xmm0
911	pand	xmm2,XMMWORD[32+rdx]
912	por	xmm5,xmm1
913	pand	xmm3,XMMWORD[48+rdx]
914	por	xmm4,xmm2
915	por	xmm5,xmm3
916	movdqa	xmm0,XMMWORD[64+r12]
917	movdqa	xmm1,XMMWORD[80+r12]
918	movdqa	xmm2,XMMWORD[96+r12]
919	movdqa	xmm3,XMMWORD[112+r12]
920	pand	xmm0,XMMWORD[64+rdx]
921	pand	xmm1,XMMWORD[80+rdx]
922	por	xmm4,xmm0
923	pand	xmm2,XMMWORD[96+rdx]
924	por	xmm5,xmm1
925	pand	xmm3,XMMWORD[112+rdx]
926	por	xmm4,xmm2
927	por	xmm5,xmm3
928	por	xmm4,xmm5
929	pshufd	xmm0,xmm4,0x4e
930	por	xmm0,xmm4
931	lea	r12,[256+r12]
932DB	102,72,15,126,195	; encodes movq rbx,xmm0
933
; r10 = word 0 of the temporary (r14 points one past its last word and
; r9 is negative). The carry in rdi is stored at [r14], then r14 is
; rewound to the start of the temporary.
934	mov	r10,QWORD[r9*1+r14]
935	mov	rbp,r8
936	mul	rbx
937	add	r10,rax
938	mov	rax,QWORD[rcx]
939	adc	rdx,0
940
941	imul	rbp,r10
942	mov	r11,rdx
943	mov	QWORD[r14],rdi
944
945	lea	r14,[r9*1+r14]
946
947	mul	rbp
948	add	r10,rax
949	mov	rax,QWORD[8+r9*1+rsi]
950	adc	rdx,0
951	mov	rdi,rdx
952
953	mul	rbx
954	add	r11,rax
955	mov	rax,QWORD[8+rcx]
956	adc	rdx,0
957	add	r11,QWORD[8+r14]
958	adc	rdx,0
959	mov	r10,rdx
960
961	mul	rbp
962	add	rdi,rax
963	mov	rax,QWORD[16+r9*1+rsi]
964	adc	rdx,0
965	add	rdi,r11
966	lea	r15,[32+r9]
967	lea	rcx,[32+rcx]
968	adc	rdx,0
969	mov	r13,rdx
970	jmp	NEAR $L$inner4x
971
972ALIGN	32
973$L$inner4x:
; Same four-word step as $L$1st4x, with the previous temporary words
; from [r14] added in; results are written one word slot lower.
974	mul	rbx
975	add	r10,rax
976	mov	rax,QWORD[((-16))+rcx]
977	adc	rdx,0
978	add	r10,QWORD[16+r14]
979	lea	r14,[32+r14]
980	adc	rdx,0
981	mov	r11,rdx
982
983	mul	rbp
984	add	r13,rax
985	mov	rax,QWORD[((-8))+r15*1+rsi]
986	adc	rdx,0
987	add	r13,r10
988	adc	rdx,0
989	mov	QWORD[((-32))+r14],rdi
990	mov	rdi,rdx
991
992	mul	rbx
993	add	r11,rax
994	mov	rax,QWORD[((-8))+rcx]
995	adc	rdx,0
996	add	r11,QWORD[((-8))+r14]
997	adc	rdx,0
998	mov	r10,rdx
999
1000	mul	rbp
1001	add	rdi,rax
1002	mov	rax,QWORD[r15*1+rsi]
1003	adc	rdx,0
1004	add	rdi,r11
1005	adc	rdx,0
1006	mov	QWORD[((-24))+r14],r13
1007	mov	r13,rdx
1008
1009	mul	rbx
1010	add	r10,rax
1011	mov	rax,QWORD[rcx]
1012	adc	rdx,0
1013	add	r10,QWORD[r14]
1014	adc	rdx,0
1015	mov	r11,rdx
1016
1017	mul	rbp
1018	add	r13,rax
1019	mov	rax,QWORD[8+r15*1+rsi]
1020	adc	rdx,0
1021	add	r13,r10
1022	adc	rdx,0
1023	mov	QWORD[((-16))+r14],rdi
1024	mov	rdi,rdx
1025
1026	mul	rbx
1027	add	r11,rax
1028	mov	rax,QWORD[8+rcx]
1029	adc	rdx,0
1030	add	r11,QWORD[8+r14]
1031	adc	rdx,0
1032	mov	r10,rdx
1033
1034	mul	rbp
1035	add	rdi,rax
1036	mov	rax,QWORD[16+r15*1+rsi]
1037	adc	rdx,0
1038	add	rdi,r11
1039	lea	rcx,[32+rcx]
1040	adc	rdx,0
1041	mov	QWORD[((-8))+r14],r13
1042	mov	r13,rdx
1043
1044	add	r15,32
1045	jnz	NEAR $L$inner4x
1046
; Tail of the pass (last two words).
1047	mul	rbx
1048	add	r10,rax
1049	mov	rax,QWORD[((-16))+rcx]
1050	adc	rdx,0
1051	add	r10,QWORD[16+r14]
1052	lea	r14,[32+r14]
1053	adc	rdx,0
1054	mov	r11,rdx
1055
1056	mul	rbp
1057	add	r13,rax
1058	mov	rax,QWORD[((-8))+rsi]
1059	adc	rdx,0
1060	add	r13,r10
1061	adc	rdx,0
1062	mov	QWORD[((-32))+r14],rdi
1063	mov	rdi,rdx
1064
1065	mul	rbx
1066	add	r11,rax
; Operands are swapped for the last product: rax takes the reduction
; factor and rbp is loaded with the top [rcx] word, so that word is still
; in rbp for the comparison after the loop. mov does not disturb CF.
1067	mov	rax,rbp
1068	mov	rbp,QWORD[((-8))+rcx]
1069	adc	rdx,0
1070	add	r11,QWORD[((-8))+r14]
1071	adc	rdx,0
1072	mov	r10,rdx
1073
1074	mul	rbp
1075	add	rdi,rax
1076	mov	rax,QWORD[r9*1+rsi]
1077	adc	rdx,0
1078	add	rdi,r11
1079	adc	rdx,0
1080	mov	QWORD[((-24))+r14],r13
1081	mov	r13,rdx
1082
1083	mov	QWORD[((-16))+r14],rdi
1084	lea	rcx,[r9*1+rcx]
1085
; Top word = r13 + r10 + previous carry word at [r14]; rdi collects the
; carries out of it.
1086	xor	rdi,rdi
1087	add	r13,r10
1088	adc	rdi,0
1089	add	r13,QWORD[r14]
1090	adc	rdi,0
1091	mov	QWORD[((-8))+r14],r13
1092
; Loop until r12 reaches the table end marker saved at [rsp+24].
1093	cmp	r12,QWORD[((16+8))+rsp]
1094	jb	NEAR $L$outer4x
; rax = 0 - (rdi | borrow), where borrow = CF of (top [rcx] word - r13)
; and r15 is zero on loop exit: rax is non-zero when a carry was produced
; or the top result word exceeds the top [rcx] word.
1095	xor	rax,rax
1096	sub	rbp,r13
1097	adc	r15,r15
1098	or	rdi,r15
1099	sub	rax,rdi
; Set up registers for the code at $L$sqr4x_sub_entry (outside this
; excerpt): rbx = start of the temporary, rbp = rcx, rcx = r9 >> 5
; (arithmetic shift of the negative byte length), rdi = value saved at
; [rsp+64], r12 = [rbp]-1, r13..r15 = [rbp+8..24], r10 = 0.
1100	lea	rbx,[r9*1+r14]
1101	mov	r12,QWORD[rcx]
1102	lea	rbp,[rcx]
1103	mov	rcx,r9
1104	sar	rcx,3+2
1105	mov	rdi,QWORD[((56+8))+rsp]
1106	dec	r12
1107	xor	r10,r10
1108	mov	r13,QWORD[8+rbp]
1109	mov	r14,QWORD[16+rbp]
1110	mov	r15,QWORD[24+rbp]
; Tail jump: the return to mul4x_internal's caller happens from there.
1111	jmp	NEAR $L$sqr4x_sub_entry
1112
1113
1114global	GFp_bn_power5
1115
1116ALIGN	32
;-----------------------------------------------------------------------
; GFp_bn_power5 (Win64 entry point)
;
; Same Win64 argument remapping as the multiply entry points
; (rcx,rdx,r8,r9,[rsp+40],[rsp+48] -> rdi,rsi,rdx,rcx,r8,r9; the selector
; at [rsp+56] is read later by mul4x_internal through rax).
; Runs __bn_sqr8x_internal + __bn_post4x_internal five times and then
; mul4x_internal once, restores rbx/rbp/r12-r15/rdi/rsi and returns
; rax = 1.
; NOTE(review): presumably this raises the input to the 32nd power and
; multiplies by a gathered table entry -- confirm against the C caller.
;-----------------------------------------------------------------------
1117GFp_bn_power5:
1118	mov	QWORD[8+rsp],rdi	;WIN64 prologue
1119	mov	QWORD[16+rsp],rsi
1120	mov	rax,rsp
1121$L$SEH_begin_GFp_bn_power5:
1122	mov	rdi,rcx
1123	mov	rsi,rdx
1124	mov	rdx,r8
1125	mov	rcx,r9
1126	mov	r8,QWORD[40+rsp]
1127	mov	r9,QWORD[48+rsp]
1128
1129
1130
; rax = entry-time rsp, stored in the frame once it has been built.
1131	mov	rax,rsp
1132
; If all bits of 0x80108 are set in DWORD[GFp_ia32cap_P+8], use
; $L$powerx5_enter instead (that label is outside this excerpt).
; NOTE(review): presumably the BMI1/BMI2/ADX feature bits -- confirm.
1133	lea	r11,[GFp_ia32cap_P]
1134	mov	r11d,DWORD[8+r11]
1135	and	r11d,0x80108
1136	cmp	r11d,0x80108
1137	je	NEAR $L$powerx5_enter
; Save the non-volatile general-purpose registers.
1138	push	rbx
1139
1140	push	rbp
1141
1142	push	r12
1143
1144	push	r13
1145
1146	push	r14
1147
1148	push	r15
1149
1150$L$power5_prologue:
1151
; r9 = 8*r9d (byte length), r10 = 3*r9 (32-bit lea clears the upper
; half), r9 = -r9, and r8 is replaced by the qword it points to.
1152	shl	r9d,3
1153	lea	r10d,[r9*2+r9]
1154	neg	r9
1155	mov	r8,QWORD[r8]
1156
1157
1158
1159
1160
1161
1162
1163
; Frame placement, identical in shape to bn_mul4x_mont_gather5: the
; frame is positioned relative to rdi modulo 4096, aligned to 64 bytes,
; and rsp is lowered to it one 4096-byte step at a time.
1164	lea	r11,[((-320))+r9*2+rsp]
1165	mov	rbp,rsp
1166	sub	r11,rdi
1167	and	r11,4095
1168	cmp	r10,r11
1169	jb	NEAR $L$pwr_sp_alt
1170	sub	rbp,r11
1171	lea	rbp,[((-320))+r9*2+rbp]
1172	jmp	NEAR $L$pwr_sp_done
1173
1174ALIGN	32
1175$L$pwr_sp_alt:
1176	lea	r10,[((4096-320))+r9*2]
1177	lea	rbp,[((-320))+r9*2+rbp]
1178	sub	r11,r10
1179	mov	r10,0
1180	cmovc	r11,r10
1181	sub	rbp,r11
1182$L$pwr_sp_done:
1183	and	rbp,-64
1184	mov	r11,rsp
1185	sub	r11,rbp
1186	and	r11,-4096
1187	lea	rsp,[rbp*1+r11]
1188	mov	r10,QWORD[rsp]
1189	cmp	rsp,rbp
1190	ja	NEAR $L$pwr_page_walk
1191	jmp	NEAR $L$pwr_page_walk_done
1192
1193$L$pwr_page_walk:
1194	lea	rsp,[((-4096))+rsp]
1195	mov	r10,QWORD[rsp]
1196	cmp	rsp,rbp
1197	ja	NEAR $L$pwr_page_walk
1198$L$pwr_page_walk_done:
1199
; r10 = negative byte length, r9 = positive byte length.
1200	mov	r10,r9
1201	neg	r9
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
; Frame slots: [rsp+32] = qword loaded through r8, [rsp+40] = entry-time rsp.
1212	mov	QWORD[32+rsp],r8
1213	mov	QWORD[40+rsp],rax
1214
1215$L$power5_body:
; Park four registers in xmm1..xmm4 across the calls below.
1216DB	102,72,15,110,207	; encodes movq xmm1,rdi
1217DB	102,72,15,110,209	; encodes movq xmm2,rcx
1218DB	102,73,15,110,218	; encodes movq xmm3,r10
1219DB	102,72,15,110,226	; encodes movq xmm4,rdx
1220
; Five rounds of __bn_sqr8x_internal followed by __bn_post4x_internal
; (the latter is defined outside this excerpt).
1221	call	__bn_sqr8x_internal
1222	call	__bn_post4x_internal
1223	call	__bn_sqr8x_internal
1224	call	__bn_post4x_internal
1225	call	__bn_sqr8x_internal
1226	call	__bn_post4x_internal
1227	call	__bn_sqr8x_internal
1228	call	__bn_post4x_internal
1229	call	__bn_sqr8x_internal
1230	call	__bn_post4x_internal
1231
; Set up mul4x_internal's inputs: rcx and rdx from the parked copies,
; rdi = rsi, rax = entry-time rsp, and r8 = address of the frame slot
; because mul4x_internal dereferences r8 itself.
1232DB	102,72,15,126,209	; encodes movq rcx,xmm2
1233DB	102,72,15,126,226	; encodes movq rdx,xmm4
1234	mov	rdi,rsi
1235	mov	rax,QWORD[40+rsp]
1236	lea	r8,[32+rsp]
1237
1238	call	mul4x_internal
1239
1240	mov	rsi,QWORD[40+rsp]
1241
; Return value.
1242	mov	rax,1
; Reload the six pushed registers from just below the entry-time rsp.
1243	mov	r15,QWORD[((-48))+rsi]
1244
1245	mov	r14,QWORD[((-40))+rsi]
1246
1247	mov	r13,QWORD[((-32))+rsi]
1248
1249	mov	r12,QWORD[((-24))+rsi]
1250
1251	mov	rbp,QWORD[((-16))+rsi]
1252
1253	mov	rbx,QWORD[((-8))+rsi]
1254
1255	lea	rsp,[rsi]
1256
1257$L$power5_epilogue:
1258	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1259	mov	rsi,QWORD[16+rsp]
1260	DB	0F3h,0C3h		;repret
1261
1262$L$SEH_end_GFp_bn_power5:
1263
1264global	GFp_bn_sqr8x_internal
1265
1266
1267ALIGN	32
1268GFp_bn_sqr8x_internal:
1269__bn_sqr8x_internal:
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344	lea	rbp,[32+r10]
1345	lea	rsi,[r9*1+rsi]
1346
1347	mov	rcx,r9
1348
1349
1350	mov	r14,QWORD[((-32))+rbp*1+rsi]
1351	lea	rdi,[((48+8))+r9*2+rsp]
1352	mov	rax,QWORD[((-24))+rbp*1+rsi]
1353	lea	rdi,[((-32))+rbp*1+rdi]
1354	mov	rbx,QWORD[((-16))+rbp*1+rsi]
1355	mov	r15,rax
1356
1357	mul	r14
1358	mov	r10,rax
1359	mov	rax,rbx
1360	mov	r11,rdx
1361	mov	QWORD[((-24))+rbp*1+rdi],r10
1362
1363	mul	r14
1364	add	r11,rax
1365	mov	rax,rbx
1366	adc	rdx,0
1367	mov	QWORD[((-16))+rbp*1+rdi],r11
1368	mov	r10,rdx
1369
1370
1371	mov	rbx,QWORD[((-8))+rbp*1+rsi]
1372	mul	r15
1373	mov	r12,rax
1374	mov	rax,rbx
1375	mov	r13,rdx
1376
1377	lea	rcx,[rbp]
1378	mul	r14
1379	add	r10,rax
1380	mov	rax,rbx
1381	mov	r11,rdx
1382	adc	r11,0
1383	add	r10,r12
1384	adc	r11,0
1385	mov	QWORD[((-8))+rcx*1+rdi],r10
1386	jmp	NEAR $L$sqr4x_1st
1387
1388ALIGN	32
1389$L$sqr4x_1st:
1390	mov	rbx,QWORD[rcx*1+rsi]
1391	mul	r15
1392	add	r13,rax
1393	mov	rax,rbx
1394	mov	r12,rdx
1395	adc	r12,0
1396
1397	mul	r14
1398	add	r11,rax
1399	mov	rax,rbx
1400	mov	rbx,QWORD[8+rcx*1+rsi]
1401	mov	r10,rdx
1402	adc	r10,0
1403	add	r11,r13
1404	adc	r10,0
1405
1406
1407	mul	r15
1408	add	r12,rax
1409	mov	rax,rbx
1410	mov	QWORD[rcx*1+rdi],r11
1411	mov	r13,rdx
1412	adc	r13,0
1413
1414	mul	r14
1415	add	r10,rax
1416	mov	rax,rbx
1417	mov	rbx,QWORD[16+rcx*1+rsi]
1418	mov	r11,rdx
1419	adc	r11,0
1420	add	r10,r12
1421	adc	r11,0
1422
1423	mul	r15
1424	add	r13,rax
1425	mov	rax,rbx
1426	mov	QWORD[8+rcx*1+rdi],r10
1427	mov	r12,rdx
1428	adc	r12,0
1429
1430	mul	r14
1431	add	r11,rax
1432	mov	rax,rbx
1433	mov	rbx,QWORD[24+rcx*1+rsi]
1434	mov	r10,rdx
1435	adc	r10,0
1436	add	r11,r13
1437	adc	r10,0
1438
1439
1440	mul	r15
1441	add	r12,rax
1442	mov	rax,rbx
1443	mov	QWORD[16+rcx*1+rdi],r11
1444	mov	r13,rdx
1445	adc	r13,0
1446	lea	rcx,[32+rcx]
1447
1448	mul	r14
1449	add	r10,rax
1450	mov	rax,rbx
1451	mov	r11,rdx
1452	adc	r11,0
1453	add	r10,r12
1454	adc	r11,0
1455	mov	QWORD[((-8))+rcx*1+rdi],r10
1456
1457	cmp	rcx,0
1458	jne	NEAR $L$sqr4x_1st
1459
1460	mul	r15
1461	add	r13,rax
1462	lea	rbp,[16+rbp]
1463	adc	rdx,0
1464	add	r13,r11
1465	adc	rdx,0
1466
1467	mov	QWORD[rdi],r13
1468	mov	r12,rdx
1469	mov	QWORD[8+rdi],rdx
1470	jmp	NEAR $L$sqr4x_outer
1471
1472ALIGN	32
1473$L$sqr4x_outer:
1474	mov	r14,QWORD[((-32))+rbp*1+rsi]
1475	lea	rdi,[((48+8))+r9*2+rsp]
1476	mov	rax,QWORD[((-24))+rbp*1+rsi]
1477	lea	rdi,[((-32))+rbp*1+rdi]
1478	mov	rbx,QWORD[((-16))+rbp*1+rsi]
1479	mov	r15,rax
1480
1481	mul	r14
1482	mov	r10,QWORD[((-24))+rbp*1+rdi]
1483	add	r10,rax
1484	mov	rax,rbx
1485	adc	rdx,0
1486	mov	QWORD[((-24))+rbp*1+rdi],r10
1487	mov	r11,rdx
1488
1489	mul	r14
1490	add	r11,rax
1491	mov	rax,rbx
1492	adc	rdx,0
1493	add	r11,QWORD[((-16))+rbp*1+rdi]
1494	mov	r10,rdx
1495	adc	r10,0
1496	mov	QWORD[((-16))+rbp*1+rdi],r11
1497
1498	xor	r12,r12
1499
1500	mov	rbx,QWORD[((-8))+rbp*1+rsi]
1501	mul	r15
1502	add	r12,rax
1503	mov	rax,rbx
1504	adc	rdx,0
1505	add	r12,QWORD[((-8))+rbp*1+rdi]
1506	mov	r13,rdx
1507	adc	r13,0
1508
1509	mul	r14
1510	add	r10,rax
1511	mov	rax,rbx
1512	adc	rdx,0
1513	add	r10,r12
1514	mov	r11,rdx
1515	adc	r11,0
1516	mov	QWORD[((-8))+rbp*1+rdi],r10
1517
1518	lea	rcx,[rbp]
1519	jmp	NEAR $L$sqr4x_inner
1520
1521ALIGN	32
1522$L$sqr4x_inner:
1523	mov	rbx,QWORD[rcx*1+rsi]
1524	mul	r15
1525	add	r13,rax
1526	mov	rax,rbx
1527	mov	r12,rdx
1528	adc	r12,0
1529	add	r13,QWORD[rcx*1+rdi]
1530	adc	r12,0
1531
1532DB	0x67
1533	mul	r14
1534	add	r11,rax
1535	mov	rax,rbx
1536	mov	rbx,QWORD[8+rcx*1+rsi]
1537	mov	r10,rdx
1538	adc	r10,0
1539	add	r11,r13
1540	adc	r10,0
1541
1542	mul	r15
1543	add	r12,rax
1544	mov	QWORD[rcx*1+rdi],r11
1545	mov	rax,rbx
1546	mov	r13,rdx
1547	adc	r13,0
1548	add	r12,QWORD[8+rcx*1+rdi]
1549	lea	rcx,[16+rcx]
1550	adc	r13,0
1551
1552	mul	r14
1553	add	r10,rax
1554	mov	rax,rbx
1555	adc	rdx,0
1556	add	r10,r12
1557	mov	r11,rdx
1558	adc	r11,0
1559	mov	QWORD[((-8))+rcx*1+rdi],r10
1560
1561	cmp	rcx,0
1562	jne	NEAR $L$sqr4x_inner
1563
1564DB	0x67
1565	mul	r15
1566	add	r13,rax
1567	adc	rdx,0
1568	add	r13,r11
1569	adc	rdx,0
1570
1571	mov	QWORD[rdi],r13
1572	mov	r12,rdx
1573	mov	QWORD[8+rdi],rdx
1574
1575	add	rbp,16
1576	jnz	NEAR $L$sqr4x_outer
1577
1578
1579	mov	r14,QWORD[((-32))+rsi]
1580	lea	rdi,[((48+8))+r9*2+rsp]
1581	mov	rax,QWORD[((-24))+rsi]
1582	lea	rdi,[((-32))+rbp*1+rdi]
1583	mov	rbx,QWORD[((-16))+rsi]
1584	mov	r15,rax
1585
1586	mul	r14
1587	add	r10,rax
1588	mov	rax,rbx
1589	mov	r11,rdx
1590	adc	r11,0
1591
1592	mul	r14
1593	add	r11,rax
1594	mov	rax,rbx
1595	mov	QWORD[((-24))+rdi],r10
1596	mov	r10,rdx
1597	adc	r10,0
1598	add	r11,r13
1599	mov	rbx,QWORD[((-8))+rsi]
1600	adc	r10,0
1601
1602	mul	r15
1603	add	r12,rax
1604	mov	rax,rbx
1605	mov	QWORD[((-16))+rdi],r11
1606	mov	r13,rdx
1607	adc	r13,0
1608
1609	mul	r14
1610	add	r10,rax
1611	mov	rax,rbx
1612	mov	r11,rdx
1613	adc	r11,0
1614	add	r10,r12
1615	adc	r11,0
1616	mov	QWORD[((-8))+rdi],r10
1617
1618	mul	r15
1619	add	r13,rax
1620	mov	rax,QWORD[((-16))+rsi]
1621	adc	rdx,0
1622	add	r13,r11
1623	adc	rdx,0
1624
1625	mov	QWORD[rdi],r13
1626	mov	r12,rdx
1627	mov	QWORD[8+rdi],rdx
1628
1629	mul	rbx
1630	add	rbp,16
1631	xor	r14,r14
1632	sub	rbp,r9
1633	xor	r15,r15
1634
1635	add	rax,r12
1636	adc	rdx,0
1637	mov	QWORD[8+rdi],rax
1638	mov	QWORD[16+rdi],rdx
1639	mov	QWORD[24+rdi],r15
1640
1641	mov	rax,QWORD[((-16))+rbp*1+rsi]
1642	lea	rdi,[((48+8))+rsp]
1643	xor	r10,r10
1644	mov	r11,QWORD[8+rdi]
1645
1646	lea	r12,[r10*2+r14]
1647	shr	r10,63
1648	lea	r13,[r11*2+rcx]
1649	shr	r11,63
1650	or	r13,r10
1651	mov	r10,QWORD[16+rdi]
1652	mov	r14,r11
1653	mul	rax
1654	neg	r15
1655	mov	r11,QWORD[24+rdi]
1656	adc	r12,rax
1657	mov	rax,QWORD[((-8))+rbp*1+rsi]
1658	mov	QWORD[rdi],r12
1659	adc	r13,rdx
1660
1661	lea	rbx,[r10*2+r14]
1662	mov	QWORD[8+rdi],r13
1663	sbb	r15,r15
1664	shr	r10,63
1665	lea	r8,[r11*2+rcx]
1666	shr	r11,63
1667	or	r8,r10
1668	mov	r10,QWORD[32+rdi]
1669	mov	r14,r11
1670	mul	rax
1671	neg	r15
1672	mov	r11,QWORD[40+rdi]
1673	adc	rbx,rax
1674	mov	rax,QWORD[rbp*1+rsi]
1675	mov	QWORD[16+rdi],rbx
1676	adc	r8,rdx
1677	lea	rbp,[16+rbp]
1678	mov	QWORD[24+rdi],r8
1679	sbb	r15,r15
1680	lea	rdi,[64+rdi]
1681	jmp	NEAR $L$sqr4x_shift_n_add
1682
1683ALIGN	32
1684$L$sqr4x_shift_n_add:
1685	lea	r12,[r10*2+r14]
1686	shr	r10,63
1687	lea	r13,[r11*2+rcx]
1688	shr	r11,63
1689	or	r13,r10
1690	mov	r10,QWORD[((-16))+rdi]
1691	mov	r14,r11
1692	mul	rax
1693	neg	r15
1694	mov	r11,QWORD[((-8))+rdi]
1695	adc	r12,rax
1696	mov	rax,QWORD[((-8))+rbp*1+rsi]
1697	mov	QWORD[((-32))+rdi],r12
1698	adc	r13,rdx
1699
1700	lea	rbx,[r10*2+r14]
1701	mov	QWORD[((-24))+rdi],r13
1702	sbb	r15,r15
1703	shr	r10,63
1704	lea	r8,[r11*2+rcx]
1705	shr	r11,63
1706	or	r8,r10
1707	mov	r10,QWORD[rdi]
1708	mov	r14,r11
1709	mul	rax
1710	neg	r15
1711	mov	r11,QWORD[8+rdi]
1712	adc	rbx,rax
1713	mov	rax,QWORD[rbp*1+rsi]
1714	mov	QWORD[((-16))+rdi],rbx
1715	adc	r8,rdx
1716
1717	lea	r12,[r10*2+r14]
1718	mov	QWORD[((-8))+rdi],r8
1719	sbb	r15,r15
1720	shr	r10,63
1721	lea	r13,[r11*2+rcx]
1722	shr	r11,63
1723	or	r13,r10
1724	mov	r10,QWORD[16+rdi]
1725	mov	r14,r11
1726	mul	rax
1727	neg	r15
1728	mov	r11,QWORD[24+rdi]
1729	adc	r12,rax
1730	mov	rax,QWORD[8+rbp*1+rsi]
1731	mov	QWORD[rdi],r12
1732	adc	r13,rdx
1733
1734	lea	rbx,[r10*2+r14]
1735	mov	QWORD[8+rdi],r13
1736	sbb	r15,r15
1737	shr	r10,63
1738	lea	r8,[r11*2+rcx]
1739	shr	r11,63
1740	or	r8,r10
1741	mov	r10,QWORD[32+rdi]
1742	mov	r14,r11
1743	mul	rax
1744	neg	r15
1745	mov	r11,QWORD[40+rdi]
1746	adc	rbx,rax
1747	mov	rax,QWORD[16+rbp*1+rsi]
1748	mov	QWORD[16+rdi],rbx
1749	adc	r8,rdx
1750	mov	QWORD[24+rdi],r8
1751	sbb	r15,r15
1752	lea	rdi,[64+rdi]
1753	add	rbp,32
1754	jnz	NEAR $L$sqr4x_shift_n_add
1755
1756	lea	r12,[r10*2+r14]
1757DB	0x67
1758	shr	r10,63
1759	lea	r13,[r11*2+rcx]
1760	shr	r11,63
1761	or	r13,r10
1762	mov	r10,QWORD[((-16))+rdi]
1763	mov	r14,r11
1764	mul	rax
1765	neg	r15
1766	mov	r11,QWORD[((-8))+rdi]
1767	adc	r12,rax
1768	mov	rax,QWORD[((-8))+rsi]
1769	mov	QWORD[((-32))+rdi],r12
1770	adc	r13,rdx
1771
1772	lea	rbx,[r10*2+r14]
1773	mov	QWORD[((-24))+rdi],r13
1774	sbb	r15,r15
1775	shr	r10,63
1776	lea	r8,[r11*2+rcx]
1777	shr	r11,63
1778	or	r8,r10
1779	mul	rax
1780	neg	r15
1781	adc	rbx,rax
1782	adc	r8,rdx
1783	mov	QWORD[((-16))+rdi],rbx
1784	mov	QWORD[((-8))+rdi],r8
1785DB	102,72,15,126,213
; __bn_sqr8x_reduction
; Reduction pass over the temporary vector that starts at [48+8+rsp],
; processed in windows of eight 64-bit words.  Entered by fall-through from
; the code directly above and by a direct call from bn_from_mont8x.
; All "+8" stack offsets account for the return address pushed by the call.
; Registers read on entry (as used below):
;   rbp - base of the word vector that is multiplied in ([rbp]..[56+rbp]);
;         presumably the modulus -- confirm against callers
;   r9  - vector length in bytes (used to form the two end pointers)
; Stack slots:
;   [32+8+rsp] - factor multiplied into the low word by imul (presumably n0)
;   [0+8+rsp]  - written here: rbp+r9, end of the rbp vector
;   [8+8+rsp]  - written here: end of the 2*r9-byte temporary vector
; The eight per-window multipliers are parked in [48+8+rsp]..[48+56+8+rsp].
; On return rax holds the carry produced by the last window; rbp and r9 have
; been reloaded from xmm2 and xmm3.
__bn_sqr8x_reduction:
	xor	rax,rax			; rax = running top-most carry
	lea	rcx,[rbp*1+r9]
	lea	rdx,[((48+8))+r9*2+rsp]
	mov	QWORD[((0+8))+rsp],rcx
	lea	rdi,[((48+8))+r9*1+rsp]
	mov	QWORD[((8+8))+rsp],rdx
	neg	r9			; r9 = -length so the lea below rewinds rdi
	jmp	NEAR $L$8x_reduction_loop

ALIGN	32
; One pass per eight-word window: load the window into rbx,r9..r15.
$L$8x_reduction_loop:
	lea	rdi,[r9*1+rdi]
; 0x66 prefix byte applied to the following mov (presumably alignment padding)
DB	0x66
	mov	rbx,QWORD[rdi]
	mov	r9,QWORD[8+rdi]
	mov	r10,QWORD[16+rdi]
	mov	r11,QWORD[24+rdi]
	mov	r12,QWORD[32+rdi]
	mov	r13,QWORD[40+rdi]
	mov	r14,QWORD[48+rdi]
	mov	r15,QWORD[56+rdi]
	mov	QWORD[rdx],rax		; park the carry from the previous window at [rdx]
	lea	rdi,[64+rdi]

; 0x67 prefix byte applied to the following mov (presumably alignment padding)
DB	0x67
	mov	r8,rbx
	imul	rbx,QWORD[((32+8))+rsp]	; rbx = first multiplier for this window
	mov	rax,QWORD[rbp]
	mov	ecx,8			; eight multipliers per window
	jmp	NEAR $L$8x_reduce

ALIGN	32
; Inner loop: multiply the eight words at [rbp] by rbx and accumulate into
; r8..r15, shifting the accumulator down by one word per iteration.
$L$8x_reduce:
	mul	rbx
	mov	rax,QWORD[8+rbp]
	neg	r8			; only CF = (r8 != 0) is kept; the low product word is discarded
	mov	r8,rdx
	adc	r8,0

	mul	rbx
	add	r9,rax
	mov	rax,QWORD[16+rbp]
	adc	rdx,0
	add	r8,r9
	mov	QWORD[((48-8+8))+rcx*8+rsp],rbx	; park this multiplier for the tail loop
	mov	r9,rdx
	adc	r9,0

	mul	rbx
	add	r10,rax
	mov	rax,QWORD[24+rbp]
	adc	rdx,0
	add	r9,r10
	mov	rsi,QWORD[((32+8))+rsp]	; start computing the next multiplier in rsi
	mov	r10,rdx
	adc	r10,0

	mul	rbx
	add	r11,rax
	mov	rax,QWORD[32+rbp]
	adc	rdx,0
	imul	rsi,r8			; rsi = [32+8+rsp] * new low word
	add	r10,r11
	mov	r11,rdx
	adc	r11,0

	mul	rbx
	add	r12,rax
	mov	rax,QWORD[40+rbp]
	adc	rdx,0
	add	r11,r12
	mov	r12,rdx
	adc	r12,0

	mul	rbx
	add	r13,rax
	mov	rax,QWORD[48+rbp]
	adc	rdx,0
	add	r12,r13
	mov	r13,rdx
	adc	r13,0

	mul	rbx
	add	r14,rax
	mov	rax,QWORD[56+rbp]
	adc	rdx,0
	add	r13,r14
	mov	r14,rdx
	adc	r14,0

	mul	rbx
	mov	rbx,rsi			; switch to the next multiplier
	add	r15,rax
	mov	rax,QWORD[rbp]
	adc	rdx,0
	add	r14,r15
	mov	r15,rdx
	adc	r15,0

	dec	ecx
	jnz	NEAR $L$8x_reduce

	lea	rbp,[64+rbp]
	xor	rax,rax
	mov	rdx,QWORD[((8+8))+rsp]
	cmp	rbp,QWORD[((0+8))+rsp]	; reached the end of the rbp vector?
	jae	NEAR $L$8x_no_tail

; 0x66 prefix byte applied to the following add (presumably alignment padding)
DB	0x66
	add	r8,QWORD[rdi]		; fold in the next eight words of the temporary vector
	adc	r9,QWORD[8+rdi]
	adc	r10,QWORD[16+rdi]
	adc	r11,QWORD[24+rdi]
	adc	r12,QWORD[32+rdi]
	adc	r13,QWORD[40+rdi]
	adc	r14,QWORD[48+rdi]
	adc	r15,QWORD[56+rdi]
	sbb	rsi,rsi			; rsi = -CF, carry kept across the tail loop

	mov	rbx,QWORD[((48+56+8))+rsp]	; first parked multiplier
	mov	ecx,8
	mov	rax,QWORD[rbp]
	jmp	NEAR $L$8x_tail

ALIGN	32
; Tail loop: apply the eight parked multipliers to the next eight words at
; [rbp], storing one finished word to [rdi] per iteration.
$L$8x_tail:
	mul	rbx
	add	r8,rax
	mov	rax,QWORD[8+rbp]
	mov	QWORD[rdi],r8
	mov	r8,rdx
	adc	r8,0

	mul	rbx
	add	r9,rax
	mov	rax,QWORD[16+rbp]
	adc	rdx,0
	add	r8,r9
	lea	rdi,[8+rdi]
	mov	r9,rdx
	adc	r9,0

	mul	rbx
	add	r10,rax
	mov	rax,QWORD[24+rbp]
	adc	rdx,0
	add	r9,r10
	mov	r10,rdx
	adc	r10,0

	mul	rbx
	add	r11,rax
	mov	rax,QWORD[32+rbp]
	adc	rdx,0
	add	r10,r11
	mov	r11,rdx
	adc	r11,0

	mul	rbx
	add	r12,rax
	mov	rax,QWORD[40+rbp]
	adc	rdx,0
	add	r11,r12
	mov	r12,rdx
	adc	r12,0

	mul	rbx
	add	r13,rax
	mov	rax,QWORD[48+rbp]
	adc	rdx,0
	add	r12,r13
	mov	r13,rdx
	adc	r13,0

	mul	rbx
	add	r14,rax
	mov	rax,QWORD[56+rbp]
	adc	rdx,0
	add	r13,r14
	mov	r14,rdx
	adc	r14,0

	mul	rbx
	mov	rbx,QWORD[((48-16+8))+rcx*8+rsp]	; next parked multiplier
	add	r15,rax
	adc	rdx,0
	add	r14,r15
	mov	rax,QWORD[rbp]
	mov	r15,rdx
	adc	r15,0

	dec	ecx
	jnz	NEAR $L$8x_tail

	lea	rbp,[64+rbp]
	mov	rdx,QWORD[((8+8))+rsp]
	cmp	rbp,QWORD[((0+8))+rsp]
	jae	NEAR $L$8x_tail_done

	mov	rbx,QWORD[((48+56+8))+rsp]
	neg	rsi			; restore CF from the saved carry
	mov	rax,QWORD[rbp]
	adc	r8,QWORD[rdi]
	adc	r9,QWORD[8+rdi]
	adc	r10,QWORD[16+rdi]
	adc	r11,QWORD[24+rdi]
	adc	r12,QWORD[32+rdi]
	adc	r13,QWORD[40+rdi]
	adc	r14,QWORD[48+rdi]
	adc	r15,QWORD[56+rdi]
	sbb	rsi,rsi

	mov	ecx,8
	jmp	NEAR $L$8x_tail

ALIGN	32
$L$8x_tail_done:
	xor	rax,rax
	add	r8,QWORD[rdx]		; add the carry parked at [rdx] by the loop head
	adc	r9,0
	adc	r10,0
	adc	r11,0
	adc	r12,0
	adc	r13,0
	adc	r14,0
	adc	r15,0
	adc	rax,0

	neg	rsi			; restore CF from the saved carry
$L$8x_no_tail:
	adc	r8,QWORD[rdi]
	adc	r9,QWORD[8+rdi]
	adc	r10,QWORD[16+rdi]
	adc	r11,QWORD[24+rdi]
	adc	r12,QWORD[32+rdi]
	adc	r13,QWORD[40+rdi]
	adc	r14,QWORD[48+rdi]
	adc	r15,QWORD[56+rdi]
	adc	rax,0			; rax = carry out of this window
	mov	rcx,QWORD[((-8))+rbp]
	xor	rsi,rsi

; encoded instruction: movq rbp,xmm2 (reload the vector base saved by the caller)
DB	102,72,15,126,213

	mov	QWORD[rdi],r8
	mov	QWORD[8+rdi],r9
; encoded instruction: movq r9,xmm3 (r9 was used as a data register above)
DB	102,73,15,126,217
	mov	QWORD[16+rdi],r10
	mov	QWORD[24+rdi],r11
	mov	QWORD[32+rdi],r12
	mov	QWORD[40+rdi],r13
	mov	QWORD[48+rdi],r14
	mov	QWORD[56+rdi],r15
	lea	rdi,[64+rdi]

	cmp	rdi,rdx			; loop until rdi reaches the end pointer in rdx
	jb	NEAR $L$8x_reduction_loop
	DB	0F3h,0C3h		;repret
2045
2046
2047
2048ALIGN	32
; __bn_post4x_internal
; Masked subtraction, four words per iteration: stores
; [rbx] + (~[rbp] & mask) + borrow-in to the destination, where mask = -rax.
; Registers read on entry (as used below):
;   rbp  - vector being conditionally subtracted (presumably the modulus)
;   rdi  - end of the source vector; rbx = rdi + r9 is its start
;   r9   - negative byte length (rcx = r9 >> 5 counts up to zero)
;   rax  - selector; negated to form the AND mask
;   xmm1 - destination pointer, moved into rdi and rsi
; On return r10 = the incoming r9 and r9 has been negated.
__bn_post4x_internal:

	mov	r12,QWORD[rbp]
	lea	rbx,[r9*1+rdi]
	mov	rcx,r9
; encoded instruction: movq rdi,xmm1
DB	102,72,15,126,207
	neg	rax			; selector -> AND mask
; encoded instruction: movq rsi,xmm1
DB	102,72,15,126,206
	sar	rcx,3+2			; bytes -> groups of four words (still negative)
	dec	r12			; ~(x-1) == -x: folds the initial +1 into word 0
	xor	r10,r10			; no carry-in for the first group
	mov	r13,QWORD[8+rbp]
	mov	r14,QWORD[16+rbp]
	mov	r15,QWORD[24+rbp]
	jmp	NEAR $L$sqr4x_sub_entry

ALIGN	16
$L$sqr4x_sub:
	mov	r12,QWORD[rbp]
	mov	r13,QWORD[8+rbp]
	mov	r14,QWORD[16+rbp]
	mov	r15,QWORD[24+rbp]
$L$sqr4x_sub_entry:
	lea	rbp,[32+rbp]
	not	r12
	not	r13
	not	r14
	not	r15
	and	r12,rax
	and	r13,rax
	and	r14,rax
	and	r15,rax

	neg	r10			; CF = carry saved from the previous group
	adc	r12,QWORD[rbx]
	adc	r13,QWORD[8+rbx]
	adc	r14,QWORD[16+rbx]
	adc	r15,QWORD[24+rbx]
	mov	QWORD[rdi],r12
	lea	rbx,[32+rbx]
	mov	QWORD[8+rdi],r13
	sbb	r10,r10			; save CF as 0 / -1 (mov and lea leave flags alone)
	mov	QWORD[16+rdi],r14
	mov	QWORD[24+rdi],r15
	lea	rdi,[32+rdi]

	inc	rcx
	jnz	NEAR $L$sqr4x_sub

	mov	r10,r9
	neg	r9
	DB	0F3h,0C3h		;repret
2101
2102
2103global	GFp_bn_from_montgomery
2104
2105ALIGN	32
; GFp_bn_from_montgomery (exported)
; Dispatcher: tests the low three bits of the dword at [48+rsp], which is the
; same stack slot bn_from_mont8x loads into r9 as its sixth argument.
;   - multiple of 8: tail-jumps to bn_from_mont8x (no frame is set up here)
;   - otherwise:     returns 0 in eax
GFp_bn_from_montgomery:

	test	DWORD[48+rsp],7
	jz	NEAR bn_from_mont8x
	xor	eax,eax
	DB	0F3h,0C3h		;repret
2112
2113
2114
2115
2116ALIGN	32
; bn_from_mont8x
; Win64 entry reached by jump from GFp_bn_from_montgomery.  The prologue
; spills rdi/rsi into the caller's home slots and moves the Win64 arguments
; (rcx, rdx, r8, r9, [40+rsp], [48+rsp]) into rdi, rsi, rdx, rcx, r8, r9.
; It then builds a 64-byte-aligned scratch frame, copies the input at rsi
; into it with the upper half zeroed, runs a reduction plus masked
; subtraction (MULX/ADX variants when the capability bits allow), wipes the
; scratch area and returns 1 in rax.
; Frame slots set here: [32+rsp] = word loaded from [r8], [40+rsp] = rsp at
; entry (used by the epilogue to find the saved registers).
bn_from_mont8x:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_bn_from_mont8x:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]
	mov	r9,QWORD[48+rsp]



; 0x67 prefix byte applied to the following mov (presumably alignment padding)
DB	0x67
	mov	rax,rsp			; rax = rsp at entry, kept for the epilogue

	push	rbx

	push	rbp

	push	r12

	push	r13

	push	r14

	push	r15

$L$from_prologue:

	shl	r9d,3			; word count -> byte count
	lea	r10,[r9*2+r9]		; r10 = 3 * byte count
	neg	r9
	mov	r8,QWORD[r8]		; dereference the fifth argument








; Choose the frame address from the distance (mod 4096) between the
; candidate frame and rdi; presumably this avoids aliasing the output
; buffer modulo the page size.
	lea	r11,[((-320))+r9*2+rsp]
	mov	rbp,rsp
	sub	r11,rdi
	and	r11,4095
	cmp	r10,r11
	jb	NEAR $L$from_sp_alt
	sub	rbp,r11
	lea	rbp,[((-320))+r9*2+rbp]
	jmp	NEAR $L$from_sp_done

ALIGN	32
$L$from_sp_alt:
	lea	r10,[((4096-320))+r9*2]
	lea	rbp,[((-320))+r9*2+rbp]
	sub	r11,r10
	mov	r10,0			; mov leaves CF from the sub intact for cmovc
	cmovc	r11,r10
	sub	rbp,r11
$L$from_sp_done:
	and	rbp,-64			; 64-byte align the new frame
; Step rsp down to rbp at most 4096 bytes at a time, reading one qword at
; each step so every intervening stack page is touched.
	mov	r11,rsp
	sub	r11,rbp
	and	r11,-4096
	lea	rsp,[rbp*1+r11]
	mov	r10,QWORD[rsp]
	cmp	rsp,rbp
	ja	NEAR $L$from_page_walk
	jmp	NEAR $L$from_page_walk_done

$L$from_page_walk:
	lea	rsp,[((-4096))+rsp]
	mov	r10,QWORD[rsp]
	cmp	rsp,rbp
	ja	NEAR $L$from_page_walk
$L$from_page_walk_done:

	mov	r10,r9			; r10 = -bytes
	neg	r9			; r9  = +bytes










	mov	QWORD[32+rsp],r8
	mov	QWORD[40+rsp],rax

$L$from_body:
	mov	r11,r9
	lea	rax,[48+rsp]
	pxor	xmm0,xmm0
	jmp	NEAR $L$mul_by_1

ALIGN	32
; Copy 64 input bytes per iteration to [rax] and zero the 64 bytes at
; [r9+rax], i.e. r9 bytes further up the scratch vector.
$L$mul_by_1:
	movdqu	xmm1,XMMWORD[rsi]
	movdqu	xmm2,XMMWORD[16+rsi]
	movdqu	xmm3,XMMWORD[32+rsi]
	movdqa	XMMWORD[r9*1+rax],xmm0
	movdqu	xmm4,XMMWORD[48+rsi]
	movdqa	XMMWORD[16+r9*1+rax],xmm0
; encoded instruction: lea rsi,[64+rsi] (long disp32 form)
DB	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
	movdqa	XMMWORD[rax],xmm1
	movdqa	XMMWORD[32+r9*1+rax],xmm0
	movdqa	XMMWORD[16+rax],xmm2
	movdqa	XMMWORD[48+r9*1+rax],xmm0
	movdqa	XMMWORD[32+rax],xmm3
	movdqa	XMMWORD[48+rax],xmm4
	lea	rax,[64+rax]
	sub	r11,64
	jnz	NEAR $L$mul_by_1

; encoded instruction: movq xmm1,rdi
DB	102,72,15,110,207
; encoded instruction: movq xmm2,rcx
DB	102,72,15,110,209
; 0x67 prefix byte applied to the following mov (presumably alignment padding)
DB	0x67
	mov	rbp,rcx
; encoded instruction: movq xmm3,r10
DB	102,73,15,110,218
; Pick the code path from the capability word at GFp_ia32cap_P+8: all of
; bits 0x80108 must be set for the MULX/ADX variants.
	lea	r11,[GFp_ia32cap_P]
	mov	r11d,DWORD[8+r11]
	and	r11d,0x80108
	cmp	r11d,0x80108
	jne	NEAR $L$from_mont_nox

	lea	rdi,[r9*1+rax]
	call	__bn_sqrx8x_reduction
	call	__bn_postx4x_internal

	pxor	xmm0,xmm0
	lea	rax,[48+rsp]
	jmp	NEAR $L$from_mont_zero

ALIGN	32
$L$from_mont_nox:
	call	__bn_sqr8x_reduction
	call	__bn_post4x_internal

	pxor	xmm0,xmm0
	lea	rax,[48+rsp]
	jmp	NEAR $L$from_mont_zero

ALIGN	32
; Wipe the scratch vector: 64 bytes are zeroed per 32 subtracted from r9.
$L$from_mont_zero:
	mov	rsi,QWORD[40+rsp]	; rsi = rsp at entry

	movdqa	XMMWORD[rax],xmm0
	movdqa	XMMWORD[16+rax],xmm0
	movdqa	XMMWORD[32+rax],xmm0
	movdqa	XMMWORD[48+rax],xmm0
	lea	rax,[64+rax]
	sub	r9,32
	jnz	NEAR $L$from_mont_zero

	mov	rax,1			; return value
; Reload the six registers pushed in the prologue, then restore rsp.
	mov	r15,QWORD[((-48))+rsi]

	mov	r14,QWORD[((-40))+rsi]

	mov	r13,QWORD[((-32))+rsi]

	mov	r12,QWORD[((-24))+rsi]

	mov	rbp,QWORD[((-16))+rsi]

	mov	rbx,QWORD[((-8))+rsi]

	lea	rsp,[rsi]

$L$from_epilogue:
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	DB	0F3h,0C3h		;repret

$L$SEH_end_bn_from_mont8x:
2297
2298ALIGN	32
; bn_mulx4x_mont_gather5
; Win64 wrapper around mulx4x_internal.  The prologue spills rdi/rsi into
; the caller's home slots and moves the Win64 arguments (rcx, rdx, r8, r9,
; [40+rsp], [48+rsp]) into rdi, rsi, rdx, rcx, r8, r9.  It then builds a
; 64-byte-aligned scratch frame, calls mulx4x_internal and returns 1 in rax.
; $L$mulx4x_enter is a secondary entry label placed after the argument
; shuffle; its users are outside this view.
; Frame slots set here: [32+rsp] = word loaded from [r8], [40+rsp] = rsp at
; entry.
bn_mulx4x_mont_gather5:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_bn_mulx4x_mont_gather5:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]
	mov	r9,QWORD[48+rsp]



	mov	rax,rsp			; rax = rsp at entry, kept for the epilogue

$L$mulx4x_enter:
	push	rbx

	push	rbp

	push	r12

	push	r13

	push	r14

	push	r15

$L$mulx4x_prologue:

	shl	r9d,3			; word count -> byte count
	lea	r10,[r9*2+r9]		; r10 = 3 * byte count
	neg	r9			; r9 = -bytes, the form mulx4x_internal reads
	mov	r8,QWORD[r8]		; dereference the fifth argument










; Choose the frame address from the distance (mod 4096) between the
; candidate frame and rdi; presumably this avoids aliasing the output
; buffer modulo the page size.
	lea	r11,[((-320))+r9*2+rsp]
	mov	rbp,rsp
	sub	r11,rdi
	and	r11,4095
	cmp	r10,r11
	jb	NEAR $L$mulx4xsp_alt
	sub	rbp,r11
	lea	rbp,[((-320))+r9*2+rbp]
	jmp	NEAR $L$mulx4xsp_done

$L$mulx4xsp_alt:
	lea	r10,[((4096-320))+r9*2]
	lea	rbp,[((-320))+r9*2+rbp]
	sub	r11,r10
	mov	r10,0			; mov leaves CF from the sub intact for cmovc
	cmovc	r11,r10
	sub	rbp,r11
$L$mulx4xsp_done:
	and	rbp,-64			; 64-byte align the new frame
; Step rsp down to rbp at most 4096 bytes at a time, reading one qword at
; each step so every intervening stack page is touched.
	mov	r11,rsp
	sub	r11,rbp
	and	r11,-4096
	lea	rsp,[rbp*1+r11]
	mov	r10,QWORD[rsp]
	cmp	rsp,rbp
	ja	NEAR $L$mulx4x_page_walk
	jmp	NEAR $L$mulx4x_page_walk_done

$L$mulx4x_page_walk:
	lea	rsp,[((-4096))+rsp]
	mov	r10,QWORD[rsp]
	cmp	rsp,rbp
	ja	NEAR $L$mulx4x_page_walk
$L$mulx4x_page_walk_done:













	mov	QWORD[32+rsp],r8
	mov	QWORD[40+rsp],rax

$L$mulx4x_body:
	call	mulx4x_internal

	mov	rsi,QWORD[40+rsp]	; rsi = rsp at entry

	mov	rax,1			; return value

; Reload the six registers pushed in the prologue, then restore rsp.
	mov	r15,QWORD[((-48))+rsi]

	mov	r14,QWORD[((-40))+rsi]

	mov	r13,QWORD[((-32))+rsi]

	mov	r12,QWORD[((-24))+rsi]

	mov	rbp,QWORD[((-16))+rsi]

	mov	rbx,QWORD[((-8))+rsi]

	lea	rsp,[rsi]

$L$mulx4x_epilogue:
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	DB	0F3h,0C3h		;repret

$L$SEH_end_bn_mulx4x_mont_gather5:
2421
2422
2423ALIGN	32
; mulx4x_internal
; Multiplication core shared by bn_mulx4x_mont_gather5 and bn_powerx5, built
; on MULX with the ADCX (CF) and ADOX (OF) carry chains, four words per step.
; All "+8" stack offsets account for the return address pushed by the call.
; Registers read on entry (as used below):
;   rdi - saved at [56+8+rsp] and reloaded into rdx just before the exit jump
;   rsi - first operand vector, read four words at a time
;   rdx - base of a table laid out as rows of 16 bytes with a 256-byte
;         stride between successive words of the second operand
;   rcx - vector multiplied by the per-row factor (presumably the modulus)
;   r9  - negative byte length
;   rax - caller's rsp at entry; the table index is the dword at [56+rax]
; Stack slots: [0+8+rsp] = -bytes, [8+8+rsp] = current table pointer,
;   [16+8+rsp] = table end, [24+8+rsp] = inner loop count,
;   [32+8+rsp] = factor for imul (stored by the caller at [32+rsp]).
; Does not return by itself: ends with a jump to $L$sqrx4x_sub_entry, which
; is defined outside this routine.
mulx4x_internal:

	mov	QWORD[8+rsp],r9
	mov	r10,r9
	neg	r9
	shl	r9,5
	neg	r10
	lea	r13,[128+r9*1+rdx]	; table end (with the +128 bias used below)
	shr	r9,5+5
	movd	xmm5,DWORD[56+rax]	; table index
	sub	r9,1			; inner loop count = bytes/32 - 1
	lea	rax,[$L$inc]
	mov	QWORD[((16+8))+rsp],r13
	mov	QWORD[((24+8))+rsp],r9
	mov	QWORD[((56+8))+rsp],rdi
	movdqa	xmm0,XMMWORD[rax]	; counter seed and increment from $L$inc (defined elsewhere)
	movdqa	xmm1,XMMWORD[16+rax]
	lea	r10,[((88-112))+r10*1+rsp]	; base of the mask area on the stack
	lea	rdi,[128+rdx]		; biased table pointer

; Build sixteen 16-byte masks by comparing the broadcast index in xmm5
; against a running counter; they are stored at [112+r10]..[352+r10].
	pshufd	xmm5,xmm5,0
	movdqa	xmm4,xmm1
; 0x67 prefix bytes in this routine precede the next instruction
; (presumably alignment padding)
DB	0x67
	movdqa	xmm2,xmm1
DB	0x67
	paddd	xmm1,xmm0
	pcmpeqd	xmm0,xmm5
	movdqa	xmm3,xmm4
	paddd	xmm2,xmm1
	pcmpeqd	xmm1,xmm5
	movdqa	XMMWORD[112+r10],xmm0
	movdqa	xmm0,xmm4

	paddd	xmm3,xmm2
	pcmpeqd	xmm2,xmm5
	movdqa	XMMWORD[128+r10],xmm1
	movdqa	xmm1,xmm4

	paddd	xmm0,xmm3
	pcmpeqd	xmm3,xmm5
	movdqa	XMMWORD[144+r10],xmm2
	movdqa	xmm2,xmm4

	paddd	xmm1,xmm0
	pcmpeqd	xmm0,xmm5
	movdqa	XMMWORD[160+r10],xmm3
	movdqa	xmm3,xmm4
	paddd	xmm2,xmm1
	pcmpeqd	xmm1,xmm5
	movdqa	XMMWORD[176+r10],xmm0
	movdqa	xmm0,xmm4

	paddd	xmm3,xmm2
	pcmpeqd	xmm2,xmm5
	movdqa	XMMWORD[192+r10],xmm1
	movdqa	xmm1,xmm4

	paddd	xmm0,xmm3
	pcmpeqd	xmm3,xmm5
	movdqa	XMMWORD[208+r10],xmm2
	movdqa	xmm2,xmm4

	paddd	xmm1,xmm0
	pcmpeqd	xmm0,xmm5
	movdqa	XMMWORD[224+r10],xmm3
	movdqa	xmm3,xmm4
	paddd	xmm2,xmm1
	pcmpeqd	xmm1,xmm5
	movdqa	XMMWORD[240+r10],xmm0
	movdqa	xmm0,xmm4

	paddd	xmm3,xmm2
	pcmpeqd	xmm2,xmm5
	movdqa	XMMWORD[256+r10],xmm1
	movdqa	xmm1,xmm4

	paddd	xmm0,xmm3
	pcmpeqd	xmm3,xmm5
	movdqa	XMMWORD[272+r10],xmm2
	movdqa	xmm2,xmm4

	paddd	xmm1,xmm0
	pcmpeqd	xmm0,xmm5
	movdqa	XMMWORD[288+r10],xmm3
	movdqa	xmm3,xmm4
DB	0x67
	paddd	xmm2,xmm1
	pcmpeqd	xmm1,xmm5
	movdqa	XMMWORD[304+r10],xmm0

	paddd	xmm3,xmm2
	pcmpeqd	xmm2,xmm5
	movdqa	XMMWORD[320+r10],xmm1

	pcmpeqd	xmm3,xmm5
	movdqa	XMMWORD[336+r10],xmm2

; Gather the first multiplier word: every one of the sixteen rows is read
; and ANDed with its mask, so the access pattern does not depend on the index.
	pand	xmm0,XMMWORD[64+rdi]
	pand	xmm1,XMMWORD[80+rdi]
	pand	xmm2,XMMWORD[96+rdi]
	movdqa	XMMWORD[352+r10],xmm3
	pand	xmm3,XMMWORD[112+rdi]
	por	xmm0,xmm2
	por	xmm1,xmm3
	movdqa	xmm4,XMMWORD[((-128))+rdi]
	movdqa	xmm5,XMMWORD[((-112))+rdi]
	movdqa	xmm2,XMMWORD[((-96))+rdi]
	pand	xmm4,XMMWORD[112+r10]
	movdqa	xmm3,XMMWORD[((-80))+rdi]
	pand	xmm5,XMMWORD[128+r10]
	por	xmm0,xmm4
	pand	xmm2,XMMWORD[144+r10]
	por	xmm1,xmm5
	pand	xmm3,XMMWORD[160+r10]
	por	xmm0,xmm2
	por	xmm1,xmm3
	movdqa	xmm4,XMMWORD[((-64))+rdi]
	movdqa	xmm5,XMMWORD[((-48))+rdi]
	movdqa	xmm2,XMMWORD[((-32))+rdi]
	pand	xmm4,XMMWORD[176+r10]
	movdqa	xmm3,XMMWORD[((-16))+rdi]
	pand	xmm5,XMMWORD[192+r10]
	por	xmm0,xmm4
	pand	xmm2,XMMWORD[208+r10]
	por	xmm1,xmm5
	pand	xmm3,XMMWORD[224+r10]
	por	xmm0,xmm2
	por	xmm1,xmm3
	movdqa	xmm4,XMMWORD[rdi]
	movdqa	xmm5,XMMWORD[16+rdi]
	movdqa	xmm2,XMMWORD[32+rdi]
	pand	xmm4,XMMWORD[240+r10]
	movdqa	xmm3,XMMWORD[48+rdi]
	pand	xmm5,XMMWORD[256+r10]
	por	xmm0,xmm4
	pand	xmm2,XMMWORD[272+r10]
	por	xmm1,xmm5
	pand	xmm3,XMMWORD[288+r10]
	por	xmm0,xmm2
	por	xmm1,xmm3
	pxor	xmm0,xmm1
	pshufd	xmm1,xmm0,0x4e		; swap the two 64-bit halves
	por	xmm0,xmm1
	lea	rdi,[256+rdi]		; advance to the next table stride
; encoded instruction: movq rdx,xmm0 (rdx = gathered multiplier word)
DB	102,72,15,126,194
	lea	rbx,[((64+32+8))+rsp]	; rbx = cursor into the result vector

; First pass: multiply [rsi] by the gathered word and add the multiple of
; [rcx] selected by r8 = low word * [32+8+rsp].
	mov	r9,rdx			; r9 keeps the gathered multiplier
	mulx	rax,r8,QWORD[rsi]
	mulx	r12,r11,QWORD[8+rsi]
	add	r11,rax
	mulx	r13,rax,QWORD[16+rsi]
	adc	r12,rax
	adc	r13,0
	mulx	r14,rax,QWORD[24+rsi]

	mov	r15,r8
	imul	r8,QWORD[((32+8))+rsp]
	xor	rbp,rbp			; rbp = 0, also clears CF and OF
	mov	rdx,r8

	mov	QWORD[((8+8))+rsp],rdi

	lea	rsi,[32+rsi]
	adcx	r13,rax
	adcx	r14,rbp

	mulx	r10,rax,QWORD[rcx]
	adcx	r15,rax
	adox	r10,r11
	mulx	r11,rax,QWORD[8+rcx]
	adcx	r10,rax
	adox	r11,r12
	mulx	r12,rax,QWORD[16+rcx]
	mov	rdi,QWORD[((24+8))+rsp]	; rdi = inner loop count
	mov	QWORD[((-32))+rbx],r10
	adcx	r11,rax
	adox	r12,r13
	mulx	r15,rax,QWORD[24+rcx]
	mov	rdx,r9
	mov	QWORD[((-24))+rbx],r11
	adcx	r12,rax
	adox	r15,rbp
	lea	rcx,[32+rcx]
	mov	QWORD[((-16))+rbx],r12
	jmp	NEAR $L$mulx4x_1st

ALIGN	32
; Remaining four-word groups of the first pass; rdx alternates between the
; gathered multiplier (r9) and the factor in r8.
$L$mulx4x_1st:
	adcx	r15,rbp
	mulx	rax,r10,QWORD[rsi]
	adcx	r10,r14
	mulx	r14,r11,QWORD[8+rsi]
	adcx	r11,rax
	mulx	rax,r12,QWORD[16+rsi]
	adcx	r12,r14
	mulx	r14,r13,QWORD[24+rsi]
; two 0x67 prefix bytes ahead of the next instruction (presumably padding)
DB	0x67,0x67
	mov	rdx,r8
	adcx	r13,rax
	adcx	r14,rbp
	lea	rsi,[32+rsi]
	lea	rbx,[32+rbx]

	adox	r10,r15
	mulx	r15,rax,QWORD[rcx]
	adcx	r10,rax
	adox	r11,r15
	mulx	r15,rax,QWORD[8+rcx]
	adcx	r11,rax
	adox	r12,r15
	mulx	r15,rax,QWORD[16+rcx]
	mov	QWORD[((-40))+rbx],r10
	adcx	r12,rax
	mov	QWORD[((-32))+rbx],r11
	adox	r13,r15
	mulx	r15,rax,QWORD[24+rcx]
	mov	rdx,r9
	mov	QWORD[((-24))+rbx],r12
	adcx	r13,rax
	adox	r15,rbp
	lea	rcx,[32+rcx]
	mov	QWORD[((-16))+rbx],r13

	dec	rdi
	jnz	NEAR $L$mulx4x_1st

	mov	rax,QWORD[8+rsp]	; rax = -bytes
	adc	r15,rbp
	lea	rsi,[rax*1+rsi]		; rewind rsi to the start of its vector
	add	r14,r15
	mov	rdi,QWORD[((8+8))+rsp]	; rdi = current table pointer
	adc	rbp,rbp			; rbp = top-most carry
	mov	QWORD[((-8))+rbx],r14
	jmp	NEAR $L$mulx4x_outer

ALIGN	32
; Outer loop: gather the next multiplier word using the masks stored on the
; stack, then run one multiply-accumulate pass over the result vector.
$L$mulx4x_outer:
	lea	r10,[((16-256))+rbx]	; bias r10 so the masks sit at [256+r10]..[496+r10]
	pxor	xmm4,xmm4
; two 0x67 prefix bytes ahead of the next instruction (presumably padding)
DB	0x67,0x67
	pxor	xmm5,xmm5
	movdqa	xmm0,XMMWORD[((-128))+rdi]
	movdqa	xmm1,XMMWORD[((-112))+rdi]
	movdqa	xmm2,XMMWORD[((-96))+rdi]
	pand	xmm0,XMMWORD[256+r10]
	movdqa	xmm3,XMMWORD[((-80))+rdi]
	pand	xmm1,XMMWORD[272+r10]
	por	xmm4,xmm0
	pand	xmm2,XMMWORD[288+r10]
	por	xmm5,xmm1
	pand	xmm3,XMMWORD[304+r10]
	por	xmm4,xmm2
	por	xmm5,xmm3
	movdqa	xmm0,XMMWORD[((-64))+rdi]
	movdqa	xmm1,XMMWORD[((-48))+rdi]
	movdqa	xmm2,XMMWORD[((-32))+rdi]
	pand	xmm0,XMMWORD[320+r10]
	movdqa	xmm3,XMMWORD[((-16))+rdi]
	pand	xmm1,XMMWORD[336+r10]
	por	xmm4,xmm0
	pand	xmm2,XMMWORD[352+r10]
	por	xmm5,xmm1
	pand	xmm3,XMMWORD[368+r10]
	por	xmm4,xmm2
	por	xmm5,xmm3
	movdqa	xmm0,XMMWORD[rdi]
	movdqa	xmm1,XMMWORD[16+rdi]
	movdqa	xmm2,XMMWORD[32+rdi]
	pand	xmm0,XMMWORD[384+r10]
	movdqa	xmm3,XMMWORD[48+rdi]
	pand	xmm1,XMMWORD[400+r10]
	por	xmm4,xmm0
	pand	xmm2,XMMWORD[416+r10]
	por	xmm5,xmm1
	pand	xmm3,XMMWORD[432+r10]
	por	xmm4,xmm2
	por	xmm5,xmm3
	movdqa	xmm0,XMMWORD[64+rdi]
	movdqa	xmm1,XMMWORD[80+rdi]
	movdqa	xmm2,XMMWORD[96+rdi]
	pand	xmm0,XMMWORD[448+r10]
	movdqa	xmm3,XMMWORD[112+rdi]
	pand	xmm1,XMMWORD[464+r10]
	por	xmm4,xmm0
	pand	xmm2,XMMWORD[480+r10]
	por	xmm5,xmm1
	pand	xmm3,XMMWORD[496+r10]
	por	xmm4,xmm2
	por	xmm5,xmm3
	por	xmm4,xmm5
	pshufd	xmm0,xmm4,0x4e		; swap the two 64-bit halves
	por	xmm0,xmm4
	lea	rdi,[256+rdi]
; encoded instruction: movq rdx,xmm0 (rdx = gathered multiplier word)
DB	102,72,15,126,194

	mov	QWORD[rbx],rbp		; store the top-most carry of the previous pass
	lea	rbx,[32+rax*1+rbx]	; rewind the result cursor (rax = -bytes)
	mulx	r11,r8,QWORD[rsi]
	xor	rbp,rbp			; rbp = 0, also clears CF and OF
	mov	r9,rdx
	mulx	r12,r14,QWORD[8+rsi]
	adox	r8,QWORD[((-32))+rbx]
	adcx	r11,r14
	mulx	r13,r15,QWORD[16+rsi]
	adox	r11,QWORD[((-24))+rbx]
	adcx	r12,r15
	mulx	r14,rdx,QWORD[24+rsi]
	adox	r12,QWORD[((-16))+rbx]
	adcx	r13,rdx
	lea	rcx,[rax*1+rcx]		; rewind rcx to the start of its vector
	lea	rsi,[32+rsi]
	adox	r13,QWORD[((-8))+rbx]
	adcx	r14,rbp
	adox	r14,rbp

	mov	r15,r8
	imul	r8,QWORD[((32+8))+rsp]

	mov	rdx,r8
	xor	rbp,rbp			; rbp = 0, also clears CF and OF
	mov	QWORD[((8+8))+rsp],rdi

	mulx	r10,rax,QWORD[rcx]
	adcx	r15,rax
	adox	r10,r11
	mulx	r11,rax,QWORD[8+rcx]
	adcx	r10,rax
	adox	r11,r12
	mulx	r12,rax,QWORD[16+rcx]
	adcx	r11,rax
	adox	r12,r13
	mulx	r15,rax,QWORD[24+rcx]
	mov	rdx,r9
	mov	rdi,QWORD[((24+8))+rsp]	; rdi = inner loop count
	mov	QWORD[((-32))+rbx],r10
	adcx	r12,rax
	mov	QWORD[((-24))+rbx],r11
	adox	r15,rbp
	mov	QWORD[((-16))+rbx],r12
	lea	rcx,[32+rcx]
	jmp	NEAR $L$mulx4x_inner

ALIGN	32
; Inner loop: like $L$mulx4x_1st, but also adds the words already stored in
; the result vector at [rbx].
$L$mulx4x_inner:
	mulx	rax,r10,QWORD[rsi]
	adcx	r15,rbp
	adox	r10,r14
	mulx	r14,r11,QWORD[8+rsi]
	adcx	r10,QWORD[rbx]
	adox	r11,rax
	mulx	rax,r12,QWORD[16+rsi]
	adcx	r11,QWORD[8+rbx]
	adox	r12,r14
	mulx	r14,r13,QWORD[24+rsi]
	mov	rdx,r8
	adcx	r12,QWORD[16+rbx]
	adox	r13,rax
	adcx	r13,QWORD[24+rbx]
	adox	r14,rbp
	lea	rsi,[32+rsi]
	lea	rbx,[32+rbx]
	adcx	r14,rbp

	adox	r10,r15
	mulx	r15,rax,QWORD[rcx]
	adcx	r10,rax
	adox	r11,r15
	mulx	r15,rax,QWORD[8+rcx]
	adcx	r11,rax
	adox	r12,r15
	mulx	r15,rax,QWORD[16+rcx]
	mov	QWORD[((-40))+rbx],r10
	adcx	r12,rax
	adox	r13,r15
	mov	QWORD[((-32))+rbx],r11
	mulx	r15,rax,QWORD[24+rcx]
	mov	rdx,r9
	lea	rcx,[32+rcx]
	mov	QWORD[((-24))+rbx],r12
	adcx	r13,rax
	adox	r15,rbp
	mov	QWORD[((-16))+rbx],r13

	dec	rdi
	jnz	NEAR $L$mulx4x_inner

	mov	rax,QWORD[((0+8))+rsp]	; rax = -bytes
	adc	r15,rbp
	sub	rdi,QWORD[rbx]		; rdi is 0 here: CF = ([rbx] != 0), the stored top carry
	mov	rdi,QWORD[((8+8))+rsp]
	mov	r10,QWORD[((16+8))+rsp]	; r10 = table end
	adc	r14,r15
	lea	rsi,[rax*1+rsi]
	adc	rbp,rbp
	mov	QWORD[((-8))+rbx],r14

	cmp	rdi,r10			; more multiplier words left in the table?
	jb	NEAR $L$mulx4x_outer

; Set up registers for the shared tail at $L$sqrx4x_sub_entry (outside this
; view): rax becomes 0 or -1 depending on the final carry and on the
; comparison of the top words.
	mov	r10,QWORD[((-8))+rcx]
	mov	r8,rbp
	mov	r12,QWORD[rax*1+rcx]
	lea	rbp,[rax*1+rcx]
	mov	rcx,rax
	lea	rdi,[rax*1+rbx]
	xor	eax,eax
	xor	r15,r15
	sub	r10,r14
	adc	r15,r15
	or	r8,r15
	sar	rcx,3+2
	sub	rax,r8
	mov	rdx,QWORD[((56+8))+rsp]	; rdx = rdi value saved at entry
	dec	r12
	mov	r13,QWORD[8+rbp]
	xor	r8,r8
	mov	r14,QWORD[16+rbp]
	mov	r15,QWORD[24+rbp]
	jmp	NEAR $L$sqrx4x_sub_entry
2844
2845
2846
2847ALIGN	32
; bn_powerx5
; Win64 entry.  The prologue spills rdi/rsi into the caller's home slots and
; moves the Win64 arguments (rcx, rdx, r8, r9, [40+rsp], [48+rsp]) into rdi,
; rsi, rdx, rcx, r8, r9.  After building a 64-byte-aligned scratch frame it
; calls __bn_sqrx8x_internal followed by __bn_postx4x_internal five times,
; then calls mulx4x_internal once, and returns 1 in rax.
; $L$powerx5_enter is a secondary entry label placed after the argument
; shuffle; its users are outside this view.
; Frame slots set here: [32+rsp] = word loaded from [r8], [40+rsp] = rsp at
; entry.
bn_powerx5:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_bn_powerx5:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]
	mov	r9,QWORD[48+rsp]



	mov	rax,rsp			; rax = rsp at entry, kept for the epilogue

$L$powerx5_enter:
	push	rbx

	push	rbp

	push	r12

	push	r13

	push	r14

	push	r15

$L$powerx5_prologue:

	shl	r9d,3			; word count -> byte count
	lea	r10,[r9*2+r9]		; r10 = 3 * byte count
	neg	r9
	mov	r8,QWORD[r8]		; dereference the fifth argument








; Choose the frame address from the distance (mod 4096) between the
; candidate frame and rdi; presumably this avoids aliasing the output
; buffer modulo the page size.
	lea	r11,[((-320))+r9*2+rsp]
	mov	rbp,rsp
	sub	r11,rdi
	and	r11,4095
	cmp	r10,r11
	jb	NEAR $L$pwrx_sp_alt
	sub	rbp,r11
	lea	rbp,[((-320))+r9*2+rbp]
	jmp	NEAR $L$pwrx_sp_done

ALIGN	32
$L$pwrx_sp_alt:
	lea	r10,[((4096-320))+r9*2]
	lea	rbp,[((-320))+r9*2+rbp]
	sub	r11,r10
	mov	r10,0			; mov leaves CF from the sub intact for cmovc
	cmovc	r11,r10
	sub	rbp,r11
$L$pwrx_sp_done:
	and	rbp,-64			; 64-byte align the new frame
; Step rsp down to rbp at most 4096 bytes at a time, reading one qword at
; each step so every intervening stack page is touched.
	mov	r11,rsp
	sub	r11,rbp
	and	r11,-4096
	lea	rsp,[rbp*1+r11]
	mov	r10,QWORD[rsp]
	cmp	rsp,rbp
	ja	NEAR $L$pwrx_page_walk
	jmp	NEAR $L$pwrx_page_walk_done

$L$pwrx_page_walk:
	lea	rsp,[((-4096))+rsp]
	mov	r10,QWORD[rsp]
	cmp	rsp,rbp
	ja	NEAR $L$pwrx_page_walk
$L$pwrx_page_walk_done:

	mov	r10,r9			; r10 = -bytes
	neg	r9			; r9  = +bytes












; Stash pointer arguments in xmm registers so the general-purpose registers
; are free inside the internal routines.
	pxor	xmm0,xmm0		; xmm0 = 0, used by __bn_sqrx8x_internal to clear its scratch
; encoded instruction: movq xmm1,rdi
DB	102,72,15,110,207
; encoded instruction: movq xmm2,rcx
DB	102,72,15,110,209
; encoded instruction: movq xmm3,r10
DB	102,73,15,110,218
; encoded instruction: movq xmm4,rdx
DB	102,72,15,110,226
	mov	QWORD[32+rsp],r8
	mov	QWORD[40+rsp],rax

$L$powerx5_body:

; Five rounds of squaring followed by the post-processing routine.
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal

; Reload the registers mulx4x_internal reads, then do the final multiply.
	mov	r9,r10
	mov	rdi,rsi
; encoded instruction: movq rcx,xmm2
DB	102,72,15,126,209
; encoded instruction: movq rdx,xmm4
DB	102,72,15,126,226
	mov	rax,QWORD[40+rsp]	; rax = rsp at entry (mulx4x_internal reads [56+rax])

	call	mulx4x_internal

	mov	rsi,QWORD[40+rsp]	; rsi = rsp at entry

	mov	rax,1			; return value

; Reload the six registers pushed in the prologue, then restore rsp.
	mov	r15,QWORD[((-48))+rsi]

	mov	r14,QWORD[((-40))+rsi]

	mov	r13,QWORD[((-32))+rsi]

	mov	r12,QWORD[((-24))+rsi]

	mov	rbp,QWORD[((-16))+rsi]

	mov	rbx,QWORD[((-8))+rsi]

	lea	rsp,[rsi]

$L$powerx5_epilogue:
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	DB	0F3h,0C3h		;repret

$L$SEH_end_bn_powerx5:
2994
2995global	GFp_bn_sqrx8x_internal
2996
2997ALIGN	32
; GFp_bn_sqrx8x_internal / __bn_sqrx8x_internal
; Squaring core built on MULX with the ADCX (CF) and ADOX (OF) carry chains.
; It accumulates the off-diagonal products of the input into the scratch
; vector at [48+8+rsp], then doubles them and adds the squares of the
; individual words ($L$sqrx4x_shift_n_add).  There is no ret in this part:
; execution falls through into __bn_sqrx8x_reduction, which follows.
; All "+8" stack offsets account for the return address pushed by the call.
; Registers read on entry (as used below):
;   rsi  - input vector
;   r9   - input length in bytes (counted down in steps of 64)
;   xmm0 - value stored to clear the scratch vector (bn_powerx5 zeroes it)
;   xmm2, xmm3 - moved into rbp and rcx at the end of this part
; Stack slots written: [0+8+rsp] = r9, [8+8+rsp] = rsi+r9 (end of input),
;   [16+8+rsp] = saved carry, [24+8+rsp] = scratch cursor for the next row.
GFp_bn_sqrx8x_internal:
__bn_sqrx8x_internal:









































	lea	rdi,[((48+8))+rsp]
	lea	rbp,[r9*1+rsi]
	mov	QWORD[((0+8))+rsp],r9
	mov	QWORD[((8+8))+rsp],rbp
	jmp	NEAR $L$sqr8x_zero_start

ALIGN	32
; encoded 12-byte no-op (66 66 66 2e 0f 1f 84 00 ...) used as padding
DB	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
; Clear the scratch vector.  The loop is entered at $L$sqr8x_zero_start, so
; the first 64 bytes at [48+8+rsp] are not written here.
$L$sqrx8x_zero:
; 0x3e prefix byte applied to the following movdqa (presumably padding)
DB	0x3e
	movdqa	XMMWORD[rdi],xmm0
	movdqa	XMMWORD[16+rdi],xmm0
	movdqa	XMMWORD[32+rdi],xmm0
	movdqa	XMMWORD[48+rdi],xmm0
$L$sqr8x_zero_start:
	movdqa	XMMWORD[64+rdi],xmm0
	movdqa	XMMWORD[80+rdi],xmm0
	movdqa	XMMWORD[96+rdi],xmm0
	movdqa	XMMWORD[112+rdi],xmm0
	lea	rdi,[128+rdi]
	sub	r9,64
	jnz	NEAR $L$sqrx8x_zero

	mov	rdx,QWORD[rsi]		; rdx = first input word (implicit MULX operand)

	xor	r10,r10
	xor	r11,r11
	xor	r12,r12
	xor	r13,r13
	xor	r14,r14
	xor	r15,r15
	lea	rdi,[((48+8))+rsp]
	xor	rbp,rbp			; rbp = 0, also clears CF and OF
	jmp	NEAR $L$sqrx8x_outer_loop

ALIGN	32
; Outer loop: products among the eight words at [rsi]..[56+rsi].  Each
; stanza loads a new multiplicand into rdx and multiplies it by the words
; above it; finished low words are stored to [8+rdi]..[64+rdi].
; The DB sequences starting with 0xc4 are VEX-encoded MULX instructions;
; the decoded form is given above each one.
$L$sqrx8x_outer_loop:
	mulx	rax,r8,QWORD[8+rsi]
	adcx	r8,r9
	adox	r10,rax
	mulx	rax,r9,QWORD[16+rsi]
	adcx	r9,r10
	adox	r11,rax
; encoded instruction: mulx rax,r10,QWORD[24+rsi]
DB	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
	adcx	r10,r11
	adox	r12,rax
; encoded instruction: mulx rax,r11,QWORD[32+rsi]
DB	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
	adcx	r11,r12
	adox	r13,rax
	mulx	rax,r12,QWORD[40+rsi]
	adcx	r12,r13
	adox	r14,rax
	mulx	rax,r13,QWORD[48+rsi]
	adcx	r13,r14
	adox	rax,r15
	mulx	r15,r14,QWORD[56+rsi]
	mov	rdx,QWORD[8+rsi]	; next multiplicand: second word
	adcx	r14,rax
	adox	r15,rbp
	adc	r15,QWORD[64+rdi]
	mov	QWORD[8+rdi],r8
	mov	QWORD[16+rdi],r9
	sbb	rcx,rcx			; rcx = -CF, carry kept for later
	xor	rbp,rbp			; rbp = 0, also clears CF and OF


	mulx	rbx,r8,QWORD[16+rsi]
	mulx	rax,r9,QWORD[24+rsi]
	adcx	r8,r10
	adox	r9,rbx
	mulx	rbx,r10,QWORD[32+rsi]
	adcx	r9,r11
	adox	r10,rax
; encoded instruction: mulx rax,r11,QWORD[40+rsi]
DB	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
	adcx	r10,r12
	adox	r11,rbx
; encoded instruction: mulx rbx,r12,QWORD[48+rsi]
DB	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
	adcx	r11,r13
	adox	r12,r14
; encoded instruction: mulx r14,r13,QWORD[56+rsi]
DB	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
	mov	rdx,QWORD[16+rsi]	; next multiplicand: third word
	adcx	r12,rax
	adox	r13,rbx
	adcx	r13,r15
	adox	r14,rbp
	adcx	r14,rbp

	mov	QWORD[24+rdi],r8
	mov	QWORD[32+rdi],r9

	mulx	rbx,r8,QWORD[24+rsi]
	mulx	rax,r9,QWORD[32+rsi]
	adcx	r8,r10
	adox	r9,rbx
	mulx	rbx,r10,QWORD[40+rsi]
	adcx	r9,r11
	adox	r10,rax
; encoded instruction: mulx rax,r11,QWORD[48+rsi]
DB	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
	adcx	r10,r12
	adox	r11,r13
; encoded instruction: mulx r13,r12,QWORD[56+rsi]
DB	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
; 0x3e prefix byte applied to the following mov (presumably padding)
DB	0x3e
	mov	rdx,QWORD[24+rsi]	; next multiplicand: fourth word
	adcx	r11,rbx
	adox	r12,rax
	adcx	r12,r14
	mov	QWORD[40+rdi],r8
	mov	QWORD[48+rdi],r9
	mulx	rax,r8,QWORD[32+rsi]
	adox	r13,rbp
	adcx	r13,rbp

	mulx	rbx,r9,QWORD[40+rsi]
	adcx	r8,r10
	adox	r9,rax
	mulx	rax,r10,QWORD[48+rsi]
	adcx	r9,r11
	adox	r10,r12
	mulx	r12,r11,QWORD[56+rsi]
	mov	rdx,QWORD[32+rsi]	; next multiplicand: fifth word
	mov	r14,QWORD[40+rsi]	; remaining words are kept in registers
	adcx	r10,rbx
	adox	r11,rax
	mov	r15,QWORD[48+rsi]
	adcx	r11,r13
	adox	r12,rbp
	adcx	r12,rbp

	mov	QWORD[56+rdi],r8
	mov	QWORD[64+rdi],r9

	mulx	rax,r9,r14
	mov	r8,QWORD[56+rsi]
	adcx	r9,r10
	mulx	rbx,r10,r15
	adox	r10,rax
	adcx	r10,r11
	mulx	rax,r11,r8
	mov	rdx,r14			; next multiplicand: sixth word
	adox	r11,rbx
	adcx	r11,r12

	adcx	rax,rbp

	mulx	rbx,r14,r15
	mulx	r13,r12,r8
	mov	rdx,r15			; next multiplicand: seventh word
	lea	rsi,[64+rsi]
	adcx	r11,r14
	adox	r12,rbx
	adcx	r12,rax
	adox	r13,rbp

; two 0x67 prefix bytes ahead of the next instruction (presumably padding)
DB	0x67,0x67
	mulx	r14,r8,r8
	adcx	r13,r8
	adcx	r14,rbp

	cmp	rsi,QWORD[((8+8))+rsp]	; consumed the whole input?
	je	NEAR $L$sqrx8x_outer_break

	neg	rcx			; restore CF from the carry saved in rcx
	mov	rcx,-8			; loop counter for $L$sqrx8x_loop
	mov	r15,rbp
	mov	r8,QWORD[64+rdi]
	adcx	r9,QWORD[72+rdi]
	adcx	r10,QWORD[80+rdi]
	adcx	r11,QWORD[88+rdi]
	adc	r12,QWORD[96+rdi]
	adc	r13,QWORD[104+rdi]
	adc	r14,QWORD[112+rdi]
	adc	r15,QWORD[120+rdi]
	lea	rbp,[rsi]
	lea	rdi,[128+rdi]
	sbb	rax,rax			; rax = -CF

	mov	rdx,QWORD[((-64))+rsi]
	mov	QWORD[((16+8))+rsp],rax
	mov	QWORD[((24+8))+rsp],rdi


	xor	eax,eax			; clears CF and OF
	jmp	NEAR $L$sqrx8x_loop

ALIGN	32
; Multiply the eight words at [rbp] by rdx; rdx steps through the eight
; words that precede rsi (rcx runs from -8 up to 0).
$L$sqrx8x_loop:
	mov	rbx,r8
	mulx	r8,rax,QWORD[rbp]
	adcx	rbx,rax
	adox	r8,r9

	mulx	r9,rax,QWORD[8+rbp]
	adcx	r8,rax
	adox	r9,r10

	mulx	r10,rax,QWORD[16+rbp]
	adcx	r9,rax
	adox	r10,r11

	mulx	r11,rax,QWORD[24+rbp]
	adcx	r10,rax
	adox	r11,r12

; encoded instruction: mulx r12,rax,QWORD[32+rbp]
DB	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
	adcx	r11,rax
	adox	r12,r13

	mulx	r13,rax,QWORD[40+rbp]
	adcx	r12,rax
	adox	r13,r14

	mulx	r14,rax,QWORD[48+rbp]
	mov	QWORD[rcx*8+rdi],rbx	; store one finished word
	mov	ebx,0			; mov (not xor) so CF and OF stay live
	adcx	r13,rax
	adox	r14,r15

; encoded instruction: mulx r15,rax,QWORD[56+rbp]
DB	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
	mov	rdx,QWORD[8+rcx*8+rsi]	; next multiplicand
	adcx	r14,rax
	adox	r15,rbx
	adcx	r15,rbx

; 0x67 prefix byte applied to the following inc (presumably padding)
DB	0x67
	inc	rcx
	jnz	NEAR $L$sqrx8x_loop

	lea	rbp,[64+rbp]
	mov	rcx,-8
	cmp	rbp,QWORD[((8+8))+rsp]
	je	NEAR $L$sqrx8x_break

	sub	rbx,QWORD[((16+8))+rsp]	; rbx is 0: CF = (saved carry != 0)
; 0x66 prefix byte applied to the following mov (presumably padding)
DB	0x66
	mov	rdx,QWORD[((-64))+rsi]
	adcx	r8,QWORD[rdi]
	adcx	r9,QWORD[8+rdi]
	adc	r10,QWORD[16+rdi]
	adc	r11,QWORD[24+rdi]
	adc	r12,QWORD[32+rdi]
	adc	r13,QWORD[40+rdi]
	adc	r14,QWORD[48+rdi]
	adc	r15,QWORD[56+rdi]
	lea	rdi,[64+rdi]
; 0x67 prefix byte applied to the following sbb (presumably padding)
DB	0x67
	sbb	rax,rax
	xor	ebx,ebx
	mov	QWORD[((16+8))+rsp],rax
	jmp	NEAR $L$sqrx8x_loop

ALIGN	32
; End of one row: propagate the saved carry, store the accumulator and
; reload it from the cursor saved at [24+8+rsp] for the next row.
$L$sqrx8x_break:
	xor	rbp,rbp
	sub	rbx,QWORD[((16+8))+rsp]	; rbx is 0: CF = (saved carry != 0)
	adcx	r8,rbp
	mov	rcx,QWORD[((24+8))+rsp]
	adcx	r9,rbp
	mov	rdx,QWORD[rsi]
	adc	r10,0
	mov	QWORD[rdi],r8
	adc	r11,0
	adc	r12,0
	adc	r13,0
	adc	r14,0
	adc	r15,0
	cmp	rdi,rcx
	je	NEAR $L$sqrx8x_outer_loop

	mov	QWORD[8+rdi],r9
	mov	r9,QWORD[8+rcx]
	mov	QWORD[16+rdi],r10
	mov	r10,QWORD[16+rcx]
	mov	QWORD[24+rdi],r11
	mov	r11,QWORD[24+rcx]
	mov	QWORD[32+rdi],r12
	mov	r12,QWORD[32+rcx]
	mov	QWORD[40+rdi],r13
	mov	r13,QWORD[40+rcx]
	mov	QWORD[48+rdi],r14
	mov	r14,QWORD[48+rcx]
	mov	QWORD[56+rdi],r15
	mov	r15,QWORD[56+rcx]
	mov	rdi,rcx
	jmp	NEAR $L$sqrx8x_outer_loop

ALIGN	32
; All rows done: flush the accumulator and set up for the doubling pass.
$L$sqrx8x_outer_break:
	mov	QWORD[72+rdi],r9
; encoded instruction: movq rcx,xmm3 (callers keep the negative byte length there)
DB	102,72,15,126,217
	mov	QWORD[80+rdi],r10
	mov	QWORD[88+rdi],r11
	mov	QWORD[96+rdi],r12
	mov	QWORD[104+rdi],r13
	mov	QWORD[112+rdi],r14
	lea	rdi,[((48+8))+rsp]
	mov	rdx,QWORD[rcx*1+rsi]

	mov	r11,QWORD[8+rdi]
	xor	r10,r10			; r10 = 0, also clears CF and OF
	mov	r9,QWORD[((0+8))+rsp]	; r9 = byte length saved at entry
	adox	r11,r11
	mov	r12,QWORD[16+rdi]
	mov	r13,QWORD[24+rdi]


ALIGN	32
; Doubling pass: each scratch word is added to itself on the OF chain
; (adox r,r) while the square of one input word (mulx rbx,rax,rdx) is added
; on the CF chain; four input words are handled per iteration.
$L$sqrx4x_shift_n_add:
	mulx	rbx,rax,rdx
	adox	r12,r12
	adcx	rax,r10
; encoded instruction: mov rdx,QWORD[8+rcx*1+rsi] (long disp32 form)
DB	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
; encoded instruction: mov r10,QWORD[32+rdi] (long disp32 form)
DB	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
	adox	r13,r13
	adcx	rbx,r11
	mov	r11,QWORD[40+rdi]
	mov	QWORD[rdi],rax
	mov	QWORD[8+rdi],rbx

	mulx	rbx,rax,rdx
	adox	r10,r10
	adcx	rax,r12
	mov	rdx,QWORD[16+rcx*1+rsi]
	mov	r12,QWORD[48+rdi]
	adox	r11,r11
	adcx	rbx,r13
	mov	r13,QWORD[56+rdi]
	mov	QWORD[16+rdi],rax
	mov	QWORD[24+rdi],rbx

	mulx	rbx,rax,rdx
	adox	r12,r12
	adcx	rax,r10
	mov	rdx,QWORD[24+rcx*1+rsi]
	lea	rcx,[32+rcx]
	mov	r10,QWORD[64+rdi]
	adox	r13,r13
	adcx	rbx,r11
	mov	r11,QWORD[72+rdi]
	mov	QWORD[32+rdi],rax
	mov	QWORD[40+rdi],rbx

	mulx	rbx,rax,rdx
	adox	r10,r10
	adcx	rax,r12
	jrcxz	$L$sqrx4x_shift_n_add_break	; exit when rcx reaches 0; jrcxz leaves flags intact
; encoded instruction: mov rdx,QWORD[0+rcx*1+rsi] (long disp32 form)
DB	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
	adox	r11,r11
	adcx	rbx,r13
	mov	r12,QWORD[80+rdi]
	mov	r13,QWORD[88+rdi]
	mov	QWORD[48+rdi],rax
	mov	QWORD[56+rdi],rbx
	lea	rdi,[64+rdi]
	nop
	jmp	NEAR $L$sqrx4x_shift_n_add

ALIGN	32
$L$sqrx4x_shift_n_add_break:
	adcx	rbx,r13
	mov	QWORD[48+rdi],rax
	mov	QWORD[56+rdi],rbx
	lea	rdi,[64+rdi]
; encoded instruction: movq rbp,xmm2; execution continues into __bn_sqrx8x_reduction
DB	102,72,15,126,213
; __bn_sqrx8x_reduction
;
; Reduction pass over the t[] buffer that starts at [48+8+rsp], processed in
; windows of 8 qwords using mulx with two independent carry chains
; (adcx -> CF, adox -> OF).
;
; Registers/slots as read by the code below:
;   rbp          -> modulus limbs n[] (read as [rbp]..[56+rbp] per window)
;   r9           byte count added to rbp to locate the end of n[]
;   rdi          on entry: end-of-t[] pointer (saved, compared at the bottom)
;   xmm2, xmm3   moved back into rbp and rcx at $L$sqrx8x_no_tail
;                (presumably the original n pointer and -num; set by the caller
;                 outside this view -- confirm)
;   [32+8+rsp]   multiplier loaded into rbx (presumably Montgomery n0 -- confirm)
; Scratch slots written here:
;   [0+8+rsp]    rbp+r9-64, the value rbp is compared against ("last n window")
;   [8+8+rsp]    entry value of rdi (end of t[])
;   [16+8+rsp]   carry saved as 0/-1 between tail passes
;   [24+8+rsp]   top-most carry carried from one outer iteration to the next
; Returns with repret; r8..r15, rax, rbx, rcx, rdx, rsi, rdi, rbp are modified.
3404__bn_sqrx8x_reduction:
3405	xor	eax,eax	; rax = 0: initial top-most carry
3406	mov	rbx,QWORD[((32+8))+rsp]	; rbx = per-word multiplier from the frame
3407	mov	rdx,QWORD[((48+8))+rsp]	; rdx = first word of t[]
3408	lea	rcx,[((-64))+r9*1+rbp]	; rcx = address of the last 8-limb window of n[]
3409
3410	mov	QWORD[((0+8))+rsp],rcx	; remember it for the "more n[] left?" checks
3411	mov	QWORD[((8+8))+rsp],rdi	; remember end of t[]
3412
3413	lea	rdi,[((48+8))+rsp]	; rdi = start of the current t[] window
3414	jmp	NEAR $L$sqrx8x_reduction_loop
3415
3416ALIGN	32
; Outer loop: one iteration per 8-qword window of t[].
3417$L$sqrx8x_reduction_loop:
3418	mov	r9,QWORD[8+rdi]	; r9..r15 = t[1..7] of the window; t[0] is already in rdx
3419	mov	r10,QWORD[16+rdi]
3420	mov	r11,QWORD[24+rdi]
3421	mov	r12,QWORD[32+rdi]
3422	mov	r8,rdx	; r8 = t[0]
3423	imul	rdx,rbx	; rdx = low 64 bits of t[0]*rbx: multiplier for this round
3424	mov	r13,QWORD[40+rdi]
3425	mov	r14,QWORD[48+rdi]
3426	mov	r15,QWORD[56+rdi]
3427	mov	QWORD[((24+8))+rsp],rax	; park the top-most carry
3428
3429	lea	rdi,[64+rdi]	; advance to the next 8 qwords (lea leaves flags alone)
3430	xor	rsi,rsi	; rsi = 0 and CF = OF = 0 for the two carry chains
3431	mov	rcx,-8	; 8 rounds, counted up to zero
3432	jmp	NEAR $L$sqrx8x_reduce
3433
3434ALIGN	32
; Inner loop: each round adds rdx * n[0..7] into r8..r15 and shifts the
; accumulator down by one word (the low word in rax is not stored).
3435$L$sqrx8x_reduce:
3436	mov	rbx,r8
3437	mulx	r8,rax,QWORD[rbp]
3438	adcx	rax,rbx	; low word; result is not used afterwards, only CF matters
3439	adox	r8,r9
3440
3441	mulx	r9,rbx,QWORD[8+rbp]
3442	adcx	r8,rbx
3443	adox	r9,r10
3444
3445	mulx	r10,rbx,QWORD[16+rbp]
3446	adcx	r9,rbx
3447	adox	r10,r11
3448
3449	mulx	r11,rbx,QWORD[24+rbp]
3450	adcx	r10,rbx
3451	adox	r11,r12
3452
3453DB	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	; mulx r12,rbx,QWORD[32+rbp] (hand-encoded)
3454	mov	rax,rdx	; save the current multiplier
3455	mov	rdx,r8	; r8 is the next low word
3456	adcx	r11,rbx
3457	adox	r12,r13
3458
3459	mulx	rdx,rbx,QWORD[((32+8))+rsp]	; rbx = low(r8 * [32+8+rsp]) = next round's multiplier; high half discarded
3460	mov	rdx,rax	; restore the current multiplier
3461	mov	QWORD[((64+48+8))+rcx*8+rsp],rax	; put the current multiplier aside in the consumed t[] slot (read back by the tail loop)
3462
3463	mulx	r13,rax,QWORD[40+rbp]
3464	adcx	r12,rax
3465	adox	r13,r14
3466
3467	mulx	r14,rax,QWORD[48+rbp]
3468	adcx	r13,rax
3469	adox	r14,r15
3470
3471	mulx	r15,rax,QWORD[56+rbp]
3472	mov	rdx,rbx	; switch to the next round's multiplier
3473	adcx	r14,rax
3474	adox	r15,rsi	; rsi is 0: fold the OF chain into r15
3475	adcx	r15,rsi	; rsi is 0: fold the CF chain into r15
3476
3477DB	0x67,0x67,0x67	; three 0x67 prefix bytes prepended to the following inc (padding)
3478	inc	rcx
3479	jnz	NEAR $L$sqrx8x_reduce
3480
3481	mov	rax,rsi	; rax = 0 (mov keeps the flags from the cmp below intact for nobody; flags are re-set by cmp)
3482	cmp	rbp,QWORD[((0+8))+rsp]	; was this the last window of n[]?
3483	jae	NEAR $L$sqrx8x_no_tail	; yes: skip the tail loop (CF = 0 on this path)
3484
3485	mov	rdx,QWORD[((48+8))+rsp]	; reload the first saved multiplier
3486	add	r8,QWORD[rdi]	; accumulate the next 8 words of t[] into r8..r15
3487	lea	rbp,[64+rbp]	; next 8 limbs of n[]
3488	mov	rcx,-8
3489	adcx	r9,QWORD[8+rdi]
3490	adcx	r10,QWORD[16+rdi]
3491	adc	r11,QWORD[24+rdi]
3492	adc	r12,QWORD[32+rdi]
3493	adc	r13,QWORD[40+rdi]
3494	adc	r14,QWORD[48+rdi]
3495	adc	r15,QWORD[56+rdi]
3496	lea	rdi,[64+rdi]
3497	sbb	rax,rax	; rax = 0 or -1: capture the carry out of the add chain
3498
3499	xor	rsi,rsi	; rsi = 0, CF = OF = 0
3500	mov	QWORD[((16+8))+rsp],rax	; save the captured carry
3501	jmp	NEAR $L$sqrx8x_tail
3502
3503ALIGN	32
; Tail loop: applies the 8 saved multipliers to the following windows of
; n[], storing one finished word per round at [rcx*8+rdi].
3504$L$sqrx8x_tail:
3505	mov	rbx,r8
3506	mulx	r8,rax,QWORD[rbp]
3507	adcx	rbx,rax	; rbx = finished low word for this round
3508	adox	r8,r9
3509
3510	mulx	r9,rax,QWORD[8+rbp]
3511	adcx	r8,rax
3512	adox	r9,r10
3513
3514	mulx	r10,rax,QWORD[16+rbp]
3515	adcx	r9,rax
3516	adox	r10,r11
3517
3518	mulx	r11,rax,QWORD[24+rbp]
3519	adcx	r10,rax
3520	adox	r11,r12
3521
3522DB	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	; mulx r12,rax,QWORD[32+rbp] (hand-encoded)
3523	adcx	r11,rax
3524	adox	r12,r13
3525
3526	mulx	r13,rax,QWORD[40+rbp]
3527	adcx	r12,rax
3528	adox	r13,r14
3529
3530	mulx	r14,rax,QWORD[48+rbp]
3531	adcx	r13,rax
3532	adox	r14,r15
3533
3534	mulx	r15,rax,QWORD[56+rbp]
3535	mov	rdx,QWORD[((72+48+8))+rcx*8+rsp]	; fetch the next saved multiplier
3536	adcx	r14,rax
3537	adox	r15,rsi	; rsi is 0
3538	mov	QWORD[rcx*8+rdi],rbx	; store the finished word (rcx is negative: slot below rdi)
3539	mov	rbx,r8
3540	adcx	r15,rsi	; rsi is 0
3541
3542	inc	rcx
3543	jnz	NEAR $L$sqrx8x_tail
3544
3545	cmp	rbp,QWORD[((0+8))+rsp]	; reached the last window of n[]?
3546	jae	NEAR $L$sqrx8x_tail_done
3547
3548	sub	rsi,QWORD[((16+8))+rsp]	; rsi is 0, slot is 0/-1: turns the saved carry back into CF
3549	mov	rdx,QWORD[((48+8))+rsp]	; first saved multiplier again
3550	lea	rbp,[64+rbp]
3551	adc	r8,QWORD[rdi]	; accumulate the next 8 words of t[] with the restored carry
3552	adc	r9,QWORD[8+rdi]
3553	adc	r10,QWORD[16+rdi]
3554	adc	r11,QWORD[24+rdi]
3555	adc	r12,QWORD[32+rdi]
3556	adc	r13,QWORD[40+rdi]
3557	adc	r14,QWORD[48+rdi]
3558	adc	r15,QWORD[56+rdi]
3559	lea	rdi,[64+rdi]
3560	sbb	rax,rax	; capture the new carry as 0/-1
3561	sub	rcx,8	; rcx was 0 after the loop, so rcx = -8
3562
3563	xor	rsi,rsi	; rsi = 0, CF = OF = 0
3564	mov	QWORD[((16+8))+rsp],rax
3565	jmp	NEAR $L$sqrx8x_tail
3566
3567ALIGN	32
3568$L$sqrx8x_tail_done:
3569	xor	rax,rax
3570	add	r8,QWORD[((24+8))+rsp]	; add the top-most carry parked at the start of the outer iteration
3571	adc	r9,0
3572	adc	r10,0
3573	adc	r11,0
3574	adc	r12,0
3575	adc	r13,0
3576	adc	r14,0
3577	adc	r15,0
3578	adc	rax,0	; rax collects the carry out of r15
3579
3580	sub	rsi,QWORD[((16+8))+rsp]	; rsi is 0: restore the saved tail carry into CF
3581$L$sqrx8x_no_tail:
3582	adc	r8,QWORD[rdi]	; add the top 8 words of the current t[] span (CF = 0 when reached via the jae above)
3583DB	102,72,15,126,217	; movq rcx,xmm3 (hand-encoded)
3584	adc	r9,QWORD[8+rdi]
3585	mov	rsi,QWORD[56+rbp]	; load the last limb of the current n[] window before rbp is reset
3586DB	102,72,15,126,213	; movq rbp,xmm2 (hand-encoded): reset rbp from xmm2
3587	adc	r10,QWORD[16+rdi]
3588	adc	r11,QWORD[24+rdi]
3589	adc	r12,QWORD[32+rdi]
3590	adc	r13,QWORD[40+rdi]
3591	adc	r14,QWORD[48+rdi]
3592	adc	r15,QWORD[56+rdi]
3593	adc	rax,0	; rax = top-most carry for the next outer iteration
3594
3595	mov	rbx,QWORD[((32+8))+rsp]	; reload the per-word multiplier
3596	mov	rdx,QWORD[64+rcx*1+rdi]	; preload t[0] of the next window
3597
3598	mov	QWORD[rdi],r8	; write back the 8 updated words
3599	lea	r8,[64+rdi]	; r8 = end of the span just written (r8 is free after the store)
3600	mov	QWORD[8+rdi],r9
3601	mov	QWORD[16+rdi],r10
3602	mov	QWORD[24+rdi],r11
3603	mov	QWORD[32+rdi],r12
3604	mov	QWORD[40+rdi],r13
3605	mov	QWORD[48+rdi],r14
3606	mov	QWORD[56+rdi],r15
3607
3608	lea	rdi,[64+rcx*1+rdi]	; rdi = start of the next t[] window
3609	cmp	r8,QWORD[((8+8))+rsp]	; stop once the written span reaches the saved end of t[]
3610	jb	NEAR $L$sqrx8x_reduction_loop
3611	DB	0F3h,0C3h		;repret
3612
3613
3614ALIGN	32
3615
; __bn_postx4x_internal
;
; Masked subtraction, four limbs per iteration:
;     out[i] = t[i] + ((~n[i]) & mask)   with carry propagated across limbs
; where mask = -rax (all-ones when rax == 1 on entry, zero when rax == 0).
; The first limb uses ~(n[0]-1), which equals -n[0], so with an all-ones mask
; the chain computes t - n; with a zero mask it copies t unchanged.
;
; In:   rbp -> n[]; rdi -> t[] (source); rcx = negative byte count
;       (rcx >> 5 is the negated number of 4-limb groups); rax = select flag;
;       xmm1 = destination pointer (presumably the result pointer saved by the
;       caller outside this view -- confirm)
; Out:  rdx and rdi advanced past the processed limbs; rsi = xmm1;
;       r9 = -rcx(entry); r10 = rcx(entry)
; Clobbers: rax, rcx, r8, r12-r15, rbp, flags
3616__bn_postx4x_internal:
3617
3618	mov	r12,QWORD[rbp]	; n[0]
3619	mov	r10,rcx	; keep two copies of the negative count
3620	mov	r9,rcx
3621	neg	rax	; rax = 0 or all-ones mask
3622	sar	rcx,3+2	; bytes -> groups of 4 qwords (still negative)
3623
3624DB	102,72,15,126,202	; movq rdx,xmm1 (hand-encoded): destination pointer
3625DB	102,72,15,126,206	; movq rsi,xmm1 (hand-encoded): same value in rsi
3626	dec	r12	; so that the NOT in andn yields -n[0] rather than ~n[0]
3627	mov	r13,QWORD[8+rbp]
3628	xor	r8,r8	; r8 = 0: no incoming carry
3629	mov	r14,QWORD[16+rbp]
3630	mov	r15,QWORD[24+rbp]
3631	jmp	NEAR $L$sqrx4x_sub_entry
3632
3633ALIGN	16
3634$L$sqrx4x_sub:
3635	mov	r12,QWORD[rbp]	; next four limbs of n[]
3636	mov	r13,QWORD[8+rbp]
3637	mov	r14,QWORD[16+rbp]
3638	mov	r15,QWORD[24+rbp]
3639$L$sqrx4x_sub_entry:
3640	andn	r12,r12,rax	; r12 = ~r12 & mask
3641	lea	rbp,[32+rbp]
3642	andn	r13,r13,rax
3643	andn	r14,r14,rax
3644	andn	r15,r15,rax
3645
3646	neg	r8	; r8 is 0/-1: move the saved carry into CF
3647	adc	r12,QWORD[rdi]
3648	adc	r13,QWORD[8+rdi]
3649	adc	r14,QWORD[16+rdi]
3650	adc	r15,QWORD[24+rdi]
3651	mov	QWORD[rdx],r12
3652	lea	rdi,[32+rdi]
3653	mov	QWORD[8+rdx],r13
3654	sbb	r8,r8	; save CF as 0/-1 (the inc below would not preserve it for the next adc chain)
3655	mov	QWORD[16+rdx],r14
3656	mov	QWORD[24+rdx],r15
3657	lea	rdx,[32+rdx]
3658
3659	inc	rcx
3660	jnz	NEAR $L$sqrx4x_sub
3661
3662	neg	r9	; r9 = positive byte count again
3663
3664	DB	0F3h,0C3h		;repret
3665
3666
3667global	GFp_bn_scatter5
3668
3669ALIGN	16
; GFp_bn_scatter5
;
; Copies edx qwords from [rcx] into a table at r8, placing word i at
; r8 + r9*8 + i*256 (i.e. one 8-byte slot per 256-byte row, slot index r9).
; Register use follows the Win64 argument order: rcx, edx, r8, r9.
; Does nothing when edx == 0. Clobbers rax, rcx, rdx, r8, flags.
3670GFp_bn_scatter5:
3671
3672	cmp	edx,0
3673	jz	NEAR $L$scatter_epilogue	; nothing to copy
3674	lea	r8,[r9*8+r8]	; r8 = address of slot r9 in the first row
3675$L$scatter:
3676	mov	rax,QWORD[rcx]
3677	lea	rcx,[8+rcx]	; next source qword
3678	mov	QWORD[r8],rax
3679	lea	r8,[256+r8]	; same slot, next 256-byte row
3680	sub	edx,1
3681	jnz	NEAR $L$scatter
3682$L$scatter_epilogue:
3683	DB	0F3h,0C3h		;repret
3684
3685
3686
3687global	GFp_bn_gather5
3688
3689ALIGN	32
; GFp_bn_gather5
;
; Reads back one column of the table laid out by the scatter routine:
; for each of edx rows (256 bytes = 32 qword slots per row) it writes the
; qword in slot r9d to [rcx].
; Register use follows the Win64 argument order: rcx = out, edx = row count,
; r8 = table, r9d = slot index.
;
; The slot is selected with masks rather than an indexed load: 16 XMMWORD
; masks (32 qword lanes, lane k all-ones iff k == r9d) are built on the stack,
; then every row is ANDed with the masks and ORed together. All 256 bytes of
; each row are read regardless of the index.
;
; The first two instructions are hand-encoded so their sizes are fixed;
; the unwind codes in $L$SEH_info_GFp_bn_gather5 reference prologue offsets
; 0x04 and 0x0b.
; Clobbers rax, rcx, rdx, r10, r11, xmm0-xmm5, flags; rsp is restored from r10.
3690GFp_bn_gather5:
3691
3692$L$SEH_begin_GFp_bn_gather5:
3693
3694DB	0x4c,0x8d,0x14,0x24	; lea r10,[rsp]: keep the entry rsp
3695
3696DB	0x48,0x81,0xec,0x08,0x01,0x00,0x00	; sub rsp,0x108: room for the masks
3697	lea	rax,[$L$inc]
3698	and	rsp,-16	; 16-byte align for the movdqa stores below
3699
3700	movd	xmm5,r9d
3701	movdqa	xmm0,XMMWORD[rax]	; xmm0 = {0,0,1,1}
3702	movdqa	xmm1,XMMWORD[16+rax]	; xmm1 = {2,2,2,2}
3703	lea	r11,[128+r8]	; bias both pointers by 128 so all displacements fit in -128..112
3704	lea	rax,[128+rsp]
3705
3706	pshufd	xmm5,xmm5,0	; broadcast the index to all four dwords
3707	movdqa	xmm4,xmm1	; xmm4 keeps the constant step {2,2,2,2}
3708	movdqa	xmm2,xmm1
; Mask generation: xmm0..xmm3 rotate through counter values
; {2j,2j,2j+1,2j+1}; each is compared with the index and stored.
3709	paddd	xmm1,xmm0
3710	pcmpeqd	xmm0,xmm5
3711	movdqa	xmm3,xmm4
3712
3713	paddd	xmm2,xmm1
3714	pcmpeqd	xmm1,xmm5
3715	movdqa	XMMWORD[(-128)+rax],xmm0
3716	movdqa	xmm0,xmm4
3717
3718	paddd	xmm3,xmm2
3719	pcmpeqd	xmm2,xmm5
3720	movdqa	XMMWORD[(-112)+rax],xmm1
3721	movdqa	xmm1,xmm4
3722
3723	paddd	xmm0,xmm3
3724	pcmpeqd	xmm3,xmm5
3725	movdqa	XMMWORD[(-96)+rax],xmm2
3726	movdqa	xmm2,xmm4
3727	paddd	xmm1,xmm0
3728	pcmpeqd	xmm0,xmm5
3729	movdqa	XMMWORD[(-80)+rax],xmm3
3730	movdqa	xmm3,xmm4
3731
3732	paddd	xmm2,xmm1
3733	pcmpeqd	xmm1,xmm5
3734	movdqa	XMMWORD[(-64)+rax],xmm0
3735	movdqa	xmm0,xmm4
3736
3737	paddd	xmm3,xmm2
3738	pcmpeqd	xmm2,xmm5
3739	movdqa	XMMWORD[(-48)+rax],xmm1
3740	movdqa	xmm1,xmm4
3741
3742	paddd	xmm0,xmm3
3743	pcmpeqd	xmm3,xmm5
3744	movdqa	XMMWORD[(-32)+rax],xmm2
3745	movdqa	xmm2,xmm4
3746	paddd	xmm1,xmm0
3747	pcmpeqd	xmm0,xmm5
3748	movdqa	XMMWORD[(-16)+rax],xmm3
3749	movdqa	xmm3,xmm4
3750
3751	paddd	xmm2,xmm1
3752	pcmpeqd	xmm1,xmm5
3753	movdqa	XMMWORD[rax],xmm0
3754	movdqa	xmm0,xmm4
3755
3756	paddd	xmm3,xmm2
3757	pcmpeqd	xmm2,xmm5
3758	movdqa	XMMWORD[16+rax],xmm1
3759	movdqa	xmm1,xmm4
3760
3761	paddd	xmm0,xmm3
3762	pcmpeqd	xmm3,xmm5
3763	movdqa	XMMWORD[32+rax],xmm2
3764	movdqa	xmm2,xmm4
3765	paddd	xmm1,xmm0
3766	pcmpeqd	xmm0,xmm5
3767	movdqa	XMMWORD[48+rax],xmm3
3768	movdqa	xmm3,xmm4
3769
3770	paddd	xmm2,xmm1
3771	pcmpeqd	xmm1,xmm5
3772	movdqa	XMMWORD[64+rax],xmm0
3773	movdqa	xmm0,xmm4
3774
3775	paddd	xmm3,xmm2
3776	pcmpeqd	xmm2,xmm5
3777	movdqa	XMMWORD[80+rax],xmm1
3778	movdqa	xmm1,xmm4
3779
3780	paddd	xmm0,xmm3
3781	pcmpeqd	xmm3,xmm5
3782	movdqa	XMMWORD[96+rax],xmm2
3783	movdqa	xmm2,xmm4
3784	movdqa	XMMWORD[112+rax],xmm3	; 16th and last mask
3785	jmp	NEAR $L$gather
3786
3787ALIGN	32
; One iteration per row: AND the 16 XMMWORDs of the row with the 16 masks,
; accumulating into xmm4/xmm5, then fold to a single qword.
3788$L$gather:
3789	pxor	xmm4,xmm4	; clear both accumulators
3790	pxor	xmm5,xmm5
3791	movdqa	xmm0,XMMWORD[((-128))+r11]
3792	movdqa	xmm1,XMMWORD[((-112))+r11]
3793	movdqa	xmm2,XMMWORD[((-96))+r11]
3794	pand	xmm0,XMMWORD[((-128))+rax]
3795	movdqa	xmm3,XMMWORD[((-80))+r11]
3796	pand	xmm1,XMMWORD[((-112))+rax]
3797	por	xmm4,xmm0
3798	pand	xmm2,XMMWORD[((-96))+rax]
3799	por	xmm5,xmm1
3800	pand	xmm3,XMMWORD[((-80))+rax]
3801	por	xmm4,xmm2
3802	por	xmm5,xmm3
3803	movdqa	xmm0,XMMWORD[((-64))+r11]
3804	movdqa	xmm1,XMMWORD[((-48))+r11]
3805	movdqa	xmm2,XMMWORD[((-32))+r11]
3806	pand	xmm0,XMMWORD[((-64))+rax]
3807	movdqa	xmm3,XMMWORD[((-16))+r11]
3808	pand	xmm1,XMMWORD[((-48))+rax]
3809	por	xmm4,xmm0
3810	pand	xmm2,XMMWORD[((-32))+rax]
3811	por	xmm5,xmm1
3812	pand	xmm3,XMMWORD[((-16))+rax]
3813	por	xmm4,xmm2
3814	por	xmm5,xmm3
3815	movdqa	xmm0,XMMWORD[r11]
3816	movdqa	xmm1,XMMWORD[16+r11]
3817	movdqa	xmm2,XMMWORD[32+r11]
3818	pand	xmm0,XMMWORD[rax]
3819	movdqa	xmm3,XMMWORD[48+r11]
3820	pand	xmm1,XMMWORD[16+rax]
3821	por	xmm4,xmm0
3822	pand	xmm2,XMMWORD[32+rax]
3823	por	xmm5,xmm1
3824	pand	xmm3,XMMWORD[48+rax]
3825	por	xmm4,xmm2
3826	por	xmm5,xmm3
3827	movdqa	xmm0,XMMWORD[64+r11]
3828	movdqa	xmm1,XMMWORD[80+r11]
3829	movdqa	xmm2,XMMWORD[96+r11]
3830	pand	xmm0,XMMWORD[64+rax]
3831	movdqa	xmm3,XMMWORD[112+r11]
3832	pand	xmm1,XMMWORD[80+rax]
3833	por	xmm4,xmm0
3834	pand	xmm2,XMMWORD[96+rax]
3835	por	xmm5,xmm1
3836	pand	xmm3,XMMWORD[112+rax]
3837	por	xmm4,xmm2
3838	por	xmm5,xmm3
3839	por	xmm4,xmm5	; merge the two accumulators
3840	lea	r11,[256+r11]	; next row
3841	pshufd	xmm0,xmm4,0x4e	; swap the two qword halves
3842	por	xmm0,xmm4	; low qword = OR of both halves = selected entry
3843	movq	QWORD[rcx],xmm0
3844	lea	rcx,[8+rcx]
3845	sub	edx,1
3846	jnz	NEAR $L$gather
3847
3848	lea	rsp,[r10]	; restore the entry rsp saved in r10
3849
3850	DB	0F3h,0C3h		;repret
3851$L$SEH_end_GFp_bn_gather5:
3852
3853
3854ALIGN	64
; $L$inc: two 16-byte constants used to build the index-comparison masks.
; The first row is the starting counter pair, the second the per-step increment.
; Loaded with movdqa, so the 16-byte alignment of this label matters.
3855$L$inc:
3856	DD	0,0,1,1
3857	DD	2,2,2,2
; NUL-terminated ASCII identification string:
; "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro@openssl.org>"
3858DB	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
3859DB	112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115
3860DB	99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111
3861DB	114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79
3862DB	71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111
3863DB	112,101,110,115,115,108,46,111,114,103,62,0
3864EXTERN	__imp_RtlVirtualUnwind
3865
3866ALIGN	16
; mul_handler
;
; Win64 language-specific exception handler shared by the functions whose
; .xdata entries name it. It repairs the CONTEXT record so that unwinding
; can continue past a function that replaced rsp with a computed value.
;
; In (Win64 handler convention): r8 -> CONTEXT, r9 -> DISPATCHER_CONTEXT.
;   rcx/rdx (exception record, establisher frame) are not read.
; HandlerData (DISPATCHER_CONTEXT+56) points at the three image-relative
; labels stored after the handler RVA in .xdata:
;   [0] end of prologue, [4] start of body, [8] epilogue.
; Numeric offsets into CONTEXT below follow the Win64 CONTEXT layout
; (120 Rax, 144 Rbx, 152 Rsp, 160 Rbp, 168 Rsi, 176 Rdi, 192 R9,
;  216..240 R12..R15, 248 Rip).
; Returns eax = 1.
3867mul_handler:
3868	push	rsi
3869	push	rdi
3870	push	rbx
3871	push	rbp
3872	push	r12
3873	push	r13
3874	push	r14
3875	push	r15
3876	pushfq
3877	sub	rsp,64	; shadow space + 4 stack arguments for the call below
3878
3879	mov	rax,QWORD[120+r8]	; context->Rax (the functions keep the entry rsp in rax during the prologue)
3880	mov	rbx,QWORD[248+r8]	; context->Rip
3881
3882	mov	rsi,QWORD[8+r9]	; disp->ImageBase
3883	mov	r11,QWORD[56+r9]	; disp->HandlerData
3884
3885	mov	r10d,DWORD[r11]	; HandlerData[0]
3886	lea	r10,[r10*1+rsi]	; absolute address of the first label
3887	cmp	rbx,r10
3888	jb	NEAR $L$common_seh_tail	; Rip before it: nothing pushed yet, rax is the frame
3889
3890	mov	r10d,DWORD[4+r11]	; HandlerData[1]
3891	lea	r10,[r10*1+rsi]
3892	cmp	rbx,r10
3893	jb	NEAR $L$common_pop_regs	; still in the prologue: rax holds the entry rsp
3894
3895	mov	rax,QWORD[152+r8]	; context->Rsp
3896
3897	mov	r10d,DWORD[8+r11]	; HandlerData[2]
3898	lea	r10,[r10*1+rsi]
3899	cmp	rbx,r10
3900	jae	NEAR $L$common_seh_tail	; at or past the epilogue label: registers already restored
3901
3902	lea	r10,[$L$mul_epilogue]
3903	cmp	rbx,r10
3904	ja	NEAR $L$body_40	; Rip beyond $L$mul_epilogue: use the fixed-slot layout
3905
3906	mov	r10,QWORD[192+r8]	; context->R9
3907	mov	rax,QWORD[8+r10*8+rax]	; saved entry rsp at [8+r9*8+rsp], as stored before $L$mul_body
3908
3909	jmp	NEAR $L$common_pop_regs
3910
3911$L$body_40:
3912	mov	rax,QWORD[40+rax]	; saved entry rsp at [40+rsp] (stored by functions outside this view -- confirm)
3913$L$common_pop_regs:
3914	mov	rbx,QWORD[((-8))+rax]	; reload the six registers pushed below the entry rsp
3915	mov	rbp,QWORD[((-16))+rax]
3916	mov	r12,QWORD[((-24))+rax]
3917	mov	r13,QWORD[((-32))+rax]
3918	mov	r14,QWORD[((-40))+rax]
3919	mov	r15,QWORD[((-48))+rax]
3920	mov	QWORD[144+r8],rbx	; context->Rbx
3921	mov	QWORD[160+r8],rbp	; context->Rbp
3922	mov	QWORD[216+r8],r12	; context->R12
3923	mov	QWORD[224+r8],r13	; context->R13
3924	mov	QWORD[232+r8],r14	; context->R14
3925	mov	QWORD[240+r8],r15	; context->R15
3926
3927$L$common_seh_tail:
3928	mov	rdi,QWORD[8+rax]	; rdi/rsi were spilled to [8+rsp]/[16+rsp] by the WIN64 prologue
3929	mov	rsi,QWORD[16+rax]
3930	mov	QWORD[152+r8],rax	; context->Rsp
3931	mov	QWORD[168+r8],rsi	; context->Rsi
3932	mov	QWORD[176+r8],rdi	; context->Rdi
3933
3934	mov	rdi,QWORD[40+r9]	; disp->ContextRecord (copy destination)
3935	mov	rsi,r8	; copy source: the repaired context
3936	mov	ecx,154	; 154 qwords = 1232 bytes
3937	DD	0xa548f3fc	; bytes FC F3 48 A5: cld; rep movsq
3938
3939	mov	rsi,r9
3940	xor	rcx,rcx	; arg1 = 0
3941	mov	rdx,QWORD[8+rsi]	; arg2 = disp->ImageBase
3942	mov	r8,QWORD[rsi]	; arg3 = disp->ControlPc
3943	mov	r9,QWORD[16+rsi]	; arg4 = disp->FunctionEntry
3944	mov	r10,QWORD[40+rsi]	; disp->ContextRecord
3945	lea	r11,[56+rsi]	; &disp->HandlerData
3946	lea	r12,[24+rsi]	; &disp->EstablisherFrame
3947	mov	QWORD[32+rsp],r10	; arg5
3948	mov	QWORD[40+rsp],r11	; arg6
3949	mov	QWORD[48+rsp],r12	; arg7
3950	mov	QWORD[56+rsp],rcx	; arg8 = 0
3951	call	QWORD[__imp_RtlVirtualUnwind]
3952
3953	mov	eax,1	; return value 1
3954	add	rsp,64
3955	popfq
3956	pop	r15
3957	pop	r14
3958	pop	r13
3959	pop	r12
3960	pop	rbp
3961	pop	rbx
3962	pop	rdi
3963	pop	rsi
3964	DB	0F3h,0C3h		;repret
3965
3966
; Function table: one entry of three image-relative addresses per function
; (begin label, end label, unwind-info label in .xdata).
3967section	.pdata rdata align=4
3968ALIGN	4
3969	DD	$L$SEH_begin_GFp_bn_mul_mont_gather5 wrt ..imagebase
3970	DD	$L$SEH_end_GFp_bn_mul_mont_gather5 wrt ..imagebase
3971	DD	$L$SEH_info_GFp_bn_mul_mont_gather5 wrt ..imagebase
3972
3973	DD	$L$SEH_begin_bn_mul4x_mont_gather5 wrt ..imagebase
3974	DD	$L$SEH_end_bn_mul4x_mont_gather5 wrt ..imagebase
3975	DD	$L$SEH_info_bn_mul4x_mont_gather5 wrt ..imagebase
3976
3977	DD	$L$SEH_begin_GFp_bn_power5 wrt ..imagebase
3978	DD	$L$SEH_end_GFp_bn_power5 wrt ..imagebase
3979	DD	$L$SEH_info_GFp_bn_power5 wrt ..imagebase
3980
3981	DD	$L$SEH_begin_bn_from_mont8x wrt ..imagebase
3982	DD	$L$SEH_end_bn_from_mont8x wrt ..imagebase
3983	DD	$L$SEH_info_bn_from_mont8x wrt ..imagebase
3984	DD	$L$SEH_begin_bn_mulx4x_mont_gather5 wrt ..imagebase
3985	DD	$L$SEH_end_bn_mulx4x_mont_gather5 wrt ..imagebase
3986	DD	$L$SEH_info_bn_mulx4x_mont_gather5 wrt ..imagebase
3987
; NOTE(review): this entry mixes naming: begin/end use "bn_powerx5" while the
; info label uses "GFp_bn_powerx5". The begin/end labels are defined outside
; this view -- confirm they exist under exactly these names.
3988	DD	$L$SEH_begin_bn_powerx5 wrt ..imagebase
3989	DD	$L$SEH_end_bn_powerx5 wrt ..imagebase
3990	DD	$L$SEH_info_GFp_bn_powerx5 wrt ..imagebase
3991	DD	$L$SEH_begin_GFp_bn_gather5 wrt ..imagebase
3992	DD	$L$SEH_end_GFp_bn_gather5 wrt ..imagebase
3993	DD	$L$SEH_info_GFp_bn_gather5 wrt ..imagebase
3994
; Unwind information referenced from .pdata.
; Entries using mul_handler share one shape:
;   DB 9,0,0,0     header byte 9 = version 1 with the exception-handler flag,
;                  prologue size 0, no unwind codes, no frame register
;   DD handler     image-relative address of mul_handler
;   DD a,b,c       handler data read by mul_handler as HandlerData[0..2]:
;                  end-of-prologue label, body label, epilogue label
3995section	.xdata rdata align=8
3996ALIGN	8
3997$L$SEH_info_GFp_bn_mul_mont_gather5:
3998DB	9,0,0,0
3999	DD	mul_handler wrt ..imagebase
4000	DD	$L$mul_body wrt ..imagebase,$L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase	; first two labels are the same here
4001ALIGN	8
4002$L$SEH_info_bn_mul4x_mont_gather5:
4003DB	9,0,0,0
4004	DD	mul_handler wrt ..imagebase
4005	DD	$L$mul4x_prologue wrt ..imagebase,$L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase
4006ALIGN	8
4007$L$SEH_info_GFp_bn_power5:
4008DB	9,0,0,0
4009	DD	mul_handler wrt ..imagebase
4010	DD	$L$power5_prologue wrt ..imagebase,$L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase
4011ALIGN	8
4012$L$SEH_info_bn_from_mont8x:
4013DB	9,0,0,0
4014	DD	mul_handler wrt ..imagebase
4015	DD	$L$from_prologue wrt ..imagebase,$L$from_body wrt ..imagebase,$L$from_epilogue wrt ..imagebase
4016ALIGN	8
4017$L$SEH_info_bn_mulx4x_mont_gather5:
4018DB	9,0,0,0
4019	DD	mul_handler wrt ..imagebase
4020	DD	$L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase
4021ALIGN	8
4022$L$SEH_info_GFp_bn_powerx5:
4023DB	9,0,0,0
4024	DD	mul_handler wrt ..imagebase
4025	DD	$L$powerx5_prologue wrt ..imagebase,$L$powerx5_body wrt ..imagebase,$L$powerx5_epilogue wrt ..imagebase
4026ALIGN	8
; The gather routine uses plain unwind codes instead of a handler; they
; describe its two hand-encoded prologue instructions.
4027$L$SEH_info_GFp_bn_gather5:
4028DB	0x01,0x0b,0x03,0x0a	; version 1, no flags; prologue is 0x0b bytes; 3 code slots; frame register r10, offset 0
4029DB	0x0b,0x01,0x21,0x00	; at prologue offset 0x0b: large stack allocation, 0x21*8 = 0x108 bytes (the sub rsp,0x108)
4030DB	0x04,0xa3,0x00,0x00	; at prologue offset 0x04: set frame register to r10 (the lea r10,[rsp]); last two bytes are padding
4031ALIGN	8
4032