xref: /freebsd/sys/crypto/openssl/amd64/rsaz-avx512.S (revision 61e21613)
1/* Do not modify. This file is auto-generated from rsaz-avx512.pl. */
2
/*
 * ossl_rsaz_avx512ifma_eligible: report whether all CPU feature bits tested
 * below are set.
 *
 * In:   no register arguments; reads the 32-bit word at OPENSSL_ia32cap_P+8.
 * Out:  %eax = 0 unless every bit of the mask 2149777408 (0x80230000,
 *       i.e. bits 16, 17, 21 and 31) is set in that word; otherwise
 *       %eax = the mask value itself (non-zero).
 * Clobbers: %ecx, flags.
 *
 * NOTE(review): the four bits presumably are the AVX512F, AVX512DQ,
 * AVX512IFMA and AVX512VL flags of CPUID leaf 7 EBX - confirm against the
 * OPENSSL_ia32cap documentation.
 * NOTE(review): every line of this listing starts with a decimal that looks
 * like a line number fused in by the xref export.  It is kept verbatim here
 * and has to be stripped before the file can be assembled.
 */
3.globl	ossl_rsaz_avx512ifma_eligible
4.type	ossl_rsaz_avx512ifma_eligible,@function
5.align	32
6ossl_rsaz_avx512ifma_eligible:
7	movl	OPENSSL_ia32cap_P+8(%rip),%ecx
	/* default answer: 0 */
8	xorl	%eax,%eax
	/* keep only the required bits, then demand that all of them are present */
9	andl	$2149777408,%ecx
10	cmpl	$2149777408,%ecx
	/* on equality return the (non-zero) mask, otherwise keep the 0 */
11	cmovel	%ecx,%eax
	/* bytes F3 C3 encode "rep ret" */
12	.byte	0xf3,0xc3
13.size	ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible
14.text
15
/*
 * ossl_rsaz_amm52x20_x1_256: one 20-digit multiply-and-reduce in radix 2^52,
 * built on vpmadd52luq / vpmadd52huq with 256-bit registers.
 * NOTE(review): the interleaved "multiply by a digit, add a multiple of the
 * (%rcx) operand, shift down by 52 bits" structure looks like word-serial
 * Montgomery multiplication ("amm" presumably = Almost Montgomery
 * Multiplication) - the file itself does not say so, confirm with callers.
 *
 * Register roles on entry, as used by the code below:
 *   %rdi  destination; 5 x 32 bytes (20 qwords) are stored at the end
 *   %rsi  first operand, 20 qwords, read as five 32-byte memory operands
 *   %rdx  second operand, 20 qwords, consumed one qword per step
 *   %rcx  third operand, 20 qwords, multiplied by the per-step factor
 *         (presumably the modulus - TODO confirm)
 *   %r8   64-bit constant, passed by value, used to derive that factor
 * Out:  nothing in registers; the 20 result digits, each masked to 52 bits,
 *       are written to (%rdi).
 * Saved and restored: %rbx, %rbp, %r12-%r15.
 * Clobbers: %rax, %rcx, %rdx, %r8-%r11, %ymm0, %ymm1, %ymm3, %ymm4,
 *           %ymm16-%ymm19, %ymm24-%ymm28, %k1-%k5, flags.
 */
16.globl	ossl_rsaz_amm52x20_x1_256
17.type	ossl_rsaz_amm52x20_x1_256,@function
18.align	32
19ossl_rsaz_amm52x20_x1_256:
20.cfi_startproc
/* bytes F3 0F 1E FA encode endbr64 */
21.byte	243,15,30,250
22	pushq	%rbx
23.cfi_adjust_cfa_offset	8
24.cfi_offset	%rbx,-16
25	pushq	%rbp
26.cfi_adjust_cfa_offset	8
27.cfi_offset	%rbp,-24
28	pushq	%r12
29.cfi_adjust_cfa_offset	8
30.cfi_offset	%r12,-32
31	pushq	%r13
32.cfi_adjust_cfa_offset	8
33.cfi_offset	%r13,-40
34	pushq	%r14
35.cfi_adjust_cfa_offset	8
36.cfi_offset	%r14,-48
37	pushq	%r15
38.cfi_adjust_cfa_offset	8
39.cfi_offset	%r15,-56
40.Lrsaz_amm52x20_x1_256_body:
41
42
	/* %ymm0 stays zero; %ymm1,%ymm16-%ymm19 form the 20-qword accumulator */
43	vpxord	%ymm0,%ymm0,%ymm0
44	vmovdqa64	%ymm0,%ymm1
45	vmovdqa64	%ymm0,%ymm16
46	vmovdqa64	%ymm0,%ymm17
47	vmovdqa64	%ymm0,%ymm18
48	vmovdqa64	%ymm0,%ymm19
49
	/* %r9 = scalar accumulator for the lowest digit */
50	xorl	%r9d,%r9d
51
	/* %rdx is needed by mulx, so the second-operand pointer moves to %r11 */
52	movq	%rdx,%r11
	/* %rax = 2^52 - 1, the digit mask */
53	movq	$0xfffffffffffff,%rax
54
55
	/* 5 iterations x 4 unrolled steps = 20 digits of the second operand */
56	movl	$5,%ebx
57
58.align	32
59.Lloop5:
	/* ---- step for the digit at 0(%r11) ---- */
60	movq	0(%r11),%r13
61
	/* %ymm3 = digit broadcast to all lanes */
62	vpbroadcastq	%r13,%ymm3
	/* %r12:%r13 = digit times the qword at 0(%rsi); add into %r10:%r9 */
63	movq	0(%rsi),%rdx
64	mulxq	%r13,%r13,%r12
65	addq	%r13,%r9
66	movq	%r12,%r10
67	adcq	$0,%r10
68
	/* factor = low 52 bits of (%r8 times %r9) */
69	movq	%r8,%r13
70	imulq	%r9,%r13
71	andq	%rax,%r13
72
	/* %ymm4 = factor broadcast; add factor times the qword at 0(%rcx) */
73	vpbroadcastq	%r13,%ymm4
74	movq	0(%rcx),%rdx
75	mulxq	%r13,%r13,%r12
76	addq	%r13,%r9
77	adcq	%r12,%r10
78
	/* %r9 = (%r10:%r9) >> 52 */
79	shrq	$52,%r9
80	salq	$12,%r10
81	orq	%r10,%r9
82
	/* accumulate the low 52-bit product halves: (%rsi) by %ymm3 ... */
83	vpmadd52luq	0(%rsi),%ymm3,%ymm1
84	vpmadd52luq	32(%rsi),%ymm3,%ymm16
85	vpmadd52luq	64(%rsi),%ymm3,%ymm17
86	vpmadd52luq	96(%rsi),%ymm3,%ymm18
87	vpmadd52luq	128(%rsi),%ymm3,%ymm19
88
	/* ... and (%rcx) by %ymm4 */
89	vpmadd52luq	0(%rcx),%ymm4,%ymm1
90	vpmadd52luq	32(%rcx),%ymm4,%ymm16
91	vpmadd52luq	64(%rcx),%ymm4,%ymm17
92	vpmadd52luq	96(%rcx),%ymm4,%ymm18
93	vpmadd52luq	128(%rcx),%ymm4,%ymm19
94
95
	/* shift the 20-qword accumulator down by one qword; zero enters at the top */
96	valignq	$1,%ymm1,%ymm16,%ymm1
97	valignq	$1,%ymm16,%ymm17,%ymm16
98	valignq	$1,%ymm17,%ymm18,%ymm17
99	valignq	$1,%ymm18,%ymm19,%ymm18
100	valignq	$1,%ymm19,%ymm0,%ymm19
101
	/* fold the new lowest qword into the scalar accumulator */
102	vmovq	%xmm1,%r13
103	addq	%r13,%r9
104
	/* accumulate the high product halves into the shifted accumulator */
105	vpmadd52huq	0(%rsi),%ymm3,%ymm1
106	vpmadd52huq	32(%rsi),%ymm3,%ymm16
107	vpmadd52huq	64(%rsi),%ymm3,%ymm17
108	vpmadd52huq	96(%rsi),%ymm3,%ymm18
109	vpmadd52huq	128(%rsi),%ymm3,%ymm19
110
111	vpmadd52huq	0(%rcx),%ymm4,%ymm1
112	vpmadd52huq	32(%rcx),%ymm4,%ymm16
113	vpmadd52huq	64(%rcx),%ymm4,%ymm17
114	vpmadd52huq	96(%rcx),%ymm4,%ymm18
115	vpmadd52huq	128(%rcx),%ymm4,%ymm19
	/* ---- step for the digit at 8(%r11): same sequence as above ---- */
116	movq	8(%r11),%r13
117
118	vpbroadcastq	%r13,%ymm3
119	movq	0(%rsi),%rdx
120	mulxq	%r13,%r13,%r12
121	addq	%r13,%r9
122	movq	%r12,%r10
123	adcq	$0,%r10
124
125	movq	%r8,%r13
126	imulq	%r9,%r13
127	andq	%rax,%r13
128
129	vpbroadcastq	%r13,%ymm4
130	movq	0(%rcx),%rdx
131	mulxq	%r13,%r13,%r12
132	addq	%r13,%r9
133	adcq	%r12,%r10
134
135	shrq	$52,%r9
136	salq	$12,%r10
137	orq	%r10,%r9
138
139	vpmadd52luq	0(%rsi),%ymm3,%ymm1
140	vpmadd52luq	32(%rsi),%ymm3,%ymm16
141	vpmadd52luq	64(%rsi),%ymm3,%ymm17
142	vpmadd52luq	96(%rsi),%ymm3,%ymm18
143	vpmadd52luq	128(%rsi),%ymm3,%ymm19
144
145	vpmadd52luq	0(%rcx),%ymm4,%ymm1
146	vpmadd52luq	32(%rcx),%ymm4,%ymm16
147	vpmadd52luq	64(%rcx),%ymm4,%ymm17
148	vpmadd52luq	96(%rcx),%ymm4,%ymm18
149	vpmadd52luq	128(%rcx),%ymm4,%ymm19
150
151
152	valignq	$1,%ymm1,%ymm16,%ymm1
153	valignq	$1,%ymm16,%ymm17,%ymm16
154	valignq	$1,%ymm17,%ymm18,%ymm17
155	valignq	$1,%ymm18,%ymm19,%ymm18
156	valignq	$1,%ymm19,%ymm0,%ymm19
157
158	vmovq	%xmm1,%r13
159	addq	%r13,%r9
160
161	vpmadd52huq	0(%rsi),%ymm3,%ymm1
162	vpmadd52huq	32(%rsi),%ymm3,%ymm16
163	vpmadd52huq	64(%rsi),%ymm3,%ymm17
164	vpmadd52huq	96(%rsi),%ymm3,%ymm18
165	vpmadd52huq	128(%rsi),%ymm3,%ymm19
166
167	vpmadd52huq	0(%rcx),%ymm4,%ymm1
168	vpmadd52huq	32(%rcx),%ymm4,%ymm16
169	vpmadd52huq	64(%rcx),%ymm4,%ymm17
170	vpmadd52huq	96(%rcx),%ymm4,%ymm18
171	vpmadd52huq	128(%rcx),%ymm4,%ymm19
	/* ---- step for the digit at 16(%r11) ---- */
172	movq	16(%r11),%r13
173
174	vpbroadcastq	%r13,%ymm3
175	movq	0(%rsi),%rdx
176	mulxq	%r13,%r13,%r12
177	addq	%r13,%r9
178	movq	%r12,%r10
179	adcq	$0,%r10
180
181	movq	%r8,%r13
182	imulq	%r9,%r13
183	andq	%rax,%r13
184
185	vpbroadcastq	%r13,%ymm4
186	movq	0(%rcx),%rdx
187	mulxq	%r13,%r13,%r12
188	addq	%r13,%r9
189	adcq	%r12,%r10
190
191	shrq	$52,%r9
192	salq	$12,%r10
193	orq	%r10,%r9
194
195	vpmadd52luq	0(%rsi),%ymm3,%ymm1
196	vpmadd52luq	32(%rsi),%ymm3,%ymm16
197	vpmadd52luq	64(%rsi),%ymm3,%ymm17
198	vpmadd52luq	96(%rsi),%ymm3,%ymm18
199	vpmadd52luq	128(%rsi),%ymm3,%ymm19
200
201	vpmadd52luq	0(%rcx),%ymm4,%ymm1
202	vpmadd52luq	32(%rcx),%ymm4,%ymm16
203	vpmadd52luq	64(%rcx),%ymm4,%ymm17
204	vpmadd52luq	96(%rcx),%ymm4,%ymm18
205	vpmadd52luq	128(%rcx),%ymm4,%ymm19
206
207
208	valignq	$1,%ymm1,%ymm16,%ymm1
209	valignq	$1,%ymm16,%ymm17,%ymm16
210	valignq	$1,%ymm17,%ymm18,%ymm17
211	valignq	$1,%ymm18,%ymm19,%ymm18
212	valignq	$1,%ymm19,%ymm0,%ymm19
213
214	vmovq	%xmm1,%r13
215	addq	%r13,%r9
216
217	vpmadd52huq	0(%rsi),%ymm3,%ymm1
218	vpmadd52huq	32(%rsi),%ymm3,%ymm16
219	vpmadd52huq	64(%rsi),%ymm3,%ymm17
220	vpmadd52huq	96(%rsi),%ymm3,%ymm18
221	vpmadd52huq	128(%rsi),%ymm3,%ymm19
222
223	vpmadd52huq	0(%rcx),%ymm4,%ymm1
224	vpmadd52huq	32(%rcx),%ymm4,%ymm16
225	vpmadd52huq	64(%rcx),%ymm4,%ymm17
226	vpmadd52huq	96(%rcx),%ymm4,%ymm18
227	vpmadd52huq	128(%rcx),%ymm4,%ymm19
	/* ---- step for the digit at 24(%r11) ---- */
228	movq	24(%r11),%r13
229
230	vpbroadcastq	%r13,%ymm3
231	movq	0(%rsi),%rdx
232	mulxq	%r13,%r13,%r12
233	addq	%r13,%r9
234	movq	%r12,%r10
235	adcq	$0,%r10
236
237	movq	%r8,%r13
238	imulq	%r9,%r13
239	andq	%rax,%r13
240
241	vpbroadcastq	%r13,%ymm4
242	movq	0(%rcx),%rdx
243	mulxq	%r13,%r13,%r12
244	addq	%r13,%r9
245	adcq	%r12,%r10
246
247	shrq	$52,%r9
248	salq	$12,%r10
249	orq	%r10,%r9
250
251	vpmadd52luq	0(%rsi),%ymm3,%ymm1
252	vpmadd52luq	32(%rsi),%ymm3,%ymm16
253	vpmadd52luq	64(%rsi),%ymm3,%ymm17
254	vpmadd52luq	96(%rsi),%ymm3,%ymm18
255	vpmadd52luq	128(%rsi),%ymm3,%ymm19
256
257	vpmadd52luq	0(%rcx),%ymm4,%ymm1
258	vpmadd52luq	32(%rcx),%ymm4,%ymm16
259	vpmadd52luq	64(%rcx),%ymm4,%ymm17
260	vpmadd52luq	96(%rcx),%ymm4,%ymm18
261	vpmadd52luq	128(%rcx),%ymm4,%ymm19
262
263
264	valignq	$1,%ymm1,%ymm16,%ymm1
265	valignq	$1,%ymm16,%ymm17,%ymm16
266	valignq	$1,%ymm17,%ymm18,%ymm17
267	valignq	$1,%ymm18,%ymm19,%ymm18
268	valignq	$1,%ymm19,%ymm0,%ymm19
269
270	vmovq	%xmm1,%r13
271	addq	%r13,%r9
272
273	vpmadd52huq	0(%rsi),%ymm3,%ymm1
274	vpmadd52huq	32(%rsi),%ymm3,%ymm16
275	vpmadd52huq	64(%rsi),%ymm3,%ymm17
276	vpmadd52huq	96(%rsi),%ymm3,%ymm18
277	vpmadd52huq	128(%rsi),%ymm3,%ymm19
278
279	vpmadd52huq	0(%rcx),%ymm4,%ymm1
280	vpmadd52huq	32(%rcx),%ymm4,%ymm16
281	vpmadd52huq	64(%rcx),%ymm4,%ymm17
282	vpmadd52huq	96(%rcx),%ymm4,%ymm18
283	vpmadd52huq	128(%rcx),%ymm4,%ymm19
	/* advance by the four qwords just consumed and loop */
284	leaq	32(%r11),%r11
285	decl	%ebx
286	jne	.Lloop5
287
	/* %ymm4 now holds the digit mask 2^52-1 in every lane */
288	vmovdqa64	.Lmask52x4(%rip),%ymm4
289
	/* put the scalar accumulator %r9 back as the lowest qword of %ymm1 */
290	vpbroadcastq	%r9,%ymm3
291	vpblendd	$3,%ymm3,%ymm1,%ymm1
292
293
294
	/* carries = bits 52 and up of every digit */
295	vpsrlq	$52,%ymm1,%ymm24
296	vpsrlq	$52,%ymm16,%ymm25
297	vpsrlq	$52,%ymm17,%ymm26
298	vpsrlq	$52,%ymm18,%ymm27
299	vpsrlq	$52,%ymm19,%ymm28
300
301
	/* move each carry up one digit position; zero enters at the bottom */
302	valignq	$3,%ymm27,%ymm28,%ymm28
303	valignq	$3,%ymm26,%ymm27,%ymm27
304	valignq	$3,%ymm25,%ymm26,%ymm26
305	valignq	$3,%ymm24,%ymm25,%ymm25
306	valignq	$3,%ymm0,%ymm24,%ymm24
307
308
	/* keep the low 52 bits of every digit ... */
309	vpandq	%ymm4,%ymm1,%ymm1
310	vpandq	%ymm4,%ymm16,%ymm16
311	vpandq	%ymm4,%ymm17,%ymm17
312	vpandq	%ymm4,%ymm18,%ymm18
313	vpandq	%ymm4,%ymm19,%ymm19
314
315
	/* ... and add the carry coming from the digit below */
316	vpaddq	%ymm24,%ymm1,%ymm1
317	vpaddq	%ymm25,%ymm16,%ymm16
318	vpaddq	%ymm26,%ymm17,%ymm17
319	vpaddq	%ymm27,%ymm18,%ymm18
320	vpaddq	%ymm28,%ymm19,%ymm19
321
322
323
	/* predicate 1 (less-than): bit set where mask < digit, i.e. digit overflows 52 bits */
324	vpcmpuq	$1,%ymm1,%ymm4,%k1
325	vpcmpuq	$1,%ymm16,%ymm4,%k2
326	vpcmpuq	$1,%ymm17,%ymm4,%k3
327	vpcmpuq	$1,%ymm18,%ymm4,%k4
328	vpcmpuq	$1,%ymm19,%ymm4,%k5
329	kmovb	%k1,%r14d
330	kmovb	%k2,%r13d
331	kmovb	%k3,%r12d
332	kmovb	%k4,%r11d
333	kmovb	%k5,%r10d
334
335
	/* predicate 0 (equal): bit set where digit == mask, i.e. an incoming carry would ripple through */
336	vpcmpuq	$0,%ymm1,%ymm4,%k1
337	vpcmpuq	$0,%ymm16,%ymm4,%k2
338	vpcmpuq	$0,%ymm17,%ymm4,%k3
339	vpcmpuq	$0,%ymm18,%ymm4,%k4
340	vpcmpuq	$0,%ymm19,%ymm4,%k5
341	kmovb	%k1,%r9d
342	kmovb	%k2,%r8d
343	kmovb	%k3,%ebx
344	kmovb	%k4,%ecx
345	kmovb	%k5,%edx
346
347
348
	/* pack the 4-bit overflow masks two per byte: %r14b, %r12b, %r10b */
349	shlb	$4,%r13b
350	orb	%r13b,%r14b
351	shlb	$4,%r11b
352	orb	%r11b,%r12b
353
	/* shift the 20-bit overflow mask left by one digit (carry chained across the bytes) */
354	addb	%r14b,%r14b
355	adcb	%r12b,%r12b
356	adcb	%r10b,%r10b
357
	/* pack the equal masks the same way: %r9b, %bl, %dl */
358	shlb	$4,%r8b
359	orb	%r8b,%r9b
360	shlb	$4,%cl
361	orb	%cl,%bl
362
	/* adding the equal mask lets a carry ripple through runs of all-ones digits */
363	addb	%r9b,%r14b
364	adcb	%bl,%r12b
365	adcb	%dl,%r10b
366
	/* xor with the equal mask leaves a 1 for every digit that needs fixing */
367	xorb	%r9b,%r14b
368	xorb	%bl,%r12b
369	xorb	%dl,%r10b
370
	/* unpack into one k-mask per accumulator register */
371	kmovb	%r14d,%k1
372	shrb	$4,%r14b
373	kmovb	%r14d,%k2
374	kmovb	%r12d,%k3
375	shrb	$4,%r12b
376	kmovb	%r12d,%k4
377	kmovb	%r10d,%k5
378
379
	/* in the selected lanes subtract 2^52-1 ... */
380	vpsubq	%ymm4,%ymm1,%ymm1{%k1}
381	vpsubq	%ymm4,%ymm16,%ymm16{%k2}
382	vpsubq	%ymm4,%ymm17,%ymm17{%k3}
383	vpsubq	%ymm4,%ymm18,%ymm18{%k4}
384	vpsubq	%ymm4,%ymm19,%ymm19{%k5}
385
	/* ... then mask every digit back to 52 bits */
386	vpandq	%ymm4,%ymm1,%ymm1
387	vpandq	%ymm4,%ymm16,%ymm16
388	vpandq	%ymm4,%ymm17,%ymm17
389	vpandq	%ymm4,%ymm18,%ymm18
390	vpandq	%ymm4,%ymm19,%ymm19
391
	/* store the 20 result digits */
392	vmovdqu64	%ymm1,(%rdi)
393	vmovdqu64	%ymm16,32(%rdi)
394	vmovdqu64	%ymm17,64(%rdi)
395	vmovdqu64	%ymm18,96(%rdi)
396	vmovdqu64	%ymm19,128(%rdi)
397
398	vzeroupper
	/* reload the six saved registers from the stack, then release the 48 bytes */
399	movq	0(%rsp),%r15
400.cfi_restore	%r15
401	movq	8(%rsp),%r14
402.cfi_restore	%r14
403	movq	16(%rsp),%r13
404.cfi_restore	%r13
405	movq	24(%rsp),%r12
406.cfi_restore	%r12
407	movq	32(%rsp),%rbp
408.cfi_restore	%rbp
409	movq	40(%rsp),%rbx
410.cfi_restore	%rbx
411	leaq	48(%rsp),%rsp
412.cfi_adjust_cfa_offset	-48
413.Lrsaz_amm52x20_x1_256_epilogue:
	/* bytes F3 C3 encode "rep ret" */
414	.byte	0xf3,0xc3
415.cfi_endproc
416.size	ossl_rsaz_amm52x20_x1_256, .-ossl_rsaz_amm52x20_x1_256
/*
 * .Lmask52x4: four qwords of 0xfffffffffffff (2^52 - 1), one per 64-bit lane
 * of a 256-bit register.  Placed in .data and 32-byte aligned; both
 * ossl_rsaz_amm52x20_* routines load it with the aligned vmovdqa64.
 * The trailing .text switches back to the code section.
 */
417.data
418.align	32
419.Lmask52x4:
420.quad	0xfffffffffffff
421.quad	0xfffffffffffff
422.quad	0xfffffffffffff
423.quad	0xfffffffffffff
424.text
425
/*
 * ossl_rsaz_amm52x20_x2_256: two independent 20-digit multiply-and-reduce
 * operations in radix 2^52, interleaved in one loop.  The first uses the
 * data at offset 0 of each operand, the second the data at offset 160.
 * NOTE(review): the per-digit structure looks like word-serial Montgomery
 * multiplication ("amm" presumably = Almost Montgomery Multiplication) -
 * the file itself does not say so, confirm with callers.
 *
 * Register roles on entry, as used by the code below:
 *   %rdi  destination; 10 x 32 bytes are stored (first result at 0,
 *         second result at 160)
 *   %rsi  first operands, 2 x 20 qwords (at 0 and at 160)
 *   %rdx  second operands, 2 x 20 qwords, one qword of each read per
 *         iteration (at 0 and at 160 from the moving pointer)
 *   %rcx  third operands, 2 x 20 qwords (presumably the two moduli -
 *         TODO confirm)
 *   %r8   pointer to two 64-bit constants, read at (%r8) and 8(%r8)
 * Out:  nothing in registers; results, each digit masked to 52 bits, are
 *       written to (%rdi).
 * Saved and restored: %rbx, %rbp, %r12-%r15.
 * Clobbers: %rax, %rcx, %rdx, %r8-%r11, %ymm0-%ymm4, %ymm16-%ymm28,
 *           %k1-%k5, flags.
 */
426.globl	ossl_rsaz_amm52x20_x2_256
427.type	ossl_rsaz_amm52x20_x2_256,@function
428.align	32
429ossl_rsaz_amm52x20_x2_256:
430.cfi_startproc
/* bytes F3 0F 1E FA encode endbr64 */
431.byte	243,15,30,250
432	pushq	%rbx
433.cfi_adjust_cfa_offset	8
434.cfi_offset	%rbx,-16
435	pushq	%rbp
436.cfi_adjust_cfa_offset	8
437.cfi_offset	%rbp,-24
438	pushq	%r12
439.cfi_adjust_cfa_offset	8
440.cfi_offset	%r12,-32
441	pushq	%r13
442.cfi_adjust_cfa_offset	8
443.cfi_offset	%r13,-40
444	pushq	%r14
445.cfi_adjust_cfa_offset	8
446.cfi_offset	%r14,-48
447	pushq	%r15
448.cfi_adjust_cfa_offset	8
449.cfi_offset	%r15,-56
450.Lrsaz_amm52x20_x2_256_body:
451
452
	/* %ymm0 stays zero; %ymm1,%ymm16-%ymm19 = first accumulator, */
	/* %ymm2,%ymm20-%ymm23 = second accumulator */
453	vpxord	%ymm0,%ymm0,%ymm0
454	vmovdqa64	%ymm0,%ymm1
455	vmovdqa64	%ymm0,%ymm16
456	vmovdqa64	%ymm0,%ymm17
457	vmovdqa64	%ymm0,%ymm18
458	vmovdqa64	%ymm0,%ymm19
459	vmovdqa64	%ymm0,%ymm2
460	vmovdqa64	%ymm0,%ymm20
461	vmovdqa64	%ymm0,%ymm21
462	vmovdqa64	%ymm0,%ymm22
463	vmovdqa64	%ymm0,%ymm23
464
	/* scalar accumulators for the lowest digit: %r9 (first), %r15 (second) */
465	xorl	%r9d,%r9d
466	xorl	%r15d,%r15d
467
	/* %rdx is needed by mulx, so the second-operand pointer moves to %r11 */
468	movq	%rdx,%r11
	/* %rax = 2^52 - 1, the digit mask */
469	movq	$0xfffffffffffff,%rax
470
	/* one digit of each multiplication per iteration, 20 iterations */
471	movl	$20,%ebx
472
473.align	32
474.Lloop20:
	/* ---- first multiplication: digit at 0(%r11) ---- */
475	movq	0(%r11),%r13
476
	/* %ymm3 = digit broadcast to all lanes */
477	vpbroadcastq	%r13,%ymm3
	/* %r12:%r13 = digit times the qword at 0(%rsi); add into %r10:%r9 */
478	movq	0(%rsi),%rdx
479	mulxq	%r13,%r13,%r12
480	addq	%r13,%r9
481	movq	%r12,%r10
482	adcq	$0,%r10
483
	/* factor = low 52 bits of (qword at (%r8) times %r9) */
484	movq	(%r8),%r13
485	imulq	%r9,%r13
486	andq	%rax,%r13
487
	/* %ymm4 = factor broadcast; add factor times the qword at 0(%rcx) */
488	vpbroadcastq	%r13,%ymm4
489	movq	0(%rcx),%rdx
490	mulxq	%r13,%r13,%r12
491	addq	%r13,%r9
492	adcq	%r12,%r10
493
	/* %r9 = (%r10:%r9) >> 52 */
494	shrq	$52,%r9
495	salq	$12,%r10
496	orq	%r10,%r9
497
	/* accumulate the low 52-bit product halves */
498	vpmadd52luq	0(%rsi),%ymm3,%ymm1
499	vpmadd52luq	32(%rsi),%ymm3,%ymm16
500	vpmadd52luq	64(%rsi),%ymm3,%ymm17
501	vpmadd52luq	96(%rsi),%ymm3,%ymm18
502	vpmadd52luq	128(%rsi),%ymm3,%ymm19
503
504	vpmadd52luq	0(%rcx),%ymm4,%ymm1
505	vpmadd52luq	32(%rcx),%ymm4,%ymm16
506	vpmadd52luq	64(%rcx),%ymm4,%ymm17
507	vpmadd52luq	96(%rcx),%ymm4,%ymm18
508	vpmadd52luq	128(%rcx),%ymm4,%ymm19
509
510
	/* shift the accumulator down by one qword; zero enters at the top */
511	valignq	$1,%ymm1,%ymm16,%ymm1
512	valignq	$1,%ymm16,%ymm17,%ymm16
513	valignq	$1,%ymm17,%ymm18,%ymm17
514	valignq	$1,%ymm18,%ymm19,%ymm18
515	valignq	$1,%ymm19,%ymm0,%ymm19
516
	/* fold the new lowest qword into the scalar accumulator */
517	vmovq	%xmm1,%r13
518	addq	%r13,%r9
519
	/* accumulate the high product halves into the shifted accumulator */
520	vpmadd52huq	0(%rsi),%ymm3,%ymm1
521	vpmadd52huq	32(%rsi),%ymm3,%ymm16
522	vpmadd52huq	64(%rsi),%ymm3,%ymm17
523	vpmadd52huq	96(%rsi),%ymm3,%ymm18
524	vpmadd52huq	128(%rsi),%ymm3,%ymm19
525
526	vpmadd52huq	0(%rcx),%ymm4,%ymm1
527	vpmadd52huq	32(%rcx),%ymm4,%ymm16
528	vpmadd52huq	64(%rcx),%ymm4,%ymm17
529	vpmadd52huq	96(%rcx),%ymm4,%ymm18
530	vpmadd52huq	128(%rcx),%ymm4,%ymm19
	/* ---- second multiplication: same sequence on the data at offset 160, */
	/* ---- scalar accumulator %r15, constant at 8(%r8) ---- */
531	movq	160(%r11),%r13
532
533	vpbroadcastq	%r13,%ymm3
534	movq	160(%rsi),%rdx
535	mulxq	%r13,%r13,%r12
536	addq	%r13,%r15
537	movq	%r12,%r10
538	adcq	$0,%r10
539
540	movq	8(%r8),%r13
541	imulq	%r15,%r13
542	andq	%rax,%r13
543
544	vpbroadcastq	%r13,%ymm4
545	movq	160(%rcx),%rdx
546	mulxq	%r13,%r13,%r12
547	addq	%r13,%r15
548	adcq	%r12,%r10
549
550	shrq	$52,%r15
551	salq	$12,%r10
552	orq	%r10,%r15
553
554	vpmadd52luq	160(%rsi),%ymm3,%ymm2
555	vpmadd52luq	192(%rsi),%ymm3,%ymm20
556	vpmadd52luq	224(%rsi),%ymm3,%ymm21
557	vpmadd52luq	256(%rsi),%ymm3,%ymm22
558	vpmadd52luq	288(%rsi),%ymm3,%ymm23
559
560	vpmadd52luq	160(%rcx),%ymm4,%ymm2
561	vpmadd52luq	192(%rcx),%ymm4,%ymm20
562	vpmadd52luq	224(%rcx),%ymm4,%ymm21
563	vpmadd52luq	256(%rcx),%ymm4,%ymm22
564	vpmadd52luq	288(%rcx),%ymm4,%ymm23
565
566
567	valignq	$1,%ymm2,%ymm20,%ymm2
568	valignq	$1,%ymm20,%ymm21,%ymm20
569	valignq	$1,%ymm21,%ymm22,%ymm21
570	valignq	$1,%ymm22,%ymm23,%ymm22
571	valignq	$1,%ymm23,%ymm0,%ymm23
572
573	vmovq	%xmm2,%r13
574	addq	%r13,%r15
575
576	vpmadd52huq	160(%rsi),%ymm3,%ymm2
577	vpmadd52huq	192(%rsi),%ymm3,%ymm20
578	vpmadd52huq	224(%rsi),%ymm3,%ymm21
579	vpmadd52huq	256(%rsi),%ymm3,%ymm22
580	vpmadd52huq	288(%rsi),%ymm3,%ymm23
581
582	vpmadd52huq	160(%rcx),%ymm4,%ymm2
583	vpmadd52huq	192(%rcx),%ymm4,%ymm20
584	vpmadd52huq	224(%rcx),%ymm4,%ymm21
585	vpmadd52huq	256(%rcx),%ymm4,%ymm22
586	vpmadd52huq	288(%rcx),%ymm4,%ymm23
	/* advance to the next digit of both second operands and loop */
587	leaq	8(%r11),%r11
588	decl	%ebx
589	jne	.Lloop20
590
	/* %ymm4 now holds the digit mask 2^52-1 in every lane */
591	vmovdqa64	.Lmask52x4(%rip),%ymm4
592
	/* ==== normalize the first result ==== */
	/* put the scalar accumulator %r9 back as the lowest qword of %ymm1 */
593	vpbroadcastq	%r9,%ymm3
594	vpblendd	$3,%ymm3,%ymm1,%ymm1
595
596
597
	/* carries = bits 52 and up of every digit */
598	vpsrlq	$52,%ymm1,%ymm24
599	vpsrlq	$52,%ymm16,%ymm25
600	vpsrlq	$52,%ymm17,%ymm26
601	vpsrlq	$52,%ymm18,%ymm27
602	vpsrlq	$52,%ymm19,%ymm28
603
604
	/* move each carry up one digit position; zero enters at the bottom */
605	valignq	$3,%ymm27,%ymm28,%ymm28
606	valignq	$3,%ymm26,%ymm27,%ymm27
607	valignq	$3,%ymm25,%ymm26,%ymm26
608	valignq	$3,%ymm24,%ymm25,%ymm25
609	valignq	$3,%ymm0,%ymm24,%ymm24
610
611
	/* keep the low 52 bits of every digit ... */
612	vpandq	%ymm4,%ymm1,%ymm1
613	vpandq	%ymm4,%ymm16,%ymm16
614	vpandq	%ymm4,%ymm17,%ymm17
615	vpandq	%ymm4,%ymm18,%ymm18
616	vpandq	%ymm4,%ymm19,%ymm19
617
618
	/* ... and add the carry coming from the digit below */
619	vpaddq	%ymm24,%ymm1,%ymm1
620	vpaddq	%ymm25,%ymm16,%ymm16
621	vpaddq	%ymm26,%ymm17,%ymm17
622	vpaddq	%ymm27,%ymm18,%ymm18
623	vpaddq	%ymm28,%ymm19,%ymm19
624
625
626
	/* predicate 1 (less-than): bit set where mask < digit, i.e. digit overflows 52 bits */
627	vpcmpuq	$1,%ymm1,%ymm4,%k1
628	vpcmpuq	$1,%ymm16,%ymm4,%k2
629	vpcmpuq	$1,%ymm17,%ymm4,%k3
630	vpcmpuq	$1,%ymm18,%ymm4,%k4
631	vpcmpuq	$1,%ymm19,%ymm4,%k5
632	kmovb	%k1,%r14d
633	kmovb	%k2,%r13d
634	kmovb	%k3,%r12d
635	kmovb	%k4,%r11d
636	kmovb	%k5,%r10d
637
638
	/* predicate 0 (equal): bit set where digit == mask, i.e. an incoming carry would ripple through */
639	vpcmpuq	$0,%ymm1,%ymm4,%k1
640	vpcmpuq	$0,%ymm16,%ymm4,%k2
641	vpcmpuq	$0,%ymm17,%ymm4,%k3
642	vpcmpuq	$0,%ymm18,%ymm4,%k4
643	vpcmpuq	$0,%ymm19,%ymm4,%k5
644	kmovb	%k1,%r9d
645	kmovb	%k2,%r8d
646	kmovb	%k3,%ebx
647	kmovb	%k4,%ecx
648	kmovb	%k5,%edx
649
650
651
	/* pack the 4-bit overflow masks two per byte: %r14b, %r12b, %r10b */
652	shlb	$4,%r13b
653	orb	%r13b,%r14b
654	shlb	$4,%r11b
655	orb	%r11b,%r12b
656
	/* shift the 20-bit overflow mask left by one digit (carry chained across the bytes) */
657	addb	%r14b,%r14b
658	adcb	%r12b,%r12b
659	adcb	%r10b,%r10b
660
	/* pack the equal masks the same way: %r9b, %bl, %dl */
661	shlb	$4,%r8b
662	orb	%r8b,%r9b
663	shlb	$4,%cl
664	orb	%cl,%bl
665
	/* adding the equal mask lets a carry ripple through runs of all-ones digits */
666	addb	%r9b,%r14b
667	adcb	%bl,%r12b
668	adcb	%dl,%r10b
669
	/* xor with the equal mask leaves a 1 for every digit that needs fixing */
670	xorb	%r9b,%r14b
671	xorb	%bl,%r12b
672	xorb	%dl,%r10b
673
	/* unpack into one k-mask per accumulator register */
674	kmovb	%r14d,%k1
675	shrb	$4,%r14b
676	kmovb	%r14d,%k2
677	kmovb	%r12d,%k3
678	shrb	$4,%r12b
679	kmovb	%r12d,%k4
680	kmovb	%r10d,%k5
681
682
	/* in the selected lanes subtract 2^52-1 ... */
683	vpsubq	%ymm4,%ymm1,%ymm1{%k1}
684	vpsubq	%ymm4,%ymm16,%ymm16{%k2}
685	vpsubq	%ymm4,%ymm17,%ymm17{%k3}
686	vpsubq	%ymm4,%ymm18,%ymm18{%k4}
687	vpsubq	%ymm4,%ymm19,%ymm19{%k5}
688
	/* ... then mask every digit back to 52 bits */
689	vpandq	%ymm4,%ymm1,%ymm1
690	vpandq	%ymm4,%ymm16,%ymm16
691	vpandq	%ymm4,%ymm17,%ymm17
692	vpandq	%ymm4,%ymm18,%ymm18
693	vpandq	%ymm4,%ymm19,%ymm19
694
	/* ==== normalize the second result: same sequence on %ymm2,%ymm20-%ymm23 ==== */
	/* put the scalar accumulator %r15 back as the lowest qword of %ymm2 */
695	vpbroadcastq	%r15,%ymm3
696	vpblendd	$3,%ymm3,%ymm2,%ymm2
697
698
699
700	vpsrlq	$52,%ymm2,%ymm24
701	vpsrlq	$52,%ymm20,%ymm25
702	vpsrlq	$52,%ymm21,%ymm26
703	vpsrlq	$52,%ymm22,%ymm27
704	vpsrlq	$52,%ymm23,%ymm28
705
706
707	valignq	$3,%ymm27,%ymm28,%ymm28
708	valignq	$3,%ymm26,%ymm27,%ymm27
709	valignq	$3,%ymm25,%ymm26,%ymm26
710	valignq	$3,%ymm24,%ymm25,%ymm25
711	valignq	$3,%ymm0,%ymm24,%ymm24
712
713
714	vpandq	%ymm4,%ymm2,%ymm2
715	vpandq	%ymm4,%ymm20,%ymm20
716	vpandq	%ymm4,%ymm21,%ymm21
717	vpandq	%ymm4,%ymm22,%ymm22
718	vpandq	%ymm4,%ymm23,%ymm23
719
720
721	vpaddq	%ymm24,%ymm2,%ymm2
722	vpaddq	%ymm25,%ymm20,%ymm20
723	vpaddq	%ymm26,%ymm21,%ymm21
724	vpaddq	%ymm27,%ymm22,%ymm22
725	vpaddq	%ymm28,%ymm23,%ymm23
726
727
728
729	vpcmpuq	$1,%ymm2,%ymm4,%k1
730	vpcmpuq	$1,%ymm20,%ymm4,%k2
731	vpcmpuq	$1,%ymm21,%ymm4,%k3
732	vpcmpuq	$1,%ymm22,%ymm4,%k4
733	vpcmpuq	$1,%ymm23,%ymm4,%k5
734	kmovb	%k1,%r14d
735	kmovb	%k2,%r13d
736	kmovb	%k3,%r12d
737	kmovb	%k4,%r11d
738	kmovb	%k5,%r10d
739
740
741	vpcmpuq	$0,%ymm2,%ymm4,%k1
742	vpcmpuq	$0,%ymm20,%ymm4,%k2
743	vpcmpuq	$0,%ymm21,%ymm4,%k3
744	vpcmpuq	$0,%ymm22,%ymm4,%k4
745	vpcmpuq	$0,%ymm23,%ymm4,%k5
746	kmovb	%k1,%r9d
747	kmovb	%k2,%r8d
748	kmovb	%k3,%ebx
749	kmovb	%k4,%ecx
750	kmovb	%k5,%edx
751
752
753
754	shlb	$4,%r13b
755	orb	%r13b,%r14b
756	shlb	$4,%r11b
757	orb	%r11b,%r12b
758
759	addb	%r14b,%r14b
760	adcb	%r12b,%r12b
761	adcb	%r10b,%r10b
762
763	shlb	$4,%r8b
764	orb	%r8b,%r9b
765	shlb	$4,%cl
766	orb	%cl,%bl
767
768	addb	%r9b,%r14b
769	adcb	%bl,%r12b
770	adcb	%dl,%r10b
771
772	xorb	%r9b,%r14b
773	xorb	%bl,%r12b
774	xorb	%dl,%r10b
775
776	kmovb	%r14d,%k1
777	shrb	$4,%r14b
778	kmovb	%r14d,%k2
779	kmovb	%r12d,%k3
780	shrb	$4,%r12b
781	kmovb	%r12d,%k4
782	kmovb	%r10d,%k5
783
784
785	vpsubq	%ymm4,%ymm2,%ymm2{%k1}
786	vpsubq	%ymm4,%ymm20,%ymm20{%k2}
787	vpsubq	%ymm4,%ymm21,%ymm21{%k3}
788	vpsubq	%ymm4,%ymm22,%ymm22{%k4}
789	vpsubq	%ymm4,%ymm23,%ymm23{%k5}
790
791	vpandq	%ymm4,%ymm2,%ymm2
792	vpandq	%ymm4,%ymm20,%ymm20
793	vpandq	%ymm4,%ymm21,%ymm21
794	vpandq	%ymm4,%ymm22,%ymm22
795	vpandq	%ymm4,%ymm23,%ymm23
796
	/* store the first result at offset 0 ... */
797	vmovdqu64	%ymm1,(%rdi)
798	vmovdqu64	%ymm16,32(%rdi)
799	vmovdqu64	%ymm17,64(%rdi)
800	vmovdqu64	%ymm18,96(%rdi)
801	vmovdqu64	%ymm19,128(%rdi)
802
	/* ... and the second result at offset 160 */
803	vmovdqu64	%ymm2,160(%rdi)
804	vmovdqu64	%ymm20,192(%rdi)
805	vmovdqu64	%ymm21,224(%rdi)
806	vmovdqu64	%ymm22,256(%rdi)
807	vmovdqu64	%ymm23,288(%rdi)
808
809	vzeroupper
	/* reload the six saved registers from the stack, then release the 48 bytes */
810	movq	0(%rsp),%r15
811.cfi_restore	%r15
812	movq	8(%rsp),%r14
813.cfi_restore	%r14
814	movq	16(%rsp),%r13
815.cfi_restore	%r13
816	movq	24(%rsp),%r12
817.cfi_restore	%r12
818	movq	32(%rsp),%rbp
819.cfi_restore	%rbp
820	movq	40(%rsp),%rbx
821.cfi_restore	%rbx
822	leaq	48(%rsp),%rsp
823.cfi_adjust_cfa_offset	-48
824.Lrsaz_amm52x20_x2_256_epilogue:
	/* bytes F3 C3 encode "rep ret" */
825	.byte	0xf3,0xc3
826.cfi_endproc
827.size	ossl_rsaz_amm52x20_x2_256, .-ossl_rsaz_amm52x20_x2_256
828.text
829
/*
 * ossl_extract_multiplier_2x20_win5: copy one 160-byte (20-qword) entry out
 * of a 32-entry table whose entries are 320 bytes apart.
 *
 * Register roles on entry, as used by the code below:
 *   %rdi  destination, 5 x 32 bytes are stored
 *   %rsi  table base
 *   %rdx  index of the wanted entry, compared against a counter that runs
 *         from 0 to 31
 *   %rcx  multiplied by 160 and added to the table base, so it selects
 *         which 160-byte part of each 320-byte entry is read
 *         (presumably 0 or 1 - TODO confirm with caller)
 * Out:  nothing in registers; the selected data is written to (%rdi).  If
 *       no counter value matches %rdx the stored data is all zero.
 * Clobbers: %rax, %rsi, %ymm0-%ymm4, %ymm16-%ymm23, %k1, flags.
 *
 * The loop loads every one of the 32 entries regardless of the index and
 * keeps the matching one through a mask blend, so the addresses accessed
 * do not depend on %rdx.
 * NOTE(review): unlike the two ossl_rsaz_amm52x20_* routines in this file,
 * no vzeroupper is issued before the return.
 */
830.align	32
831.globl	ossl_extract_multiplier_2x20_win5
832.type	ossl_extract_multiplier_2x20_win5,@function
833ossl_extract_multiplier_2x20_win5:
834.cfi_startproc
/* bytes F3 0F 1E FA encode endbr64 */
835.byte	243,15,30,250
	/* %rsi += %rcx times 160, computed as (%rcx times 5) shifted left by 5 */
836	leaq	(%rcx,%rcx,4),%rax
837	salq	$5,%rax
838	addq	%rax,%rsi
839
	/* %ymm23 = 1 in every lane, %ymm22 = wanted index in every lane */
840	vmovdqa64	.Lones(%rip),%ymm23
841	vpbroadcastq	%rdx,%ymm22
	/* end pointer: 32 entries times 320 bytes = 10240 */
842	leaq	10240(%rsi),%rax
843
	/* result registers %ymm0-%ymm4 and the running counter %ymm21 start at zero */
844	vpxor	%xmm4,%xmm4,%xmm4
845	vmovdqa64	%ymm4,%ymm3
846	vmovdqa64	%ymm4,%ymm2
847	vmovdqa64	%ymm4,%ymm1
848	vmovdqa64	%ymm4,%ymm0
849	vmovdqa64	%ymm4,%ymm21
850
851.align	32
852.Lloop:
	/* %k1 = per-lane (wanted index == counter); predicate 0 is "equal" */
853	vpcmpq	$0,%ymm21,%ymm22,%k1
	/* advance first, then address the current entry with negative offsets */
854	addq	$320,%rsi
855	vpaddq	%ymm23,%ymm21,%ymm21
856	vmovdqu64	-320(%rsi),%ymm16
857	vmovdqu64	-288(%rsi),%ymm17
858	vmovdqu64	-256(%rsi),%ymm18
859	vmovdqu64	-224(%rsi),%ymm19
860	vmovdqu64	-192(%rsi),%ymm20
	/* where %k1 is set take the loaded entry, otherwise keep the result so far */
861	vpblendmq	%ymm16,%ymm0,%ymm0{%k1}
862	vpblendmq	%ymm17,%ymm1,%ymm1{%k1}
863	vpblendmq	%ymm18,%ymm2,%ymm2{%k1}
864	vpblendmq	%ymm19,%ymm3,%ymm3{%k1}
865	vpblendmq	%ymm20,%ymm4,%ymm4{%k1}
866	cmpq	%rsi,%rax
867	jne	.Lloop
868
	/* store the selected 20 qwords */
869	vmovdqu64	%ymm0,(%rdi)
870	vmovdqu64	%ymm1,32(%rdi)
871	vmovdqu64	%ymm2,64(%rdi)
872	vmovdqu64	%ymm3,96(%rdi)
873	vmovdqu64	%ymm4,128(%rdi)
874
	/* bytes F3 C3 encode "rep ret" */
875	.byte	0xf3,0xc3
876.cfi_endproc
877.size	ossl_extract_multiplier_2x20_win5, .-ossl_extract_multiplier_2x20_win5
/*
 * .Lones: four qwords of 1, one per 64-bit lane of a 256-bit register.
 * Placed in .data and 32-byte aligned; ossl_extract_multiplier_2x20_win5
 * loads it with the aligned vmovdqa64 and uses it as the counter increment.
 */
878.data
879.align	32
880.Lones:
881.quad	1,1,1,1
/*
 * ELF note in section .note.gnu.property, laid out with numeric local labels:
 *   0: .. 1:  the name field (4 bytes, "GNU" plus a NUL)
 *   1: .. 4:  the descriptor
 *   2: .. 3:  the 4-byte payload of the single property in the descriptor
 * NOTE(review): note type 5 is presumably NT_GNU_PROPERTY_TYPE_0, property
 * 0xc0000002 presumably GNU_PROPERTY_X86_FEATURE_1_AND and the value 3
 * presumably IBT | SHSTK - confirm against the x86-64 psABI.
 */
882	.section ".note.gnu.property", "a"
883	.p2align 3
	/* name size, descriptor size, note type */
884	.long 1f - 0f
885	.long 4f - 1f
886	.long 5
8870:
888	# "GNU" encoded with .byte, since .asciz isn't supported
889	# on Solaris.
890	.byte 0x47
891	.byte 0x4e
892	.byte 0x55
893	.byte 0
8941:
895	.p2align 3
	/* property type, then the size of its data */
896	.long 0xc0000002
897	.long 3f - 2f
8982:
	/* property data */
899	.long 3
9003:
	/* pad the descriptor to an 8-byte boundary */
901	.p2align 3
9024:
903