.section	".text",#alloc,#execinstr

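! bn_mul_mont_fpu(rp, ap, bp, np, n0, num) -- Montgomery multiplication
! on the UltraSPARC FPU: rp = ap*bp*2^(-32*num) mod np.  Register
! assignment, as inferred from the code below: %i0=rp, %i1=ap, %i2=bp,
! %i3=np, %i4=&n0, %i5=num (count of 32-bit words; must be even and
! >=4, else the routine returns 0; it returns 1 on success).
!
! A minimal C reference model of the word-serial algorithm, working on
! 64-bit words after num is halved below (names are illustrative only,
! not part of this source):
!
!	for (i = 0; i < num/2; i++) {		/* one .Louter pass   */
!		m = (tp[0] + ap[0]*bp[i]) * n0;	/* mod 2^64           */
!		for (j = 0; j < num/2; j++)	/* .L1st / .Linner    */
!			tp[j] += ap[j]*bp[i] + np[j]*m; /* with carries */
!		/* drop tp[0], which is 0 by choice of m: divide by 2^64 */
!	}
!	if (tp >= np) tp -= np;			/* .Lsub / .Lcopy     */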
.global bn_mul_mont_fpu
.align  32
bn_mul_mont_fpu:
	save	%sp,-192-64,%sp

	cmp	%i5,4
	bl,a,pn %icc,.Lret
	clr	%i0
	andcc	%i5,1,%g0		! %i5 has to be even...
	bnz,a,pn %icc,.Lret
	clr	%i0			! signal "unsupported input value"

	srl	%i5,1,%i5
	sethi	%hi(0xffff),%l7
	ld	[%i4+0],%g4		! %g4 reassigned, remember?
	or	%l7,%lo(0xffff),%l7
	ld	[%i4+4],%o0
	sllx	%o0,32,%o0
	or	%o0,%g4,%g4		! %g4=n0[1].n0[0]

	sll	%i5,3,%i5		! num*=8

	add	%sp,2047,%o0		! real top of stack
	sll	%i5,2,%o1
	add	%o1,%i5,%o1		! %o1=num*5
	sub	%o0,%o1,%o0
	and	%o0,-2048,%o0		! optimize TLB utilization
	sub	%o0,2047,%sp		! alloca(5*num*8)

	rd	%asi,%o7		! save %asi
	add	%sp,2047+192+64,%l0
	add	%l0,%i5,%l1
	add	%l1,%i5,%l1	! [an]p_[lh] point at the vectors' ends !
	add	%l1,%i5,%l2
	add	%l2,%i5,%l3
	add	%l3,%i5,%l4
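! Scratch layout, a sketch read off the code above: the alloca'd block
! holds five num*8-byte vectors.  %l0 points at tp, while %l1..%l4
! point at the *ends* of ap_l, ap_h, np_l and np_h -- the 32-bit halves
! of ap[] and np[] saved in double format -- so they can be indexed by
! the negative counter %l6.  ASI 0xd2 (210) written below should be the
! 16-bit FP load ASI (ASI_FL16_P): each ldda [addr]%asi,%fN fetches a
! single halfword zero-extended into a 64-bit FP register, which fxtod
! then converts to double exactly.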

	wr	%g0,210,%asi	! setup %asi for 16-bit FP loads

	add	%i0,%i5,%i0		! readjust input pointers to point
	add	%i1,%i5,%i1		! at the ends too...
	add	%i2,%i5,%i2
	add	%i3,%i5,%i3

	stx	%o7,[%sp+2047+192+48]	! save %asi

	sub	%g0,%i5,%l5		! i=-num
	sub	%g0,%i5,%l6		! j=-num

	add	%i1,%l6,%o3
	add	%i2,%l5,%o4
	ld	[%o3+4],%g1		! ap[0]
	ld	[%o3+0],%o0
	ld	[%o4+4],%g5		! bp[0]
	sllx	%g1,32,%g1
	ld	[%o4+0],%o1
	sllx	%g5,32,%g5
	or	%g1,%o0,%o0
	or	%g5,%o1,%o1

	add	%i3,%l6,%o5

	mulx	%o1,%o0,%o0		! ap[0]*bp[0]
	mulx	%g4,%o0,%o0		! ap[0]*bp[0]*n0
	stx	%o0,[%sp+2047+192+0]

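! m0 = ap[0]*bp[0]*n0 was just spilled to the scratch slot so that the
! 16-bit FP loads further down can slice it into four limbs.  The
! ld/.word pairs below are a trick worth noting: .word 0xa1b00c20 etc.
! encode VIS "fzeros" (presumably hand-assembled for tools without VIS
! support), zeroing the even half of an FP pair whose odd half then
! receives a 32-bit word -- the pair thus holds a zero-extended 64-bit
! integer that fxtod converts to double without rounding.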
	ld	[%o3+0],%f17	! load a[j] as pair of 32-bit words
	.word	0xa1b00c20	! fzeros %f16
	ld	[%o3+4],%f19
	.word	0xa5b00c20	! fzeros %f18
	ld	[%o5+0],%f21	! load n[j] as pair of 32-bit words
	.word	0xa9b00c20	! fzeros %f20
	ld	[%o5+4],%f23
	.word	0xadb00c20	! fzeros %f22

	! transfer b[i] to FPU as 4x16-bit values
	ldda	[%o4+2]%asi,%f0
	fxtod	%f16,%f16
	ldda	[%o4+0]%asi,%f2
	fxtod	%f18,%f18
	ldda	[%o4+6]%asi,%f4
	fxtod	%f20,%f20
	ldda	[%o4+4]%asi,%f6
	fxtod	%f22,%f22

	! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
	ldda	[%sp+2047+192+6]%asi,%f8
	fxtod	%f0,%f0
	ldda	[%sp+2047+192+4]%asi,%f10
	fxtod	%f2,%f2
	ldda	[%sp+2047+192+2]%asi,%f12
	fxtod	%f4,%f4
	ldda	[%sp+2047+192+0]%asi,%f14
	fxtod	%f6,%f6

	std	%f16,[%l1+%l6]		! save smashed ap[j] in double format
	fxtod	%f8,%f8
	std	%f18,[%l2+%l6]
	fxtod	%f10,%f10
	std	%f20,[%l3+%l6]		! save smashed np[j] in double format
	fxtod	%f12,%f12
	std	%f22,[%l4+%l6]
	fxtod	%f14,%f14

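! The multiply-accumulate below should be exact in double precision:
! each fmuld multiplies a 32-bit half of a[j]/n[j] by a 16-bit limb of
! b[i]/m, so products are at most 48 bits wide and the pairwise faddd
! sums stay within the 53-bit mantissa for the supported sizes.  This
! 32x16-bit split is the whole point of the limb slicing above.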
		fmuld	%f16,%f0,%f32
		fmuld	%f20,%f8,%f48
		fmuld	%f16,%f2,%f34
		fmuld	%f20,%f10,%f50
		fmuld	%f16,%f4,%f36
	faddd	%f32,%f48,%f48
		fmuld	%f20,%f12,%f52
		fmuld	%f16,%f6,%f38
	faddd	%f34,%f50,%f50
		fmuld	%f20,%f14,%f54
		fmuld	%f18,%f0,%f40
	faddd	%f36,%f52,%f52
		fmuld	%f22,%f8,%f56
		fmuld	%f18,%f2,%f42
	faddd	%f38,%f54,%f54
		fmuld	%f22,%f10,%f58
		fmuld	%f18,%f4,%f44
	faddd	%f40,%f56,%f56
		fmuld	%f22,%f12,%f60
		fmuld	%f18,%f6,%f46
	faddd	%f42,%f58,%f58
		fmuld	%f22,%f14,%f62

	faddd	%f44,%f60,%f24	! %f60
	faddd	%f46,%f62,%f26	! %f62

	faddd	%f52,%f56,%f52
	faddd	%f54,%f58,%f54

	fdtox	%f48,%f48
	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+2047+192+0]
	add	%l6,8,%l6
	std	%f50,[%sp+2047+192+8]
	add	%i1,%l6,%o4
	std	%f52,[%sp+2047+192+16]
	add	%i3,%l6,%o5
	std	%f54,[%sp+2047+192+24]

	ld	[%o4+0],%f17	! load a[j] as pair of 32-bit words
	.word	0xa1b00c20	! fzeros %f16
	ld	[%o4+4],%f19
	.word	0xa5b00c20	! fzeros %f18
	ld	[%o5+0],%f21	! load n[j] as pair of 32-bit words
	.word	0xa9b00c20	! fzeros %f20
	ld	[%o5+4],%f23
	.word	0xadb00c20	! fzeros %f22

	fxtod	%f16,%f16
	fxtod	%f18,%f18
	fxtod	%f20,%f20
	fxtod	%f22,%f22

	ldx	[%sp+2047+192+0],%o0
		fmuld	%f16,%f0,%f32
	ldx	[%sp+2047+192+8],%o1
		fmuld	%f20,%f8,%f48
	ldx	[%sp+2047+192+16],%o2
		fmuld	%f16,%f2,%f34
	ldx	[%sp+2047+192+24],%o3
		fmuld	%f20,%f10,%f50

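! Integer recombination: the four fdtox results overlap at 16-bit
! intervals, so shift-and-add first propagates the carries between
! them.  The masked merge into a 64-bit word is commented out in this
! prologue iteration because, by choice of m0, the low 64 bits of
! a[0]*b[0]+n[0]*m0 vanish -- only the carry in %g1 is needed.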
	srlx	%o0,16,%o7
	std	%f16,[%l1+%l6]		! save smashed ap[j] in double format
		fmuld	%f16,%f4,%f36
	add	%o7,%o1,%o1
	std	%f18,[%l2+%l6]
		faddd	%f32,%f48,%f48
		fmuld	%f20,%f12,%f52
	srlx	%o1,16,%o7
	std	%f20,[%l3+%l6]		! save smashed np[j] in double format
		fmuld	%f16,%f6,%f38
	add	%o7,%o2,%o2
	std	%f22,[%l4+%l6]
		faddd	%f34,%f50,%f50
		fmuld	%f20,%f14,%f54
	srlx	%o2,16,%o7
		fmuld	%f18,%f0,%f40
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
		faddd	%f36,%f52,%f52
		fmuld	%f22,%f8,%f56
	!and	%o0,%l7,%o0
	!and	%o1,%l7,%o1
	!and	%o2,%l7,%o2
	!sllx	%o1,16,%o1
	!sllx	%o2,32,%o2
	!sllx	%o3,48,%o7
	!or	%o1,%o0,%o0
	!or	%o2,%o0,%o0
	!or	%o7,%o0,%o0		! 64-bit result
	srlx	%o3,16,%g1		! 34-bit carry
		fmuld	%f18,%f2,%f42

	faddd	%f38,%f54,%f54
		fmuld	%f22,%f10,%f58
		fmuld	%f18,%f4,%f44
	faddd	%f40,%f56,%f56
		fmuld	%f22,%f12,%f60
		fmuld	%f18,%f6,%f46
	faddd	%f42,%f58,%f58
		fmuld	%f22,%f14,%f62

	faddd	%f24,%f48,%f48
	faddd	%f26,%f50,%f50
	faddd	%f44,%f60,%f24	! %f60
	faddd	%f46,%f62,%f26	! %f62

	faddd	%f52,%f56,%f52
	faddd	%f54,%f58,%f54

	fdtox	%f48,%f48
	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+2047+192+0]
	std	%f50,[%sp+2047+192+8]
	addcc	%l6,8,%l6
	std	%f52,[%sp+2047+192+16]
	bz,pn	%icc,.L1stskip
	std	%f54,[%sp+2047+192+24]

.align	32			! incidentally already aligned !
.L1st:
	add	%i1,%l6,%o4
	add	%i3,%l6,%o5
	ld	[%o4+0],%f17	! load a[j] as pair of 32-bit words
	.word	0xa1b00c20	! fzeros %f16
	ld	[%o4+4],%f19
	.word	0xa5b00c20	! fzeros %f18
	ld	[%o5+0],%f21	! load n[j] as pair of 32-bit words
	.word	0xa9b00c20	! fzeros %f20
	ld	[%o5+4],%f23
	.word	0xadb00c20	! fzeros %f22

	fxtod	%f16,%f16
	fxtod	%f18,%f18
	fxtod	%f20,%f20
	fxtod	%f22,%f22

	ldx	[%sp+2047+192+0],%o0
		fmuld	%f16,%f0,%f32
	ldx	[%sp+2047+192+8],%o1
		fmuld	%f20,%f8,%f48
	ldx	[%sp+2047+192+16],%o2
		fmuld	%f16,%f2,%f34
	ldx	[%sp+2047+192+24],%o3
		fmuld	%f20,%f10,%f50

	srlx	%o0,16,%o7
	std	%f16,[%l1+%l6]		! save smashed ap[j] in double format
		fmuld	%f16,%f4,%f36
	add	%o7,%o1,%o1
	std	%f18,[%l2+%l6]
		faddd	%f32,%f48,%f48
		fmuld	%f20,%f12,%f52
	srlx	%o1,16,%o7
	std	%f20,[%l3+%l6]		! save smashed np[j] in double format
		fmuld	%f16,%f6,%f38
	add	%o7,%o2,%o2
	std	%f22,[%l4+%l6]
		faddd	%f34,%f50,%f50
		fmuld	%f20,%f14,%f54
	srlx	%o2,16,%o7
		fmuld	%f18,%f0,%f40
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,%l7,%o0
		faddd	%f36,%f52,%f52
		fmuld	%f22,%f8,%f56
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
		fmuld	%f18,%f2,%f42
	sllx	%o1,16,%o1
		faddd	%f38,%f54,%f54
		fmuld	%f22,%f10,%f58
	sllx	%o2,32,%o2
		fmuld	%f18,%f4,%f44
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
		faddd	%f40,%f56,%f56
		fmuld	%f22,%f12,%f60
	or	%o2,%o0,%o0
		fmuld	%f18,%f6,%f46
	or	%o7,%o0,%o0		! 64-bit result
		faddd	%f42,%f58,%f58
		fmuld	%f22,%f14,%f62
	addcc	%g1,%o0,%o0
		faddd	%f24,%f48,%f48
	srlx	%o3,16,%g1		! 34-bit carry
		faddd	%f26,%f50,%f50
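	! The annulled bcs,a below targets .+8, i.e. the instruction
	! right after its delay slot, so the add in the delay slot
	! executes only when the addcc above set the carry flag --
	! in effect "add 1 to the carry word on overflow".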
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[%l0]		! tp[j-1]=

	faddd	%f44,%f60,%f24	! %f60
	faddd	%f46,%f62,%f26	! %f62

	faddd	%f52,%f56,%f52
	faddd	%f54,%f58,%f54

	fdtox	%f48,%f48
	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+2047+192+0]
	std	%f50,[%sp+2047+192+8]
	std	%f52,[%sp+2047+192+16]
	std	%f54,[%sp+2047+192+24]

	addcc	%l6,8,%l6
	bnz,pt	%icc,.L1st
	add	%l0,8,%l0

.L1stskip:
	fdtox	%f24,%f24
	fdtox	%f26,%f26

	ldx	[%sp+2047+192+0],%o0
	ldx	[%sp+2047+192+8],%o1
	ldx	[%sp+2047+192+16],%o2
	ldx	[%sp+2047+192+24],%o3

	srlx	%o0,16,%o7
	std	%f24,[%sp+2047+192+32]
	add	%o7,%o1,%o1
	std	%f26,[%sp+2047+192+40]
	srlx	%o1,16,%o7
	add	%o7,%o2,%o2
	srlx	%o2,16,%o7
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,%l7,%o0
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
	sllx	%o1,16,%o1
	sllx	%o2,32,%o2
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	or	%o2,%o0,%o0
	or	%o7,%o0,%o0		! 64-bit result
	ldx	[%sp+2047+192+32],%o4
	addcc	%g1,%o0,%o0
	ldx	[%sp+2047+192+40],%o5
	srlx	%o3,16,%g1		! 34-bit carry
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[%l0]		! tp[j-1]=
	add	%l0,8,%l0

	srlx	%o4,16,%o7
	add	%o7,%o5,%o5
	and	%o4,%l7,%o4
	sllx	%o5,16,%o7
	or	%o7,%o4,%o4
	addcc	%g1,%o4,%o4
	srlx	%o5,48,%g1
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	mov	%g1,%i4
	stx	%o4,[%l0]		! tp[num-1]=

	ba	.Louter
	add	%l5,8,%l5
.align	32
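! Each outer pass recomputes m = (tp[0]+ap[0]*bp[i])*n0 mod 2^64 and
! then runs the inner loop over the ap[]/np[] vectors cached above in
! double format, so the ld/fzeros/fxtod conversion work is paid only
! once, on the first pass.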
.Louter:
	sub	%g0,%i5,%l6		! j=-num
	add	%sp,2047+192+64,%l0

	add	%i1,%l6,%o3
	add	%i2,%l5,%o4

	ld	[%o3+4],%g1		! ap[0]
	ld	[%o3+0],%o0
	ld	[%o4+4],%g5		! bp[i]
	sllx	%g1,32,%g1
	ld	[%o4+0],%o1
	sllx	%g5,32,%g5
	or	%g1,%o0,%o0
	or	%g5,%o1,%o1

	ldx	[%l0],%o2		! tp[0]
	mulx	%o1,%o0,%o0
	addcc	%o2,%o0,%o0
	mulx	%g4,%o0,%o0		! (ap[0]*bp[i]+t[0])*n0
	stx	%o0,[%sp+2047+192+0]

	! transfer b[i] to FPU as 4x16-bit values
	ldda	[%o4+2]%asi,%f0
	ldda	[%o4+0]%asi,%f2
	ldda	[%o4+6]%asi,%f4
	ldda	[%o4+4]%asi,%f6

	! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
	ldda	[%sp+2047+192+6]%asi,%f8
	fxtod	%f0,%f0
	ldda	[%sp+2047+192+4]%asi,%f10
	fxtod	%f2,%f2
	ldda	[%sp+2047+192+2]%asi,%f12
	fxtod	%f4,%f4
	ldda	[%sp+2047+192+0]%asi,%f14
	fxtod	%f6,%f6
	ldd	[%l1+%l6],%f16		! load a[j] in double format
	fxtod	%f8,%f8
	ldd	[%l2+%l6],%f18
	fxtod	%f10,%f10
	ldd	[%l3+%l6],%f20		! load n[j] in double format
	fxtod	%f12,%f12
	ldd	[%l4+%l6],%f22
	fxtod	%f14,%f14

		fmuld	%f16,%f0,%f32
		fmuld	%f20,%f8,%f48
		fmuld	%f16,%f2,%f34
		fmuld	%f20,%f10,%f50
		fmuld	%f16,%f4,%f36
	faddd	%f32,%f48,%f48
		fmuld	%f20,%f12,%f52
		fmuld	%f16,%f6,%f38
	faddd	%f34,%f50,%f50
		fmuld	%f20,%f14,%f54
		fmuld	%f18,%f0,%f40
	faddd	%f36,%f52,%f52
		fmuld	%f22,%f8,%f56
		fmuld	%f18,%f2,%f42
	faddd	%f38,%f54,%f54
		fmuld	%f22,%f10,%f58
		fmuld	%f18,%f4,%f44
	faddd	%f40,%f56,%f56
		fmuld	%f22,%f12,%f60
		fmuld	%f18,%f6,%f46
	faddd	%f42,%f58,%f58
		fmuld	%f22,%f14,%f62

	faddd	%f44,%f60,%f24	! %f60
	faddd	%f46,%f62,%f26	! %f62

	faddd	%f52,%f56,%f52
	faddd	%f54,%f58,%f54

	fdtox	%f48,%f48
	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+2047+192+0]
	std	%f50,[%sp+2047+192+8]
	std	%f52,[%sp+2047+192+16]
	add	%l6,8,%l6
	std	%f54,[%sp+2047+192+24]

	ldd	[%l1+%l6],%f16		! load a[j] in double format
	ldd	[%l2+%l6],%f18
	ldd	[%l3+%l6],%f20		! load n[j] in double format
	ldd	[%l4+%l6],%f22

		fmuld	%f16,%f0,%f32
		fmuld	%f20,%f8,%f48
		fmuld	%f16,%f2,%f34
		fmuld	%f20,%f10,%f50
		fmuld	%f16,%f4,%f36
	ldx	[%sp+2047+192+0],%o0
		faddd	%f32,%f48,%f48
		fmuld	%f20,%f12,%f52
	ldx	[%sp+2047+192+8],%o1
		fmuld	%f16,%f6,%f38
	ldx	[%sp+2047+192+16],%o2
		faddd	%f34,%f50,%f50
		fmuld	%f20,%f14,%f54
	ldx	[%sp+2047+192+24],%o3
		fmuld	%f18,%f0,%f40

	srlx	%o0,16,%o7
		faddd	%f36,%f52,%f52
		fmuld	%f22,%f8,%f56
	add	%o7,%o1,%o1
		fmuld	%f18,%f2,%f42
	srlx	%o1,16,%o7
		faddd	%f38,%f54,%f54
		fmuld	%f22,%f10,%f58
	add	%o7,%o2,%o2
		fmuld	%f18,%f4,%f44
	srlx	%o2,16,%o7
		faddd	%f40,%f56,%f56
		fmuld	%f22,%f12,%f60
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	! recombine the low word and fold in tp[0] to extract the carry
	and	%o0,%l7,%o0
		fmuld	%f18,%f6,%f46
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
		faddd	%f42,%f58,%f58
		fmuld	%f22,%f14,%f62
	sllx	%o1,16,%o1
		faddd	%f24,%f48,%f48
	sllx	%o2,32,%o2
		faddd	%f26,%f50,%f50
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
		faddd	%f44,%f60,%f24	! %f60
	or	%o2,%o0,%o0
		faddd	%f46,%f62,%f26	! %f62
	or	%o7,%o0,%o0		! 64-bit result
	ldx	[%l0],%o7
		faddd	%f52,%f56,%f52
	addcc	%o7,%o0,%o0
	! end of low-word recombination
		faddd	%f54,%f58,%f54
	srlx	%o3,16,%g1		! 34-bit carry
		fdtox	%f48,%f48
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+2047+192+0]
	std	%f50,[%sp+2047+192+8]
	addcc	%l6,8,%l6
	std	%f52,[%sp+2047+192+16]
	bz,pn	%icc,.Linnerskip
	std	%f54,[%sp+2047+192+24]

	ba	.Linner
	nop
.align	32
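! .Linner differs from .L1st in that the running word tp[j] from the
! previous outer pass is added in as well, so there are two addcc's
! per word and the carry %g1 may be bumped twice.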
.Linner:
	ldd	[%l1+%l6],%f16		! load a[j] in double format
	ldd	[%l2+%l6],%f18
	ldd	[%l3+%l6],%f20		! load n[j] in double format
	ldd	[%l4+%l6],%f22

		fmuld	%f16,%f0,%f32
		fmuld	%f20,%f8,%f48
		fmuld	%f16,%f2,%f34
		fmuld	%f20,%f10,%f50
		fmuld	%f16,%f4,%f36
	ldx	[%sp+2047+192+0],%o0
		faddd	%f32,%f48,%f48
		fmuld	%f20,%f12,%f52
	ldx	[%sp+2047+192+8],%o1
		fmuld	%f16,%f6,%f38
	ldx	[%sp+2047+192+16],%o2
		faddd	%f34,%f50,%f50
		fmuld	%f20,%f14,%f54
	ldx	[%sp+2047+192+24],%o3
		fmuld	%f18,%f0,%f40

	srlx	%o0,16,%o7
		faddd	%f36,%f52,%f52
		fmuld	%f22,%f8,%f56
	add	%o7,%o1,%o1
		fmuld	%f18,%f2,%f42
	srlx	%o1,16,%o7
		faddd	%f38,%f54,%f54
		fmuld	%f22,%f10,%f58
	add	%o7,%o2,%o2
		fmuld	%f18,%f4,%f44
	srlx	%o2,16,%o7
		faddd	%f40,%f56,%f56
		fmuld	%f22,%f12,%f60
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,%l7,%o0
		fmuld	%f18,%f6,%f46
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
		faddd	%f42,%f58,%f58
		fmuld	%f22,%f14,%f62
	sllx	%o1,16,%o1
		faddd	%f24,%f48,%f48
	sllx	%o2,32,%o2
		faddd	%f26,%f50,%f50
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
		faddd	%f44,%f60,%f24	! %f60
	or	%o2,%o0,%o0
		faddd	%f46,%f62,%f26	! %f62
	or	%o7,%o0,%o0		! 64-bit result
		faddd	%f52,%f56,%f52
	addcc	%g1,%o0,%o0
	ldx	[%l0+8],%o7		! tp[j]
		faddd	%f54,%f58,%f54
	srlx	%o3,16,%g1		! 34-bit carry
		fdtox	%f48,%f48
	bcs,a	%xcc,.+8
	add	%g1,1,%g1
		fdtox	%f50,%f50
	addcc	%o7,%o0,%o0
		fdtox	%f52,%f52
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[%l0]		! tp[j-1]
		fdtox	%f54,%f54

	std	%f48,[%sp+2047+192+0]
	std	%f50,[%sp+2047+192+8]
	std	%f52,[%sp+2047+192+16]
	addcc	%l6,8,%l6
	std	%f54,[%sp+2047+192+24]
	bnz,pt	%icc,.Linner
	add	%l0,8,%l0

.Linnerskip:
	fdtox	%f24,%f24
	fdtox	%f26,%f26

	ldx	[%sp+2047+192+0],%o0
	ldx	[%sp+2047+192+8],%o1
	ldx	[%sp+2047+192+16],%o2
	ldx	[%sp+2047+192+24],%o3

	srlx	%o0,16,%o7
	std	%f24,[%sp+2047+192+32]
	add	%o7,%o1,%o1
	std	%f26,[%sp+2047+192+40]
	srlx	%o1,16,%o7
	add	%o7,%o2,%o2
	srlx	%o2,16,%o7
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,%l7,%o0
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
	sllx	%o1,16,%o1
	sllx	%o2,32,%o2
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	or	%o2,%o0,%o0
	ldx	[%sp+2047+192+32],%o4
	or	%o7,%o0,%o0		! 64-bit result
	ldx	[%sp+2047+192+40],%o5
	addcc	%g1,%o0,%o0
	ldx	[%l0+8],%o7		! tp[j]
	srlx	%o3,16,%g1		! 34-bit carry
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	addcc	%o7,%o0,%o0
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[%l0]		! tp[j-1]
	add	%l0,8,%l0

	srlx	%o4,16,%o7
	add	%o7,%o5,%o5
	and	%o4,%l7,%o4
	sllx	%o5,16,%o7
	or	%o7,%o4,%o4
	addcc	%g1,%o4,%o4
	srlx	%o5,48,%g1
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	addcc	%i4,%o4,%o4
	stx	%o4,[%l0]		! tp[num-1]
	mov	%g1,%i4
	bcs,a	%xcc,.+8
	add	%i4,1,%i4

	addcc	%l5,8,%l5
	bnz	%icc,.Louter
	nop

	add	%l0,8,%l0		! adjust tp to point at the end
	orn	%g0,%g0,%g4
	sub	%g0,%i5,%o7		! n=-num
	ba	.Lsub
	subcc	%g0,%g0,%g0		! clear %icc.c

.align	32
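! Final reduction: .Lsub computes rp = tp - np (32 bits at a time,
! the borrow chained through subccc), then %g4 becomes an all-ones or
! all-zeroes mask from the top carry word minus the final borrow, and
! .Lcopy selects tp (borrow set, i.e. tp < np) or the difference
! already sitting in rp[], wiping tp as it goes.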
.Lsub:
	ldx	[%l0+%o7],%o0
	add	%i3,%o7,%g1
	ld	[%g1+0],%o2
	ld	[%g1+4],%o3
	srlx	%o0,32,%o1
	subccc	%o0,%o2,%o2
	add	%i0,%o7,%g1
	subccc	%o1,%o3,%o3
	st	%o2,[%g1+0]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lsub
	st	%o3,[%g1+4]
	subc	%i4,0,%g4
	sub	%g0,%i5,%o7		! n=-num
	ba	.Lcopy
	nop

.align	32
.Lcopy:
	ldx	[%l0+%o7],%o0
	add	%i0,%o7,%g1
	ld	[%g1+0],%o2
	ld	[%g1+4],%o3
	stx	%g0,[%l0+%o7]
	and	%o0,%g4,%o0
	srlx	%o0,32,%o1
	andn	%o2,%g4,%o2
	andn	%o3,%g4,%o3
	or	%o2,%o0,%o0
	or	%o3,%o1,%o1
	st	%o0,[%g1+0]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lcopy
	st	%o1,[%g1+4]
	sub	%g0,%i5,%o7		! n=-num

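! Zap the stack copies of the operands in double format (tp itself was
! already cleared word by word in .Lcopy above) -- presumably so no
! secret material is left behind in the alloca'd scratch area.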
.Lzap:
	stx	%g0,[%l1+%o7]
	stx	%g0,[%l2+%o7]
	stx	%g0,[%l3+%o7]
	stx	%g0,[%l4+%o7]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lzap
	nop

	ldx	[%sp+2047+192+48],%o7
	wr	%g0,%o7,%asi		! restore %asi

	mov	1,%i0
.Lret:
	ret
	restore
.type   bn_mul_mont_fpu,#function
.size	bn_mul_mont_fpu,(.-bn_mul_mont_fpu)
.asciz	"Montgomery Multiplication for UltraSPARC, CRYPTOGAMS by <appro@openssl.org>"
.align	32