1dnl  HP-PA 2.0 64-bit mpn_sqr_diagonal.
2
3dnl  Copyright 2001-2003 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31
32dnl  This code runs at 7.25 cycles/limb on PA8000 and 7.75 cycles/limb on
33dnl  PA8500.  The cache would saturate at 5 cycles/limb, so there is some room
34dnl  for optimization.
35
36include(`../config.m4')
37
38C INPUT PARAMETERS
C The routine below uses these three registers as follows: rp is the store
C pointer for the result (advanced by 16 per limb), up is the load pointer
C for the source limbs (post-incremented by 8 per load), and n is a counter
C decremented once per limb loaded.
39define(`rp',`%r26')
40define(`up',`%r25')
41define(`n',`%r24')
42
C Integer scratch registers used when a cross product is folded into a
C result that was already stored through rp:
C   p00  doubleword reloaded from -16(rp), updated and stored back
C   p32  cross product reloaded from a stack scratch slot
C   p64  doubleword reloaded from -8(rp), updated and stored back
C   t0   part of the shifted cross product that is added to p00
C   t1   part of the shifted cross product that is added to p64 with carry
43define(`p00',`%r28')
44define(`p32',`%r29')
45define(`p64',`%r31')
46define(`t0',`%r19')
47define(`t1',`%r20')
48
C Emit the assembler level directive: 2.0w when HAVE_ABI_2_0w is defined,
C otherwise 2.0.
49ifdef(`HAVE_ABI_2_0w',
50`	.level	2.0w
51',`	.level	2.0
52')
C mpn_sqr_diagonal
C
C For each of the n doubleword limbs read through up, form the square of the
C limb and write it through rp as two doublewords (16 bytes of output for
C every 8 bytes of input).
C
C Precondition visible from the code: n must be at least 1.  The first limb
C is loaded and n is decremented before any test is made, so n = 0 is not
C handled.
C
C Method.  A limb is loaded into an FP register and treated as two 32-bit
C halves, L = %frNl and R = %frNr.  Three xmpyu products are formed:
C   R*R   stored at 0(rp)
C   L*L   stored at 8(rp)
C   L*R   stored to a stack scratch slot and later reloaded into p32
C The integer side then adds 2*L*R*2^32 into the two stored doublewords:
C   0(rp) += (L*R << 33) mod 2^64
C   8(rp) += (L*R >> 31) + carry from the add above
C which corresponds to  limb^2 = L^2*2^64 + 2*L*R*2^32 + R^2  when L is the
C upper half of the limb and 0(rp) is the less significant result doubleword.
C
C The cross product travels from the FP registers to the integer registers
C through memory (fstd to the stack, ldd back).  Two scratch slots are used,
C -128(%r30) and -120(%r30), alternating from limb to limb.
C
C The main loop is unrolled twice and software pipelined: each half loads
C the next limb, forms the three products of the current limb, and applies
C the cross product fix-up to the previous limb, whose result by then lies at
C -16(rp) and -8(rp).  The halves alternate between the register sets
C %fr8..%fr11 and %fr4..%fr7 and between the two scratch slots.
C
C After every addib, n equals the number of limbs that have not been loaded
C yet.  Every branch below has its delay slot filled; the instruction in the
C slot is executed whether or not the branch is taken.
C
C Registers written: rp, up, n, p00, p32, p64, t0, t1, %fr4..%fr11.
53PROLOGUE(mpn_sqr_diagonal)
C Grow the stack by 128 bytes.  Only the doublewords at -128(%r30) and
C -120(%r30) are accessed below.
54	ldo		128(%r30),%r30
55
C Load limb 0.  If it was the only limb (n-1 == 0) finish in end1.
56	fldds,ma	8(up),%fr8
57	addib,=		-1,n,L(end1)
58	nop
C At least two limbs: load limb 1 into %fr4, then form the three products of
C limb 0.  Its cross product goes to the -120 slot; the fix-up is deferred.
59	fldds,ma	8(up),%fr4
60	xmpyu		%fr8l,%fr8r,%fr10
61	fstd		%fr10,-120(%r30)
62	xmpyu		%fr8r,%fr8r,%fr9
63	fstd		%fr9,0(rp)
64	xmpyu		%fr8l,%fr8l,%fr11
65	fstd		%fr11,8(rp)
C If exactly two limbs, finish in end2.  The ldo in the delay slot advances
C rp by 16 on both paths, so the limb 0 result is now at -16(rp), -8(rp).
66	addib,=		-1,n,L(end2)
67	ldo		16(rp),rp
68
C First loop half.  On entry: %fr4 holds a limb whose products are not yet
C formed, the -120 slot holds the cross product of the limb before it, and
C that earlier limb has its R*R and L*L at -16(rp) and -8(rp).
69LDEF(loop)
70	fldds,ma	8(up),%fr8		C load next up limb
71	xmpyu		%fr4l,%fr4r,%fr6
72	fstd		%fr6,-128(%r30)
73	xmpyu		%fr4r,%fr4r,%fr5	C multiply in fp regs
74	fstd		%fr5,0(rp)
75	xmpyu		%fr4l,%fr4l,%fr7
76	fstd		%fr7,8(rp)
C Fix up the previous limb with its cross product from the -120 slot.
77	ldd		-120(%r30),p32
78	ldd		-16(rp),p00		C accumulate in int regs
79	ldd		-8(rp),p64
80	depd,z		p32,30,31,t0		C t0 = low 31 bits of p32, shifted left 33
81	add		t0,p00,p00
82	std		p00,-16(rp)
83	extrd,u		p32,32,33,t1		C t1 = p32 >> 31
84	add,dc		t1,p64,p64		C adds the carry out of the add into p00
85	std		p64,-8(rp)
C No limbs left to load: finish in exit, with %fr8 still to be multiplied
C and the cross product of the limb just multiplied waiting in the -128 slot.
C rp advances by 16 in the delay slot.
86	addib,=		-1,n,L(exit)
87	ldo		16(rp),rp
88
C Second loop half: same work with the roles swapped.  Load into %fr4, form
C the products of %fr8 (cross product to the -120 slot), and fix up the
C previous limb with the cross product from the -128 slot.
89	fldds,ma	8(up),%fr4
90	xmpyu		%fr8l,%fr8r,%fr10
91	fstd		%fr10,-120(%r30)
92	xmpyu		%fr8r,%fr8r,%fr9
93	fstd		%fr9,0(rp)
94	xmpyu		%fr8l,%fr8l,%fr11
95	fstd		%fr11,8(rp)
96	ldd		-128(%r30),p32
97	ldd		-16(rp),p00
98	ldd		-8(rp),p64
99	depd,z		p32,30,31,t0
100	add		t0,p00,p00
101	std		p00,-16(rp)
102	extrd,u		p32,32,33,t1
103	add,dc		t1,p64,p64
104	std		p64,-8(rp)
C Loop while limbs remain to be loaded; otherwise fall through into end2.
C rp advances by 16 in the delay slot.
105	addib,<>	-1,n,L(loop)
106	ldo		16(rp),rp
107
C Tail reached with %fr4 holding the last limb (products not yet formed) and
C the -120 slot holding the cross product of the limb before it.
108LDEF(end2)
109	xmpyu		%fr4l,%fr4r,%fr6
110	fstd		%fr6,-128(%r30)
111	xmpyu		%fr4r,%fr4r,%fr5
112	fstd		%fr5,0(rp)
113	xmpyu		%fr4l,%fr4l,%fr7
114	fstd		%fr7,8(rp)
C Fix up the limb before the last one (cross product from the -120 slot).
115	ldd		-120(%r30),p32
116	ldd		-16(rp),p00
117	ldd		-8(rp),p64
118	depd,z		p32,30,31,t0
119	add		t0,p00,p00
120	std		p00,-16(rp)
121	extrd,u		p32,32,33,t1
122	add,dc		t1,p64,p64
123	std		p64,-8(rp)
C Step past the last result and fix it up (cross product from the -128 slot).
124	ldo		16(rp),rp
125	ldd		-128(%r30),p32
126	ldd		-16(rp),p00
127	ldd		-8(rp),p64
128	depd,z		p32,30,31,t0
129	add		t0,p00,p00
130	std		p00,-16(rp)
131	extrd,u		p32,32,33,t1
132	add,dc		t1,p64,p64
133	std		p64,-8(rp)
C Return through %r2; the delay slot releases the 128 bytes of stack.
134	bve		(%r2)
135	ldo		-128(%r30),%r30
136
C Tail reached from the first loop half, with %fr8 holding the last limb
C (products not yet formed) and the -128 slot holding the cross product of
C the limb before it.
C
C The fix-ups in this tail and in end1 use a different but equivalent form:
C t0 = low 32 bits of p32 shifted left 32, t1 = p32 >> 32, and the pair
C (t0, t1) is added twice with carry, which again adds 2*p32*2^32.
137LDEF(exit)
138	xmpyu		%fr8l,%fr8r,%fr10
139	fstd		%fr10,-120(%r30)
140	xmpyu		%fr8r,%fr8r,%fr9
141	fstd		%fr9,0(rp)
142	xmpyu		%fr8l,%fr8l,%fr11
143	fstd		%fr11,8(rp)
C Fix up the limb before the last one (cross product from the -128 slot).
144	ldd		-128(%r30),p32
145	ldd		-16(rp),p00
146	ldd		-8(rp),p64
147	depd,z		p32,31,32,t0		C t0 = low 32 bits of p32, shifted left 32
148	add		t0,p00,p00
149	extrd,u		p32,31,32,t1		C t1 = p32 >> 32
150	add,dc		t1,p64,p64
151	add		t0,p00,p00		C second addition of the same pair
152	add,dc		t1,p64,p64
153	std		p00,-16(rp)
154	std		p64,-8(rp)
C Step past the last result and fix it up (cross product from the -120 slot).
155	ldo		16(rp),rp
156	ldd		-120(%r30),p32
157	ldd		-16(rp),p00
158	ldd		-8(rp),p64
159	depd,z		p32,31,32,t0
160	add		t0,p00,p00
161	extrd,u		p32,31,32,t1
162	add,dc		t1,p64,p64
163	add		t0,p00,p00
164	add,dc		t1,p64,p64
165	std		p00,-16(rp)
166	std		p64,-8(rp)
C Return through %r2; the delay slot releases the 128 bytes of stack.
167	bve		(%r2)
168	ldo		-128(%r30),%r30
169
C Single-limb case: %fr8 holds the only limb.  Form its three products, step
C rp past the result, and apply the doubled fix-up from the -128 slot.
170LDEF(end1)
171	xmpyu		%fr8l,%fr8r,%fr10
172	fstd		%fr10,-128(%r30)
173	xmpyu		%fr8r,%fr8r,%fr9
174	fstd		%fr9,0(rp)
175	xmpyu		%fr8l,%fr8l,%fr11
176	fstd		%fr11,8(rp)
177	ldo		16(rp),rp
178	ldd		-128(%r30),p32
179	ldd		-16(rp),p00
180	ldd		-8(rp),p64
181	depd,z		p32,31,32,t0
182	add		t0,p00,p00
183	extrd,u		p32,31,32,t1
184	add,dc		t1,p64,p64
185	add		t0,p00,p00
186	add,dc		t1,p64,p64
187	std		p00,-16(rp)
188	std		p64,-8(rp)
C Return through %r2; the delay slot releases the 128 bytes of stack.
189	bve		(%r2)
190	ldo		-128(%r30),%r30
191EPILOGUE(mpn_sqr_diagonal)
192