1dnl  IA-64 mpn_rsh1add_n/mpn_rsh1sub_n -- rp[] = (up[] +- vp[]) >> 1.
2
3dnl  Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22C         cycles/limb
23C Itanium:    2.5
24C Itanium 2:  1.5
25
26C TODO
27C  * Rewrite function entry code using aorslsh1_n.asm style.
28C  * Micro-optimize feed-in and wind-down code.
29
30C INPUT PARAMETERS
C rp is the destination limb pointer (stored through), up and vp are the two
C source limb pointers (loaded from), and n is the limb count.
31define(`rp',`r32')
32define(`up',`r33')
33define(`vp',`r34')
34define(`n',`r35')
35
C Select the add variant: with this symbol defined, the first ifdef below
C takes effect.
36define(`OPERATION_rsh1add_n',1)
37
C Operation parameters consumed by the code further down:
C   ADDSUB  the per-limb operation (add or sub)
C   PRED    unsigned relation between the result and the first operand that
C           holds exactly when ADDSUB wrapped: ltu for add, gtu for sub
C   INCR    adjustment applied to a limb when the previous limb wrapped
C   LIM     the limb value for which applying INCR wraps in turn
C   func    name of the generated entry point
38ifdef(`OPERATION_rsh1add_n',`
39  define(ADDSUB,       add)
40  define(PRED,	       ltu)
41  define(INCR,	       1)
42  define(LIM,	       -1)
43  define(func, mpn_rsh1add_n)
44')
45ifdef(`OPERATION_rsh1sub_n',`
46  define(ADDSUB,       sub)
47  define(PRED,	       gtu)
48  define(INCR,	       -1)
49  define(LIM,	       0)
50  define(func, mpn_rsh1sub_n)
51')
52
53C Some useful aliases for registers we use
C u0-u3 and v0-v3 receive limbs loaded from up and vp, w0-w3 hold the per-limb
C ADDSUB results, and x0-x3 hold the shifted limbs that get stored through rp.
C Note that u0 is r14, which the entry code also uses as scratch for n mod 4;
C that use is finished before anything is loaded into u0.
54define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
55define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21')
56define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25')
57define(`x0',`r26') define(`x1',`r9') define(`x2',`r30') define(`x3',`r31')
58
59ASM_START()
C Entry point.  Arguments arrive in rp, up, vp, n (r32-r35).  The code from
C here to the branch bundle loads the lowest limb pair, saves ar.lc, and
C dispatches on n mod 4 to one of four feed-in sequences.
60PROLOGUE(func)
61	.prologue
62	.save	ar.lc, r2
63	.body
C 32-bit ABI only: widen the three pointers with addp4 and zero-extend n.
64ifdef(`HAVE_ABI_32',`
65	addp4		rp = 0, rp		C			M I
66	addp4		up = 0, up		C			M I
67	addp4		vp = 0, vp		C			M I
68	zxt4		n = n			C			I
69	;;
70')
C First group: r11 = vp[0] and r10 = up[0] (both pointers post-incremented by
C 8), r2 = incoming ar.lc (restored at .Lcj1 just before the return),
C r14 = n mod 4, p15 = (4 < n), and finally n -= 4.  The and and the compare
C come before the add, so they see the original n.
C Second group: p6/p7/p8 flag n mod 4 = 1/2/3 and select .Lb01/.Lb10/.Lb11;
C n mod 4 = 0 takes none of the branches and falls through to .Lb00.
71 {.mmi;	ld8		r11 = [vp], 8		C			M01
72	ld8		r10 = [up], 8		C			M01
73	mov.i		r2 = ar.lc		C			I0
74}{.mmi;	and		r14 = 3, n		C			M I
75	cmp.lt		p15, p0 = 4, n		C			M I
76	add		n = -4, n		C			M I
77	;;
78}{.mmi;	cmp.eq		p6, p0 = 1, r14		C			M I
79	cmp.eq		p7, p0 = 2, r14		C			M I
80	cmp.eq		p8, p0 = 3, r14		C			M I
81}{.bbb
82  (p6)	br.dptk		.Lb01			C			B
83  (p7)	br.dptk		.Lb10			C			B
84  (p8)	br.dptk		.Lb11			C			B
85}
86
C Feed-in for n mod 4 = 0.  Loads three more limb pairs and forms the lowest
C sum/difference w3 from r10 and r11 (loaded at entry).  n, already reduced
C by 4 at entry, is divided by 4 here.  With p15 set (n > 4) control goes to
C .grt4, which enters the main loop; otherwise the short case is finished
C inline.
87.Lb00:	ld8		v0 = [vp], 8		C			M01
88	ld8		u0 = [up], 8		C			M01
89	shr.u		n = n, 2		C			I0
90	;;
91	ld8		v1 = [vp], 8		C			M01
92	ld8		u1 = [up], 8		C			M01
93	ADDSUB		w3 = r10, r11		C			M I
94	;;
95	ld8		v2 = [vp], 8		C			M01
96	ld8		u2 = [up], 8		C			M01
97  (p15)	br.dpnt		.grt4			C			B
98	;;
99
C Short case (p15 clear, so n = 4 assuming n >= 1 -- TODO confirm that n = 0
C is never passed).  p7/p8/p9 record a wrap out of w3/w0/w1.  Each predicated
C pair propagates the previous wrap into the next limb: cmp.eq.or raises that
C limb's own flag when the limb equals LIM, and the add after it applies INCR.
C r8 = bit 0 of the lowest result limb, the bit the right shift discards
C (r8 is the integer return register in the IA-64 software conventions).
100	cmp.PRED	p7, p0 = w3, r10	C			M I
101	and		r8 = 1, w3		C			M I
102	ADDSUB		w0 = u0, v0		C			M I
103	;;
104	cmp.PRED	p8, p0 = w0, u0		C			M I
105	ADDSUB		w1 = u1, v1		C			M I
106	;;
107	cmp.PRED	p9, p0 = w1, u1		C			M I
108   (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
109   (p7)	add		w0 = INCR, w0		C			M I
110	;;
C x3 = w3 shifted right one bit with bit 0 of w0 shifted in at the top.
C The remaining work is done by the wind-down code from .Lcj4 on, which
C stores four limbs.
111	shrp		x3 = w0, w3, 1		C			I0
112	ADDSUB		w2 = u2, v2		C			M I
113   (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
114   (p8)	add		w1 = INCR, w1		C			M I
115	br		.Lcj4			C			B
116
C n > 4: the same computation, interleaved with the loads for the next four
C limbs.  n is decremented once more and written to ar.lc as the loop count,
C then the main loop is entered at .LL00.
117.grt4:	ld8		v3 = [vp], 8		C			M01
118	cmp.PRED	p7, p0 = w3, r10	C			M I
119	ld8		u3 = [up], 8		C			M01
120	and		r8 = 1, w3		C			M I
121	;;
122	ADDSUB		w0 = u0, v0		C			M I
123	ld8		v0 = [vp], 8		C			M01
124	add		n = -1, n
125	;;
126	cmp.PRED	p8, p0 = w0, u0		C			M I
127	ld8		u0 = [up], 8		C			M01
128	ADDSUB		w1 = u1, v1		C			M I
129	;;
130	ld8		v1 = [vp], 8		C			M01
131	mov.i		ar.lc = n		C			I0
132	cmp.PRED	p9, p0 = w1, u1		C			M I
133	ld8		u1 = [up], 8		C			M01
134   (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
135   (p7)	add		w0 = INCR, w0		C			M I
136	;;
137	ADDSUB		w2 = u2, v2		C			M I
138	ld8		v2 = [vp], 8		C			M01
139	shrp		x3 = w0, w3, 1		C			I0
140   (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
141   (p8)	add		w1 = INCR, w1		C			M I
142	br		.LL00			C			B
143
144
C Feed-in for n mod 4 = 1.  w2 is the lowest sum/difference, formed from r10
C and r11 (loaded at entry).  n, already reduced by 4 at entry, is divided by
C 4 here.  With p15 set (n > 4) control goes to .grt1.
145.Lb01:	ADDSUB		w2 = r10, r11		C			M I
146	shr.u		n = n, 2		C			I0
147  (p15)	br.dpnt		.grt1			C			B
148	;;
149
C Short case (p15 clear, so n = 1 assuming n >= 1 -- TODO confirm).  The
C single result limb is w2 >> 1 with bit 63 set when the operation wrapped
C (p6).  r8 = bit 0 of w2, the bit the shift discards.  .Lcj1 stores x2 and
C returns.
150	cmp.PRED	p6, p7 = w2, r10	C			M I
151	shr.u		x2 = w2, 1		C			I0
152	and		r8 = 1, w2		C			M I
153	;;
154   (p6)	dep		x2 = -1, x2, 63, 1	C			I0
155	br		.Lcj1			C			B
156
C n > 4: load four more limb pairs and set ar.lc to the loop count.  The
C br.cloop goes to .grt5 (and from there into the main loop at .LL01) while
C ar.lc is nonzero; with ar.lc = 0 it falls through and the operand is
C finished without the loop, joining the wind-down at .Lcj5.
157.grt1:	ld8		v3 = [vp], 8		C			M01
158	ld8		u3 = [up], 8		C			M01
159	;;
160	ld8		v0 = [vp], 8		C			M01
161	ld8		u0 = [up], 8		C			M01
162	mov.i		ar.lc = n		C FIXME swap with next	I0
163	;;
164	ld8		v1 = [vp], 8		C			M01
165	ld8		u1 = [up], 8		C			M01
166	;;
167	ld8		v2 = [vp], 8		C			M01
168	ld8		u2 = [up], 8		C			M01
169	cmp.PRED	p6, p0 = w2, r10	C			M I
170	and		r8 = 1, w2		C			M I
171	ADDSUB		w3 = u3, v3		C			M I
172	br.cloop.dptk	.grt5			C			B
173	;;
174
C No loop iterations: p7/p8/p9 record a wrap out of w3/w0/w1.  Each
C predicated pair folds the previous wrap into the next limb (cmp.eq.or
C raises the flag when the limb equals LIM, the add applies INCR).
C x2 = w2 shifted right one bit with bit 0 of w3 shifted in at the top.
175	cmp.PRED	p7, p0 = w3, u3		C			M I
176	;;
177	ADDSUB		w0 = u0, v0		C			M I
178   (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
179   (p6)	add		w3 = INCR, w3		C			M I
180	;;
181	cmp.PRED	p8, p0 = w0, u0		C			M I
182	shrp		x2 = w3, w2, 1		C			I0
183	ADDSUB		w1 = u1, v1		C			M I
184	;;
185	cmp.PRED	p9, p0 = w1, u1		C			M I
186   (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
187   (p7)	add		w0 = INCR, w0		C			M I
188	br		.Lcj5			C			B
189
C Loop entry path: the same computation as above, interleaved with the loads
C for the following limbs, then into the main loop at .LL01.
190.grt5:	ld8		v3 = [vp], 8		C			M01
191	cmp.PRED	p7, p0 = w3, u3		C			M I
192	ld8		u3 = [up], 8		C			M01
193	;;
194	ADDSUB		w0 = u0, v0		C			M I
195	ld8		v0 = [vp], 8		C			M01
196   (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
197   (p6)	add		w3 = INCR, w3		C			M I
198	;;
199	cmp.PRED	p8, p0 = w0, u0		C			M I
200	shrp		x2 = w3, w2, 1		C			I0
201	ld8		u0 = [up], 8		C			M01
202	ADDSUB		w1 = u1, v1		C			M I
203	;;
204	ld8		v1 = [vp], 8		C			M01
205	cmp.PRED	p9, p0 = w1, u1		C			M I
206	ld8		u1 = [up], 8		C			M01
207   (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
208   (p7)	add		w0 = INCR, w0		C			M I
209	br		.LL01			C			B
210
211
C Feed-in for n mod 4 = 2.  Loads one more limb pair; w1 is the lowest
C sum/difference, formed from r10 and r11 (loaded at entry).  n, already
C reduced by 4 at entry, is divided by 4 here.  With p15 set (n > 4) control
C goes to .grt2.
212.Lb10:	ld8		v2 = [vp], 8		C			M01
213	ld8		u2 = [up], 8		C			M01
214	shr.u		n = n, 2		C			I0
215	ADDSUB		w1 = r10, r11		C			M I
216  (p15)	br.dpnt		.grt2			C			B
217	;;
218
C Short case (p15 clear, so n = 2 assuming n >= 1 -- TODO confirm).
C p9 records a wrap out of w1 and is folded into w2; p6 ends up as the wrap
C out of the top limb.  r8 = bit 0 of w1, the bit the shift discards.
C x1 = w1 >> 1 with bit 0 of w2 on top, x2 = w2 >> 1; .Lcj2 stores x1,
C deposits p6 into bit 63 of x2 and stores that too.
219	cmp.PRED	p9, p0 = w1, r10	C			M I
220	and		r8 = 1, w1		C			M I
221	ADDSUB		w2 = u2, v2		C			M I
222	;;
223	cmp.PRED	p6, p0 = w2, u2		C			M I
224	;;
225   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
226   (p9)	add		w2 = INCR, w2		C			M I
227	;;
228	shrp		x1 = w2, w1, 1		C			I0
229	shr.u		x2 = w2, 1		C			I0
230	br		.Lcj2			C			B
231
C n > 4: load four more limb pairs and set ar.lc to the loop count.  The
C br.cloop goes to .grt6 (and from there into the main loop at .LL10) while
C ar.lc is nonzero; with ar.lc = 0 it falls through and the operand is
C finished without the loop, joining the wind-down at .Lcj6.
232.grt2:	ld8		v3 = [vp], 8		C			M01
233	ld8		u3 = [up], 8		C			M01
234	;;
235	ld8		v0 = [vp], 8		C			M01
236	ld8		u0 = [up], 8		C			M01
237	mov.i		ar.lc = n		C			I0
238	;;
239	ld8		v1 = [vp], 8		C			M01
240	cmp.PRED	p9, p0 = w1, r10	C			M I
241	ld8		u1 = [up], 8		C			M01
242	and		r8 = 1, w1		C			M I
243	;;
244	ADDSUB		w2 = u2, v2		C			M I
245	ld8		v2 = [vp], 8		C			M01
246	;;
247	cmp.PRED	p6, p0 = w2, u2		C			M I
248	ld8		u2 = [up], 8		C			M01
249	ADDSUB		w3 = u3, v3		C			M I
250	br.cloop.dptk	.grt6			C			B
251	;;
252
C No loop iterations: fold the wrap flags p9 -> w2 and p6 -> w3, form
C x1 = w1 >> 1 with bit 0 of w2 on top, start w0, and join the wind-down.
253	cmp.PRED	p7, p0 = w3, u3		C			M I
254   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
255   (p9)	add		w2 = INCR, w2		C			M I
256	;;
257	shrp		x1 = w2, w1, 1		C			I0
258	ADDSUB		w0 = u0, v0		C			M I
259   (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
260   (p6)	add		w3 = INCR, w3		C			M I
261	br		.Lcj6			C			B
262
C Loop entry path: the same computation as above, interleaved with the loads
C for the following limbs, then into the main loop at .LL10.
263.grt6:	ld8		v3 = [vp], 8		C			M01
264	cmp.PRED	p7, p0 = w3, u3		C			M I
265	ld8		u3 = [up], 8		C			M01
266   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
267   (p9)	add		w2 = INCR, w2		C			M I
268	;;
269	shrp		x1 = w2, w1, 1		C			I0
270	ADDSUB		w0 = u0, v0		C			M I
271	ld8		v0 = [vp], 8		C			M01
272   (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
273   (p6)	add		w3 = INCR, w3		C			M I
274	br		.LL10			C			B
275
276
C Feed-in for n mod 4 = 3.  Loads two more limb pairs; w0 is the lowest
C sum/difference, formed from r10 and r11 (loaded at entry).  n, already
C reduced by 4 at entry, is divided by 4 here.  With p15 set (n > 4) control
C goes to .grt3.
277.Lb11:	ld8		v1 = [vp], 8		C			M01
278	ld8		u1 = [up], 8		C			M01
279	shr.u		n = n, 2		C			I0
280	;;
281	ld8		v2 = [vp], 8		C			M01
282	ld8		u2 = [up], 8		C			M01
283	ADDSUB		w0 = r10, r11		C			M I
284  (p15)	br.dpnt		.grt3			C			B
285	;;
286
C Short case (p15 clear, so n = 3 assuming n >= 1 -- TODO confirm).
C p8/p9/p6 record a wrap out of w0/w1/w2; each predicated pair folds the
C previous wrap into the next limb (cmp.eq.or raises the flag when the limb
C equals LIM, the add applies INCR).  r8 = bit 0 of w0, the bit the shift
C discards.  x0 = w0 >> 1 with bit 0 of w1 on top; .Lcj3 stores the three
C result limbs.
287	cmp.PRED	p8, p0 = w0, r10	C			M I
288	ADDSUB		w1 = u1, v1		C			M I
289	and		r8 = 1, w0		C			M I
290	;;
291	cmp.PRED	p9, p0 = w1, u1		C			M I
292	;;
293	ADDSUB		w2 = u2, v2		C			M I
294   (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
295   (p8)	add		w1 = INCR, w1		C			M I
296	;;
297	cmp.PRED	p6, p0 = w2, u2		C			M I
298	shrp		x0 = w1, w0, 1		C			I0
299	;;
300   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
301   (p9)	add		w2 = INCR, w2		C			M I
302	br		.Lcj3			C			B
303
C n > 4: load four more limb pairs and set ar.lc to the loop count.  The
C br.cloop goes to .grt7 (and from there into the main loop at .LL11) while
C ar.lc is nonzero; with ar.lc = 0 it falls through and the operand is
C finished without the loop, joining the wind-down at .Lcj7.
304.grt3:	ld8		v3 = [vp], 8		C			M01
305	ld8		u3 = [up], 8		C			M01
306	;;
307	ld8		v0 = [vp], 8		C			M01
308	mov.i		ar.lc = n		C			I0
309	cmp.PRED	p8, p0 = w0, r10	C			M I
310	ld8		u0 = [up], 8		C			M01
311	ADDSUB		w1 = u1, v1		C			M I
312	and		r8 = 1, w0		C			M I
313	;;
314	ld8		v1 = [vp], 8		C			M01
315	cmp.PRED	p9, p0 = w1, u1		C			M I
316	ld8		u1 = [up], 8		C			M01
317	;;
318	ADDSUB		w2 = u2, v2		C			M I
319	ld8		v2 = [vp], 8		C			M01
320   (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
321   (p8)	add		w1 = INCR, w1		C			M I
322	;;
323	cmp.PRED	p6, p0 = w2, u2		C			M I
324	shrp		x0 = w1, w0, 1		C			I0
325	ld8		u2 = [up], 8		C			M01
326	ADDSUB		w3 = u3, v3		C			M I
327	br.cloop.dptk	.grt7			C			B
328	;;
329
C No loop iterations: detect the wrap out of w3, fold p9 into w2, and join
C the wind-down.
330	cmp.PRED	p7, p0 = w3, u3		C			M I
331   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
332   (p9)	add		w2 = INCR, w2		C			M I
333	br		.Lcj7			C			B
334
C Loop entry path: the same step plus the loads of the next limb pair, then
C into the main loop at .LL11.
335.grt7:	ld8		v3 = [vp], 8		C			M01
336	cmp.PRED	p7, p0 = w3, u3		C			M I
337	ld8		u3 = [up], 8		C			M01
338   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
339   (p9)	add		w2 = INCR, w2		C			M I
340	br		.LL11			C			B
341
342
343C *** MAIN LOOP START ***
C Four limbs per iteration in six instruction groups.  The loop is software
C pipelined: each group mixes loads of upcoming limbs (u*, v*), the ADDSUB
C and wrap detection for the current limbs (w*, p6-p9), the one-bit funnel
C shift of finished limbs (x*), and stores of earlier results through rp.
C   Wrap flags:  p8 for w0, p9 for w1, p6 for w2, p7 for w3.  A set flag
C                makes the next limb take INCR, and raises that limb's own
C                flag if it equals LIM before the adjustment.
C   Outputs:     x0 = (w1:w0) >> 1, x1 = (w2:w1) >> 1,
C                x2 = (w3:w2) >> 1, x3 = (w0:w3) >> 1 (low 64 bits each).
C .LL11, .LL10, .LL01 and .LL00 are the entry points used by the feed-in
C code (.grt7, .grt6, .grt5 and .grt4 respectively).  br.cloop repeats the
C loop while ar.lc is nonzero and otherwise falls through to .Lskip.
C NOTE(review): the nop.b slots presumably pad the bundles to the intended
C templates -- confirm before removing them.
344	ALIGN(32)
345.Loop:	st8		[rp] = x3, 8		C			M23
346	ld8		v3 = [vp], 8		C			M01
347	cmp.PRED	p7, p0 = w3, u3		C			M I
348	ld8		u3 = [up], 8		C			M01
349   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
350   (p9)	add		w2 = INCR, w2		C			M I
351	;;
352.LL11:	st8		[rp] = x0, 8		C			M23
353	shrp		x1 = w2, w1, 1		C			I0
354	ADDSUB		w0 = u0, v0		C			M I
355	ld8		v0 = [vp], 8		C			M01
356   (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
357   (p6)	add		w3 = INCR, w3		C			M I
358	;;
359.LL10:	cmp.PRED	p8, p0 = w0, u0		C			M I
360	shrp		x2 = w3, w2, 1		C			I0
361	nop.b		0
362	ld8		u0 = [up], 8		C			M01
363	ADDSUB		w1 = u1, v1		C			M I
364	nop.b		0
365	;;
366	st8		[rp] = x1, 8		C			M23
367	ld8		v1 = [vp], 8		C			M01
368	cmp.PRED	p9, p0 = w1, u1		C			M I
369	ld8		u1 = [up], 8		C			M01
370   (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
371   (p7)	add		w0 = INCR, w0		C			M I
372	;;
373.LL01:	st8		[rp] = x2, 8		C			M23
374	shrp		x3 = w0, w3, 1		C			I0
375	ADDSUB		w2 = u2, v2		C			M I
376	ld8		v2 = [vp], 8		C			M01
377   (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
378   (p8)	add		w1 = INCR, w1		C			M I
379	;;
380.LL00:	cmp.PRED	p6, p0 = w2, u2		C			M I
381	shrp		x0 = w1, w0, 1		C			I0
382	nop.b		0
383	ld8		u2 = [up], 8		C			M01
384	ADDSUB		w3 = u3, v3		C			M I
385	br.cloop.dptk	.Loop			C			B
386	;;
387C *** MAIN LOOP END ***
388
C Wind-down.  No further loads: the limbs already held in u*/v*/w* are
C finished and stored.  The steps mirror the main loop without its loads.
C Entering at .LcjK leaves K limbs still to be stored (.Lskip itself: 8).
C .Lskip is reached by falling out of the main loop; .Lcj7-.Lcj1 are also
C the targets of the feed-in paths that never enter the loop.
389.Lskip:	st8		[rp] = x3, 8		C			M23
390	cmp.PRED	p7, p0 = w3, u3		C			M I
391   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
392   (p9)	add		w2 = INCR, w2		C			M I
393	;;
394.Lcj7:	st8		[rp] = x0, 8		C			M23
395	shrp		x1 = w2, w1, 1		C			I0
396	ADDSUB		w0 = u0, v0		C			M I
397   (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
398   (p6)	add		w3 = INCR, w3		C			M I
399	;;
400.Lcj6:	cmp.PRED	p8, p0 = w0, u0		C			M I
401	shrp		x2 = w3, w2, 1		C			I0
402	ADDSUB		w1 = u1, v1		C			M I
403	;;
404	st8		[rp] = x1, 8		C			M23
405	cmp.PRED	p9, p0 = w1, u1		C			M I
406   (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
407   (p7)	add		w0 = INCR, w0		C			M I
408	;;
409.Lcj5:	st8		[rp] = x2, 8		C			M23
410	shrp		x3 = w0, w3, 1		C			I0
411	ADDSUB		w2 = u2, v2		C			M I
412   (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
413   (p8)	add		w1 = INCR, w1		C			M I
414	;;
415.Lcj4:	cmp.PRED	p6, p0 = w2, u2		C			M I
416	shrp		x0 = w1, w0, 1		C			I0
417	;;
418	st8		[rp] = x3, 8		C			M23
419   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
420   (p9)	add		w2 = INCR, w2		C			M I
421	;;
C w2 is the top limb here and p6 its wrap flag.  x1 takes bit 0 of w2 on top
C of w1 >> 1; the top result limb x2 is w2 >> 1 with p6 deposited into bit
C 63, so the final carry/borrow becomes the most significant result bit.
422.Lcj3:	st8		[rp] = x0, 8		C			M23
423	shrp		x1 = w2, w1, 1		C			I0
424	shr.u		x2 = w2, 1		C			I0
425	;;
426.Lcj2:	st8		[rp] = x1, 8		C			M23
427   (p6)	dep		x2 = -1, x2, 63, 1	C			I0
428	;;
C Store the top limb (no post-increment), restore the ar.lc value saved in
C r2 at entry, and return through b0.  r8 was set by the feed-in code.
429.Lcj1:	st8		[rp] = x2		C			M23
430	mov.i		ar.lc = r2		C			I0
431	br.ret.sptk.many b0			C			B
432EPILOGUE()
433