1dnl  IA-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
2
3dnl  Contributed to the GNU project by Torbjorn Granlund.
4
5dnl  Copyright 2003-2005, 2010, 2011 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C           cycles/limb
36C Itanium:      2.67
37C Itanium 2:    1.25
38
39C TODO
40C  * Consider using special code for small n, using something like
41C    "switch (8 * (n >= 8) + (n mod 8))" to enter it and feed-in code.
42C  * The non-nc code was trimmed cycle for cycle to its current state.  It is
43C    probably hard to save more than an odd cycle there.  The nc code is much
44C    cruder (since tune/speed doesn't have any applicable direct measurements).
45C  * Without the nc entry points, this becomes around 1800 bytes of object
46C    code; the nc code adds over 1000 bytes.  We should perhaps sacrifice a
47C    few cycles for the non-nc code and let it fall into the nc code.
48
49C INPUT PARAMETERS (argument registers)
50define(`rp',`r32')	C results are stored through this pointer
51define(`up',`r33')	C first source operand, read with ld8
52define(`vp',`r34')	C second source operand, read with ld8
53define(`n',`r35')	C limb count
54define(`cy',`r36')	C carry/borrow-in, read only by the nc entry point
55
C Operation-dependent macros, selected by OPERATION_add_n / OPERATION_sub_n:
C   ADDSUB   limb operation (add or sub)
C   CND      unsigned compare condition that detects carry/borrow out of
C            "w = u ADDSUB v" when used as "cmp.CND p = w, u"
C   INCR     adjustment applied to a result limb for an incoming carry/borrow
C   LIM      result limb value that wraps when INCR is applied to it
C   LIM2     result limb value after such a wrap (INCR already applied)
C   func     name of the entry point without a carry-in operand
C   func_nc  name of the entry point taking a carry-in operand
56ifdef(`OPERATION_add_n',`
57  define(ADDSUB,	add)
58  define(CND,		ltu)
59  define(INCR,		1)
60  define(LIM,		-1)
61  define(LIM2,		0)
62  define(func,    mpn_add_n)
63  define(func_nc, mpn_add_nc)
64')
65ifdef(`OPERATION_sub_n',`
66  define(ADDSUB,	sub)
67  define(CND,		gtu)
68  define(INCR,		-1)
69  define(LIM,		0)
70  define(LIM2,		-1)
71  define(func,    mpn_sub_n)
72  define(func_nc, mpn_sub_nc)
73')
74
C The carry propagation code below is written with a "cmpeqor" pseudo-op.
C IA-64 has no mnemonic of that name, so map it to the parallel-or compare
C cmp.eq.or, which sets its target predicate when the operands are equal and
C leaves it unchanged otherwise.  The name is quoted so that this line stays
C harmless should an included file already provide the same macro.
define(`cmpeqor', `cmp.eq.or')
C Prefetch distance in bytes: upadv/vpadv are formed as up/vp + PFDIST.
75define(PFDIST, 500)
76
77C Register aliases used by the code below
C   u0-u3, v0-v3   limbs loaded from up and vp
C   w0-w3          result limbs waiting to be stored
C   rpx            second store pointer, used next to rp in the main loop
C   upadv, vpadv   lfetch addresses running ahead of up and vp
78define(`u0', `r14')  define(`u1', `r15')  define(`u2', `r16')  define(`u3', `r17')
79define(`v0', `r24')  define(`v1', `r25')  define(`v2', `r26')  define(`v3', `r27')
80define(`w0', `r28')  define(`w1', `r29')  define(`w2', `r30')  define(`w3', `r31')
81define(`rpx', `r3')
82define(`upadv', `r20')  define(`vpadv', `r21')
83
84MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
85
86ASM_START()
C func_nc: limb-wise ADDSUB of {up,n} and {vp,n} into {rp,n} with an explicit
C carry/borrow-in operand cy.
C
C Entry and dispatch.  With HAVE_ABI_32 the three pointers first go through
C addp4 and n through zxt4.  Then the first limb pair is loaded into r10/r11,
C ar.lc is saved in r2, and:
C   r14         = n & 7, selecting one of the eight feed-in sequences below
C   p15 / p14   = n > 8 / n <= 8
C   n           = n - 6; feed-in sequences that enter the main loop use
C                 n >> 3 of this value as the ar.lc loop count
C   upadv/vpadv = up/vp + PFDIST, the lfetch addresses used in the main loop
C   r23         = cy, tested later by the feed-in sequences
C Residues 1..7 branch to .Lc001 ... .Lc111; residue 0 falls through into
C .Lc000.  The last bundle also loads a second limb pair into r18/r19 and
C copies cy != 0 into p13; only .Lc000 and .Lc111 are reached after it.
C The main loop and wind-down code these sequences branch to are located in
C the func body further down.
87PROLOGUE(func_nc)
88	.prologue
89	.save	ar.lc, r2
90	.body
91ifdef(`HAVE_ABI_32',`
92		addp4	rp = 0, rp		C			M I
93		addp4	up = 0, up		C			M I
94		nop.i	0
95		addp4	vp = 0, vp		C			M I
96		nop.m	0
97		zxt4	n = n			C			I
98	;;
99')
100
101 {.mmi;		ld8	r11 = [vp], 8		C			M01
102		ld8	r10 = [up], 8		C			M01
103		mov	r2 = ar.lc		C			I0
104}{.mmi;		and	r14 = 7, n		C			M I
105		cmp.lt	p15, p14 = 8, n		C			M I
106		add	n = -6, n		C			M I
107	;;
108}{.mmi;		add	upadv = PFDIST, up	C Merging these lines into the feed-in
109		add	vpadv = PFDIST, vp	C code could save a cycle per call at
110		mov	r23 = cy		C the expense of code size.
111	;;
112}{.mmi;		cmp.eq	p6, p0 = 1, r14		C			M I
113		cmp.eq	p7, p0 = 2, r14		C			M I
114		cmp.eq	p8, p0 = 3, r14		C			M I
115}{.bbb;	(p6)	br.dptk	.Lc001			C			B
116	(p7)	br.dptk	.Lc010			C			B
117	(p8)	br.dptk	.Lc011			C			B
118	;;
119}{.mmi;		cmp.eq	p9, p0 = 4, r14		C			M I
120		cmp.eq	p10, p0 = 5, r14	C			M I
121		cmp.eq	p11, p0 = 6, r14	C			M I
122}{.bbb;	(p9)	br.dptk	.Lc100			C			B
123	(p10)	br.dptk	.Lc101			C			B
124	(p11)	br.dptk	.Lc110			C			B
125	;;
126}{.mmi;		ld8	r19 = [vp], 8		C			M01
127		ld8	r18 = [up], 8		C			M01
128		cmp.ne	p13, p0 = 0, cy		C copy cy to p13	M I
129}{.mmb;		cmp.eq	p12, p0 = 7, r14	C			M I
130		nop	0
131	(p12)	br.dptk	.Lc111			C			B
132	;;
133}
134
135.Lc000:
C Feed-in for n mod 8 == 0 with carry-in (fall-through case of the dispatch).
C On entry r10/r11 and r18/r19 hold the first two limb pairs and p13 is set
C when cy != 0.  Loads five more limb pairs, sets ar.lc = n >> 3 (n was
C already reduced by 6), recomputes upadv/vpadv, and forms w1, w2, w3 with
C carry predicates p7, p8, p9.  The carry-in is applied to w1 under p13 and
C p7 is applied to w2 before joining the main loop at L(m0).
136 {.mmi;		ld8	v3 = [vp], 8		C			M01
137		ld8	u3 = [up], 8		C			M01
138		shr.u	n = n, 3		C			I0
139	;;
140}{.mmi;		add	vpadv = PFDIST, vp	C			M I
141		ld8	v0 = [vp], 8		C			M01
142		mov	ar.lc = n		C			I0
143}{.mmi;		ld8	u0 = [up], 8		C			M01
144		ADDSUB	w1 = r10, r11		C			M I
145		nop	0
146	;;
147}{.mmi;		add	upadv = PFDIST, up	C			M I
148		ld8	v1 = [vp], 8		C			M01
149		cmp.CND	p7, p0 = w1, r10	C			M I
150}{.mmi;		ld8	u1 = [up], 8		C			M01
151		ADDSUB	w2 = r18, r19		C			M I
152		add	rpx = 8, rp		C			M I
153	;;
154}{.mmi;		ld8	v2 = [vp], 8		C			M01
155		cmp.CND	p8, p0 = w2, r18	C			M I
156	(p13)	cmpeqor	p7, p0 = LIM, w1	C			M I
157}{.mmi;		ld8	u2 = [up], 8		C			M01
158	(p13)	add	w1 = INCR, w1		C			M I
159		ADDSUB	w3 = u3, v3		C			M I
160	;;
161}{.mmi;		ld8	v3 = [vp], 8		C			M01
162		cmp.CND	p9, p0 = w3, u3		C			M I
163	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
164}{.mmb;		ld8	u3 = [up], 8		C			M01
165	(p7)	add	w2 = INCR, w2		C			M I
166		br	L(m0)
167}
168
169.Lc001:
C Feed-in for n mod 8 == 1 with carry-in.  w0 is the result for the first
C limb pair.  If n > 8 (p15) the next pair is loaded and control continues at
C L(0).  Otherwise the carry-in saved in r23 is folded into p6/w0, r8 is
C cleared, and the code finishes at L(cj1).
170 {.mmi;	(p15)	ld8	v1 = [vp], 8		C			M01
171	(p15)	ld8	u1 = [up], 8		C			M01
172		ADDSUB	w0 = r10, r11		C			M I
173}{.mmb;		nop	0
174		nop	0
175	(p15)	br	L(0)
176	;;
177}{.mmi;		cmp.ne	p9, p0 = 0, r23		C			M I
178		mov	r8 = 0
179		cmp.CND	p6, p0 = w0, r10	C			M I
180	;;
181}{.mmb;	(p9)	cmpeqor	p6, p0 = LIM, w0	C			M I
182	(p9)	add	w0 = INCR, w0		C			M I
183		br	L(cj1)			C			B
184}
C n > 8: load the following limb pairs, set ar.lc = n >> 3, turn the carry-in
C into p9 and enter the main loop at L(c1), where p9 is applied to w0.
185L(0):
186 {.mmi;		ld8	v2 = [vp], 8		C			M01
187		ld8	u2 = [up], 8		C			M01
188		shr.u	n = n, 3		C			I0
189	;;
190}{.mmi;		ld8	v3 = [vp], 8		C			M01
191		ld8	u3 = [up], 8		C			M01
192		mov	ar.lc = n		C			I0
193}{.mmi;		nop	0
194		cmp.ne	p9, p0 = 0, r23		C			M I
195		nop	0
196	;;
197}{.mmi;		ld8	v0 = [vp], 8		C			M01
198		cmp.CND	p6, p0 = w0, r10	C			M I
199		add	rpx = 16, rp		C			M I
200}{.mmb;		ld8	u0 = [up], 8		C			M01
201		ADDSUB	w1 = u1, v1		C			M I
202		br	L(c1)			C			B
203}
204
205.Lc010:
C Feed-in for n mod 8 == 2 with carry-in.  w3 is the result for the first
C limb pair, p8 is the carry-in (r23 != 0), and r8 is cleared.  If n > 8
C (p15) continue at L(1).  Otherwise the carry-in is added to w3 one
C instruction group before the wrap test, so that test compares against LIM2
C rather than LIM; the code then finishes at L(cj2).
206 {.mmi;		ld8	v0 = [vp], 8		C			M01
207		ld8	u0 = [up], 8		C			M01
208		mov	r8 = 0			C			M I
209}{.mmb;		ADDSUB	w3 = r10, r11		C			M I
210		cmp.ne	p8, p0 = 0, r23		C			M I
211	(p15)	br	L(1)			C			B
212	;;
213}{.mmi;		cmp.CND	p9, p0 = w3, r10	C			M I
214		ADDSUB	w0 = u0, v0		C			M I
215	(p8)	add	w3 = INCR, w3		C			M I
216	;;
217}{.mmb;		cmp.CND	p6, p0 = w0, u0		C			M I
218	(p8)	cmpeqor	p9, p0 = LIM2, w3	C			M I
219		br	L(cj2)			C			B
220}
C n > 8: load three more limb pairs, set ar.lc = n >> 3, apply the carry-in
C to w3 and enter the main loop at L(m23).
221L(1):
222 {.mmi;		ld8	v1 = [vp], 8		C			M01
223		ld8	u1 = [up], 8		C			M01
224		shr.u	n = n, 3		C			I0
225	;;
226}{.mmi;		ld8	v2 = [vp], 8		C			M01
227		ld8	u2 = [up], 8		C			M01
228		mov	ar.lc = n		C			I0
229	;;
230}{.mmi;		ld8	v3 = [vp], 8		C			M01
231		ld8	u3 = [up], 8		C			M01
232		cmp.CND	p9, p0 = w3, r10	C			M I
233	;;
234}{.mmi;	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
235	(p8)	add	w3 = INCR, w3		C			M I
236		ADDSUB	w0 = u0, v0		C			M I
237}{.mmb;		add	rpx = 24, rp		C			M I
238		nop	0
239		br	L(m23)			C			B
240}
241
242.Lc011:
C Feed-in for n mod 8 == 3 with carry-in.  w2 is the result for the first
C limb pair and p7 is the carry-in (r23 != 0).  If n > 8 (p15) continue at
C L(2); otherwise form w3, apply the carry-in to w2 and finish at L(cj3).
243 {.mmi;		ld8	v3 = [vp], 8		C			M01
244		ld8	u3 = [up], 8		C			M01
245		shr.u	n = n, 3		C			I0
246}{.mmi;		ADDSUB	w2 = r10, r11		C			M I
247		cmp.ne	p7, p0 = 0, r23		C			M I
248		nop	0
249	;;
250}{.mmb;		ld8	v0 = [vp], 8		C			M01
251		ld8	u0 = [up], 8		C			M01
252	(p15)	br	L(2)			C			B
253}{.mmi;		cmp.CND	p8, p0 = w2, r10	C			M I
254		ADDSUB	w3 = u3, v3		C			M I
255		nop	0
256	;;
257}{.mmb;	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
258	(p7)	add	w2 = INCR, w2		C			M I
259		br	L(cj3)			C			B
260}
C n > 8: load more limb pairs, set ar.lc, apply the carry-in to w2 and p8 to
C w3, store w2 through rp here, and enter the main loop at L(m23).
261L(2):
262 {.mmi;		ld8	v1 = [vp], 8		C			M01
263		ld8	u1 = [up], 8		C			M01
264		ADDSUB	w3 = u3, v3		C			M I
265	;;
266}{.mmi;		ld8	v2 = [vp], 8		C			M01
267		ld8	u2 = [up], 8		C			M01
268		cmp.CND	p8, p0 = w2, r10	C			M I
269	;;
270}{.mmi;		ld8	v3 = [vp], 8		C			M01
271		cmp.CND	p9, p0 = w3, u3		C			M I
272		mov	ar.lc = n		C			I0
273}{.mmi;		ld8	u3 = [up], 8		C			M01
274	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
275	(p7)	add	w2 = INCR, w2		C			M I
276	;;
277}{.mmi;		add	rpx = 32, rp		C			M I
278		st8	[rp] = w2, 8		C			M23
279	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
280}{.mmb;	(p8)	add	w3 = INCR, w3		C			M I
281		ADDSUB	w0 = u0, v0		C			M I
282		br	L(m23)
283}
284
285.Lc100:
C Feed-in for n mod 8 == 4 with carry-in.  w1 is the result for the first
C limb pair and p6 is the carry-in (r23 != 0), applied to w1 and folded into
C p7.  If n <= 8 (p14) finish at L(cj4); otherwise load more limb pairs, set
C ar.lc and enter the main loop at L(m4).
286 {.mmi;		ld8	v2 = [vp], 8		C			M01
287		ld8	u2 = [up], 8		C			M01
288		shr.u	n = n, 3		C			I0
289}{.mmi;		ADDSUB	w1 = r10, r11		C			M I
290		nop	0
291		nop	0
292	;;
293}{.mmi;		ld8	v3 = [vp], 8		C			M01
294		ld8	u3 = [up], 8		C			M01
295		add	rpx = 8, rp		C			M I
296}{.mmi;		cmp.ne	p6, p0 = 0, r23		C			M I
297		cmp.CND	p7, p0 = w1, r10	C			M I
298		nop	0
299	;;
300}{.mmi;		ld8	v0 = [vp], 8		C			M01
301		ld8	u0 = [up], 8		C			M01
302		ADDSUB	w2 = u2, v2		C			M I
303}{.mmb;	(p6)	cmpeqor	p7, p0 = LIM, w1	C			M I
304	(p6)	add	w1 = INCR, w1		C			M I
305	(p14)	br	L(cj4)
306	;;
307}{.mmi;		ld8	v1 = [vp], 8		C			M01
308		ld8	u1 = [up], 8		C			M01
309		mov	ar.lc = n		C			I0
310	;;
311}{.mmi;		ld8	v2 = [vp], 8		C			M01
312		cmp.CND	p8, p0 = w2, u2		C			M I
313		nop	0
314}{.mmi;		ld8	u2 = [up], 8		C			M01
315		nop	0
316		ADDSUB	w3 = u3, v3		C			M I
317	;;
318}{.mmi;		ld8	v3 = [vp], 8		C			M01
319		cmp.CND	p9, p0 = w3, u3		C			M I
320	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
321}{.mmb;		ld8	u3 = [up], 8		C			M01
322	(p7)	add	w2 = INCR, w2		C			M I
323		br	L(m4)
324}
325
326.Lc101:
C Feed-in for n mod 8 == 5 with carry-in.  w0 is the result for the first
C limb pair and p9 is the carry-in (r23 != 0).  ar.lc is written
C unconditionally here.  If n > 8 (p15) enter the main loop at L(c5);
C otherwise go to L(end).  Both targets start by applying p9 to w0, and
C L(end) also restores ar.lc from r2.
327 {.mmi;		ld8	v1 = [vp], 8		C			M01
328		ld8	u1 = [up], 8		C			M01
329		shr.u	n = n, 3		C			I0
330	;;
331}{.mmi;		ld8	v2 = [vp], 8		C			M01
332		ld8	u2 = [up], 8		C			M01
333		mov	ar.lc = n		C			I0
334	;;
335}{.mmi;		ld8	v3 = [vp], 8		C			M01
336		ld8	u3 = [up], 8		C			M01
337		ADDSUB	w0 = r10, r11		C			M I
338}{.mmi;		cmp.ne	p9, p0 = 0, r23		C			M I
339		add	rpx = 16, rp		C			M I
340		nop	0
341	;;
342}{.mmi;		ld8	v0 = [vp], 8		C			M01
343		ld8	u0 = [up], 8		C			M01
344		cmp.CND	p6, p0 = w0, r10	C			M I
345}{.mbb;		ADDSUB	w1 = u1, v1		C			M I
346	(p15)	br	L(c5)			C			B
347		br	L(end)			C			B
348}
349
350.Lc110:
C Feed-in for n mod 8 == 6 with carry-in.  w3 and w0 are the results for the
C first two limb pairs; p8 is the carry-in (r23 != 0) and is applied to w3.
C Sets ar.lc = n >> 3, recomputes upadv/vpadv and enters the main loop at
C L(m67) without testing p14/p15.
351 {.mmi;		ld8	v0 = [vp], 8		C			M01
352		ld8	u0 = [up], 8		C			M01
353		shr.u	n = n, 3		C			I0
354	;;
355}{.mmi;		add	upadv = PFDIST, up	C			M I
356		add	vpadv = PFDIST, vp	C			M I
357		mov	ar.lc = n		C			I0
358}{.mmi;		ld8	v1 = [vp], 8		C			M01
359		ld8	u1 = [up], 8		C			M01
360		ADDSUB	w3 = r10, r11		C			M I
361	;;
362}{.mmi;		ld8	v2 = [vp], 8		C			M01
363		ld8	u2 = [up], 8		C			M01
364		ADDSUB	w0 = u0, v0		C			M I
365}{.mmi;		cmp.CND	p9, p0 = w3, r10	C			M I
366		cmp.ne	p8, p0 = 0, r23		C			M I
367		add	rpx = 24, rp		C			M I
368	;;
369}{.mmi;		ld8	v3 = [vp], 8		C			M01
370		ld8	u3 = [up], 8		C			M01
371		nop	0
372}{.mmb;	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
373	(p8)	add	w3 = INCR, w3		C			M I
374		br	L(m67)			C			B
375}
376
377.Lc111:
C Feed-in for n mod 8 == 7 with carry-in.  On entry r10/r11 and r18/r19 hold
C the first two limb pairs and p13 is set when cy != 0.  w2 (first pair, with
C the carry-in applied under p13) is stored through rp here; w3 and w0
C follow, with p8 applied to w3.  Enters the main loop at L(m67).
378 {.mmi;		ld8	v0 = [vp], 8		C			M01
379		ld8	u0 = [up], 8		C			M01
380		shr.u	n = n, 3		C			I0
381	;;
382}{.mmi;		add	upadv = PFDIST, up	C			M I
383		ld8	v1 = [vp], 8		C			M01
384		mov	ar.lc = n		C			I0
385}{.mmi;		ld8	u1 = [up], 8		C			M01
386		ADDSUB	w2 = r10, r11		C			M I
387		nop	0
388	;;
389}{.mmi;		add	vpadv = PFDIST, vp	C			M I
390		ld8	v2 = [vp], 8		C			M01
391		cmp.CND	p8, p0 = w2, r10	C			M I
392}{.mmi;		ld8	u2 = [up], 8		C			M01
393		ADDSUB	w3 = r18, r19		C			M I
394		nop	0
395	;;
396}{.mmi;		ld8	v3 = [vp], 8		C			M01
397		cmp.CND	p9, p0 = w3, r18	C			M I
398	(p13)	cmpeqor	p8, p0 = LIM, w2	C			M I
399}{.mmi;		ld8	u3 = [up], 8		C			M01
400	(p13)	add	w2 = INCR, w2		C			M I
401		nop	0
402	;;
403}{.mmi;		add	rpx = 32, rp		C			M I
404		st8	[rp] = w2, 8		C			M23
405	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
406}{.mmb;	(p8)	add	w3 = INCR, w3		C			M I
407		ADDSUB	w0 = u0, v0		C			M I
408		br	L(m67)
409}
410EPILOGUE()
411
412PROLOGUE(func)
C func: limb-wise ADDSUB of {up,n} and {vp,n} into {rp,n} without a
C carry-in operand.
C
C Entry and dispatch mirror func_nc.  With HAVE_ABI_32 the three pointers
C first go through addp4 and n through zxt4.  Then the first limb pair is
C loaded into r10/r11, ar.lc is saved in r2, and:
C   r14       = n & 7, selecting one of the eight feed-in sequences below
C   p15 / p14 = n > 8 / n <= 8
C   n         = n - 6; feed-in sequences that enter the main loop use
C               n >> 3 of this value as the ar.lc loop count
C Residues 1..7 branch to .Lb001 ... .Lb111; residue 0 falls through into
C .Lb000.  The last bundle also loads a second limb pair into r18/r19,
C clears p13 (r0 != r0 never holds) and zeroes r23.  upadv/vpadv are set up
C by the individual feed-in sequences rather than here.
413	.prologue
414	.save	ar.lc, r2
415	.body
416ifdef(`HAVE_ABI_32',`
417		addp4	rp = 0, rp		C			M I
418		addp4	up = 0, up		C			M I
419		nop.i	0
420		addp4	vp = 0, vp		C			M I
421		nop.m	0
422		zxt4	n = n			C			I
423	;;
424')
425
426 {.mmi;		ld8	r11 = [vp], 8		C			M01
427		ld8	r10 = [up], 8		C			M01
428		mov	r2 = ar.lc		C			I0
429}{.mmi;		and	r14 = 7, n		C			M I
430		cmp.lt	p15, p14 = 8, n		C			M I
431		add	n = -6, n		C			M I
432	;;
433}{.mmi;		cmp.eq	p6, p0 = 1, r14		C			M I
434		cmp.eq	p7, p0 = 2, r14		C			M I
435		cmp.eq	p8, p0 = 3, r14		C			M I
436}{.bbb;	(p6)	br.dptk	.Lb001			C			B
437	(p7)	br.dptk	.Lb010			C			B
438	(p8)	br.dptk	.Lb011			C			B
439	;;
440}{.mmi;		cmp.eq	p9, p0 = 4, r14		C			M I
441		cmp.eq	p10, p0 = 5, r14	C			M I
442		cmp.eq	p11, p0 = 6, r14	C			M I
443}{.bbb;	(p9)	br.dptk	.Lb100			C			B
444	(p10)	br.dptk	.Lb101			C			B
445	(p11)	br.dptk	.Lb110			C			B
446	;;
447}{.mmi;		ld8	r19 = [vp], 8		C			M01
448		ld8	r18 = [up], 8		C			M01
449		cmp.ne	p13, p0 = r0, r0	C clear "CF"		M I
450}{.mmb;		cmp.eq	p12, p0 = 7, r14	C			M I
451		mov	r23 = 0			C			M I
452	(p12)	br.dptk	.Lb111			C			B
453	;;
454}
455
456.Lb000:
C Feed-in for n mod 8 == 0, no carry-in (fall-through case of the dispatch).
C On entry r10/r11 and r18/r19 hold the first two limb pairs.  Loads five
C more pairs, sets ar.lc = n >> 3, sets up upadv/vpadv, forms w1, w2, w3
C with carry predicates p7, p8, p9, applies p7 to w2 and joins the main loop
C at L(m0).
457 {.mmi;		ld8	v3 = [vp], 8		C			M01
458		ld8	u3 = [up], 8		C			M01
459		shr.u	n = n, 3		C			I0
460	;;
461}{.mmi;		ld8	v0 = [vp], 8		C			M01
462		ld8	u0 = [up], 8		C			M01
463		ADDSUB	w1 = r10, r11		C			M I
464	;;
465}{.mmi;		ld8	v1 = [vp], 8		C			M01
466		cmp.CND	p7, p0 = w1, r10	C			M I
467		mov	ar.lc = n		C			I0
468}{.mmi;		ld8	u1 = [up], 8		C			M01
469		ADDSUB	w2 = r18, r19		C			M I
470		add	rpx = 8, rp		C			M I
471	;;
472}{.mmi;		add	upadv = PFDIST, up
473		add	vpadv = PFDIST, vp
474		cmp.CND	p8, p0 = w2, r18	C			M I
475}{.mmi;		ld8	v2 = [vp], 8		C			M01
476		ld8	u2 = [up], 8		C			M01
477		ADDSUB	w3 = u3, v3		C			M I
478	;;
479}{.mmi;		ld8	v3 = [vp], 8		C			M01
480		cmp.CND	p9, p0 = w3, u3		C			M I
481	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
482}{.mmb;		ld8	u3 = [up], 8		C			M01
483	(p7)	add	w2 = INCR, w2		C			M I
484		br	L(m0)			C			B
485}
486
487	ALIGN(32)
488.Lb001:
C Feed-in for n mod 8 == 1, no carry-in.  w0 is the result for the first limb
C pair, p6 its carry predicate, and r8 is cleared.  If n <= 8 (p14) finish at
C L(cj1); otherwise load more limb pairs, set ar.lc = n >> 3, set up
C upadv/vpadv, form w1 and w2 and join the main loop at L(m1).
489 {.mmi;		ADDSUB	w0 = r10, r11		C			M I
490	(p15)	ld8	v1 = [vp], 8		C			M01
491		mov	r8 = 0			C			M I
492	;;
493}{.mmb;		cmp.CND	p6, p0 = w0, r10	C			M I
494	(p15)	ld8	u1 = [up], 8		C			M01
495	(p14)	br	L(cj1)			C			B
496	;;
497}{.mmi;		add	upadv = PFDIST, up
498		add	vpadv = PFDIST, vp
499		shr.u	n = n, 3		C			I0
500}{.mmi;		ld8	v2 = [vp], 8		C			M01
501		ld8	u2 = [up], 8		C			M01
502		cmp.CND	p6, p0 = w0, r10	C			M I
503	;;
504}{.mmi;		ld8	v3 = [vp], 8		C			M01
505		ld8	u3 = [up], 8		C			M01
506		mov	ar.lc = n		C			I0
507	;;
508}{.mmi;		ld8	v0 = [vp], 8		C			M01
509		ld8	u0 = [up], 8		C			M01
510		ADDSUB	w1 = u1, v1		C			M I
511	;;
512}{.mmi;		ld8	v1 = [vp], 8		C			M01
513		cmp.CND	p7, p0 = w1, u1		C			M I
514		ADDSUB	w2 = u2, v2		C			M I
515}{.mmb;		ld8	u1 = [up], 8		C			M01
516		add	rpx = 16, rp		C			M I
517		br	L(m1)			C			B
518}
519
520	ALIGN(32)
521.Lb010:
C Feed-in for n mod 8 == 2, no carry-in.  w3 is the result for the first limb
C pair.  If n > 8 (p15) continue at L(gt2); otherwise form p9, w0 and p6,
C clear r8 and finish at L(cj2).
522 {.mmi;		ld8	v0 = [vp], 8		C			M01
523		ld8	u0 = [up], 8		C			M01
524		shr.u	n = n, 3		C			I0
525}{.mmb;		ADDSUB	w3 = r10, r11		C			M I
526		nop	0
527	(p15)	br	L(gt2)			C			B
528	;;
529}{.mmi;		cmp.CND	p9, p0 = w3, r10	C			M I
530		ADDSUB	w0 = u0, v0		C			M I
531		mov	r8 = 0			C			M I
532	;;
533}{.mmb;		nop	0
534		cmp.CND	p6, p0 = w0, u0		C			M I
535		br	L(cj2)			C			B
536}
C n > 8: load three more limb pairs, set ar.lc and upadv/vpadv, form p9 and
C w0 and join the main loop at L(m23).
537L(gt2):
538 {.mmi;		ld8	v1 = [vp], 8		C			M01
539		ld8	u1 = [up], 8		C			M01
540		nop	0
541	;;
542}{.mmi;		add	upadv = PFDIST, up
543		add	vpadv = PFDIST, vp
544		mov	ar.lc = n		C			I0
545}{.mmi;		ld8	v2 = [vp], 8		C			M01
546		ld8	u2 = [up], 8		C			M01
547		nop	0
548	;;
549}{.mmi;		ld8	v3 = [vp], 8		C			M01
550		cmp.CND	p9, p0 = w3, r10	C			M I
551		ADDSUB	w0 = u0, v0		C			M I
552}{.mmb;		ld8	u3 = [up], 8		C			M01
553		add	rpx = 24, rp		C			M I
554		br	L(m23)			C			B
555}
556
557	ALIGN(32)
558.Lb011:
C Feed-in for n mod 8 == 3, no carry-in.  w2 is the result for the first limb
C pair.  If n > 8 (p15) continue at L(3); otherwise form p8 and w3 and finish
C at L(cj3).
559 {.mmi;		ld8	v3 = [vp], 8		C			M01
560		ld8	u3 = [up], 8		C			M01
561		ADDSUB	w2 = r10, r11		C			M I
562	;;
563}{.mmb;		ld8	v0 = [vp], 8		C			M01
564		ld8	u0 = [up], 8		C			M01
565	(p15)	br	L(3)			C			B
566}{.mmb;		cmp.CND	p8, p0 = w2, r10	C			M I
567		ADDSUB	w3 = u3, v3		C			M I
568		br	L(cj3)			C			B
569}
C n > 8: load more limb pairs, set ar.lc = n >> 3 and upadv/vpadv, apply p8
C to w3, store w2 through rp here and join the main loop at L(m23).
570L(3):
571 {.mmi;		ld8	v1 = [vp], 8		C			M01
572		ld8	u1 = [up], 8		C			M01
573		shr.u	n = n, 3		C			I0
574	;;
575}{.mmi;		add	upadv = PFDIST, up
576		add	vpadv = PFDIST, vp
577		ADDSUB	w3 = u3, v3		C			M I
578}{.mmi;		ld8	v2 = [vp], 8		C			M01
579		ld8	u2 = [up], 8		C			M01
580		cmp.CND	p8, p0 = w2, r10	C			M I
581	;;
582}{.mmi;		ld8	v3 = [vp], 8		C			M01
583		cmp.CND	p9, p0 = w3, u3		C			M I
584		mov	ar.lc = n		C			I0
585}{.mmi;		ld8	u3 = [up], 8		C			M01
586		nop	0
587		nop	0
588	;;
589}{.mmi;		add	rpx = 32, rp		C			M I
590		st8	[rp] = w2, 8		C			M23
591	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
592}{.mmb;	(p8)	add	w3 = INCR, w3		C			M I
593		ADDSUB	w0 = u0, v0		C			M I
594		br	L(m23)			C			B
595}
596
597	ALIGN(32)
598.Lb100:
C Feed-in for n mod 8 == 4, no carry-in.  w1 and w2 are the results for the
C first two limb pairs and p7 is the carry predicate of w1.  If n <= 8 (p14)
C finish at L(cj4); otherwise fall into L(gt4).
599 {.mmi;		ld8	v2 = [vp], 8		C			M01
600		ld8	u2 = [up], 8		C			M01
601		shr.u	n = n, 3		C			I0
602	;;
603}{.mmi;		ld8	v3 = [vp], 8		C			M01
604		ld8	u3 = [up], 8		C			M01
605		ADDSUB	w1 = r10, r11		C			M I
606	;;
607}{.mmi;		ld8	v0 = [vp], 8		C			M01
608		ld8	u0 = [up], 8		C			M01
609		cmp.CND	p7, p0 = w1, r10	C			M I
610}{.mmb;		nop	0
611		ADDSUB	w2 = u2, v2		C			M I
612	(p14)	br	L(cj4)			C			B
613	;;
614}
C n > 8: set ar.lc and upadv/vpadv, load more limb pairs, apply p7 to w2 and
C join the main loop at L(m4).
615L(gt4):
616 {.mmi;		add	upadv = PFDIST, up
617		add	vpadv = PFDIST, vp
618		mov	ar.lc = n		C			I0
619}{.mmi;		ld8	v1 = [vp], 8		C			M01
620		ld8	u1 = [up], 8		C			M01
621		nop	0
622	;;
623}{.mmi;		ld8	v2 = [vp], 8		C			M01
624		cmp.CND	p8, p0 = w2, u2		C			M I
625		nop	0
626}{.mmi;		ld8	u2 = [up], 8		C			M01
627		ADDSUB	w3 = u3, v3		C			M I
628		add	rpx = 8, rp		C			M I
629	;;
630}{.mmi;		ld8	v3 = [vp], 8		C			M01
631		cmp.CND	p9, p0 = w3, u3		C			M I
632	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
633}{.mmb;		ld8	u3 = [up], 8		C			M01
634	(p7)	add	w2 = INCR, w2		C			M I
635		br	L(m4)			C			B
636}
637
638	ALIGN(32)
639.Lb101:
C Feed-in for n mod 8 == 5, no carry-in.  w0 and w1 are the results for the
C first two limb pairs and p6 is the carry predicate of w0.  Also sets up
C upadv/vpadv and rpx.  If n <= 8 (p14) finish at L(cj5); otherwise fall
C into L(gt5).
640 {.mmi;		ld8	v1 = [vp], 8		C			M01
641		ld8	u1 = [up], 8		C			M01
642		shr.u	n = n, 3		C			I0
643	;;
644}{.mmi;		ld8	v2 = [vp], 8		C			M01
645		ld8	u2 = [up], 8		C			M01
646		ADDSUB	w0 = r10, r11		C			M I
647	;;
648}{.mmi;		add	upadv = PFDIST, up
649		add	vpadv = PFDIST, vp
650		add	rpx = 16, rp		C			M I
651}{.mmi;		ld8	v3 = [vp], 8		C			M01
652		ld8	u3 = [up], 8		C			M01
653		nop	0
654	;;
655}{.mmi;		ld8	v0 = [vp], 8		C			M01
656		cmp.CND	p6, p0 = w0, r10	C			M I
657		nop	0
658}{.mmb;		ld8	u0 = [up], 8		C			M01
659		ADDSUB	w1 = u1, v1		C			M I
660	(p14)	br	L(cj5)			C			B
661	;;
662}
C n > 8: set ar.lc, form p7 and w2 and join the main loop at L(m5).
663L(gt5):
664 {.mmi;		ld8	v1 = [vp], 8		C			M01
665		cmp.CND	p7, p0 = w1, u1		C			M I
666		mov	ar.lc = n		C			I0
667}{.mmb;		ld8	u1 = [up], 8		C			M01
668		ADDSUB	w2 = u2, v2		C			M I
669		br	L(m5)			C			B
670}
671
672	ALIGN(32)
673.Lb110:
C Feed-in for n mod 8 == 6, no carry-in.  w3 and w0 are the results for the
C first two limb pairs and p9 is the carry predicate of w3.  Sets ar.lc =
C n >> 3 and upadv/vpadv, then joins the main loop at L(m67) without testing
C p14/p15.
674 {.mmi;		ld8	v0 = [vp], 8		C			M01
675		ld8	u0 = [up], 8		C			M01
676		shr.u	n = n, 3		C			I0
677	;;
678}{.mmi;		ld8	v1 = [vp], 8		C			M01
679		ld8	u1 = [up], 8		C			M01
680		ADDSUB	w3 = r10, r11		C			M I
681	;;
682}{.mmi;		add	upadv = PFDIST, up
683		add	vpadv = PFDIST, vp
684		mov	ar.lc = n		C			I0
685}{.mmi;		ld8	v2 = [vp], 8		C			M01
686		ld8	u2 = [up], 8		C			M01
687		nop	0
688	;;
689}{.mmi;		ld8	v3 = [vp], 8		C			M01
690		cmp.CND	p9, p0 = w3, r10	C			M I
691		ADDSUB	w0 = u0, v0		C			M I
692}{.mmb;		ld8	u3 = [up], 8		C			M01
693		add	rpx = 24, rp		C			M I
694		br	L(m67)			C			B
695}
696
697	ALIGN(32)
698.Lb111:
C Feed-in for n mod 8 == 7, no carry-in.  On entry r10/r11 and r18/r19 hold
C the first two limb pairs.  w2 (first pair) is stored through rp here; w3
C and w0 follow, with p8 applied to w3.  Sets ar.lc = n >> 3 and
C upadv/vpadv, then joins the main loop at L(m67).
699 {.mmi;		ld8	v0 = [vp], 8		C			M01
700		ld8	u0 = [up], 8		C			M01
701		shr.u	n = n, 3		C			I0
702	;;
703}{.mmi;		ld8	v1 = [vp], 8		C			M01
704		ld8	u1 = [up], 8		C			M01
705		ADDSUB	w2 = r10, r11		C			M I
706	;;
707}{.mmi;		ld8	v2 = [vp], 8		C			M01
708		cmp.CND	p8, p0 = w2, r10	C			M I
709		mov	ar.lc = n		C			I0
710}{.mmi;		ld8	u2 = [up], 8		C			M01
711		ADDSUB	w3 = r18, r19		C			M I
712		nop	0
713	;;
714}{.mmi;		add	upadv = PFDIST, up
715		add	vpadv = PFDIST, vp
716		nop	0
717}{.mmi;		ld8	v3 = [vp], 8		C			M01
718		ld8	u3 = [up], 8		C			M01
719		cmp.CND	p9, p0 = w3, r18	C			M I
720	;;
721}{.mmi;		add	rpx = 32, rp		C			M I
722		st8	[rp] = w2, 8		C			M23
723	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
724}{.mmb;	(p8)	add	w3 = INCR, w3		C			M I
725		ADDSUB	w0 = u0, v0		C			M I
726		br	L(m67)			C			B
727}
728
729C *** MAIN LOOP START ***
C Each pass stores eight result limbs and consists of eight instruction
C groups.  Feed-in code enters at L(c5), L(m5), L(m4), L(m23), L(c1), L(m1),
C L(m0) or L(m67), depending on how many results it has already formed.
C Per limb: w = u ADDSUB v; cmp.CND sets the carry predicate for w; under the
C previous limb's carry predicate, cmpeqor ORs in "w == LIM" and INCR is
C added to w.  The carry predicates rotate p6 -> p7 -> p8 -> p9 -> p6 along
C with w0 .. w3.
C Results are stored through rp and rpx.  upadv and vpadv are each used by
C one lfetch per pass and advanced by 64 bytes.
C br.cloop at the bottom repeats the loop under control of ar.lc.
730	ALIGN(32)
731L(top):
732L(c5):		ld8	v1 = [vp], 8		C			M01
733		cmp.CND	p7, p0 = w1, u1		C			M I
734	(p9)	cmpeqor	p6, p0 = LIM, w0	C			M I
735		ld8	u1 = [up], 8		C			M01
736	(p9)	add	w0 = INCR, w0		C			M I
737		ADDSUB	w2 = u2, v2		C			M I
738	;;
739L(m5):		ld8	v2 = [vp], 8		C			M01
740		cmp.CND	p8, p0 = w2, u2		C			M I
741	(p6)	cmpeqor	p7, p0 = LIM, w1	C			M I
742		ld8	u2 = [up], 8		C			M01
743	(p6)	add	w1 = INCR, w1		C			M I
744		ADDSUB	w3 = u3, v3		C			M I
745	;;
746		st8	[rp] = w0, 8		C			M23
747		ld8	v3 = [vp], 8		C			M01
748		cmp.CND	p9, p0 = w3, u3		C			M I
749	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
750		ld8	u3 = [up], 8		C			M01
751	(p7)	add	w2 = INCR, w2		C			M I
752	;;
753L(m4):		st8	[rp] = w1, 16		C			M23
754		st8	[rpx] = w2, 32		C			M23
755	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
756		lfetch	[upadv], 64
757	(p8)	add	w3 = INCR, w3		C			M I
758		ADDSUB	w0 = u0, v0		C			M I
759	;;
760L(m23):		st8	[rp] = w3, 8		C			M23
761		ld8	v0 = [vp], 8		C			M01
762		cmp.CND	p6, p0 = w0, u0		C			M I
763		ld8	u0 = [up], 8		C			M01
764		ADDSUB	w1 = u1, v1		C			M I
765		nop.b	0
766	;;
767L(c1):		ld8	v1 = [vp], 8		C			M01
768		cmp.CND	p7, p0 = w1, u1		C			M I
769	(p9)	cmpeqor	p6, p0 = LIM, w0	C			M I
770		ld8	u1 = [up], 8		C			M01
771	(p9)	add	w0 = INCR, w0		C			M I
772		ADDSUB	w2 = u2, v2		C			M I
773	;;
774L(m1):		ld8	v2 = [vp], 8		C			M01
775		cmp.CND	p8, p0 = w2, u2		C			M I
776	(p6)	cmpeqor	p7, p0 = LIM, w1	C			M I
777		ld8	u2 = [up], 8		C			M01
778	(p6)	add	w1 = INCR, w1		C			M I
779		ADDSUB	w3 = u3, v3		C			M I
780	;;
781		st8	[rp] = w0, 8		C			M23
782		ld8	v3 = [vp], 8		C			M01
783		cmp.CND	p9, p0 = w3, u3		C			M I
784	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
785		ld8	u3 = [up], 8		C			M01
786	(p7)	add	w2 = INCR, w2		C			M I
787	;;
788L(m0):		st8	[rp] = w1, 16		C			M23
789		st8	[rpx] = w2, 32		C			M23
790	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
791		lfetch	[vpadv], 64
792	(p8)	add	w3 = INCR, w3		C			M I
793		ADDSUB	w0 = u0, v0		C			M I
794	;;
795L(m67):		st8	[rp] = w3, 8		C			M23
796		ld8	v0 = [vp], 8		C			M01
797		cmp.CND	p6, p0 = w0, u0		C			M I
798		ld8	u0 = [up], 8		C			M01
799		ADDSUB	w1 = u1, v1		C			M I
800		br.cloop.dptk	L(top)		C			B
801	;;
802C *** MAIN LOOP END ***
803
804L(end):
C Wind-down.  On falling out of the loop at L(end), the pending carry p9 is
C applied to w0 and ar.lc is restored from r2.  L(cj5) .. L(cj1) then finish
C the remaining limbs, one store per step, propagating the carry through
C p7, p8, p9 and p6.  Feed-in code for n <= 8 enters directly at the matching
C L(cjN) label.
C r8 is cleared in the L(cj3) part; code entering at L(cj2) or L(cj1) has
C cleared it beforehand.  At L(cj1) r8 is set to 1 when the final carry
C predicate p6 is set, and the routine returns through b0 (r8 is presumably
C the carry/borrow result seen by the caller).
805 {.mmi;	(p9)	cmpeqor	p6, p0 = LIM, w0	C			M I
806	(p9)	add	w0 = INCR, w0		C			M I
807		mov	ar.lc = r2		C			I0
808}
809L(cj5):
810 {.mmi;		cmp.CND	p7, p0 = w1, u1		C			M I
811		ADDSUB	w2 = u2, v2		C			M I
812		nop	0
813	;;
814}{.mmi;		st8	[rp] = w0, 8		C			M23
815	(p6)	cmpeqor	p7, p0 = LIM, w1	C			M I
816	(p6)	add	w1 = INCR, w1		C			M I
817}
818L(cj4):
819 {.mmi;		cmp.CND	p8, p0 = w2, u2		C			M I
820		ADDSUB	w3 = u3, v3		C			M I
821		nop	0
822	;;
823}{.mmi;		st8	[rp] = w1, 8		C			M23
824	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
825	(p7)	add	w2 = INCR, w2		C			M I
826}
827L(cj3):
828 {.mmi;		cmp.CND	p9, p0 = w3, u3		C			M I
829		ADDSUB	w0 = u0, v0		C			M I
830		nop	0
831	;;
832}{.mmi;		st8	[rp] = w2, 8		C			M23
833	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
834	(p8)	add	w3 = INCR, w3		C			M I
835}{.mmi;		cmp.CND	p6, p0 = w0, u0		C			M I
836		nop	0
837		mov	r8 = 0			C			M I
838	;;
839}
840L(cj2):
841 {.mmi;		st8	[rp] = w3, 8		C			M23
842	(p9)	cmpeqor	p6, p0 = LIM, w0	C			M I
843	(p9)	add	w0 = INCR, w0		C			M I
844	;;
845}
846L(cj1):
847 {.mmb;		st8	[rp] = w0, 8		C			M23
848	(p6)	mov	r8 = 1			C			M I
849		br.ret.sptk.many b0		C			B
850}
851EPILOGUE()
852ASM_END()
853