1dnl  IA-64 mpn_sec_tabselect.
2
3dnl  Copyright 2011 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C           cycles/limb
34C Itanium:       ?
35C Itanium 2:     2.5
36
37C NOTES
38C  * Using software pipelining could trivially yield 2 c/l without unrolling,
39C    or 1+epsilon with unrolling.  (This code was modelled after the powerpc64
40C    code, for simplicity.)
41
C mpn_sec_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
C
C Select one n-limb entry out of a table of nents consecutive n-limb entries
C starting at tp, and leave it at rp.
C
C The table is walked from the first entry to the last.  For every entry, each
C limb of rp is rewritten as (table limb AND mask) OR (rp limb ANDCM mask),
C where mask is all ones for the entry with index which and zero for all other
C entries.  Every entry is thus loaded and every rp limb stored on each pass,
C whatever the value of which.
C
C Input registers (stacked):
C   r32 rp, r33 tp, r34 n, r35 nents, r36 which
C Scratch registers used:
C   r2 (holds the incoming ar.lc), r8, r9, r10, r14-r19, p6-p10
C ar.lc is used for the inner loop and restored before returning.
C
C The code does not handle n = 0 or nents = 0: the inner count is computed as
C (n-2)>>1 unsigned, and the outer loop decrements nents until it equals 1.
define(`rp',     `r32')
define(`tp',     `r33')
define(`n',      `r34')
define(`nents',  `r35')
define(`which',  `r36')

define(`mask',   `r8')
define(`cnt',    `r10')

define(`rp1',     `r32')
define(`tp1',     `r33')
define(`rp2',     `r14')
define(`tp2',     `r15')

ASM_START()
PROLOGUE(mpn_sec_tabselect)
	.prologue
	.save	ar.lc, r2
	.body
ifdef(`HAVE_ABI_32',`
 {.mmi;	addp4	rp = 0, rp		C			M I
	addp4	tp = 0, tp		C			M I
	zxt4	n = n			C			I
}{.mii;	nop	0
	zxt4	nents = nents		C			I
	zxt4	which = which		C			I
	;;
}')
 {.mmi;	add	rp2 = 8, rp1		C rp2/tp2 address the odd limbs of a pair
	add	tp2 = 8, tp1
	mov	r2 = ar.lc		C keep incoming ar.lc, restored at return	I0
}{.mmi;	cmp.eq	p10, p0 = 1, n		C p10: n is exactly 1
	and	r9 = 1, n		C low bit of n
	add	cnt = -2, n
	;;
}{.mmi;	cmp.eq	p8, p0 = 0, r9		C p8: n is even
	sub	which = nents, which	C value nents will have at the selected entry
	shr.u	cnt = cnt, 1		C inner loop count
}{.mmi;	nop	0
	nop	0
	shl	n = n, 3		C n now holds the entry size in bytes
	;;
}
L(outer):
 {.mmi;	cmp.eq	p6, p7 = which, nents	C are we at the selected table entry?
	nop	0
	mov	ar.lc = cnt		C			I0
	;;
}{.mmb;
  (p6)	mov	mask = -1
  (p7)	mov	mask = 0
  (p8)	br.dptk	L(top)			C branch to loop entry if n even
	;;
}{.mmi;	ld8	r16 = [tp1], 8		C n odd: handle one limb on its own first
	add	tp2 = 8, tp2
	nop	0
	;;
}{.mmi;	ld8	r18 = [rp1]
	and	r16 = r16, mask
	nop	0
	;;
}{.mmi;	andcm	r18 = r18, mask
	;;
	or	r16 = r16, r18
	nop	0
	;;
}{.mmb;	st8	[rp1] = r16, 8
	add	rp2 = 8, rp2
  (p10)	br.dpnt	L(end)			C n = 1: no limb pairs to process
}
	ALIGN(32)
L(top):
 {.mmi;	ld8	r16 = [tp1], 16		C two limbs per iteration
	ld8	r17 = [tp2], 16
	nop	0
	;;
}{.mmi;	ld8	r18 = [rp1]
	and	r16 = r16, mask
	nop	0
}{.mmi;	ld8	r19 = [rp2]
	and	r17 = r17, mask
	nop	0
	;;
}{.mmi;	andcm	r18 = r18, mask
	andcm	r19 = r19, mask
	nop	0
	;;
}{.mmi;	or	r16 = r16, r18
	or	r17 = r17, r19
	nop	0
	;;
}{.mmb;	st8	[rp1] = r16, 16
	st8	[rp2] = r17, 16
	br.cloop.dptk	L(top)
	;;
}
L(end):
 {.mmi;	sub	rp1 = rp1, n		C move rp back to beginning
	sub	rp2 = rp2, n		C move rp back to beginning
	cmp.ne	p9, p0 = 1, nents	C more table entries left?
}{.mmb;	add	nents = -1, nents
	nop	0
  (p9)	br.dptk	L(outer)
	;;
}{.mib;	nop	0
	mov	ar.lc = r2		C restore the incoming ar.lc		I0
	br.ret.sptk.many b0
}
EPILOGUE()
149