dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_popcount.

dnl  Copyright 2006, 2010 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                   cycles/limb
C 7400,7410 (G4):       ?
C 744x,745x (G4+):      1.125
C 970 (G5):             2.25

C TODO
C  * Rewrite the awkward huge n outer loop code.
C  * Two lvx, two vperm, and two vxor could make us a similar hamdist.
C  * Compress cnsts table in 64-bit mode, only half the values are needed.

C Limb geometry, derived from GMP_LIMB_BITS.  One vector register (VR) is 16
C bytes and the main loop consumes two of them (32 bytes) per iteration, so
C these are the number of limbs in one VR and in a pair of VRs.
define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))

C Defined empty; nothing else in this file tests it.
define(`OPERATION_popcount')

C Incoming arguments: limb pointer in r3, limb count in r4.
define(`ap',	`r3')
define(`n',	`r4')

C Vector constants that stay live for the whole function: the nibble
C popcount lookup table and the per-byte shift count of 4.
define(`rtab',	`v10')
define(`cnt4',	`v11')

C LIMB32(insn) emits insn (preceded by a tab) only when GMP_LIMB_BITS is 32,
C LIMB64(insn) only otherwise.  The one that does not apply expands to nothing.
ifelse(GMP_LIMB_BITS,32,`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')

C The inner loop handles up to 2^34 bits, i.e., 2^28 64-bit limbs, due to
C overflow in vsum4ubs.  For large operands, we work in chunks, of size
C LIMBS_PER_CHUNK.
C
C Both constants below are only ever used as lis/addis immediates, so they
C are the upper 16 bits of the real values: the chunk is 0x1000 << 16 = 2^28
C limbs and the threshold for chunking is 0x1001 << 16 limbs.
define(`LIMBS_PER_CHUNK', 0x1000)
define(`LIMBS_CHUNK_THRES', 0x1001)

C mpn_popcount -- count the set bits in the n limbs starting at ap.
C
C In:	r3 = ap (limb pointer), r4 = n (limb count)
C Out:	r3 = number of set bits
C
C Method: each byte is split into its two nibbles, both nibbles are looked up
C in a 16-entry popcount table with vperm, the two results are added per byte
C and vsum4ubs folds the bytes into four 32-bit counters per accumulator.
C Data is always fetched with aligned 16-byte lvx loads; bytes outside the
C operand at either end are cleared with masks taken from the cnsts table.
C
C Register use:
C	r0	scratch for VRSAVE, then the chunk threshold (64-bit limbs)
C	r6	byte offset into the mask tables
C	r7	scratch: mask table base, loop count, partial sums
C	r8	limb offset of ap within its vector, then the grand total
C	r9	limbs left after the current chunk (64-bit limbs)
C	r10	VRSAVE value on entry, restored on exit
C	r11	address of cnsts
C	r12	the constant 16, offset of the second vector of a pair
C	cr7	set while more chunks remain (64-bit limbs)
C	v0,v1	the two data vectors being counted
C	v2-v9	temporaries
C	v10,v11	rtab and cnt4
C	v12,v13	running word-wise counts
ASM_START()
PROLOGUE(mpn_popcount,toc)
	mfspr	r10, 256		C keep the incoming VRSAVE for the exit path
	oris	r0, r10, 0xfffc		C Set VRSAVE bit 0-13
	mtspr	256, r0

ifdef(`HAVE_ABI_mode32',
`	rldicl	n, n, 0, 32')		C zero extend n

C Load various constants into vector registers
	LEAL(	r11, cnsts)
	li	r12, 16
	vspltisb cnt4, 4		C 0x0404...04 used as shift count

	lvx	rtab, 0, r11		C nibble popcount table, first cnsts entry

C With 64-bit limbs, remember in cr7 whether n is above the chunk threshold.
LIMB64(`lis	r0, LIMBS_CHUNK_THRES	')
LIMB64(`cmpd	cr7, n, r0		')

C Fetch the aligned vector that contains the first limb and clear the bytes
C that precede ap.  r6 = 4 * (ap mod 16) picks one of the low end masks.
	lvx	v0, 0, ap
	addi	r7, r11, 80		C base of the low end masks
	rlwinm	r6, ap, 2,26,29
	lvx	v8, r7, r6
	vand	v0, v0, v8

C r8 = number of limbs by which ap lies past the start of its vector.
LIMB32(`rlwinm	r8, ap, 30,30,31	')
LIMB64(`rlwinm	r8, ap, 29,31,31	')
	add	n, n, r8		C compensate n for rounded down `ap'

	vxor	v1, v1, v1
	li	r8, 0			C grand total count

	vxor	v12, v12, v12		C zero total count
	vxor	v13, v13, v13		C zero total count

C Operand ends inside the first vector: count v0 alone, v1 stays zero.
	addic.	n, n, -LIMBS_PER_VR
	ble	L(sum)

C Operand ends inside the second vector: count the first two vectors.
	addic.	n, n, -LIMBS_PER_VR
	ble	L(lsum)

C For 64-bit machines, handle huge n that would overflow vsum4ubs
LIMB64(`ble	cr7, L(small)		')
LIMB64(`addis	r9, n, -LIMBS_PER_CHUNK	') C remaining n
LIMB64(`lis	n, LIMBS_PER_CHUNK	')

C One iteration per LIMBS_PER_2VR limbs, plus one for the vector pair that
C starts with the already loaded and masked v0.
	ALIGN(16)
L(small):
LIMB32(`srwi	r7, n, 3	')	C loop count corresponding to n
LIMB64(`srdi	r7, n, 2	')	C loop count corresponding to n
	addi	r7, r7, 1
	mtctr	r7			C copy n to count register
	b	L(ent)

C Main loop: 32 bytes per iteration.
	ALIGN(16)
L(top):
	lvx	v0, 0, ap
L(ent):	lvx	v1, r12, ap
	addi	ap, ap, 32
	vsrb	v8, v0, cnt4		C high nibble of every byte of v0
	vsrb	v9, v1, cnt4		C high nibble of every byte of v1
	vperm	v2, rtab, rtab, v0	C table lookup on the low nibbles
	vperm	v3, rtab, rtab, v8	C table lookup on the high nibbles
	vperm	v4, rtab, rtab, v1
	vperm	v5, rtab, rtab, v9
	vaddubm	v6, v2, v3		C bits set in each byte of v0
	vaddubm	v7, v4, v5		C bits set in each byte of v1
	vsum4ubs v12, v6, v12		C add groups of 4 bytes into the word counts
	vsum4ubs v13, v7, v13
	bdnz	L(top)

C n mod LIMBS_PER_2VR limbs are still to be counted.
	andi.	n, n, eval(LIMBS_PER_2VR-1)
	beq	L(rt)

	lvx	v0, 0, ap
	vxor	v1, v1, v1
	cmpwi	n, LIMBS_PER_VR
	ble	L(sum)			C at most one vector left
C Two vectors left: the first is complete, the second gets masked.
L(lsum):
	vor	v1, v0, v0
	lvx	v0, r12, ap
C Clear the bytes of v0 that lie beyond the operand, using the high end mask
C selected by n mod LIMBS_PER_VR, then count v0 and v1 as in the main loop.
L(sum):
LIMB32(`rlwinm	r6, n, 4,26,27	')
LIMB64(`rlwinm	r6, n, 5,26,26	')
	addi	r7, r11, 16		C base of the high end masks
	lvx	v8, r7, r6
	vand	v0, v0, v8
	vsrb	v8, v0, cnt4
	vsrb	v9, v1, cnt4
	vperm	v2, rtab, rtab, v0
	vperm	v3, rtab, rtab, v8
	vperm	v4, rtab, rtab, v1
	vperm	v5, rtab, rtab, v9
	vaddubm	v6, v2, v3
	vaddubm	v7, v4, v5
	vsum4ubs v12, v6, v12
	vsum4ubs v13, v7, v13

C Combine the two accumulators, spill the four word counts to the 16 bytes
C just below the stack pointer and add them into the grand total in r8.
	ALIGN(16)
L(rt):	vadduwm	v3, v12, v13
	li	r7, -16			C FIXME: does all ppc32 and ppc64 ABIs
	stvx	v3, r7, r1		C FIXME: ...support storing below sp?

	lwz	r7, -16(r1)
	add	r8, r8, r7
	lwz	r7, -12(r1)
	add	r8, r8, r7
	lwz	r7, -8(r1)
	add	r8, r8, r7
	lwz	r7, -4(r1)
	add	r8, r8, r7

C Handle outer loop for huge n.  We inherit cr7 and r0 from above.
LIMB64(`ble	cr7, L(ret)
	vxor	v12, v12, v12		C zero total count
	vxor	v13, v13, v13		C zero total count
	mr	n, r9
	cmpd	cr7, n, r0
	ble	cr7, L(2)
	addis	r9, n, -LIMBS_PER_CHUNK	C remaining n
	lis	n, LIMBS_PER_CHUNK
L(2):	srdi	r7, n, 2		C loop count corresponding to n
	mtctr	r7			C copy n to count register
	b	L(top)
')

	ALIGN(16)
L(ret):	mr	r3, r8
	mtspr	256, r10
	blr
EPILOGUE()

C Constant table read with lvx relative to its start address:
C   +0   number of set bits in each 4-bit value, the vperm lookup table
C   +16  four masks for the high end, 16 bytes each
C   +80  four masks for the low end, 16 bytes each
C The 16 passed to DEF_OBJECT is presumably the alignment that lvx needs.
DEF_OBJECT(cnsts,16)
C Counts for vperm
	.byte	0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
	.byte	0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
C Masks for high end of number
C Keep all 16 bytes
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

C Keep the first 4 bytes
	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

C Keep the first 8 bytes
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

C Keep the first 12 bytes
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C Masks for low end of number
C Keep all 16 bytes
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

C Clear the first 4 bytes
	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

C Clear the first 8 bytes
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

C Clear the first 12 bytes
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
END_OBJECT(cnsts)
ASM_END()
