1dnl  AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and
2dnl  hamming distance.
3
4dnl  Copyright 2000-2002 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34
35C        popcount  hamdist
36C K6-2:    9.0       11.5   cycles/limb
37C K6:      12.5      13.0
38
39
40C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
41C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
42C
43C The code here isn't optimal, but it's already a 2x speedup over the plain
44C integer mpn/generic/popcount.c,hamdist.c.
45
46
C Build-time guard: this file must be processed with OPERATION_popcount or
C OPERATION_hamdist defined.  If neither is defined, the error text below is
C passed to m4_error and m4 is stopped with exit status 1.
47ifdef(`OPERATION_popcount',,
48`ifdef(`OPERATION_hamdist',,
49`m4_error(`Need OPERATION_popcount or OPERATION_hamdist
50')m4exit(1)')')
51
C HAM(text) expands to text when OPERATION_hamdist is defined and to nothing
C otherwise.  It is used below to mark the lines only mpn_hamdist needs.
52define(HAM,
53m4_assert_numargs(1)
54`ifdef(`OPERATION_hamdist',`$1')')
55
C POP(text) expands to text when OPERATION_popcount is defined and to nothing
C otherwise.  It is used below to mark the lines only mpn_popcount needs.
56define(POP,
57m4_assert_numargs(1)
58`ifdef(`OPERATION_popcount',`$1')')
59
C Parameter slots and function name for the selected operation.  The PARAM_*
C names are used further down as memory operands to fetch the arguments, and
C M4_function is the name handed to PROLOGUE.
C NOTE(review): defframe is defined outside this file; the numbers are
C presumably byte offsets from the stack pointer at entry -- confirm.
60HAM(`
61defframe(PARAM_SIZE,   12)
62defframe(PARAM_SRC2,   8)
63defframe(PARAM_SRC,    4)
64define(M4_function,mpn_hamdist)
65')
66POP(`
67defframe(PARAM_SIZE,   8)
68defframe(PARAM_SRC,    4)
69define(M4_function,mpn_popcount)
70')
71
C NOTE(review): MULFUNC_PROLOGUE is defined outside this file; it is given the
C names of both functions that can be generated from this source.
72MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
73
74
C Non-PIC builds only: the four 64-bit masks used by the bit counting loop,
C each one a 32-bit pattern stored twice, in an 8-byte aligned table.  The
C function loads them into mm7, mm6, mm5 and mm4 with movq.  PIC builds leave
C this table out and build the same values from immediates instead.
75ifdef(`PIC',,`
76	dnl  non-PIC
77
78	RODATA
79	ALIGN(8)
80
	C selects the upper bit of every 2-bit field
81L(rodata_AAAAAAAAAAAAAAAA):
82	.long	0xAAAAAAAA
83	.long	0xAAAAAAAA
84
	C selects the low 2 bits of every 4-bit field
85L(rodata_3333333333333333):
86	.long	0x33333333
87	.long	0x33333333
88
	C selects the low 4 bits of every byte
89L(rodata_0F0F0F0F0F0F0F0F):
90	.long	0x0F0F0F0F
91	.long	0x0F0F0F0F
92
	C selects the low byte of each dword
93L(rodata_000000FF000000FF):
94	.long	0x000000FF
95	.long	0x000000FF
96')
97
98	TEXT
99	ALIGN(32)
100
C ---------------------------------------------------------------------------
C M4_function is mpn_popcount or mpn_hamdist, whichever was selected above.
C
C In:   PARAM_SRC   pointer to the limbs
C       PARAM_SRC2  [hamdist] pointer to the second operand
C       PARAM_SIZE  number of limbs
C Out:  eax = number of 1 bits in src (popcount), or in src XOR src2 (hamdist)
C Uses: eax, ecx, edx (PIC and/or hamdist), mm0-mm2, mm4-mm7, flags
C
C Two 4-byte limbs are handled at a time as one 64-bit MMX quantity, starting
C at the high end of the operand and working down.  ecx = size/2 is the qword
C count.  If size is odd, the top limb is fetched on its own with movd (which
C zeroes the upper half of the MMX register), ecx is bumped by one to account
C for it, and the loop is entered at L(loaded).
C
C Each qword is reduced with a parallel bit count: 1-bit fields are summed
C into 2-bit fields, then into 4-bit, 8-bit, 16-bit and 32-bit fields.  The
C two dword counts are then added to the running total in the low dword of
C mm2.
C
C NOTE(review): size is assumed to be at least 1.  With size == 0 the loop
C would be entered with ecx == 0, so it would read below src and the loop
C instruction would wrap ecx instead of stopping -- confirm callers never
C pass 0.
C ---------------------------------------------------------------------------
101POP(`ifdef(`PIC', `
102	C avoid shrl crossing a 32-byte boundary
103	nop')')
104
105PROLOGUE(M4_function)
106deflit(`FRAME',0)
107
108	movl	PARAM_SIZE, %ecx
109
110ifdef(`PIC',`
	C PIC: build each 64-bit mask from a 32-bit immediate, then copy the
	C low dword into the high dword with punpckldq
111	movl	$0xAAAAAAAA, %eax
112	movl	$0x33333333, %edx
113
114	movd	%eax, %mm7
115	movd	%edx, %mm6
116
117	movl	$0x0F0F0F0F, %eax
118	movl	$0x000000FF, %edx
119
120	punpckldq %mm7, %mm7
121	punpckldq %mm6, %mm6
122
123	movd	%eax, %mm5
124	movd	%edx, %mm4
125
126	punpckldq %mm5, %mm5
127	punpckldq %mm4, %mm4
128',`
129
	C non-PIC: load the masks from the read-only table
130	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
131	movq	L(rodata_3333333333333333), %mm6
132	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
133	movq	L(rodata_000000FF000000FF), %mm4
134')
135
C Names for the mask registers, so the loop reads the same in both variants.
136define(REG_AAAAAAAAAAAAAAAA, %mm7)
137define(REG_3333333333333333, %mm6)
138define(REG_0F0F0F0F0F0F0F0F, %mm5)
139define(REG_000000FF000000FF, %mm4)
140
141
142	movl	PARAM_SRC, %eax
143HAM(`	movl	PARAM_SRC2, %edx')
144
145	pxor	%mm2, %mm2	C total
146
	C ecx = size/2, carry = low bit of size.  No carry means an even
	C size, so go straight to the qword loop.
147	shrl	%ecx
148	jnc	L(top)
149
	C Odd size: the top limb sits at byte offset 8*ecx.
	C NOTE(review): Zdisp is defined outside this file; it presumably
	C emits the instruction with an explicit zero displacement.
150Zdisp(	movd,	0,(%eax,%ecx,8), %mm1)
151
152HAM(`
153Zdisp(	movd,	0,(%edx,%ecx,8), %mm0)
154	pxor	%mm0, %mm1
155')
156
	C count the half-filled qword as one loop iteration
157	incl	%ecx
158	jmp	L(loaded)
159
160
161	ALIGN(16)
162POP(`	nop	C alignment to avoid crossing 32-byte boundaries')
163
164L(top):
165	C eax	src
166	C ebx
167	C ecx	counter, qwords, decrementing
168	C edx	[hamdist] src2
169	C
170	C mm0	(scratch)
171	C mm1	(scratch)
172	C mm2	total (low dword)
173	C mm3
174	C mm4	\
175	C mm5	| special constants
176	C mm6	|
177	C mm7	/
178
	C next qword down: limbs 2*ecx-2 and 2*ecx-1
179	movq	-8(%eax,%ecx,8), %mm1
180HAM(`	pxor	-8(%edx,%ecx,8), %mm1')
181
182L(loaded):
	C mm0 = x - ((x & 0xAA..AA) >> 1), leaving in each 2-bit field the
	C number of 1 bits that field had
183	movq	%mm1, %mm0
184	pand	REG_AAAAAAAAAAAAAAAA, %mm1
185
186	psrlq	$1, %mm1
187HAM(`	nop			C code alignment')
188
189	psubd	%mm1, %mm0	C bit pairs
190HAM(`	nop			C code alignment')
191
192
	C add neighbouring 2-bit counts: ((x >> 2) & 0x33..33) + (x & 0x33..33)
193	movq	%mm0, %mm1
194	psrlq	$2, %mm0
195
196	pand	REG_3333333333333333, %mm0
197	pand	REG_3333333333333333, %mm1
198
199	paddd	%mm1, %mm0	C nibbles
200
201
	C add neighbouring 4-bit counts: ((x >> 4) & 0x0F..0F) + (x & 0x0F..0F)
202	movq	%mm0, %mm1
203	psrlq	$4, %mm0
204
205	pand	REG_0F0F0F0F0F0F0F0F, %mm0
206	pand	REG_0F0F0F0F0F0F0F0F, %mm1
207
208	paddd	%mm1, %mm0	C bytes
209
	C add each byte to the byte above it.  paddb keeps carries inside a
	C byte, and no mask is applied here because only the low byte of each
	C dword survives the final pand.
210	movq	%mm0, %mm1
211	psrlq	$8, %mm0
212
213
214	paddb	%mm1, %mm0	C words
215
216
	C after this add the low byte of each dword holds the bit count of
	C that whole dword; the other bytes hold partial sums
217	movq	%mm0, %mm1
218	psrlq	$16, %mm0
219
220	paddd	%mm1, %mm0	C dwords
221
222	pand	REG_000000FF000000FF, %mm0
223
	C add the low dword count to the total, then shift the high dword
	C count down and add that too.  The high dword of mm2 changes as well
	C but is never read.
224	paddd	%mm0, %mm2	C low to total
225	psrlq	$32, %mm0
226
227	paddd	%mm0, %mm2	C high to total
228	loop	L(top)
229
230
231
	C return the low dword of the total.
	C NOTE(review): emms_or_femms is defined outside this file; going by
	C its name it emits emms or femms to leave MMX state before returning.
232	movd	%mm2, %eax
233	emms_or_femms
234	ret
235
236EPILOGUE()
237