dnl  AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and
dnl  hamming distance.

dnl  Copyright 2000-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C        popcount  hamdist
C K6-2:    9.0       11.5   cycles/limb
C K6:      12.5      13.0


C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
C
C The code here isn't optimal, but it's already a 2x speedup over the plain
C integer mpn/generic/popcount.c,hamdist.c.
dnl  Build-time guard: this one source produces either function, chosen by
dnl  which of OPERATION_popcount / OPERATION_hamdist is defined.  With
dnl  neither defined, m4 reports an error and exits with status 1.

ifdef(`OPERATION_popcount',,
`ifdef(`OPERATION_hamdist',,
`m4_error(`Need OPERATION_popcount or OPERATION_hamdist
')m4exit(1)')')

dnl  HAM(text) expands to its argument only in the hamdist build, and to
dnl  nothing otherwise.

define(HAM,
m4_assert_numargs(1)
`ifdef(`OPERATION_hamdist',`$1')')

dnl  POP(text) expands to its argument only in the popcount build, and to
dnl  nothing otherwise.

define(POP,
m4_assert_numargs(1)
`ifdef(`OPERATION_popcount',`$1')')

dnl  Parameter frame offsets and the function name for whichever build is
dnl  active.  hamdist has one more pointer argument, so its size parameter
dnl  sits at offset 12 rather than 8.

HAM(`
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC2,   8)
defframe(PARAM_SRC,    4)
define(M4_function,mpn_hamdist)
')
POP(`
defframe(PARAM_SIZE,   8)
defframe(PARAM_SRC,    4)
define(M4_function,mpn_popcount)
')

MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)


dnl  When PIC is not defined the four 64-bit masks are kept in a read-only
dnl  table, each written as two identical 32-bit halves.  When PIC is defined
dnl  no table is emitted and the masks are built from immediates instead.

ifdef(`PIC',,`
        dnl  non-PIC

        RODATA
        ALIGN(8)

        C every odd-numbered bit
L(qmask_odd_bits):
        .long   0xAAAAAAAA
        .long   0xAAAAAAAA

        C low two bits of every nibble
L(qmask_low_pairs):
        .long   0x33333333
        .long   0x33333333

        C low nibble of every byte
L(qmask_low_nibbles):
        .long   0x0F0F0F0F
        .long   0x0F0F0F0F

        C low byte of every dword
L(qmask_low_byte):
        .long   0x000000FF
        .long   0x000000FF
')

        TEXT
        ALIGN(32)

C M4_function is mpn_popcount or mpn_hamdist, as selected above.
C
C Inputs are read through PARAM_SRC, PARAM_SIZE and, for hamdist only,
C PARAM_SRC2.  The data is consumed 64 bits per pass of the main loop, with
C size counting 32-bit units, since size is halved to get the pass count and
C an odd size adds one 32-bit load.  For hamdist each load from src is
C XORed with the matching load from src2 before the bits are counted.
C
C The result is the low dword of mm2, returned in eax.
C
C size is expected to be at least 1.  A size of 0 is not checked for and
C would enter the main loop with ecx equal to 0.
C
C Registers written: eax, ecx, edx, mm0, mm1, mm2 and mm4 to mm7.

POP(`ifdef(`PIC', `
        C one byte of padding so that shrl does not cross a 32-byte boundary
        nop')')

PROLOGUE(M4_function)
deflit(`FRAME',0)

        movl    PARAM_SIZE, %ecx        C ecx = size

ifdef(`PIC',`
        C PIC build.  Each mask is formed from a 32-bit immediate moved
        C through eax or edx into the low dword of an MMX register, after
        C which punpckldq copies that low dword into the high dword as well.
        movl    $0xAAAAAAAA, %eax
        movl    $0x33333333, %edx

        movd    %eax, %mm7
        movd    %edx, %mm6

        movl    $0x0F0F0F0F, %eax
        movl    $0x000000FF, %edx

        punpckldq %mm7, %mm7
        punpckldq %mm6, %mm6

        movd    %eax, %mm5
        movd    %edx, %mm4

        punpckldq %mm5, %mm5
        punpckldq %mm4, %mm4
',`
        C non-PIC build.  Load the four masks from the read-only table.
        movq    L(qmask_odd_bits), %mm7
        movq    L(qmask_low_pairs), %mm6
        movq    L(qmask_low_nibbles), %mm5
        movq    L(qmask_low_byte), %mm4
')

dnl  Symbolic names for the registers that hold the masks for the rest of
dnl  the function.

define(`MASK_ODD_BITS',    `%mm7')
define(`MASK_LOW_PAIRS',   `%mm6')
define(`MASK_LOW_NIBBLES', `%mm5')
define(`MASK_LOW_BYTE',    `%mm4')


        movl    PARAM_SRC, %eax         C eax = src
HAM(`   movl    PARAM_SRC2, %edx')

        pxor    %mm2, %mm2              C total = 0

        shrl    %ecx                    C ecx = size/2, carry set if size is odd
        jnc     L(next_qword)           C even size, whole qwords only

        C Odd size.  The last 32-bit unit lives at byte offset 8*ecx.  movd
        C puts it in the low dword of mm1 and clears the high dword, so the
        C shared counting code adds nothing for the missing half.
        C NOTE(review): Zdisp is defined outside this file.  Presumably it
        C keeps the explicit 0 displacement in the encoding for the sake of
        C code alignment.  Confirm against its definition.
Zdisp(  movd,   0,(%eax,%ecx,8), %mm1)

HAM(`
Zdisp(  movd,   0,(%edx,%ecx,8), %mm0)
        pxor    %mm0, %mm1
')

        incl    %ecx                    C the odd unit takes one loop pass too
        jmp     L(count_bits)           C data is already in mm1, skip the load


        ALIGN(16)
POP(`   nop     C padding so the loop avoids crossing 32-byte boundaries')

L(next_qword):
        C Register use in the loop
        C
        C eax   src
        C ebx   not used
        C ecx   qwords still to do, counting down
        C edx   src2, hamdist only
        C
        C mm0   scratch
        C mm1   scratch
        C mm2   total, only the low dword is meaningful
        C mm3   not used
        C mm4   MASK_LOW_BYTE
        C mm5   MASK_LOW_NIBBLES
        C mm6   MASK_LOW_PAIRS
        C mm7   MASK_ODD_BITS

        C Work from the highest qword down.  ecx runs from size/2 to 1, so
        C the -8 displacement makes the final pass read offset 0.
        movq    -8(%eax,%ecx,8), %mm1
HAM(`   pxor    -8(%edx,%ecx,8), %mm1')

L(count_bits):
        C mm1 = 64 bits to count
        movq    %mm1, %mm0
        pand    MASK_ODD_BITS, %mm1

        psrlq   $1, %mm1
HAM(`   nop                     C code alignment')

        psubd   %mm1, %mm0      C x - ((x and odd bits) >> 1), a count per bit pair
HAM(`   nop                     C code alignment')


        C Add neighbouring pair counts to get a count per nibble.
        movq    %mm0, %mm1
        psrlq   $2, %mm0

        pand    MASK_LOW_PAIRS, %mm0
        pand    MASK_LOW_PAIRS, %mm1

        paddd   %mm1, %mm0


        C Add neighbouring nibble counts to get a count per byte.
        movq    %mm0, %mm1
        psrlq   $4, %mm0

        pand    MASK_LOW_NIBBLES, %mm0
        pand    MASK_LOW_NIBBLES, %mm1

        paddd   %mm1, %mm0

        C Add each byte to the byte above it.  No mask is applied here, the
        C unwanted bytes are discarded by the final mask below.
        movq    %mm0, %mm1
        psrlq   $8, %mm0


        paddb   %mm1, %mm0


        C Add the word sums, leaving each dword count in its lowest byte.
        movq    %mm0, %mm1
        psrlq   $16, %mm0

        paddd   %mm1, %mm0

        pand    MASK_LOW_BYTE, %mm0     C keep only the lowest byte of each dword

        paddd   %mm0, %mm2              C total += count of the low dword
        psrlq   $32, %mm0

        paddd   %mm0, %mm2              C total += count of the high dword
        loop    L(next_qword)           C decrement ecx, repeat while nonzero



        movd    %mm2, %eax              C return value = low dword of the total
        C NOTE(review): emms_or_femms is defined outside this file.
        C Presumably it clears the MMX state before returning.  Confirm.
        emms_or_femms
        ret

EPILOGUE()