dnl  AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
dnl  mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.

dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 2.1 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public
dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
dnl  not, write to the Free Software Foundation, Inc., 51 Franklin Street,
dnl  Fifth Floor, Boston, MA 02110-1301, USA.

include(`../config.m4')

NAILS_SUPPORT(0-31)


C           alignment dst/src1/src2, A=0mod8, N=4mod8
C     A/A/A  A/A/N  A/N/A  A/N/N  N/A/A  N/A/N  N/N/A  N/N/N
C
C K6-2  1.2   1.5    1.5    1.2    1.2    1.5    1.5    1.2   and,andn,ior,xor
C K6-2  1.5   1.75   2.0    1.75   1.75   2.0    1.75   1.5   iorn,xnor
C K6-2  1.75  2.0    2.0    2.0    2.0    2.0    2.0    1.75  nand,nior
C
C K6    1.5   1.68   1.75   1.2    1.75   1.75   1.68   1.5   and,andn,ior,xor
C K6    2.0   2.0    2.25   2.25   2.25   2.25   2.0    2.0   iorn,xnor
C K6    2.0   2.25   2.25   2.25   2.25   2.25   2.25   2.0   nand,nior


dnl  M4_p and M4_i are the MMX and integer instructions
dnl  M4_*_neg_dst means whether to negate the final result before writing
dnl  M4_*_neg_src2 means whether to negate the src2 values before using them

dnl  NOTE(review): OPERATION_nand_n is defined unconditionally here, so this
dnl  file always instantiates mpn_nand_n.  GMP conventionally leaves the
dnl  OPERATION_* symbol to be supplied by the build system -- confirm this
dnl  hard-coded selection is intended.
define(`OPERATION_nand_n',1)

dnl  M4_choose_op(op, mmx-insn, mmx-neg-dst, mmx-neg-src2,
dnl                   int-insn, int-neg-dst, int-neg-src2)
dnl  defines the M4_* symbols above when OPERATION_<op> is defined.
define(M4_choose_op,
m4_assert_numargs(7)
`ifdef(`OPERATION_$1',`
define(`M4_function', `mpn_$1')
define(`M4_operation', `$1')
define(`M4_p',         `$2')
define(`M4_p_neg_dst', `$3')
define(`M4_p_neg_src2',`$4')
define(`M4_i',         `$5')
define(`M4_i_neg_dst', `$6')
define(`M4_i_neg_src2',`$7')
')')

dnl  xnor is done in "iorn" style because it's a touch faster than "nior"
dnl  style (the two are equivalent for xor).
dnl
dnl  pandn can't be used with nails.

M4_choose_op( and_n,  pand,0,0,  andl,0,0)
ifelse(GMP_NAIL_BITS,0,
`M4_choose_op(andn_n, pandn,0,0, andl,0,1)',
`M4_choose_op(andn_n, pand,0,1,  andl,0,1)')
M4_choose_op( nand_n, pand,1,0,  andl,1,0)
M4_choose_op( ior_n,  por,0,0,   orl,0,0)
M4_choose_op( iorn_n, por,0,1,   orl,0,1)
M4_choose_op( nior_n, por,1,0,   orl,1,0)
M4_choose_op( xor_n,  pxor,0,0,  xorl,0,0)
M4_choose_op( xnor_n, pxor,0,1,  xorl,0,1)

ifdef(`M4_function',,
`m4_error(`Unrecognised or undefined OPERATION symbol
')')

C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                   mp_size_t size);
C
C Do src1,size M4_operation src2,size, storing the result in dst,size.
C
C Unaligned movq loads and stores are a bit slower than aligned ones.  The
C test at the start of the routine checks the alignment of src1 and if
C necessary processes one limb separately at the low end to make it aligned.
C
C The raw speeds without this alignment switch are as follows.
C
C           alignment dst/src1/src2, A=0mod8, N=4mod8
C     A/A/A  A/A/N  A/N/A  A/N/N  N/A/A  N/A/N  N/N/A  N/N/N
C
C K6    1.5   2.0    1.5    2.0    and,andn,ior,xor
C K6    1.75  2.2    2.0    2.28   iorn,xnor
C K6    2.0   2.25   2.35   2.28   nand,nior
C
C
C Future:
C
C K6 can do one 64-bit load per cycle so each of these routines should be
C able to approach 1.0 c/l, if aligned.  The basic and/andn/ior/xor might be
C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
C The others are 4 instructions per 2 limbs, and so can only approach 1.0
C because there's nowhere to hide some loop control.

defframe(PARAM_SIZE,16)
defframe(PARAM_SRC2,12)
defframe(PARAM_SRC1,8)
defframe(PARAM_DST, 4)
deflit(`FRAME',0)

	TEXT
	ALIGN(32)
PROLOGUE(M4_function)
	movl	PARAM_SIZE, %ecx
	pushl	%ebx		FRAME_pushl()

	movl	PARAM_SRC1, %eax

	movl	PARAM_SRC2, %ebx
	cmpl	$1, %ecx

	movl	PARAM_DST, %edx
	ja	L(two_or_more)


	C size==1: do the single limb with the integer instruction and return
	movl	(%ebx), %ecx
	popl	%ebx
ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ecx)')
	M4_i	(%eax), %ecx
ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK( %ecx)')
	movl	%ecx, (%edx)

	ret


L(two_or_more):
	C eax	src1
	C ebx	src2
	C ecx	size
	C edx	dst
	C esi
	C edi
	C ebp

	pushl	%esi		FRAME_pushl()
	testl	$4, %eax
	jz	L(alignment_ok)

	C src1 is only 4mod8 aligned: handle one limb with integer ops so the
	C rest of src1 is 8-byte aligned for the movq loop below
	movl	(%ebx), %esi
	addl	$4, %ebx
ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %esi)')
	M4_i	(%eax), %esi
	addl	$4, %eax
ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK( %esi)')
	movl	%esi, (%edx)
	addl	$4, %edx
	decl	%ecx

L(alignment_ok):
	C esi keeps the (remaining) limb count; ecx becomes the quadword
	C count, with the carry flag holding the low bit (odd limb leftover)
	movl	%ecx, %esi
	shrl	%ecx
	jnz	L(still_two_or_more)

	C only one limb remained after the alignment fixup
	movl	(%ebx), %ecx
	popl	%esi
ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ecx)')
	M4_i	(%eax), %ecx
ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK( %ecx)')
	popl	%ebx
	movl	%ecx, (%edx)
	ret


L(still_two_or_more):
ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
	pcmpeqd	%mm7, %mm7		C all ones
ifelse(GMP_NAIL_BITS,0,,`psrld	$GMP_NAIL_BITS, %mm7')	C clear nails
')

	ALIGN(16)
L(top):
	C eax	src1
	C ebx	src2
	C ecx	counter
	C edx	dst
	C esi
	C edi
	C ebp
	C
	C carry bit is low of size

	movq	-8(%ebx,%ecx,8), %mm0
ifelse(M4_p_neg_src2,1,`pxor	%mm7, %mm0')
	M4_p	-8(%eax,%ecx,8), %mm0
ifelse(M4_p_neg_dst,1,`	pxor	%mm7, %mm0')
	movq	%mm0, -8(%edx,%ecx,8)

	C "loop" leaves the flags untouched, so the carry produced by the
	C "shrl" above survives the whole loop for the "jnc" that follows
	loop	L(top)


	jnc	L(no_extra)

	C size was odd: do the final limb with the integer instruction
	movl	-4(%ebx,%esi,4), %ebx
ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ebx)')
	M4_i	-4(%eax,%esi,4), %ebx
ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK( %ebx)')
	movl	%ebx, -4(%edx,%esi,4)
L(no_extra):

	popl	%esi
	popl	%ebx
	emms
	ret

EPILOGUE()