dnl  AMD K7 mpn_divexact_1 -- mpn by limb exact division.

dnl  Copyright 2001, 2002, 2004 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 2.1 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public
dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
dnl  not, write to the Free Software Foundation, Inc., 51 Franklin Street,
dnl  Fifth Floor, Boston, MA 02110-1301, USA.

include(`../config.m4')


C         cycles/limb
C Athlon:     11.0
C Hammer:      9.0


C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                      mp_limb_t divisor);
C
C The dependent chain is mul+imul+sub for 11 cycles and that speed is
C achieved with no special effort.  The load and shrld latencies are hidden
C by out of order execution.
C
C It's a touch faster on size==1 to use the mul-by-inverse than divl.
C Stack frame for mpn_divexact_1.
C
C PARAM_* name the four incoming arguments and SAVE_* / VAR_* name the six
C local slots inside the STACK_SPACE bytes reserved in the prologue.
C NOTE(review): defframe is defined outside this file; the numbers look like
C byte offsets from the stack pointer at entry (kept right through the FRAME
C value) -- confirm against the x86 m4 definitions.
C
C PARAM_DIVISOR is reused as a local: once the divisor has been loaded, the
C slot is overwritten with the divisor stripped of its trailing zero bits.

defframe(PARAM_DIVISOR,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

defframe(SAVE_EBX, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EDI, -12)
defframe(SAVE_EBP, -16)
defframe(VAR_INVERSE, -20)
defframe(VAR_DST_END, -24)

deflit(STACK_SPACE, 24)

	TEXT

	ALIGN(16)
PROLOGUE(mpn_divexact_1)
deflit(`FRAME',0)

	C Outline:
	C   1. Split the divisor into 2^shift * d with d odd.
	C   2. Build inv with d*inv == 1 mod 2^32: an 8 bit table lookup
	C      followed by two Newton steps.
	C   3. Working from the low limb upwards, each quotient limb is
	C      q = (shifted src limb - carry) * inv, and the carry for the
	C      next limb is the high half of q*d plus any borrow.

	movl	PARAM_DIVISOR, %eax
	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
	movl	$-1, %ecx		C shift count

	movl	%ebp, SAVE_EBP
	movl	PARAM_SIZE, %ebp

	movl	%esi, SAVE_ESI
	movl	%edi, SAVE_EDI

	C If there's usually only one or two trailing zero bits then this
	C should be faster than bsfl.
	C
	C Each pass shifts one bit out of eax into the carry flag and counts
	C it in ecx.  The loop ends on the first 1 bit shifted out, leaving
	C ecx = number of trailing zero bits and eax = divisor >> (ecx+1).
	C A zero divisor would never set carry, so this loop relies on the
	C caller passing a non-zero divisor.
L(strip_twos):
	incl	%ecx
	shrl	%eax
	jnc	L(strip_twos)

	movl	%ebx, SAVE_EBX
	leal	1(%eax,%eax), %ebx	C d without twos
	andl	$127, %eax		C d/2, 7 bits

	C eax is now a 7 bit index into modlimb_invert_table.  The PIC
	C variant finds the table through the GOT, using the helper at
	C L(movl_eip_edx) to obtain the current instruction address in edx.
ifdef(`PIC',`
	call	L(movl_eip_edx)

	addl	$_GLOBAL_OFFSET_TABLE_, %edx

	movl	modlimb_invert_table@GOT(%edx), %edx

	movzbl	(%eax,%edx), %eax			C inv 8 bits
',`
dnl non-PIC
	movzbl	modlimb_invert_table(%eax), %eax	C inv 8 bits
')

	C Two Newton steps inv = 2*inv - inv*inv*d follow.  Each step doubles
	C the number of correct low bits, 8 -> 16 -> 32.  The pointer setup
	C is interleaved between the multiplies.

	leal	(%eax,%eax), %edx	C 2*inv
	movl	%ebx, PARAM_DIVISOR	C d without twos

	imull	%eax, %eax		C inv*inv

	movl	PARAM_SRC, %esi
	movl	PARAM_DST, %edi

	imull	%ebx, %eax		C inv*inv*d

	subl	%eax, %edx		C inv = 2*inv - inv*inv*d
	leal	(%edx,%edx), %eax	C 2*inv

	imull	%edx, %edx		C inv*inv

	C esi and edi are moved to one past the last limb and ebp becomes a
	C negative index counting up towards zero, so that incl/jnz can
	C drive the loop without a separate compare.
	leal	(%esi,%ebp,4), %esi	C src end
	leal	(%edi,%ebp,4), %edi	C dst end
	negl	%ebp			C -size

	imull	%ebx, %edx		C inv*inv*d

	subl	%edx, %eax		C inv = 2*inv - inv*inv*d

	C Self-check of the computed inverse.  eax is preserved around it.
	C NOTE(review): ASSERT is defined outside this file; presumably it
	C only emits code in assertion-enabled builds -- confirm.
	ASSERT(e,`	C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
	pushl	%eax	FRAME_pushl()
	imull	PARAM_DIVISOR, %eax
	cmpl	$1, %eax
	popl	%eax	FRAME_popl()')

	movl	%eax, VAR_INVERSE
	movl	(%esi,%ebp,4), %eax	C src[0]

	incl	%ebp
	jz	L(one)			C size==1, no higher limb to shift in

	movl	(%esi,%ebp,4), %edx	C src[1]

	C Low limb of src >> shift, with the low bits of src[1] shifted in
	C at the top.
	shrdl(	%cl, %edx, %eax)

	C edi is used as a scratch register inside the loop, so the dst end
	C pointer is parked in the frame and reloaded on each iteration.
	movl	%edi, VAR_DST_END
	xorl	%ebx, %ebx		C no carry into the first limb
	jmp	L(entry)

	C Helper for the PIC table lookup above: copies the return address
	C at the top of the stack, which is the address of the instruction
	C following the call, into edx.
	C NOTE(review): ret_internal is a macro defined outside this file.
ifdef(`PIC',`
L(movl_eip_edx):
	movl	(%esp), %edx
	ret_internal
')

	ALIGN(8)
L(top):
	C eax	q
	C ebx	carry bit, 0 or 1
	C ecx	shift
	C edx
	C esi	src end
	C edi	dst end
	C ebp	counter, limbs, negative

	C High half of q*d becomes the carry limb to subtract from the next
	C source limb.
	mull	PARAM_DIVISOR		C carry limb in edx

	C Fetch the next source limb and the one above it, and form the
	C next limb of src >> shift.  edi is scratch here.
	movl	-4(%esi,%ebp,4), %eax
	movl	(%esi,%ebp,4), %edi

	shrdl(	%cl, %edi, %eax)

	C The borrows out of the two subtractions are collected in ebx.
	C setc writes only bl; the upper bits of ebx are zero from the xorl
	C before the loop.
	subl	%ebx, %eax		C apply carry bit
	setc	%bl
	movl	VAR_DST_END, %edi

	subl	%edx, %eax		C apply carry limb
	adcl	$0, %ebx

L(entry):
	C Multiplying by the inverse of d gives the quotient limb.
	imull	VAR_INVERSE, %eax

	movl	%eax, -4(%edi,%ebp,4)
	incl	%ebp
	jnz	L(top)


	C Final limb.  There is no higher source limb, so a plain shrl is
	C used and zeros are shifted in.  The saved registers are restored
	C in between the arithmetic instructions.

	mull	PARAM_DIVISOR		C carry limb in edx

	movl	-4(%esi), %eax		C src high limb
	shrl	%cl, %eax
	movl	SAVE_ESI, %esi

	subl	%ebx, %eax		C apply carry bit
	movl	SAVE_EBX, %ebx
	movl	SAVE_EBP, %ebp

	subl	%edx, %eax		C apply carry limb

	imull	VAR_INVERSE, %eax

	movl	%eax, -4(%edi)		C dst high limb
	movl	SAVE_EDI, %edi
	addl	$STACK_SPACE, %esp

	ret


	C size==1: eax holds src[0].  The quotient is (src[0] >> shift) * inv
	C and there is no carry to apply.
L(one):
	shrl	%cl, %eax
	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx

	imull	VAR_INVERSE, %eax

	movl	SAVE_EBP, %ebp
	movl	%eax, -4(%edi)		C dst[0]

	movl	SAVE_EDI, %edi
	addl	$STACK_SPACE, %esp

	ret

EPILOGUE()