1dnl Intel P6 mpn_modexact_1_odd -- exact division style remainder. 2 3dnl Rearranged from mpn/x86/p6/dive_1.asm by Marco Bodrato. 4 5dnl Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc. 6 7dnl This file is part of the GNU MP Library. 8dnl 9dnl The GNU MP Library is free software; you can redistribute it and/or modify 10dnl it under the terms of either: 11dnl 12dnl * the GNU Lesser General Public License as published by the Free 13dnl Software Foundation; either version 3 of the License, or (at your 14dnl option) any later version. 15dnl 16dnl or 17dnl 18dnl * the GNU General Public License as published by the Free Software 19dnl Foundation; either version 2 of the License, or (at your option) any 20dnl later version. 21dnl 22dnl or both in parallel, as here. 23dnl 24dnl The GNU MP Library is distributed in the hope that it will be useful, but 25dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 26dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 27dnl for more details. 28dnl 29dnl You should have received copies of the GNU General Public License and the 30dnl GNU Lesser General Public License along with the GNU MP Library. If not, 31dnl see https://www.gnu.org/licenses/. 32 33include(`../config.m4') 34 35 36C odd even divisor 37C P6: 10.0 12.0 cycles/limb 38 39C MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1) 40 41C The odd case is basically the same as mpn_modexact_1_odd, just with an 42C extra store, and it runs at the same 10 cycles which is the dependent 43C chain. 44C 45C The shifts for the even case aren't on the dependent chain so in principle 46C it could run the same too, but nothing running at 10 has been found. 47C Perhaps there's too many uops (an extra 4 over the odd case). 48 49defframe(PARAM_SHIFT, 24) 50defframe(PARAM_INVERSE,20) 51defframe(PARAM_DIVISOR,16) 52defframe(PARAM_SIZE, 12) 53defframe(PARAM_SRC, 8) 54defframe(PARAM_DST, 4) 55 56defframe(SAVE_EBX, -4) 57defframe(SAVE_ESI, -8) 58defframe(SAVE_EDI, -12) 59defframe(SAVE_EBP, -16) 60deflit(STACK_SPACE, 16) 61 62dnl re-use parameter space 63define(VAR_INVERSE,`PARAM_SRC') 64 65 TEXT 66 67C mp_limb_t 68C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor, 69C mp_limb_t inverse, int shift) 70 71 ALIGN(16) 72PROLOGUE(mpn_pi1_bdiv_q_1) 73deflit(`FRAME',0) 74 75 subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) 76 77 movl %esi, SAVE_ESI 78 movl PARAM_SRC, %esi 79 80 movl %ebx, SAVE_EBX 81 movl PARAM_SIZE, %ebx 82 83 movl %ebp, SAVE_EBP 84 movl PARAM_INVERSE, %ebp 85 86 movl PARAM_SHIFT, %ecx C trailing twos 87 88L(common): 89 movl %edi, SAVE_EDI 90 movl PARAM_DST, %edi 91 92 leal (%esi,%ebx,4), %esi C src end 93 94 leal (%edi,%ebx,4), %edi C dst end 95 negl %ebx C -size 96 97 movl (%esi,%ebx,4), %eax C src[0] 98 99 orl %ecx, %ecx 100 jz L(odd_entry) 101 102 movl %edi, PARAM_DST 103 movl %ebp, VAR_INVERSE 104 105L(even): 106 C eax src[0] 107 C ebx counter, limbs, negative 108 C ecx shift 109 C edx 110 C esi 111 C edi 112 C ebp 113 114 xorl %ebp, %ebp C initial carry bit 115 xorl %edx, %edx C initial carry limb (for size==1) 116 117 incl %ebx 118 jz L(even_one) 119 120 movl (%esi,%ebx,4), %edi C src[1] 121 122 shrdl( %cl, %edi, %eax) 123 124 jmp L(even_entry) 125 126 127L(even_top): 128 C eax scratch 129 C ebx counter, limbs, negative 130 C ecx shift 131 C edx scratch 132 C esi &src[size] 133 C edi &dst[size] and scratch 134 C ebp carry bit 135 136 movl (%esi,%ebx,4), %edi 137 138 mull PARAM_DIVISOR 139 140 movl -4(%esi,%ebx,4), %eax 141 shrdl( %cl, %edi, %eax) 142 143 subl %ebp, %eax 144 145 sbbl %ebp, %ebp 146 subl %edx, %eax 147 148 sbbl $0, %ebp 149 150L(even_entry): 151 imull VAR_INVERSE, %eax 152 153 movl PARAM_DST, %edi 154 negl %ebp 155 156 movl %eax, -4(%edi,%ebx,4) 157 incl %ebx 158 jnz L(even_top) 159 160 mull PARAM_DIVISOR 161 162 movl -4(%esi), %eax 163 164L(even_one): 165 shrl %cl, %eax 166 movl SAVE_ESI, %esi 167 168 subl %ebp, %eax 169 movl SAVE_EBP, %ebp 170 171 subl %edx, %eax 172 movl SAVE_EBX, %ebx 173 174 imull VAR_INVERSE, %eax 175 176 movl %eax, -4(%edi) 177 movl SAVE_EDI, %edi 178 addl $STACK_SPACE, %esp 179 180 ret 181 182C The dependent chain here is 183C 184C subl %edx, %eax 1 185C imull %ebp, %eax 4 186C mull PARAM_DIVISOR 5 187C ---- 188C total 10 189C 190C and this is the measured speed. No special scheduling is necessary, out 191C of order execution hides the load latency. 192 193L(odd_top): 194 C eax scratch (src limb) 195 C ebx counter, limbs, negative 196 C ecx carry bit 197 C edx carry limb, high of last product 198 C esi &src[size] 199 C edi &dst[size] 200 C ebp inverse 201 202 mull PARAM_DIVISOR 203 204 movl (%esi,%ebx,4), %eax 205 subl %ecx, %eax 206 207 sbbl %ecx, %ecx 208 subl %edx, %eax 209 210 sbbl $0, %ecx 211 212L(odd_entry): 213 imull %ebp, %eax 214 215 movl %eax, (%edi,%ebx,4) 216 negl %ecx 217 218 incl %ebx 219 jnz L(odd_top) 220 221 222 movl SAVE_ESI, %esi 223 224 movl SAVE_EDI, %edi 225 226 movl SAVE_EBP, %ebp 227 228 movl SAVE_EBX, %ebx 229 addl $STACK_SPACE, %esp 230 231 ret 232 233EPILOGUE() 234 235C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, 236C mp_limb_t divisor); 237C 238 239 ALIGN(16) 240PROLOGUE(mpn_bdiv_q_1) 241deflit(`FRAME',0) 242 243 movl PARAM_DIVISOR, %eax 244 subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) 245 246 movl %esi, SAVE_ESI 247 movl PARAM_SRC, %esi 248 249 movl %ebx, SAVE_EBX 250 movl PARAM_SIZE, %ebx 251 252 bsfl %eax, %ecx C trailing twos 253 254 movl %ebp, SAVE_EBP 255 256 shrl %cl, %eax C d without twos 257 258 movl %eax, %edx 259 shrl %eax C d/2 without twos 260 261 movl %edx, PARAM_DIVISOR 262 andl $127, %eax 263 264ifdef(`PIC',` 265 LEA( binvert_limb_table, %ebp) 266 movzbl (%eax,%ebp), %ebp C inv 8 bits 267',` 268 movzbl binvert_limb_table(%eax), %ebp C inv 8 bits 269') 270 271 leal (%ebp,%ebp), %eax C 2*inv 272 273 imull %ebp, %ebp C inv*inv 274 imull %edx, %ebp C inv*inv*d 275 276 subl %ebp, %eax C inv = 2*inv - inv*inv*d 277 leal (%eax,%eax), %ebp C 2*inv 278 279 imull %eax, %eax C inv*inv 280 imull %edx, %eax C inv*inv*d 281 282 subl %eax, %ebp C inv = 2*inv - inv*inv*d 283 284 jmp L(common) 285 286EPILOGUE() 287ASM_END() 288