dnl  x86-32 mpn_mod_1_1p, requiring cmov.

dnl  Contributed to the GNU project by Niels Möller and Torbjorn Granlund.

dnl  Copyright 2010, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C			    cycles/limb
C P5				 ?
C P6 model 0-8,10-12		 ?
C P6 model 9  (Banias)		 ?
C P6 model 13 (Dothan)		 ?
C P4 model 0  (Willamette)	 ?
C P4 model 1  (?)		 ?
C P4 model 2  (Northwood)	 ?
C P4 model 3  (Prescott)	 ?
C P4 model 4  (Nocona)		 ?
C AMD K6			 ?
C AMD K7			 7
C AMD K8			 ?
C Register roles inside mpn_mod_1_1p.
C   B2mb   B2modb - b, set up just before the main loop is entered
C   r0     low limb of the running residue
C   r2     carry mask from sbb (0 or -1), then the next middle limb
C   t0     scratch
C   ap     source pointer, walked downwards; %cl is reused as shift count
C          once all limbs have been read

define(`B2mb',   `%ebx')
define(`r0',     `%esi')
define(`r2',     `%ebp')
define(`t0',     `%edi')
define(`ap',     `%ecx')		C Also shift count

C Stack frame
C	pre	36(%esp)
C	b	32(%esp)
C	n	28(%esp)
C	ap	24(%esp)
C	return	20(%esp)
C	%ebp	16(%esp)
C	%edi	12(%esp)
C	%esi	8(%esp)
C	%ebx	4(%esp)
C	B2mod	(%esp)

define(`B2modb', `(%esp)')
define(`n',      `28(%esp)')
define(`b',      `32(%esp)')
define(`pre',    `36(%esp)')

C mp_limb_t
C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t pre[4])
C
C The pre array contains bi, cnt, B1modb, B2modb
C Note: This implementation needs B1modb only when cnt > 0
C
C All arguments are read from the stack; the result is returned in %eax.
C %ebx, %esi, %edi and %ebp are pushed on entry and popped before ret.
C
C Outline:
C   1. Fold limbs from the most significant end, multiplying the top limb
C      by B2modb each step, keeping a residue of two limbs plus a carry.
C   2. Remove the carry by subtracting b from the high limb.
C   3. If cnt is nonzero, fold once more with B1modb and shift left by cnt.
C   4. Divide the two limb residue by b using the inverse bi, then shift
C      the remainder right by cnt.
C
C NOTE(review): the n < 3 path loads ap[n-1] and ap[n-2], so n >= 2 appears
C to be a precondition.  TODO confirm against callers.
C NOTE(review): b is used directly together with bi and the result is
C shifted right by cnt, so b is presumably passed already shifted left by
C cnt.  TODO confirm against callers.

ASM_START()
	TEXT
	ALIGN(8)
PROLOGUE(mpn_mod_1_1p)
	push	%ebp
	push	%edi
	push	%esi
	push	%ebx
	mov	32(%esp), %ebp		C pre[]

	mov	12(%ebp), %eax		C B2modb
	push	%eax			C Put it on stack

	C From here on the offsets in the stack frame table above apply.
	mov	n, %edx
	mov	24(%esp), ap

	lea	(ap, %edx, 4), ap	C ap = one past the top limb
	mov	-4(ap), %eax		C most significant limb
	cmp	$3, %edx
	jnc	L(first)		C n >= 3 (unsigned)
	mov	-8(ap), r0		C short input: residue is ap[n-1], ap[n-2]
	jmp	L(reduce_two)

L(first):
	C First iteration, no r2
	mull	B2modb			C edx:eax = top limb * B2modb
	mov	-12(ap), r0
	add	%eax, r0
	mov	-8(ap), %eax
	adc	%edx, %eax
	sbb	r2, r2			C r2 = -carry
	subl	$3, n			C three limbs consumed, sets ZF for jz
	lea	-16(ap), ap		C next limb to read; lea leaves flags alone
	jz	L(reduce_three)

	mov	B2modb, B2mb
	sub	b, B2mb			C B2mb = B2modb - b
	lea	(B2mb, r0), t0		C t0 = r0 + B2modb - b, used by cmovc
	jmp	L(mid)

	ALIGN(16)
L(top):	C Loopmixed to 7 c/l on k7
	add	%eax, r0		C add low half of the product
	lea	(B2mb, r0), t0		C candidate for the next wrap correction
	mov	r2, %eax
	adc	%edx, %eax		C new high limb, with carry from the add
	sbb	r2, r2			C r2 = -carry
L(mid):	mull	B2modb			C edx:eax = high limb * B2modb
	and	B2modb, r2		C r2 = carry ? B2modb : 0
	add	r0, r2			C fold the carry into the old low limb
	decl	n			C sets ZF for jnz; dec leaves CF alone
	mov	(ap), r0		C fetch next limb
	cmovc(	t0, r2)			C if the add wrapped, use r0 + B2modb - b
	lea	-4(ap), ap
	jnz	L(top)

	C Loop wind down, same steps as the head of L(top).
	add	%eax, r0
	mov	r2, %eax
	adc	%edx, %eax
	sbb	r2, r2

L(reduce_three):
	C Eliminate r2
	and	b, r2			C r2 = carry ? b : 0
	sub	r2, %eax

L(reduce_two):
	C Residue is now the two limbs %eax (high) and r0 (low).
	mov	pre, %ebp
	movb	4(%ebp), %cl		C cnt; overwrites the low byte of ap
	test	%cl, %cl
	jz	L(normalized)

	C Unnormalized, use B1modb to reduce to size < B b
	mull	8(%ebp)			C edx:eax = high limb * B1modb
	xor	t0, t0
	add	%eax, r0
	adc	%edx, t0
	mov	t0, %eax

	C Left-shift to normalize
	shld	%cl, r0, %eax		C Always use shld?

	shl	%cl, r0
	jmp	L(udiv)

L(normalized):
	C cnt is zero: one conditional subtraction of b from the high limb.
	mov	%eax, t0
	sub	b, t0
	cmovnc(	t0, %eax)		C keep the difference if there was no borrow

L(udiv):
	C Remainder of the two limb value by b, using the inverse bi.
	lea	1(%eax), t0		C high limb + 1
	mull	(%ebp)			C edx:eax = high limb * bi
	mov	b, %ebx			C Needed in register for lea
	add	r0, %eax
	adc	t0, %edx		C quotient estimate
	imul	%ebx, %edx
	sub	%edx, r0		C candidate remainder
	cmp	r0, %eax		C CF set when the candidate exceeds %eax
	lea	(%ebx, r0), %eax	C candidate + b
	cmovnc(	r0, %eax)		C otherwise take the candidate itself
	cmp	%ebx, %eax
	jnc	L(fix)			C remainder >= b, subtract once more
L(ok):	shr	%cl, %eax		C undo the shift by cnt

	add	$4, %esp		C drop the B2modb stack slot
	pop	%ebx
	pop	%esi
	pop	%edi
	pop	%ebp

	ret
L(fix):	sub	%ebx, %eax
	jmp	L(ok)
EPILOGUE()

C mpn_mod_1_1p_cps
C
C Fills the four word table consumed by mpn_mod_1_1p as pre[].
C
C Stack arguments on entry: 4(%esp) is the table pointer, 8(%esp) is the
C divisor.  They are loaded below as 16(%esp) and 12(%esp) because of the
C intervening pushes.
C
C Stores, in order: bi, cnt, B1modb, B2modb.
C   cnt     bsr of the divisor xor 31, that is its leading zero count
C   bi      quotient of the two limb value (not d):(all ones) by d, where
C           d is the divisor shifted left by cnt
C   B1modb  (-d * ((1 << cnt) | (bi >> (32 - cnt)))) >> cnt, low 32 bits
C   B2modb  -d * bi, low 32 bits
C
C NOTE(review): bsr leaves its destination undefined for a zero source, so
C a nonzero divisor is assumed.  TODO confirm against callers.
C NOTE(review): %ebx is saved and restored but no instruction in this
C routine writes it.

PROLOGUE(mpn_mod_1_1p_cps)
	push	%ebp
	mov	12(%esp), %ebp		C divisor
	push	%esi
	bsr	%ebp, %ecx		C index of the highest set bit
	push	%ebx
	xor	$31, %ecx		C cnt = 31 - index
	mov	16(%esp), %esi		C table pointer
	sal	%cl, %ebp		C d = divisor << cnt
	mov	%ebp, %edx
	not	%edx			C high dividend limb = not d
	mov	$-1, %eax		C low dividend limb = all ones
	div	%ebp		C On K7, invert_limb would be a few cycles faster.
	mov	%eax, (%esi)		C store bi
	mov	%ecx, 4(%esi)		C store cnt
	neg	%ebp			C -d
	mov	$1, %edx
	shld	%cl, %eax, %edx		C (1 << cnt) | (bi >> (32 - cnt)); stays 1 if cnt is 0
	imul	%ebp, %edx
	shr	%cl, %edx
	imul	%ebp, %eax
	mov	%edx, 8(%esi)		C store B1modb
	mov	%eax, 12(%esi)		C store B2modb
	pop	%ebx
	pop	%esi
	pop	%ebp
	ret
EPILOGUE()