dnl  x86 mpn_gcd_1 optimised for AMD K7.

dnl  Contributed to the GNU project by Kevin Ryde.  Rehacked by Torbjorn
dnl  Granlund.

dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C                cycles/bit (approx)
C AMD K7          5.31
C AMD K8,K9       5.33
C AMD K10         5.30
C AMD bd1         ?
C AMD bobcat      7.02
C Intel P4-2     10.1
C Intel P4-3/4   10.0
C Intel P6/13     5.88
C Intel core2     6.26
C Intel NHM       6.83
C Intel SBR       8.50
C Intel atom      8.90
C VIA nano        ?
C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1

C TODO
C  * Tune overhead, this takes 2-3 cycles more than old code when v0 is tiny.
C  * Stream things better through registers, avoiding some copying.

C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
C MAXSHIFT is the number of low bits used to index ctz_table and MASK
C selects those bits.  Entry 0 of the table holds MAXSHIFT; entry i, for
C 1 <= i <= MASK, holds the trailing zero count of i.

deflit(MAXSHIFT, 6)
deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))

DEF_OBJECT(ctz_table,64)
        .byte   MAXSHIFT
forloop(i,1,MASK,
`       .byte   m4_count_trailing_zeros(i)
')
END_OBJECT(ctz_table)

C In the one-limb case the div reduction is skipped when
C v0 > (u0 >> DIV_THRES_LOG2), unsigned.  Original tuning guidance: the
C threshold should be about
C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
define(`DIV_THRES_LOG2', 7)


C Register names for the three arguments once loaded from the stack.
define(`up',    `%edi')
define(`n',     `%esi')
define(`v0',    `%edx')


C mpn_gcd_1 -- gcd of the n-limb number at up and the single limb v0.
C
C Prototype per the GMP manual (the types are not visible in this file):
C       mp_limb_t mpn_gcd_1 (const mp_limb_t *up, mp_size_t n, mp_limb_t v0)
C
C Calling convention as used below: the arguments are read from the stack,
C at 4(%esp), 8(%esp) and 12(%esp) on entry, which is 12, 16 and 20 after
C the two pushes.  The result is left in eax.  edi and esi are saved and
C restored; ecx and edx are clobbered; ebx is saved around its use when
C PIC_WITH_EBX is defined.
C
C v0 must be nonzero, since L(divide_strip_y) only exits once a 1 bit has
C been shifted out of v0.
C
C Outline:
C   1. Count the trailing zero bits of (up[0] | v0), the common twos.
C   2. Shift v0 right until it is odd.
C   3. Reduce U to one limb x: directly, by div, or by an external call.
C   4. Run a table driven binary gcd on x and the odd v0.
C   5. Shift the odd result left by the common twos.

ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_gcd_1)
        push    %edi
        push    %esi

        mov     12(%esp), up
        mov     16(%esp), n
        mov     20(%esp), v0

        mov     (up), %eax              C U low limb
        or      v0, %eax                C x | y
        mov     $-1, %ecx               C becomes 0 on the first inc

C Shift right one bit at a time until a 1 bit drops into the carry.  On
C exit ecx is the number of trailing zeros of (up[0] | v0).
L(twos):
        inc     %ecx
        shr     %eax
        jnc     L(twos)

        shr     %cl, v0                 C drop the common twos from v0
        mov     %ecx, %eax              C common twos

C Strip the remaining low zero bits of v0.  The loop shifts one bit too
C far, the adc then puts the 1 bit held in the carry back at bit 0.
L(divide_strip_y):
        shr     v0
        jnc     L(divide_strip_y)
        adc     v0, v0

        push    %eax                    C save common twos, popped at L(end)
        push    v0                      C save odd v0, popped at L(reduced)

        cmp     $1, n
        jnz     L(reduce_nby1)

C U is a single limb.  Compare v0 with u0 >> DIV_THRES_LOG2: if v0 is the
C larger (unsigned), use u0 as it is.  Otherwise replace u0 by the remainder
C of u0 / v0 obtained from div.
        mov     (up), %ecx
        mov     %ecx, %eax              C x = u0
        shr     $DIV_THRES_LOG2, %ecx
        cmp     %ecx, v0
        ja      L(reduced)

        mov     v0, %esi                C divisor, n is not needed any more
        xor     %edx, %edx              C high half of the dividend is zero
        div     %esi
        mov     %edx, %eax              C x = remainder
        jmp     L(reduced)

C n != 1.  Reduce U against the odd v0 with an external routine:
C mpn_modexact_1_odd when n is below BMOD_1_TO_MOD_1_THRESHOLD (signed
C compare), otherwise mpn_mod_1.  The three arguments are pushed last to
C first and removed again after the call.  The value left in eax by the
C callee is used as x.
C
C With PIC_WITH_EBX, ebx is saved and loaded with the caller return address
C plus _GLOBAL_OFFSET_TABLE_ before the call, presumably the GOT pointer
C that CALL needs in PIC builds.
L(reduce_nby1):
ifdef(`PIC_WITH_EBX',`
        push    %ebx
        call    L(movl_eip_to_ebx)
        add     $_GLOBAL_OFFSET_TABLE_, %ebx
')
        push    v0                      C param 3
        push    n                       C param 2
        push    up                      C param 1
        cmp     $BMOD_1_TO_MOD_1_THRESHOLD, n
        jl      L(bmod)
        CALL(   mpn_mod_1)
        jmp     L(called)
L(bmod):
        CALL(   mpn_modexact_1_odd)

L(called):
        add     $12, %esp               C deallocate params
ifdef(`PIC_WITH_EBX',`
        pop     %ebx
')
L(reduced):
        pop     %edx                    C y = odd v0 saved earlier

        LEA(    ctz_table, %esi)
        test    %eax, %eax
        mov     %eax, %ecx              C mov leaves the flags from test intact
        jnz     L(mid)
        jmp     L(end)                  C x == 0, the odd part of the gcd is y

C Binary gcd loop.  On arrival at L(mid):
C   eax = x, nonzero
C   edx = y, odd
C   ecx = x, or y-x when x-y was kept; both have the same low zero bits
C   esi = address of ctz_table
C Each pass strips the trailing zeros from x, then forms y-x in ecx and x-y
C in eax.  The flags of that last sub pick the next pair at L(top): on carry
C (x < y, unsigned) eax takes y-x and edx takes the old x held in edi,
C otherwise eax = x-y and edx = y stay.  The loop ends when x == y, leaving
C the odd part of the gcd in edx.
C movzbl does not modify the flags, so the jz tests the result of the and.
C
C The trailing numbers are the cycle annotations carried over from the
C original source; only one column of the header row is filled in.

        ALIGN(16)                       C               K8    BC    P4    NHM   SBR
L(top): cmovc(  %ecx, %eax)             C if x-y < 0    0
        cmovc(  %edi, %edx)             C use x,y-x     0
L(mid): and     $MASK, %ecx             C               0
        movzbl  (%esi,%ecx), %ecx       C               1
        jz      L(shift_alot)           C               1
        shr     %cl, %eax               C               3
        mov     %eax, %edi              C               4
        mov     %edx, %ecx              C               3
        sub     %eax, %ecx              C               4
        sub     %edx, %eax              C               4
        jnz     L(top)                  C               5

L(end): pop     %ecx                    C common twos
        mov     %edx, %eax
        shl     %cl, %eax               C result = y << common twos
        pop     %esi
        pop     %edi
        ret

C The low MAXSHIFT bits examined at L(mid) were all zero.  Shift x right by
C MAXSHIFT and look at the next group of bits.
L(shift_alot):
        shr     $MAXSHIFT, %eax
        mov     %eax, %ecx
        jmp     L(mid)

C Helper for PIC_WITH_EBX builds: returns its own return address in ebx.
ifdef(`PIC_WITH_EBX',`
L(movl_eip_to_ebx):
        mov     (%esp), %ebx
        ret
')
EPILOGUE()