dnl  AMD K7 mpn_sublsh1_n_ip1 -- rp[] = rp[] - (up[] << 1)

dnl  Copyright 2011 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C This is an attempt at a sublsh1_n for x86-32, not relying on sse2 insns.  The
C innerloop is 2*4-way unrolled, which is best we can do with the available
C registers.  It seems tricky to use the same structure for rsblsh1_n, since we
C cannot feed carry between operations there.

C			    cycles/limb
C P5
C P6 model 0-8,10-12
C P6 model 9  (Banias)
C P6 model 13 (Dothan)
C P4 model 0  (Willamette)
C P4 model 1  (?)
C P4 model 2  (Northwood)
C P4 model 3  (Prescott)
C P4 model 4  (Nocona)
C Intel Atom			 6.75
C AMD K6
C AMD K7
C AMD K8

C This is a basic sublsh1_n for k7, atom, and perhaps some other x86-32
C processors.  It uses 2*4-way unrolling, for good reasons.
C
C Breaking carry recurrency might be a good idea.  We would then need separate
C registers for the shift carry and add/subtract carry, which in turn would
C force us to 2*2-way unrolling.

C ----------------------------------------------------------------------------
C mpn_sublsh1_n_ip1 (dst, src, size)
C
C In-place operation on 32-bit limbs:  {dst,size} -= 2 * {src,size}
C
C Syntax:   AT&T operand order, run through m4 with the macros of config.m4.
C ABI:      x86-32, all three arguments on the stack (defframe offsets below).
C In:       PARAM_DST	destination limb pointer, limbs are read and written
C           PARAM_SRC	source limb pointer, limbs are only read
C           PARAM_SIZE	number of limbs
C Out:      eax = bit shifted out of the top source limb plus the borrow out
C           of the top subtraction, so a value in the range 0..2.
C Saved:    esi and edi are pushed, ebx and ebp are parked in parameter slots
C           (ebp only when the unrolled loop actually runs).
C Clobber:  ecx, edx, flags, and the contents of the three parameter slots.
C
C A size of zero skips both loops and returns 0 without touching memory.
C
C Register roles:
C   edi (rp)  destination pointer, advanced as limbs are stored
C   esi (up)  source pointer, advanced as limbs are loaded
C   eax       size, then the negated group count, then a data limb
C   ebx       size % 8 as feed-in loop counter, then a data limb
C   ecx, ebp  data limbs
C   edx       parking place for the carry that is not currently in CF
C
C Carry handling.  Each limb needs two incoming carries: the bit shifted out
C of the previous source limb (produced by adc reg,reg) and the borrow out of
C the previous subtraction (produced by sbb).  Only one of them can sit in CF,
C so the other one is parked in edx and the two are exchanged with
C
C	rcr %edx		CF goes to bit 31 of edx, bit 0 of edx goes to CF
C	adc %edx, %edx		bit 31 of edx goes to CF, CF goes to bit 0 of edx
C
C Both carries are subtracted at the same limb weight, so it does not matter
C which one enters the shift and which one enters the subtraction.  The main
C loop relies on that: the borrow left in CF by one sbb group is consumed as
C the shift-in bit of the next adc group, while the shift-out bit parked in edx
C becomes the borrow-in of the next sbb group.
C
C mov, lea, inc, dec and not leave CF unchanged, which is what keeps the carry
C alive across pointer updates and loop counters.  When CPU_P6 is defined both
C carry bits are instead packed into edx at the end of each pass and unpacked
C at the start of the next, so no carry is live in CF across inc/dec
C (presumably to avoid the partial flags stall on P6 -- not verified here).
C ----------------------------------------------------------------------------

defframe(PARAM_SIZE,	12)
defframe(PARAM_SRC,	8)
defframe(PARAM_DST,	4)

dnl  re-use parameter space
dnl  Each slot is overwritten only after its parameter has been loaded into a
dnl  register.
define(VAR_COUNT,`PARAM_SIZE')
define(SAVE_EBX,`PARAM_SRC')
define(SAVE_EBP,`PARAM_DST')

ASM_START()
	TEXT
	ALIGN(8)
PROLOGUE(mpn_sublsh1_n_ip1)
deflit(`FRAME',0)

define(`rp',	`%edi')
define(`up',	`%esi')

	mov	PARAM_SIZE, %eax	C size
	push	up			FRAME_pushl()
	push	rp			FRAME_pushl()
	xor	%edx, %edx		C no carry parked yet
	mov	PARAM_SRC, up
	mov	PARAM_DST, rp
	mov	%ebx, SAVE_EBX		C src slot is free once up is loaded
	mov	%eax, %ebx
	shr	$3, %eax

	not	%eax			C count = -(size\8)-1
	and	$7, %ebx		C size % 8, this also clears CF
	jz	L(exact)

C Feed-in loop: one limb per pass, size % 8 passes, so that the unrolled loop
C below only ever sees a multiple of 8 limbs.
L(oop):
ifdef(`CPU_P6',`
	shr	%edx ')			C restore 2nd saved carry bit
	mov	(up), %ecx
	adc	%ecx, %ecx		C 2*limb + carry in, CF = bit shifted out
	rcr	%edx			C restore 1st saved carry bit
	lea	4(up), up
	sbb	%ecx, (rp)
	lea	4(rp), rp
	adc	%edx, %edx		C save a carry bit in edx
ifdef(`CPU_P6',`
	adc	%edx, %edx ')		C save another carry bit in edx
	dec	%ebx
	jnz	L(oop)
L(exact):
	inc	%eax			C count = -(size\8)
	jz	L(end)			C fewer than 8 limbs in total, all done
	mov	%eax, VAR_COUNT		C eax is needed for data from here on
	mov	%ebp, SAVE_EBP		C dst slot is free once rp is loaded

C Main loop: 8 limbs per pass, handled as two groups of 4.  Each group first
C doubles 4 source limbs into eax, ebx, ecx, ebp and then subtracts them from
C the destination.
	ALIGN(16)
L(top):
ifdef(`CPU_P6',`
	shr	%edx ')			C restore 2nd saved carry bit
	mov	(up), %eax
	adc	%eax, %eax
	mov	4(up), %ebx
	adc	%ebx, %ebx
	mov	8(up), %ecx
	adc	%ecx, %ecx
	mov	12(up), %ebp
	adc	%ebp, %ebp

	rcr	%edx			C restore 1st saved carry bit

	sbb	%eax, (rp)
	sbb	%ebx, 4(rp)
	sbb	%ecx, 8(rp)
	sbb	%ebp, 12(rp)

C CF now holds the borrow of the first group and is used as shift-in bit of
C the second group.
	mov	16(up), %eax
	adc	%eax, %eax
	mov	20(up), %ebx
	adc	%ebx, %ebx
	mov	24(up), %ecx
	adc	%ecx, %ecx
	mov	28(up), %ebp
	adc	%ebp, %ebp

	lea	32(up), up
	adc	%edx, %edx		C save a carry bit in edx

	sbb	%eax, 16(rp)
	sbb	%ebx, 20(rp)
	sbb	%ecx, 24(rp)
	sbb	%ebp, 28(rp)

ifdef(`CPU_P6',`
	adc	%edx, %edx ')		C save another carry bit in edx
	incl	VAR_COUNT		C counts up towards zero, CF untouched
	lea	32(rp), rp
	jne	L(top)

	mov	SAVE_EBP, %ebp
L(end):
	mov	SAVE_EBX, %ebx

C Return value: sum of the two outstanding carries.  In the CPU_P6 variant
C both are packed in edx, otherwise one is in edx and the other still in CF.
ifdef(`CPU_P6',`
	xor	%eax, %eax
	shr	$1, %edx
	adc	%edx, %eax
',`
	adc	$0, %edx
	mov	%edx, %eax
')
	pop	rp			FRAME_popl()
	pop	up			FRAME_popl()
	ret
EPILOGUE()
ASM_END()