1d25e02daSmrgdnl Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division. 2d25e02daSmrg 3*f81b1c5bSmrgdnl Rearranged from mpn/x86/pentium4/sse2/dive_1.asm by Marco Bodrato. 4*f81b1c5bSmrg 5d25e02daSmrgdnl Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc. 6*f81b1c5bSmrg 7d25e02daSmrgdnl This file is part of the GNU MP Library. 8d25e02daSmrgdnl 9*f81b1c5bSmrgdnl The GNU MP Library is free software; you can redistribute it and/or modify 10*f81b1c5bSmrgdnl it under the terms of either: 11d25e02daSmrgdnl 12*f81b1c5bSmrgdnl * the GNU Lesser General Public License as published by the Free 13*f81b1c5bSmrgdnl Software Foundation; either version 3 of the License, or (at your 14*f81b1c5bSmrgdnl option) any later version. 15d25e02daSmrgdnl 16*f81b1c5bSmrgdnl or 17d25e02daSmrgdnl 18*f81b1c5bSmrgdnl * the GNU General Public License as published by the Free Software 19*f81b1c5bSmrgdnl Foundation; either version 2 of the License, or (at your option) any 20*f81b1c5bSmrgdnl later version. 21*f81b1c5bSmrgdnl 22*f81b1c5bSmrgdnl or both in parallel, as here. 23*f81b1c5bSmrgdnl 24*f81b1c5bSmrgdnl The GNU MP Library is distributed in the hope that it will be useful, but 25*f81b1c5bSmrgdnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 26*f81b1c5bSmrgdnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 27*f81b1c5bSmrgdnl for more details. 28*f81b1c5bSmrgdnl 29*f81b1c5bSmrgdnl You should have received copies of the GNU General Public License and the 30*f81b1c5bSmrgdnl GNU Lesser General Public License along with the GNU MP Library. If not, 31*f81b1c5bSmrgdnl see https://www.gnu.org/licenses/. 32d25e02daSmrg 33d25e02daSmrginclude(`../config.m4') 34d25e02daSmrg 35d25e02daSmrg 36d25e02daSmrgC P4: 19.0 cycles/limb 37d25e02daSmrg 38d25e02daSmrgC Pairs of movd's are used to avoid unaligned loads. Despite the loads not 39d25e02daSmrgC being on the dependent chain and there being plenty of cycles available, 40d25e02daSmrgC using an unaligned movq on every second iteration measured about 23 c/l. 41d25e02daSmrgC 42d25e02daSmrg 43d25e02daSmrgdefframe(PARAM_SHIFT, 24) 44d25e02daSmrgdefframe(PARAM_INVERSE,20) 45d25e02daSmrgdefframe(PARAM_DIVISOR,16) 46d25e02daSmrgdefframe(PARAM_SIZE, 12) 47d25e02daSmrgdefframe(PARAM_SRC, 8) 48d25e02daSmrgdefframe(PARAM_DST, 4) 49d25e02daSmrg 50d25e02daSmrg TEXT 51d25e02daSmrg 52d25e02daSmrgC mp_limb_t 53d25e02daSmrgC mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor, 54d25e02daSmrgC mp_limb_t inverse, int shift) 55d25e02daSmrg ALIGN(32) 56d25e02daSmrgPROLOGUE(mpn_pi1_bdiv_q_1) 57d25e02daSmrgdeflit(`FRAME',0) 58d25e02daSmrg 59d25e02daSmrg movl PARAM_SIZE, %edx 60d25e02daSmrg 61d25e02daSmrg movl PARAM_SRC, %eax 62d25e02daSmrg 63d25e02daSmrg movl PARAM_DIVISOR, %ecx 64d25e02daSmrg 65d25e02daSmrg movd %ecx, %mm6 66d25e02daSmrg movl PARAM_SHIFT, %ecx 67d25e02daSmrg 68d25e02daSmrg movd %ecx, %mm7 C shift 69d25e02daSmrg 70d25e02daSmrg C 71d25e02daSmrg 72d25e02daSmrg movl PARAM_INVERSE, %ecx 73d25e02daSmrg movd %ecx, %mm5 C inv 74d25e02daSmrg 75d25e02daSmrg movl PARAM_DST, %ecx 76d25e02daSmrg pxor %mm1, %mm1 C initial carry limb 77d25e02daSmrg pxor %mm0, %mm0 C initial carry bit 78d25e02daSmrg 79d25e02daSmrg subl $1, %edx 80d25e02daSmrg jz L(done) 81d25e02daSmrg 82d25e02daSmrg pcmpeqd %mm4, %mm4 83d25e02daSmrg psrlq $32, %mm4 C 0x00000000FFFFFFFF 84d25e02daSmrg 85d25e02daSmrgC The dependent chain here is as follows. 86d25e02daSmrgC 87d25e02daSmrgC latency 88d25e02daSmrgC psubq s = (src-cbit) - climb 2 89d25e02daSmrgC pmuludq q = s*inverse 8 90d25e02daSmrgC pmuludq prod = q*divisor 8 91d25e02daSmrgC psrlq climb = high(prod) 2 92d25e02daSmrgC -- 93d25e02daSmrgC 20 94d25e02daSmrgC 95d25e02daSmrgC Yet the loop measures 19.0 c/l, so obviously there's something gained 96d25e02daSmrgC there over a straight reading of the chip documentation. 97d25e02daSmrg 98d25e02daSmrgL(top): 99d25e02daSmrg C eax src, incrementing 100d25e02daSmrg C ebx 101d25e02daSmrg C ecx dst, incrementing 102d25e02daSmrg C edx counter, size-1 iterations 103d25e02daSmrg C 104d25e02daSmrg C mm0 carry bit 105d25e02daSmrg C mm1 carry limb 106d25e02daSmrg C mm4 0x00000000FFFFFFFF 107d25e02daSmrg C mm5 inverse 108d25e02daSmrg C mm6 divisor 109d25e02daSmrg C mm7 shift 110d25e02daSmrg 111d25e02daSmrg movd (%eax), %mm2 112d25e02daSmrg movd 4(%eax), %mm3 113d25e02daSmrg addl $4, %eax 114d25e02daSmrg punpckldq %mm3, %mm2 115d25e02daSmrg 116d25e02daSmrg psrlq %mm7, %mm2 117d25e02daSmrg pand %mm4, %mm2 C src 118d25e02daSmrg psubq %mm0, %mm2 C src - cbit 119d25e02daSmrg 120d25e02daSmrg psubq %mm1, %mm2 C src - cbit - climb 121d25e02daSmrg movq %mm2, %mm0 122d25e02daSmrg psrlq $63, %mm0 C new cbit 123d25e02daSmrg 124d25e02daSmrg pmuludq %mm5, %mm2 C s*inverse 125d25e02daSmrg movd %mm2, (%ecx) C q 126d25e02daSmrg addl $4, %ecx 127d25e02daSmrg 128d25e02daSmrg movq %mm6, %mm1 129d25e02daSmrg pmuludq %mm2, %mm1 C q*divisor 130d25e02daSmrg psrlq $32, %mm1 C new climb 131d25e02daSmrg 132d25e02daSmrgL(entry): 133d25e02daSmrg subl $1, %edx 134d25e02daSmrg jnz L(top) 135d25e02daSmrg 136d25e02daSmrgL(done): 137d25e02daSmrg movd (%eax), %mm2 138d25e02daSmrg psrlq %mm7, %mm2 C src 139d25e02daSmrg psubq %mm0, %mm2 C src - cbit 140d25e02daSmrg 141d25e02daSmrg psubq %mm1, %mm2 C src - cbit - climb 142d25e02daSmrg 143d25e02daSmrg pmuludq %mm5, %mm2 C s*inverse 144d25e02daSmrg movd %mm2, (%ecx) C q 145d25e02daSmrg 146d25e02daSmrg emms 147d25e02daSmrg ret 148d25e02daSmrg 149d25e02daSmrgEPILOGUE() 150d25e02daSmrg 151d25e02daSmrg ALIGN(16) 152d25e02daSmrgC mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, 153d25e02daSmrgC mp_limb_t divisor); 154d25e02daSmrgC 155d25e02daSmrgPROLOGUE(mpn_bdiv_q_1) 156d25e02daSmrgdeflit(`FRAME',0) 157d25e02daSmrg 158d25e02daSmrg movl PARAM_SIZE, %edx 159d25e02daSmrg 160d25e02daSmrg movl PARAM_DIVISOR, %ecx 161d25e02daSmrg 162d25e02daSmrg C eax src 163d25e02daSmrg C ebx 164d25e02daSmrg C ecx divisor 165d25e02daSmrg C edx size-1 166d25e02daSmrg 167d25e02daSmrg movl %ecx, %eax 168d25e02daSmrg bsfl %ecx, %ecx C trailing twos 169d25e02daSmrg 170d25e02daSmrg shrl %cl, %eax C d = divisor without twos 171d25e02daSmrg movd %eax, %mm6 172d25e02daSmrg movd %ecx, %mm7 C shift 173d25e02daSmrg 174d25e02daSmrg shrl %eax C d/2 175d25e02daSmrg 176d25e02daSmrg andl $127, %eax C d/2, 7 bits 177d25e02daSmrg 178d25e02daSmrgifdef(`PIC',` 179d25e02daSmrg LEA( binvert_limb_table, %ecx) 180d25e02daSmrg movzbl (%eax,%ecx), %eax C inv 8 bits 181d25e02daSmrg',` 182d25e02daSmrg movzbl binvert_limb_table(%eax), %eax C inv 8 bits 183d25e02daSmrg') 184d25e02daSmrg 185d25e02daSmrg C 186d25e02daSmrg 187d25e02daSmrg movd %eax, %mm5 C inv 188d25e02daSmrg 189d25e02daSmrg movd %eax, %mm0 C inv 190d25e02daSmrg 191d25e02daSmrg pmuludq %mm5, %mm5 C inv*inv 192d25e02daSmrg 193d25e02daSmrg C 194d25e02daSmrg 195d25e02daSmrg pmuludq %mm6, %mm5 C inv*inv*d 196d25e02daSmrg paddd %mm0, %mm0 C 2*inv 197d25e02daSmrg 198d25e02daSmrg C 199d25e02daSmrg 200d25e02daSmrg psubd %mm5, %mm0 C inv = 2*inv - inv*inv*d 201d25e02daSmrg pxor %mm5, %mm5 202d25e02daSmrg 203d25e02daSmrg paddd %mm0, %mm5 204d25e02daSmrg pmuludq %mm0, %mm0 C inv*inv 205d25e02daSmrg 206d25e02daSmrg pcmpeqd %mm4, %mm4 207d25e02daSmrg psrlq $32, %mm4 C 0x00000000FFFFFFFF 208d25e02daSmrg 209d25e02daSmrg C 210d25e02daSmrg 211d25e02daSmrg pmuludq %mm6, %mm0 C inv*inv*d 212d25e02daSmrg paddd %mm5, %mm5 C 2*inv 213d25e02daSmrg 214d25e02daSmrg movl PARAM_SRC, %eax 215d25e02daSmrg movl PARAM_DST, %ecx 216d25e02daSmrg pxor %mm1, %mm1 C initial carry limb 217d25e02daSmrg 218d25e02daSmrg C 219d25e02daSmrg 220d25e02daSmrg psubd %mm0, %mm5 C inv = 2*inv - inv*inv*d 221d25e02daSmrg 222d25e02daSmrg ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS 223d25e02daSmrg pushl %eax FRAME_pushl() 224d25e02daSmrg movq %mm6, %mm0 225d25e02daSmrg pmuludq %mm5, %mm0 226d25e02daSmrg movd %mm0, %eax 227d25e02daSmrg cmpl $1, %eax 228d25e02daSmrg popl %eax FRAME_popl()') 229d25e02daSmrg 230d25e02daSmrg pxor %mm0, %mm0 C initial carry bit 231d25e02daSmrg jmp L(entry) 232d25e02daSmrg 233d25e02daSmrgEPILOGUE() 234*f81b1c5bSmrgASM_END() 235