dnl  X86_64 mpn_divrem_hensel_qr_1_2

dnl  Copyright 2009 Jason Moxham

dnl  This file is part of the MPIR Library.

dnl  The MPIR Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 2.1 of the License, or (at
dnl  your option) any later version.

dnl  The MPIR Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the MPIR Library; see the file COPYING.LIB.  If not, write
dnl  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
dnl  Boston, MA 02110-1301, USA.

include(`../config.m4')

C	(rdi,rdx) = (rsi,rdx) / rcx, with rdx >= 1
C	rax = Hensel remainder from the division
C
C	This is divrem_hensel_1 with shifting on the output of the quotient.
C	On K8/K10 the shifting comes for free, so there is no need for a
C	different fn for that.
C And on K8/K10 this runs at 10c/l which is optimal.
C This function "replaces" divexact_1 and modexact_1_odd.
C This is the same as the shifting version but with no shifting.
C
C Register roles (SysV AMD64):
C   rdi = quotient limb vector, rsi = source limb vector
C   rdx = limb count n (>= 1), rcx = divisor d
C   r9  = negative running index (starts at 1-n, steps by 2)
C   r10 = running borrow flag, kept as 0 or -1 via sbb %r10,%r10
C   r13:r14 = low:high limbs of the 128-bit inverse of d
C NOTE(review): a 2-adic (Hensel) inverse of d is computed below, which
C presumes d is odd, as for divexact_1/modexact_1_odd -- confirm callers.

ASM_START()
PROLOGUE(mpn_divrem_hensel_qr_1_2)
C Point rdi/rsi at the top limb and set r9 = 1-n, so (rsi,r9,8)
C walks the limbs from the bottom up and the add $2/jnc loop ends
C when r9 becomes non-negative.
mov $1,%r9
sub %rdx,%r9
lea -8(%rdi,%rdx,8),%rdi
lea -8(%rsi,%rdx,8),%rsi

C Callee-saved registers used to hold the precomputed inverse.
push %r12
push %r13
push %r14

mov %rcx,%rdx
C rdx is a 3-bit inverse of d (an odd d is its own inverse mod 8).

C Newton iteration inv' = inv*(2 - d*inv); each step is the
C mov/imul/mov $2/sub/imul pattern below and doubles the number of
C correct low bits: 3 -> 4 -> 8 -> 16 -> 32 -> 64 -> 128.
mov %rdx,%rax
imul %ecx,%edx
mov $2,%r11
sub %rdx,%r11
imul %eax,%r11d
C r11 has 4 bits

mov %r11,%rax
imul %ecx,%r11d
mov $2,%rdx
sub %r11,%rdx
imul %eax,%edx
C rdx has 8 bits

mov %rdx,%rax
imul %ecx,%edx
mov $2,%r11
sub %rdx,%r11
imul %eax,%r11d
C r11 has 16 bits

mov %r11,%rax
imul %ecx,%r11d
mov $2,%rdx
sub %r11,%rdx
imul %eax,%edx
C rdx has 32 bits

C From here the iteration switches to full 64-bit operations.
mov %rdx,%rax
imul %rcx,%rdx
mov $2,%r11
sub %rdx,%r11
imul %rax,%r11
C r11 has 64 bits

C One more doubling, carried out on two limbs: the high limb of the
C inverse is -hi(d*r11) * r11, giving a 128-bit inverse in r12:r11.
mov %r11,%rax
mov %r11,%r12
mul %rcx
neg %rdx
imul %rdx,%r12
C r12,r11 has 128 bits

C Keep the inverse in callee-saved regs: r13 = low limb, r14 = high limb.
mov %r11,%r13
mov %r12,%r14

C Prime the pipeline: load the first two source limbs, clear the
C borrow flag, and advance r9 past them.  If n <= 2 the add carries
C and we go straight to the tail.
mov (%rsi,%r9,8),%r11
mov 8(%rsi,%r9,8),%r12
mov $0,%r10
add $2,%r9
jc skiplp
ALIGN(16)
C Main loop: two limbs per iteration.  r11:r12 is the current 128-bit
C chunk (already adjusted for earlier borrows); multiplying it by the
C inverse r13:r14 mod 2^128 yields two quotient limbs, and hi(d * top
C quotient limb) is subtracted from the next chunk via the sbb chain.
lp:
	mov %r12,%r8
	mov %r13,%rax
	mul %r11
	C rax = low quotient limb; rdx accumulates the high one.
	mov %rax,-16(%rdi,%r9,8)
	imul %r14,%r11
	imul %r13,%r12
	add %r11,%rdx
	add %r12,%rdx
	C Load the next two source limbs before rdx is consumed.
	mov (%rsi,%r9,8),%r11
	mov 8(%rsi,%r9,8),%r12
	mov %rdx,-8(%rdi,%r9,8)
	C d * (high quotient limb): the high half must be borrowed
	C from the following chunk.
	mov %rcx,%rax
	mul %rdx
	C Re-inject the saved borrow (r10 is 0 or -1, so add r10,r10
	C sets CF exactly when a borrow was pending) and propagate it.
	add %r10,%r10
	sbb $0,%r11
	sbb $0,%r12
	sbb %r10,%r10
	C Compare the old second limb against lo(d*q) and subtract
	C hi(d*q) plus that borrow from the new chunk.
	cmp %rax,%r8
	sbb %rdx,%r11
	sbb $0,%r12
	sbb $0,%r10
	add $2,%r9
	jnc lp
skiplp:
C Tail: process the final loaded pair exactly as in the loop body.
mov %r12,%r8
mov %r13,%rax
mul %r11
mov %rax,-16(%rdi,%r9,8)
imul %r14,%r11
imul %r13,%r12
add %r11,%rdx
add %r12,%rdx
C r9 == 0 means one source limb is still unprocessed (case1);
C otherwise nothing remains beyond this pair (case0).
cmp $0,%r9
jne case0
case1:
	C One extra limb: fold the borrow from this pair into it,
	C produce its quotient limb, then derive the remainder.
	mov (%rsi,%r9,8),%r11
	mov %rdx,-8(%rdi,%r9,8)
	mov %rcx,%rax
	mul %rdx
	add %r10,%r10
	sbb $0,%r11
	sbb %r10,%r10
	cmp %rax,%r8
	sbb %rdx,%r11
	sbb $0,%r10
	C Last quotient limb = adjusted limb * low inverse limb.
	mov %r11,%rax
	imul %r13,%rax
	mov %rax,(%rdi,%r9,8)
	C rax return value: hi(d * last quotient limb) plus the
	C outstanding borrow -- the Hensel remainder.
	mul %rcx
	add %r10,%r10
	mov $0,%rax
	adc %rdx,%rax
	pop %r14
	pop %r13
	pop %r12
	ret
case0:
	C No extra limb: store the last quotient limb and form the
	C remainder from hi(d*q), the final compare carry, and the
	C saved borrow flag in r10.
	mov %rdx,-8(%rdi,%r9,8)
	mov %rcx,%rax
	mul %rdx
	cmp %rax,%r8
	mov $0,%rax
	adc %rdx,%rax
	sub %r10,%rax
	pop %r14
	pop %r13
	pop %r12
	ret
EPILOGUE()