dnl  Alpha mpn_modexact_1c_odd -- mpn exact remainder

dnl  Copyright 2003, 2004 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C      cycles/limb
C EV4:    47
C EV5:    30
C EV6:    15


C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
C                                mp_limb_t c)
C
C NOTE: as the name says, d must be odd -- the binary inverse of d modulo
C 2^64 computed below exists only for odd d.
C
C This code follows the "alternate" code in mpn/generic/mode1o.c,
C eliminating cbit+climb from the dependent chain.  This leaves,
C
C        ev4    ev5   ev6
C         1      3     1    subq   y = x - h
C        23     13     7    mulq   q = y * inverse
C        23     14     7    umulh  h = high (q * d)
C        --     --    --
C        47     30    15
C
C In each case, the load latency, loop control, and extra carry bit handling
C hide under the multiply latencies.  Those latencies are long enough that
C we don't need to worry about alignment or pairing to squeeze out
C performance.
C
C For the first limb, some of the loop code is broken out and scheduled back
C since it can be done earlier.
C
C   - The first ldq src[0] is near the start of the routine, for maximum
C     time from memory.
C
C   - The subq y=x-climb can be done without waiting for the inverse.
C
C   - The mulq y*inverse is replicated after the final subq for the inverse,
C     instead of branching to the mulq in the main loop.  On ev4 a branch
C     there would cost cycles, but we can hide them under the mulq latency.
C
C For the last limb, high<divisor is tested and if that's true a subtract
C and addback is done, as per the main mpn/generic/mode1o.c code.  This is a
C data-dependent branch, but we're waiting for umulh so any penalty should
C hide there.  The multiplies saved would be worth the cost anyway.
C
C Enhancements:
C
C For size==1, a plain division (done bitwise say) might be faster than
C calculating an inverse, the latter taking about 130 cycles on ev4 or 70 on
C ev5.  A call to gcc __remqu might be a possibility.

ASM_START()
PROLOGUE(mpn_modexact_1c_odd,gp)

	C Incoming arguments (standard Alpha integer argument registers):
	C r16	src
	C r17	size
	C r18	d
	C r19	c

	C Look up an 8-bit starting inverse of d in binvert_limb_table,
	C which is indexed by (d>>1) & 0x7F, i.e. by the 7 bits of d above
	C the (always 1) low bit.
	LEA(r0, binvert_limb_table)
	srl	r18, 1, r20		C d >> 1

	and	r20, 127, r20		C idx = d>>1 & 0x7F

	addq	r0, r20, r21		C table + idx

C When the BWX byte/word extension is available a single byte load does the
C job; otherwise fall back on an unaligned quadword load plus byte extract.
ifelse(bwx_available_p,1,
`	ldbu	r20, 0(r21)		C table[idx], inverse 8 bits
',`	ldq_u	r20, 0(r21)		C table[idx] qword
	extbl	r20, r21, r20		C table[idx], inverse 8 bits
')

	C Refine the inverse by Newton iterations, inv' = 2*inv - inv*inv*d,
	C each of which doubles the number of correct low bits: 8 -> 16 ->
	C 32 -> 64.  Loads and first-limb setup are interleaved to hide
	C under the multiply latencies.

	mull	r20, r20, r7		C i*i
	addq	r20, r20, r20		C 2*i

	ldq	r2, 0(r16)		C x = s = src[0]
	lda	r17, -1(r17)		C size--
	clr	r0			C initial cbit=0

	mull	r7, r18, r7		C i*i*d

	subq	r20, r7, r20		C 2*i-i*i*d, inverse 16 bits

	mull	r20, r20, r7		C i*i
	addq	r20, r20, r20		C 2*i

	mull	r7, r18, r7		C i*i*d

	subq	r20, r7, r20		C 2*i-i*i*d, inverse 32 bits

	mulq	r20, r20, r7		C i*i
	addq	r20, r20, r20		C 2*i

	mulq	r7, r18, r7		C i*i*d
	subq	r2, r19, r3		C y = x - climb

	subq	r20, r7, r20		C inv = 2*i-i*i*d, inverse 64 bits

ASSERT(r7,			C should have d*inv==1 mod 2^64
`	mulq	r18, r20, r7
	cmpeq	r7, 1, r7')

	mulq	r3, r20, r4		C first q = y * inv

	beq	r17, L(one)		C if size==1
	br	L(entry)


L(top):
	C Loop registers:
	C r0	cbit
	C r16	src, incrementing
	C r17	size, decrementing
	C r18	d
	C r19	climb
	C r20	inv

	ldq	r1, 0(r16)		C s = src[i]
	subq	r1, r0, r2		C x = s - cbit
	cmpult	r1, r0, r0		C new cbit = s < cbit

	subq	r2, r19, r3		C y = x - climb

	mulq	r3, r20, r4		C q = y * inv
L(entry):
	cmpult	r2, r19, r5		C cbit2 = x < climb
	addq	r5, r0, r0		C cbit += cbit2
	lda	r16, 8(r16)		C src++
	lda	r17, -1(r17)		C size--

	umulh	r4, r18, r19		C climb = q * d
	bne	r17, L(top)		C while 2 or more limbs left


	C Last limb.
	C r0	cbit
	C r18	d
	C r19	climb
	C r20	inv

	ldq	r1, 0(r16)		C s = src[size-1] high limb

	cmpult	r1, r18, r2		C test high<divisor
	bne	r2, L(skip)		C skip if so

	C can't skip a division, repeat loop code

	subq	r1, r0, r2		C x = s - cbit
	cmpult	r1, r0, r0		C new cbit = s < cbit

	subq	r2, r19, r3		C y = x - climb

	mulq	r3, r20, r4		C q = y * inv
L(one):
	cmpult	r2, r19, r5		C cbit2 = x < climb
	addq	r5, r0, r0		C cbit += cbit2

	umulh	r4, r18, r19		C climb = q * d

	addq	r19, r0, r0		C return climb + cbit
	ret	r31, (r26), 1


	ALIGN(8)
L(skip):
	C with high<divisor, the final step can be just (cbit+climb)-s and
	C an addback of d if that underflows

	addq	r19, r0, r19		C c = climb + cbit

	subq	r19, r1, r2		C c - s
	cmpult	r19, r1, r3		C c < s

	addq	r2, r18, r0		C return c-s + divisor

	cmoveq	r3, r2, r0		C return c-s if no underflow
	ret	r31, (r26), 1

EPILOGUE()
ASM_END()