dnl  IA-64 mpn_rsh1add_n/mpn_rsh1sub_n -- rp[] = (up[] +- vp[]) >> 1.

dnl  Copyright 2003, 2004, 2005 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C Itanium:      2.5
C Itanium 2:    1.5

C TODO
C  * Rewrite function entry code using aorslsh1_n.asm style.
C  * Micro-optimize feed-in and wind-down code.
C INPUT PARAMETERS
C   rp = result limb pointer, up/vp = source limb pointers, n = limb count
define(`rp',`r32')
define(`up',`r33')
define(`vp',`r34')
define(`n',`r35')

C NOTE(review): this hardcodes the add variant.  Upstream GMP normally leaves
C OPERATION_rsh1add_n/OPERATION_rsh1sub_n to the build system so the same
C source builds both functions -- confirm the hardcoded define is intended.
define(`OPERATION_rsh1add_n',1)

C Select operation-specific pieces: the add/sub mnemonic, the unsigned
C compare used for carry/borrow detection (ltu for add, gtu for sub), the
C increment applied when propagating carry/borrow, the limb value at which
C propagation continues (LIM), and the exported function name.
ifdef(`OPERATION_rsh1add_n',`
  define(ADDSUB,	add)
  define(PRED,		ltu)
  define(INCR,		1)
  define(LIM,		-1)
  define(func, mpn_rsh1add_n)
')
ifdef(`OPERATION_rsh1sub_n',`
  define(ADDSUB,	sub)
  define(PRED,		gtu)
  define(INCR,		-1)
  define(LIM,		0)
  define(func, mpn_rsh1sub_n)
')

C Some useful aliases for registers we use
C   u0-u3 = limbs loaded from up[], v0-v3 = limbs loaded from vp[],
C   w0-w3 = ADDSUB results (carry folded in via predicated INCR),
C   x0-x3 = right-shifted output limbs formed with shrp from adjacent w's.
define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21')
define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25')
define(`x0',`r26') define(`x1',`r9') define(`x2',`r30') define(`x3',`r31')

C func(rp, up, vp, n): rp[i] = (up[] ADDSUB vp[]) >> 1, returning in r8 the
C bit shifted out at the low end (the low bit of the first ADDSUB result).
C The loop is 4-way unrolled; p15 flags n > 4, r14 = n mod 4 selects one of
C four feed-in paths (.Lb00/.Lb01/.Lb10/.Lb11).  Carry (or borrow) between
C limbs is tracked in predicates p6-p9: cmp.PRED detects wraparound of the
C raw ADDSUB, and the predicated cmp.eq.or/add pair propagates an incoming
C carry through a limb that sits at the propagation boundary (LIM).
ASM_START()
PROLOGUE(func)
	.prologue
	.save	ar.lc, r2
	.body
ifdef(`HAVE_ABI_32',`
	addp4	rp = 0, rp		C M I
	addp4	up = 0, up		C M I
	addp4	vp = 0, vp		C M I
	zxt4	n = n			C I
	;;
')
C Load the first limb pair, save ar.lc, and dispatch on n mod 4.
 {.mmi;	ld8	r11 = [vp], 8		C M01
	ld8	r10 = [up], 8		C M01
	mov.i	r2 = ar.lc		C I0
}{.mmi;	and	r14 = 3, n		C M I
	cmp.lt	p15, p0 = 4, n		C M I
	add	n = -4, n		C M I
	;;
}{.mmi;	cmp.eq	p6, p0 = 1, r14		C M I
	cmp.eq	p7, p0 = 2, r14		C M I
	cmp.eq	p8, p0 = 3, r14		C M I
}{.bbb
  (p6)	br.dptk	.Lb01			C B
  (p7)	br.dptk	.Lb10			C B
  (p8)	br.dptk	.Lb11			C B
}

C Feed-in for n = 0 (mod 4).  Short path handles n = 4 inline; .grt4
C primes the software pipeline and enters the main loop at .LL00.
.Lb00:	ld8	v0 = [vp], 8		C M01
	ld8	u0 = [up], 8		C M01
	shr.u	n = n, 2		C I0
	;;
	ld8	v1 = [vp], 8		C M01
	ld8	u1 = [up], 8		C M01
	ADDSUB	w3 = r10, r11		C M I
	;;
	ld8	v2 = [vp], 8		C M01
	ld8	u2 = [up], 8		C M01
 (p15)	br.dpnt	.grt4			C B
	;;

	cmp.PRED p7, p0 = w3, r10	C M I
	and	r8 = 1, w3		C M I	return value = bit shifted out
	ADDSUB	w0 = u0, v0		C M I
	;;
	cmp.PRED p8, p0 = w0, u0	C M I
	ADDSUB	w1 = u1, v1		C M I
	;;
	cmp.PRED p9, p0 = w1, u1	C M I
 (p7)	cmp.eq.or p8, p0 = LIM, w0	C M I
 (p7)	add	w0 = INCR, w0		C M I
	;;
	shrp	x3 = w0, w3, 1		C I0
	ADDSUB	w2 = u2, v2		C M I
 (p8)	cmp.eq.or p9, p0 = LIM, w1	C M I
 (p8)	add	w1 = INCR, w1		C M I
	br	.Lcj4			C B

.grt4:	ld8	v3 = [vp], 8		C M01
	cmp.PRED p7, p0 = w3, r10	C M I
	ld8	u3 = [up], 8		C M01
	and	r8 = 1, w3		C M I
	;;
	ADDSUB	w0 = u0, v0		C M I
	ld8	v0 = [vp], 8		C M01
	add	n = -1, n
	;;
	cmp.PRED p8, p0 = w0, u0	C M I
	ld8	u0 = [up], 8		C M01
	ADDSUB	w1 = u1, v1		C M I
	;;
	ld8	v1 = [vp], 8		C M01
	mov.i	ar.lc = n		C I0
	cmp.PRED p9, p0 = w1, u1	C M I
	ld8	u1 = [up], 8		C M01
 (p7)	cmp.eq.or p8, p0 = LIM, w0	C M I
 (p7)	add	w0 = INCR, w0		C M I
	;;
	ADDSUB	w2 = u2, v2		C M I
	ld8	v2 = [vp], 8		C M01
	shrp	x3 = w0, w3, 1		C I0
 (p8)	cmp.eq.or p9, p0 = LIM, w1	C M I
 (p8)	add	w1 = INCR, w1		C M I
	br	.LL00			C B


C Feed-in for n = 1 (mod 4).  Short path handles n = 1 inline; for the
C final (most significant) limb the shifted-in bit is the carry, inserted
C at bit 63 with dep.
.Lb01:	ADDSUB	w2 = r10, r11		C M I
	shr.u	n = n, 2		C I0
 (p15)	br.dpnt	.grt1			C B
	;;

	cmp.PRED p6, p7 = w2, r10	C M I
	shr.u	x2 = w2, 1		C I0
	and	r8 = 1, w2		C M I
	;;
 (p6)	dep	x2 = -1, x2, 63, 1	C I0	top bit = carry/borrow out
	br	.Lcj1			C B

.grt1:	ld8	v3 = [vp], 8		C M01
	ld8	u3 = [up], 8		C M01
	;;
	ld8	v0 = [vp], 8		C M01
	ld8	u0 = [up], 8		C M01
	mov.i	ar.lc = n		C FIXME swap with next	I0
	;;
	ld8	v1 = [vp], 8		C M01
	ld8	u1 = [up], 8		C M01
	;;
	ld8	v2 = [vp], 8		C M01
	ld8	u2 = [up], 8		C M01
	cmp.PRED p6, p0 = w2, r10	C M I
	and	r8 = 1, w2		C M I
	ADDSUB	w3 = u3, v3		C M I
	br.cloop.dptk .grt5		C B
	;;

	cmp.PRED p7, p0 = w3, u3	C M I
	;;
	ADDSUB	w0 = u0, v0		C M I
 (p6)	cmp.eq.or p7, p0 = LIM, w3	C M I
 (p6)	add	w3 = INCR, w3		C M I
	;;
	cmp.PRED p8, p0 = w0, u0	C M I
	shrp	x2 = w3, w2, 1		C I0
	ADDSUB	w1 = u1, v1		C M I
	;;
	cmp.PRED p9, p0 = w1, u1	C M I
 (p7)	cmp.eq.or p8, p0 = LIM, w0	C M I
 (p7)	add	w0 = INCR, w0		C M I
	br	.Lcj5			C B

.grt5:	ld8	v3 = [vp], 8		C M01
	cmp.PRED p7, p0 = w3, u3	C M I
	ld8	u3 = [up], 8		C M01
	;;
	ADDSUB	w0 = u0, v0		C M I
	ld8	v0 = [vp], 8		C M01
 (p6)	cmp.eq.or p7, p0 = LIM, w3	C M I
 (p6)	add	w3 = INCR, w3		C M I
	;;
	cmp.PRED p8, p0 = w0, u0	C M I
	shrp	x2 = w3, w2, 1		C I0
	ld8	u0 = [up], 8		C M01
	ADDSUB	w1 = u1, v1		C M I
	;;
	ld8	v1 = [vp], 8		C M01
	cmp.PRED p9, p0 = w1, u1	C M I
	ld8	u1 = [up], 8		C M01
 (p7)	cmp.eq.or p8, p0 = LIM, w0	C M I
 (p7)	add	w0 = INCR, w0		C M I
	br	.LL01			C B


C Feed-in for n = 2 (mod 4).
.Lb10:	ld8	v2 = [vp], 8		C M01
	ld8	u2 = [up], 8		C M01
	shr.u	n = n, 2		C I0
	ADDSUB	w1 = r10, r11		C M I
 (p15)	br.dpnt	.grt2			C B
	;;

	cmp.PRED p9, p0 = w1, r10	C M I
	and	r8 = 1, w1		C M I
	ADDSUB	w2 = u2, v2		C M I
	;;
	cmp.PRED p6, p0 = w2, u2	C M I
	;;
 (p9)	cmp.eq.or p6, p0 = LIM, w2	C M I
 (p9)	add	w2 = INCR, w2		C M I
	;;
	shrp	x1 = w2, w1, 1		C I0
	shr.u	x2 = w2, 1		C I0
	br	.Lcj2			C B

.grt2:	ld8	v3 = [vp], 8		C M01
	ld8	u3 = [up], 8		C M01
	;;
	ld8	v0 = [vp], 8		C M01
	ld8	u0 = [up], 8		C M01
	mov.i	ar.lc = n		C I0
	;;
	ld8	v1 = [vp], 8		C M01
	cmp.PRED p9, p0 = w1, r10	C M I
	ld8	u1 = [up], 8		C M01
	and	r8 = 1, w1		C M I
	;;
	ADDSUB	w2 = u2, v2		C M I
	ld8	v2 = [vp], 8		C M01
	;;
	cmp.PRED p6, p0 = w2, u2	C M I
	ld8	u2 = [up], 8		C M01
	ADDSUB	w3 = u3, v3		C M I
	br.cloop.dptk .grt6		C B
	;;

	cmp.PRED p7, p0 = w3, u3	C M I
 (p9)	cmp.eq.or p6, p0 = LIM, w2	C M I
 (p9)	add	w2 = INCR, w2		C M I
	;;
	shrp	x1 = w2, w1, 1		C I0
	ADDSUB	w0 = u0, v0		C M I
 (p6)	cmp.eq.or p7, p0 = LIM, w3	C M I
 (p6)	add	w3 = INCR, w3		C M I
	br	.Lcj6			C B

.grt6:	ld8	v3 = [vp], 8		C M01
	cmp.PRED p7, p0 = w3, u3	C M I
	ld8	u3 = [up], 8		C M01
 (p9)	cmp.eq.or p6, p0 = LIM, w2	C M I
 (p9)	add	w2 = INCR, w2		C M I
	;;
	shrp	x1 = w2, w1, 1		C I0
	ADDSUB	w0 = u0, v0		C M I
	ld8	v0 = [vp], 8		C M01
 (p6)	cmp.eq.or p7, p0 = LIM, w3	C M I
 (p6)	add	w3 = INCR, w3		C M I
	br	.LL10			C B


C Feed-in for n = 3 (mod 4).
.Lb11:	ld8	v1 = [vp], 8		C M01
	ld8	u1 = [up], 8		C M01
	shr.u	n = n, 2		C I0
	;;
	ld8	v2 = [vp], 8		C M01
	ld8	u2 = [up], 8		C M01
	ADDSUB	w0 = r10, r11		C M I
 (p15)	br.dpnt	.grt3			C B
	;;

	cmp.PRED p8, p0 = w0, r10	C M I
	ADDSUB	w1 = u1, v1		C M I
	and	r8 = 1, w0		C M I
	;;
	cmp.PRED p9, p0 = w1, u1	C M I
	;;
	ADDSUB	w2 = u2, v2		C M I
 (p8)	cmp.eq.or p9, p0 = LIM, w1	C M I
 (p8)	add	w1 = INCR, w1		C M I
	;;
	cmp.PRED p6, p0 = w2, u2	C M I
	shrp	x0 = w1, w0, 1		C I0
	;;
 (p9)	cmp.eq.or p6, p0 = LIM, w2	C M I
 (p9)	add	w2 = INCR, w2		C M I
	br	.Lcj3			C B

.grt3:	ld8	v3 = [vp], 8		C M01
	ld8	u3 = [up], 8		C M01
	;;
	ld8	v0 = [vp], 8		C M01
	mov.i	ar.lc = n		C I0
	cmp.PRED p8, p0 = w0, r10	C M I
	ld8	u0 = [up], 8		C M01
	ADDSUB	w1 = u1, v1		C M I
	and	r8 = 1, w0		C M I
	;;
	ld8	v1 = [vp], 8		C M01
	cmp.PRED p9, p0 = w1, u1	C M I
	ld8	u1 = [up], 8		C M01
	;;
	ADDSUB	w2 = u2, v2		C M I
	ld8	v2 = [vp], 8		C M01
 (p8)	cmp.eq.or p9, p0 = LIM, w1	C M I
 (p8)	add	w1 = INCR, w1		C M I
	;;
	cmp.PRED p6, p0 = w2, u2	C M I
	shrp	x0 = w1, w0, 1		C I0
	ld8	u2 = [up], 8		C M01
	ADDSUB	w3 = u3, v3		C M I
	br.cloop.dptk .grt7		C B
	;;

	cmp.PRED p7, p0 = w3, u3	C M I
 (p9)	cmp.eq.or p6, p0 = LIM, w2	C M I
 (p9)	add	w2 = INCR, w2		C M I
	br	.Lcj7			C B

.grt7:	ld8	v3 = [vp], 8		C M01
	cmp.PRED p7, p0 = w3, u3	C M I
	ld8	u3 = [up], 8		C M01
 (p9)	cmp.eq.or p6, p0 = LIM, w2	C M I
 (p9)	add	w2 = INCR, w2		C M I
	br	.LL11			C B


C *** MAIN LOOP START ***
C Steady state: 4 limbs per iteration; stores of x's overlap loads of the
C next u/v limbs, predicated carry propagation, and shrp pair-shifts.
	ALIGN(32)
.Loop:	st8	[rp] = x3, 8		C M23
	ld8	v3 = [vp], 8		C M01
	cmp.PRED p7, p0 = w3, u3	C M I
	ld8	u3 = [up], 8		C M01
 (p9)	cmp.eq.or p6, p0 = LIM, w2	C M I
 (p9)	add	w2 = INCR, w2		C M I
	;;
.LL11:	st8	[rp] = x0, 8		C M23
	shrp	x1 = w2, w1, 1		C I0
	ADDSUB	w0 = u0, v0		C M I
	ld8	v0 = [vp], 8		C M01
 (p6)	cmp.eq.or p7, p0 = LIM, w3	C M I
 (p6)	add	w3 = INCR, w3		C M I
	;;
.LL10:	cmp.PRED p8, p0 = w0, u0	C M I
	shrp	x2 = w3, w2, 1		C I0
	nop.b	0
	ld8	u0 = [up], 8		C M01
	ADDSUB	w1 = u1, v1		C M I
	nop.b	0
	;;
	st8	[rp] = x1, 8		C M23
	ld8	v1 = [vp], 8		C M01
	cmp.PRED p9, p0 = w1, u1	C M I
	ld8	u1 = [up], 8		C M01
 (p7)	cmp.eq.or p8, p0 = LIM, w0	C M I
 (p7)	add	w0 = INCR, w0		C M I
	;;
.LL01:	st8	[rp] = x2, 8		C M23
	shrp	x3 = w0, w3, 1		C I0
	ADDSUB	w2 = u2, v2		C M I
	ld8	v2 = [vp], 8		C M01
 (p8)	cmp.eq.or p9, p0 = LIM, w1	C M I
 (p8)	add	w1 = INCR, w1		C M I
	;;
.LL00:	cmp.PRED p6, p0 = w2, u2	C M I
	shrp	x0 = w1, w0, 1		C I0
	nop.b	0
	ld8	u2 = [up], 8		C M01
	ADDSUB	w3 = u3, v3		C M I
	br.cloop.dptk .Loop		C B
	;;
C *** MAIN LOOP END ***

C Wind-down: drain the pipeline, storing the remaining x limbs; .LcjN are
C the join points for the short (n <= 4) feed-in paths.
.Lskip:	st8	[rp] = x3, 8		C M23
	cmp.PRED p7, p0 = w3, u3	C M I
 (p9)	cmp.eq.or p6, p0 = LIM, w2	C M I
 (p9)	add	w2 = INCR, w2		C M I
	;;
.Lcj7:	st8	[rp] = x0, 8		C M23
	shrp	x1 = w2, w1, 1		C I0
	ADDSUB	w0 = u0, v0		C M I
 (p6)	cmp.eq.or p7, p0 = LIM, w3	C M I
 (p6)	add	w3 = INCR, w3		C M I
	;;
.Lcj6:	cmp.PRED p8, p0 = w0, u0	C M I
	shrp	x2 = w3, w2, 1		C I0
	ADDSUB	w1 = u1, v1		C M I
	;;
	st8	[rp] = x1, 8		C M23
	cmp.PRED p9, p0 = w1, u1	C M I
 (p7)	cmp.eq.or p8, p0 = LIM, w0	C M I
 (p7)	add	w0 = INCR, w0		C M I
	;;
.Lcj5:	st8	[rp] = x2, 8		C M23
	shrp	x3 = w0, w3, 1		C I0
	ADDSUB	w2 = u2, v2		C M I
 (p8)	cmp.eq.or p9, p0 = LIM, w1	C M I
 (p8)	add	w1 = INCR, w1		C M I
	;;
.Lcj4:	cmp.PRED p6, p0 = w2, u2	C M I
	shrp	x0 = w1, w0, 1		C I0
	;;
	st8	[rp] = x3, 8		C M23
 (p9)	cmp.eq.or p6, p0 = LIM, w2	C M I
 (p9)	add	w2 = INCR, w2		C M I
	;;
.Lcj3:	st8	[rp] = x0, 8		C M23
	shrp	x1 = w2, w1, 1		C I0
	shr.u	x2 = w2, 1		C I0
	;;
.Lcj2:	st8	[rp] = x1, 8		C M23
 (p6)	dep	x2 = -1, x2, 63, 1	C I0	insert final carry/borrow as msb
	;;
.Lcj1:	st8	[rp] = x2		C M23
	mov.i	ar.lc = r2		C I0	restore caller's loop count reg
	br.ret.sptk.many b0		C B
EPILOGUE()