// ****************************************************************************
// *
// *  XVID MPEG-4 VIDEO CODEC
// *  - IA64 halfpel refinement -
// *
// *  Copyright(C) 2002 Johannes Singler, Daniel Winkler
// *
// *  This program is free software; you can redistribute it and/or modify it
// *  under the terms of the GNU General Public License as published by
// *  the Free Software Foundation; either version 2 of the License, or
// *  (at your option) any later version.
// *
// *  This program is distributed in the hope that it will be useful,
// *  but WITHOUT ANY WARRANTY; without even the implied warranty of
// *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// *  GNU General Public License for more details.
// *
// *  You should have received a copy of the GNU General Public License
// *  along with this program; if not, write to the Free Software
// *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
// *
// * $Id: halfpel8_refine_ia64.s,v 1.4 2009-02-19 17:07:29 Isibaar Exp $
// *
// ***************************************************************************/
//
// ****************************************************************************
// *
// *  halfpel8_refine_ia64.s, IA-64 halfpel refinement
// *
// *  This version was implemented during an IA-64 practical training at
// *  the University of Karlsruhe (http://i44w3.info.uni-karlsruhe.de/)
// *
// ****************************************************************************

// ------------------------------------------------------------------------------
// * Programmed by
// * Johannes Singler (email@jsingler.de), Daniel Winkler (infostudent@uni.de)
// *
// * Programmed for the IA64 laboratory held at the University of Karlsruhe, 2002
// * http://www.info.uni-karlsruhe.de/~rubino/ia64p/
// *
// ------------------------------------------------------------------------------
// *
// * This is the optimized assembler version of Halfpel8_Refine. This function
// * is worth optimizing for the IA-64 architecture because of its huge
// * register set: we can hold all necessary data in general-purpose registers
// * and reuse it.
// *
// * Our approach uses:
// *  - the Itanium instruction psad1, which computes the sum of absolute
// *    differences in hardware,
// *  - alignment resolving to avoid memory faults,
// *  - massive loop unrolling.
// *
// ------------------------------------------------------------------------------
// *
// *  -------      Half-pixel steps around the center (*) and the
// *  |0|1|0|      corresponding register set parts.
// *  -------
// *  |2|*|2|
// *  -------
// *  |0|1|0|
// *  -------
// *
// ------------------------------------------------------------------------------
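// * Editor's sketch of what the routine computes: the cost
// *
// *     cost = SAD(candidate block) + lambda * bits(candidate MV)
// *
// * is evaluated at the eight half-pel neighbours of the current best
// * vector, and the minimum is kept. In C, with purely illustrative helper
// * names (sad8, halfpel_ref and mv_bits are assumptions for this sketch,
// * not the actual XviD API):
// *
// *     for (int sy = -1; sy <= 1; sy++)
// *         for (int sx = -1; sx <= 1; sx++) {
// *             if (sx == 0 && sy == 0) continue;   /* center is known      */
// *             int x = currX + sx, y = currY + sy;
// *             if (x < min_dx || x > max_dx || y < min_dy || y > max_dy)
// *                 continue;                       /* outside search range */
// *             int cost = sad8(cur, halfpel_ref(x, y), iEdgedWidth)
// *                      + lambda * mv_bits(x - dx, y - dy, iFcode);
// *             if (cost < iMinSAD) {
// *                 iMinSAD = cost; currX = x; currY = y;
// *             }
// *         }
// *
// ------------------------------------------------------------------------------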
// * calc_delta is split up into three parts which are included from
// *
// * calc_delta_1.s
// * calc_delta_2.s
// * calc_delta_3.s
// *
// ------------------------------------------------------------------------------
// * We assume min_dx <= currX <= max_dx && min_dy <= currY <= max_dy


.sdata
        .align 4
        .type   lambda_vec8#,@object
        .size   lambda_vec8#,128
lambda_vec8:
        data4 0
        data4 1
        data4 1
        data4 1
        data4 1
        data4 2
        data4 2
        data4 2
        data4 2
        data4 3
        data4 3
        data4 3
        data4 4
        data4 4
        data4 4
        data4 5
        data4 5
        data4 6
        data4 7
        data4 7
        data4 8
        data4 9
        data4 10
        data4 11
        data4 13
        data4 14
        data4 16
        data4 18
        data4 21
        data4 25
        data4 30
        data4 36


        .type   mvtab#,@object
        .size   mvtab#,132
mvtab:
        data4 1
        data4 2
        data4 3
        data4 4
        data4 6
        data4 7
        data4 7
        data4 7
        data4 9
        data4 9
        data4 9
        data4 10
        data4 10
        data4 10
        data4 10
        data4 10
        data4 10
        data4 10
        data4 10
        data4 10
        data4 10
        data4 10
        data4 10
        data4 10
        data4 10
        data4 11
        data4 11
        data4 11
        data4 11
        data4 11
        data4 11
        data4 12
        data4 12

.text
        .align 16
        .global Halfpel8_Refine_ia64#
        .proc Halfpel8_Refine_ia64#

Halfpel8_Refine_ia64:

        pfs    = r14
        prsave = r15

        // Save important registers

        alloc pfs = ar.pfs, 18, 74, 4, 96
        mov prsave = pr

        // Naming registers for better readability

        pRef        = in0
        pRefH       = in1
        pRefV       = in2
        pRefHV      = in3
        cura        = in4
        x           = in5
        y           = in6
        currMV      = in7
        iMinSAD     = in8
        dx          = in9
        dy          = in10
        min_dx      = in11
        max_dx      = in12
        min_dy      = in13
        max_dy      = in14
        iFcode      = in15
        iQuant      = in16
        iEdgedWidth = in17

        iSAD         = r17
        backupX      = r18
        backupY      = r19
        currX        = r20
        currY        = r21
        currYAddress = r22
        bitX0        = r23
        bitY0        = r24
        dxd2         = r25
        dyd2         = r26
        offset       = r27
        block        = r28
        nob02        = r29
        nob1         = r30
        nob64m02     = r31
        nob64m1      = r127
        const7       = r126
        nob56m02     = r125
        oldX         = r124
        oldY         = r123

        .rotr inregisters[18], refaa[3], refab[3], cur[8], ref0a[9], ref0b[9], ref1a[9], mpr[9], ref2a[8], ref2b[8], component[2], sc[2], tabaddress[2]

        fx             = f8
        fy             = f9
        fblock         = f10
        fiEdgedWidth   = f11
        fdxd2          = f12
        fdyd2          = f13
        foffset        = f14
        fydiEdgedWidth = f15
        fQuant         = f32
        fmv            = f33

        n      = p16
        h      = p17
        v      = p18
        hv     = p19
        l      = p20
        r      = p21
        t      = p22
        b      = p23
        lt     = p24
        lb     = p25
        rt     = p26
        rb     = p27
        fb     = p28
        non0_0 = p30
        non0_1 = p31
        non0_2 = p32
        non0_3 = p33
        neg_0  = p34
        neg_1  = p35
        neg_2  = p36
        neg_3  = p37
        cg32_0 = p29
        cg32_1 = p38

        // Initialize input variables

        add sp = 16, sp
        ;;
        ld4 iMinSAD = [sp], 8
        ;;
        sxt4 iMinSAD = iMinSAD

        ld4 dx = [sp], 8
        ;;
        sxt4 dx = dx

        ld4 dy = [sp], 8
        ;;
        sxt4 dy = dy

        ld4 min_dx = [sp], 8
        ;;
        sxt4 min_dx = min_dx

        ld4 max_dx = [sp], 8
        ;;
        sxt4 max_dx = max_dx

        ld4 min_dy = [sp], 8
        ;;
        sxt4 min_dy = min_dy

        ld4 max_dy = [sp], 8
        ;;
        sxt4 max_dy = max_dy

        ld4 iFcode = [sp], 8
        ;;
        sxt4 iFcode = iFcode
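
        // Editor's note: the bundles below turn the raw quantizer into the
        // rate-distortion weight used for the MV bit cost. In C (sketch):
        //
        //     int lambda = 2 * lambda_vec8[iQuant];
        //
        // The doubled table value is kept as an integer in fQuant so that
        // the included calc_delta parts can multiply it with the
        // motion-vector bit count via xma (editor's reading).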
        ld4 iQuant = [sp], 8

        add tabaddress[0] = @gprel(lambda_vec8#), gp
        ;;
        shladd tabaddress[0] = iQuant, 2, tabaddress[0]
        ;;
        ld4 iQuant = [tabaddress[0]]
        ;;
        sxt4 iQuant = iQuant
        ;;
        add iFcode = -1, iFcode         // only used in the decreased version
        shl iQuant = iQuant, 1
        ;;
        setf.sig fQuant = iQuant

        ld4 iEdgedWidth = [sp]
        add sp = -88, sp


        // Initialize local variables

        ld4 currX = [currMV]
        add currYAddress = 4, currMV
        ;;
        sxt4 currX = currX
        ld4 currY = [currYAddress]
        ;;
        sxt4 currY = currY
        ;;
        // Calculate references

        cmp.gt l, p0 = currX, min_dx
        cmp.lt r, p0 = currX, max_dx
        cmp.gt t, p0 = currY, min_dy
        cmp.lt b, p0 = currY, max_dy
        add backupX = -1, currX         // move to the upper left corner of the square
        add backupY = -1, currY
        ;;
(b)     cmp.gt.unc lb, p0 = currX, min_dx
(t)     cmp.lt.unc rt, p0 = currX, max_dx
(l)     cmp.gt.unc lt, p0 = currY, min_dy
(r)     cmp.lt.unc rb, p0 = currY, max_dy

        and bitX0 = 1, backupX
        and bitY0 = 1, backupY
        ;;
        cmp.eq n, p0 = 0, bitX0
        cmp.eq h, p0 = 1, bitX0
        cmp.eq v, p0 = 0, bitX0
        cmp.eq hv, p0 = 1, bitX0
        ;;
        cmp.eq.and n, p0 = 0, bitY0
        cmp.eq.and h, p0 = 0, bitY0
        cmp.eq.and v, p0 = 1, bitY0
        cmp.eq.and hv, p0 = 1, bitY0
        ;;

        .pred.rel "mutex", p16, p17, p18, p19   // n, h, v, hv
(n)     mov refaa[0] = pRef
(h)     mov refaa[0] = pRefH
(v)     mov refaa[0] = pRefV
(hv)    mov refaa[0] = pRefHV

(n)     mov refaa[1] = pRefH
(h)     mov refaa[1] = pRef
(v)     mov refaa[1] = pRefHV
(hv)    mov refaa[1] = pRefV

(n)     mov refaa[2] = pRefV
(h)     mov refaa[2] = pRefHV
(v)     mov refaa[2] = pRef
(hv)    mov refaa[2] = pRefH


        // Calculate offset (integer multiplication on IA-64 sucks!)

        mov block = 8

        shr dxd2 = backupX, 1
        shr dyd2 = backupY, 1

        setf.sig fx = x
        setf.sig fy = y
        ;;
        setf.sig fblock = block
        setf.sig fiEdgedWidth = iEdgedWidth
        ;;
        setf.sig fdxd2 = dxd2
        setf.sig fdyd2 = dyd2
        ;;
        xma.l foffset = fx, fblock, fdxd2
        xma.l fydiEdgedWidth = fy, fblock, fdyd2
        ;;
        xma.l foffset = fydiEdgedWidth, fiEdgedWidth, foffset
        ;;
        getf.sig offset = foffset
        ;;
        add refaa[0] = refaa[0], offset
        add refaa[1] = refaa[1], offset
        add refaa[2] = refaa[2], offset
        ;;
(h)     add refaa[1] = 1, refaa[1]
(hv)    add refaa[1] = 1, refaa[1]
(v)     add refaa[2] = iEdgedWidth, refaa[2]
(hv)    add refaa[2] = iEdgedWidth, refaa[2]

        // Load, respecting the misalignment of the refx pointers...
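
        // Editor's note: IA-64 faults on misaligned ld8, so each potentially
        // unaligned 8-byte read below is done as two aligned loads that are
        // later merged by shifting. In C (sketch; ref is the unaligned
        // reference pointer):
        //
        //     uintptr_t p   = (uintptr_t)ref;              /* unaligned address */
        //     uint64_t  lo  = *(uint64_t *)(p & ~7);       /* aligned word      */
        //     uint64_t  hi  = *(uint64_t *)((p & ~7) + 8); /* next aligned word */
        //     unsigned  nob = (p & 7) * 8;                 /* misaligned bits   */
        //     uint64_t  v   = (lo >> nob) | (nob ? hi << (64 - nob) : 0);
        //
        // (The C guard for nob == 0 is not needed in the assembly: the IA-64
        // shl yields 0 for shift counts larger than 63.)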
        mov const7 = 7
        ;;
        dep.z nob02 = refaa[0], 3, 3
        dep.z nob1 = refaa[1], 3, 3
        ;;
        andcm refaa[0] = refaa[0], const7       // set last 3 bits = 0
        andcm refaa[1] = refaa[1], const7
        andcm refaa[2] = refaa[2], const7
        ;;
        add refab[0] = 8, refaa[0]
        add refab[1] = 8, refaa[1]
        add refab[2] = 8, refaa[2]
        ;;
        ld8 cur[0] = [cura], iEdgedWidth
        ld8 ref0a[0] = [refaa[0]], iEdgedWidth
        sub nob64m02 = 64, nob02                // 64 - nob

        ld8 ref0b[0] = [refab[0]], iEdgedWidth
        ld8 ref1a[0] = [refaa[1]], iEdgedWidth
        sub nob56m02 = 56, nob02                // 56 - nob

        ld8 mpr[0] = [refab[1]], iEdgedWidth
        ld8 ref2a[0] = [refaa[2]], iEdgedWidth
        sub nob64m1 = 64, nob1

        ld8 ref2b[0] = [refab[2]], iEdgedWidth
        ;;
        ld8 cur[1] = [cura], iEdgedWidth
        ld8 ref0a[1] = [refaa[0]], iEdgedWidth
        ld8 ref0b[1] = [refab[0]], iEdgedWidth
        ld8 ref1a[1] = [refaa[1]], iEdgedWidth
        ld8 mpr[1] = [refab[1]], iEdgedWidth
        ld8 ref2a[1] = [refaa[2]], iEdgedWidth
        ld8 ref2b[1] = [refab[2]], iEdgedWidth
        ;;
        ld8 cur[2] = [cura], iEdgedWidth
        ld8 ref0a[2] = [refaa[0]], iEdgedWidth
        ld8 ref0b[2] = [refab[0]], iEdgedWidth
        ld8 ref1a[2] = [refaa[1]], iEdgedWidth
        ld8 mpr[2] = [refab[1]], iEdgedWidth
        ld8 ref2a[2] = [refaa[2]], iEdgedWidth
        ld8 ref2b[2] = [refab[2]], iEdgedWidth
        ;;
        ld8 cur[3] = [cura], iEdgedWidth
        ld8 ref0a[3] = [refaa[0]], iEdgedWidth
        ld8 ref0b[3] = [refab[0]], iEdgedWidth
        ld8 ref1a[3] = [refaa[1]], iEdgedWidth
        ld8 mpr[3] = [refab[1]], iEdgedWidth
        ld8 ref2a[3] = [refaa[2]], iEdgedWidth
        ld8 ref2b[3] = [refab[2]], iEdgedWidth
        ;;
        ld8 cur[4] = [cura], iEdgedWidth
        ld8 ref0a[4] = [refaa[0]], iEdgedWidth
        ld8 ref0b[4] = [refab[0]], iEdgedWidth
        ld8 ref1a[4] = [refaa[1]], iEdgedWidth
        ld8 mpr[4] = [refab[1]], iEdgedWidth
        ld8 ref2a[4] = [refaa[2]], iEdgedWidth
        ld8 ref2b[4] = [refab[2]], iEdgedWidth
        ;;
        ld8 cur[5] = [cura], iEdgedWidth
        ld8 ref0a[5] = [refaa[0]], iEdgedWidth
        ld8 ref0b[5] = [refab[0]], iEdgedWidth
        ld8 ref1a[5] = [refaa[1]], iEdgedWidth
        ld8 mpr[5] = [refab[1]], iEdgedWidth
        ld8 ref2a[5] = [refaa[2]], iEdgedWidth
        ld8 ref2b[5] = [refab[2]], iEdgedWidth
        ;;
        ld8 cur[6] = [cura], iEdgedWidth
        ld8 ref0a[6] = [refaa[0]], iEdgedWidth
        ld8 ref0b[6] = [refab[0]], iEdgedWidth
        ld8 ref1a[6] = [refaa[1]], iEdgedWidth
        ld8 mpr[6] = [refab[1]], iEdgedWidth
        ld8 ref2a[6] = [refaa[2]], iEdgedWidth
        ld8 ref2b[6] = [refab[2]], iEdgedWidth
        ;;
        ld8 cur[7] = [cura]
        ld8 ref0a[7] = [refaa[0]], iEdgedWidth
        ld8 ref0b[7] = [refab[0]], iEdgedWidth
        ld8 ref1a[7] = [refaa[1]], iEdgedWidth
        ld8 mpr[7] = [refab[1]], iEdgedWidth
        ld8 ref2a[7] = [refaa[2]]
        ld8 ref2b[7] = [refab[2]]
        ;;
        ld8 ref0a[8] = [refaa[0]]
        ld8 ref0b[8] = [refab[0]]
        ld8 ref1a[8] = [refaa[1]]
        ld8 mpr[8] = [refab[1]]
        ;;


        // Align ref1

        shr.u ref1a[0] = ref1a[0], nob1
        shr.u ref1a[1] = ref1a[1], nob1
        shr.u ref1a[2] = ref1a[2], nob1
        shr.u ref1a[3] = ref1a[3], nob1
        shr.u ref1a[4] = ref1a[4], nob1
        shr.u ref1a[5] = ref1a[5], nob1
        shr.u ref1a[6] = ref1a[6], nob1
        shr.u ref1a[7] = ref1a[7], nob1
        shr.u ref1a[8] = ref1a[8], nob1

        shl mpr[0] = mpr[0], nob64m1
        shl mpr[1] = mpr[1], nob64m1
        shl mpr[2] = mpr[2], nob64m1
        shl mpr[3] = mpr[3], nob64m1
        shl mpr[4] = mpr[4], nob64m1
        shl mpr[5] = mpr[5], nob64m1
        shl mpr[6] = mpr[6], nob64m1
        shl mpr[7] = mpr[7], nob64m1
        shl mpr[8] = mpr[8], nob64m1
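
        // Editor's note: the shifts above prepared both halves of each
        // center row; the .explicit bundles below OR them together, i.e.
        // in C (sketch):
        //
        //     center = (lo >> nob1) | (hi << (64 - nob1));
        //
        // The ref0 right-shifts are interleaved into the same bundles,
        // apparently to fill the otherwise empty issue slots.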
        ;;
.explicit
{.mii
        or ref1a[0] = ref1a[0], mpr[0]
        shr.u ref0a[0] = ref0a[0], nob02
        shr.u ref0a[1] = ref0a[1], nob02
}
{.mmi
        or ref1a[1] = ref1a[1], mpr[1]
        or ref1a[2] = ref1a[2], mpr[2]
        shr.u ref0a[2] = ref0a[2], nob02
}
{.mii
        or ref1a[3] = ref1a[3], mpr[3]
        shr.u ref0a[3] = ref0a[3], nob02
        shr.u ref0a[4] = ref0a[4], nob02
}
{.mmi
        or ref1a[4] = ref1a[4], mpr[4]
        or ref1a[5] = ref1a[5], mpr[5]
        shr.u ref0a[5] = ref0a[5], nob02
}
{.mii
        or ref1a[6] = ref1a[6], mpr[6]
        shr.u ref0a[6] = ref0a[6], nob02
        shr.u ref0a[7] = ref0a[7], nob02
}
{.mii
        or ref1a[7] = ref1a[7], mpr[7]
        or ref1a[8] = ref1a[8], mpr[8]
        shr.u ref0a[8] = ref0a[8], nob02
}
.default
        // ref1a[] now contains center position values
        // mpr[] not used any more

        // Align ref0 left

        ;;
        shl mpr[0] = ref0b[0], nob56m02
        shl mpr[1] = ref0b[1], nob56m02
        shl mpr[2] = ref0b[2], nob56m02
        shl mpr[3] = ref0b[3], nob56m02
        shl mpr[4] = ref0b[4], nob56m02
        shl mpr[5] = ref0b[5], nob56m02
        shl mpr[6] = ref0b[6], nob56m02
        shl mpr[7] = ref0b[7], nob56m02
        shl mpr[8] = ref0b[8], nob56m02

        shl ref0b[0] = ref0b[0], nob64m02
        shl ref0b[1] = ref0b[1], nob64m02
        shl ref0b[2] = ref0b[2], nob64m02
        shl ref0b[3] = ref0b[3], nob64m02
        shl ref0b[4] = ref0b[4], nob64m02
        shl ref0b[5] = ref0b[5], nob64m02
        shl ref0b[6] = ref0b[6], nob64m02
        shl ref0b[7] = ref0b[7], nob64m02
        shl ref0b[8] = ref0b[8], nob64m02
        ;;
        or ref0a[0] = ref0a[0], ref0b[0]
        or ref0a[1] = ref0a[1], ref0b[1]
        or ref0a[2] = ref0a[2], ref0b[2]
        or ref0a[3] = ref0a[3], ref0b[3]
        or ref0a[4] = ref0a[4], ref0b[4]
        or ref0a[5] = ref0a[5], ref0b[5]
        or ref0a[6] = ref0a[6], ref0b[6]
        or ref0a[7] = ref0a[7], ref0b[7]
        or ref0a[8] = ref0a[8], ref0b[8]
        ;;

        // ref0a[] now contains left position values
        // mpr[] contains intermediate result for right position values (former ref0a << 56 - nob02)

        // Align ref0 right

        // Shift one byte more to the right (seen as big-endian)
        shr.u ref0b[0] = ref0a[0], 8
        shr.u ref0b[1] = ref0a[1], 8
        shr.u ref0b[2] = ref0a[2], 8
        shr.u ref0b[3] = ref0a[3], 8
        shr.u ref0b[4] = ref0a[4], 8
        shr.u ref0b[5] = ref0a[5], 8
        shr.u ref0b[6] = ref0a[6], 8
        shr.u ref0b[7] = ref0a[7], 8
        shr.u ref0b[8] = ref0a[8], 8
        ;;
.explicit
{.mii
        or ref0b[0] = ref0b[0], mpr[0]
        shr.u ref2a[0] = ref2a[0], nob02
        shr.u ref2a[1] = ref2a[1], nob02
}
{.mmi
        or ref0b[1] = ref0b[1], mpr[1]
        or ref0b[2] = ref0b[2], mpr[2]
        shr.u ref2a[2] = ref2a[2], nob02
}
{.mii
        or ref0b[3] = ref0b[3], mpr[3]
        shr.u ref2a[3] = ref2a[3], nob02
        shr.u ref2a[4] = ref2a[4], nob02
}
{.mmi
        or ref0b[4] = ref0b[4], mpr[4]
        or ref0b[5] = ref0b[5], mpr[5]
        shr.u ref2a[5] = ref2a[5], nob02
}
{.mii
        or ref0b[6] = ref0b[6], mpr[6]
        shr.u ref2a[6] = ref2a[6], nob02
        shr.u ref2a[7] = ref2a[7], nob02
}
.default
        or ref0b[7] = ref0b[7], mpr[7]
        or ref0b[8] = ref0b[8], mpr[8]

        // ref0b[] now contains right position values
        // mpr[] not needed any more
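
        // Editor's note: the right-position rows are derived from the
        // already aligned left-position rows instead of being recombined
        // from scratch; one extra byte of shift moves the 8-pixel window
        // one pixel to the right (editor's reading). In C:
        //
        //     left  = (lo >> nob) | (hi << (64 - nob));
        //     right = (left >> 8) | (hi << (56 - nob));
        //
        // ref2 below gets exactly the same treatment.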

        // Align ref2 left

        ;;
        shl mpr[0] = ref2b[0], nob56m02
        shl mpr[1] = ref2b[1], nob56m02
        shl mpr[2] = ref2b[2], nob56m02
        shl mpr[3] = ref2b[3], nob56m02
        shl mpr[4] = ref2b[4], nob56m02
        shl mpr[5] = ref2b[5], nob56m02
        shl mpr[6] = ref2b[6], nob56m02
        shl mpr[7] = ref2b[7], nob56m02

        shl ref2b[0] = ref2b[0], nob64m02
        shl ref2b[1] = ref2b[1], nob64m02
        shl ref2b[2] = ref2b[2], nob64m02
        shl ref2b[3] = ref2b[3], nob64m02
        shl ref2b[4] = ref2b[4], nob64m02
        shl ref2b[5] = ref2b[5], nob64m02
        shl ref2b[6] = ref2b[6], nob64m02
        shl ref2b[7] = ref2b[7], nob64m02
        ;;
        or ref2a[0] = ref2a[0], ref2b[0]
        or ref2a[1] = ref2a[1], ref2b[1]
        or ref2a[2] = ref2a[2], ref2b[2]
        or ref2a[3] = ref2a[3], ref2b[3]
        or ref2a[4] = ref2a[4], ref2b[4]
        or ref2a[5] = ref2a[5], ref2b[5]
        or ref2a[6] = ref2a[6], ref2b[6]
        or ref2a[7] = ref2a[7], ref2b[7]
        ;;

        // ref2a[] now contains left position values
        // mpr[] contains intermediate result for right position values (former ref2a << 56 - nob02)

        // Align ref2 right

        // Shift one byte more to the right (seen as big-endian)
        shr.u ref2b[0] = ref2a[0], 8
        shr.u ref2b[1] = ref2a[1], 8
        shr.u ref2b[2] = ref2a[2], 8
        shr.u ref2b[3] = ref2a[3], 8
        shr.u ref2b[4] = ref2a[4], 8
        shr.u ref2b[5] = ref2a[5], 8
        shr.u ref2b[6] = ref2a[6], 8
        shr.u ref2b[7] = ref2a[7], 8
        ;;
        or ref2b[0] = ref2b[0], mpr[0]
        or ref2b[1] = ref2b[1], mpr[1]
        or ref2b[2] = ref2b[2], mpr[2]
        or ref2b[3] = ref2b[3], mpr[3]
        or ref2b[4] = ref2b[4], mpr[4]
        or ref2b[5] = ref2b[5], mpr[5]
        or ref2b[6] = ref2b[6], mpr[6]
        or ref2b[7] = ref2b[7], mpr[7]

        // ref2b[] now contains right position values
        // mpr[] not needed any more


        // Let's SAD

        // Left top corner

        sub dx = backupX, dx
        psad1 mpr[0] = cur[0], ref0a[0]
        psad1 mpr[1] = cur[1], ref0a[1]

        sub dy = backupY, dy
        psad1 mpr[2] = cur[2], ref0a[2]
        psad1 mpr[3] = cur[3], ref0a[3]
        psad1 mpr[4] = cur[4], ref0a[4]
        psad1 mpr[5] = cur[5], ref0a[5]
        psad1 mpr[6] = cur[6], ref0a[6]
        psad1 mpr[7] = cur[7], ref0a[7]
        ;;
.include "../../src/motion/ia64_asm/calc_delta_1.s"

        // Top edge

        psad1 mpr[0] = cur[0], ref1a[0]
        psad1 mpr[1] = cur[1], ref1a[1]
        psad1 mpr[2] = cur[2], ref1a[2]
        psad1 mpr[3] = cur[3], ref1a[3]
        psad1 mpr[4] = cur[4], ref1a[4]

        add dx = 1, dx
        psad1 mpr[5] = cur[5], ref1a[5]
        psad1 mpr[6] = cur[6], ref1a[6]

        psad1 mpr[7] = cur[7], ref1a[7]
        ;;

.include "../../src/motion/ia64_asm/calc_delta_2.s"
(lt)    cmp.lt.unc fb, p0 = mpr[8], iMinSAD
.include "../../src/motion/ia64_asm/calc_delta_3.s"

        // Right top corner

        psad1 mpr[0] = cur[0], ref0b[0]
        psad1 mpr[1] = cur[1], ref0b[1]
        psad1 mpr[2] = cur[2], ref0b[2]
        psad1 mpr[3] = cur[3], ref0b[3]
        psad1 mpr[4] = cur[4], ref0b[4]

        add backupX = 1, backupX
        psad1 mpr[5] = cur[5], ref0b[5]
        psad1 mpr[6] = cur[6], ref0b[6]

        add dx = 1, dx
        psad1 mpr[7] = cur[7], ref0b[7]
        ;;

.include "../../src/motion/ia64_asm/calc_delta_1.s"
(t)     cmp.lt.unc fb, p0 = iSAD, iMinSAD
        ;;
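
        // Editor's model of the pattern repeated for every candidate:
        // psad1 computes the byte-wise sum of absolute differences of two
        // 8-byte registers, so the eight psad1 results per position add up
        // to the block SAD (the summation and the MV-cost addition live in
        // the included calc_delta parts). Illustrative C:
        //
        //     uint32_t sad8_rows(const uint64_t cur[8], const uint64_t ref[8])
        //     {
        //         uint32_t sad = 0;
        //         for (int row = 0; row < 8; row++)
        //             for (int i = 0; i < 8; i++) {
        //                 int c = (cur[row] >> (8 * i)) & 0xff;
        //                 int r = (ref[row] >> (8 * i)) & 0xff;
        //                 sad += (c > r) ? (c - r) : (r - c);
        //             }
        //         return sad;
        //     }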

        // Left edge

(fb)    mov iMinSAD = iSAD
        psad1 mpr[0] = cur[0], ref2a[0]

(fb)    mov currX = backupX
        psad1 mpr[1] = cur[1], ref2a[1]
        psad1 mpr[2] = cur[2], ref2a[2]

(fb)    mov currY = backupY
        psad1 mpr[3] = cur[3], ref2a[3]
        psad1 mpr[4] = cur[4], ref2a[4]

        add backupX = 1, backupX
        psad1 mpr[5] = cur[5], ref2a[5]
        psad1 mpr[6] = cur[6], ref2a[6]

        psad1 mpr[7] = cur[7], ref2a[7]

        add dx = -2, dx
        add dy = 1, dy
        ;;

.include "../../src/motion/ia64_asm/calc_delta_2.s"
(rt)    cmp.lt.unc fb, p0 = mpr[8], iMinSAD
.include "../../src/motion/ia64_asm/calc_delta_3.s"

        // Right edge

        psad1 mpr[0] = cur[0], ref2b[0]
        psad1 mpr[1] = cur[1], ref2b[1]
        psad1 mpr[2] = cur[2], ref2b[2]
        psad1 mpr[3] = cur[3], ref2b[3]
        psad1 mpr[4] = cur[4], ref2b[4]

        add backupX = -2, backupX
        psad1 mpr[5] = cur[5], ref2b[5]
        psad1 mpr[6] = cur[6], ref2b[6]

        add backupY = 1, backupY
        add dx = 2, dx
        psad1 mpr[7] = cur[7], ref2b[7]
        ;;

.include "../../src/motion/ia64_asm/calc_delta_1.s"
(l)     cmp.lt.unc fb, p0 = iSAD, iMinSAD
        ;;

        // Left bottom corner

(fb)    mov iMinSAD = iSAD
        psad1 mpr[0] = cur[0], ref0a[1]

(fb)    mov currX = backupX
        psad1 mpr[1] = cur[1], ref0a[2]
        psad1 mpr[2] = cur[2], ref0a[3]

(fb)    mov currY = backupY
        psad1 mpr[3] = cur[3], ref0a[4]
        psad1 mpr[4] = cur[4], ref0a[5]

        add backupX = 2, backupX
        psad1 mpr[5] = cur[5], ref0a[6]
        psad1 mpr[6] = cur[6], ref0a[7]

        psad1 mpr[7] = cur[7], ref0a[8]

        add dx = -2, dx
        add dy = 1, dy
        ;;

.include "../../src/motion/ia64_asm/calc_delta_2.s"
(r)     cmp.lt.unc fb, p0 = mpr[8], iMinSAD
.include "../../src/motion/ia64_asm/calc_delta_3.s"

        // Bottom edge

        psad1 mpr[0] = cur[0], ref1a[1]
        psad1 mpr[1] = cur[1], ref1a[2]
        psad1 mpr[2] = cur[2], ref1a[3]
        psad1 mpr[3] = cur[3], ref1a[4]
        psad1 mpr[4] = cur[4], ref1a[5]

        add backupX = -2, backupX
        psad1 mpr[5] = cur[5], ref1a[6]
        psad1 mpr[6] = cur[6], ref1a[7]

        add backupY = 1, backupY
        add dx = 1, dx
        psad1 mpr[7] = cur[7], ref1a[8]
        ;;

.include "../../src/motion/ia64_asm/calc_delta_1.s"
(lb)    cmp.lt.unc fb, p0 = iSAD, iMinSAD
        ;;

        // Right bottom corner

(fb)    mov iMinSAD = iSAD
        psad1 mpr[0] = cur[0], ref0b[1]

(fb)    mov currX = backupX
        psad1 mpr[1] = cur[1], ref0b[2]
        psad1 mpr[2] = cur[2], ref0b[3]

(fb)    mov currY = backupY
        psad1 mpr[3] = cur[3], ref0b[4]
        psad1 mpr[4] = cur[4], ref0b[5]

        add backupX = 1, backupX
        psad1 mpr[5] = cur[5], ref0b[6]
        psad1 mpr[6] = cur[6], ref0b[7]

        add dx = 1, dx
        psad1 mpr[7] = cur[7], ref0b[8]
        ;;

.include "../../src/motion/ia64_asm/calc_delta_2.s"
(b)     cmp.lt.unc fb, p0 = mpr[8], iMinSAD
.include "../../src/motion/ia64_asm/calc_delta_3.s"

(rb)    getf.sig ret0 = fmv
        add backupX = 1, backupX
        ;;
(rb)    add iSAD = iSAD, ret0
        ;;
(rb)    cmp.lt.unc fb, p0 = iSAD, iMinSAD
        ;;
(fb)    mov iMinSAD = iSAD
(fb)    mov currX = backupX
(fb)    mov currY = backupY
        ;;

        // Write back result

        st4 [currMV] = currX
        st4 [currYAddress] = currY
        mov ret0 = iMinSAD

        // Restore important registers

        ;;
        mov pr = prsave, -1
        mov ar.pfs = pfs
        br.ret.sptk.many b0

        .endp Halfpel8_Refine_ia64#