@
@ ARMv4-optimized halfpel functions
@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
@
@ This file is part of FFmpeg.
@
@ FFmpeg is free software; you can redistribute it and/or
@ modify it under the terms of the GNU Lesser General Public
@ License as published by the Free Software Foundation; either
@ version 2.1 of the License, or (at your option) any later version.
@
@ FFmpeg is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
@ Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public
@ License along with FFmpeg; if not, write to the Free Software
@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@

#include "config.h"
#include "libavutil/arm/asm.S"

@ When HAVE_ARMV5TE_EXTERNAL is zero the PLD mnemonic is redefined as the
@ comment character, so every preload hint below becomes an empty comment.
#if !HAVE_ARMV5TE_EXTERNAL
#define pld @
#endif

@ ----------------------------------------------------------------
@ ALIGN_QWORD_D shift, Rd0..Rd3, Rn0..Rn4
@   For i = 0..3:  Rd[i] = (Rn[i] >> shift*8) | (Rn[i+1] << (32 - shift*8))
@   Assuming little-endian byte order this yields the four words that
@   start shift bytes into the five source words.
@   The sources are left untouched as long as no Rd aliases an Rn.
@   Used in this file with shift = 1, 2 or 3.
.macro  ALIGN_QWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
        mov             \Rd0, \Rn0, lsr #(\shift * 8)
        mov             \Rd1, \Rn1, lsr #(\shift * 8)
        mov             \Rd2, \Rn2, lsr #(\shift * 8)
        mov             \Rd3, \Rn3, lsr #(\shift * 8)
        orr             \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)
        orr             \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
        orr             \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
        orr             \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
.endm

@ ALIGN_DWORD shift, Rlo, Rhi, Rnext
@   In-place two-word variant:
@     Rlo = (Rlo >> shift*8) | (Rhi   << (32 - shift*8))
@     Rhi = (Rhi >> shift*8) | (Rnext << (32 - shift*8))
@   Rnext is only read.
.macro  ALIGN_DWORD shift, Rlo, Rhi, Rnext
        mov             \Rlo, \Rlo, lsr #(\shift * 8)
        orr             \Rlo, \Rlo, \Rhi, lsl #(32 - \shift * 8)
        mov             \Rhi, \Rhi, lsr #(\shift * 8)
        orr             \Rhi, \Rhi, \Rnext, lsl #(32 - \shift * 8)
.endm

@ ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
@   Two-word variant writing to separate destinations:
@     Rdst0 = (Rsrc0 >> shift*8) | (Rsrc1 << (32 - shift*8))
@     Rdst1 = (Rsrc1 >> shift*8) | (Rsrc2 << (32 - shift*8))
@   Rsrc0 is consumed by the first instruction, before Rdst1 is written,
@   so Rdst1 may name the same register as Rsrc0.
.macro  ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
        mov             \Rdst0, \Rsrc0, lsr #(\shift * 8)
        mov             \Rdst1, \Rsrc1, lsr #(\shift * 8)
        orr             \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
        orr             \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
.endm

@ RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
@   Byte-wise average, rounding up, of two pairs of packed words.
@   Rd0/Rd1 receive the result, Rn0/Rn1 are overwritten with Rn | Rm,
@   Rm0/Rm1 are only read and Rmask has to hold 0xFEFEFEFE.
.macro  RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
        @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
        @ Rmask = 0xFEFEFEFE
        @ Rn = destroyed
        eor             \Rd0, \Rn0, \Rm0
        eor             \Rd1, \Rn1, \Rm1
        orr             \Rn0, \Rn0, \Rm0
        orr             \Rn1, \Rn1, \Rm1
        and             \Rd0, \Rd0, \Rmask
        and             \Rd1, \Rd1, \Rmask
        sub             \Rd0, \Rn0, \Rd0, lsr #1
        sub             \Rd1, \Rn1, \Rd1, lsr #1
.endm

@ NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
@   Byte-wise average, rounding down, of two pairs of packed words.
@   Same register contract as RND_AVG32, except that Rn0/Rn1 are
@   overwritten with Rn & Rm.
.macro  NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
        @ Rd = (Rn & Rm) + (((Rn ^ Rm) & ~0x01010101) >> 1)
        @ Rmask = 0xFEFEFEFE
        @ Rn = destroyed
        eor             \Rd0, \Rn0, \Rm0
        eor             \Rd1, \Rn1, \Rm1
        and             \Rn0, \Rn0, \Rm0
        and             \Rn1, \Rn1, \Rm1
        and             \Rd0, \Rd0, \Rmask
        and             \Rd1, \Rd1, \Rmask
        add             \Rd0, \Rn0, \Rd0, lsr #1
        add             \Rd1, \Rn1, \Rd1, lsr #1
.endm

@ JMP_ALIGN Rmis, Rptr
@   Rounds Rptr down to a multiple of 4 and branches on the two bits that
@   were cleared: 0 -> 1f, 1 -> 2f, 2 -> 3f, 3 -> 4f.
@   The invoking code has to provide the local labels 1, 2, 3 and 4 after
@   the macro.  Rmis and the condition flags are clobbered.
.macro  JMP_ALIGN Rmis, Rptr
        ands            \Rmis, \Rptr, #3
        bic             \Rptr, \Rptr, #3
        beq             1f
        subs            \Rmis, \Rmis, #1
        beq             2f
        subs            \Rmis, \Rmis, #1
        beq             3f
        b               4f
.endm

@ ----------------------------------------------------------------
@ void func(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@ block = word aligned, pixels = unaligned
@
@ Copies 16 bytes per row from pixels to block.
@   r0 = block      store pointer, advanced by r2 after every row
@   r1 = pixels     load pointer, word aligned by JMP_ALIGN, advanced by r2
@   r2 = line_size
@   r3 = h          row counter; it is tested after the first row has been
@                   written, so h is expected to be at least 1
@ One loop exists per source misalignment (labels 1 to 4).  The misaligned
@ loops load five words from the aligned-down address and extract the 16
@ wanted bytes with ALIGN_QWORD_D.  r12 is used as scratch without being
@ saved.
function ff_put_pixels16_arm, export=1, align=5
        pld             [r1]
        push            {r4-r11, lr}
        JMP_ALIGN       r5,  r1
1:                                              @ pixels word aligned
        ldm             r1,  {r4-r7}
        add             r1,  r1,  r2
        stm             r0,  {r4-r7}
        pld             [r1]
        subs            r3,  r3,  #1
        add             r0,  r0,  r2
        bne             1b
        pop             {r4-r11, pc}            @ restore and return
        .align 5
2:                                              @ pixels one byte past a word
        ldm             r1,  {r4-r8}
        add             r1,  r1,  r2
        ALIGN_QWORD_D   1,   r9,  r10, r11, r12, r4,  r5,  r6,  r7,  r8
        pld             [r1]
        subs            r3,  r3,  #1
        stm             r0,  {r9-r12}
        add             r0,  r0,  r2
        bne             2b
        pop             {r4-r11, pc}
        .align 5
3:                                              @ pixels two bytes past a word
        ldm             r1,  {r4-r8}
        add             r1,  r1,  r2
        ALIGN_QWORD_D   2,   r9,  r10, r11, r12, r4,  r5,  r6,  r7,  r8
        pld             [r1]
        subs            r3,  r3,  #1
        stm             r0,  {r9-r12}
        add             r0,  r0,  r2
        bne             3b
        pop             {r4-r11, pc}
        .align 5
4:                                              @ pixels three bytes past a word
        ldm             r1,  {r4-r8}
        add             r1,  r1,  r2
        ALIGN_QWORD_D   3,   r9,  r10, r11, r12, r4,  r5,  r6,  r7,  r8
        pld             [r1]
        subs            r3,  r3,  #1
        stm             r0,  {r9-r12}
        add             r0,  r0,  r2
        bne             4b
        pop             {r4-r11,pc}
endfunc

@ ----------------------------------------------------------------
@ void func(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@ block = word aligned, pixels = unaligned
@
@ Copies 8 bytes per row from pixels to block.
@   r0 = block, r1 = pixels, r2 = line_size, r3 = h (row counter, tested
@   after the first row, so h is expected to be at least 1).
@ Labels 1 to 4 are the loops for source misalignment 0 to 3, selected by
@ JMP_ALIGN.  The misaligned loops load three words and realign them in
@ place with ALIGN_DWORD; r12 holds the third word and is not saved.
function ff_put_pixels8_arm, export=1, align=5
        pld             [r1]
        push            {r4-r5,lr}
        JMP_ALIGN       r5,  r1
1:
        ldm             r1,  {r4-r5}
        add             r1,  r1,  r2
        subs            r3,  r3,  #1
        pld             [r1]
        stm             r0,  {r4-r5}
        add             r0,  r0,  r2
        bne             1b
        pop             {r4-r5,pc}
        .align 5
2:
        ldm             r1,  {r4-r5, r12}
        add             r1,  r1,  r2
        ALIGN_DWORD     1,   r4,  r5,  r12
        pld             [r1]
        subs            r3,  r3,  #1
        stm             r0,  {r4-r5}
        add             r0,  r0,  r2
        bne             2b
        pop             {r4-r5,pc}
        .align 5
3:
        ldm             r1,  {r4-r5, r12}
        add             r1,  r1,  r2
        ALIGN_DWORD     2,   r4,  r5,  r12
        pld             [r1]
        subs            r3,  r3,  #1
        stm             r0,  {r4-r5}
        add             r0,  r0,  r2
        bne             3b
        pop             {r4-r5,pc}
        .align 5
4:
        ldm             r1,  {r4-r5, r12}
        add             r1,  r1,  r2
        ALIGN_DWORD     3,   r4,  r5,  r12
        pld             [r1]
        subs            r3,  r3,  #1
        stm             r0,  {r4-r5}
        add             r0,  r0,  r2
        bne             4b
        pop             {r4-r5,pc}
endfunc

@ ----------------------------------------------------------------
@ void func(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@ block = word aligned, pixels = unaligned
@
@ Horizontal half-pel: every output byte is the rounded-up average of a
@ source byte and the byte that follows it (RND_AVG32), 8 bytes per row.
@   r0 = block, r1 = pixels, r2 = line_size, r3 = h (tested after the
@   first row), r12 = 0xfefefefe mask for RND_AVG32.
@ Each alignment path loads three words and builds two 8-byte operands,
@ one starting at the requested byte and one starting a byte later.
function ff_put_pixels8_x2_arm, export=1, align=5
        pld             [r1]
        push            {r4-r10,lr}
        ldr             r12, =0xfefefefe
        JMP_ALIGN       r5,  r1
1:                                              @ misalignment 0: r4,r5 and shift-1 copy
        ldm             r1,  {r4-r5, r10}
        add             r1,  r1,  r2
        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
        pld             [r1]
        RND_AVG32       r8,  r9,  r4,  r5,  r6,  r7,  r12
        subs            r3,  r3,  #1
        stm             r0,  {r8-r9}
        add             r0,  r0,  r2
        bne             1b
        pop             {r4-r10,pc}
        .align 5
2:                                              @ misalignment 1: shift-1 and shift-2 copies
        ldm             r1,  {r4-r5, r10}
        add             r1,  r1,  r2
        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
        ALIGN_DWORD_D   2,   r8,  r9,  r4,  r5,  r10
        pld             [r1]
        RND_AVG32       r4,  r5,  r6,  r7,  r8,  r9,  r12
        subs            r3,  r3,  #1
        stm             r0,  {r4-r5}
        add             r0,  r0,  r2
        bne             2b
        pop             {r4-r10,pc}
        .align 5
3:                                              @ misalignment 2: shift-2 and shift-3 copies
        ldm             r1,  {r4-r5, r10}
        add             r1,  r1,  r2
        ALIGN_DWORD_D   2,   r6,  r7,  r4,  r5,  r10
        ALIGN_DWORD_D   3,   r8,  r9,  r4,  r5,  r10
        pld             [r1]
        RND_AVG32       r4,  r5,  r6,  r7,  r8,  r9,  r12
        subs            r3,  r3,  #1
        stm             r0,  {r4-r5}
        add             r0,  r0,  r2
        bne             3b
        pop             {r4-r10,pc}
        .align 5
4:                                              @ misalignment 3: shift-3 copy and r5,r10 as loaded
        ldm             r1,  {r4-r5, r10}
        add             r1,  r1,  r2
        ALIGN_DWORD_D   3,   r6,  r7,  r4,  r5,  r10
        pld             [r1]
        RND_AVG32       r8,  r9,  r6,  r7,  r5,  r10, r12
        subs            r3,  r3,  #1
        stm             r0,  {r8-r9}
        add             r0,  r0,  r2
        bne             4b
        pop             {r4-r10,pc}
endfunc

@ void func(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@ block = word aligned, pixels = unaligned
@
@ Same structure as ff_put_pixels8_x2_arm, but the two operands are
@ combined with NO_RND_AVG32, i.e. the byte averages round down.
function ff_put_no_rnd_pixels8_x2_arm, export=1, align=5
        pld             [r1]
        push            {r4-r10,lr}
        ldr             r12, =0xfefefefe
        JMP_ALIGN       r5,  r1
1:
        ldm             r1,  {r4-r5, r10}
        add             r1,  r1,  r2
        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
        pld             [r1]
        NO_RND_AVG32    r8,  r9,  r4,  r5,  r6,  r7,  r12
        subs            r3,  r3,  #1
        stm             r0,  {r8-r9}
        add             r0,  r0,  r2
        bne             1b
        pop             {r4-r10,pc}
        .align 5
2:
        ldm             r1,  {r4-r5, r10}
        add             r1,  r1,  r2
        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
        ALIGN_DWORD_D   2,   r8,  r9,  r4,  r5,  r10
        pld             [r1]
        NO_RND_AVG32    r4,  r5,  r6,  r7,  r8,  r9,  r12
        subs            r3,  r3,  #1
        stm             r0,  {r4-r5}
        add             r0,  r0,  r2
        bne             2b
        pop             {r4-r10,pc}
        .align 5
3:
        ldm             r1,  {r4-r5, r10}
        add             r1,  r1,  r2
        ALIGN_DWORD_D   2,   r6,  r7,  r4,  r5,  r10
        ALIGN_DWORD_D   3,   r8,  r9,  r4,  r5,  r10
        pld             [r1]
        NO_RND_AVG32    r4,  r5,  r6,  r7,  r8,  r9,  r12
        subs            r3,  r3,  #1
        stm             r0,  {r4-r5}
        add             r0,  r0,  r2
        bne             3b
        pop             {r4-r10,pc}
        .align 5
4:
        ldm             r1,  {r4-r5, r10}
        add             r1,  r1,  r2
        ALIGN_DWORD_D   3,   r6,  r7,  r4,  r5,  r10
        pld             [r1]
        NO_RND_AVG32    r8,  r9,  r6,  r7,  r5,  r10, r12
        subs            r3,  r3,  #1
        stm             r0,  {r8-r9}
        add             r0,  r0,  r2
        bne             4b
        pop             {r4-r10,pc}
endfunc


@ ----------------------------------------------------------------
@ void func(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@ block = word aligned, pixels = unaligned
@
@ Vertical half-pel: every output row is the rounded-up byte average
@ (RND_AVG32) of a source row and the row below it, 8 bytes per row.
@   r0 = block, r1 = pixels, r2 = line_size, r12 = 0xfefefefe mask.
@   r3 = h >> 1: each pass through the loop at label 6 stores two rows,
@   so an odd h produces h - 1 rows, and the counter is tested only after
@   the first pass (h is expected to be at least 2).
@ The first source row is loaded ahead of the loop; inside the loop the two
@ register sets alternate between current row and next row so each
@ source row is loaded once.  Label 6 is reused in every alignment path
@ and 6b always refers to the closest preceding definition.
function ff_put_pixels8_y2_arm, export=1, align=5
        pld             [r1]
        push            {r4-r11,lr}
        mov             r3,  r3,  lsr #1
        ldr             r12, =0xfefefefe
        JMP_ALIGN       r5,  r1
1:
        ldm             r1,  {r4-r5}
        add             r1,  r1,  r2
6:
        ldm             r1,  {r6-r7}
        add             r1,  r1,  r2
        pld             [r1]
        RND_AVG32       r8,  r9,  r4,  r5,  r6,  r7,  r12
        ldm             r1,  {r4-r5}
        add             r1,  r1,  r2
        stm             r0,  {r8-r9}
        add             r0,  r0,  r2
        pld             [r1]
        RND_AVG32       r8,  r9,  r6,  r7,  r4,  r5,  r12
        subs            r3,  r3,  #1
        stm             r0,  {r8-r9}
        add             r0,  r0,  r2
        bne             6b
        pop             {r4-r11,pc}
        .align 5
2:
        ldm             r1,  {r4-r6}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     1,   r4,  r5,  r6
6:
        ldm             r1,  {r7-r9}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     1,   r7,  r8,  r9
        RND_AVG32       r10, r11, r4,  r5,  r7,  r8,  r12
        stm             r0,  {r10-r11}
        add             r0,  r0,  r2
        ldm             r1,  {r4-r6}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     1,   r4,  r5,  r6
        subs            r3,  r3,  #1
        RND_AVG32       r10, r11, r7,  r8,  r4,  r5,  r12
        stm             r0,  {r10-r11}
        add             r0,  r0,  r2
        bne             6b
        pop             {r4-r11,pc}
        .align 5
3:
        ldm             r1,  {r4-r6}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     2,   r4,  r5,  r6
6:
        ldm             r1,  {r7-r9}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     2,   r7,  r8,  r9
        RND_AVG32       r10, r11, r4,  r5,  r7,  r8,  r12
        stm             r0,  {r10-r11}
        add             r0,  r0,  r2
        ldm             r1,  {r4-r6}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     2,   r4,  r5,  r6
        subs            r3,  r3,  #1
        RND_AVG32       r10, r11, r7,  r8,  r4,  r5,  r12
        stm             r0,  {r10-r11}
        add             r0,  r0,  r2
        bne             6b
        pop             {r4-r11,pc}
        .align 5
4:
        ldm             r1,  {r4-r6}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     3,   r4,  r5,  r6
6:
        ldm             r1,  {r7-r9}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     3,   r7,  r8,  r9
        RND_AVG32       r10, r11, r4,  r5,  r7,  r8,  r12
        stm             r0,  {r10-r11}
        add             r0,  r0,  r2
        ldm             r1,  {r4-r6}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     3,   r4,  r5,  r6
        subs            r3,  r3,  #1
        RND_AVG32       r10, r11, r7,  r8,  r4,  r5,  r12
        stm             r0,  {r10-r11}
        add             r0,  r0,  r2
        bne             6b
        pop             {r4-r11,pc}
endfunc

@ void func(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@ block = word aligned, pixels = unaligned
@
@ Same structure as ff_put_pixels8_y2_arm, but rows are combined with
@ NO_RND_AVG32, i.e. the byte averages round down.
function ff_put_no_rnd_pixels8_y2_arm, export=1, align=5
        pld             [r1]
        push            {r4-r11,lr}
        mov             r3,  r3,  lsr #1
        ldr             r12, =0xfefefefe
        JMP_ALIGN       r5,  r1
1:
        ldm             r1,  {r4-r5}
        add             r1,  r1,  r2
6:
        ldm             r1,  {r6-r7}
        add             r1,  r1,  r2
        pld             [r1]
        NO_RND_AVG32    r8,  r9,  r4,  r5,  r6,  r7,  r12
        ldm             r1,  {r4-r5}
        add             r1,  r1,  r2
        stm             r0,  {r8-r9}
        add             r0,  r0,  r2
        pld             [r1]
        NO_RND_AVG32    r8,  r9,  r6,  r7,  r4,  r5,  r12
        subs            r3,  r3,  #1
        stm             r0,  {r8-r9}
        add             r0,  r0,  r2
        bne             6b
        pop             {r4-r11,pc}
        .align 5
2:
        ldm             r1,  {r4-r6}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     1,   r4,  r5,  r6
6:
        ldm             r1,  {r7-r9}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     1,   r7,  r8,  r9
        NO_RND_AVG32    r10, r11, r4,  r5,  r7,  r8,  r12
        stm             r0,  {r10-r11}
        add             r0,  r0,  r2
        ldm             r1,  {r4-r6}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     1,   r4,  r5,  r6
        subs            r3,  r3,  #1
        NO_RND_AVG32    r10, r11, r7,  r8,  r4,  r5,  r12
        stm             r0,  {r10-r11}
        add             r0,  r0,  r2
        bne             6b
        pop             {r4-r11,pc}
        .align 5
3:
        ldm             r1,  {r4-r6}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     2,   r4,  r5,  r6
6:
        ldm             r1,  {r7-r9}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     2,   r7,  r8,  r9
        NO_RND_AVG32    r10, r11, r4,  r5,  r7,  r8,  r12
        stm             r0,  {r10-r11}
        add             r0,  r0,  r2
        ldm             r1,  {r4-r6}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     2,   r4,  r5,  r6
        subs            r3,  r3,  #1
        NO_RND_AVG32    r10, r11, r7,  r8,  r4,  r5,  r12
        stm             r0,  {r10-r11}
        add             r0,  r0,  r2
        bne             6b
        pop             {r4-r11,pc}
        .align 5
4:
        ldm             r1,  {r4-r6}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     3,   r4,  r5,  r6
6:
        ldm             r1,  {r7-r9}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     3,   r7,  r8,  r9
        NO_RND_AVG32    r10, r11, r4,  r5,  r7,  r8,  r12
        stm             r0,  {r10-r11}
        add             r0,  r0,  r2
        ldm             r1,  {r4-r6}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     3,   r4,  r5,  r6
        subs            r3,  r3,  #1
        NO_RND_AVG32    r10, r11, r7,  r8,  r4,  r5,  r12
        stm             r0,  {r10-r11}
        add             r0,  r0,  r2
        bne             6b
        pop             {r4-r11,pc}
endfunc

        @ Emit the literal pool for the constant loads above at this point.
        .ltorg

@ ----------------------------------------------------------------
@ RND_XY2_IT align, rnd
@   Handles one source row for the diagonal (xy2) filter.
@   align = misalignment (0..3) of the original pixels pointer; r1 must
@           already be rounded down to a word boundary.
@   rnd   = shift mnemonic used to derive the rounding constant from
@           0x03030303: lsl gives 0x02020202, lsr gives 0x01010101.
@   Steps:
@     - load three words from r1, advance r1 by r2, preload the next row
@     - build the operands a and b (8 bytes starting at the requested
@       byte and 8 bytes starting one byte later) in r4,r5 and r6,r7
@     - r8,r9   = (a & 0x03030303) + (b & 0x03030303), plus the rounding
@                 constant when r3 is even on entry
@     - r10,r11 = ((a >> 2) & 0x3f3f3f3f) + ((b >> 2) & 0x3f3f3f3f)
@     - r3 is decremented last and the flags reflect that subtraction
@   Clobbers r4-r12 and r14.  The IT instructions cover the conditional
@   instructions that follow them (presumably required only when this is
@   assembled as Thumb-2 -- TODO confirm against asm.S).
.macro  RND_XY2_IT align, rnd
        @ l1=  (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202)
        @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
.if \align == 0
        ldm             r1,  {r6-r8}
.elseif \align == 3
        ldm             r1,  {r5-r7}
.else
        ldm             r1,  {r8-r10}
.endif
        add             r1,  r1,  r2
        pld             [r1]
.if \align == 0
        ALIGN_DWORD_D   1,   r4,  r5,  r6,  r7,  r8
.elseif \align == 1
        ALIGN_DWORD_D   1,   r4,  r5,  r8,  r9,  r10
        ALIGN_DWORD_D   2,   r6,  r7,  r8,  r9,  r10
.elseif \align == 2
        ALIGN_DWORD_D   2,   r4,  r5,  r8,  r9,  r10
        ALIGN_DWORD_D   3,   r6,  r7,  r8,  r9,  r10
.elseif \align == 3
        ALIGN_DWORD_D   3,   r4,  r5,  r5,  r6,  r7
.endif
        ldr             r14, =0x03030303
        tst             r3,  #1
        and             r8,  r4,  r14
        and             r9,  r5,  r14
        and             r10, r6,  r14
        and             r11, r7,  r14
        it              eq
        andeq           r14, r14, r14, \rnd #1
        add             r8,  r8,  r10
        add             r9,  r9,  r11
        ldr             r12, =0xfcfcfcfc >> 2
        itt             eq
        addeq           r8,  r8,  r14
        addeq           r9,  r9,  r14
        and             r4,  r12, r4,  lsr #2
        and             r5,  r12, r5,  lsr #2
        and             r6,  r12, r6,  lsr #2
        and             r7,  r12, r7,  lsr #2
        add             r10, r4,  r6
        add             r11, r5,  r7
        subs            r3,  r3,  #1
.endm

@ RND_XY2_EXPAND align, rnd
@   Complete row loop of the xy2 filter for one alignment, including the
@   function return.  The first source row is processed ahead of the loop.
@   Every pass through label 6 then:
@     - pushes the partial sums of the previous row (r8-r11)
@     - runs RND_XY2_IT on the next row
@     - pops the previous sums into r4-r7 and adds the new ones
@     - stores ((low sums >> 2) & 0x0f0f0f0f) + high sums as 8 bytes
@   None of the instructions after RND_XY2_IT set the flags, so bge still
@   sees the result of the counter decrement done inside the macro and
@   the loop repeats while r3 is not negative.
@   The final pop matches the push {r4-r11,lr} done by the invoking
@   function and returns by loading pc.
.macro RND_XY2_EXPAND align, rnd
        RND_XY2_IT      \align, \rnd
6:
        push            {r8-r11}
        RND_XY2_IT      \align, \rnd
        pop             {r4-r7}
        add             r4,  r4,  r8
        add             r5,  r5,  r9
        ldr             r14, =0x0f0f0f0f
        add             r6,  r6,  r10
        add             r7,  r7,  r11
        and             r4,  r14, r4,  lsr #2
        and             r5,  r14, r5,  lsr #2
        add             r4,  r4,  r6
        add             r5,  r5,  r7
        stm             r0,  {r4-r5}
        add             r0,  r0,  r2
        bge             6b
        pop             {r4-r11,pc}
.endm

@ void func(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@ block = word aligned, pixels = unaligned
@
@ Diagonal half-pel with rounding: JMP_ALIGN selects one of four
@ expansions of RND_XY2_EXPAND (labels 1 to 4, misalignment 0 to 3), each
@ of which contains its own loop and return.  The lsl argument selects the
@ rounding constant 0x02020202.
@   r0 = block, r1 = pixels, r2 = line_size, r3 = h
function ff_put_pixels8_xy2_arm, export=1, align=5
        pld             [r1]
        push            {r4-r11,lr} @ R14 is also called LR
        JMP_ALIGN       r5,  r1
1:
        RND_XY2_EXPAND  0, lsl
        .align 5
2:
        RND_XY2_EXPAND  1, lsl
        .align 5
3:
        RND_XY2_EXPAND  2, lsl
        .align 5
4:
        RND_XY2_EXPAND  3, lsl
endfunc

@ void func(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@ block = word aligned, pixels = unaligned
@
@ Same dispatch as ff_put_pixels8_xy2_arm, but the lsr argument selects
@ the smaller rounding constant 0x01010101.
function ff_put_no_rnd_pixels8_xy2_arm, export=1, align=5
        pld             [r1]
        push            {r4-r11,lr}
        JMP_ALIGN       r5,  r1
1:
        RND_XY2_EXPAND  0, lsr
        .align 5
2:
        RND_XY2_EXPAND  1, lsr
        .align 5
3:
        RND_XY2_EXPAND  2, lsr
        .align 5
4:
        RND_XY2_EXPAND  3, lsr
endfunc