/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
#include "neon.S"

        /* H.264 qpel MC */

/*
 * Overview (derived from the code in this file):
 *
 * Every lowpass_* macro evaluates the 6-tap filter
 *     x[0] + x[5] + 20 * (x[2] + x[3]) - 5 * (x[1] + x[4])
 * for 8 neighbouring output positions.
 *
 * Register roles used by the helpers:
 *   r0      destination pointer (all stores carry a :64 alignment hint)
 *   r1      source pointer
 *   r2, r3  strides; which one belongs to source or destination differs
 *           per helper and is stated in each header below
 *   r12     row counter (h helpers) or second-source pointer (v l2 helpers)
 *   r4      scratch buffer pointer for the hv helpers; the 16-pixel v
 *           helpers and the packed helpers also park lr in it
 *   d6      filter constants set up by lowpass_const
 *
 * function/endfunc, the A/T line prefixes, transpose_8x8, swap4 and
 * transpose16_4x4 come from the included files and are not visible here.
 */

/*
 * Load the filter constants: the register named by r receives 5 in its low
 * half-word and 20 in its high half-word and is copied to the low 32 bits
 * of d6, so the 16-bit lanes read d6[0] = 5 and d6[1] = 20.
 * Clobbers the register named by r.
 */
.macro  lowpass_const   r
        movw            \r, #5
        movt            \r, #20
        vmov.32         d6[0], \r
.endm

/*
 * Filter two rows of 8 pixels.
 *   r0:r1  16 source bytes of the first row (two D registers)
 *   r2:r3  16 source bytes of the second row
 *   d0/d1  destinations for the first/second row
 *   narrow 1: results are rounded, shifted right by 5, saturated to u8
 *             and written to the D registers d0/d1
 *          0: the unshifted 16-bit sums are left in the Q registers d0/d1
 * Needs d6 from lowpass_const.
 * Clobbers q1, q2, q9, q10, d30, d31, plus q0 and q8 when narrow is set.
 * The two rows are interleaved instruction by instruction.
 */
.macro  lowpass_8       r0, r1, r2, r3, d0, d1, narrow=1
  .if \narrow
        t0 .req q0
        t1 .req q8
  .else
        t0 .req \d0
        t1 .req \d1
  .endif
        vext.8          d2, \r0, \r1, #2
        vext.8          d3, \r0, \r1, #3
        vaddl.u8        q1, d2, d3              @ x[2] + x[3]
        vext.8          d4, \r0, \r1, #1
        vext.8          d5, \r0, \r1, #4
        vaddl.u8        q2, d4, d5              @ x[1] + x[4]
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0, \r0, d30            @ x[0] + x[5]
        vext.8          d18, \r2, \r3, #2
        vmla.i16        t0, q1, d6[1]           @ += 20 * (x[2] + x[3])
        vext.8          d19, \r2, \r3, #3
        vaddl.u8        q9, d18, d19
        vext.8          d20, \r2, \r3, #1
        vmls.i16        t0, q2, d6[0]           @ -= 5 * (x[1] + x[4])
        vext.8          d21, \r2, \r3, #4
        vaddl.u8        q10, d20, d21
        vext.8          d31, \r2, \r3, #5
        vaddl.u8        t1, \r2, d31
        vmla.i16        t1, q9, d6[1]
        vmls.i16        t1, q10, d6[0]
  .if \narrow
        vqrshrun.s16    \d0, t0, #5
        vqrshrun.s16    \d1, t1, #5
  .endif
        .unreq  t0
        .unreq  t1
.endm

/*
 * Single-row variant of lowpass_8: r0:r1 hold the 16 source bytes, d0 is
 * the destination (D register when narrow is set, Q register otherwise).
 * Clobbers q1, q2, d30, plus q0 when narrow is set.
 */
.macro  lowpass_8_1     r0, r1, d0, narrow=1
  .if \narrow
        t0 .req q0
  .else
        t0 .req \d0
  .endif
        vext.8          d2, \r0, \r1, #2
        vext.8          d3, \r0, \r1, #3
        vaddl.u8        q1, d2, d3
        vext.8          d4, \r0, \r1, #1
        vext.8          d5, \r0, \r1, #4
        vaddl.u8        q2, d4, d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0, \r0, d30
        vmla.i16        t0, q1, d6[1]
        vmls.i16        t0, q2, d6[0]
  .if \narrow
        vqrshrun.s16    \d0, t0, #5
  .endif
        .unreq  t0
.endm

/*
 * Same filter applied to signed 16-bit input, evaluated in 32 bits.
 *   r0, r1  Q registers holding 16 consecutive 16-bit samples; r1 is
 *           overwritten with the window starting at sample 5
 *   l0, h0  low and high D halves of r0
 *   l1, h1  low and high D halves of r1 (read after r1 was overwritten)
 *   d       D register receiving 8 bytes: the sums are rounded, shifted
 *           right by 10, narrowed to 16 bits and saturated to u8
 * The factors 20 and 5 are built from shifts, so d6 is not read; it is in
 * fact overwritten, as q3 is used as a temporary.
 * Clobbers q0-q3, q8-q10, q15 and the register passed as r1.
 */
.macro  lowpass_8.16    r0, r1, l0, h0, l1, h1, d
        vext.16         q1, \r0, \r1, #2
        vext.16         q0, \r0, \r1, #3
        vaddl.s16       q9, d2, d0              @ x[2] + x[3], low half
        vext.16         q2, \r0, \r1, #1
        vaddl.s16       q1, d3, d1              @ x[2] + x[3], high half
        vext.16         q3, \r0, \r1, #4
        vaddl.s16       q10, d4, d6             @ x[1] + x[4], low half
        vext.16         \r1, \r0, \r1, #5
        vaddl.s16       q2, d5, d7              @ x[1] + x[4], high half
        vaddl.s16       q0, \h0, \h1            @ x[0] + x[5], high half
        vaddl.s16       q8, \l0, \l1            @ x[0] + x[5], low half

        @ low half: q9 = 20 * q9 (16x + 4x), q10 = 5 * q10 (x + 4x)
        vshl.i32        q3, q9, #4
        vshl.i32        q9, q9, #2
        vshl.i32        q15, q10, #2
        vadd.i32        q9, q9, q3
        vadd.i32        q10, q10, q15

        @ high half: q1 = 20 * q1, q2 = 5 * q2
        vshl.i32        q3, q1, #4
        vshl.i32        q1, q1, #2
        vshl.i32        q15, q2, #2
        vadd.i32        q1, q1, q3
        vadd.i32        q2, q2, q15

        vadd.i32        q9, q9, q8
        vsub.i32        q9, q9, q10

        vadd.i32        q1, q1, q0
        vsub.i32        q1, q1, q2

        vrshrn.s32      d18, q9, #10
        vrshrn.s32      d19, q1, #10

        vqmovun.s16     \d, q9
.endm

/*
 * Horizontal filter of a 16x16 block written as two consecutive 8x16
 * column blocks (left half, then right half) with a row pitch of 8 bytes.
 * In:  r0 = dst, r1 = src, r2 = src stride
 * Clobbers r3, r4 (holds lr across the call), r12 and whatever
 * put_h264_qpel8_h_lowpass_neon clobbers.  r0 is not rewound between the
 * halves, which is what makes the output packed.
 */
function put_h264_qpel16_h_lowpass_neon_packed
        mov             r4, lr
        mov             r12, #16                @ 16 rows
        mov             r3, #8                  @ dst row pitch
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r1, r1, r2, lsl #4      @ back to the first row ...
        add             r1, r1, #8              @ ... of the right half
        mov             r12, #16
        mov             lr, r4
        b               put_h264_qpel8_h_lowpass_neon
endfunc

/*
 * Generates <type>_h264_qpel16_h_lowpass_neon and
 * <type>_h264_qpel8_h_lowpass_neon; type is put or avg.
 * In:  r0 = dst, r1 = src, r2 = src stride, r3 = dst stride,
 *      r12 = row count (8-wide function only, consumed two rows at a time),
 *      d6 = constants from lowpass_const
 * The avg flavour additionally averages (vrhadd) with the bytes already at
 * dst.
 */
.macro  h264_qpel_h_lowpass type
function \type\()_h264_qpel16_h_lowpass_neon
        push            {lr}
        mov             r12, #16
        bl              \type\()_h264_qpel8_h_lowpass_neon
        sub             r0, r0, r3, lsl #4      @ rewind 16 rows
        sub             r1, r1, r2, lsl #4
        add             r0, r0, #8              @ move to the right half
        add             r1, r1, #8
        mov             r12, #16
        pop             {lr}
        @ No return here: execution runs on into the 8-wide function
        @ defined next, which handles the right half and returns to the
        @ original caller (assumes endfunc/function emit no instructions).
endfunc

function \type\()_h264_qpel8_h_lowpass_neon
1:      vld1.8          {d0, d1}, [r1], r2
        vld1.8          {d16,d17}, [r1], r2
        subs            r12, r12, #2            @ flags survive until bne
        lowpass_8       d0, d1, d16, d17, d0, d16
  .ifc \type,avg
        vld1.8          {d2}, [r0,:64], r3
        vrhadd.u8       d0, d0, d2
        vld1.8          {d3}, [r0,:64]
        vrhadd.u8       d16, d16, d3
        sub             r0, r0, r3              @ undo the one post-increment
  .endif
        vst1.8          {d0}, [r0,:64], r3
        vst1.8          {d16}, [r0,:64], r3
        bne             1b
        bx              lr
endfunc
.endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg

/*
 * Generates the horizontal filter with a second source ("l2"): the filter
 * output is averaged (vrhadd) with the block read from r3.
 * In:  r0 = dst, r1 = src, r3 = second source, r2 = stride used for all
 *      three pointers, r12 = row count (8-wide function only),
 *      d6 = constants from lowpass_const
 * The avg flavour additionally averages with the bytes already at dst.
 */
.macro  h264_qpel_h_lowpass_l2 type
function \type\()_h264_qpel16_h_lowpass_l2_neon
        push            {lr}
        mov             r12, #16
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
        sub             r0, r0, r2, lsl #4      @ rewind 16 rows
        sub             r1, r1, r2, lsl #4
        sub             r3, r3, r2, lsl #4
        add             r0, r0, #8              @ move to the right half
        add             r1, r1, #8
        add             r3, r3, #8
        mov             r12, #16
        pop             {lr}
        @ Runs on into the 8-wide function below for the right half.
endfunc

function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      vld1.8          {d0, d1}, [r1], r2
        vld1.8          {d16,d17}, [r1], r2
        vld1.8          {d28}, [r3], r2
        vld1.8          {d29}, [r3], r2
        subs            r12, r12, #2
        lowpass_8       d0, d1, d16, d17, d0, d1
        vrhadd.u8       q0, q0, q14             @ average with second source
  .ifc \type,avg
        vld1.8          {d2}, [r0,:64], r2
        vrhadd.u8       d0, d0, d2
        vld1.8          {d3}, [r0,:64]
        vrhadd.u8       d1, d1, d3
        sub             r0, r0, r2
  .endif
        vst1.8          {d0}, [r0,:64], r2
        vst1.8          {d1}, [r0,:64], r2
        bne             1b
        bx              lr
endfunc
.endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg

/*
 * Vertical filter of a 16x16 block written packed: four 8x8 blocks stored
 * back to back with a row pitch of 8 bytes, in the order top-left,
 * bottom-left, top-right, bottom-right.
 * In:  r0 = dst, r1 = src (first of the 13 rows read per 8x8 block),
 *      r3 = src stride
 * Clobbers r2, r4 (holds lr) and whatever the 8x8 helper clobbers.
 * Each 8x8 call leaves r1 12 rows further down, hence the rewinds by 4.
 */
function put_h264_qpel16_v_lowpass_neon_packed
        mov             r4, lr
        mov             r2, #8                  @ dst row pitch
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1, r1, r3, lsl #2      @ 12 - 4 = 8 rows down
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1, r1, r3, lsl #4      @ 20 rows back up: the start
        sub             r1, r1, r3, lsl #2
        add             r1, r1, #8              @ right half
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1, r1, r3, lsl #2
        mov             lr, r4
        b               put_h264_qpel8_v_lowpass_neon
endfunc

/*
 * Generates <type>_h264_qpel16_v_lowpass_neon and
 * <type>_h264_qpel8_v_lowpass_neon; type is put or avg.
 * In:  r0 = dst, r2 = dst stride, r1 = src (first of 13 rows),
 *      r3 = src stride, d6 = constants from lowpass_const
 * The 8x8 function overwrites d8-d15 without saving them (the exported
 * entry points wrap the call in vpush/vpop) and leaves r0 8 rows and r1
 * 12 rows further down.  The 16x16 function parks lr in r4.
 */
.macro  h264_qpel_v_lowpass type
function \type\()_h264_qpel16_v_lowpass_neon
        mov             r4, lr
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r1, r1, r3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r0, r0, r2, lsl #4      @ dst back to the top ...
        add             r0, r0, #8              @ ... of the right half
        sub             r1, r1, r3, lsl #4
        sub             r1, r1, r3, lsl #2
        add             r1, r1, #8
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r1, r1, r3, lsl #2
        mov             lr, r4
        @ Runs on into the 8x8 function below for the last quadrant.
endfunc

function \type\()_h264_qpel8_v_lowpass_neon
        @ Rows 0-7 go to the low halves of q4-q7, q11-q14, rows 8-12 to
        @ the high halves of q4-q7, q11.  d25, d27 and d29 are not loaded.
        vld1.8          {d8}, [r1], r3
        vld1.8          {d10}, [r1], r3
        vld1.8          {d12}, [r1], r3
        vld1.8          {d14}, [r1], r3
        vld1.8          {d22}, [r1], r3
        vld1.8          {d24}, [r1], r3
        vld1.8          {d26}, [r1], r3
        vld1.8          {d28}, [r1], r3
        vld1.8          {d9}, [r1], r3
        vld1.8          {d11}, [r1], r3
        vld1.8          {d13}, [r1], r3
        vld1.8          {d15}, [r1], r3
        vld1.8          {d23}, [r1]

        @ transpose_8x8 comes from neon.S; presumably it turns columns into
        @ rows so that the row filter lowpass_8 can be reused, and the
        @ second transpose restores the orientation - confirm in neon.S.
        transpose_8x8   q4, q5, q6, q7, q11, q12, q13, q14
        lowpass_8       d8, d9, d10, d11, d8, d10
        lowpass_8       d12, d13, d14, d15, d12, d14
        lowpass_8       d22, d23, d24, d25, d22, d24
        lowpass_8       d26, d27, d28, d29, d26, d28
        transpose_8x8   d8, d10, d12, d14, d22, d24, d26, d28

  .ifc \type,avg
        vld1.8          {d9}, [r0,:64], r2
        vrhadd.u8       d8, d8, d9
        vld1.8          {d11}, [r0,:64], r2
        vrhadd.u8       d10, d10, d11
        vld1.8          {d13}, [r0,:64], r2
        vrhadd.u8       d12, d12, d13
        vld1.8          {d15}, [r0,:64], r2
        vrhadd.u8       d14, d14, d15
        vld1.8          {d23}, [r0,:64], r2
        vrhadd.u8       d22, d22, d23
        vld1.8          {d25}, [r0,:64], r2
        vrhadd.u8       d24, d24, d25
        vld1.8          {d27}, [r0,:64], r2
        vrhadd.u8       d26, d26, d27
        vld1.8          {d29}, [r0,:64], r2
        vrhadd.u8       d28, d28, d29
        sub             r0, r0, r2, lsl #3      @ back to the first dst row
  .endif

        vst1.8          {d8}, [r0,:64], r2
        vst1.8          {d10}, [r0,:64], r2
        vst1.8          {d12}, [r0,:64], r2
        vst1.8          {d14}, [r0,:64], r2
        vst1.8          {d22}, [r0,:64], r2
        vst1.8          {d24}, [r0,:64], r2
        vst1.8          {d26}, [r0,:64], r2
        vst1.8          {d28}, [r0,:64], r2

        bx              lr
endfunc
.endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg

/*
 * Generates the vertical filter with a second source ("l2"): the filter
 * output is averaged (vrhadd) with the block read from r12.
 * In:  r0 = dst, r1 = src (first of 13 rows), r3 = stride of both src and
 *      dst, r12 = second source, r2 = stride of the second source,
 *      d6 = constants from lowpass_const
 * The 8x8 function overwrites d8-d15 without saving them and leaves r0 and
 * r12 8 rows and r1 12 rows further down.  The 16x16 function parks lr in
 * r4.
 */
.macro  h264_qpel_v_lowpass_l2 type
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             r4, lr
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r1, r1, r3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r0, r0, r3, lsl #4      @ rewind dst and second ...
        sub             r12, r12, r2, lsl #4    @ ... source by 16 rows
        add             r0, r0, #8
        add             r12, r12, #8
        sub             r1, r1, r3, lsl #4
        sub             r1, r1, r3, lsl #2
        add             r1, r1, #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r1, r1, r3, lsl #2
        mov             lr, r4
        @ Runs on into the 8x8 function below for the last quadrant.
endfunc

function \type\()_h264_qpel8_v_lowpass_l2_neon
        vld1.8          {d8}, [r1], r3
        vld1.8          {d10}, [r1], r3
        vld1.8          {d12}, [r1], r3
        vld1.8          {d14}, [r1], r3
        vld1.8          {d22}, [r1], r3
        vld1.8          {d24}, [r1], r3
        vld1.8          {d26}, [r1], r3
        vld1.8          {d28}, [r1], r3
        vld1.8          {d9}, [r1], r3
        vld1.8          {d11}, [r1], r3
        vld1.8          {d13}, [r1], r3
        vld1.8          {d15}, [r1], r3
        vld1.8          {d23}, [r1]

        @ Unlike the plain v filter the results are packed pairwise into
        @ q4, q6, q11, q13 so they can be averaged two rows at a time.
        transpose_8x8   q4, q5, q6, q7, q11, q12, q13, q14
        lowpass_8       d8, d9, d10, d11, d8, d9
        lowpass_8       d12, d13, d14, d15, d12, d13
        lowpass_8       d22, d23, d24, d25, d22, d23
        lowpass_8       d26, d27, d28, d29, d26, d27
        transpose_8x8   d8, d9, d12, d13, d22, d23, d26, d27

        @ Second-source loads are interleaved with the averaging.
        vld1.8          {d0}, [r12], r2
        vld1.8          {d1}, [r12], r2
        vld1.8          {d2}, [r12], r2
        vld1.8          {d3}, [r12], r2
        vld1.8          {d4}, [r12], r2
        vrhadd.u8       q0, q0, q4
        vld1.8          {d5}, [r12], r2
        vrhadd.u8       q1, q1, q6
        vld1.8          {d10}, [r12], r2
        vrhadd.u8       q2, q2, q11
        vld1.8          {d11}, [r12], r2
        vrhadd.u8       q5, q5, q13

  .ifc \type,avg
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d0, d0, d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d1, d1, d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d2, d2, d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d3, d3, d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d4, d4, d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d5, d5, d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d10, d10, d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d11, d11, d17
        sub             r0, r0, r3, lsl #3      @ back to the first dst row
  .endif

        vst1.8          {d0}, [r0,:64], r3
        vst1.8          {d1}, [r0,:64], r3
        vst1.8          {d2}, [r0,:64], r3
        vst1.8          {d3}, [r0,:64], r3
        vst1.8          {d4}, [r0,:64], r3
        vst1.8          {d5}, [r0,:64], r3
        vst1.8          {d10}, [r0,:64], r3
        vst1.8          {d11}, [r0,:64], r3

        bx              lr
endfunc
.endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg

/*
 * Shared core of the combined horizontal + vertical filter for one 8x8
 * block.  Nothing is written to a destination here.
 * In:  r1 = src (first of 13 rows, each read as 16 bytes), r3 = src stride,
 *      r4 = scratch buffer, 16-byte aligned (:128 hints); 12 rows of 16
 *      bytes (192 bytes) are stored to it
 * Out: the 8 result rows as bytes in d12, d13, d14, d15, d8, d9, d10, d11
 *      (the order in which the callers store them); r1 is left 12 rows
 *      further down; r4 ends at its entry value
 * Clobbers r12 and q0-q15 (d8-d15 are not saved here).
 */
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   r12
        mov             r12, #12
        @ Horizontal pass over 12 rows, two per iteration; the 16-bit sums
        @ stay unshifted in q11/q12 and are appended to the scratch buffer.
1:      vld1.8          {d0, d1}, [r1], r3
        vld1.8          {d16,d17}, [r1], r3
        subs            r12, r12, #2
        lowpass_8       d0, d1, d16, d17, q11, q12, narrow=0
        vst1.8          {d22-d25}, [r4,:128]!
        bne             1b

        @ Row 12 is filtered on its own and stays in q12.
        vld1.8          {d0, d1}, [r1]
        lowpass_8_1     d0, d1, q12, narrow=0

        @ Walk the scratch buffer backwards in 16-byte steps: row 11 lands
        @ in q15, rows 10-0 in q10 down to q0.
        mov             r12, #-16
        add             r4, r4, r12
        vld1.8          {d30,d31}, [r4,:128], r12
        vld1.8          {d20,d21}, [r4,:128], r12
        vld1.8          {d18,d19}, [r4,:128], r12
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d14,d15}, [r4,:128], r12
        vld1.8          {d12,d13}, [r4,:128], r12
        vld1.8          {d10,d11}, [r4,:128], r12
        vld1.8          {d8, d9}, [r4,:128], r12
        vld1.8          {d6, d7}, [r4,:128], r12
        vld1.8          {d4, d5}, [r4,:128], r12
        vld1.8          {d2, d3}, [r4,:128], r12
        vld1.8          {d0, d1}, [r4,:128]

        @ swap4 and transpose16_4x4 come from neon.S; presumably together
        @ they transpose the 16-bit intermediate so that lowpass_8.16 can
        @ run along columns - confirm in neon.S.
        swap4           d1, d3, d5, d7, d8, d10, d12, d14
        transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7

        swap4           d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11

        @ Spill q0-q3, q8-q10 and q15: lowpass_8.16 uses exactly these as
        @ temporaries.  r4 is at the start of the scratch buffer here.
        vst1.8          {d30,d31}, [r4,:128]!
        vst1.8          {d6, d7}, [r4,:128]!
        vst1.8          {d20,d21}, [r4,:128]!
        vst1.8          {d4, d5}, [r4,:128]!
        vst1.8          {d18,d19}, [r4,:128]!
        vst1.8          {d2, d3}, [r4,:128]!
        vst1.8          {d16,d17}, [r4,:128]!
        vst1.8          {d0, d1}, [r4,:128]

        @ Second pass on the four register pairs that are still live ...
        lowpass_8.16    q4, q12, d8, d9, d24, d25, d8
        lowpass_8.16    q5, q13, d10, d11, d26, d27, d9
        lowpass_8.16    q6, q14, d12, d13, d28, d29, d10
        lowpass_8.16    q7, q11, d14, d15, d22, d23, d11

        @ ... then on the spilled pairs, reloaded last-stored first.
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128], r12
        lowpass_8.16    q8, q15, d16, d17, d30, d31, d12
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128], r12
        lowpass_8.16    q8, q15, d16, d17, d30, d31, d13
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128], r12
        lowpass_8.16    q8, q15, d16, d17, d30, d31, d14
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128]
        lowpass_8.16    q8, q15, d16, d17, d30, d31, d15

        transpose_8x8   d12, d13, d14, d15, d8, d9, d10, d11

        bx              lr
endfunc

/*
 * Generates <type>_h264_qpel8_hv_lowpass_neon: runs the shared core above
 * and stores the 8x8 result.
 * In:  r0 = dst, r2 = dst stride, r1 = src, r3 = src stride, r4 = scratch
 * Parks lr in r10; leaves r0 8 rows further down.  The avg flavour
 * averages with the bytes already at dst first.
 */
.macro  h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top
  .ifc \type,avg
        vld1.8          {d0}, [r0,:64], r2
        vrhadd.u8       d12, d12, d0
        vld1.8          {d1}, [r0,:64], r2
        vrhadd.u8       d13, d13, d1
        vld1.8          {d2}, [r0,:64], r2
        vrhadd.u8       d14, d14, d2
        vld1.8          {d3}, [r0,:64], r2
        vrhadd.u8       d15, d15, d3
        vld1.8          {d4}, [r0,:64], r2
        vrhadd.u8       d8, d8, d4
        vld1.8          {d5}, [r0,:64], r2
        vrhadd.u8       d9, d9, d5
        vld1.8          {d6}, [r0,:64], r2
        vrhadd.u8       d10, d10, d6
        vld1.8          {d7}, [r0,:64], r2
        vrhadd.u8       d11, d11, d7
        sub             r0, r0, r2, lsl #3      @ back to the first dst row
  .endif

        vst1.8          {d12}, [r0,:64], r2
        vst1.8          {d13}, [r0,:64], r2
        vst1.8          {d14}, [r0,:64], r2
        vst1.8          {d15}, [r0,:64], r2
        vst1.8          {d8}, [r0,:64], r2
        vst1.8          {d9}, [r0,:64], r2
        vst1.8          {d10}, [r0,:64], r2
        vst1.8          {d11}, [r0,:64], r2

        mov             lr, r10
        bx              lr
endfunc
.endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg

/*
 * Generates <type>_h264_qpel8_hv_lowpass_l2_neon: as above, but the result
 * is first averaged with a second source.
 * In:  r0 = dst, r1 = src, r3 = stride of both src and dst, r4 = scratch,
 *      r2 = second source: 64 contiguous bytes (8 rows of 8), 16-byte
 *      aligned (:128 hints)
 * Parks lr in r10; leaves r2 64 bytes and r0 8 rows further on.
 */
.macro  h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top

        @ q6, q7, q4, q5 hold result rows 0-1, 2-3, 4-5, 6-7.
        vld1.8          {d0, d1}, [r2,:128]!
        vld1.8          {d2, d3}, [r2,:128]!
        vrhadd.u8       q0, q0, q6
        vld1.8          {d4, d5}, [r2,:128]!
        vrhadd.u8       q1, q1, q7
        vld1.8          {d6, d7}, [r2,:128]!
        vrhadd.u8       q2, q2, q4
        vrhadd.u8       q3, q3, q5
  .ifc \type,avg
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d0, d0, d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d1, d1, d17
        vld1.8          {d18}, [r0,:64], r3
        vrhadd.u8       d2, d2, d18
        vld1.8          {d19}, [r0,:64], r3
        vrhadd.u8       d3, d3, d19
        vld1.8          {d20}, [r0,:64], r3
        vrhadd.u8       d4, d4, d20
        vld1.8          {d21}, [r0,:64], r3
        vrhadd.u8       d5, d5, d21
        vld1.8          {d22}, [r0,:64], r3
        vrhadd.u8       d6, d6, d22
        vld1.8          {d23}, [r0,:64], r3
        vrhadd.u8       d7, d7, d23
        sub             r0, r0, r3, lsl #3      @ back to the first dst row
  .endif
        vst1.8          {d0}, [r0,:64], r3
        vst1.8          {d1}, [r0,:64], r3
        vst1.8          {d2}, [r0,:64], r3
        vst1.8          {d3}, [r0,:64], r3
        vst1.8          {d4}, [r0,:64], r3
        vst1.8          {d5}, [r0,:64], r3
        vst1.8          {d6}, [r0,:64], r3
        vst1.8          {d7}, [r0,:64], r3

        mov             lr, r10
        bx              lr
endfunc
.endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg

/*
 * Generates the 16x16 hv filters from four 8x8 calls in the order
 * top-left, bottom-left, top-right, bottom-right.  lr is parked in r9
 * (the 8x8 helpers use r10) and the last quadrant is a tail call.
 * Each 8x8 call leaves r1 12 rows and r0 8 rows further down.
 *
 * <type>_h264_qpel16_hv_lowpass_neon:
 *      r0 = dst, r2 = dst stride, r1 = src, r3 = src stride, r4 = scratch
 * <type>_h264_qpel16_hv_lowpass_l2_neon:
 *      r0 = dst, r1 = src, r3 = stride of both, r4 = scratch; the second
 *      source is taken from the 256 bytes just below r4 and is consumed
 *      64 bytes per quadrant, which matches the layout written by the
 *      *_packed helpers
 */
.macro  h264_qpel16_hv  type
function \type\()_h264_qpel16_hv_lowpass_neon
        mov             r9, lr
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1, r1, r3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1, r1, r3, lsl #4
        sub             r1, r1, r3, lsl #2
        add             r1, r1, #8
        sub             r0, r0, r2, lsl #4
        add             r0, r0, #8
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1, r1, r3, lsl #2
        mov             lr, r9
        b               \type\()_h264_qpel8_hv_lowpass_neon
endfunc

function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             r9, lr
        sub             r2, r4, #256            @ second source sits below r4
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1, r1, r3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1, r1, r3, lsl #4
        sub             r1, r1, r3, lsl #2
        add             r1, r1, #8
        sub             r0, r0, r3, lsl #4
        add             r0, r0, #8
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1, r1, r3, lsl #2
        mov             lr, r9
        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
.endm

        h264_qpel16_hv put
        h264_qpel16_hv avg

/*
 * Generates the exported 8x8 entry points ff_<type>_h264_qpel8_mcXY_neon.
 * On entry r0 = dst, r1 = src, r2 = stride (used for both).
 * NOTE(review): presumably X and Y are the horizontal and vertical
 * quarter-sample offsets and the C prototype is
 * (uint8_t *dst, const uint8_t *src, ptrdiff_t stride) - confirm against
 * the C declarations.
 *
 * Several entry points only adjust a pointer and branch to a label inside
 * another one (<type>_h264_qpel8_mc01/mc11/mc21/mc12).  Such an entry
 * point must push exactly the register list of the function it joins,
 * because it leaves through that function's pop.
 *
 * Functions that need a temporary block keep the entry sp in r11, align sp
 * down to 16 bytes and allocate below it.  The A/T prefixed lines are two
 * alternatives for that alignment step; presumably A is assembled for ARM
 * and T for Thumb - confirm in asm.S.  The saved r0/r1 are reloaded with
 * ldrd through r11, whose post-increment also steps over them so that the
 * final pop starts at the next saved register.
 * d8-d15 are overwritten by the v and hv helpers, hence the vpush/vpop.
 */
.macro  h264_qpel8      type
        @ mc10: h filter averaged with the source pixels.
function ff_\type\()_h264_qpel8_mc10_neon, export=1
        lowpass_const   r3
        mov             r3, r1                  @ second source = src
        sub             r1, r1, #2              @ taps start 2 pixels left
        mov             r12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

        @ mc20: h filter only.
function ff_\type\()_h264_qpel8_mc20_neon, export=1
        lowpass_const   r3
        sub             r1, r1, #2
        mov             r3, r2                  @ dst stride = stride
        mov             r12, #8
        b               \type\()_h264_qpel8_h_lowpass_neon
endfunc

        @ mc30: h filter averaged with the source one pixel to the right.
function ff_\type\()_h264_qpel8_mc30_neon, export=1
        lowpass_const   r3
        add             r3, r1, #1
        sub             r1, r1, #2
        mov             r12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

        @ mc01: v filter averaged with the source pixels.
function ff_\type\()_h264_qpel8_mc01_neon, export=1
        push            {lr}
        mov             r12, r1                 @ second source = src
\type\()_h264_qpel8_mc01:
        lowpass_const   r3
        mov             r3, r2
        sub             r1, r1, r2, lsl #1      @ taps start 2 rows up
        vpush           {d8-d15}
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {pc}
endfunc

        @ mc11: h filter into a 64-byte temporary, then v filter averaged
        @ with that temporary.
function ff_\type\()_h264_qpel8_mc11_neon, export=1
        push            {r0, r1, r11, lr}
\type\()_h264_qpel8_mc11:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp, sp, #15
T       bic             r0, r11, #15
T       mov             sp, r0
        sub             sp, sp, #64
        mov             r0, sp                  @ temporary, row pitch 8
        sub             r1, r1, #2
        mov             r3, #8
        mov             r12, #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        ldrd            r0, r1, [r11], #8       @ saved dst and src
        mov             r3, r2
        add             r12, sp, #64            @ temporary, above the vpush
        sub             r1, r1, r2, lsl #1
        mov             r2, #8                  @ pitch of the temporary
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp, r11
        pop             {r11, pc}
endfunc

        @ mc21: h filter into a temporary, then hv filter averaged with it.
        @ The stack block holds the 8*8 temporary followed by the 16*12
        @ scratch area of the hv core.
function ff_\type\()_h264_qpel8_mc21_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc21:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp, sp, #15
T       bic             r0, r11, #15
T       mov             sp, r0
        sub             sp, sp, #(8*8+16*12)
        sub             r1, r1, #2
        mov             r3, #8
        mov             r0, sp
        mov             r12, #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        mov             r4, r0                  @ scratch = end of temporary
        ldrd            r0, r1, [r11], #8
        sub             r1, r1, r2, lsl #1
        sub             r1, r1, #2
        mov             r3, r2
        sub             r2, r4, #64             @ second source = temporary
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp, r11
        pop             {r4, r10, r11, pc}
endfunc

        @ mc31: as mc11, with src + 1 saved for the v pass.
function ff_\type\()_h264_qpel8_mc31_neon, export=1
        add             r1, r1, #1
        push            {r0, r1, r11, lr}
        sub             r1, r1, #1
        b               \type\()_h264_qpel8_mc11
endfunc

        @ mc02: v filter only.
function ff_\type\()_h264_qpel8_mc02_neon, export=1
        push            {lr}
        lowpass_const   r3
        sub             r1, r1, r2, lsl #1
        mov             r3, r2
        vpush           {d8-d15}
        bl              \type\()_h264_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {pc}
endfunc

        @ mc12: v filter into a temporary, then hv filter averaged with it.
function ff_\type\()_h264_qpel8_mc12_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc12:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp, sp, #15
T       bic             r0, r11, #15
T       mov             sp, r0
        sub             sp, sp, #(8*8+16*12)
        sub             r1, r1, r2, lsl #1
        mov             r3, r2
        mov             r2, #8
        mov             r0, sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_neon
        mov             r4, r0                  @ scratch = end of temporary
        ldrd            r0, r1, [r11], #8
        sub             r1, r1, r3, lsl #1
        sub             r1, r1, #2
        sub             r2, r4, #64             @ second source = temporary
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp, r11
        pop             {r4, r10, r11, pc}
endfunc

        @ mc22: hv filter only; the hv core loads the constants itself.
function ff_\type\()_h264_qpel8_mc22_neon, export=1
        push            {r4, r10, r11, lr}
        mov             r11, sp
A       bic             sp, sp, #15
T       bic             r4, r11, #15
T       mov             sp, r4
        sub             r1, r1, r2, lsl #1
        sub             r1, r1, #2
        mov             r3, r2
        sub             sp, sp, #(16*12)
        mov             r4, sp
        vpush           {d8-d15}
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp, r11
        pop             {r4, r10, r11, pc}
endfunc

        @ mc32: as mc12, with the v pass reading from src + 1.
function ff_\type\()_h264_qpel8_mc32_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1, r1, #1
        b               \type\()_h264_qpel8_mc12
endfunc

        @ mc03: as mc01, averaged with the source one row down.
function ff_\type\()_h264_qpel8_mc03_neon, export=1
        push            {lr}
        add             r12, r1, r2
        b               \type\()_h264_qpel8_mc01
endfunc

        @ mc13: as mc11, with the h pass reading one row down.
function ff_\type\()_h264_qpel8_mc13_neon, export=1
        push            {r0, r1, r11, lr}
        add             r1, r1, r2
        b               \type\()_h264_qpel8_mc11
endfunc

        @ mc23: as mc21, with the h pass reading one row down.
function ff_\type\()_h264_qpel8_mc23_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1, r1, r2
        b               \type\()_h264_qpel8_mc21
endfunc

        @ mc33: as mc11, with src + 1 saved for the v pass and the h pass
        @ reading one row down.
function ff_\type\()_h264_qpel8_mc33_neon, export=1
        add             r1, r1, #1
        push            {r0, r1, r11, lr}
        add             r1, r1, r2
        sub             r1, r1, #1
        b               \type\()_h264_qpel8_mc11
endfunc
.endm

        h264_qpel8 put
        h264_qpel8 avg

/*
 * Generates the exported 16x16 entry points
 * ff_<type>_h264_qpel16_mcXY_neon.  They mirror the 8x8 set above (same
 * arguments, same shared-label and stack conventions) with these
 * differences visible in the code:
 *   - the 16-pixel h helpers set the row count themselves, so r12 is not
 *     loaded here
 *   - r4 is saved wherever a 16-pixel v helper is called (it parks lr
 *     there), and r9 wherever a 16-pixel hv helper is called
 *   - temporaries are 16*16 = 256 bytes; for mc21/mc12 they are written in
 *     packed layout and the hv l2 helper locates them itself at r4 - 256
 */
.macro  h264_qpel16     type
        @ mc10: h filter averaged with the source pixels.
function ff_\type\()_h264_qpel16_mc10_neon, export=1
        lowpass_const   r3
        mov             r3, r1
        sub             r1, r1, #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

        @ mc20: h filter only.
function ff_\type\()_h264_qpel16_mc20_neon, export=1
        lowpass_const   r3
        sub             r1, r1, #2
        mov             r3, r2
        b               \type\()_h264_qpel16_h_lowpass_neon
endfunc

        @ mc30: h filter averaged with the source one pixel to the right.
function ff_\type\()_h264_qpel16_mc30_neon, export=1
        lowpass_const   r3
        add             r3, r1, #1
        sub             r1, r1, #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

        @ mc01: v filter averaged with the source pixels.
function ff_\type\()_h264_qpel16_mc01_neon, export=1
        push            {r4, lr}
        mov             r12, r1
\type\()_h264_qpel16_mc01:
        lowpass_const   r3
        mov             r3, r2
        sub             r1, r1, r2, lsl #1
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

        @ mc11: h filter into a 256-byte temporary (row pitch 16), then
        @ v filter averaged with that temporary.
function ff_\type\()_h264_qpel16_mc11_neon, export=1
        push            {r0, r1, r4, r11, lr}
\type\()_h264_qpel16_mc11:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp, sp, #15
T       bic             r0, r11, #15
T       mov             sp, r0
        sub             sp, sp, #256
        mov             r0, sp
        sub             r1, r1, #2
        mov             r3, #16
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon
        ldrd            r0, r1, [r11], #8       @ saved dst and src
        mov             r3, r2
        add             r12, sp, #64            @ temporary, above the vpush
        sub             r1, r1, r2, lsl #1
        mov             r2, #16                 @ pitch of the temporary
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp, r11
        pop             {r4, r11, pc}
endfunc

        @ mc21: packed h filter into a temporary, then hv filter averaged
        @ with it.
function ff_\type\()_h264_qpel16_mc21_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc21:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp, sp, #15
T       bic             r0, r11, #15
T       mov             sp, r0
        sub             sp, sp, #(16*16+16*12)
        sub             r1, r1, #2
        mov             r0, sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             r4, r0                  @ scratch = end of temporary
        ldrd            r0, r1, [r11], #8
        sub             r1, r1, r2, lsl #1
        sub             r1, r1, #2
        mov             r3, r2
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp, r11
        pop             {r4-r5, r9-r11, pc}
endfunc

        @ mc31: as mc11, with src + 1 saved for the v pass.
function ff_\type\()_h264_qpel16_mc31_neon, export=1
        add             r1, r1, #1
        push            {r0, r1, r4, r11, lr}
        sub             r1, r1, #1
        b               \type\()_h264_qpel16_mc11
endfunc

        @ mc02: v filter only.
function ff_\type\()_h264_qpel16_mc02_neon, export=1
        push            {r4, lr}
        lowpass_const   r3
        sub             r1, r1, r2, lsl #1
        mov             r3, r2
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

        @ mc12: packed v filter into a temporary, then hv filter averaged
        @ with it.
function ff_\type\()_h264_qpel16_mc12_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc12:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp, sp, #15
T       bic             r0, r11, #15
T       mov             sp, r0
        sub             sp, sp, #(16*16+16*12)
        sub             r1, r1, r2, lsl #1
        mov             r0, sp
        mov             r3, r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             r4, r0                  @ scratch = end of temporary
        ldrd            r0, r1, [r11], #8
        sub             r1, r1, r3, lsl #1
        sub             r1, r1, #2
        mov             r2, r3
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp, r11
        pop             {r4-r5, r9-r11, pc}
endfunc

        @ mc22: hv filter only.
function ff_\type\()_h264_qpel16_mc22_neon, export=1
        push            {r4, r9-r11, lr}
        lowpass_const   r3
        mov             r11, sp
A       bic             sp, sp, #15
T       bic             r4, r11, #15
T       mov             sp, r4
        sub             r1, r1, r2, lsl #1
        sub             r1, r1, #2
        mov             r3, r2
        sub             sp, sp, #(16*12)
        mov             r4, sp
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp, r11
        pop             {r4, r9-r11, pc}
endfunc

        @ mc32: as mc12, with the v pass reading from src + 1.
function ff_\type\()_h264_qpel16_mc32_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1, r1, #1
        b               \type\()_h264_qpel16_mc12
endfunc

        @ mc03: as mc01, averaged with the source one row down.
function ff_\type\()_h264_qpel16_mc03_neon, export=1
        push            {r4, lr}
        add             r12, r1, r2
        b               \type\()_h264_qpel16_mc01
endfunc

        @ mc13: as mc11, with the h pass reading one row down.
function ff_\type\()_h264_qpel16_mc13_neon, export=1
        push            {r0, r1, r4, r11, lr}
        add             r1, r1, r2
        b               \type\()_h264_qpel16_mc11
endfunc

        @ mc23: as mc21, with the h pass reading one row down.
function ff_\type\()_h264_qpel16_mc23_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1, r1, r2
        b               \type\()_h264_qpel16_mc21
endfunc

        @ mc33: as mc11, with src + 1 saved for the v pass and the h pass
        @ reading one row down.
function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add             r1, r1, #1
        push            {r0, r1, r4, r11, lr}
        add             r1, r1, r2
        sub             r1, r1, #1
        b               \type\()_h264_qpel16_mc11
endfunc
.endm

        h264_qpel16 put
        h264_qpel16 avg