/*****************************************************************************
 * predict.S: arm intra prediction
 *****************************************************************************
 * Copyright (C) 2009-2014 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Mans Rullgard <mans@mansr.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "asm.S"

.section .rodata
.align 4

p16weight: .short 1,2,3,4,5,6,7,8

.text

// load \n bytes, one per row with stride \rt, from [\rs] into the lanes
// of \rd; with n=4, hi selects the low or high half of \rd
.macro ldcol.8 rd, rs, rt, n=8, hi=0
.if \n == 8 || \hi == 0
    vld1.8  {\rd[0]}, [\rs], \rt
    vld1.8  {\rd[1]}, [\rs], \rt
    vld1.8  {\rd[2]}, [\rs], \rt
    vld1.8  {\rd[3]}, [\rs], \rt
.endif
.if \n == 8 || \hi == 1
    vld1.8  {\rd[4]}, [\rs], \rt
    vld1.8  {\rd[5]}, [\rs], \rt
    vld1.8  {\rd[6]}, [\rs], \rt
    vld1.8  {\rd[7]}, [\rs], \rt
.endif
.endm

// sum the 16 bytes in \rl,\rh into every 16-bit lane of \dl
.macro add16x8 dq, dl, dh, rl, rh
    vaddl.u8  \dq, \rl, \rh
    vadd.u16  \dl, \dl, \dh
    vpadd.u16 \dl, \dl, \dl
    vpadd.u16 \dl, \dl, \dl
.endm

// because gcc doesn't believe in using the free shift in add
function x264_predict_4x4_h_armv6
    ldrb    r1, [r0, #0*FDEC_STRIDE-1]
    ldrb    r2, [r0, #1*FDEC_STRIDE-1]
    ldrb    r3, [r0, #2*FDEC_STRIDE-1]
    ldrb    ip, [r0, #3*FDEC_STRIDE-1]
    add     r1, r1, r1, lsl #8
    add     r2, r2, r2, lsl #8
    add     r3, r3, r3, lsl #8
    add     ip, ip, ip, lsl #8
    add     r1, r1, r1, lsl #16
    str     r1, [r0, #0*FDEC_STRIDE]
    add     r2, r2, r2, lsl #16
    str     r2, [r0, #1*FDEC_STRIDE]
    add     r3, r3, r3, lsl #16
    str     r3, [r0, #2*FDEC_STRIDE]
    add     ip, ip, ip, lsl #16
    str     ip, [r0, #3*FDEC_STRIDE]
    bx      lr
endfunc

function x264_predict_4x4_v_armv6
    ldr     r1, [r0, #0 - 1 * FDEC_STRIDE]
    str     r1, [r0, #0 + 0 * FDEC_STRIDE]
    str     r1, [r0, #0 + 1 * FDEC_STRIDE]
    str     r1, [r0, #0 + 2 * FDEC_STRIDE]
    str     r1, [r0, #0 + 3 * FDEC_STRIDE]
    bx      lr
endfunc

function x264_predict_4x4_dc_armv6
    mov     ip, #0
    ldr     r1, [r0, #-FDEC_STRIDE]
    ldrb    r2, [r0, #0*FDEC_STRIDE-1]
    ldrb    r3, [r0, #1*FDEC_STRIDE-1]
    usad8   r1, r1, ip
    add     r2, r2, #4
    ldrb    ip, [r0, #2*FDEC_STRIDE-1]
    add     r2, r2, r3
    ldrb    r3, [r0, #3*FDEC_STRIDE-1]
    add     r2, r2, ip
    add     r2, r2, r3
    add     r1, r1, r2
    lsr     r1, r1, #3
    add     r1, r1, r1, lsl #8
    add     r1, r1, r1, lsl #16
    str     r1, [r0, #0*FDEC_STRIDE]
    str     r1, [r0, #1*FDEC_STRIDE]
    str     r1, [r0, #2*FDEC_STRIDE]
    str     r1, [r0, #3*FDEC_STRIDE]
    bx      lr
endfunc
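
// 4x4 DC prediction fills the block with the rounded average of its
// neighbours: dc = (t0+t1+t2+t3 + l0+l1+l2+l3 + 4) >> 3 for the full
// variant above, or dc = (t0+t1+t2+t3 + 2) >> 2 when only the top row is
// used, as below.  A rough C equivalent of the store pattern (a reference
// sketch, not taken from x264's predict.c):
//     for( int y = 0; y < 4; y++ )
//         for( int x = 0; x < 4; x++ )
//             src[x + y*FDEC_STRIDE] = dc;
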
function x264_predict_4x4_dc_top_neon
    mov     r12, #FDEC_STRIDE
    sub     r1, r0, #FDEC_STRIDE
    vld1.32 d1[], [r1,:32]
    vpaddl.u8 d1, d1
    vpadd.u16 d1, d1, d1
    vrshr.u16 d1, d1, #2
    vdup.8  d1, d1[0]
    vst1.32 d1[0], [r0,:32], r12
    vst1.32 d1[0], [r0,:32], r12
    vst1.32 d1[0], [r0,:32], r12
    vst1.32 d1[0], [r0,:32], r12
    bx      lr
endfunc

// return a1 = (a1+2*b1+c1+2)>>2  a2 = (a2+2*b2+c2+2)>>2
.macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
    uhadd8  \a1, \a1, \c1
    uhadd8  \a2, \a2, \c2
    uhadd8  \c1, \a1, \b1
    uhadd8  \c2, \a2, \b2
    eor     \a1, \a1, \b1
    eor     \a2, \a2, \b2
    and     \a1, \a1, \pb_1
    and     \a2, \a2, \pb_1
    uadd8   \a1, \a1, \c1
    uadd8   \a2, \a2, \c2
.endm

function x264_predict_4x4_ddr_armv6
    ldr     r1, [r0, # -FDEC_STRIDE]
    ldrb    r2, [r0, # -FDEC_STRIDE-1]
    ldrb    r3, [r0, #0*FDEC_STRIDE-1]
    push    {r4-r6,lr}
    add     r2, r2, r1, lsl #8
    ldrb    r4, [r0, #1*FDEC_STRIDE-1]
    add     r3, r3, r2, lsl #8
    ldrb    r5, [r0, #2*FDEC_STRIDE-1]
    ldrb    r6, [r0, #3*FDEC_STRIDE-1]
    add     r4, r4, r3, lsl #8
    add     r5, r5, r4, lsl #8
    add     r6, r6, r5, lsl #8
    ldr     ip, =0x01010101
    PRED4x4_LOWPASS r1, r2, r3, r4, r5, r6, ip
    str     r1, [r0, #0*FDEC_STRIDE]
    lsl     r2, r1, #8
    lsl     r3, r1, #16
    lsl     r4, r4, #8
    lsl     r5, r1, #24
    add     r2, r2, r4, lsr #24
    str     r2, [r0, #1*FDEC_STRIDE]
    add     r3, r3, r4, lsr #16
    str     r3, [r0, #2*FDEC_STRIDE]
    add     r5, r5, r4, lsr #8
    str     r5, [r0, #3*FDEC_STRIDE]
    pop     {r4-r6,pc}
endfunc

function x264_predict_4x4_ddl_neon
    sub     r0, #FDEC_STRIDE
    mov     ip, #FDEC_STRIDE
    vld1.64 {d0}, [r0], ip
    vdup.8  d3, d0[7]
    vext.8  d1, d0, d0, #1
    vext.8  d2, d0, d3, #2
    vhadd.u8 d0, d0, d2
    vrhadd.u8 d0, d0, d1
    vst1.32 {d0[0]}, [r0,:32], ip
    vext.8  d1, d0, d0, #1
    vext.8  d2, d0, d0, #2
    vst1.32 {d1[0]}, [r0,:32], ip
    vext.8  d3, d0, d0, #3
    vst1.32 {d2[0]}, [r0,:32], ip
    vst1.32 {d3[0]}, [r0,:32], ip
    bx      lr
endfunc

function x264_predict_8x8_dc_neon
    mov     ip, #0
    ldrd    r2, r3, [r1, #8]
    push    {r4-r5,lr}
    ldrd    r4, r5, [r1, #16]
    lsl     r3, r3, #8
    ldrb    lr, [r1, #7]
    usad8   r2, r2, ip
    usad8   r3, r3, ip
    usada8  r2, r4, ip, r2
    add     lr, lr, #8
    usada8  r3, r5, ip, r3
    add     r2, r2, lr
    mov     ip, #FDEC_STRIDE
    add     r2, r2, r3
    lsr     r2, r2, #4

    vdup.8  d0, r2
.rept 8
    vst1.64 {d0}, [r0,:64], ip
.endr
    pop     {r4-r5,pc}
endfunc

function x264_predict_8x8_h_neon
    add     r1, r1, #7
    mov     ip, #FDEC_STRIDE
    vld1.64 {d16}, [r1]
    vdup.8  d0, d16[7]
    vdup.8  d1, d16[6]
    vst1.64 {d0}, [r0,:64], ip
    vdup.8  d2, d16[5]
    vst1.64 {d1}, [r0,:64], ip
    vdup.8  d3, d16[4]
    vst1.64 {d2}, [r0,:64], ip
    vdup.8  d4, d16[3]
    vst1.64 {d3}, [r0,:64], ip
    vdup.8  d5, d16[2]
    vst1.64 {d4}, [r0,:64], ip
    vdup.8  d6, d16[1]
    vst1.64 {d5}, [r0,:64], ip
    vdup.8  d7, d16[0]
    vst1.64 {d6}, [r0,:64], ip
    vst1.64 {d7}, [r0,:64], ip
    bx      lr
endfunc

function x264_predict_8x8_v_neon
    add     r1, r1, #16
    mov     r12, #FDEC_STRIDE
    vld1.8  {d0}, [r1,:64]
.rept 8
    vst1.8  {d0}, [r0,:64], r12
.endr
    bx      lr
endfunc

function x264_predict_8x8_ddl_neon
    add     r1, #16
    vld1.8  {d0, d1}, [r1,:128]
    vmov.i8 q3, #0
    vrev64.8 d2, d1
    vext.8  q8, q3, q0, #15
    vext.8  q2, q0, q1, #1
    vhadd.u8 q8, q2
    mov     r12, #FDEC_STRIDE
    vrhadd.u8 q0, q8
    vext.8  d2, d0, d1, #1
    vext.8  d3, d0, d1, #2
    vst1.8  d2, [r0,:64], r12
    vext.8  d2, d0, d1, #3
    vst1.8  d3, [r0,:64], r12
    vext.8  d3, d0, d1, #4
    vst1.8  d2, [r0,:64], r12
    vext.8  d2, d0, d1, #5
    vst1.8  d3, [r0,:64], r12
    vext.8  d3, d0, d1, #6
    vst1.8  d2, [r0,:64], r12
    vext.8  d2, d0, d1, #7
    vst1.8  d3, [r0,:64], r12
    vst1.8  d2, [r0,:64], r12
    vst1.8  d1, [r0,:64], r12
    bx      lr
endfunc
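
// The 8x8 directional modes below, like ddl above, run the edge samples
// through the H.264 lowpass filter
//     f(a,b,c) = (a + 2*b + c + 2) >> 2
// before shifting them across the block.  On NEON this is a vhadd.u8
// followed by a vrhadd.u8, i.e. (((a+c)>>1) + b + 1) >> 1, which yields
// the same result as f() for all byte inputs; PRED4x4_LOWPASS earlier in
// the file is the ARMv6 SIMD equivalent of the same filter.
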
function x264_predict_8x8_ddr_neon
    vld1.8  {d0-d3}, [r1,:128]
    vext.8  q2, q0, q1, #7
    vext.8  q3, q0, q1, #9

    vhadd.u8 q2, q2, q3
    vrhadd.u8 d0, d1, d4
    vrhadd.u8 d1, d2, d5

    add     r0, #7*FDEC_STRIDE
    mov     r12, #-1*FDEC_STRIDE

    vext.8  d2, d0, d1, #1
    vst1.8  {d0}, [r0,:64], r12
    vext.8  d4, d0, d1, #2
    vst1.8  {d2}, [r0,:64], r12
    vext.8  d5, d0, d1, #3
    vst1.8  {d4}, [r0,:64], r12
    vext.8  d4, d0, d1, #4
    vst1.8  {d5}, [r0,:64], r12
    vext.8  d5, d0, d1, #5
    vst1.8  {d4}, [r0,:64], r12
    vext.8  d4, d0, d1, #6
    vst1.8  {d5}, [r0,:64], r12
    vext.8  d5, d0, d1, #7
    vst1.8  {d4}, [r0,:64], r12
    vst1.8  {d5}, [r0,:64], r12
    bx      lr
endfunc

function x264_predict_8x8_vl_neon
    add     r1, #16
    mov     r12, #FDEC_STRIDE

    vld1.8  {d0, d1}, [r1,:128]
    vext.8  q1, q1, q0, #15
    vext.8  q2, q0, q2, #1

    vrhadd.u8 q3, q0, q2

    vhadd.u8 q1, q1, q2
    vrhadd.u8 q0, q0, q1

    vext.8  d2, d0, d1, #1
    vst1.8  {d6}, [r0,:64], r12
    vext.8  d3, d6, d7, #1
    vst1.8  {d2}, [r0,:64], r12
    vext.8  d2, d0, d1, #2
    vst1.8  {d3}, [r0,:64], r12
    vext.8  d3, d6, d7, #2
    vst1.8  {d2}, [r0,:64], r12
    vext.8  d2, d0, d1, #3
    vst1.8  {d3}, [r0,:64], r12
    vext.8  d3, d6, d7, #3
    vst1.8  {d2}, [r0,:64], r12
    vext.8  d2, d0, d1, #4
    vst1.8  {d3}, [r0,:64], r12
    vst1.8  {d2}, [r0,:64], r12
    bx      lr
endfunc

function x264_predict_8x8_vr_neon
    add     r1, #8
    mov     r12, #FDEC_STRIDE
    vld1.8  {d4,d5}, [r1,:64]

    vext.8  q1, q2, q2, #14
    vext.8  q0, q2, q2, #15

    vhadd.u8 q3, q2, q1
    vrhadd.u8 q2, q2, q0
    vrhadd.u8 q0, q0, q3

    vmov    d2, d0

    vst1.8  {d5}, [r0,:64], r12
    vuzp.8  d2, d0
    vst1.8  {d1}, [r0,:64], r12
    vext.8  d6, d0, d5, #7
    vext.8  d3, d2, d1, #7
    vst1.8  {d6}, [r0,:64], r12
    vst1.8  {d3}, [r0,:64], r12
    vext.8  d6, d0, d5, #6
    vext.8  d3, d2, d1, #6
    vst1.8  {d6}, [r0,:64], r12
    vst1.8  {d3}, [r0,:64], r12
    vext.8  d6, d0, d5, #5
    vext.8  d3, d2, d1, #5
    vst1.8  {d6}, [r0,:64], r12
    vst1.8  {d3}, [r0,:64], r12
    bx      lr
endfunc

function x264_predict_8x8_hd_neon
    mov     r12, #FDEC_STRIDE
    add     r1, #7

    vld1.8  {d2,d3}, [r1]
    vext.8  q3, q1, q1, #1
    vext.8  q2, q1, q1, #2

    vrhadd.u8 q8, q1, q3

    vhadd.u8 q1, q2
    vrhadd.u8 q0, q1, q3

    vzip.8  d16, d0

    vext.8  d2, d0, d1, #6
    vext.8  d3, d0, d1, #4
    vst1.8  {d2}, [r0,:64], r12
    vext.8  d2, d0, d1, #2
    vst1.8  {d3}, [r0,:64], r12
    vst1.8  {d2}, [r0,:64], r12
    vext.8  d2, d16, d0, #6
    vst1.8  {d0}, [r0,:64], r12
    vext.8  d3, d16, d0, #4
    vst1.8  {d2}, [r0,:64], r12
    vext.8  d2, d16, d0, #2
    vst1.8  {d3}, [r0,:64], r12
    vst1.8  {d2}, [r0,:64], r12
    vst1.8  {d16}, [r0,:64], r12

    bx      lr
endfunc

function x264_predict_8x8_hu_neon
    mov     r12, #FDEC_STRIDE
    add     r1, #7
    vld1.8  {d7}, [r1]
    vdup.8  d6, d7[0]
    vrev64.8 d7, d7

    vext.8  d4, d7, d6, #2
    vext.8  d2, d7, d6, #1

    vhadd.u8 d16, d7, d4
    vrhadd.u8 d0, d2, d7
    vrhadd.u8 d1, d16, d2

    vzip.8  d0, d1

    vdup.16 q1, d1[3]

    vext.8  q2, q0, q1, #2
    vext.8  q3, q0, q1, #4
    vext.8  q8, q0, q1, #6
    vst1.8  {d0}, [r0,:64], r12
    vst1.8  {d4}, [r0,:64], r12
    vst1.8  {d6}, [r0,:64], r12
    vst1.8  {d16}, [r0,:64], r12

    vst1.8  {d1}, [r0,:64], r12
    vst1.8  {d5}, [r0,:64], r12
    vst1.8  {d7}, [r0,:64], r12
    vst1.8  {d17}, [r0,:64]
    bx      lr
endfunc
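
// 8x8 chroma DC prediction works per 4x4 quadrant.  Writing t01/t23 for
// the sums of the first/second four top pixels and l01/l23 for the left
// column, H.264 defines
//     dc0 = (t01 + l01 + 4) >> 3      dc1 = (t23 + 2) >> 2
//     dc2 = (l23 + 2) >> 2            dc3 = (t23 + l23 + 4) >> 3
// The _top and _left variants keep only the (sum + 2) >> 2 terms from
// their available edge; all three share the store loop at pred8x8_dc_end.
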
function x264_predict_8x8c_dc_top_neon
    sub     r2, r0, #FDEC_STRIDE
    mov     r1, #FDEC_STRIDE
    vld1.8  {d0}, [r2,:64]
    vpaddl.u8 d0, d0
    vpadd.u16 d0, d0, d0
    vrshrn.u16 d0, q0, #2
    vdup.8  d1, d0[1]
    vdup.8  d0, d0[0]
    vtrn.32 d0, d1
    b       pred8x8_dc_end
endfunc

function x264_predict_8x8c_dc_left_neon
    mov     r1, #FDEC_STRIDE
    sub     r2, r0, #1
    ldcol.8 d0, r2, r1
    vpaddl.u8 d0, d0
    vpadd.u16 d0, d0, d0
    vrshrn.u16 d0, q0, #2
    vdup.8  d1, d0[1]
    vdup.8  d0, d0[0]
    b       pred8x8_dc_end
endfunc

function x264_predict_8x8c_dc_neon
    sub     r2, r0, #FDEC_STRIDE
    mov     r1, #FDEC_STRIDE
    vld1.8  {d0}, [r2,:64]
    sub     r2, r0, #1
    ldcol.8 d1, r2, r1
    vtrn.32 d0, d1
    vpaddl.u8 q0, q0
    vpadd.u16 d0, d0, d1
    vpadd.u16 d1, d0, d0
    vrshrn.u16 d2, q0, #3
    vrshrn.u16 d3, q0, #2
    vdup.8  d0, d2[4]
    vdup.8  d1, d3[3]
    vdup.8  d4, d3[2]
    vdup.8  d5, d2[5]
    vtrn.32 q0, q2
pred8x8_dc_end:
    add     r2, r0, r1, lsl #2
.rept 4
    vst1.8  {d0}, [r0,:64], r1
    vst1.8  {d1}, [r2,:64], r1
.endr
    bx      lr
endfunc

function x264_predict_8x8c_h_neon
    sub     r1, r0, #1
    mov     ip, #FDEC_STRIDE
.rept 4
    vld1.8  {d0[]}, [r1], ip
    vld1.8  {d2[]}, [r1], ip
    vst1.64 {d0}, [r0,:64], ip
    vst1.64 {d2}, [r0,:64], ip
.endr
    bx      lr
endfunc

function x264_predict_8x8c_v_neon
    sub     r0, r0, #FDEC_STRIDE
    mov     ip, #FDEC_STRIDE
    vld1.64 {d0}, [r0,:64], ip
.rept 8
    vst1.64 {d0}, [r0,:64], ip
.endr
    bx      lr
endfunc

function x264_predict_8x8c_p_neon
    sub     r3, r0, #FDEC_STRIDE
    mov     r1, #FDEC_STRIDE
    add     r2, r3, #4
    sub     r3, r3, #1
    vld1.32 {d0[0]}, [r3]
    vld1.32 {d2[0]}, [r2,:32], r1
    ldcol.8 d0, r3, r1, 4, hi=1
    add     r3, r3, r1
    ldcol.8 d3, r3, r1, 4
    vaddl.u8 q8, d2, d3
    vrev32.8 d0, d0
    vtrn.32 d2, d3
    vsubl.u8 q2, d2, d0
    movrel  r3, p16weight
    vld1.16 {q0}, [r3,:128]
    vmul.s16 d4, d4, d0
    vmul.s16 d5, d5, d0
    vpadd.i16 d4, d4, d5
    vpaddl.s16 d4, d4
    vshl.i32 d5, d4, #4
    vadd.s32 d4, d4, d5
    vrshrn.s32 d4, q2, #5
    mov     r3, #0
    vtrn.16 d4, d5
    vadd.i16 d2, d4, d5
    vshl.i16 d3, d2, #2
    vrev64.16 d16, d16
    vsub.i16 d3, d3, d2
    vadd.i16 d16, d16, d0
    vshl.i16 d2, d16, #4
    vsub.i16 d2, d2, d3
    vshl.i16 d3, d4, #3
    vext.16 q0, q0, q0, #7
    vsub.i16 d6, d5, d3
    vmov.16 d0[0], r3
    vmul.i16 q0, q0, d4[0]
    vdup.16 q1, d2[0]
    vdup.16 q2, d4[0]
    vdup.16 q3, d6[0]
    vshl.i16 q2, q2, #3
    vadd.i16 q1, q1, q0
    vadd.i16 q3, q3, q2
    mov     r3, #8
1:
    vqshrun.s16 d0, q1, #5
    vadd.i16 q1, q1, q3
    vst1.8  {d0}, [r0,:64], r1
    subs    r3, r3, #1
    bne     1b
    bx      lr
endfunc
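
// Plane prediction (x264_predict_8x8c_p_neon above and
// x264_predict_16x16_p_neon at the end of this file) evaluates a linear
// gradient fitted to the edge pixels:
//     pred[y][x] = clip( (a + b*(x-3) + c*(y-3) + 16) >> 5 )   // 8x8c
//     pred[y][x] = clip( (a + b*(x-7) + c*(y-7) + 16) >> 5 )   // 16x16
// Here a is 16 times the sum of the last top and last left pixel, H and V
// weight the differences of opposing edge pixels by the p16weight
// constants, b = (17*H + 16) >> 5 for chroma or (5*H + 32) >> 6 for luma,
// and c is derived the same way from V.
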
function x264_predict_16x16_dc_top_neon
    sub     r2, r0, #FDEC_STRIDE
    mov     r1, #FDEC_STRIDE
    vld1.8  {q0}, [r2,:128]
    add16x8 q0, d0, d1, d0, d1
    vrshrn.u16 d0, q0, #4
    vdup.8  q0, d0[0]
    b       pred16x16_dc_end
endfunc

function x264_predict_16x16_dc_left_neon
    mov     r1, #FDEC_STRIDE
    sub     r2, r0, #1
    ldcol.8 d0, r2, r1
    ldcol.8 d1, r2, r1
    add16x8 q0, d0, d1, d0, d1
    vrshrn.u16 d0, q0, #4
    vdup.8  q0, d0[0]
    b       pred16x16_dc_end
endfunc

function x264_predict_16x16_dc_neon
    sub     r3, r0, #FDEC_STRIDE
    sub     r0, r0, #1
    vld1.64 {d0-d1}, [r3,:128]
    ldrb    ip, [r0], #FDEC_STRIDE
    vaddl.u8 q0, d0, d1
    ldrb    r1, [r0], #FDEC_STRIDE
    vadd.u16 d0, d0, d1
    vpadd.u16 d0, d0, d0
    vpadd.u16 d0, d0, d0
.rept 4
    ldrb    r2, [r0], #FDEC_STRIDE
    add     ip, ip, r1
    ldrb    r3, [r0], #FDEC_STRIDE
    add     ip, ip, r2
    ldrb    r1, [r0], #FDEC_STRIDE
    add     ip, ip, r3
.endr
    ldrb    r2, [r0], #FDEC_STRIDE
    add     ip, ip, r1
    ldrb    r3, [r0], #FDEC_STRIDE
    add     ip, ip, r2

    sub     r0, r0, #FDEC_STRIDE*16
    add     ip, ip, r3
    vdup.16 d1, ip
    vadd.u16 d0, d0, d1
    mov     r1, #FDEC_STRIDE
    add     r0, r0, #1
    vrshr.u16 d0, d0, #5
    vdup.8  q0, d0[0]
pred16x16_dc_end:
.rept 16
    vst1.64 {d0-d1}, [r0,:128], r1
.endr
    bx      lr
endfunc

function x264_predict_16x16_h_neon
    sub     r1, r0, #1
    mov     ip, #FDEC_STRIDE
.rept 8
    vld1.8  {d0[]}, [r1], ip
    vmov    d1, d0
    vld1.8  {d2[]}, [r1], ip
    vmov    d3, d2
    vst1.64 {d0-d1}, [r0,:128], ip
    vst1.64 {d2-d3}, [r0,:128], ip
.endr
    bx      lr
endfunc

function x264_predict_16x16_v_neon
    sub     r0, r0, #FDEC_STRIDE
    mov     ip, #FDEC_STRIDE
    vld1.64 {d0-d1}, [r0,:128], ip
.rept 16
    vst1.64 {d0-d1}, [r0,:128], ip
.endr
    bx      lr
endfunc

function x264_predict_16x16_p_neon
    sub     r3, r0, #FDEC_STRIDE
    mov     r1, #FDEC_STRIDE
    add     r2, r3, #8
    sub     r3, r3, #1
    vld1.8  {d0}, [r3]
    vld1.8  {d2}, [r2,:64], r1
    ldcol.8 d1, r3, r1
    add     r3, r3, r1
    ldcol.8 d3, r3, r1
    vrev64.8 q0, q0
    vaddl.u8 q8, d2, d3
    vsubl.u8 q2, d2, d0
    vsubl.u8 q3, d3, d1
    movrel  r3, p16weight
    vld1.8  {q0}, [r3,:128]
    vmul.s16 q2, q2, q0
    vmul.s16 q3, q3, q0
    vadd.i16 d4, d4, d5
    vadd.i16 d5, d6, d7
    vpadd.i16 d4, d4, d5
    vpadd.i16 d4, d4, d4
    vshll.s16 q3, d4, #2
    vaddw.s16 q2, q3, d4
    vrshrn.s32 d4, q2, #6
    mov     r3, #0
    vtrn.16 d4, d5
    vadd.i16 d2, d4, d5
    vshl.i16 d3, d2, #3
    vrev64.16 d16, d17
    vsub.i16 d3, d3, d2
    vadd.i16 d16, d16, d0
    vshl.i16 d2, d16, #4
    vsub.i16 d2, d2, d3
    vshl.i16 d3, d4, #4
    vext.16 q0, q0, q0, #7
    vsub.i16 d6, d5, d3
    vmov.16 d0[0], r3
    vmul.i16 q0, q0, d4[0]
    vdup.16 q1, d2[0]
    vdup.16 q2, d4[0]
    vdup.16 q3, d6[0]
    vshl.i16 q2, q2, #3
    vadd.i16 q1, q1, q0
    vadd.i16 q3, q3, q2
    mov     r3, #16
1:
    vqshrun.s16 d0, q1, #5
    vadd.i16 q1, q1, q2
    vqshrun.s16 d1, q1, #5
    vadd.i16 q1, q1, q3
    vst1.8  {q0}, [r0,:128], r1
    subs    r3, r3, #1
    bne     1b
    bx      lr
endfunc
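
// A rough C equivalent of the 16x16 plane loop above, for reference only
// (a sketch assuming clip() clamps to [0,255]; variable names are not
// taken from x264's predict.c):
//     int i00 = a - 7*b - 7*c + 16;
//     for( int y = 0; y < 16; y++, i00 += c )
//     {
//         int pix = i00;
//         for( int x = 0; x < 16; x++, pix += b )
//             src[x + y*FDEC_STRIDE] = clip( pix >> 5 );
//     }
// The NEON loop keeps a row of 16-bit accumulators in q1, adds 8*b (q2)
// between the two halves of each row and c - 8*b (q3) to advance to the
// next row, and lets vqshrun.s16 perform both the >>5 and the clip.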