/*****************************************************************************
 * mc.S: aarch64 motion compensation
 *****************************************************************************
 * Copyright (C) 2009-2014 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *          Mans Rullgard <mans@mansr.com>
 *          Stefan Groenroos <stefan.gronroos@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "asm.S"

// note: prefetch stuff assumes 64-byte cacheline

// void prefetch_ref( uint8_t *pix, intptr_t stride, int parity )
function x264_prefetch_ref_aarch64, export=1
    cmp         w2, #1
    csel        x2, xzr, x1, eq
    add         x0, x0, #64
    add         x0, x0, x2, lsl #3

    lsl         x2, x1, #1
    add         x3, x1, x1, lsl #1
    add         x4, x0, x1, lsl #2

    prfm        pldl1strm, [x0]
    prfm        pldl1strm, [x0, x1]
    prfm        pldl1strm, [x0, x2]
    prfm        pldl1strm, [x0, x3]
    prfm        pldl1strm, [x4]
    prfm        pldl1strm, [x4, x1]
    prfm        pldl1strm, [x4, x2]
    prfm        pldl1strm, [x4, x3]
    ret
endfunc

// void prefetch_fenc( uint8_t *pix_y,  intptr_t stride_y,
//                     uint8_t *pix_uv, intptr_t stride_uv, int mb_x )
.macro x264_prefetch_fenc sub
function x264_prefetch_fenc_\sub\()_aarch64, export=1
    and         w6, w5, #3
    and         w7, w5, #3
    mul         x6, x6, x1
    mul         x7, x7, x3
    add         x0, x0, #64
    add         x2, x2, #64

    add         x0, x0, x6, lsl #2
    add         x6, x0, x1, lsl #1
    prfm        pldl1strm, [x0]
    prfm        pldl1strm, [x0, x1]
    prfm        pldl1strm, [x6]
    prfm        pldl1strm, [x6, x1]

    add         x2, x2, x7, lsl #1
    prfm        pldl1strm, [x2]
    prfm        pldl1strm, [x2, x3]
.ifc \sub, 422
    add         x7, x2, x3, lsl #1
    prfm        pldl1strm, [x7]
    prfm        pldl1strm, [x7, x3]
.endif
    ret
endfunc
.endm

x264_prefetch_fenc 420
x264_prefetch_fenc 422

// void pixel_avg( uint8_t *dst,  intptr_t dst_stride,
//                 uint8_t *src1, intptr_t src1_stride,
//                 uint8_t *src2, intptr_t src2_stride, int weight );
.macro AVGH w h
function x264_pixel_avg_\w\()x\h\()_neon, export=1
    mov         w10, #64
    cmp         w6, #32
    mov         w9, #\h
    b.eq        pixel_avg_w\w\()_neon
    subs        w7, w10, w6
    b.lt        pixel_avg_weight_w\w\()_add_sub_neon    // weight > 64
    cmp         w6, #0
    b.ge        pixel_avg_weight_w\w\()_add_add_neon
    b           pixel_avg_weight_w\w\()_sub_add_neon    // weight < 0
endfunc
.endm

AVGH  4, 2
AVGH  4, 4
AVGH  4, 8
AVGH  4, 16
AVGH  8, 4
AVGH  8, 8
AVGH  8, 16
AVGH 16, 8
AVGH 16, 16

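// The weighted variants below all evaluate the same expression,
//   dst[x] = clip8( ( src1[x]*w + src2[x]*(64 - w) + 32 ) >> 6 )
// (the rounding shift and clip come from sqrshrun #6). Since umull/umlal/umlsl
// only take unsigned 8-bit factors, the weight pair is rearranged per range:
// add_add for 0 < w < 64, add_sub for w > 64 (second weight negated) and
// sub_add for w < 0 (first weight negated), so both factors stay non-negative.
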
// 0 < weight < 64
.macro load_weights_add_add
    mov         w6, w6
.endm
.macro weight_add_add dst, s1, s2, h=
.ifc \h, 2
    umull2      \dst, \s1, v30.16b
    umlal2      \dst, \s2, v31.16b
.else
    umull       \dst, \s1, v30.8b
    umlal       \dst, \s2, v31.8b
.endif
.endm

// weight > 64
.macro load_weights_add_sub
    neg         w7, w7
.endm
.macro weight_add_sub dst, s1, s2, h=
.ifc \h, 2
    umull2      \dst, \s1, v30.16b
    umlsl2      \dst, \s2, v31.16b
.else
    umull       \dst, \s1, v30.8b
    umlsl       \dst, \s2, v31.8b
.endif
.endm

// weight < 0
.macro load_weights_sub_add
    neg         w6, w6
.endm
.macro weight_sub_add dst, s1, s2, h=
.ifc \h, 2
    umull2      \dst, \s2, v31.16b
    umlsl2      \dst, \s1, v30.16b
.else
    umull       \dst, \s2, v31.8b
    umlsl       \dst, \s1, v30.8b
.endif
.endm

.macro AVG_WEIGHT ext
function pixel_avg_weight_w4_\ext\()_neon
    load_weights_\ext
    dup         v30.8b, w6
    dup         v31.8b, w7
1:  // height loop
    subs        w9, w9, #2
    ld1         {v0.s}[0], [x2], x3
    ld1         {v1.s}[0], [x4], x5
    weight_\ext v4.8h, v0.8b, v1.8b
    ld1         {v2.s}[0], [x2], x3
    ld1         {v3.s}[0], [x4], x5
    sqrshrun    v0.8b, v4.8h, #6
    weight_\ext v5.8h, v2.8b, v3.8b
    st1         {v0.s}[0], [x0], x1
    sqrshrun    v1.8b, v5.8h, #6
    st1         {v1.s}[0], [x0], x1
    b.gt        1b
    ret
endfunc

function pixel_avg_weight_w8_\ext\()_neon
    load_weights_\ext
    dup         v30.8b, w6
    dup         v31.8b, w7
1:  // height loop
    subs        w9, w9, #4
    ld1         {v0.8b}, [x2], x3
    ld1         {v1.8b}, [x4], x5
    weight_\ext v16.8h, v0.8b, v1.8b
    ld1         {v2.8b}, [x2], x3
    ld1         {v3.8b}, [x4], x5
    weight_\ext v17.8h, v2.8b, v3.8b
    ld1         {v4.8b}, [x2], x3
    ld1         {v5.8b}, [x4], x5
    weight_\ext v18.8h, v4.8b, v5.8b
    ld1         {v6.8b}, [x2], x3
    ld1         {v7.8b}, [x4], x5
    weight_\ext v19.8h, v6.8b, v7.8b
    sqrshrun    v0.8b, v16.8h, #6
    sqrshrun    v1.8b, v17.8h, #6
    sqrshrun    v2.8b, v18.8h, #6
    sqrshrun    v3.8b, v19.8h, #6
    st1         {v0.8b}, [x0], x1
    st1         {v1.8b}, [x0], x1
    st1         {v2.8b}, [x0], x1
    st1         {v3.8b}, [x0], x1
    b.gt        1b
    ret
endfunc

function pixel_avg_weight_w16_\ext\()_neon
    load_weights_\ext
    dup         v30.16b, w6
    dup         v31.16b, w7
1:  // height loop
    subs        w9, w9, #2
    ld1         {v0.16b}, [x2], x3
    ld1         {v1.16b}, [x4], x5
    weight_\ext v16.8h, v0.8b,  v1.8b
    weight_\ext v17.8h, v0.16b, v1.16b, 2
    ld1         {v2.16b}, [x2], x3
    ld1         {v3.16b}, [x4], x5
    weight_\ext v18.8h, v2.8b,  v3.8b
    weight_\ext v19.8h, v2.16b, v3.16b, 2
    sqrshrun    v0.8b,  v16.8h, #6
    sqrshrun    v1.8b,  v18.8h, #6
    sqrshrun2   v0.16b, v17.8h, #6
    sqrshrun2   v1.16b, v19.8h, #6
    st1         {v0.16b}, [x0], x1
    st1         {v1.16b}, [x0], x1
    b.gt        1b
    ret
endfunc
.endm

AVG_WEIGHT add_add
AVG_WEIGHT add_sub
AVG_WEIGHT sub_add

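// Unweighted case (weight == 32, taken via the b.eq in AVGH): a plain rounding
// average, dst[x] = ( src1[x] + src2[x] + 1 ) >> 1, done with urhadd.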
function pixel_avg_w4_neon
1:  subs        w9, w9, #2
    ld1         {v0.s}[0], [x2], x3
    ld1         {v2.s}[0], [x4], x5
    urhadd      v0.8b, v0.8b, v2.8b
    ld1         {v1.s}[0], [x2], x3
    ld1         {v3.s}[0], [x4], x5
    urhadd      v1.8b, v1.8b, v3.8b
    st1         {v0.s}[0], [x0], x1
    st1         {v1.s}[0], [x0], x1
    b.gt        1b
    ret
endfunc

function pixel_avg_w8_neon
1:  subs        w9, w9, #4
    ld1         {v0.8b}, [x2], x3
    ld1         {v1.8b}, [x4], x5
    ld1         {v2.8b}, [x2], x3
    urhadd      v0.8b, v0.8b, v1.8b
    ld1         {v3.8b}, [x4], x5
    st1         {v0.8b}, [x0], x1
    ld1         {v4.8b}, [x2], x3
    urhadd      v1.8b, v2.8b, v3.8b
    ld1         {v5.8b}, [x4], x5
    st1         {v1.8b}, [x0], x1
    ld1         {v6.8b}, [x2], x3
    ld1         {v7.8b}, [x4], x5
    urhadd      v2.8b, v4.8b, v5.8b
    urhadd      v3.8b, v6.8b, v7.8b
    st1         {v2.8b}, [x0], x1
    st1         {v3.8b}, [x0], x1
    b.gt        1b
    ret
endfunc

function pixel_avg_w16_neon
1:  subs        w9, w9, #4
    ld1         {v0.16b}, [x2], x3
    ld1         {v1.16b}, [x4], x5
    ld1         {v2.16b}, [x2], x3
    urhadd      v0.16b, v0.16b, v1.16b
    ld1         {v3.16b}, [x4], x5
    st1         {v0.16b}, [x0], x1
    ld1         {v4.16b}, [x2], x3
    urhadd      v1.16b, v2.16b, v3.16b
    ld1         {v5.16b}, [x4], x5
    st1         {v1.16b}, [x0], x1
    ld1         {v6.16b}, [x2], x3
    ld1         {v7.16b}, [x4], x5
    urhadd      v2.16b, v4.16b, v5.16b
    urhadd      v3.16b, v6.16b, v7.16b
    st1         {v2.16b}, [x0], x1
    st1         {v3.16b}, [x0], x1
    b.gt        1b
    ret
endfunc

function x264_pixel_avg2_w4_neon, export=1
1:
    subs        w5, w5, #2
    ld1         {v0.s}[0], [x2], x3
    ld1         {v2.s}[0], [x4], x3
    urhadd      v0.8b, v0.8b, v2.8b
    ld1         {v1.s}[0], [x2], x3
    ld1         {v3.s}[0], [x4], x3
    urhadd      v1.8b, v1.8b, v3.8b
    st1         {v0.s}[0], [x0], x1
    st1         {v1.s}[0], [x0], x1
    b.gt        1b
    ret
endfunc

function x264_pixel_avg2_w8_neon, export=1
1:
    subs        w5, w5, #2
    ld1         {v0.8b}, [x2], x3
    ld1         {v2.8b}, [x4], x3
    urhadd      v0.8b, v0.8b, v2.8b
    ld1         {v1.8b}, [x2], x3
    ld1         {v3.8b}, [x4], x3
    urhadd      v1.8b, v1.8b, v3.8b
    st1         {v0.8b}, [x0], x1
    st1         {v1.8b}, [x0], x1
    b.gt        1b
    ret
endfunc

function x264_pixel_avg2_w16_neon, export=1
1:
    subs        w5, w5, #2
    ld1         {v0.16b}, [x2], x3
    ld1         {v2.16b}, [x4], x3
    urhadd      v0.16b, v0.16b, v2.16b
    ld1         {v1.16b}, [x2], x3
    ld1         {v3.16b}, [x4], x3
    urhadd      v1.16b, v1.16b, v3.16b
    st1         {v0.16b}, [x0], x1
    st1         {v1.16b}, [x0], x1
    b.gt        1b
    ret
endfunc

function x264_pixel_avg2_w20_neon, export=1
    sub         x1, x1, #16
1:
    subs        w5, w5, #2
    ld1         {v0.16b,v1.16b}, [x2], x3
    ld1         {v2.16b,v3.16b}, [x4], x3
    urhadd      v0.16b, v0.16b, v2.16b
    urhadd      v1.8b,  v1.8b,  v3.8b
    ld1         {v4.16b,v5.16b}, [x2], x3
    ld1         {v6.16b,v7.16b}, [x4], x3
    urhadd      v4.16b, v4.16b, v6.16b
    urhadd      v5.8b,  v5.8b,  v7.8b
    st1         {v0.16b}, [x0], #16
    st1         {v1.s}[0], [x0], x1
    st1         {v4.16b}, [x0], #16
    st1         {v5.s}[0], [x0], x1
    b.gt        1b
    ret
endfunc

.macro weight_prologue type
    mov         w9, w5                  // height
.ifc \type, full
    ldr         w12, [x4, #32]          // denom
.endif
    ldp         w4, w5, [x4, #32+4]     // scale, offset
    dup         v0.16b, w4
    dup         v1.8h, w5
.ifc \type, full
    neg         w12, w12
    dup         v2.8h, w12
.endif
.endm

// void mc_weight( uint8_t *src, intptr_t src_stride, uint8_t *dst,
//                 intptr_t dst_stride, const x264_weight_t *weight, int h )
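// The "full" variants compute
//   dst[x] = clip8( ( ( src[x]*scale + (1 << (denom-1)) ) >> denom ) + offset )
// with v0 = scale, v1 = offset and v2 = -denom so that srshl performs the
// rounding right shift; the nodenom variants further down skip the shift and
// reduce to dst[x] = clip8( src[x]*scale + offset ).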
function x264_mc_weight_w20_neon, export=1
    weight_prologue full
    sub         x1, x1, #16
1:
    subs        w9, w9, #2
    ld1         {v16.8b,v17.8b,v18.8b}, [x2], x3
    ld1         {v19.8b,v20.8b,v21.8b}, [x2], x3
    umull       v22.8h, v16.8b, v0.8b
    umull       v23.8h, v17.8b, v0.8b
    zip1        v18.2s, v18.2s, v21.2s
    umull       v25.8h, v19.8b, v0.8b
    umull       v26.8h, v20.8b, v0.8b
    umull       v24.8h, v18.8b, v0.8b
    srshl       v22.8h, v22.8h, v2.8h
    srshl       v23.8h, v23.8h, v2.8h
    srshl       v24.8h, v24.8h, v2.8h
    srshl       v25.8h, v25.8h, v2.8h
    srshl       v26.8h, v26.8h, v2.8h
    add         v22.8h, v22.8h, v1.8h
    add         v23.8h, v23.8h, v1.8h
    add         v24.8h, v24.8h, v1.8h
    add         v25.8h, v25.8h, v1.8h
    add         v26.8h, v26.8h, v1.8h
    sqxtun      v4.8b,  v22.8h
    sqxtun2     v4.16b, v23.8h
    sqxtun      v6.8b,  v24.8h
    sqxtun      v5.8b,  v25.8h
    sqxtun2     v5.16b, v26.8h
    st1         {v4.16b}, [x0], #16
    st1         {v6.s}[0], [x0], x1
    st1         {v5.16b}, [x0], #16
    st1         {v6.s}[1], [x0], x1
    b.gt        1b
    ret
endfunc

function x264_mc_weight_w16_neon, export=1
    weight_prologue full
weight16_loop:
1:
    subs        w9, w9, #2
    ld1         {v4.16b}, [x2], x3
    ld1         {v5.16b}, [x2], x3
    umull       v22.8h, v4.8b,  v0.8b
    umull2      v23.8h, v4.16b, v0.16b
    umull       v24.8h, v5.8b,  v0.8b
    umull2      v25.8h, v5.16b, v0.16b
    srshl       v22.8h, v22.8h, v2.8h
    srshl       v23.8h, v23.8h, v2.8h
    srshl       v24.8h, v24.8h, v2.8h
    srshl       v25.8h, v25.8h, v2.8h
    add         v22.8h, v22.8h, v1.8h
    add         v23.8h, v23.8h, v1.8h
    add         v24.8h, v24.8h, v1.8h
    add         v25.8h, v25.8h, v1.8h
    sqxtun      v4.8b,  v22.8h
    sqxtun2     v4.16b, v23.8h
    sqxtun      v5.8b,  v24.8h
    sqxtun2     v5.16b, v25.8h
    st1         {v4.16b}, [x0], x1
    st1         {v5.16b}, [x0], x1
    b.gt        1b
    ret
endfunc

function x264_mc_weight_w8_neon, export=1
    weight_prologue full
1:
    subs        w9, w9, #2
    ld1         {v16.8b}, [x2], x3
    ld1         {v17.8b}, [x2], x3
    umull       v4.8h, v16.8b, v0.8b
    umull       v5.8h, v17.8b, v0.8b
    srshl       v4.8h, v4.8h, v2.8h
    srshl       v5.8h, v5.8h, v2.8h
    add         v4.8h, v4.8h, v1.8h
    add         v5.8h, v5.8h, v1.8h
    sqxtun      v16.8b, v4.8h
    sqxtun      v17.8b, v5.8h
    st1         {v16.8b}, [x0], x1
    st1         {v17.8b}, [x0], x1
    b.gt        1b
    ret
endfunc

function x264_mc_weight_w4_neon, export=1
    weight_prologue full
1:
    subs        w9, w9, #2
    ld1         {v16.s}[0], [x2], x3
    ld1         {v16.s}[1], [x2], x3
    umull       v4.8h, v16.8b, v0.8b
    srshl       v4.8h, v4.8h, v2.8h
    add         v4.8h, v4.8h, v1.8h
    sqxtun      v16.8b, v4.8h
    st1         {v16.s}[0], [x0], x1
    st1         {v16.s}[1], [x0], x1
    b.gt        1b
    ret
endfunc

function x264_mc_weight_w20_nodenom_neon, export=1
    weight_prologue nodenom
    sub         x1, x1, #16
1:
    subs        w9, w9, #2
    ld1         {v16.8b,v17.8b,v18.8b}, [x2], x3
    mov         v27.16b, v1.16b
    mov         v28.16b, v1.16b
    ld1         {v19.8b,v20.8b,v21.8b}, [x2], x3
    mov         v31.16b, v1.16b
    mov         v29.16b, v1.16b
    mov         v30.16b, v1.16b
    zip1        v18.2s, v18.2s, v21.2s
    umlal       v27.8h, v16.8b, v0.8b
    umlal       v28.8h, v17.8b, v0.8b
    umlal       v31.8h, v18.8b, v0.8b
    umlal       v29.8h, v19.8b, v0.8b
    umlal       v30.8h, v20.8b, v0.8b
    sqxtun      v4.8b,  v27.8h
    sqxtun2     v4.16b, v28.8h
    sqxtun      v5.8b,  v29.8h
    sqxtun2     v5.16b, v30.8h
    sqxtun      v6.8b,  v31.8h
    st1         {v4.16b}, [x0], #16
    st1         {v6.s}[0], [x0], x1
    st1         {v5.16b}, [x0], #16
    st1         {v6.s}[1], [x0], x1
    b.gt        1b
    ret
endfunc

function x264_mc_weight_w16_nodenom_neon, export=1
    weight_prologue nodenom
1:
    subs        w9, w9, #2
    ld1         {v6.16b}, [x2], x3
    mov         v27.16b, v1.16b
    mov         v28.16b, v1.16b
    ld1         {v7.16b}, [x2], x3
    mov         v29.16b, v1.16b
    mov         v30.16b, v1.16b
    umlal       v27.8h, v6.8b,  v0.8b
    umlal2      v28.8h, v6.16b, v0.16b
    umlal       v29.8h, v7.8b,  v0.8b
    umlal2      v30.8h, v7.16b, v0.16b
    sqxtun      v4.8b,  v27.8h
    sqxtun2     v4.16b, v28.8h
    sqxtun      v5.8b,  v29.8h
    sqxtun2     v5.16b, v30.8h
    st1         {v4.16b}, [x0], x1
    st1         {v5.16b}, [x0], x1
    b.gt        1b
    ret
endfunc

function x264_mc_weight_w8_nodenom_neon, export=1
    weight_prologue nodenom
1:
    subs        w9, w9, #2
    ld1         {v16.8b}, [x2], x3
    mov         v27.16b, v1.16b
    ld1         {v17.8b}, [x2], x3
    mov         v29.16b, v1.16b
    umlal       v27.8h, v16.8b, v0.8b
    umlal       v29.8h, v17.8b, v0.8b
    sqxtun      v4.8b, v27.8h
    sqxtun      v5.8b, v29.8h
    st1         {v4.8b}, [x0], x1
    st1         {v5.8b}, [x0], x1
    b.gt        1b
    ret
endfunc

function x264_mc_weight_w4_nodenom_neon, export=1
    weight_prologue nodenom
1:
    subs        w9, w9, #2
    ld1         {v16.s}[0], [x2], x3
    ld1         {v16.s}[1], [x2], x3
    mov         v27.16b, v1.16b
    umlal       v27.8h, v16.8b, v0.8b
    sqxtun      v4.8b, v27.8h
    st1         {v4.s}[0], [x0], x1
    st1         {v4.s}[1], [x0], x1
    b.gt        1b
    ret
endfunc

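// Offset-only weighting: when the weight reduces to a pure pixel offset the
// whole operation collapses to a saturating add or subtract of that constant,
// dst[x] = clip8( src[x] +/- offset ), which is what uqadd/uqsub provide below.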
.macro weight_simple_prologue
    ldr         w6, [x4]                // offset
    dup         v1.16b, w6
.endm

.macro weight_simple name op
function x264_mc_weight_w20_\name\()_neon, export=1
    weight_simple_prologue
1:
    subs        w5, w5, #2
    ldr         s18, [x2, #16]
    ld1         {v16.16b}, [x2], x3
    ldr         s19, [x2, #16]
    ld1         {v17.16b}, [x2], x3
    \op         v18.8b,  v18.8b,  v1.8b
    \op         v16.16b, v16.16b, v1.16b
    \op         v19.8b,  v19.8b,  v1.8b
    \op         v17.16b, v17.16b, v1.16b
    str         s18, [x0, #16]
    st1         {v16.16b}, [x0], x1
    str         s19, [x0, #16]
    st1         {v17.16b}, [x0], x1
    b.gt        1b
    ret
endfunc

function x264_mc_weight_w16_\name\()_neon, export=1
    weight_simple_prologue
1:
    subs        w5, w5, #2
    ld1         {v16.16b}, [x2], x3
    ld1         {v17.16b}, [x2], x3
    \op         v16.16b, v16.16b, v1.16b
    \op         v17.16b, v17.16b, v1.16b
    st1         {v16.16b}, [x0], x1
    st1         {v17.16b}, [x0], x1
    b.gt        1b
    ret
endfunc

function x264_mc_weight_w8_\name\()_neon, export=1
    weight_simple_prologue
1:
    subs        w5, w5, #2
    ld1         {v16.8b}, [x2], x3
    ld1         {v17.8b}, [x2], x3
    \op         v16.8b, v16.8b, v1.8b
    \op         v17.8b, v17.8b, v1.8b
    st1         {v16.8b}, [x0], x1
    st1         {v17.8b}, [x0], x1
    b.gt        1b
    ret
endfunc

function x264_mc_weight_w4_\name\()_neon, export=1
    weight_simple_prologue
1:
    subs        w5, w5, #2
    ld1         {v16.s}[0], [x2], x3
    ld1         {v16.s}[1], [x2], x3
    \op         v16.8b, v16.8b, v1.8b
    st1         {v16.s}[0], [x0], x1
    st1         {v16.s}[1], [x0], x1
    b.gt        1b
    ret
endfunc
.endm

weight_simple offsetadd, uqadd
weight_simple offsetsub, uqsub


// void mc_copy( uint8_t *dst, intptr_t dst_stride, uint8_t *src, intptr_t src_stride, int height )
function x264_mc_copy_w4_neon, export=1
1:
    subs        w4, w4, #4
    ld1         {v0.s}[0], [x2], x3
    ld1         {v1.s}[0], [x2], x3
    ld1         {v2.s}[0], [x2], x3
    ld1         {v3.s}[0], [x2], x3
    st1         {v0.s}[0], [x0], x1
    st1         {v1.s}[0], [x0], x1
    st1         {v2.s}[0], [x0], x1
    st1         {v3.s}[0], [x0], x1
    b.gt        1b
    ret
endfunc

function x264_mc_copy_w8_neon, export=1
1:  subs        w4, w4, #4
    ld1         {v0.8b}, [x2], x3
    ld1         {v1.8b}, [x2], x3
    ld1         {v2.8b}, [x2], x3
    ld1         {v3.8b}, [x2], x3
    st1         {v0.8b}, [x0], x1
    st1         {v1.8b}, [x0], x1
    st1         {v2.8b}, [x0], x1
    st1         {v3.8b}, [x0], x1
    b.gt        1b
    ret
endfunc

function x264_mc_copy_w16_neon, export=1
1:  subs        w4, w4, #4
    ld1         {v0.16b}, [x2], x3
    ld1         {v1.16b}, [x2], x3
    ld1         {v2.16b}, [x2], x3
    ld1         {v3.16b}, [x2], x3
    st1         {v0.16b}, [x0], x1
    st1         {v1.16b}, [x0], x1
    st1         {v2.16b}, [x0], x1
    st1         {v3.16b}, [x0], x1
    b.gt        1b
    ret
endfunc

// void x264_mc_chroma_neon( uint8_t *dst_u, uint8_t *dst_v,
//                           intptr_t i_dst_stride,
//                           uint8_t *src, intptr_t i_src_stride,
//                           int dx, int dy, int i_width, int i_height );
function x264_mc_chroma_neon, export=1
    ldr         w15, [sp]               // height
    sbfx        x12, x6, #3, #29        // asr(3) and sign extend
    sbfx        x11, x5, #3, #29        // asr(3) and sign extend
    cmp         w7, #4
    mul         x12, x12, x4
    add         x3, x3, x11, lsl #1

    and         w5, w5, #7
    and         w6, w6, #7

    add         x3, x3, x12

    //pld [x3]
    //pld [x3, x4]

    b.gt        mc_chroma_w8_neon
    b.eq        mc_chroma_w4_neon
endfunc

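// Chroma MC is a bilinear blend of the four neighbouring U/V samples with
// 1/8-pel weights taken from the low three bits of the mv (dx = d8x, dy = d8y):
//   cA = (8-dx)*(8-dy)   cB = dx*(8-dy)   cC = (8-dx)*dy   cD = dx*dy
//   dst[x] = ( cA*s[x] + cB*s[x+1] + cC*s[x+stride] + cD*s[x+stride+1] + 32 ) >> 6
// The source is interleaved U/V (NV12-style), so everything below works on
// U/V pairs and writes dst_u and dst_v separately.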
.macro CHROMA_MC_START r00, r01, r10, r11
    mul         w12, w5, w6             // cD = d8x *d8y
    lsl         w13, w5, #3
    add         w9, w12, #64
    lsl         w14, w6, #3
    tst         w12, w12
    sub         w9, w9, w13
    sub         w10, w13, w12           // cB = d8x *(8-d8y);
    sub         w11, w14, w12           // cC = (8-d8x)*d8y
    sub         w9, w9, w14             // cA = (8-d8x)*(8-d8y);
.endm

.macro CHROMA_MC width, vsize
function mc_chroma_w\width\()_neon
// since the element size varies, there's a different index for the 2nd store
.if \width == 4
    .set st2, 1
.else
    .set st2, 2
.endif
    CHROMA_MC_START
    b.eq        2f

    ld2         {v28.8b,v29.8b}, [x3], x4
    dup         v0.8b, w9               // cA
    dup         v1.8b, w10              // cB

    ext         v6.8b, v28.8b, v6.8b, #1
    ext         v7.8b, v29.8b, v7.8b, #1

    ld2         {v30.8b,v31.8b}, [x3], x4
    dup         v2.8b, w11              // cC
    dup         v3.8b, w12              // cD

    ext         v22.8b, v30.8b, v22.8b, #1
    ext         v23.8b, v31.8b, v23.8b, #1

    trn1        v0.2s, v0.2s, v1.2s
    trn1        v2.2s, v2.2s, v3.2s

    trn1        v4.2s,  v28.2s, v6.2s
    trn1        v5.2s,  v29.2s, v7.2s
    trn1        v20.2s, v30.2s, v22.2s
    trn1        v21.2s, v31.2s, v23.2s
1:  // height loop, interpolate xy
    subs        w15, w15, #2
    umull       v16.8h, v4.8b, v0.8b
    umlal       v16.8h, v20.8b, v2.8b
    umull       v17.8h, v5.8b, v0.8b
    umlal       v17.8h, v21.8b, v2.8b

    ld2         {v28.8b,v29.8b}, [x3], x4
    transpose   v24.2d, v25.2d, v16.2d, v17.2d

    ext         v6.8b, v28.8b, v6.8b, #1
    ext         v7.8b, v29.8b, v7.8b, #1

    trn1        v4.2s, v28.2s, v6.2s
    trn1        v5.2s, v29.2s, v7.2s

    add         v16.8h, v24.8h, v25.8h

    umull       v18.8h, v20.8b, v0.8b
    umlal       v18.8h, v4.8b, v2.8b
    umull       v19.8h, v21.8b, v0.8b
    umlal       v19.8h, v5.8b, v2.8b

    ld2         {v30.8b,v31.8b}, [x3], x4
    transpose   v26.2d, v27.2d, v18.2d, v19.2d

    ext         v22.8b, v30.8b, v22.8b, #1
    ext         v23.8b, v31.8b, v23.8b, #1
    trn1        v20.2s, v30.2s, v22.2s
    trn1        v21.2s, v31.2s, v23.2s

    add         v17.8h, v26.8h, v27.8h

    rshrn       v16.8b, v16.8h, #6
    rshrn       v17.8b, v17.8h, #6

    //pld [x3]
    //pld [x3, x4]

    st1         {v16.\vsize}[0], [x0], x2
    st1         {v16.\vsize}[st2], [x1], x2
    st1         {v17.\vsize}[0], [x0], x2
    st1         {v17.\vsize}[st2], [x1], x2
    b.gt        1b

    ret
2:  // dx or dy are 0
    tst         w11, w11
    add         w10, w10, w11
    dup         v0.8b, w9
    dup         v1.8b, w10

    b.eq        4f

    ld1         {v4.8b}, [x3], x4
    ld1         {v6.8b}, [x3], x4
3:  // vertical interpolation loop
    subs        w15, w15, #2
    umull       v16.8h, v4.8b, v0.8b
    ld1         {v4.8b}, [x3], x4
    umlal       v16.8h, v6.8b, v1.8b
    umull       v17.8h, v6.8b, v0.8b
    ld1         {v6.8b}, [x3], x4
    umlal       v17.8h, v4.8b, v1.8b

    rshrn       v20.8b, v16.8h, #6      // uvuvuvuv
    rshrn       v21.8b, v17.8h, #6      // uvuvuvuv

    uzp1        v16.8b, v20.8b, v21.8b  // d16=uuuu|uuuu, d17=vvvv|vvvv
    uzp2        v17.8b, v20.8b, v21.8b  // d16=uuuu|uuuu, d17=vvvv|vvvv

    //pld [x3]
    //pld [x3, x4]

    st1         {v16.\vsize}[0], [x0], x2
    st1         {v16.\vsize}[st2], [x0], x2
    st1         {v17.\vsize}[0], [x1], x2
    st1         {v17.\vsize}[st2], [x1], x2
    b.gt        3b

    ret

4:  // dy is 0
    ld1         {v4.8b,v5.8b}, [x3], x4
    ld1         {v6.8b,v7.8b}, [x3], x4

    ext         v5.8b, v4.8b, v5.8b, #2
    ext         v7.8b, v6.8b, v7.8b, #2
5:  // horizontal interpolation loop
    subs        w15, w15, #2
    umull       v16.8h, v4.8b, v0.8b
    umlal       v16.8h, v5.8b, v1.8b
    umull       v17.8h, v6.8b, v0.8b
    umlal       v17.8h, v7.8b, v1.8b

    ld1         {v4.8b,v5.8b}, [x3], x4
    ld1         {v6.8b,v7.8b}, [x3], x4
    rshrn       v20.8b, v16.8h, #6
    rshrn       v21.8b, v17.8h, #6
    ext         v5.8b, v4.8b, v5.8b, #2
    ext         v7.8b, v6.8b, v7.8b, #2
    uzp1        v16.8b, v20.8b, v21.8b  // d16=uuuu|uuuu, d17=vvvv|vvvv
    uzp2        v17.8b, v20.8b, v21.8b  // d16=uuuu|uuuu, d17=vvvv|vvvv

    //pld [x3]
    //pld [x3, x4]

    st1         {v16.\vsize}[0], [x0], x2
    st1         {v16.\vsize}[st2], [x0], x2
    st1         {v17.\vsize}[0], [x1], x2
    st1         {v17.\vsize}[st2], [x1], x2
    b.gt        5b

    ret
endfunc
.endm

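// Instantiate the 2- and 4-pixel-wide versions. st2 is the lane index that
// addresses byte offset 4 of the result register for the given element size
// (.h lane 2, .s lane 1); that is where the second half of each packed result
// row lives.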
CHROMA_MC 2, h
CHROMA_MC 4, s

function mc_chroma_w8_neon
    CHROMA_MC_START
    b.eq        2f
    ld2         {v4.16b,v5.16b}, [x3], x4
    ld2         {v20.16b,v21.16b}, [x3], x4
    dup         v0.8b, w9               // cA
    dup         v1.8b, w10              // cB

    ext         v6.16b, v4.16b, v4.16b, #1
    ext         v7.16b, v5.16b, v5.16b, #1

    dup         v2.8b, w11              // cC
    dup         v3.8b, w12              // cD

    ext         v22.16b, v20.16b, v20.16b, #1
    ext         v23.16b, v21.16b, v21.16b, #1

1:  // height loop, interpolate xy
    subs        w15, w15, #2
    umull       v16.8h, v4.8b, v0.8b
    umlal       v16.8h, v6.8b, v1.8b
    umlal       v16.8h, v20.8b, v2.8b
    umlal       v16.8h, v22.8b, v3.8b

    umull       v17.8h, v5.8b, v0.8b
    umlal       v17.8h, v7.8b, v1.8b
    umlal       v17.8h, v21.8b, v2.8b
    umlal       v17.8h, v23.8b, v3.8b

    ld2         {v4.16b,v5.16b}, [x3], x4

    ext         v6.16b, v4.16b, v4.16b, #1
    ext         v7.16b, v5.16b, v5.16b, #1

    umull       v18.8h, v20.8b, v0.8b
    umlal       v18.8h, v22.8b, v1.8b
    umlal       v18.8h, v4.8b, v2.8b
    umlal       v18.8h, v6.8b, v3.8b

    umull       v19.8h, v21.8b, v0.8b
    umlal       v19.8h, v23.8b, v1.8b
    umlal       v19.8h, v5.8b, v2.8b
    umlal       v19.8h, v7.8b, v3.8b

    ld2         {v20.16b,v21.16b}, [x3], x4

    rshrn       v16.8b, v16.8h, #6
    rshrn       v17.8b, v17.8h, #6
    rshrn       v18.8b, v18.8h, #6
    rshrn       v19.8b, v19.8h, #6

    ext         v22.16b, v20.16b, v20.16b, #1
    ext         v23.16b, v21.16b, v21.16b, #1

    //pld [x3]
    //pld [x3, x4]

    st1         {v16.8b}, [x0], x2
    st1         {v17.8b}, [x1], x2
    st1         {v18.8b}, [x0], x2
    st1         {v19.8b}, [x1], x2
    b.gt        1b

    ret
2:  // dx or dy are 0
    tst         w11, w11
    add         w10, w10, w11
    dup         v0.8b, w9
    dup         v1.8b, w10

    b.eq        4f

    ld2         {v4.8b,v5.8b}, [x3], x4
    ld2         {v6.8b,v7.8b}, [x3], x4
3:  // vertical interpolation loop
    subs        w15, w15, #2
    umull       v16.8h, v4.8b, v0.8b    //U
    umlal       v16.8h, v6.8b, v1.8b
    umull       v17.8h, v5.8b, v0.8b    //V
    umlal       v17.8h, v7.8b, v1.8b

    ld2         {v4.8b,v5.8b}, [x3], x4

    umull       v18.8h, v6.8b, v0.8b
    umlal       v18.8h, v4.8b, v1.8b
    umull       v19.8h, v7.8b, v0.8b
    umlal       v19.8h, v5.8b, v1.8b

    ld2         {v6.8b,v7.8b}, [x3], x4

    rshrn       v16.8b, v16.8h, #6
    rshrn       v17.8b, v17.8h, #6
    rshrn       v18.8b, v18.8h, #6
    rshrn       v19.8b, v19.8h, #6

    //pld [x3]
    //pld [x3, x4]

    st1         {v16.8b}, [x0], x2
    st1         {v17.8b}, [x1], x2
    st1         {v18.8b}, [x0], x2
    st1         {v19.8b}, [x1], x2
    b.gt        3b

    ret
4:  // dy is 0
    ld2         {v4.16b,v5.16b}, [x3], x4
    ext         v6.16b, v4.16b, v4.16b, #1
    ext         v7.16b, v5.16b, v5.16b, #1
    ld2         {v20.16b,v21.16b}, [x3], x4
    ext         v22.16b, v20.16b, v20.16b, #1
    ext         v23.16b, v21.16b, v21.16b, #1
5:  // horizontal interpolation loop
    subs        w15, w15, #2
    umull       v16.8h, v4.8b, v0.8b    //U
    umlal       v16.8h, v6.8b, v1.8b
    umull       v17.8h, v5.8b, v0.8b    //V
    umlal       v17.8h, v7.8b, v1.8b

    ld2         {v4.16b,v5.16b}, [x3], x4

    umull       v18.8h, v20.8b, v0.8b
    umlal       v18.8h, v22.8b, v1.8b
    umull       v19.8h, v21.8b, v0.8b
    umlal       v19.8h, v23.8b, v1.8b

    ld2         {v20.16b,v21.16b}, [x3], x4

    rshrn       v16.8b, v16.8h, #6
    rshrn       v17.8b, v17.8h, #6
    rshrn       v18.8b, v18.8h, #6
    rshrn       v19.8b, v19.8h, #6

    ext         v6.16b, v4.16b, v4.16b, #1
    ext         v7.16b, v5.16b, v5.16b, #1
    ext         v22.16b, v20.16b, v20.16b, #1
    ext         v23.16b, v21.16b, v21.16b, #1

    //pld [x3]
    //pld [x3, x4]

    st1         {v16.8b}, [x0], x2
    st1         {v17.8b}, [x1], x2
    st1         {v18.8b}, [x0], x2
    st1         {v19.8b}, [x1], x2
    b.gt        5b

    ret
endfunc

// void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
//                   intptr_t stride, int width, int height, int16_t *buf )
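// Produces the three half-pel planes with the H.264 6-tap filter
// (1,-5,20,20,-5,1): dsth and dstv are filtered in one direction and rounded
// with >>5 (sqrshrun #5); dstc applies the filter to the 16-bit vertical sums
// as well, folding the horizontal pass into (a-5*b+20*c)/16 (see the comments
// in the inner loop) so it stays within 16 bits before the final sqrshrun #6.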
function x264_hpel_filter_neon, export=1
    ubfm        x9, x3, #0, #3
    add         w15, w5, w9
    sub         x13, x3, x9             // align src
    sub         x10, x0, x9
    sub         x11, x1, x9
    sub         x12, x2, x9
    movi        v30.16b, #5
    movi        v31.16b, #20
1:  // line start
    mov         x3, x13
    mov         x2, x12
    mov         x1, x11
    mov         x0, x10
    add         x7, x3, #16             // src pointer next 16b for horiz filter
    mov         x5, x15                 // restore width
    sub         x3, x3, x4, lsl #1      // src - 2*stride
    ld1         {v28.16b}, [x7], #16    // src[16:31]

    add         x9, x3, x5              // holds src - 2*stride + width

    ld1         {v16.16b}, [x3], x4     // src-2*stride[0:15]
    ld1         {v17.16b}, [x3], x4     // src-1*stride[0:15]
    ld1         {v18.16b}, [x3], x4     // src+0*stride[0:15]
    ld1         {v19.16b}, [x3], x4     // src+1*stride[0:15]
    ld1         {v20.16b}, [x3], x4     // src+2*stride[0:15]
    ld1         {v21.16b}, [x3], x4     // src+3*stride[0:15]

    ext         v22.16b, v7.16b, v18.16b, #14
    uaddl       v1.8h, v16.8b, v21.8b
    ext         v26.16b, v18.16b, v28.16b, #3
    umlsl       v1.8h, v17.8b, v30.8b
    ext         v23.16b, v7.16b, v18.16b, #15
    umlal       v1.8h, v18.8b, v31.8b
    ext         v24.16b, v18.16b, v28.16b, #1
    umlal       v1.8h, v19.8b, v31.8b
    ext         v25.16b, v18.16b, v28.16b, #2
    umlsl       v1.8h, v20.8b, v30.8b
2:  // next 16 pixel of line
    subs        x5, x5, #16
    sub         x3, x9, x5              // src - 2*stride += 16

    uaddl       v4.8h,  v22.8b,  v26.8b
    uaddl2      v5.8h,  v22.16b, v26.16b
    sqrshrun    v6.8b, v1.8h, #5
    umlsl       v4.8h,  v23.8b,  v30.8b
    umlsl2      v5.8h,  v23.16b, v30.16b
    umlal       v4.8h,  v18.8b,  v31.8b
    umlal2      v5.8h,  v18.16b, v31.16b
    umlal       v4.8h,  v24.8b,  v31.8b
    umlal2      v5.8h,  v24.16b, v31.16b
    umlsl       v4.8h,  v25.8b,  v30.8b
    umlsl2      v5.8h,  v25.16b, v30.16b

    uaddl2      v2.8h,  v16.16b, v21.16b
    sqrshrun    v4.8b, v4.8h, #5
    mov         v7.16b, v18.16b
    sqrshrun2   v4.16b, v5.8h, #5

    umlsl2      v2.8h,  v17.16b, v30.16b
    ld1         {v16.16b}, [x3], x4     // src-2*stride[0:15]
    umlal2      v2.8h,  v18.16b, v31.16b
    ld1         {v17.16b}, [x3], x4     // src-1*stride[0:15]
    umlal2      v2.8h,  v19.16b, v31.16b
    ld1         {v18.16b}, [x3], x4     // src+0*stride[0:15]
    umlsl2      v2.8h,  v20.16b, v30.16b
    ld1         {v19.16b}, [x3], x4     // src+1*stride[0:15]
    st1         {v4.16b}, [x0], #16
    sqrshrun2   v6.16b, v2.8h, #5
    ld1         {v20.16b}, [x3], x4     // src+2*stride[0:15]
    ld1         {v21.16b}, [x3], x4     // src+3*stride[0:15]

    ext         v22.16b, v0.16b, v1.16b, #12
    ext         v26.16b, v1.16b, v2.16b, #6
    ext         v23.16b, v0.16b, v1.16b, #14
    st1         {v6.16b}, [x1], #16
    uaddl       v3.8h,  v16.8b, v21.8b
    ext         v25.16b, v1.16b, v2.16b, #4
    umlsl       v3.8h,  v17.8b, v30.8b
    ext         v24.16b, v1.16b, v2.16b, #2

    umlal       v3.8h,  v18.8b, v31.8b
    add         v4.8h,  v22.8h, v26.8h
    umlal       v3.8h,  v19.8b, v31.8b
    add         v5.8h,  v23.8h, v25.8h
    umlsl       v3.8h,  v20.8b, v30.8b
    add         v6.8h,  v24.8h, v1.8h

    ext         v22.16b, v1.16b, v2.16b, #12
    ext         v26.16b, v2.16b, v3.16b, #6
    ext         v23.16b, v1.16b, v2.16b, #14
    ext         v25.16b, v2.16b, v3.16b, #4
    ext         v24.16b, v2.16b, v3.16b, #2

    add         v22.8h, v22.8h, v26.8h
    add         v23.8h, v23.8h, v25.8h
    add         v24.8h, v24.8h, v2.8h

    sub         v4.8h,  v4.8h,  v5.8h   // a-b
    sub         v5.8h,  v5.8h,  v6.8h   // b-c

    sub         v22.8h, v22.8h, v23.8h  // a-b
    sub         v23.8h, v23.8h, v24.8h  // b-c

    sshr        v4.8h,  v4.8h,  #2      // (a-b)/4
    sshr        v22.8h, v22.8h, #2      // (a-b)/4
    sub         v4.8h,  v4.8h,  v5.8h   // (a-b)/4-b+c
    sub         v22.8h, v22.8h, v23.8h  // (a-b)/4-b+c
    sshr        v4.8h,  v4.8h,  #2      // ((a-b)/4-b+c)/4
    sshr        v22.8h, v22.8h, #2      // ((a-b)/4-b+c)/4
    add         v4.8h,  v4.8h,  v6.8h   // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
    add         v22.8h, v22.8h, v24.8h  // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16

    sqrshrun    v4.8b, v4.8h, #6
    ld1         {v28.16b}, [x7], #16    // src[16:31]
    mov         v0.16b, v2.16b
    ext         v23.16b, v7.16b, v18.16b, #15
    sqrshrun2   v4.16b, v22.8h, #6
    mov         v1.16b, v3.16b
    ext         v22.16b, v7.16b, v18.16b, #14
    ext         v24.16b, v18.16b, v28.16b, #1
    ext         v25.16b, v18.16b, v28.16b, #2
    ext         v26.16b, v18.16b, v28.16b, #3

    st1         {v4.16b}, [x2], #16
    b.gt        2b

    subs        w6, w6, #1
    add         x10, x10, x4
    add         x11, x11, x4
    add         x12, x12, x4
    add         x13, x13, x4
    b.gt        1b

    ret
endfunc

// frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth,
//                         uint8_t *dstv, uint8_t *dstc, intptr_t src_stride,
//                         intptr_t dst_stride, int width, int height )
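// Each output plane is a half-resolution image built from nested rounding
// averages of a 2x2 block, ((a+b+1)>>1 + (c+d+1)>>1 + 1)>>1 via urhadd; the
// four planes (dst0/dsth/dstv/dstc) sample that block at the four half-pel
// phases +0/+0, +1/+0, +0/+1 and +1/+1 in full-resolution pixels.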
function x264_frame_init_lowres_core_neon, export=1
    ldr         w8, [sp]
    sub         x10, x6, w7, uxtw       // dst_stride - width
    and         x10, x10, #~15

1:
    mov         w9, w7                  // width
    mov         x11, x0                 // src0
    add         x12, x0, x5             // src1 = src0 + src_stride
    add         x13, x0, x5, lsl #1     // src2 = src1 + src_stride

    ld2         {v0.16b,v1.16b}, [x11], #32
    ld2         {v2.16b,v3.16b}, [x12], #32
    ld2         {v4.16b,v5.16b}, [x13], #32

    urhadd      v20.16b, v0.16b, v2.16b // s0[2x] + s1[2x]
    urhadd      v22.16b, v2.16b, v4.16b // s1[2x] + s2[2x]
2:
    subs        w9, w9, #16
    urhadd      v21.16b, v1.16b, v3.16b // s0[2x+1] + s1[2x+1]
    urhadd      v23.16b, v3.16b, v5.16b // s1[2x+1] + s2[2x+1]

    ld2         {v0.16b,v1.16b}, [x11], #32
    ld2         {v2.16b,v3.16b}, [x12], #32
    ld2         {v4.16b,v5.16b}, [x13], #32
    urhadd      v30.16b, v0.16b, v2.16b // loop: s0[2x] + s1[2x]
    urhadd      v31.16b, v2.16b, v4.16b // loop: s1[2x] + s2[2x]
    ext         v24.16b, v20.16b, v30.16b, #1   // s0[2x+2] + s1[2x+2]
    ext         v25.16b, v22.16b, v31.16b, #1   // s1[2x+2] + s2[2x+2]

    urhadd      v16.16b, v20.16b, v21.16b
    urhadd      v18.16b, v22.16b, v23.16b
    urhadd      v17.16b, v21.16b, v24.16b
    urhadd      v19.16b, v23.16b, v25.16b

    st1         {v16.16b}, [x1], #16
    st1         {v18.16b}, [x3], #16
    st1         {v17.16b}, [x2], #16
    st1         {v19.16b}, [x4], #16
    b.le        3f

    subs        w9, w9, #16
    urhadd      v21.16b, v1.16b, v3.16b // s0[2x+1] + s1[2x+1]
    urhadd      v23.16b, v3.16b, v5.16b // s1[2x+1] + s2[2x+1]

    ld2         {v0.16b,v1.16b}, [x11], #32
    ld2         {v2.16b,v3.16b}, [x12], #32
    ld2         {v4.16b,v5.16b}, [x13], #32
    urhadd      v20.16b, v0.16b, v2.16b // loop: s0[2x] + s1[2x]
    urhadd      v22.16b, v2.16b, v4.16b // loop: s1[2x] + s2[2x]
    ext         v24.16b, v30.16b, v20.16b, #1   // s0[2x+2] + s1[2x+2]
    ext         v25.16b, v31.16b, v22.16b, #1   // s1[2x+2] + s2[2x+2]

    urhadd      v16.16b, v30.16b, v21.16b
    urhadd      v18.16b, v31.16b, v23.16b
    urhadd      v17.16b, v21.16b, v24.16b
    urhadd      v19.16b, v23.16b, v25.16b

    st1         {v16.16b}, [x1], #16
    st1         {v18.16b}, [x3], #16
    st1         {v17.16b}, [x2], #16
    st1         {v19.16b}, [x4], #16
    b.gt        2b
3:
    subs        w8, w8, #1
    add         x0, x0, x5, lsl #1
    add         x1, x1, x10
    add         x2, x2, x10
    add         x3, x3, x10
    add         x4, x4, x10
    b.gt        1b

    ret
endfunc

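// Split interleaved U/V reference rows into the separate U and V halves of the
// fenc/fdec chroma buffer: ld2 deinterleaves a row into one U and one V vector,
// which are then stored back to back with a FENC_STRIDE/2 or FDEC_STRIDE/2
// stride.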
function x264_load_deinterleave_chroma_fenc_neon, export=1
    mov         x4, #FENC_STRIDE/2
    b           load_deinterleave_chroma
endfunc

function x264_load_deinterleave_chroma_fdec_neon, export=1
    mov         x4, #FDEC_STRIDE/2
load_deinterleave_chroma:
    ld2         {v0.8b,v1.8b}, [x1], x2
    ld2         {v2.8b,v3.8b}, [x1], x2
    subs        w3, w3, #2
    st1         {v0.8b}, [x0], x4
    st1         {v1.8b}, [x0], x4
    st1         {v2.8b}, [x0], x4
    st1         {v3.8b}, [x0], x4
    b.gt        load_deinterleave_chroma

    ret
endfunc

function x264_plane_copy_deinterleave_neon, export=1
    add         w9, w6, #15
    and         w9, w9, #0xfffffff0
    sub         x1, x1, x9
    sub         x3, x3, x9
    sub         x5, x5, x9, lsl #1
1:
    ld2         {v0.16b,v1.16b}, [x4], #32
    subs        w9, w9, #16
    st1         {v0.16b}, [x0], #16
    st1         {v1.16b}, [x2], #16
    b.gt        1b

    add         x4, x4, x5
    subs        w7, w7, #1
    add         x0, x0, x1
    add         x2, x2, x3
    mov         w9, w6
    b.gt        1b

    ret
endfunc

.macro deinterleave_rgb
    subs        x11, x11, #8
    st1         {v0.8b}, [x0], #8
    st1         {v1.8b}, [x2], #8
    st1         {v2.8b}, [x4], #8
    b.gt        1b

    subs        w10, w10, #1
    add         x0, x0, x1
    add         x2, x2, x3
    add         x4, x4, x5
    add         x6, x6, x7
    mov         x11, x9
    b.gt        1b
.endm

function x264_plane_copy_deinterleave_rgb_neon, export=1
#if SYS_MACOSX
    ldr         w8, [sp]
    ldp         w9, w10, [sp, #4]
#else
    ldr         x8, [sp]
    ldp         x9, x10, [sp, #8]
#endif
    cmp         w8, #3
    uxtw        x9, w9
    add         x11, x9, #7
    and         x11, x11, #~7
    sub         x1, x1, x11
    sub         x3, x3, x11
    sub         x5, x5, x11
    b.ne        4f
    sub         x7, x7, x11, lsl #1
    sub         x7, x7, x11
1:
    ld3         {v0.8b,v1.8b,v2.8b}, [x6], #24
    deinterleave_rgb

    ret
4:
    sub         x7, x7, x11, lsl #2
1:
    ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [x6], #32
    deinterleave_rgb

    ret
endfunc

function x264_plane_copy_interleave_neon, export=1
    add         w9, w6, #15
    and         w9, w9, #0xfffffff0
    sub         x1, x1, x9, lsl #1
    sub         x3, x3, x9
    sub         x5, x5, x9
1:
    ld1         {v0.16b}, [x2], #16
    ld1         {v1.16b}, [x4], #16
    subs        w9, w9, #16
    st2         {v0.16b,v1.16b}, [x0], #32
    b.gt        1b

    subs        w7, w7, #1
    add         x0, x0, x1
    add         x2, x2, x3
    add         x4, x4, x5
    mov         w9, w6
    b.gt        1b

    ret
endfunc

function x264_store_interleave_chroma_neon, export=1
    mov         x5, #FDEC_STRIDE
1:
    ld1         {v0.8b}, [x2], x5
    ld1         {v1.8b}, [x3], x5
    ld1         {v2.8b}, [x2], x5
    ld1         {v3.8b}, [x3], x5
    subs        w4, w4, #2
    zip1        v4.16b, v0.16b, v1.16b
    zip1        v5.16b, v2.16b, v3.16b
    st1         {v4.16b}, [x0], x1
    st1         {v5.16b}, [x0], x1
    b.gt        1b

    ret
endfunc