/*****************************************************************************
 * predict.S: aarch64 intra prediction
 *****************************************************************************
 * Copyright (C) 2009-2021 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Mans Rullgard <mans@mansr.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "asm.S"

// Plane-prediction weight tables ({1..4,1..4} and {1..8}), loaded as .8h
// vectors by the _p predictors below.
const p8weight, align=4
        .short      1, 2, 3, 4, 1, 2, 3, 4
endconst
const p16weight, align=4
        .short      1, 2, 3, 4, 5, 6, 7, 8
endconst

// Gather a column of \n bytes (stride \xm) from [\xn] into lanes of \vd.
// n=8 fills lanes 0-7; n=4 with hi=0 fills lanes 0-3, hi=1 fills lanes 4-7.
// \xn is advanced by \xm per byte loaded.
.macro ldcol.8  vd,  xn,  xm,  n=8,  hi=0
.if \n == 8 || \hi == 0
        ld1        {\vd\().b}[0],  [\xn],  \xm
        ld1        {\vd\().b}[1],  [\xn],  \xm
        ld1        {\vd\().b}[2],  [\xn],  \xm
        ld1        {\vd\().b}[3],  [\xn],  \xm
.endif
.if \n == 8 || \hi == 1
        ld1        {\vd\().b}[4],  [\xn],  \xm
        ld1        {\vd\().b}[5],  [\xn],  \xm
        ld1        {\vd\().b}[6],  [\xn],  \xm
        ld1        {\vd\().b}[7],  [\xn],  \xm
.endif
.endm

// Gather a column of 16 bytes (stride \xm) from [\xn] into \vd.
.macro ldcol.16  vd,  xn,  xm
        ldcol.8     \vd,  \xn,  \xm
        ld1        {\vd\().b}[ 8],  [\xn],  \xm
        ld1        {\vd\().b}[ 9],  [\xn],  \xm
        ld1        {\vd\().b}[10],  [\xn],  \xm
        ld1        {\vd\().b}[11],  [\xn],  \xm
        ld1        {\vd\().b}[12],  [\xn],  \xm
        ld1        {\vd\().b}[13],  [\xn],  \xm
        ld1        {\vd\().b}[14],  [\xn],  \xm
        ld1        {\vd\().b}[15],  [\xn],  \xm
.endm


// 4x4 horizontal: replicate each row's left neighbour across the row.
// x0 = dst (top-left of the 4x4 block in the FDEC buffer)
function predict_4x4_h_aarch64, export=1
    ldurb       w1,  [x0, #0*FDEC_STRIDE-1]
    mov         w5,  #0x01010101            // byte * w5 splats it into 4 lanes
    ldrb        w2,  [x0, #1*FDEC_STRIDE-1]
    ldrb        w3,  [x0, #2*FDEC_STRIDE-1]
    mul         w1,  w1,  w5
    ldrb        w4,  [x0, #3*FDEC_STRIDE-1]
    mul         w2,  w2,  w5
    str         w1,  [x0, #0*FDEC_STRIDE]
    mul         w3,  w3,  w5
    str         w2,  [x0, #1*FDEC_STRIDE]
    mul         w4,  w4,  w5
    str         w3,  [x0, #2*FDEC_STRIDE]
    str         w4,  [x0, #3*FDEC_STRIDE]
    ret
endfunc

// 4x4 vertical: copy the 4 bytes above the block into every row.
function predict_4x4_v_aarch64, export=1
    ldur        w1,  [x0, #0 - 1 * FDEC_STRIDE]
    str         w1,  [x0, #0 + 0 * FDEC_STRIDE]
    str         w1,  [x0, #0 + 1 * FDEC_STRIDE]
    str         w1,  [x0, #0 + 2 * FDEC_STRIDE]
    str         w1,  [x0, #0 + 3 * FDEC_STRIDE]
    ret
endfunc

// 4x4 DC: dc = (sum(4 top) + sum(4 left) + 4) >> 3, splatted over the block.
function predict_4x4_dc_neon, export=1
    sub         x1,  x0,  #FDEC_STRIDE
    ldurb       w4,  [x0, #-1 + 0 * FDEC_STRIDE]
    ldrb        w5,  [x0, #-1 + 1 * FDEC_STRIDE]
    ldrb        w6,  [x0, #-1 + 2 * FDEC_STRIDE]
    ldrb        w7,  [x0, #-1 + 3 * FDEC_STRIDE]
    add         w4,  w4,  w5
    ldr         s0, [x1]                    // 4 top bytes (upper lanes zero)
    add         w6,  w6,  w7
    uaddlv      h0,  v0.8b                  // sum of top row
    add         w4,  w4,  w6                // sum of left column
    dup         v0.4h,  v0.h[0]
    dup         v1.4h,  w4
    add         v0.4h,  v0.4h,  v1.4h
    rshrn       v0.8b,  v0.8h,  #3          // (top + left + 4) >> 3
    str         s0, [x0]
    str         s0, [x0, #1 * FDEC_STRIDE]
    str         s0, [x0, #2 * FDEC_STRIDE]
    str         s0, [x0, #3 * FDEC_STRIDE]
    ret
endfunc

// 4x4 DC (top only): dc = (sum(4 top) + 2) >> 2.
function predict_4x4_dc_top_neon, export=1
    sub         x1,  x0,  #FDEC_STRIDE
    ldr         s0, [x1]
    uaddlv      h0,  v0.8b
    dup         v0.4h,  v0.h[0]
    rshrn       v0.8b,  v0.8h,  #2
    str         s0, [x0]
    str         s0, [x0, #1 * FDEC_STRIDE]
    str         s0, [x0, #2 * FDEC_STRIDE]
    str         s0, [x0, #3 * FDEC_STRIDE]
    ret                                     // fixed: a second, unreachable ret followed here
endfunc

// 4x4 down-right diagonal: 3-tap (1,2,1) filter over top+left edge, then
// shift the filtered vector down one byte per row.
function predict_4x4_ddr_neon, export=1
    sub         x1,  x0,  #FDEC_STRIDE+1
    mov         x7,  #FDEC_STRIDE
    ld1        {v0.8b},  [x1], x7           // #-FDEC_STRIDE-1 (top-left + top)
    ld1r       {v1.8b},  [x1], x7           // #0*FDEC_STRIDE-1
    ld1r       {v2.8b},  [x1], x7           // #1*FDEC_STRIDE-1
    ext         v0.8b,  v1.8b,  v0.8b,  #7
    ld1r       {v3.8b},  [x1], x7           // #2*FDEC_STRIDE-1
    ext         v0.8b,  v2.8b,  v0.8b,  #7  // a
    ld1r       {v4.8b},  [x1], x7           // #3*FDEC_STRIDE-1
    ext         v1.8b,  v3.8b,  v0.8b,  #7  // b
    ext         v2.8b,  v4.8b,  v1.8b,  #7  // c
    uaddl       v0.8h,  v0.8b,  v1.8b
    uaddl       v1.8h,  v1.8b,  v2.8b
    add         v0.8h,  v0.8h,  v1.8h       // a + 2b + c
    rshrn       v0.8b,  v0.8h,  #2

    ext         v3.8b,  v0.8b,  v0.8b,  #3
    ext         v2.8b,  v0.8b,  v0.8b,  #2
    ext         v1.8b,  v0.8b,  v0.8b,  #1

    str         s3, [x0], #FDEC_STRIDE
    str         s2, [x0], #FDEC_STRIDE
    str         s1, [x0], #FDEC_STRIDE
    str         s0, [x0]
    ret
endfunc

// 4x4 down-left diagonal: (1,2,1) filter over the 8 top bytes (last byte
// duplicated), then shift left one byte per row.
function predict_4x4_ddl_neon, export=1
    sub         x0,  x0,  #FDEC_STRIDE
    mov         x7,  #FDEC_STRIDE
    ld1        {v0.8b},  [x0], x7
    dup         v3.8b,  v0.b[7]             // replicate rightmost top byte
    ext         v1.8b,  v0.8b,  v0.8b,  #1
    ext         v2.8b,  v0.8b,  v3.8b,  #2
    uhadd       v0.8b,  v0.8b,  v2.8b
    urhadd      v0.8b,  v0.8b,  v1.8b       // (p[i] + 2p[i+1] + p[i+2] + 2) >> 2
    str         s0, [x0], #FDEC_STRIDE
    ext         v1.8b,  v0.8b,  v0.8b,  #1
    ext         v2.8b,  v0.8b,  v0.8b,  #2
    str         s1, [x0], #FDEC_STRIDE
    ext         v3.8b,  v0.8b,  v0.8b,  #3
    str         s2, [x0], #FDEC_STRIDE
    str         s3, [x0]
    ret
endfunc

// 8x8 DC from the filtered edge array in x1 (x1[7..14] = left, x1[16..23]
// = top — layout produced by x264's 8x8 edge filter; verify against caller).
function predict_8x8_dc_neon, export=1
    mov         x7,  #FDEC_STRIDE
    ld1        {v0.16b}, [x1], #16
    ld1        {v1.8b},  [x1]               // top 8
    ext         v0.16b, v0.16b, v0.16b, #7  // rotate left 8 pixels into low half
    uaddlv      h1,  v1.8b
    uaddlv      h0,  v0.8b
    add         v0.8h,  v0.8h,  v1.8h
    dup         v0.8h,  v0.h[0]
    rshrn       v0.8b,  v0.8h,  #4          // (left + top + 8) >> 4
.rept 8
    st1        {v0.8b},  [x0], x7
.endr
    ret
endfunc

// 8x8 horizontal: row i = edge[14 - i] splatted (left pixels stored
// bottom-up in the edge array).
function predict_8x8_h_neon, export=1
    mov         x7,  #FDEC_STRIDE
    ld1        {v16.16b}, [x1]
    dup         v0.8b,  v16.b[14]
    dup         v1.8b,  v16.b[13]
    st1        {v0.8b},  [x0], x7
    dup         v2.8b,  v16.b[12]
    st1        {v1.8b},  [x0], x7
    dup         v3.8b,  v16.b[11]
    st1        {v2.8b},  [x0], x7
    dup         v4.8b,  v16.b[10]
    st1        {v3.8b},  [x0], x7
    dup         v5.8b,  v16.b[9]
    st1        {v4.8b},  [x0], x7
    dup         v6.8b,  v16.b[8]
    st1        {v5.8b},  [x0], x7
    dup         v7.8b,  v16.b[7]
    st1        {v6.8b},  [x0], x7
    st1        {v7.8b},  [x0], x7
    ret
endfunc

// 8x8 vertical: copy the 8 filtered top bytes (edge+16) into every row.
function predict_8x8_v_neon, export=1
    add         x1,  x1,  #16
    mov         x7,  #FDEC_STRIDE
    ld1        {v0.8b},  [x1]
.rept 8
    st1        {v0.8b},  [x0], x7
.endr
    ret
endfunc

// 8x8 down-left: (1,2,1) filter over the 16 top bytes, then slide the
// window one byte left per row.
function predict_8x8_ddl_neon, export=1
    add         x1,  x1,  #16
    mov         x7,  #FDEC_STRIDE
    ld1        {v0.16b}, [x1]
    movi        v3.16b, #0
    dup         v2.16b, v0.b[15]
    ext         v4.16b, v3.16b, v0.16b, #15
    ext         v2.16b, v0.16b, v2.16b, #1
    uhadd       v4.16b, v4.16b, v2.16b
    urhadd      v0.16b, v0.16b, v4.16b
    ext         v1.16b, v0.16b, v0.16b, #1
    ext         v2.16b, v0.16b, v0.16b, #2
    st1        {v1.8b},  [x0], x7
    ext         v3.16b, v0.16b, v0.16b, #3
    st1        {v2.8b},  [x0], x7
    ext         v4.16b, v0.16b, v0.16b, #4
    st1        {v3.8b},  [x0], x7
    ext         v5.16b, v0.16b, v0.16b, #5
    st1        {v4.8b},  [x0], x7
    ext         v6.16b, v0.16b, v0.16b, #6
    st1        {v5.8b},  [x0], x7
    ext         v7.16b, v0.16b, v0.16b, #7
    st1        {v6.8b},  [x0], x7
    ext         v0.16b, v0.16b, v0.16b, #8
    st1        {v7.8b},  [x0], x7
    st1        {v0.8b},  [x0], x7
    ret
endfunc

// 8x8 down-right: (1,2,1) filter across left+topleft+top, emit rows from
// the bottom up, sliding one byte per row.
function predict_8x8_ddr_neon, export=1
    ld1        {v0.16b, v1.16b}, [x1]
    ext         v2.16b, v0.16b, v1.16b, #7
    ext         v4.16b, v0.16b, v1.16b, #9
    ext         v3.16b, v0.16b, v1.16b, #8

    uhadd       v2.16b, v2.16b, v4.16b
    urhadd      v7.16b, v3.16b, v2.16b

    add         x0,  x0,  #7*FDEC_STRIDE    // start at last row, walk upwards
    mov         x7,  #-1*FDEC_STRIDE

    ext         v6.16b, v7.16b, v7.16b, #1
    st1        {v7.8b},  [x0], x7
    ext         v5.16b, v7.16b, v7.16b, #2
    st1        {v6.8b},  [x0], x7
    ext         v4.16b, v7.16b, v7.16b, #3
    st1        {v5.8b},  [x0], x7
    ext         v3.16b, v7.16b, v7.16b, #4
    st1        {v4.8b},  [x0], x7
    ext         v2.16b, v7.16b, v7.16b, #5
    st1        {v3.8b},  [x0], x7
    ext         v1.16b, v7.16b, v7.16b, #6
    st1        {v2.8b},  [x0], x7
    ext         v0.16b, v7.16b, v7.16b, #7
    st1        {v1.8b},  [x0], x7
    st1        {v0.8b},  [x0], x7
    ret
endfunc

// 8x8 vertical-left: alternate rows come from the 2-tap average (v3) and
// the 3-tap average (v0), each advancing half a pixel per row pair.
// NOTE(review): the first two ext ops read lanes of v1/v2 that were never
// written; those lanes do not reach any stored output lane (upstream does
// the same).
function predict_8x8_vl_neon, export=1
    add         x1,  x1,  #16
    mov         x7,  #FDEC_STRIDE

    ld1        {v0.16b}, [x1]
    ext         v1.16b, v1.16b, v0.16b, #15
    ext         v2.16b, v0.16b, v2.16b, #1

    uhadd       v1.16b, v1.16b, v2.16b
    urhadd      v3.16b, v0.16b, v2.16b      // 2-tap rows

    urhadd      v0.16b, v0.16b, v1.16b      // 3-tap rows

    ext         v4.16b, v0.16b, v0.16b, #1
    st1        {v3.8b},  [x0], x7
    ext         v5.16b, v3.16b, v3.16b, #1
    st1        {v4.8b},  [x0], x7
    ext         v6.16b, v0.16b, v0.16b, #2
    st1        {v5.8b},  [x0], x7
    ext         v7.16b, v3.16b, v3.16b, #2
    st1        {v6.8b},  [x0], x7
    ext         v4.16b, v0.16b, v0.16b, #3
    st1        {v7.8b},  [x0], x7
    ext         v5.16b, v3.16b, v3.16b, #3
    st1        {v4.8b},  [x0], x7
    ext         v6.16b, v0.16b, v0.16b, #4
    st1        {v5.8b},  [x0], x7
    st1        {v6.8b},  [x0], x7
    ret
endfunc

// 8x8 vertical-right: interleaved 2-tap/3-tap averages; odd rows shift in
// filtered left-edge pixels one at a time.
function predict_8x8_vr_neon, export=1
    add         x1,  x1,  #8
    mov         x7,  #FDEC_STRIDE
    ld1        {v2.16b}, [x1]

    ext         v1.16b, v2.16b, v2.16b, #14
    ext         v0.16b, v2.16b, v2.16b, #15

    uhadd       v3.16b, v2.16b, v1.16b
    urhadd      v2.16b, v2.16b, v0.16b      // 2-tap averages
    urhadd      v0.16b, v0.16b, v3.16b      // 3-tap averages

    ext         v1.16b, v2.16b, v2.16b, #8
    uzp1        v2.8b,  v0.8b,  v0.8b       // even-index 3-tap results
    uzp2        v3.8b,  v0.8b,  v0.8b       // odd-index 3-tap results
    ext         v0.16b, v0.16b, v0.16b, #8

    st1        {v1.8b},  [x0], x7
    st1        {v0.8b},  [x0], x7
    ext         v4.8b,  v3.8b,  v1.8b,  #7
    ext         v5.8b,  v2.8b,  v0.8b,  #7
    st1        {v4.8b},  [x0], x7
    st1        {v5.8b},  [x0], x7
    ext         v6.8b,  v3.8b,  v1.8b,  #6
    ext         v7.8b,  v2.8b,  v0.8b,  #6
    st1        {v6.8b},  [x0], x7
    st1        {v7.8b},  [x0], x7
    ext         v1.8b,  v3.8b,  v1.8b,  #5
    ext         v0.8b,  v2.8b,  v0.8b,  #5
    st1        {v1.8b},  [x0], x7
    st1        {v0.8b},  [x0], x7
    ret
endfunc

// 8x8 horizontal-down: zip 2-tap and 3-tap filtered left-edge values, then
// emit overlapping windows, two new values per row.
function predict_8x8_hd_neon, export=1
    add         x1,  x1,  #7
    mov         x7,  #FDEC_STRIDE

    ld1        {v1.16b}, [x1]
    ext         v3.16b, v1.16b, v1.16b, #1
    ext         v2.16b, v1.16b, v1.16b, #2

    urhadd      v4.16b, v1.16b, v3.16b      // 2-tap averages

    uhadd       v1.16b, v1.16b, v2.16b
    urhadd      v0.16b, v1.16b, v3.16b      // 3-tap averages

    zip1        v16.8b, v4.8b,  v0.8b       // interleave 2-tap/3-tap pairs
    zip2        v17.8b, v4.8b,  v0.8b
    ext         v7.16b, v0.16b, v0.16b, #8

    ext         v0.8b,  v17.8b, v7.8b,  #6
    ext         v1.8b,  v17.8b, v7.8b,  #4
    st1        {v0.8b},  [x0], x7
    ext         v2.8b,  v17.8b, v7.8b,  #2
    st1        {v1.8b},  [x0], x7
    st1        {v2.8b},  [x0], x7
    ext         v3.8b,  v16.8b, v17.8b, #6
    st1        {v17.8b}, [x0], x7
    ext         v4.8b,  v16.8b, v17.8b, #4
    st1        {v3.8b},  [x0], x7
    ext         v5.8b,  v16.8b, v17.8b, #2
    st1        {v4.8b},  [x0], x7
    st1        {v5.8b},  [x0], x7
    st1        {v16.8b}, [x0], x7

    ret
endfunc

// 8x8 horizontal-up: like hd but walking up the (reversed) left edge, with
// the final rows padded by the last pixel.
function predict_8x8_hu_neon, export=1
    add         x1,  x1,  #7
    mov         x7,  #FDEC_STRIDE
    ld1        {v7.8b},  [x1]
    dup         v6.8b,  v7.b[0]             // pad with bottom-most pixel
    rev64       v7.8b,  v7.8b

    ext         v4.8b,  v7.8b,  v6.8b,  #2
    ext         v2.8b,  v7.8b,  v6.8b,  #1

    uhadd       v5.8b,  v7.8b,  v4.8b
    urhadd      v0.8b,  v2.8b,  v7.8b       // 2-tap averages
    urhadd      v1.8b,  v5.8b,  v2.8b       // 3-tap averages

    zip1        v16.8b, v0.8b,  v1.8b
    zip2        v17.8b, v0.8b,  v1.8b

    dup         v18.4h, v17.h[3]            // replicate last pair for padding

    ext         v0.8b,  v16.8b, v17.8b, #2
    ext         v1.8b,  v16.8b, v17.8b, #4
    ext         v2.8b,  v16.8b, v17.8b, #6
    st1        {v16.8b}, [x0], x7
    st1        {v0.8b},  [x0], x7
    st1        {v1.8b},  [x0], x7
    st1        {v2.8b},  [x0], x7

    ext         v4.8b,  v17.8b, v18.8b, #2
    ext         v5.8b,  v17.8b, v18.8b, #4
    ext         v6.8b,  v17.8b, v18.8b, #6
    st1        {v17.8b}, [x0], x7
    st1        {v4.8b},  [x0], x7
    st1        {v5.8b},  [x0], x7
    st1        {v6.8b},  [x0]
    ret
endfunc


// 8x8 chroma DC (top only): per-4-column DC of the top row.
function predict_8x8c_dc_top_neon, export=1
    sub         x2,  x0,  #FDEC_STRIDE
    mov         x1,  #FDEC_STRIDE
    ld1        {v0.8b},  [x2]
    uaddlp      v0.4h,  v0.8b
    addp        v0.4h,  v0.4h,  v0.4h       // sums of left/right 4 top pixels
    rshrn       v0.8b,  v0.8h,  #2
    dup         v3.8b,  v0.b[1]
    dup         v2.8b,  v0.b[0]
    transpose   v0.2s,  v1.2s,  v2.2s,  v3.2s
    b           pred8x8c_dc_end
endfunc

// 8x8 chroma DC (left only): per-4-row DC of the left column.
function predict_8x8c_dc_left_neon, export=1
    ldurb       w2,  [x0, #0 * FDEC_STRIDE - 1]
    ldrb        w3,  [x0, #1 * FDEC_STRIDE - 1]
    ldrb        w4,  [x0, #2 * FDEC_STRIDE - 1]
    ldrb        w5,  [x0, #3 * FDEC_STRIDE - 1]
    mov         x1,  #FDEC_STRIDE
    add         w2,  w2,  w3
    add         w3,  w4,  w5
    ldrb        w6,  [x0, #4 * FDEC_STRIDE - 1]
    ldrb        w7,  [x0, #5 * FDEC_STRIDE - 1]
    ldrb        w8,  [x0, #6 * FDEC_STRIDE - 1]
    ldrb        w9,  [x0, #7 * FDEC_STRIDE - 1]
    add         w6,  w6,  w7
    add         w7,  w8,  w9
    add         w2,  w2,  w3                // rows 0-3 sum
    add         w6,  w6,  w7                // rows 4-7 sum
    dup         v0.8h,  w2
    dup         v1.8h,  w6
    rshrn       v0.8b,  v0.8h,  #2
    rshrn       v1.8b,  v1.8h,  #2
    b           pred8x8c_dc_end
endfunc

// 8x8 chroma DC: four quadrant DCs
//   dc00 = (s0+s2+4)>>3   dc01 = (s1+2)>>2
//   dc10 = (s2+2)>>2      dc11 = (s1+s3+4)>>3
// with s0/s1 = left/right top sums, s2/s3 = upper/lower left sums.
function predict_8x8c_dc_neon, export=1
    mov         x1,  #FDEC_STRIDE
    sub         x2,  x0,  #FDEC_STRIDE
    ldurb       w10, [x0, #0 * FDEC_STRIDE - 1]
    ldrb        w11, [x0, #1 * FDEC_STRIDE - 1]
    ldrb        w12, [x0, #2 * FDEC_STRIDE - 1]
    ldrb        w13, [x0, #3 * FDEC_STRIDE - 1]
    add         w10, w10, w11
    ldrb        w4,  [x0, #4 * FDEC_STRIDE - 1]
    ldrb        w5,  [x0, #5 * FDEC_STRIDE - 1]
    add         w12, w12, w13
    ldrb        w6,  [x0, #6 * FDEC_STRIDE - 1]
    ldrb        w7,  [x0, #7 * FDEC_STRIDE - 1]
    add         w4,  w4,  w5
    add         w6,  w6,  w7
    add         w10, w10, w12, lsl #16      // pack s2 | s2' into one register
    add         w4,  w4,  w6,  lsl #16
    ld1        {v0.8b},  [x2]
    add         x10, x10, x4,  lsl #32
    uaddlp      v0.4h,  v0.8b               // s0, s1
    mov         v1.d[0],  x10               // s2, s3
    add         v3.4h,  v0.4h,  v1.4h
    addp        v0.4h,  v0.4h,  v1.4h       // s0, s1, s2, s3
    addp        v1.4h,  v3.4h,  v3.4h       // s0+s2, s1+s3, s0+s2, s1+s3
    uzp2        v0.4h,  v0.4h,  v0.4h       // s1, s3, s1, s3
    uzp1        v1.2d,  v1.2d,  v1.2d
    uzp1        v0.2d,  v0.2d,  v0.2d
    rshrn       v3.8b,  v1.8h,  #3
    rshrn       v2.8b,  v0.8h,  #2
    uzp1        v0.8b,  v3.8b,  v2.8b       // top-half row pattern
    uzp2        v1.8b,  v2.8b,  v3.8b       // bottom-half row pattern
pred8x8c_dc_end:
    add         x2,  x0,  #2 * FDEC_STRIDE
    add         x4,  x0,  #4 * FDEC_STRIDE
    add         x5,  x0,  #6 * FDEC_STRIDE
    st1        {v0.8b},  [x0], x1
    st1        {v0.8b},  [x2], x1
    st1        {v0.8b},  [x0]
    st1        {v0.8b},  [x2]
    st1        {v1.8b},  [x4], x1
    st1        {v1.8b},  [x5], x1
    st1        {v1.8b},  [x4]
    st1        {v1.8b},  [x5]
    ret
endfunc

// 8x8 chroma horizontal: splat each row's left neighbour.
function predict_8x8c_h_neon, export=1
    sub         x1,  x0,  #1
    mov         x7,  #FDEC_STRIDE
.rept 4
    ld1r       {v0.8b},  [x1], x7
    ld1r       {v1.8b},  [x1], x7
    st1        {v0.8b},  [x0], x7
    st1        {v1.8b},  [x0], x7
.endr
    ret
endfunc

// 8x8 chroma vertical: copy the 8 bytes above into all 8 rows.
function predict_8x8c_v_aarch64, export=1
    ldur        x1,  [x0, #-FDEC_STRIDE]
.irp c, 0,1,2,3,4,5,6,7
    str         x1,  [x0, #\c * FDEC_STRIDE]
.endr
    ret
endfunc

// 8x8 chroma plane prediction: pix(x,y) = clip((a + b*(x-3) + c*(y-3) + 16)>>5)
// with H/V gradients computed from the weighted top/left edges.
function predict_8x8c_p_neon, export=1
    sub         x3,  x0,  #FDEC_STRIDE
    mov         x1,  #FDEC_STRIDE
    add         x2,  x3,  #4
    sub         x3,  x3,  #1
    ld1        {v0.s}[0],  [x3]
    ld1        {v2.s}[0],  [x2], x1
    ldcol.8     v0,  x3,  x1,  4,  hi=1
    add         x3,  x3,  x1
    ldcol.8     v3,  x3,  x1,  4
    movrel      x4,  p8weight
    movrel      x5,  p16weight
    uaddl       v4.8h,  v2.8b,  v3.8b
    rev32       v0.8b,  v0.8b
    trn1        v2.2s,  v2.2s,  v3.2s
    ld1        {v7.8h},  [x4]
    usubl       v2.8h,  v2.8b,  v0.8b
    mul         v2.8h,  v2.8h,  v7.8h       // weighted edge differences
    ld1        {v0.8h},  [x5]
    saddlp      v2.4s,  v2.8h
    addp        v2.4s,  v2.4s,  v2.4s
    shl         v3.2s,  v2.2s,  #4
    add         v2.2s,  v2.2s,  v3.2s       // 17 * {H, V}
    rshrn       v5.4h,  v2.4s,  #5          // b, c, x, x
    addp        v2.4h,  v5.4h,  v5.4h
    shl         v3.4h,  v2.4h,  #2
    sub         v3.4h,  v3.4h,  v2.4h       // 3 * (b + c)
    rev64       v4.4h,  v4.4h
    add         v4.4h,  v4.4h,  v0.4h
    shl         v2.4h,  v4.4h,  #4          // a
    sub         v2.4h,  v2.4h,  v3.4h       // a - 3 * (b + c) + 16
    ext         v0.16b, v0.16b, v0.16b, #14
    sub         v6.4h,  v5.4h,  v3.4h       // (unused; kept as upstream)
    mov         v0.h[0],  wzr
    mul         v0.8h,  v0.8h,  v5.h[0]     // 0,1,2,3,4,5,6,7 * b
    dup         v1.8h,  v2.h[0]             // pix
    dup         v2.8h,  v5.h[1]             // c
    add         v1.8h,  v1.8h,  v0.8h       // pix + x*b
    mov         x3,  #8
1:
    subs        x3,  x3,  #1
    sqshrun     v0.8b,  v1.8h,  #5          // clip to [0,255]
    add         v1.8h,  v1.8h,  v2.8h       // advance one row: += c
    st1        {v0.8b},  [x0], x1
    b.ne        1b
    ret
endfunc


// Sum 4 consecutive left-edge pixels starting at row \idx into \wd.
// \t1-\t3 are scratch.  idx==0 needs ldurb for the negative byte offset.
.macro loadsum4  wd, t1, t2, t3, x, idx
    .if \idx == 0
    ldurb       \wd,  [\x, #(\idx + 0) * FDEC_STRIDE - 1]
    .else
    ldrb        \wd,  [\x, #(\idx + 0) * FDEC_STRIDE - 1]
    .endif
    ldrb        \t1,  [\x, #(\idx + 1) * FDEC_STRIDE - 1]
    ldrb        \t2,  [\x, #(\idx + 2) * FDEC_STRIDE - 1]
    ldrb        \t3,  [\x, #(\idx + 3) * FDEC_STRIDE - 1]
    add         \wd,  \wd,  \t1
    add         \t1,  \t2,  \t3
    add         \wd,  \wd,  \t1
.endm

// 8x16 chroma horizontal: splat each row's left neighbour (two rows per
// pointer, interleaved for throughput).
function predict_8x16c_h_neon, export=1
    sub         x2,  x0,  #1
    add         x3,  x0,  #FDEC_STRIDE - 1
    mov         x7,  #2 * FDEC_STRIDE
    add         x1,  x0,  #FDEC_STRIDE
.rept 4
    ld1r       {v0.8b},  [x2], x7
    ld1r       {v1.8b},  [x3], x7
    ld1r       {v2.8b},  [x2], x7
    ld1r       {v3.8b},  [x3], x7
    st1        {v0.8b},  [x0], x7
    st1        {v1.8b},  [x1], x7
    st1        {v2.8b},  [x0], x7
    st1        {v3.8b},  [x1], x7
.endr
    ret
endfunc

// 8x16 chroma vertical: copy the 8 bytes above into all 16 rows.
function predict_8x16c_v_neon, export=1
    sub         x1,  x0,  #FDEC_STRIDE
    mov         x2,  #2 * FDEC_STRIDE
    ld1        {v0.8b},  [x1], x2
.rept 8
    st1        {v0.8b},  [x0], x2
    st1        {v0.8b},  [x1], x2
.endr
    ret
endfunc

// 8x16 chroma plane prediction (4:2:2): b from the 8 top pixels,
// c from the 8 left pixels scaled for 16 rows.
function predict_8x16c_p_neon, export=1
    movrel      x4,  p16weight
    ld1        {v17.8h}, [x4]
    sub         x3,  x0,  #FDEC_STRIDE
    mov         x1,  #FDEC_STRIDE
    add         x2,  x3,  #4
    sub         x3,  x3,  #1

    ld1        {v0.8b},  [x3]
    ld1        {v2.8b},  [x2], x1
    ldcol.8     v1,  x3,  x1
    add         x3,  x3,  x1
    ldcol.8     v3,  x3,  x1
    ext         v4.8b,  v2.8b,  v2.8b,  #3
    ext         v5.8b,  v3.8b,  v3.8b,  #7
    rev32       v0.8b,  v0.8b
    rev64       v1.8b,  v1.8b

    uaddl       v4.8h,  v5.8b,  v4.8b       // a * 1/16

    usubl       v2.8h,  v2.8b,  v0.8b
    mul         v2.8h,  v2.8h,  v17.8h
    saddlp      v2.4s,  v2.8h
    addp        v2.4s,  v2.4s,  v2.4s       // H

    usubl       v3.8h,  v3.8b,  v1.8b
    mul         v3.8h,  v3.8h,  v17.8h
    saddlp      v3.4s,  v3.8h
    addp        v3.4s,  v3.4s,  v3.4s
    addp        v3.4s,  v3.4s,  v3.4s       // V

    ext         v17.16b, v17.16b, v17.16b, #14

    shl         v4.4h,  v4.4h,  #4          // a
    shl         v6.2s,  v2.2s,  #4          // 16 * H
    shl         v7.2s,  v3.2s,  #2          // 4 * V
    add         v2.2s,  v2.2s,  v6.2s       // 17 * H
    add         v3.2s,  v3.2s,  v7.2s       // 5 * V
    rshrn       v2.4h,  v2.4s,  #5          // b
    rshrn       v3.4h,  v3.4s,  #6          // c

    mov         v17.h[0],  wzr

    sub         v4.4h,  v4.4h,  v2.4h       // a - b
    shl         v6.4h,  v2.4h,  #1          // 2 * b
    add         v4.4h,  v4.4h,  v3.4h       // a - b + c
    shl         v7.4h,  v3.4h,  #3          // 8 * c
    sub         v4.4h,  v4.4h,  v6.4h       // a - 3b + c
    sub         v4.4h,  v4.4h,  v7.4h       // a - 3b - 7c

    mul         v0.8h,  v17.8h, v2.h[0]     // 0,1,2,3,4,5,6,7 * b
    dup         v1.8h,  v4.h[0]             // i00
    dup         v2.8h,  v3.h[0]             // c
    add         v1.8h,  v1.8h,  v0.8h       // pix + {0..7}*b
    mov         x3,  #16
1:
    subs        x3,  x3,  #2                // two rows per iteration
    sqrshrun    v4.8b,  v1.8h,  #5
    add         v1.8h,  v1.8h,  v2.8h
    sqrshrun    v5.8b,  v1.8h,  #5
    st1        {v4.8b},  [x0], x1
    add         v1.8h,  v1.8h,  v2.8h
    st1        {v5.8b},  [x0], x1
    b.ne        1b
    ret
endfunc

// 8x16 chroma DC: eight 4x4 sub-block DCs from top sums s0/s1 and left
// sums s2..s5; right-column blocks below the top combine s1 with the
// corresponding left sum.
function predict_8x16c_dc_neon, export=1
    mov         x1,  #FDEC_STRIDE
    sub         x10, x0,  #FDEC_STRIDE
    loadsum4    w2,  w3,  w4,  w5,  x0,  0
    ld1        {v6.8b},  [x10]
    loadsum4    w6,  w7,  w8,  w9,  x0,  4
    uaddlp      v6.4h,  v6.8b
    dup         v22.8h, w2                  // s2
    dup         v23.8h, w6                  // s3
    loadsum4    w2,  w3,  w4,  w5,  x0,  8
    addp        v6.4h,  v6.4h,  v6.4h       // s0, s1
    loadsum4    w6,  w7,  w8,  w9,  x0,  12
    dup         v20.8h, v6.h[0]             // s0
    dup         v21.8h, v6.h[1]             // s1
    dup         v24.8h, w2                  // s4
    dup         v25.8h, w6                  // s5

    ext         v16.16b, v20.16b, v21.16b, #8
    ext         v17.16b, v22.16b, v21.16b, #8
    ext         v1.16b,  v23.16b, v21.16b, #8
    ext         v2.16b,  v24.16b, v21.16b, #8
    ext         v3.16b,  v25.16b, v21.16b, #8

    add         v0.8h,  v16.8h, v17.8h      // {s0+s2, 2*s1}
    add         v1.8h,  v1.8h,  v23.8h      // {2*s3,  s1+s3}
    add         v2.8h,  v2.8h,  v24.8h      // {2*s4,  s1+s4}
    add         v3.8h,  v3.8h,  v25.8h      // {2*s5,  s1+s5}

    rshrn       v0.8b,  v0.8h,  #3
    rshrn       v1.8b,  v1.8h,  #3
    rshrn       v2.8b,  v2.8h,  #3
    rshrn       v3.8b,  v3.8h,  #3

    add         x11, x0,  #4 * FDEC_STRIDE
    add         x12, x0,  #8 * FDEC_STRIDE
    add         x13, x0,  #12 * FDEC_STRIDE
.rept 4
    st1        {v0.8b},  [x0],  x1
    st1        {v1.8b},  [x11], x1
    st1        {v2.8b},  [x12], x1
    st1        {v3.8b},  [x13], x1
.endr
    ret
endfunc

// 8x16 chroma DC (left only): per-4-row DC splatted across each row group.
function predict_8x16c_dc_left_neon, export=1
    mov         x1,  #FDEC_STRIDE
    ldurb       w2,  [x0, # 0 * FDEC_STRIDE - 1]
    ldrb        w3,  [x0, # 1 * FDEC_STRIDE - 1]
    ldrb        w4,  [x0, # 2 * FDEC_STRIDE - 1]
    ldrb        w5,  [x0, # 3 * FDEC_STRIDE - 1]
    add         w2,  w2,  w3

    ldrb        w6,  [x0, # 4 * FDEC_STRIDE - 1]
    add         w4,  w4,  w5
    ldrb        w7,  [x0, # 5 * FDEC_STRIDE - 1]
    add         w2,  w2,  w4
    ldrb        w8,  [x0, # 6 * FDEC_STRIDE - 1]
    ldrb        w9,  [x0, # 7 * FDEC_STRIDE - 1]
    dup         v0.8h,  w2
    add         w6,  w6,  w7
    rshrn       v0.8b,  v0.8h,  #2
    add         w8,  w8,  w9

    ldrb        w10, [x0, # 8 * FDEC_STRIDE - 1]
    ldrb        w11, [x0, # 9 * FDEC_STRIDE - 1]
    add         w6,  w6,  w8
    ldrb        w12, [x0, #10 * FDEC_STRIDE - 1]
    ldrb        w13, [x0, #11 * FDEC_STRIDE - 1]
    dup         v1.8h,  w6
    add         w10, w10, w11
    rshrn       v1.8b,  v1.8h,  #2
    add         w12, w12, w13

    ldrb        w2,  [x0, #12 * FDEC_STRIDE - 1]
    ldrb        w3,  [x0, #13 * FDEC_STRIDE - 1]
    add         w10, w10, w12
    ldrb        w4,  [x0, #14 * FDEC_STRIDE - 1]
    ldrb        w5,  [x0, #15 * FDEC_STRIDE - 1]
    dup         v2.8h,  w10
    add         w2,  w2,  w3
    rshrn       v2.8b,  v2.8h,  #2
    add         w4,  w4,  w5
    st1        {v0.8b},  [x0], x1           // rows 0-3 while v3 finishes
    st1        {v0.8b},  [x0], x1
    add         w2,  w2,  w4
    st1        {v0.8b},  [x0], x1
    dup         v3.8h,  w2
    st1        {v0.8b},  [x0], x1
    rshrn       v3.8b,  v3.8h,  #2

.irp idx, 1, 2, 3
.rept 4
    st1        {v\idx\().8b},  [x0], x1
.endr
.endr
    ret
endfunc

// 8x16 chroma DC (top only): per-4-column DC of the top row, all 16 rows.
function predict_8x16c_dc_top_neon, export=1
    sub         x2,  x0,  #FDEC_STRIDE
    mov         x1,  #FDEC_STRIDE
    ld1        {v0.8b},  [x2]
    uaddlp      v0.4h,  v0.8b
    addp        v0.4h,  v0.4h,  v0.4h
    rshrn       v4.8b,  v0.8h,  #2
    dup         v0.8b,  v4.b[0]
    dup         v1.8b,  v4.b[1]
    ext         v0.8b,  v0.8b,  v1.8b,  #4  // left-half dc | right-half dc
.rept 16
    st1        {v0.8b},  [x0], x1
.endr
    ret
endfunc


// 16x16 DC (top only): (sum(16 top) + 8) >> 4.
function predict_16x16_dc_top_neon, export=1
    sub         x2,  x0,  #FDEC_STRIDE
    mov         x1,  #FDEC_STRIDE
    ld1        {v0.16b}, [x2]
    uaddlv      h0,  v0.16b
    rshrn       v0.8b,  v0.8h,  #4
    dup         v0.16b, v0.b[0]
    b           pred16x16_dc_end
endfunc

// 16x16 DC (left only): (sum(16 left) + 8) >> 4.
function predict_16x16_dc_left_neon, export=1
    sub         x2,  x0,  #1
    mov         x1,  #FDEC_STRIDE
    ldcol.16    v0,  x2,  x1
    uaddlv      h0,  v0.16b
    rshrn       v0.8b,  v0.8h,  #4
    dup         v0.16b, v0.b[0]
    b           pred16x16_dc_end
endfunc

// 16x16 DC: (sum(16 top) + sum(16 left) + 16) >> 5.
function predict_16x16_dc_neon, export=1
    sub         x3,  x0,  #FDEC_STRIDE
    sub         x2,  x0,  #1
    mov         x1,  #FDEC_STRIDE
    ld1        {v0.16b}, [x3]
    ldcol.16    v1,  x2,  x1
    uaddlv      h0,  v0.16b
    uaddlv      h1,  v1.16b
    add         v0.4h,  v0.4h,  v1.4h
    rshrn       v0.8b,  v0.8h,  #5
    dup         v0.16b, v0.b[0]
pred16x16_dc_end:
.rept 16
    st1        {v0.16b}, [x0], x1
.endr
    ret
endfunc

// 16x16 horizontal: splat each row's left neighbour.
function predict_16x16_h_neon, export=1
    sub         x1,  x0,  #1
    mov         x7,  #FDEC_STRIDE
.rept 8
    ld1r       {v0.16b}, [x1], x7
    ld1r       {v1.16b}, [x1], x7
    st1        {v0.16b}, [x0], x7
    st1        {v1.16b}, [x0], x7
.endr
    ret
endfunc

// 16x16 vertical: copy the 16 bytes above into all 16 rows.
function predict_16x16_v_neon, export=1
    sub         x0,  x0,  #FDEC_STRIDE
    mov         x7,  #FDEC_STRIDE
    ld1        {v0.16b}, [x0], x7
.rept 16
    st1        {v0.16b}, [x0], x7
.endr
    ret
endfunc

// 16x16 plane prediction: pix(x,y) = clip((a + b*(x-7) + c*(y-7) + 16)>>5);
// the row is split into x{0-7} (v1) and x{8-15} (v3) halves.
function predict_16x16_p_neon, export=1
    sub         x3,  x0,  #FDEC_STRIDE
    mov         x1,  #FDEC_STRIDE
    add         x2,  x3,  #8
    sub         x3,  x3,  #1
    ld1        {v0.8b},  [x3]
    ld1        {v2.8b},  [x2], x1
    ldcol.8     v1,  x3,  x1
    add         x3,  x3,  x1
    ldcol.8     v3,  x3,  x1
    rev64       v0.8b,  v0.8b
    rev64       v1.8b,  v1.8b
    movrel      x4,  p16weight
    uaddl       v4.8h,  v2.8b,  v3.8b
    ld1        {v7.8h},  [x4]
    usubl       v2.8h,  v2.8b,  v0.8b
    usubl       v3.8h,  v3.8b,  v1.8b
    mul         v2.8h,  v2.8h,  v7.8h
    mul         v3.8h,  v3.8h,  v7.8h
    saddlp      v2.4s,  v2.8h
    saddlp      v3.4s,  v3.8h
    addp        v2.4s,  v2.4s,  v3.4s
    addp        v2.4s,  v2.4s,  v2.4s
    shl         v3.2s,  v2.2s,  #2
    add         v2.2s,  v2.2s,  v3.2s       // 5 * {H, V}
    rshrn       v5.4h,  v2.4s,  #6          // b, c, x, x
    addp        v2.4h,  v5.4h,  v5.4h
    shl         v3.4h,  v2.4h,  #3
    sub         v3.4h,  v3.4h,  v2.4h       // 7 * (b + c)
    ext         v4.16b, v4.16b, v4.16b, #14
    add         v4.4h,  v4.4h,  v7.4h
    shl         v2.4h,  v4.4h,  #4          // a
    sub         v2.4h,  v2.4h,  v3.4h       // a - 7 * (b + c) + 16
    ext         v7.16b, v7.16b, v7.16b, #14
    mov         v7.h[0],  wzr
    dup         v3.8h,  v5.h[0]
    mul         v0.8h,  v7.8h,  v5.h[0]     // 0,1,2,3,4,5,6,7 * b
    dup         v1.8h,  v2.h[0]             // pix
    dup         v2.8h,  v5.h[1]             // c
    shl         v3.8h,  v3.8h,  #3
    add         v1.8h,  v1.8h,  v0.8h       // pix + x*b
    add         v3.8h,  v3.8h,  v1.8h       // pix + x{8-15}*b
    mov         x3,  #16
1:
    subs        x3,  x3,  #1
    sqshrun     v0.8b,  v1.8h,  #5
    add         v1.8h,  v1.8h,  v2.8h
    sqshrun2    v0.16b, v3.8h,  #5
    add         v3.8h,  v3.8h,  v2.8h
    st1        {v0.16b}, [x0], x1
    b.ne        1b
    ret
endfunc