/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "src/arm/asm.S"
#include "util.S"

// Register use shared by the four predictors below, following their C
// prototypes: x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height.
//
// Each function dispatches on the width through a table of 16-bit offsets.
// clz(width) - 25 is 0 for width 64 and 4 for width 4, and every table
// entry holds (table address - target address), so target = table - entry.
//
// Before the branch x6 is set to dst + stride and x1 is doubled, so x0
// walks rows 0, 2, ... and x6 walks rows 1, 3, ...  Every store loop
// writes four rows per iteration and subtracts 4 from the height in w4.

// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height,
//                              const int bitdepth_max);
//
// Fills the block with the constant (bitdepth_max + 1) >> 1.
function ipred_dc_128_16bpc_neon, export=1
        ldr             w8,  [sp]               // bitdepth_max: ninth argument, read from the stack
        clz             w3,  w3
        adr             x5,  L(ipred_dc_128_tbl)
        sub             w3,  w3,  #25           // table index from the width
        ldrh            w3,  [x5, w3, uxtw #1]
        dup             v0.8h,   w8
        sub             x5,  x5,  w3, uxtw      // target = table - entry
        add             x6,  x0,  x1            // x6 = second row
        lsl             x1,  x1,  #1            // step two rows per store
        urshr           v0.8h,   v0.8h,   #1    // (bitdepth_max + 1) >> 1
        br              x5
4:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            4b
        ret
8:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b
16:
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b
        mov             v2.16b,  v0.16b
        mov             v3.16b,  v0.16b
32:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b
        mov             v2.16b,  v0.16b
        mov             v3.16b,  v0.16b
        // A 64 pixel row is written as two 64 byte stores; the first one
        // already advances the pointer by 64, so take that off the step.
        sub             x1,  x1,  #64
64:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            64b
        ret

// Width dispatch offsets, ordered 64, 32, 16, 8, 4.
L(ipred_dc_128_tbl):
        .hword L(ipred_dc_128_tbl) - 640b
        .hword L(ipred_dc_128_tbl) - 320b
        .hword L(ipred_dc_128_tbl) - 160b
        .hword L(ipred_dc_128_tbl) -   8b
        .hword L(ipred_dc_128_tbl) -   4b
endfunc

// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height);
//
// Copies the `width` pixels that follow topleft into every row of the block.
function ipred_v_16bpc_neon, export=1
        clz             w3,  w3
        adr             x5,  L(ipred_v_tbl)
        sub             w3,  w3,  #25           // table index from the width
        ldrh            w3,  [x5, w3, uxtw #1]
        add             x2,  x2,  #2            // one 16-bit pixel past topleft
        sub             x5,  x5,  w3, uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]
4:
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
8:
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h}, [x2]
16:
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
32:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        sub             x1,  x1,  #64           // first half-row store advances by 64 bytes
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
64:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
        b.gt            64b
        ret

// Width dispatch offsets, ordered 64, 32, 16, 8, 4.
L(ipred_v_tbl):
        .hword L(ipred_v_tbl) - 640b
        .hword L(ipred_v_tbl) - 320b
        .hword L(ipred_v_tbl) - 160b
        .hword L(ipred_v_tbl) -  80b
        .hword L(ipred_v_tbl) -  40b
endfunc

// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height);
//
// Fills each row with one pixel read from below topleft in memory: row 0
// uses topleft[-1], row 1 uses topleft[-2], and so on.
function ipred_h_16bpc_neon, export=1
        clz             w3,  w3
        adr             x5,  L(ipred_h_tbl)
        sub             w3,  w3,  #25           // table index from the width
        ldrh            w3,  [x5, w3, uxtw #1]
        sub             x2,  x2,  #8            // x2 = &topleft[-4]
        sub             x5,  x5,  w3, uxtw
        mov             x7,  #-8                // step back four pixels per iteration
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
4:
        AARCH64_VALID_JUMP_TARGET
        // ld4r broadcasts four consecutive pixels; v3 holds the one at the
        // highest address and therefore belongs to the first of the four rows.
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
        st1             {v3.4h},  [x0], x1
        st1             {v2.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v1.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            4b
        ret
8:
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
        st1             {v3.8h},  [x0], x1
        st1             {v2.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v1.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            8b
        ret
16:
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
        // Write the upper part of each row with plain offset stores first,
        // then the first 16 bytes with the post-incrementing st1.
        str             q3,  [x0, #16]
        str             q2,  [x6, #16]
        st1             {v3.8h},  [x0], x1
        st1             {v2.8h},  [x6], x1
        subs            w4,  w4,  #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        st1             {v1.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            16b
        ret
32:
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
        str             q3,  [x0, #16]
        str             q2,  [x6, #16]
        stp             q3,  q3,  [x0, #32]
        stp             q2,  q2,  [x6, #32]
        st1             {v3.8h},  [x0], x1
        st1             {v2.8h},  [x6], x1
        subs            w4,  w4,  #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        stp             q1,  q1,  [x0, #32]
        stp             q0,  q0,  [x6, #32]
        st1             {v1.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            32b
        ret
64:
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
        str             q3,  [x0, #16]
        str             q2,  [x6, #16]
        stp             q3,  q3,  [x0, #32]
        stp             q2,  q2,  [x6, #32]
        stp             q3,  q3,  [x0, #64]
        stp             q2,  q2,  [x6, #64]
        stp             q3,  q3,  [x0, #96]
        stp             q2,  q2,  [x6, #96]
        st1             {v3.8h},  [x0], x1
        st1             {v2.8h},  [x6], x1
        subs            w4,  w4,  #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        stp             q1,  q1,  [x0, #32]
        stp             q0,  q0,  [x6, #32]
        stp             q1,  q1,  [x0, #64]
        stp             q0,  q0,  [x6, #64]
        stp             q1,  q1,  [x0, #96]
        stp             q0,  q0,  [x6, #96]
        st1             {v1.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            64b
        ret

// Width dispatch offsets, ordered 64, 32, 16, 8, 4.
L(ipred_h_tbl):
        .hword L(ipred_h_tbl) - 64b
        .hword L(ipred_h_tbl) - 32b
        .hword L(ipred_h_tbl) - 16b
        .hword L(ipred_h_tbl) -  8b
        .hword L(ipred_h_tbl) -  4b
endfunc

// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height);
//
// Fills the block with the rounded average of the `width` pixels that
// follow topleft: (sum + width/2) >> log2(width).
function ipred_dc_top_16bpc_neon, export=1
        clz             w3,  w3
        adr             x5,  L(ipred_dc_top_tbl)
        sub             w3,  w3,  #25           // table index from the width
        ldrh            w3,  [x5, w3, uxtw #1]
        add             x2,  x2,  #2            // one 16-bit pixel past topleft
        sub             x5,  x5,  w3, uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]
        addv            h0,      v0.4h          // sum of the 4 pixels
        urshr           v0.4h,   v0.4h,   #2    // rounded average
        dup             v0.4h,   v0.h[0]
4:
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #3
        dup             v0.8h,   v0.h[0]
8:
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h}, [x2]
        addp            v0.8h,   v0.8h,   v1.8h // pairwise sums, still 16 bit
        addv            h0,      v0.8h
        urshr           v2.4h,   v0.4h,   #4
        dup             v0.8h,   v2.h[0]
        dup             v1.8h,   v2.h[0]
16:
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v0.8h,   v0.8h,   v2.8h
        uaddlv          s0,      v0.8h          // final reduction widened to 32 bit
        rshrn           v4.4h,   v0.4s,   #5    // rounded average, narrowed to 16 bit
        dup             v0.8h,   v4.h[0]
        dup             v1.8h,   v4.h[0]
        dup             v2.8h,   v4.h[0]
        dup             v3.8h,   v4.h[0]
32:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        addp            v0.8h,   v0.8h,   v1.8h
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        addp            v0.8h,   v0.8h,   v2.8h
        addp            v4.8h,   v4.8h,   v6.8h
        addp            v0.8h,   v0.8h,   v4.8h
        uaddlv          s0,      v0.8h
        rshrn           v4.4h,   v0.4s,   #6
        sub             x1,  x1,  #64           // first half-row store advances by 64 bytes
        dup             v0.8h,   v4.h[0]
        dup             v1.8h,   v4.h[0]
        dup             v2.8h,   v4.h[0]
        dup             v3.8h,   v4.h[0]
64:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            64b
        ret

// Width dispatch offsets, ordered 64, 32, 16, 8, 4.
L(ipred_dc_top_tbl):
        .hword L(ipred_dc_top_tbl) - 640b
        .hword L(ipred_dc_top_tbl) - 320b
        .hword L(ipred_dc_top_tbl) - 160b
        .hword L(ipred_dc_top_tbl) -  80b
        .hword L(ipred_dc_top_tbl) -  40b
endfunc

// void
ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, 421// const pixel *const topleft, 422// const int width, const int height, const int a, 423// const int max_width, const int max_height); 424function ipred_dc_left_16bpc_neon, export=1 425 sub x2, x2, w4, uxtw #1 426 clz w3, w3 427 clz w7, w4 428 adr x5, L(ipred_dc_left_tbl) 429 sub w3, w3, #20 // 25 leading bits, minus table offset 5 430 sub w7, w7, #25 431 ldrh w3, [x5, w3, uxtw #1] 432 ldrh w7, [x5, w7, uxtw #1] 433 sub x3, x5, w3, uxtw 434 sub x5, x5, w7, uxtw 435 add x6, x0, x1 436 lsl x1, x1, #1 437 br x5 438 439L(ipred_dc_left_h4): 440 AARCH64_VALID_JUMP_TARGET 441 ld1 {v0.4h}, [x2] 442 addv h0, v0.4h 443 urshr v0.4h, v0.4h, #2 444 dup v0.8h, v0.h[0] 445 br x3 446L(ipred_dc_left_w4): 447 AARCH64_VALID_JUMP_TARGET 448 st1 {v0.4h}, [x0], x1 449 st1 {v0.4h}, [x6], x1 450 subs w4, w4, #4 451 st1 {v0.4h}, [x0], x1 452 st1 {v0.4h}, [x6], x1 453 b.gt L(ipred_dc_left_w4) 454 ret 455 456L(ipred_dc_left_h8): 457 AARCH64_VALID_JUMP_TARGET 458 ld1 {v0.8h}, [x2] 459 addv h0, v0.8h 460 urshr v0.4h, v0.4h, #3 461 dup v0.8h, v0.h[0] 462 br x3 463L(ipred_dc_left_w8): 464 AARCH64_VALID_JUMP_TARGET 465 st1 {v0.8h}, [x0], x1 466 st1 {v0.8h}, [x6], x1 467 subs w4, w4, #4 468 st1 {v0.8h}, [x0], x1 469 st1 {v0.8h}, [x6], x1 470 b.gt L(ipred_dc_left_w8) 471 ret 472 473L(ipred_dc_left_h16): 474 AARCH64_VALID_JUMP_TARGET 475 ld1 {v0.8h, v1.8h}, [x2] 476 addp v0.8h, v0.8h, v1.8h 477 addv h0, v0.8h 478 urshr v2.4h, v0.4h, #4 479 dup v0.8h, v2.h[0] 480 dup v1.8h, v2.h[0] 481 br x3 482L(ipred_dc_left_w16): 483 AARCH64_VALID_JUMP_TARGET 484 mov v1.16b, v0.16b 4851: 486 st1 {v0.8h, v1.8h}, [x0], x1 487 st1 {v0.8h, v1.8h}, [x6], x1 488 subs w4, w4, #4 489 st1 {v0.8h, v1.8h}, [x0], x1 490 st1 {v0.8h, v1.8h}, [x6], x1 491 b.gt 1b 492 ret 493 494L(ipred_dc_left_h32): 495 AARCH64_VALID_JUMP_TARGET 496 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] 497 addp v0.8h, v0.8h, v1.8h 498 addp v2.8h, v2.8h, v3.8h 499 addp v0.8h, v0.8h, v2.8h 500 
uaddlp v0.4s, v0.8h 501 addv s0, v0.4s 502 rshrn v4.4h, v0.4s, #5 503 dup v0.8h, v4.h[0] 504 br x3 505L(ipred_dc_left_w32): 506 AARCH64_VALID_JUMP_TARGET 507 mov v1.16b, v0.16b 508 mov v2.16b, v0.16b 509 mov v3.16b, v0.16b 5101: 511 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 512 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 513 subs w4, w4, #4 514 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 515 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 516 b.gt 1b 517 ret 518 519L(ipred_dc_left_h64): 520 AARCH64_VALID_JUMP_TARGET 521 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 522 addp v0.8h, v0.8h, v1.8h 523 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] 524 addp v2.8h, v2.8h, v3.8h 525 addp v4.8h, v4.8h, v5.8h 526 addp v6.8h, v6.8h, v7.8h 527 addp v0.8h, v0.8h, v2.8h 528 addp v4.8h, v4.8h, v6.8h 529 addp v0.8h, v0.8h, v4.8h 530 uaddlv s0, v0.8h 531 rshrn v4.4h, v0.4s, #6 532 dup v0.8h, v4.h[0] 533 br x3 534L(ipred_dc_left_w64): 535 AARCH64_VALID_JUMP_TARGET 536 mov v1.16b, v0.16b 537 mov v2.16b, v0.16b 538 mov v3.16b, v0.16b 539 sub x1, x1, #64 5401: 541 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 542 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 543 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 544 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 545 subs w4, w4, #4 546 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 547 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 548 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 549 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 550 b.gt 1b 551 ret 552 553L(ipred_dc_left_tbl): 554 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64) 555 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32) 556 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16) 557 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8) 558 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4) 559 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64) 560 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32) 561 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16) 562 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8) 563 .hword 
L(ipred_dc_left_tbl) - L(ipred_dc_left_w4) 564endfunc 565 566// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride, 567// const pixel *const topleft, 568// const int width, const int height, const int a, 569// const int max_width, const int max_height); 570function ipred_dc_16bpc_neon, export=1 571 sub x2, x2, w4, uxtw #1 572 add w7, w3, w4 // width + height 573 clz w3, w3 574 clz w6, w4 575 dup v16.4s, w7 // width + height 576 adr x5, L(ipred_dc_tbl) 577 rbit w7, w7 // rbit(width + height) 578 sub w3, w3, #20 // 25 leading bits, minus table offset 5 579 sub w6, w6, #25 580 clz w7, w7 // ctz(width + height) 581 ldrh w3, [x5, w3, uxtw #1] 582 ldrh w6, [x5, w6, uxtw #1] 583 neg w7, w7 // -ctz(width + height) 584 sub x3, x5, w3, uxtw 585 sub x5, x5, w6, uxtw 586 ushr v16.4s, v16.4s, #1 // (width + height) >> 1 587 dup v17.4s, w7 // -ctz(width + height) 588 add x6, x0, x1 589 lsl x1, x1, #1 590 br x5 591 592L(ipred_dc_h4): 593 AARCH64_VALID_JUMP_TARGET 594 ld1 {v0.4h}, [x2], #8 595 uaddlv s0, v0.4h 596 add x2, x2, #2 597 br x3 598L(ipred_dc_w4): 599 AARCH64_VALID_JUMP_TARGET 600 ld1 {v1.4h}, [x2] 601 add v0.2s, v0.2s, v16.2s 602 uaddlv s1, v1.4h 603 cmp w4, #4 604 add v0.2s, v0.2s, v1.2s 605 ushl v0.2s, v0.2s, v17.2s 606 b.eq 1f 607 // h = 8/16 608 cmp w4, #16 609 mov w16, #0x6667 610 mov w17, #0xAAAB 611 csel w16, w16, w17, eq 612 dup v16.2s, w16 613 mul v0.2s, v0.2s, v16.2s 614 ushr v0.2s, v0.2s, #17 6151: 616 dup v0.4h, v0.h[0] 6172: 618 st1 {v0.4h}, [x0], x1 619 st1 {v0.4h}, [x6], x1 620 subs w4, w4, #4 621 st1 {v0.4h}, [x0], x1 622 st1 {v0.4h}, [x6], x1 623 b.gt 2b 624 ret 625 626L(ipred_dc_h8): 627 AARCH64_VALID_JUMP_TARGET 628 ld1 {v0.8h}, [x2], #16 629 uaddlv s0, v0.8h 630 add x2, x2, #2 631 br x3 632L(ipred_dc_w8): 633 AARCH64_VALID_JUMP_TARGET 634 ld1 {v1.8h}, [x2] 635 add v0.2s, v0.2s, v16.2s 636 uaddlv s1, v1.8h 637 cmp w4, #8 638 add v0.2s, v0.2s, v1.2s 639 ushl v0.2s, v0.2s, v17.2s 640 b.eq 1f 641 // h = 4/16/32 642 cmp w4, #32 643 mov w16, 
#0x6667 644 mov w17, #0xAAAB 645 csel w16, w16, w17, eq 646 dup v16.2s, w16 647 mul v0.2s, v0.2s, v16.2s 648 ushr v0.2s, v0.2s, #17 6491: 650 dup v0.8h, v0.h[0] 6512: 652 st1 {v0.8h}, [x0], x1 653 st1 {v0.8h}, [x6], x1 654 subs w4, w4, #4 655 st1 {v0.8h}, [x0], x1 656 st1 {v0.8h}, [x6], x1 657 b.gt 2b 658 ret 659 660L(ipred_dc_h16): 661 AARCH64_VALID_JUMP_TARGET 662 ld1 {v0.8h, v1.8h}, [x2], #32 663 addp v0.8h, v0.8h, v1.8h 664 add x2, x2, #2 665 uaddlv s0, v0.8h 666 br x3 667L(ipred_dc_w16): 668 AARCH64_VALID_JUMP_TARGET 669 ld1 {v1.8h, v2.8h}, [x2] 670 add v0.2s, v0.2s, v16.2s 671 addp v1.8h, v1.8h, v2.8h 672 uaddlv s1, v1.8h 673 cmp w4, #16 674 add v0.2s, v0.2s, v1.2s 675 ushl v4.2s, v0.2s, v17.2s 676 b.eq 1f 677 // h = 4/8/32/64 678 tst w4, #(32+16+8) // 16 added to make a consecutive bitmask 679 mov w16, #0x6667 680 mov w17, #0xAAAB 681 csel w16, w16, w17, eq 682 dup v16.2s, w16 683 mul v4.2s, v4.2s, v16.2s 684 ushr v4.2s, v4.2s, #17 6851: 686 dup v0.8h, v4.h[0] 687 dup v1.8h, v4.h[0] 6882: 689 st1 {v0.8h, v1.8h}, [x0], x1 690 st1 {v0.8h, v1.8h}, [x6], x1 691 subs w4, w4, #4 692 st1 {v0.8h, v1.8h}, [x0], x1 693 st1 {v0.8h, v1.8h}, [x6], x1 694 b.gt 2b 695 ret 696 697L(ipred_dc_h32): 698 AARCH64_VALID_JUMP_TARGET 699 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 700 addp v0.8h, v0.8h, v1.8h 701 addp v2.8h, v2.8h, v3.8h 702 addp v0.8h, v0.8h, v2.8h 703 add x2, x2, #2 704 uaddlv s0, v0.8h 705 br x3 706L(ipred_dc_w32): 707 AARCH64_VALID_JUMP_TARGET 708 ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2] 709 add v0.2s, v0.2s, v16.2s 710 addp v1.8h, v1.8h, v2.8h 711 addp v3.8h, v3.8h, v4.8h 712 addp v1.8h, v1.8h, v3.8h 713 uaddlv s1, v1.8h 714 cmp w4, #32 715 add v0.2s, v0.2s, v1.2s 716 ushl v4.2s, v0.2s, v17.2s 717 b.eq 1f 718 // h = 8/16/64 719 cmp w4, #8 720 mov w16, #0x6667 721 mov w17, #0xAAAB 722 csel w16, w16, w17, eq 723 dup v16.2s, w16 724 mul v4.2s, v4.2s, v16.2s 725 ushr v4.2s, v4.2s, #17 7261: 727 dup v0.8h, v4.h[0] 728 dup v1.8h, v4.h[0] 729 dup v2.8h, v4.h[0] 730 dup 
v3.8h, v4.h[0] 7312: 732 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 733 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 734 subs w4, w4, #4 735 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 736 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 737 b.gt 2b 738 ret 739 740L(ipred_dc_h64): 741 AARCH64_VALID_JUMP_TARGET 742 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 743 addp v0.8h, v0.8h, v1.8h 744 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 745 addp v2.8h, v2.8h, v3.8h 746 addp v4.8h, v4.8h, v5.8h 747 addp v6.8h, v6.8h, v7.8h 748 addp v0.8h, v0.8h, v2.8h 749 addp v4.8h, v4.8h, v6.8h 750 addp v0.8h, v0.8h, v4.8h 751 add x2, x2, #2 752 uaddlv s0, v0.8h 753 br x3 754L(ipred_dc_w64): 755 AARCH64_VALID_JUMP_TARGET 756 ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64 757 add v0.2s, v0.2s, v16.2s 758 addp v1.8h, v1.8h, v2.8h 759 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2] 760 addp v3.8h, v3.8h, v4.8h 761 addp v20.8h, v20.8h, v21.8h 762 addp v22.8h, v22.8h, v23.8h 763 addp v1.8h, v1.8h, v3.8h 764 addp v20.8h, v20.8h, v22.8h 765 addp v1.8h, v1.8h, v20.8h 766 uaddlv s1, v1.8h 767 cmp w4, #64 768 add v0.2s, v0.2s, v1.2s 769 ushl v4.2s, v0.2s, v17.2s 770 b.eq 1f 771 // h = 16/32 772 cmp w4, #16 773 mov w16, #0x6667 774 mov w17, #0xAAAB 775 csel w16, w16, w17, eq 776 dup v16.2s, w16 777 mul v4.2s, v4.2s, v16.2s 778 ushr v4.2s, v4.2s, #17 7791: 780 sub x1, x1, #64 781 dup v0.8h, v4.h[0] 782 dup v1.8h, v4.h[0] 783 dup v2.8h, v4.h[0] 784 dup v3.8h, v4.h[0] 7852: 786 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 787 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 788 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 789 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 790 subs w4, w4, #4 791 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 792 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 793 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 794 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 795 b.gt 2b 796 ret 797 798L(ipred_dc_tbl): 799 .hword L(ipred_dc_tbl) - L(ipred_dc_h64) 800 .hword L(ipred_dc_tbl) - L(ipred_dc_h32) 801 .hword 
L(ipred_dc_tbl) - L(ipred_dc_h16) 802 .hword L(ipred_dc_tbl) - L(ipred_dc_h8) 803 .hword L(ipred_dc_tbl) - L(ipred_dc_h4) 804 .hword L(ipred_dc_tbl) - L(ipred_dc_w64) 805 .hword L(ipred_dc_tbl) - L(ipred_dc_w32) 806 .hword L(ipred_dc_tbl) - L(ipred_dc_w16) 807 .hword L(ipred_dc_tbl) - L(ipred_dc_w8) 808 .hword L(ipred_dc_tbl) - L(ipred_dc_w4) 809endfunc 810 811// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride, 812// const pixel *const topleft, 813// const int width, const int height, const int a, 814// const int max_width, const int max_height); 815function ipred_paeth_16bpc_neon, export=1 816 clz w9, w3 817 adr x5, L(ipred_paeth_tbl) 818 sub w9, w9, #25 819 ldrh w9, [x5, w9, uxtw #1] 820 ld1r {v4.8h}, [x2] 821 add x8, x2, #2 822 sub x2, x2, #8 823 sub x5, x5, w9, uxtw 824 mov x7, #-8 825 add x6, x0, x1 826 lsl x1, x1, #1 827 br x5 82840: 829 AARCH64_VALID_JUMP_TARGET 830 ld1r {v5.2d}, [x8] 831 sub v6.8h, v5.8h, v4.8h // top - topleft 8324: 833 ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 834 zip1 v0.2d, v0.2d, v1.2d 835 zip1 v2.2d, v2.2d, v3.2d 836 add v16.8h, v6.8h, v0.8h // base 837 add v17.8h, v6.8h, v2.8h 838 sabd v20.8h, v5.8h, v16.8h // tdiff 839 sabd v21.8h, v5.8h, v17.8h 840 sabd v22.8h, v4.8h, v16.8h // tldiff 841 sabd v23.8h, v4.8h, v17.8h 842 sabd v16.8h, v0.8h, v16.8h // ldiff 843 sabd v17.8h, v2.8h, v17.8h 844 umin v18.8h, v20.8h, v22.8h // min(tdiff, tldiff) 845 umin v19.8h, v21.8h, v23.8h 846 cmge v20.8h, v22.8h, v20.8h // tldiff >= tdiff 847 cmge v21.8h, v23.8h, v21.8h 848 cmge v16.8h, v18.8h, v16.8h // min(tdiff, tldiff) >= ldiff 849 cmge v17.8h, v19.8h, v17.8h 850 bsl v21.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft 851 bsl v20.16b, v5.16b, v4.16b 852 bit v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ... 
853 bit v20.16b, v0.16b, v16.16b 854 st1 {v21.d}[1], [x0], x1 855 st1 {v21.d}[0], [x6], x1 856 subs w4, w4, #4 857 st1 {v20.d}[1], [x0], x1 858 st1 {v20.d}[0], [x6], x1 859 b.gt 4b 860 ret 86180: 862160: 863320: 864640: 865 AARCH64_VALID_JUMP_TARGET 866 ld1 {v5.8h}, [x8], #16 867 mov w9, w3 868 // Set up pointers for four rows in parallel; x0, x6, x5, x10 869 add x5, x0, x1 870 add x10, x6, x1 871 lsl x1, x1, #1 872 sub x1, x1, w3, uxtw #1 8731: 874 ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 8752: 876 sub v6.8h, v5.8h, v4.8h // top - topleft 877 add v16.8h, v6.8h, v0.8h // base 878 add v17.8h, v6.8h, v1.8h 879 add v18.8h, v6.8h, v2.8h 880 add v19.8h, v6.8h, v3.8h 881 sabd v20.8h, v5.8h, v16.8h // tdiff 882 sabd v21.8h, v5.8h, v17.8h 883 sabd v22.8h, v5.8h, v18.8h 884 sabd v23.8h, v5.8h, v19.8h 885 sabd v24.8h, v4.8h, v16.8h // tldiff 886 sabd v25.8h, v4.8h, v17.8h 887 sabd v26.8h, v4.8h, v18.8h 888 sabd v27.8h, v4.8h, v19.8h 889 sabd v16.8h, v0.8h, v16.8h // ldiff 890 sabd v17.8h, v1.8h, v17.8h 891 sabd v18.8h, v2.8h, v18.8h 892 sabd v19.8h, v3.8h, v19.8h 893 umin v28.8h, v20.8h, v24.8h // min(tdiff, tldiff) 894 umin v29.8h, v21.8h, v25.8h 895 umin v30.8h, v22.8h, v26.8h 896 umin v31.8h, v23.8h, v27.8h 897 cmge v20.8h, v24.8h, v20.8h // tldiff >= tdiff 898 cmge v21.8h, v25.8h, v21.8h 899 cmge v22.8h, v26.8h, v22.8h 900 cmge v23.8h, v27.8h, v23.8h 901 cmge v16.8h, v28.8h, v16.8h // min(tdiff, tldiff) >= ldiff 902 cmge v17.8h, v29.8h, v17.8h 903 cmge v18.8h, v30.8h, v18.8h 904 cmge v19.8h, v31.8h, v19.8h 905 bsl v23.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft 906 bsl v22.16b, v5.16b, v4.16b 907 bsl v21.16b, v5.16b, v4.16b 908 bsl v20.16b, v5.16b, v4.16b 909 bit v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ... 
910 bit v22.16b, v2.16b, v18.16b 911 bit v21.16b, v1.16b, v17.16b 912 bit v20.16b, v0.16b, v16.16b 913 st1 {v23.8h}, [x0], #16 914 st1 {v22.8h}, [x6], #16 915 subs w3, w3, #8 916 st1 {v21.8h}, [x5], #16 917 st1 {v20.8h}, [x10], #16 918 b.le 8f 919 ld1 {v5.8h}, [x8], #16 920 b 2b 9218: 922 subs w4, w4, #4 923 b.le 9f 924 // End of horizontal loop, move pointers to next four rows 925 sub x8, x8, w9, uxtw #1 926 add x0, x0, x1 927 add x6, x6, x1 928 // Load the top row as early as possible 929 ld1 {v5.8h}, [x8], #16 930 add x5, x5, x1 931 add x10, x10, x1 932 mov w3, w9 933 b 1b 9349: 935 ret 936 937L(ipred_paeth_tbl): 938 .hword L(ipred_paeth_tbl) - 640b 939 .hword L(ipred_paeth_tbl) - 320b 940 .hword L(ipred_paeth_tbl) - 160b 941 .hword L(ipred_paeth_tbl) - 80b 942 .hword L(ipred_paeth_tbl) - 40b 943endfunc 944 945// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride, 946// const pixel *const topleft, 947// const int width, const int height, const int a, 948// const int max_width, const int max_height); 949function ipred_smooth_16bpc_neon, export=1 950 movrel x10, X(sm_weights) 951 add x11, x10, w4, uxtw 952 add x10, x10, w3, uxtw 953 clz w9, w3 954 adr x5, L(ipred_smooth_tbl) 955 sub x12, x2, w4, uxtw #1 956 sub w9, w9, #25 957 ldrh w9, [x5, w9, uxtw #1] 958 ld1r {v4.8h}, [x12] // bottom 959 add x8, x2, #2 960 sub x5, x5, w9, uxtw 961 add x6, x0, x1 962 lsl x1, x1, #1 963 br x5 96440: 965 AARCH64_VALID_JUMP_TARGET 966 ld1r {v6.2d}, [x8] // top 967 ld1r {v7.2s}, [x10] // weights_hor 968 sub x2, x2, #8 969 mov x7, #-8 970 dup v5.8h, v6.h[3] // right 971 sub v6.8h, v6.8h, v4.8h // top-bottom 972 uxtl v7.8h, v7.8b // weights_hor 973 add v31.4h, v4.4h, v5.4h // bottom+right 9744: 975 ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left 976 ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver 977 ushll v20.4s, v31.4h, #8 // (bottom+right)*256 978 ushll v21.4s, v31.4h, #8 979 ushll v22.4s, v31.4h, #8 980 ushll v23.4s, v31.4h, #8 981 zip1 v1.2d, 
v1.2d, v0.2d // left, flipped 982 zip1 v0.2d, v3.2d, v2.2d 983 zip1 v16.2s, v16.2s, v17.2s // weights_ver 984 zip1 v18.2s, v18.2s, v19.2s 985 sub v0.8h, v0.8h, v5.8h // left-right 986 sub v1.8h, v1.8h, v5.8h 987 uxtl v16.8h, v16.8b // weights_ver 988 uxtl v18.8h, v18.8b 989 smlal v20.4s, v0.4h, v7.4h // += (left-right)*weights_hor 990 smlal2 v21.4s, v0.8h, v7.8h 991 smlal v22.4s, v1.4h, v7.4h 992 smlal2 v23.4s, v1.8h, v7.8h 993 smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver 994 smlal2 v21.4s, v6.8h, v16.8h 995 smlal v22.4s, v6.4h, v18.4h 996 smlal2 v23.4s, v6.8h, v18.8h 997 rshrn v20.4h, v20.4s, #9 998 rshrn v21.4h, v21.4s, #9 999 rshrn v22.4h, v22.4s, #9 1000 rshrn v23.4h, v23.4s, #9 1001 st1 {v20.4h}, [x0], x1 1002 st1 {v21.4h}, [x6], x1 1003 subs w4, w4, #4 1004 st1 {v22.4h}, [x0], x1 1005 st1 {v23.4h}, [x6], x1 1006 b.gt 4b 1007 ret 100880: 1009 AARCH64_VALID_JUMP_TARGET 1010 ld1 {v6.8h}, [x8] // top 1011 ld1 {v7.8b}, [x10] // weights_hor 1012 sub x2, x2, #8 1013 mov x7, #-8 1014 dup v5.8h, v6.h[7] // right 1015 sub v6.8h, v6.8h, v4.8h // top-bottom 1016 uxtl v7.8h, v7.8b // weights_hor 1017 add v31.4h, v4.4h, v5.4h // bottom+right 10188: 1019 ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left 1020 ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver 1021 ushll v20.4s, v31.4h, #8 // (bottom+right)*256 1022 ushll v21.4s, v31.4h, #8 1023 ushll v22.4s, v31.4h, #8 1024 ushll v23.4s, v31.4h, #8 1025 ushll v24.4s, v31.4h, #8 1026 ushll v25.4s, v31.4h, #8 1027 ushll v26.4s, v31.4h, #8 1028 ushll v27.4s, v31.4h, #8 1029 sub v0.8h, v0.8h, v5.8h // left-right 1030 sub v1.8h, v1.8h, v5.8h 1031 sub v2.8h, v2.8h, v5.8h 1032 sub v3.8h, v3.8h, v5.8h 1033 uxtl v16.8h, v16.8b // weights_ver 1034 uxtl v17.8h, v17.8b 1035 uxtl v18.8h, v18.8b 1036 uxtl v19.8h, v19.8b 1037 smlal v20.4s, v3.4h, v7.4h // += (left-right)*weights_hor 1038 smlal2 v21.4s, v3.8h, v7.8h // (left flipped) 1039 smlal v22.4s, v2.4h, v7.4h 1040 smlal2 v23.4s, v2.8h, v7.8h 1041 smlal 
v24.4s, v1.4h, v7.4h 1042 smlal2 v25.4s, v1.8h, v7.8h 1043 smlal v26.4s, v0.4h, v7.4h 1044 smlal2 v27.4s, v0.8h, v7.8h 1045 smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver 1046 smlal2 v21.4s, v6.8h, v16.8h 1047 smlal v22.4s, v6.4h, v17.4h 1048 smlal2 v23.4s, v6.8h, v17.8h 1049 smlal v24.4s, v6.4h, v18.4h 1050 smlal2 v25.4s, v6.8h, v18.8h 1051 smlal v26.4s, v6.4h, v19.4h 1052 smlal2 v27.4s, v6.8h, v19.8h 1053 rshrn v20.4h, v20.4s, #9 1054 rshrn2 v20.8h, v21.4s, #9 1055 rshrn v21.4h, v22.4s, #9 1056 rshrn2 v21.8h, v23.4s, #9 1057 rshrn v22.4h, v24.4s, #9 1058 rshrn2 v22.8h, v25.4s, #9 1059 rshrn v23.4h, v26.4s, #9 1060 rshrn2 v23.8h, v27.4s, #9 1061 st1 {v20.8h}, [x0], x1 1062 st1 {v21.8h}, [x6], x1 1063 subs w4, w4, #4 1064 st1 {v22.8h}, [x0], x1 1065 st1 {v23.8h}, [x6], x1 1066 b.gt 8b 1067 ret 1068160: 1069320: 1070640: 1071 AARCH64_VALID_JUMP_TARGET 1072 add x12, x2, w3, uxtw #1 1073 sub x1, x1, w3, uxtw #1 1074 ld1r {v5.8h}, [x12] // right 1075 sub x2, x2, #4 1076 mov x7, #-4 1077 mov w9, w3 1078 add v31.4h, v4.4h, v5.4h // bottom+right 1079 10801: 1081 ld2r {v0.8h, v1.8h}, [x2], x7 // left 1082 ld2r {v16.8b, v17.8b}, [x11], #2 // weights_ver 1083 sub v0.8h, v0.8h, v5.8h // left-right 1084 sub v1.8h, v1.8h, v5.8h 1085 uxtl v16.8h, v16.8b // weights_ver 1086 uxtl v17.8h, v17.8b 10872: 1088 ld1 {v7.16b}, [x10], #16 // weights_hor 1089 ld1 {v2.8h, v3.8h}, [x8], #32 // top 1090 ushll v20.4s, v31.4h, #8 // (bottom+right)*256 1091 ushll v21.4s, v31.4h, #8 1092 ushll v22.4s, v31.4h, #8 1093 ushll v23.4s, v31.4h, #8 1094 ushll v24.4s, v31.4h, #8 1095 ushll v25.4s, v31.4h, #8 1096 ushll v26.4s, v31.4h, #8 1097 ushll v27.4s, v31.4h, #8 1098 uxtl v6.8h, v7.8b // weights_hor 1099 uxtl2 v7.8h, v7.16b 1100 sub v2.8h, v2.8h, v4.8h // top-bottom 1101 sub v3.8h, v3.8h, v4.8h 1102 smlal v20.4s, v1.4h, v6.4h // += (left-right)*weights_hor 1103 smlal2 v21.4s, v1.8h, v6.8h // (left flipped) 1104 smlal v22.4s, v1.4h, v7.4h 1105 smlal2 v23.4s, v1.8h, v7.8h 1106 smlal 
v24.4s, v0.4h, v6.4h
        smlal2          v25.4s,  v0.8h,   v6.8h
        smlal           v26.4s,  v0.4h,   v7.4h
        smlal2          v27.4s,  v0.8h,   v7.8h
        smlal           v20.4s,  v2.4h,   v16.4h  // += (top-bottom)*weights_ver
        smlal2          v21.4s,  v2.8h,   v16.8h
        smlal           v22.4s,  v3.4h,   v16.4h
        smlal2          v23.4s,  v3.8h,   v16.8h
        smlal           v24.4s,  v2.4h,   v17.4h
        smlal2          v25.4s,  v2.8h,   v17.8h
        smlal           v26.4s,  v3.4h,   v17.4h
        smlal2          v27.4s,  v3.8h,   v17.8h
        rshrn           v20.4h,  v20.4s,  #9      // rounding narrow: (sum + 256) >> 9
        rshrn2          v20.8h,  v21.4s,  #9
        rshrn           v21.4h,  v22.4s,  #9
        rshrn2          v21.8h,  v23.4s,  #9
        rshrn           v22.4h,  v24.4s,  #9
        rshrn2          v22.8h,  v25.4s,  #9
        rshrn           v23.4h,  v26.4s,  #9
        rshrn2          v23.8h,  v27.4s,  #9
        subs            w3,  w3,  #16             // 16 more columns of this row pair done
        st1             {v20.8h, v21.8h}, [x0], #32
        st1             {v22.8h, v23.8h}, [x6], #32
        b.gt            2b
        subs            w4,  w4,  #2              // two rows (x0 and x6) completed
        b.le            9f
        sub             x8,  x8,  w9, uxtw #1     // rewind the top pointer by the saved width (2 bytes/pixel)
        sub             x10, x10, w9, uxtw        // rewind the weights_hor pointer by the saved width (1 byte/weight)
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9                   // restore the column counter
        b               1b
9:
        ret

// Jump table: halfword offsets, each stored as (table address - target label).
L(ipred_smooth_tbl):
        .hword L(ipred_smooth_tbl) - 640b
        .hword L(ipred_smooth_tbl) - 320b
        .hword L(ipred_smooth_tbl) - 160b
        .hword L(ipred_smooth_tbl) -  80b
        .hword L(ipred_smooth_tbl) -  40b
endfunc

// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const topleft,
//                                const int width, const int height, const int a,
//                                const int max_width, const int max_height);
//
// Register use, as set up below:
//   x0 = dst, x6 = dst + stride, x1 = 2*stride (stride doubled after x6 is formed)
//   x2 = topleft + 2 bytes (the pixels loaded as "top")
//   x7 = sm_weights + height (the bytes read here are used as weights_ver)
//   v4 = the pixel at topleft - 2*height bytes, broadcast ("bottom")
//   w3 = width, w4 = height (used as the row counter)
// The width-specific loop is selected through L(ipred_smooth_v_tbl), indexed
// by clz(width) - 25 (width 64 -> entry 0, width 4 -> entry 4).
function ipred_smooth_v_16bpc_neon, export=1
        movrel          x7,  X(sm_weights)
        add             x7,  x7,  w4, uxtw
        clz             w9,  w3
        adr             x5,  L(ipred_smooth_v_tbl)
        sub             x8,  x2,  w4, uxtw #1
        sub             w9,  w9,  #25
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v4.8h},  [x8] // bottom
        add             x2,  x2,  #2
        sub             x5,  x5,  w9, uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v6.2d}, [x2]             // top
        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
        // Four rows per iteration; each vector carries two 4-pixel rows.
4:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
        zip1            v18.2s,  v18.2s,  v19.2s
        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
        ushll           v18.8h,  v18.8b,  #7
        sqrdmulh        v20.8h,  v6.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
        sqrdmulh        v21.8h,  v6.8h,   v18.8h
        add             v20.8h,  v20.8h,  v4.8h   // + bottom
        add             v21.8h,  v21.8h,  v4.8h
        st1             {v20.d}[0], [x0], x1
        st1             {v20.d}[1], [x6], x1
        subs            w4,  w4,  #4
        st1             {v21.d}[0], [x0], x1
        st1             {v21.d}[1], [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v6.8h}, [x2]             // top
        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
        // Four rows per iteration, one 8-pixel vector per row.
8:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
        ushll           v17.8h,  v17.8b,  #7
        ushll           v18.8h,  v18.8b,  #7
        ushll           v19.8h,  v19.8b,  #7
        sqrdmulh        v20.8h,  v6.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
        sqrdmulh        v21.8h,  v6.8h,   v17.8h
        sqrdmulh        v22.8h,  v6.8h,   v18.8h
        sqrdmulh        v23.8h,  v6.8h,   v19.8h
        add             v20.8h,  v20.8h,  v4.8h   // + bottom
        add             v21.8h,  v21.8h,  v4.8h
        add             v22.8h,  v22.8h,  v4.8h
        add             v23.8h,  v23.8h,  v4.8h
        st1             {v20.8h}, [x0], x1
        st1             {v21.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v22.8h}, [x0], x1
        st1             {v23.8h}, [x6], x1
        b.gt            8b
        ret
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
        // Set up pointers for four rows in parallel; x0, x6, x5, x8
        add             x5,  x0,  x1
        add             x8,  x6,  x1
        lsl             x1,  x1,  #1              // x1 = 4*stride
        sub             x1,  x1,  w3, uxtw #1     // minus the 2*width bytes each row pointer advances per pass
        mov             w9,  w3                   // keep width for resetting the column loop

        // Outer loop (1:): four weights for four rows.
        // Inner loop (2:): 16 columns of those four rows per iteration.
1:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
        ushll           v17.8h,  v17.8b,  #7
        ushll           v18.8h,  v18.8b,  #7
        ushll           v19.8h,  v19.8b,  #7
2:
        ld1             {v2.8h, v3.8h}, [x2], #32 // top
        sub             v2.8h,   v2.8h,   v4.8h   // top-bottom
        sub             v3.8h,   v3.8h,   v4.8h
        sqrdmulh        v20.8h,  v2.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
        sqrdmulh        v21.8h,  v3.8h,   v16.8h
        sqrdmulh        v22.8h,  v2.8h,   v17.8h
        sqrdmulh        v23.8h,  v3.8h,   v17.8h
        sqrdmulh        v24.8h,  v2.8h,   v18.8h
        sqrdmulh        v25.8h,  v3.8h,   v18.8h
        sqrdmulh        v26.8h,  v2.8h,   v19.8h
        sqrdmulh        v27.8h,  v3.8h,   v19.8h
        add             v20.8h,  v20.8h,  v4.8h   // + bottom
        add             v21.8h,  v21.8h,  v4.8h
        add             v22.8h,  v22.8h,  v4.8h
        add             v23.8h,  v23.8h,  v4.8h
        add             v24.8h,  v24.8h,  v4.8h
        add             v25.8h,  v25.8h,  v4.8h
        add             v26.8h,  v26.8h,  v4.8h
        add             v27.8h,  v27.8h,  v4.8h
        subs            w3,  w3,  #16
        st1             {v20.8h, v21.8h}, [x0], #32
        st1             {v22.8h, v23.8h}, [x6], #32
        st1             {v24.8h, v25.8h}, [x5], #32
        st1             {v26.8h, v27.8h}, [x8], #32
        b.gt            2b
        subs            w4,  w4,  #4
        b.le            9f
        sub             x2,  x2,  w9, uxtw #1     // rewind the top pointer to column 0
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x5,  x5,  x1
        add             x8,  x8,  x1
        mov             w3,  w9
        b               1b
9:
        ret

// Jump table: halfword offsets, each stored as (table address - target label).
L(ipred_smooth_v_tbl):
        .hword L(ipred_smooth_v_tbl) - 640b
        .hword L(ipred_smooth_v_tbl) - 320b
        .hword L(ipred_smooth_v_tbl) - 160b
        .hword L(ipred_smooth_v_tbl) -  80b
        .hword L(ipred_smooth_v_tbl) -  40b
endfunc

// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const topleft,
//                                const int width, const int height, const int a,
//                                const int max_width, const int max_height);
//
// Register use, as set up below:
//   x0 = dst, x6 = dst + stride, x1 = 2*stride
//   x8 = sm_weights + width (the bytes read here are used as weights_hor)
//   v5 = the pixel at topleft + 2*width bytes, broadcast ("right")
//   x2 = pointer into the pixels below topleft in memory ("left"); it is read
//        with a negative post-increment (x7), four pixels at a time
//   w3 = width, w4 = height (used as the row counter)
// The width-specific loop is selected through L(ipred_smooth_h_tbl), indexed
// by clz(width) - 25.
function ipred_smooth_h_16bpc_neon, export=1
        movrel          x8,  X(sm_weights)
        add             x8,  x8,  w3, uxtw
        clz             w9,  w3
        adr             x5,  L(ipred_smooth_h_tbl)
        add             x12, x2,  w3, uxtw #1
        sub             w9,  w9,  #25
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v5.8h},  [x12] // right
        sub             x5,  x5,  w9, uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v7.2s}, [x8]             // weights_hor
        sub             x2,  x2,  #8
        mov             x7,  #-8
        ushll           v7.8h,   v7.8b,   #7      // weights_hor << 7
        // Four rows per iteration; each vector carries two 4-pixel rows.
4:
        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // left
        zip1            v1.2d,   v1.2d,   v0.2d   // left, flipped
        zip1            v0.2d,   v3.2d,   v2.2d
        sub             v0.8h,   v0.8h,   v5.8h   // left-right
        sub             v1.8h,   v1.8h,   v5.8h
        sqrdmulh        v20.8h,  v0.8h,   v7.8h   // ((left-right)*weights_hor + 128) >> 8
        sqrdmulh        v21.8h,  v1.8h,   v7.8h
        add             v20.8h,  v20.8h,  v5.8h   // + right
        add             v21.8h,  v21.8h,  v5.8h
        st1             {v20.d}[0], [x0], x1
        st1             {v20.d}[1], [x6], x1
        subs            w4,  w4,  #4
        st1             {v21.d}[0], [x0], x1
        st1             {v21.d}[1], [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v7.8b}, [x8]             // weights_hor
        sub             x2,  x2,  #8
        mov             x7,  #-8
        ushll           v7.8h,   v7.8b,   #7      // weights_hor << 7
        // Four rows per iteration; v3 (highest address) is used for the first row.
8:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7 // left
        sub             v3.8h,   v3.8h,   v5.8h   // left-right
        sub             v2.8h,   v2.8h,   v5.8h
        sub             v1.8h,   v1.8h,   v5.8h
        sub             v0.8h,   v0.8h,   v5.8h
        sqrdmulh        v20.8h,  v3.8h,   v7.8h   // ((left-right)*weights_hor + 128) >> 8
        sqrdmulh        v21.8h,  v2.8h,   v7.8h   // (left flipped)
        sqrdmulh        v22.8h,  v1.8h,   v7.8h
        sqrdmulh        v23.8h,  v0.8h,   v7.8h
        add             v20.8h,  v20.8h,  v5.8h   // + right
        add             v21.8h,  v21.8h,  v5.8h
        add             v22.8h,  v22.8h,  v5.8h
        add             v23.8h,  v23.8h,  v5.8h
        st1             {v20.8h}, [x0], x1
        st1             {v21.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v22.8h}, [x0], x1
        st1             {v23.8h}, [x6], x1
        b.gt            8b
        ret
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
        sub             x2,  x2,  #8
        mov             x7,  #-8
        // Set up pointers for four rows in parallel; x0, x6, x5, x10
        add             x5,  x0,  x1
        add             x10, x6,  x1
        lsl             x1,  x1,  #1              // x1 = 4*stride
        sub             x1,  x1,  w3, uxtw #1     // minus the 2*width bytes each row pointer advances per pass
        mov             w9,  w3                   // keep width for resetting the column loop

        // Outer loop (1:): four left pixels for four rows.
        // Inner loop (2:): 16 columns of those four rows per iteration.
1:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},   [x2], x7 // left
        sub             v0.8h,   v0.8h,   v5.8h   // left-right
        sub             v1.8h,   v1.8h,   v5.8h
        sub             v2.8h,   v2.8h,   v5.8h
        sub             v3.8h,   v3.8h,   v5.8h
2:
        ld1             {v7.16b}, [x8],   #16     // weights_hor
        ushll           v6.8h,   v7.8b,   #7      // weights_hor << 7
        ushll2          v7.8h,   v7.16b,  #7
        sqrdmulh        v20.8h,  v3.8h,   v6.8h   // ((left-right)*weights_hor + 128) >> 8
        sqrdmulh        v21.8h,  v3.8h,   v7.8h   // (left flipped)
        sqrdmulh        v22.8h,  v2.8h,   v6.8h
        sqrdmulh        v23.8h,  v2.8h,   v7.8h
        sqrdmulh        v24.8h,  v1.8h,   v6.8h
        sqrdmulh        v25.8h,  v1.8h,   v7.8h
        sqrdmulh        v26.8h,  v0.8h,   v6.8h
        sqrdmulh        v27.8h,  v0.8h,   v7.8h
        add             v20.8h,  v20.8h,  v5.8h   // + right
        add             v21.8h,  v21.8h,  v5.8h
        add             v22.8h,  v22.8h,  v5.8h
        add             v23.8h,  v23.8h,  v5.8h
        add             v24.8h,  v24.8h,  v5.8h
        add             v25.8h,  v25.8h,  v5.8h
        add             v26.8h,  v26.8h,  v5.8h
        add             v27.8h,  v27.8h,  v5.8h
        subs            w3,  w3,  #16
        st1             {v20.8h, v21.8h}, [x0],  #32
        st1             {v22.8h, v23.8h}, [x6],  #32
        st1             {v24.8h, v25.8h}, [x5],  #32
        st1             {v26.8h, v27.8h}, [x10], #32
        b.gt            2b
        subs            w4,  w4,  #4
        b.le            9f
        sub             x8,  x8,  w9, uxtw        // rewind the weights_hor pointer to column 0
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x5,  x5,  x1
        add             x10, x10, x1
        mov             w3,  w9
        b               1b
9:
        ret

// Jump table: halfword offsets, each stored as (table address - target label).
L(ipred_smooth_h_tbl):
        .hword L(ipred_smooth_h_tbl) - 640b
        .hword L(ipred_smooth_h_tbl) - 320b
        .hword L(ipred_smooth_h_tbl) - 160b
        .hword L(ipred_smooth_h_tbl) -  80b
        .hword L(ipred_smooth_h_tbl) -  40b
endfunc

// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int filt_idx,
//                              const int max_width, const int max_height,
//                              const int bitdepth_max);
//
// filter_fn emits ipred_filter_<bpc>bpc_neon (not exported). Both variants are
// entered from ipred_filter_16bpc_neon below, which loads w8 from [sp] before
// branching here; w8 is broadcast into v31 and used as the upper clamp.
//
// Setup, as done below:
//   x6      = filter_intra_taps + ((filt_idx & 511) << 6), i.e. 64 bytes per entry
//   v16-v22 = seven 8-byte tap vectors from there, sign-extended to 16 bit
//             (used as filter(0)..filter(6) in the comments below)
//   then x6 = dst + stride, x1 = 2*stride
//   v30     = 0 (lower clamp), only set up when \bpc == 10
// \bpc == 10 accumulates in 16-bit lanes (mul/mla, then srshr #4 and smax with
// v30); otherwise the sums are kept in 32-bit lanes (smull/smlal) and narrowed
// with sqrshrun #4. NOTE(review): presumably 16-bit sums are only safe for
// 10-bit pixels - confirm against the tap value range.
// Each 8-lane result holds a block of 4 pixels on two rows: the low half is
// stored to the row at x0, the high half to the row at x6.
// Width dispatch uses clz(width) - 26 (width 32 -> entry 0, width 4 -> entry 3).
.macro filter_fn bpc
function ipred_filter_\bpc\()bpc_neon
        and             w5,  w5,  #511
        movrel          x6,  X(filter_intra_taps)
        lsl             w5,  w5,  #6
        add             x6,  x6,  w5, uxtw
        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
        clz             w9,  w3
        adr             x5,  L(ipred_filter\bpc\()_tbl)
        ld1             {v20.8b, v21.8b, v22.8b}, [x6]
        sub             w9,  w9,  #26
        ldrh            w9,  [x5, w9, uxtw #1]
        sxtl            v16.8h,  v16.8b
        sxtl            v17.8h,  v17.8b
        sub             x5,  x5,  w9, uxtw
        sxtl            v18.8h,  v18.8b
        sxtl            v19.8h,  v19.8b
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        sxtl            v20.8h,  v20.8b
        sxtl            v21.8h,  v21.8b
        sxtl            v22.8h,  v22.8b
        dup             v31.8h,  w8
.if \bpc == 10
        movi            v30.8h,  #0
.endif
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ldur            d0,  [x2, #2]             // top (0-3)
        sub             x2,  x2,  #4
        mov             x7,  #-4
        // One 4-wide, 2-row block per iteration.
4:
        ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
.if \bpc == 10
        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
        srshr           v2.8h,   v2.8h,   #4
        smax            v2.8h,   v2.8h,   v30.8h
.else
        smull           v2.4s,   v17.4h,  v0.h[0] // p1(top[0]) * filter(1)
        smlal           v2.4s,   v18.4h,  v0.h[1] // p2(top[1]) * filter(2)
        smlal           v2.4s,   v19.4h,  v0.h[2] // p3(top[2]) * filter(3)
        smlal           v2.4s,   v20.4h,  v0.h[3] // p4(top[3]) * filter(4)
        smlal           v2.4s,   v16.4h,  v1.h[2] // p0(topleft) * filter(0)
        smlal           v2.4s,   v21.4h,  v1.h[1] // p5(left[0]) * filter(5)
        smlal           v2.4s,   v22.4h,  v1.h[0] // p6(left[1]) * filter(6)
        smull2          v3.4s,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
        smlal2          v3.4s,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
        smlal2          v3.4s,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
        smlal2          v3.4s,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
        smlal2          v3.4s,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
        smlal2          v3.4s,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
        smlal2          v3.4s,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
        sqrshrun        v2.4h,   v2.4s,   #4
        sqrshrun2       v2.8h,   v3.4s,   #4
.endif
        smin            v2.8h,   v2.8h,   v31.8h
        subs            w4,  w4,  #2
        st1             {v2.d}[0], [x0], x1
        ext             v0.16b,  v2.16b,  v2.16b, #8 // move top from [4-7] to [0-3]
        st1             {v2.d}[1], [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ldur            q0,  [x2, #2]             // top (0-7)
        sub             x2,  x2,  #4
        mov             x7,  #-4
        // Two 4-wide blocks per iteration; the second block (v3) takes its
        // left inputs from lanes 3 and 7 of the first block's clamped output (v2).
8:
        ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
.if \bpc == 10
        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
        mul             v3.8h,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
        mla             v3.8h,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
        mla             v3.8h,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
        srshr           v2.8h,   v2.8h,   #4
        smax            v2.8h,   v2.8h,   v30.8h
        smin            v2.8h,   v2.8h,   v31.8h
        mla             v3.8h,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
        mla             v3.8h,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
        mla             v3.8h,   v21.8h,  v2.h[3] // p5(left[0]) * filter(5)
        mla             v3.8h,   v22.8h,  v2.h[7] // p6(left[1]) * filter(6)
        srshr           v3.8h,   v3.8h,   #4
        smax            v3.8h,   v3.8h,   v30.8h
.else
        smull           v2.4s,   v17.4h,  v0.h[0] // p1(top[0]) * filter(1)
        smlal           v2.4s,   v18.4h,  v0.h[1] // p2(top[1]) * filter(2)
        smlal           v2.4s,   v19.4h,  v0.h[2] // p3(top[2]) * filter(3)
        smlal           v2.4s,   v20.4h,  v0.h[3] // p4(top[3]) * filter(4)
        smlal           v2.4s,   v16.4h,  v1.h[2] // p0(topleft) * filter(0)
        smlal           v2.4s,   v21.4h,  v1.h[1] // p5(left[0]) * filter(5)
        smlal           v2.4s,   v22.4h,  v1.h[0] // p6(left[1]) * filter(6)
        smull2          v3.4s,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
        smlal2          v3.4s,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
        smlal2          v3.4s,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
        smlal2          v3.4s,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
        smlal2          v3.4s,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
        smlal2          v3.4s,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
        smlal2          v3.4s,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
        smull           v4.4s,   v17.4h,  v0.h[4] // p1(top[0]) * filter(1)
        smlal           v4.4s,   v18.4h,  v0.h[5] // p2(top[1]) * filter(2)
        smlal           v4.4s,   v19.4h,  v0.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v2.4h,   v2.4s,   #4
        sqrshrun2       v2.8h,   v3.4s,   #4
        smin            v2.8h,   v2.8h,   v31.8h
        smlal           v4.4s,   v20.4h,  v0.h[7] // p4(top[3]) * filter(4)
        smlal           v4.4s,   v16.4h,  v0.h[3] // p0(topleft) * filter(0)
        smlal           v4.4s,   v21.4h,  v2.h[3] // p5(left[0]) * filter(5)
        smlal           v4.4s,   v22.4h,  v2.h[7] // p6(left[1]) * filter(6)
        smull2          v5.4s,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
        smlal2          v5.4s,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
        smlal2          v5.4s,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
        smlal2          v5.4s,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
        smlal2          v5.4s,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
        smlal2          v5.4s,   v21.8h,  v2.h[3] // p5(left[0]) * filter(5)
        smlal2          v5.4s,   v22.8h,  v2.h[7] // p6(left[1]) * filter(6)
        sqrshrun        v3.4h,   v4.4s,   #4
        sqrshrun2       v3.8h,   v5.4s,   #4
.endif
        smin            v3.8h,   v3.8h,   v31.8h
        subs            w4,  w4,  #2
        st2             {v2.d, v3.d}[0], [x0], x1
        zip2            v0.2d,   v2.2d,   v3.2d   // second output row becomes the next top
        st2             {v2.d, v3.d}[1], [x6], x1
        b.gt            8b
        ret
160:
320:
        AARCH64_VALID_JUMP_TARGET
        add             x8,  x2,  #2              // x8 = top pointer
        sub             x2,  x2,  #4
        mov             x7,  #-4
        sub             x1,  x1,  w3, uxtw #1     // 2*stride minus the 2*width bytes written per row
        mov             w9,  w3                   // keep width for resetting the column loop

        // Outer loop (1:): two rows. Inner loop (2:): 16 columns as four
        // chained 4-wide blocks (v3 -> v4 -> v5 -> v6), each using lanes 3
        // and 7 of the previous block's clamped output as its left inputs.
1:
        ld1             {v0.4h}, [x2], x7         // left (0-1) + topleft (2)
2:
        ld1             {v1.8h, v2.8h}, [x8], #32 // top(0-15)
.if \bpc == 10
        mul             v3.8h,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
        mla             v3.8h,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
        mla             v3.8h,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
        mla             v3.8h,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
        mla             v3.8h,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
        mla             v3.8h,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
        mla             v3.8h,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)

        mul             v4.8h,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
        mla             v4.8h,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
        mla             v4.8h,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
        srshr           v3.8h,   v3.8h,   #4
        smax            v3.8h,   v3.8h,   v30.8h
        smin            v3.8h,   v3.8h,   v31.8h
        mla             v4.8h,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
        mla             v4.8h,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
        mla             v4.8h,   v21.8h,  v3.h[3] // p5(left[0]) * filter(5)
        mla             v4.8h,   v22.8h,  v3.h[7] // p6(left[1]) * filter(6)

        mul             v5.8h,   v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
        mla             v5.8h,   v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
        mla             v5.8h,   v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
        srshr           v4.8h,   v4.8h,   #4
        smax            v4.8h,   v4.8h,   v30.8h
        smin            v4.8h,   v4.8h,   v31.8h
        mla             v5.8h,   v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
        mla             v5.8h,   v16.8h,  v1.h[7] // p0(topleft) * filter(0)
        mla             v5.8h,   v21.8h,  v4.h[3] // p5(left[0]) * filter(5)
        mla             v5.8h,   v22.8h,  v4.h[7] // p6(left[1]) * filter(6)

        mul             v6.8h,   v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
        mla             v6.8h,   v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
        mla             v6.8h,   v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
        srshr           v5.8h,   v5.8h,   #4
        smax            v5.8h,   v5.8h,   v30.8h
        smin            v5.8h,   v5.8h,   v31.8h
        mla             v6.8h,   v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
        mla             v6.8h,   v16.8h,  v2.h[3] // p0(topleft) * filter(0)
        mla             v6.8h,   v21.8h,  v5.h[3] // p5(left[0]) * filter(5)
        mla             v6.8h,   v22.8h,  v5.h[7] // p6(left[1]) * filter(6)

        subs            w3,  w3,  #16
        srshr           v6.8h,   v6.8h,   #4
        smax            v6.8h,   v6.8h,   v30.8h
.else
        smull           v3.4s,   v16.4h,  v0.h[2] // p0(topleft) * filter(0)
        smlal           v3.4s,   v21.4h,  v0.h[1] // p5(left[0]) * filter(5)
        smlal           v3.4s,   v22.4h,  v0.h[0] // p6(left[1]) * filter(6)
        smlal           v3.4s,   v17.4h,  v1.h[0] // p1(top[0]) * filter(1)
        smlal           v3.4s,   v18.4h,  v1.h[1] // p2(top[1]) * filter(2)
        smlal           v3.4s,   v19.4h,  v1.h[2] // p3(top[2]) * filter(3)
        smlal           v3.4s,   v20.4h,  v1.h[3] // p4(top[3]) * filter(4)
        smull2          v4.4s,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
        smlal2          v4.4s,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
        smlal2          v4.4s,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
        smlal2          v4.4s,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
        smlal2          v4.4s,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
        smlal2          v4.4s,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
        smlal2          v4.4s,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)

        smull           v5.4s,   v17.4h,  v1.h[4] // p1(top[0]) * filter(1)
        smlal           v5.4s,   v18.4h,  v1.h[5] // p2(top[1]) * filter(2)
        smlal           v5.4s,   v19.4h,  v1.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v3.4h,   v3.4s,   #4
        sqrshrun2       v3.8h,   v4.4s,   #4
        smin            v3.8h,   v3.8h,   v31.8h
        smlal           v5.4s,   v20.4h,  v1.h[7] // p4(top[3]) * filter(4)
        smlal           v5.4s,   v16.4h,  v1.h[3] // p0(topleft) * filter(0)
        smlal           v5.4s,   v21.4h,  v3.h[3] // p5(left[0]) * filter(5)
        smlal           v5.4s,   v22.4h,  v3.h[7] // p6(left[1]) * filter(6)
        smull2          v6.4s,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
        smlal2          v6.4s,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
        smlal2          v6.4s,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
        smlal2          v6.4s,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
        smlal2          v6.4s,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
        smlal2          v6.4s,   v21.8h,  v3.h[3] // p5(left[0]) * filter(5)
        smlal2          v6.4s,   v22.8h,  v3.h[7] // p6(left[1]) * filter(6)

        smull           v24.4s,  v17.4h,  v2.h[0] // p1(top[0]) * filter(1)
        smlal           v24.4s,  v18.4h,  v2.h[1] // p2(top[1]) * filter(2)
        smlal           v24.4s,  v19.4h,  v2.h[2] // p3(top[2]) * filter(3)
        sqrshrun        v4.4h,   v5.4s,   #4
        sqrshrun2       v4.8h,   v6.4s,   #4
        smin            v4.8h,   v4.8h,   v31.8h
        smlal           v24.4s,  v20.4h,  v2.h[3] // p4(top[3]) * filter(4)
        smlal           v24.4s,  v16.4h,  v1.h[7] // p0(topleft) * filter(0)
        smlal           v24.4s,  v21.4h,  v4.h[3] // p5(left[0]) * filter(5)
        smlal           v24.4s,  v22.4h,  v4.h[7] // p6(left[1]) * filter(6)
        smull2          v25.4s,  v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
        smlal2          v25.4s,  v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
        smlal2          v25.4s,  v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
        smlal2          v25.4s,  v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
        smlal2          v25.4s,  v16.8h,  v1.h[7] // p0(topleft) * filter(0)
        smlal2          v25.4s,  v21.8h,  v4.h[3] // p5(left[0]) * filter(5)
        smlal2          v25.4s,  v22.8h,  v4.h[7] // p6(left[1]) * filter(6)

        smull           v26.4s,  v17.4h,  v2.h[4] // p1(top[0]) * filter(1)
        smlal           v26.4s,  v18.4h,  v2.h[5] // p2(top[1]) * filter(2)
        smlal           v26.4s,  v19.4h,  v2.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v5.4h,   v24.4s,  #4
        sqrshrun2       v5.8h,   v25.4s,  #4
        smin            v5.8h,   v5.8h,   v31.8h
        smlal           v26.4s,  v20.4h,  v2.h[7] // p4(top[3]) * filter(4)
        smlal           v26.4s,  v16.4h,  v2.h[3] // p0(topleft) * filter(0)
        smlal           v26.4s,  v21.4h,  v5.h[3] // p5(left[0]) * filter(5)
        smlal           v26.4s,  v22.4h,  v5.h[7] // p6(left[1]) * filter(6)
        smull2          v27.4s,  v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
        smlal2          v27.4s,  v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
        smlal2          v27.4s,  v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
        smlal2          v27.4s,  v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
        smlal2          v27.4s,  v16.8h,  v2.h[3] // p0(topleft) * filter(0)
        smlal2          v27.4s,  v21.8h,  v5.h[3] // p5(left[0]) * filter(5)
        smlal2          v27.4s,  v22.8h,  v5.h[7] // p6(left[1]) * filter(6)

        subs            w3,  w3,  #16
        sqrshrun        v6.4h,   v26.4s,  #4
        sqrshrun2       v6.8h,   v27.4s,  #4
.endif
        smin            v6.8h,   v6.8h,   v31.8h

        // Carry state to the next 16 columns: v0.h[2] (topleft) = last top
        // pixel, v0.h[1]/v0.h[0] (left) = lanes 3 and 7 of the last block.
        // The flags from "subs w3" above are still live for b.gt below.
        ins             v0.h[2], v2.h[7]
        st4             {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32
        ins             v0.h[0], v6.h[7]
        st4             {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32
        ins             v0.h[1], v6.h[3]
        b.gt            2b
        subs            w4,  w4,  #2
        b.le            9f
        sub             x8,  x6,  w9, uxtw #1     // next top = start of the second row just written
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9
        b               1b
9:
        ret

// Jump table: halfword offsets, each stored as (table address - target label).
L(ipred_filter\bpc\()_tbl):
        .hword L(ipred_filter\bpc\()_tbl) - 320b
        .hword L(ipred_filter\bpc\()_tbl) - 160b
        .hword L(ipred_filter\bpc\()_tbl) -  80b
        .hword L(ipred_filter\bpc\()_tbl) -  40b
endfunc
.endm

filter_fn 10
filter_fn 12

// Exported entry point: loads w8 from [sp] (bitdepth_max, the ninth argument
// in the prototype above) and tail-branches to the 10 bpc variant when it is
// <= 0x3ff, otherwise to the 12 bpc variant.
function ipred_filter_16bpc_neon, export=1
        ldr             w8,  [sp]
        cmp             w8,  0x3ff
        b.le            ipred_filter_10bpc_neon
        b               ipred_filter_12bpc_neon
endfunc

// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                          const uint16_t *const pal, const uint8_t *idx,
//                          const int w, const int h);
//
// Register use, as set up below:
//   v30 = 8 halfwords loaded from pal (x2); used as a 16-byte tbl table
//   v31 = 0x0100 in every halfword
//   x3  = idx (one byte per pixel is consumed), w4 = w, w5 = h (row counter)
//   x2 is reused as the second output pointer after the palette is loaded
// Each index byte a is doubled and duplicated into both bytes of a halfword;
// adding v31 turns that into the byte pair (2*a, 2*a+1), which tbl uses to
// fetch the two bytes of pal[a].
// The width-specific loop is selected through L(pal_pred_tbl), indexed by
// clz(w) - 25.
function pal_pred_16bpc_neon, export=1
        ld1             {v30.8h}, [x2]
        clz             w9,  w4
        adr             x6,  L(pal_pred_tbl)
        sub             w9,  w9,  #25
        ldrh            w9,  [x6, w9, uxtw #1]
        movi            v31.8h,  #1, lsl #8
        sub             x6,  x6,  w9, uxtw
        br              x6
40:
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  x1
        lsl             x1,  x1,  #1
        // Four 4-pixel rows (16 indices) per iteration.
4:
        ld1             {v1.16b}, [x3], #16
        subs            w5,  w5,  #4
        // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
        add             v1.16b,  v1.16b,  v1.16b
        zip1            v0.16b,  v1.16b,  v1.16b
        zip2            v1.16b,  v1.16b,  v1.16b
        add             v0.8h,   v0.8h,   v31.8h
        add             v1.8h,   v1.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b
        st1             {v0.d}[0], [x0], x1
        tbl             v1.16b, {v30.16b}, v1.16b
        st1             {v0.d}[1], [x2], x1
        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x2], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  x1
        lsl             x1,  x1,  #1
        // Four 8-pixel rows (32 indices) per iteration.
8:
        ld1             {v2.16b, v3.16b}, [x3], #32
        subs            w5,  w5,  #4
        add             v2.16b,  v2.16b,  v2.16b
        add             v3.16b,  v3.16b,  v3.16b
        zip1            v0.16b,  v2.16b,  v2.16b
        zip2            v1.16b,  v2.16b,  v2.16b
        zip1            v2.16b,  v3.16b,  v3.16b
        zip2            v3.16b,  v3.16b,  v3.16b
        add             v0.8h,   v0.8h,   v31.8h
        add             v1.8h,   v1.8h,   v31.8h
        add             v2.8h,   v2.8h,   v31.8h
        add             v3.8h,   v3.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b
        tbl             v1.16b, {v30.16b}, v1.16b
        st1             {v0.8h}, [x0], x1
        tbl             v2.16b, {v30.16b}, v2.16b
        st1             {v1.8h}, [x2], x1
        tbl             v3.16b, {v30.16b}, v3.16b
        st1             {v2.8h}, [x0], x1
        st1             {v3.8h}, [x2], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  x1
        lsl             x1,  x1,  #1
        // Four 16-pixel rows (64 indices) per iteration.
16:
        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
        subs            w5,  w5,  #4
        add             v4.16b,  v4.16b,  v4.16b
        add             v5.16b,  v5.16b,  v5.16b
        add             v6.16b,  v6.16b,  v6.16b
        add             v7.16b,  v7.16b,  v7.16b
        zip1            v0.16b,  v4.16b,  v4.16b
        zip2            v1.16b,  v4.16b,  v4.16b
        zip1            v2.16b,  v5.16b,  v5.16b
        zip2            v3.16b,  v5.16b,  v5.16b
        zip1            v4.16b,  v6.16b,  v6.16b
        zip2            v5.16b,  v6.16b,  v6.16b
        zip1            v6.16b,  v7.16b,  v7.16b
        zip2            v7.16b,  v7.16b,  v7.16b
        add             v0.8h,   v0.8h,   v31.8h
        add             v1.8h,   v1.8h,   v31.8h
        add             v2.8h,   v2.8h,   v31.8h
        add             v3.8h,   v3.8h,   v31.8h
        add             v4.8h,   v4.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b
        add             v5.8h,   v5.8h,   v31.8h
        tbl             v1.16b, {v30.16b}, v1.16b
        add             v6.8h,   v6.8h,   v31.8h
        tbl             v2.16b, {v30.16b}, v2.16b
        add             v7.8h,   v7.8h,   v31.8h
        tbl             v3.16b, {v30.16b}, v3.16b
        tbl             v4.16b, {v30.16b}, v4.16b
        tbl             v5.16b, {v30.16b}, v5.16b
        st1             {v0.8h, v1.8h}, [x0], x1
        tbl             v6.16b, {v30.16b}, v6.16b
        st1             {v2.8h, v3.8h}, [x2], x1
        tbl             v7.16b, {v30.16b}, v7.16b
        st1             {v4.8h, v5.8h}, [x0], x1
        st1             {v6.8h, v7.8h}, [x2], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  x1
        lsl             x1,  x1,  #1
        // Two 32-pixel rows (64 indices) per iteration.
32:
        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
        subs            w5,  w5,  #2
        add             v4.16b,  v4.16b,  v4.16b
        add             v5.16b,  v5.16b,  v5.16b
        add             v6.16b,  v6.16b,  v6.16b
        add             v7.16b,  v7.16b,  v7.16b
        zip1            v0.16b,  v4.16b,  v4.16b
        zip2            v1.16b,  v4.16b,  v4.16b
        zip1            v2.16b,  v5.16b,  v5.16b
        zip2            v3.16b,  v5.16b,  v5.16b
        zip1            v4.16b,  v6.16b,  v6.16b
        zip2            v5.16b,  v6.16b,  v6.16b
        zip1            v6.16b,  v7.16b,  v7.16b
        zip2            v7.16b,  v7.16b,  v7.16b
        add             v0.8h,   v0.8h,   v31.8h
        add             v1.8h,   v1.8h,   v31.8h
        add             v2.8h,   v2.8h,   v31.8h
        add             v3.8h,   v3.8h,   v31.8h
        add             v4.8h,   v4.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b
        add             v5.8h,   v5.8h,   v31.8h
        tbl             v1.16b, {v30.16b}, v1.16b
        add             v6.8h,   v6.8h,   v31.8h
        tbl             v2.16b, {v30.16b}, v2.16b
        add             v7.8h,   v7.8h,   v31.8h
        tbl             v3.16b, {v30.16b}, v3.16b
        tbl             v4.16b, {v30.16b}, v4.16b
        tbl             v5.16b, {v30.16b}, v5.16b
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        tbl             v6.16b, {v30.16b}, v6.16b
        tbl             v7.16b, {v30.16b}, v7.16b
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  #64             // x2 = second 64-byte half of the same row; x1 stays 1*stride
        // One 64-pixel row (64 indices) per iteration.
64:
        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
        subs            w5,  w5,  #1
        add             v4.16b,  v4.16b,  v4.16b
        add             v5.16b,  v5.16b,  v5.16b
        add             v6.16b,  v6.16b,  v6.16b
        add             v7.16b,  v7.16b,  v7.16b
        zip1            v0.16b,  v4.16b,  v4.16b
        zip2            v1.16b,  v4.16b,  v4.16b
        zip1            v2.16b,  v5.16b,  v5.16b
        zip2            v3.16b,  v5.16b,  v5.16b
        zip1            v4.16b,  v6.16b,  v6.16b
        zip2            v5.16b,  v6.16b,  v6.16b
        zip1            v6.16b,  v7.16b,  v7.16b
        zip2            v7.16b,  v7.16b,  v7.16b
        add             v0.8h,   v0.8h,   v31.8h
        add             v1.8h,   v1.8h,   v31.8h
        add             v2.8h,   v2.8h,   v31.8h
        add             v3.8h,   v3.8h,   v31.8h
        add             v4.8h,   v4.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b
        add             v5.8h,   v5.8h,   v31.8h
        tbl             v1.16b, {v30.16b}, v1.16b
        add             v6.8h,   v6.8h,   v31.8h
        tbl             v2.16b, {v30.16b}, v2.16b
        add             v7.8h,   v7.8h,   v31.8h
        tbl             v3.16b, {v30.16b}, v3.16b
        tbl             v4.16b, {v30.16b}, v4.16b
        tbl             v5.16b, {v30.16b}, v5.16b
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        tbl             v6.16b, {v30.16b}, v6.16b
        tbl             v7.16b, {v30.16b}, v7.16b
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
        b.gt            64b
        ret

// Jump table: halfword offsets, each stored as (table address - target label).
L(pal_pred_tbl):
        .hword L(pal_pred_tbl) - 640b
        .hword L(pal_pred_tbl) - 320b
        .hword L(pal_pred_tbl) - 160b
        .hword L(pal_pred_tbl) -  80b
        .hword L(pal_pred_tbl) -  40b
endfunc

// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height,
//                               const int16_t *ac, const int alpha,
//                               const int bitdepth_max);
//
// Register use, as set up below:
//   v31 = bitdepth_max (upper clamp), v30 = 0 (lower clamp)
//   v0  = (bitdepth_max + 1) >> 1, used as the dc value
//   v1  = alpha, x5 = ac (read sequentially)
//   x0 = dst, x6 = dst + stride, x1 = 2*stride
// The L(ipred_cfl_splat_w*) routines below expect exactly this state with the
// dc value in v0; the other ipred_cfl_* functions in this file branch into
// them after computing their own dc.
// Width dispatch uses clz(width) - 26; widths 16 and 32 share the w16 routine.
function ipred_cfl_128_16bpc_neon, export=1
        dup             v31.8h,  w7   // bitdepth_max
        clz             w9,  w3
        adr             x7,  L(ipred_cfl_128_tbl)
        sub             w9,  w9,  #26
        ldrh            w9,  [x7, w9, uxtw #1]
        urshr           v0.8h,   v31.8h,  #1
        dup             v1.8h,   w6   // alpha
        sub             x7,  x7,  w9, uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        movi            v30.8h,  #0
        br              x7
        // Width 4: four rows (16 ac values) per iteration.
L(ipred_cfl_splat_w4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v4.8h, v5.8h}, [x5], #32
        subs            w4,  w4,  #4
        smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
        smull2          v3.4s,   v4.8h,   v1.8h
        smull           v4.4s,   v5.4h,   v1.4h
        smull2          v5.4s,   v5.8h,   v1.8h
        cmlt            v16.4s,  v2.4s,   #0     // sign
        cmlt            v17.4s,  v3.4s,   #0
        cmlt            v18.4s,  v4.4s,   #0
        cmlt            v19.4s,  v5.4s,   #0
        add             v2.4s,   v2.4s,   v16.4s // diff + sign
        add             v3.4s,   v3.4s,   v17.4s
        add             v4.4s,   v4.4s,   v18.4s
        add             v5.4s,   v5.4s,   v19.4s
        rshrn           v2.4h,   v2.4s,   #6     // (diff + sign + 32) >> 6 = apply_sign()
        rshrn2          v2.8h,   v3.4s,   #6
        rshrn           v3.4h,   v4.4s,   #6
        rshrn2          v3.8h,   v5.4s,   #6
        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        smax            v2.8h,   v2.8h,   v30.8h
        smax            v3.8h,   v3.8h,   v30.8h
        smin            v2.8h,   v2.8h,   v31.8h
        smin            v3.8h,   v3.8h,   v31.8h
        st1             {v2.d}[0],  [x0], x1
        st1             {v2.d}[1],  [x6], x1
        st1             {v3.d}[0],  [x0], x1
        st1             {v3.d}[1],  [x6], x1
        b.gt            L(ipred_cfl_splat_w4)
        ret
        // Width 8: two rows (16 ac values) per iteration.
L(ipred_cfl_splat_w8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v4.8h, v5.8h}, [x5], #32
        subs            w4,  w4,  #2
        smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
        smull2          v3.4s,   v4.8h,   v1.8h
        smull           v4.4s,   v5.4h,   v1.4h
        smull2          v5.4s,   v5.8h,   v1.8h
        cmlt            v16.4s,  v2.4s,   #0     // sign
        cmlt            v17.4s,  v3.4s,   #0
        cmlt            v18.4s,  v4.4s,   #0
        cmlt            v19.4s,  v5.4s,   #0
        add             v2.4s,   v2.4s,   v16.4s // diff + sign
        add             v3.4s,   v3.4s,   v17.4s
        add             v4.4s,   v4.4s,   v18.4s
        add             v5.4s,   v5.4s,   v19.4s
        rshrn           v2.4h,   v2.4s,   #6     // (diff + sign + 32) >> 6 = apply_sign()
        rshrn2          v2.8h,   v3.4s,   #6
        rshrn           v3.4h,   v4.4s,   #6
        rshrn2          v3.8h,   v5.4s,   #6
        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        smax            v2.8h,   v2.8h,   v30.8h
        smax            v3.8h,   v3.8h,   v30.8h
        smin            v2.8h,   v2.8h,   v31.8h
        smin            v3.8h,   v3.8h,   v31.8h
        st1             {v2.8h},  [x0], x1
        st1             {v3.8h},  [x6], x1
        b.gt            L(ipred_cfl_splat_w8)
        ret
        // Width 16 and 32: two rows in parallel (x5/x7 read ac for the row at
        // x0/x6), 16 columns per iteration of loop 1:.
L(ipred_cfl_splat_w16):
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x5,  w3, uxtw #1    // x7 = ac for the second row
        sub             x1,  x1,  w3, uxtw #1    // 2*stride minus the 2*width bytes written per row
        mov             w9,  w3                  // keep width for resetting the column loop
1:
        ld1             {v2.8h, v3.8h}, [x5], #32
        ld1             {v4.8h, v5.8h}, [x7], #32
        subs            w3,  w3,  #16
        smull           v16.4s,  v2.4h,   v1.4h  // diff = ac * alpha
        smull2          v17.4s,  v2.8h,   v1.8h
        smull           v18.4s,  v3.4h,   v1.4h
        smull2          v19.4s,  v3.8h,   v1.8h
        smull           v2.4s,   v4.4h,   v1.4h
        smull2          v3.4s,   v4.8h,   v1.8h
        smull           v4.4s,   v5.4h,   v1.4h
        smull2          v5.4s,   v5.8h,   v1.8h
        cmlt            v20.4s,  v16.4s,  #0     // sign
        cmlt            v21.4s,  v17.4s,  #0
        cmlt            v22.4s,  v18.4s,  #0
        cmlt            v23.4s,  v19.4s,  #0
        cmlt            v24.4s,  v2.4s,   #0
        cmlt            v25.4s,  v3.4s,   #0
        cmlt            v26.4s,  v4.4s,   #0
        cmlt            v27.4s,  v5.4s,   #0
        add             v16.4s,  v16.4s,  v20.4s // diff + sign
        add             v17.4s,  v17.4s,  v21.4s
        add             v18.4s,  v18.4s,  v22.4s
        add             v19.4s,  v19.4s,  v23.4s
        add             v2.4s,   v2.4s,   v24.4s
        add             v3.4s,   v3.4s,   v25.4s
        add             v4.4s,   v4.4s,   v26.4s
        add             v5.4s,   v5.4s,   v27.4s
        rshrn           v16.4h,  v16.4s,  #6     // (diff + sign + 32) >> 6 = apply_sign()
        rshrn2          v16.8h,  v17.4s,  #6
        rshrn           v17.4h,  v18.4s,  #6
        rshrn2          v17.8h,  v19.4s,  #6
        rshrn           v6.4h,   v2.4s,   #6
        rshrn2          v6.8h,   v3.4s,   #6
        rshrn           v7.4h,   v4.4s,   #6
        rshrn2          v7.8h,   v5.4s,   #6
        add             v2.8h,   v16.8h,  v0.8h  // dc + apply_sign()
        add             v3.8h,   v17.8h,  v0.8h
        add             v4.8h,   v6.8h,   v0.8h
        add             v5.8h,   v7.8h,   v0.8h
        smax            v2.8h,   v2.8h,   v30.8h
        smax            v3.8h,   v3.8h,   v30.8h
        smax            v4.8h,   v4.8h,   v30.8h
        smax            v5.8h,   v5.8h,   v30.8h
        smin            v2.8h,   v2.8h,   v31.8h
        smin            v3.8h,   v3.8h,   v31.8h
        smin            v4.8h,   v4.8h,   v31.8h
        smin            v5.8h,   v5.8h,   v31.8h
        st1             {v2.8h, v3.8h},  [x0], #32
        st1             {v4.8h, v5.8h},  [x6], #32
        b.gt            1b
        subs            w4,  w4,  #2
        add             x5,  x5,  w9, uxtw #1    // each ac pointer skips the row the other one just read
        add             x7,  x7,  w9, uxtw #1
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9
        b.gt            1b
        ret

// One table under two labels; ipred_cfl_left_16bpc_neon indexes it through
// L(ipred_cfl_splat_tbl).
L(ipred_cfl_128_tbl):
L(ipred_cfl_splat_tbl):
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
endfunc

// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height,
//                               const int16_t *ac, const int alpha,
//                               const int bitdepth_max);
//
// Same register setup as ipred_cfl_128_16bpc_neon, except that the dc value
// in v0 is the rounded average of the `width` pixels starting at topleft + 2
// bytes. Each width case then branches to the matching L(ipred_cfl_splat_w*)
// routine (width 32 uses the w16 routine).
function ipred_cfl_top_16bpc_neon, export=1
        dup             v31.8h,  w7   // bitdepth_max
        clz             w9,  w3
        adr             x7,  L(ipred_cfl_top_tbl)
        sub             w9,  w9,  #26
        ldrh            w9,  [x7, w9, uxtw #1]
        dup             v1.8h,   w6   // alpha
        add             x2,  x2,  #2
        sub             x7,  x7,  w9, uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        movi            v30.8h,  #0
        br              x7
4:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]
        addv            h0,      v0.4h
        urshr           v0.4h,   v0.4h,   #2     // (sum + 2) >> 2
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w4)
8:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #3     // (sum + 4) >> 3
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w8)
16:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h}, [x2]
        addp            v0.8h,   v2.8h,   v3.8h
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #4     // (sum + 8) >> 4
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)
32:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v0.8h,   v2.8h,   v4.8h
        uaddlv          s0,      v0.8h           // final reduction widened to 32 bit
        rshrn           v0.4h,   v0.4s,   #5     // (sum + 16) >> 5
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)

// Jump table: halfword offsets, each stored as (table address - target label).
L(ipred_cfl_top_tbl):
        .hword L(ipred_cfl_top_tbl) - 32b
        .hword L(ipred_cfl_top_tbl) - 16b
        .hword L(ipred_cfl_top_tbl) -  8b
        .hword L(ipred_cfl_top_tbl) -  4b
endfunc

// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const topleft,
//                                const int width, const int height,
//                                const int16_t *ac, const int alpha,
//                                const int bitdepth_max);
//
// Two-stage dispatch:
//   x7 = height-specific routine (L(ipred_cfl_left_h*), picked by
//        clz(height) - 26), which computes the dc in v0 as the rounded
//        average of the `height` pixels starting at topleft - 2*height bytes
//   x9 = width-specific splat routine (picked by clz(width) - 26 from
//        L(ipred_cfl_splat_tbl)), reached with "br x9" once v0 is ready
// The remaining register setup matches ipred_cfl_128_16bpc_neon.
function ipred_cfl_left_16bpc_neon, export=1
        dup             v31.8h,  w7   // bitdepth_max
        sub             x2,  x2,  w4, uxtw #1
        clz             w9,  w3
        clz             w8,  w4
        adr             x10, L(ipred_cfl_splat_tbl)
        adr             x7,  L(ipred_cfl_left_tbl)
        sub             w9,  w9,  #26
        sub             w8,  w8,  #26
        ldrh            w9,  [x10, w9, uxtw #1]
        ldrh            w8,  [x7,  w8, uxtw #1]
        dup             v1.8h,   w6   // alpha
        sub             x9,  x10, w9, uxtw
        sub             x7,  x7,  w8, uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        movi            v30.8h,  #0
        br              x7

L(ipred_cfl_left_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]
        addv            h0,      v0.4h
        urshr           v0.4h,   v0.4h,   #2     // (sum + 2) >> 2
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #3     // (sum + 4) >> 3
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h}, [x2]
        addp            v0.8h,   v2.8h,   v3.8h
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #4     // (sum + 8) >> 4
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v0.8h,   v2.8h,   v4.8h
        uaddlv          s0,      v0.8h           // final reduction widened to 32 bit
        rshrn           v0.4h,   v0.4s,   #5     // (sum + 16) >> 5
        dup             v0.8h,   v0.h[0]
        br              x9

// Jump table: halfword offsets, each stored as (table address - target label).
L(ipred_cfl_left_tbl):
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
endfunc

// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                           const pixel *const topleft,
//                           const int width, const int height,
//                           const int16_t *ac, const int alpha,
//                           const int bitdepth_max);
function ipred_cfl_16bpc_neon, export=1
        dup             v31.8h,  w7   // bitdepth_max
        sub             x2,  x2,  w4, uxtw #1
        add             w8,  w3,  w4  // width + height
        dup             v1.8h,   w6   //
alpha 2194 clz w9, w3 2195 clz w6, w4 2196 dup v16.4s, w8 // width + height 2197 adr x7, L(ipred_cfl_tbl) 2198 rbit w8, w8 // rbit(width + height) 2199 sub w9, w9, #22 // 26 leading bits, minus table offset 4 2200 sub w6, w6, #26 2201 clz w8, w8 // ctz(width + height) 2202 ldrh w9, [x7, w9, uxtw #1] 2203 ldrh w6, [x7, w6, uxtw #1] 2204 neg w8, w8 // -ctz(width + height) 2205 sub x9, x7, w9, uxtw 2206 sub x7, x7, w6, uxtw 2207 ushr v16.4s, v16.4s, #1 // (width + height) >> 1 2208 dup v17.4s, w8 // -ctz(width + height) 2209 add x6, x0, x1 2210 lsl x1, x1, #1 2211 movi v30.8h, #0 2212 br x7 2213 2214L(ipred_cfl_h4): 2215 AARCH64_VALID_JUMP_TARGET 2216 ld1 {v0.4h}, [x2], #8 2217 uaddlv s0, v0.4h 2218 add x2, x2, #2 2219 br x9 2220L(ipred_cfl_w4): 2221 AARCH64_VALID_JUMP_TARGET 2222 ld1 {v2.4h}, [x2] 2223 add v0.2s, v0.2s, v16.2s 2224 uaddlv s2, v2.4h 2225 cmp w4, #4 2226 add v0.2s, v0.2s, v2.2s 2227 ushl v0.2s, v0.2s, v17.2s 2228 b.eq 1f 2229 // h = 8/16 2230 cmp w4, #16 2231 mov w16, #0x6667 2232 mov w17, #0xAAAB 2233 csel w16, w16, w17, eq 2234 dup v16.2s, w16 2235 mul v0.2s, v0.2s, v16.2s 2236 ushr v0.2s, v0.2s, #17 22371: 2238 dup v0.8h, v0.h[0] 2239 b L(ipred_cfl_splat_w4) 2240 2241L(ipred_cfl_h8): 2242 AARCH64_VALID_JUMP_TARGET 2243 ld1 {v0.8h}, [x2], #16 2244 uaddlv s0, v0.8h 2245 add x2, x2, #2 2246 br x9 2247L(ipred_cfl_w8): 2248 AARCH64_VALID_JUMP_TARGET 2249 ld1 {v2.8h}, [x2] 2250 add v0.2s, v0.2s, v16.2s 2251 uaddlv s2, v2.8h 2252 cmp w4, #8 2253 add v0.2s, v0.2s, v2.2s 2254 ushl v0.2s, v0.2s, v17.2s 2255 b.eq 1f 2256 // h = 4/16/32 2257 cmp w4, #32 2258 mov w16, #0x6667 2259 mov w17, #0xAAAB 2260 csel w16, w16, w17, eq 2261 dup v16.2s, w16 2262 mul v0.2s, v0.2s, v16.2s 2263 ushr v0.2s, v0.2s, #17 22641: 2265 dup v0.8h, v0.h[0] 2266 b L(ipred_cfl_splat_w8) 2267 2268L(ipred_cfl_h16): 2269 AARCH64_VALID_JUMP_TARGET 2270 ld1 {v2.8h, v3.8h}, [x2], #32 2271 addp v0.8h, v2.8h, v3.8h 2272 add x2, x2, #2 2273 uaddlv s0, v0.8h 2274 br x9 2275L(ipred_cfl_w16): 2276 
AARCH64_VALID_JUMP_TARGET 2277 ld1 {v2.8h, v3.8h}, [x2] 2278 add v0.2s, v0.2s, v16.2s 2279 addp v2.8h, v2.8h, v3.8h 2280 uaddlv s2, v2.8h 2281 cmp w4, #16 2282 add v0.2s, v0.2s, v2.2s 2283 ushl v0.2s, v0.2s, v17.2s 2284 b.eq 1f 2285 // h = 4/8/32 2286 tst w4, #(32+16+8) // 16 added to make a consecutive bitmask 2287 mov w16, #0x6667 2288 mov w17, #0xAAAB 2289 csel w16, w16, w17, eq 2290 dup v16.2s, w16 2291 mul v0.2s, v0.2s, v16.2s 2292 ushr v0.2s, v0.2s, #17 22931: 2294 dup v0.8h, v0.h[0] 2295 b L(ipred_cfl_splat_w16) 2296 2297L(ipred_cfl_h32): 2298 AARCH64_VALID_JUMP_TARGET 2299 ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64 2300 addp v2.8h, v2.8h, v3.8h 2301 addp v4.8h, v4.8h, v5.8h 2302 addp v0.8h, v2.8h, v4.8h 2303 add x2, x2, #2 2304 uaddlv s0, v0.8h 2305 br x9 2306L(ipred_cfl_w32): 2307 AARCH64_VALID_JUMP_TARGET 2308 ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] 2309 add v0.4s, v0.4s, v16.4s 2310 addp v2.8h, v2.8h, v3.8h 2311 addp v4.8h, v4.8h, v5.8h 2312 addp v2.8h, v2.8h, v4.8h 2313 cmp w4, #32 2314 uaddlv s2, v2.8h 2315 add v0.2s, v0.2s, v2.2s 2316 ushl v0.2s, v0.2s, v17.2s 2317 b.eq 1f 2318 // h = 8/16 2319 cmp w4, #8 2320 mov w16, #0x6667 2321 mov w17, #0xAAAB 2322 csel w16, w16, w17, eq 2323 dup v16.2s, w16 2324 mul v0.2s, v0.2s, v16.2s 2325 ushr v0.2s, v0.2s, #17 23261: 2327 dup v0.8h, v0.h[0] 2328 b L(ipred_cfl_splat_w16) 2329 2330L(ipred_cfl_tbl): 2331 .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32) 2332 .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16) 2333 .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8) 2334 .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4) 2335 .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32) 2336 .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16) 2337 .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8) 2338 .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4) 2339endfunc 2340 2341// void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx, 2342// const ptrdiff_t stride, const int w_pad, 2343// const int h_pad, const int cw, const int ch); 2344function ipred_cfl_ac_420_16bpc_neon, 
export=1 2345 clz w8, w5 2346 lsl w4, w4, #2 2347 adr x7, L(ipred_cfl_ac_420_tbl) 2348 sub w8, w8, #27 2349 ldrh w8, [x7, w8, uxtw #1] 2350 movi v24.4s, #0 2351 movi v25.4s, #0 2352 movi v26.4s, #0 2353 movi v27.4s, #0 2354 sub x7, x7, w8, uxtw 2355 sub w8, w6, w4 // height - h_pad 2356 rbit w9, w5 // rbit(width) 2357 rbit w10, w6 // rbit(height) 2358 clz w9, w9 // ctz(width) 2359 clz w10, w10 // ctz(height) 2360 add w9, w9, w10 // log2sz 2361 add x10, x1, x2 2362 dup v31.4s, w9 2363 lsl x2, x2, #1 2364 neg v31.4s, v31.4s // -log2sz 2365 br x7 2366 2367L(ipred_cfl_ac_420_w4): 2368 AARCH64_VALID_JUMP_TARGET 23691: // Copy and subsample input 2370 ld1 {v0.8h}, [x1], x2 2371 ld1 {v1.8h}, [x10], x2 2372 ld1 {v2.8h}, [x1], x2 2373 ld1 {v3.8h}, [x10], x2 2374 addp v0.8h, v0.8h, v2.8h 2375 addp v1.8h, v1.8h, v3.8h 2376 add v0.8h, v0.8h, v1.8h 2377 shl v0.8h, v0.8h, #1 2378 subs w8, w8, #2 2379 st1 {v0.8h}, [x0], #16 2380 uaddw v24.4s, v24.4s, v0.4h 2381 uaddw2 v25.4s, v25.4s, v0.8h 2382 b.gt 1b 2383 trn2 v1.2d, v0.2d, v0.2d 2384 trn2 v0.2d, v0.2d, v0.2d 2385L(ipred_cfl_ac_420_w4_hpad): 2386 cbz w4, 3f 23872: // Vertical padding (h_pad > 0) 2388 subs w4, w4, #4 2389 st1 {v0.8h, v1.8h}, [x0], #32 2390 uaddw v24.4s, v24.4s, v0.4h 2391 uaddw2 v25.4s, v25.4s, v0.8h 2392 uaddw v26.4s, v26.4s, v1.4h 2393 uaddw2 v27.4s, v27.4s, v1.8h 2394 b.gt 2b 23953: 2396L(ipred_cfl_ac_420_w4_calc_subtract_dc): 2397 // Aggregate the sums 2398 add v24.4s, v24.4s, v25.4s 2399 add v26.4s, v26.4s, v27.4s 2400 add v0.4s, v24.4s, v26.4s 2401 addv s0, v0.4s // sum 2402 sub x0, x0, w6, uxtw #3 2403 urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz 2404 dup v4.8h, v4.h[0] 24056: // Subtract dc from ac 2406 ld1 {v0.8h, v1.8h}, [x0] 2407 subs w6, w6, #4 2408 sub v0.8h, v0.8h, v4.8h 2409 sub v1.8h, v1.8h, v4.8h 2410 st1 {v0.8h, v1.8h}, [x0], #32 2411 b.gt 6b 2412 ret 2413 2414L(ipred_cfl_ac_420_w8): 2415 AARCH64_VALID_JUMP_TARGET 2416 cbnz w3, L(ipred_cfl_ac_420_w8_wpad) 24171: // Copy 
and subsample input, without padding 2418 ld1 {v0.8h, v1.8h}, [x1], x2 2419 ld1 {v2.8h, v3.8h}, [x10], x2 2420 ld1 {v4.8h, v5.8h}, [x1], x2 2421 addp v0.8h, v0.8h, v1.8h 2422 ld1 {v6.8h, v7.8h}, [x10], x2 2423 addp v2.8h, v2.8h, v3.8h 2424 addp v4.8h, v4.8h, v5.8h 2425 addp v6.8h, v6.8h, v7.8h 2426 add v0.8h, v0.8h, v2.8h 2427 add v4.8h, v4.8h, v6.8h 2428 shl v0.8h, v0.8h, #1 2429 shl v1.8h, v4.8h, #1 2430 subs w8, w8, #2 2431 st1 {v0.8h, v1.8h}, [x0], #32 2432 uaddw v24.4s, v24.4s, v0.4h 2433 uaddw2 v25.4s, v25.4s, v0.8h 2434 uaddw v26.4s, v26.4s, v1.4h 2435 uaddw2 v27.4s, v27.4s, v1.8h 2436 b.gt 1b 2437 mov v0.16b, v1.16b 2438 b L(ipred_cfl_ac_420_w8_hpad) 2439 2440L(ipred_cfl_ac_420_w8_wpad): 24411: // Copy and subsample input, padding 4 2442 ld1 {v0.8h}, [x1], x2 2443 ld1 {v1.8h}, [x10], x2 2444 ld1 {v2.8h}, [x1], x2 2445 ld1 {v3.8h}, [x10], x2 2446 addp v0.8h, v0.8h, v2.8h 2447 addp v1.8h, v1.8h, v3.8h 2448 add v0.8h, v0.8h, v1.8h 2449 shl v0.8h, v0.8h, #1 2450 dup v1.4h, v0.h[3] 2451 dup v3.4h, v0.h[7] 2452 trn2 v2.2d, v0.2d, v0.2d 2453 subs w8, w8, #2 2454 st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 2455 uaddw v24.4s, v24.4s, v0.4h 2456 uaddw v25.4s, v25.4s, v1.4h 2457 uaddw v26.4s, v26.4s, v2.4h 2458 uaddw v27.4s, v27.4s, v3.4h 2459 b.gt 1b 2460 trn1 v0.2d, v2.2d, v3.2d 2461 trn1 v1.2d, v2.2d, v3.2d 2462 2463L(ipred_cfl_ac_420_w8_hpad): 2464 cbz w4, 3f 24652: // Vertical padding (h_pad > 0) 2466 subs w4, w4, #4 2467 st1 {v0.8h, v1.8h}, [x0], #32 2468 uaddw v24.4s, v24.4s, v0.4h 2469 uaddw2 v25.4s, v25.4s, v0.8h 2470 uaddw v26.4s, v26.4s, v1.4h 2471 uaddw2 v27.4s, v27.4s, v1.8h 2472 st1 {v0.8h, v1.8h}, [x0], #32 2473 uaddw v24.4s, v24.4s, v0.4h 2474 uaddw2 v25.4s, v25.4s, v0.8h 2475 uaddw v26.4s, v26.4s, v1.4h 2476 uaddw2 v27.4s, v27.4s, v1.8h 2477 b.gt 2b 24783: 2479 2480 // Double the height and reuse the w4 summing/subtracting 2481 lsl w6, w6, #1 2482 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) 2483 2484L(ipred_cfl_ac_420_w16): 2485 
AARCH64_VALID_JUMP_TARGET 2486 adr x7, L(ipred_cfl_ac_420_w16_tbl) 2487 ldrh w3, [x7, w3, uxtw #1] 2488 sub x7, x7, w3, uxtw 2489 br x7 2490 2491L(ipred_cfl_ac_420_w16_wpad0): 2492 AARCH64_VALID_JUMP_TARGET 24931: // Copy and subsample input, without padding 2494 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 2495 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 2496 addp v0.8h, v0.8h, v1.8h 2497 addp v2.8h, v2.8h, v3.8h 2498 addp v4.8h, v4.8h, v5.8h 2499 addp v6.8h, v6.8h, v7.8h 2500 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], x2 2501 add v0.8h, v0.8h, v4.8h 2502 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2 2503 add v2.8h, v2.8h, v6.8h 2504 addp v16.8h, v16.8h, v17.8h 2505 addp v18.8h, v18.8h, v19.8h 2506 addp v20.8h, v20.8h, v21.8h 2507 addp v22.8h, v22.8h, v23.8h 2508 add v16.8h, v16.8h, v20.8h 2509 add v18.8h, v18.8h, v22.8h 2510 shl v0.8h, v0.8h, #1 2511 shl v1.8h, v2.8h, #1 2512 shl v2.8h, v16.8h, #1 2513 shl v3.8h, v18.8h, #1 2514 subs w8, w8, #2 2515 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2516 uaddw v24.4s, v24.4s, v0.4h 2517 uaddw2 v25.4s, v25.4s, v0.8h 2518 uaddw v26.4s, v26.4s, v1.4h 2519 uaddw2 v27.4s, v27.4s, v1.8h 2520 uaddw v24.4s, v24.4s, v2.4h 2521 uaddw2 v25.4s, v25.4s, v2.8h 2522 uaddw v26.4s, v26.4s, v3.4h 2523 uaddw2 v27.4s, v27.4s, v3.8h 2524 b.gt 1b 2525 mov v0.16b, v2.16b 2526 mov v1.16b, v3.16b 2527 b L(ipred_cfl_ac_420_w16_hpad) 2528 2529L(ipred_cfl_ac_420_w16_wpad1): 2530 AARCH64_VALID_JUMP_TARGET 25311: // Copy and subsample input, padding 4 2532 ldr q2, [x1, #32] 2533 ld1 {v0.8h, v1.8h}, [x1], x2 2534 ldr q5, [x10, #32] 2535 ld1 {v3.8h, v4.8h}, [x10], x2 2536 addp v2.8h, v2.8h, v2.8h 2537 addp v0.8h, v0.8h, v1.8h 2538 addp v5.8h, v5.8h, v5.8h 2539 addp v3.8h, v3.8h, v4.8h 2540 ldr q18, [x1, #32] 2541 add v2.4h, v2.4h, v5.4h 2542 ld1 {v16.8h, v17.8h}, [x1], x2 2543 add v0.8h, v0.8h, v3.8h 2544 ldr q21, [x10, #32] 2545 ld1 {v19.8h, v20.8h}, [x10], x2 2546 addp v18.8h, v18.8h, v18.8h 2547 addp v16.8h, v16.8h, v17.8h 2548 addp v21.8h, 
v21.8h, v21.8h 2549 addp v19.8h, v19.8h, v20.8h 2550 add v18.4h, v18.4h, v21.4h 2551 add v16.8h, v16.8h, v19.8h 2552 shl v1.4h, v2.4h, #1 2553 shl v0.8h, v0.8h, #1 2554 shl v3.4h, v18.4h, #1 2555 shl v2.8h, v16.8h, #1 2556 dup v4.4h, v1.h[3] 2557 dup v5.4h, v3.h[3] 2558 trn1 v1.2d, v1.2d, v4.2d 2559 trn1 v3.2d, v3.2d, v5.2d 2560 subs w8, w8, #2 2561 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2562 uaddw v24.4s, v24.4s, v0.4h 2563 uaddw2 v25.4s, v25.4s, v0.8h 2564 uaddw v26.4s, v26.4s, v1.4h 2565 uaddw2 v27.4s, v27.4s, v1.8h 2566 uaddw v24.4s, v24.4s, v2.4h 2567 uaddw2 v25.4s, v25.4s, v2.8h 2568 uaddw v26.4s, v26.4s, v3.4h 2569 uaddw2 v27.4s, v27.4s, v3.8h 2570 b.gt 1b 2571 mov v0.16b, v2.16b 2572 mov v1.16b, v3.16b 2573 b L(ipred_cfl_ac_420_w16_hpad) 2574 2575L(ipred_cfl_ac_420_w16_wpad2): 2576 AARCH64_VALID_JUMP_TARGET 25771: // Copy and subsample input, padding 8 2578 ld1 {v0.8h, v1.8h}, [x1], x2 2579 ld1 {v2.8h, v3.8h}, [x10], x2 2580 ld1 {v4.8h, v5.8h}, [x1], x2 2581 addp v0.8h, v0.8h, v1.8h 2582 ld1 {v6.8h, v7.8h}, [x10], x2 2583 addp v2.8h, v2.8h, v3.8h 2584 addp v4.8h, v4.8h, v5.8h 2585 addp v6.8h, v6.8h, v7.8h 2586 add v0.8h, v0.8h, v2.8h 2587 add v4.8h, v4.8h, v6.8h 2588 shl v0.8h, v0.8h, #1 2589 shl v2.8h, v4.8h, #1 2590 dup v1.8h, v0.h[7] 2591 dup v3.8h, v2.h[7] 2592 subs w8, w8, #2 2593 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2594 uaddw v24.4s, v24.4s, v0.4h 2595 uaddw2 v25.4s, v25.4s, v0.8h 2596 uaddw v26.4s, v26.4s, v1.4h 2597 uaddw2 v27.4s, v27.4s, v1.8h 2598 uaddw v24.4s, v24.4s, v2.4h 2599 uaddw2 v25.4s, v25.4s, v2.8h 2600 uaddw v26.4s, v26.4s, v3.4h 2601 uaddw2 v27.4s, v27.4s, v3.8h 2602 b.gt 1b 2603 mov v0.16b, v2.16b 2604 mov v1.16b, v3.16b 2605 b L(ipred_cfl_ac_420_w16_hpad) 2606 2607L(ipred_cfl_ac_420_w16_wpad3): 2608 AARCH64_VALID_JUMP_TARGET 26091: // Copy and subsample input, padding 12 2610 ld1 {v0.8h}, [x1], x2 2611 ld1 {v2.8h}, [x10], x2 2612 ld1 {v4.8h}, [x1], x2 2613 ld1 {v6.8h}, [x10], x2 2614 addp v0.8h, v0.8h, v4.8h 2615 addp 
v2.8h, v2.8h, v6.8h 2616 add v0.8h, v0.8h, v2.8h 2617 shl v0.8h, v0.8h, #1 2618 dup v1.8h, v0.h[3] 2619 dup v3.8h, v0.h[7] 2620 trn2 v2.2d, v0.2d, v3.2d 2621 trn1 v0.2d, v0.2d, v1.2d 2622 subs w8, w8, #2 2623 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2624 uaddw v24.4s, v24.4s, v0.4h 2625 uaddw2 v25.4s, v25.4s, v0.8h 2626 uaddw v26.4s, v26.4s, v1.4h 2627 uaddw2 v27.4s, v27.4s, v1.8h 2628 uaddw v24.4s, v24.4s, v2.4h 2629 uaddw2 v25.4s, v25.4s, v2.8h 2630 uaddw v26.4s, v26.4s, v3.4h 2631 uaddw2 v27.4s, v27.4s, v3.8h 2632 b.gt 1b 2633 mov v0.16b, v2.16b 2634 mov v1.16b, v3.16b 2635 2636L(ipred_cfl_ac_420_w16_hpad): 2637 cbz w4, 3f 26382: // Vertical padding (h_pad > 0) 2639 subs w4, w4, #4 2640 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2641 uaddw v24.4s, v24.4s, v0.4h 2642 uaddw2 v25.4s, v25.4s, v0.8h 2643 uaddw v26.4s, v26.4s, v1.4h 2644 uaddw2 v27.4s, v27.4s, v1.8h 2645 uaddw v24.4s, v24.4s, v2.4h 2646 uaddw2 v25.4s, v25.4s, v2.8h 2647 uaddw v26.4s, v26.4s, v3.4h 2648 uaddw2 v27.4s, v27.4s, v3.8h 2649 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2650 uaddw v24.4s, v24.4s, v0.4h 2651 uaddw2 v25.4s, v25.4s, v0.8h 2652 uaddw v26.4s, v26.4s, v1.4h 2653 uaddw2 v27.4s, v27.4s, v1.8h 2654 uaddw v24.4s, v24.4s, v2.4h 2655 uaddw2 v25.4s, v25.4s, v2.8h 2656 uaddw v26.4s, v26.4s, v3.4h 2657 uaddw2 v27.4s, v27.4s, v3.8h 2658 b.gt 2b 26593: 2660 2661 // Quadruple the height and reuse the w4 summing/subtracting 2662 lsl w6, w6, #2 2663 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) 2664 2665L(ipred_cfl_ac_420_tbl): 2666 .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16) 2667 .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8) 2668 .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4) 2669 .hword 0 2670 2671L(ipred_cfl_ac_420_w16_tbl): 2672 .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0) 2673 .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1) 2674 .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2) 2675 .hword 
L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3) 2676endfunc 2677 2678// void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx, 2679// const ptrdiff_t stride, const int w_pad, 2680// const int h_pad, const int cw, const int ch); 2681function ipred_cfl_ac_422_16bpc_neon, export=1 2682 clz w8, w5 2683 lsl w4, w4, #2 2684 adr x7, L(ipred_cfl_ac_422_tbl) 2685 sub w8, w8, #27 2686 ldrh w8, [x7, w8, uxtw #1] 2687 movi v24.4s, #0 2688 movi v25.4s, #0 2689 movi v26.4s, #0 2690 movi v27.4s, #0 2691 sub x7, x7, w8, uxtw 2692 sub w8, w6, w4 // height - h_pad 2693 rbit w9, w5 // rbit(width) 2694 rbit w10, w6 // rbit(height) 2695 clz w9, w9 // ctz(width) 2696 clz w10, w10 // ctz(height) 2697 add w9, w9, w10 // log2sz 2698 add x10, x1, x2 2699 dup v31.4s, w9 2700 lsl x2, x2, #1 2701 neg v31.4s, v31.4s // -log2sz 2702 br x7 2703 2704L(ipred_cfl_ac_422_w4): 2705 AARCH64_VALID_JUMP_TARGET 27061: // Copy and subsample input 2707 ld1 {v0.8h}, [x1], x2 2708 ld1 {v1.8h}, [x10], x2 2709 ld1 {v2.8h}, [x1], x2 2710 ld1 {v3.8h}, [x10], x2 2711 addp v0.8h, v0.8h, v1.8h 2712 addp v2.8h, v2.8h, v3.8h 2713 shl v0.8h, v0.8h, #2 2714 shl v1.8h, v2.8h, #2 2715 subs w8, w8, #4 2716 st1 {v0.8h, v1.8h}, [x0], #32 2717 uaddw v24.4s, v24.4s, v0.4h 2718 uaddw2 v25.4s, v25.4s, v0.8h 2719 uaddw v26.4s, v26.4s, v1.4h 2720 uaddw2 v27.4s, v27.4s, v1.8h 2721 b.gt 1b 2722 trn2 v0.2d, v1.2d, v1.2d 2723 trn2 v1.2d, v1.2d, v1.2d 2724 b L(ipred_cfl_ac_420_w4_hpad) 2725 2726L(ipred_cfl_ac_422_w8): 2727 AARCH64_VALID_JUMP_TARGET 2728 cbnz w3, L(ipred_cfl_ac_422_w8_wpad) 27291: // Copy and subsample input, without padding 2730 ld1 {v0.8h, v1.8h}, [x1], x2 2731 ld1 {v2.8h, v3.8h}, [x10], x2 2732 ld1 {v4.8h, v5.8h}, [x1], x2 2733 addp v0.8h, v0.8h, v1.8h 2734 ld1 {v6.8h, v7.8h}, [x10], x2 2735 addp v2.8h, v2.8h, v3.8h 2736 addp v4.8h, v4.8h, v5.8h 2737 addp v6.8h, v6.8h, v7.8h 2738 shl v0.8h, v0.8h, #2 2739 shl v1.8h, v2.8h, #2 2740 shl v2.8h, v4.8h, #2 2741 shl v3.8h, v6.8h, #2 2742 subs w8, 
w8, #4 2743 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2744 uaddw v24.4s, v24.4s, v0.4h 2745 uaddw2 v25.4s, v25.4s, v0.8h 2746 uaddw v26.4s, v26.4s, v1.4h 2747 uaddw2 v27.4s, v27.4s, v1.8h 2748 uaddw v24.4s, v24.4s, v2.4h 2749 uaddw2 v25.4s, v25.4s, v2.8h 2750 uaddw v26.4s, v26.4s, v3.4h 2751 uaddw2 v27.4s, v27.4s, v3.8h 2752 b.gt 1b 2753 mov v0.16b, v3.16b 2754 mov v1.16b, v3.16b 2755 b L(ipred_cfl_ac_420_w8_hpad) 2756 2757L(ipred_cfl_ac_422_w8_wpad): 27581: // Copy and subsample input, padding 4 2759 ld1 {v0.8h}, [x1], x2 2760 ld1 {v1.8h}, [x10], x2 2761 ld1 {v2.8h}, [x1], x2 2762 ld1 {v3.8h}, [x10], x2 2763 addp v0.8h, v0.8h, v1.8h 2764 addp v2.8h, v2.8h, v3.8h 2765 shl v0.8h, v0.8h, #2 2766 shl v2.8h, v2.8h, #2 2767 dup v4.4h, v0.h[3] 2768 dup v5.8h, v0.h[7] 2769 dup v6.4h, v2.h[3] 2770 dup v7.8h, v2.h[7] 2771 trn2 v1.2d, v0.2d, v5.2d 2772 trn1 v0.2d, v0.2d, v4.2d 2773 trn2 v3.2d, v2.2d, v7.2d 2774 trn1 v2.2d, v2.2d, v6.2d 2775 subs w8, w8, #4 2776 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2777 uaddw v24.4s, v24.4s, v0.4h 2778 uaddw2 v25.4s, v25.4s, v0.8h 2779 uaddw v26.4s, v26.4s, v1.4h 2780 uaddw2 v27.4s, v27.4s, v1.8h 2781 uaddw v24.4s, v24.4s, v2.4h 2782 uaddw2 v25.4s, v25.4s, v2.8h 2783 uaddw v26.4s, v26.4s, v3.4h 2784 uaddw2 v27.4s, v27.4s, v3.8h 2785 b.gt 1b 2786 mov v0.16b, v3.16b 2787 mov v1.16b, v3.16b 2788 b L(ipred_cfl_ac_420_w8_hpad) 2789 2790L(ipred_cfl_ac_422_w16): 2791 AARCH64_VALID_JUMP_TARGET 2792 adr x7, L(ipred_cfl_ac_422_w16_tbl) 2793 ldrh w3, [x7, w3, uxtw #1] 2794 sub x7, x7, w3, uxtw 2795 br x7 2796 2797L(ipred_cfl_ac_422_w16_wpad0): 2798 AARCH64_VALID_JUMP_TARGET 27991: // Copy and subsample input, without padding 2800 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 2801 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 2802 addp v0.8h, v0.8h, v1.8h 2803 addp v2.8h, v2.8h, v3.8h 2804 addp v4.8h, v4.8h, v5.8h 2805 addp v6.8h, v6.8h, v7.8h 2806 shl v0.8h, v0.8h, #2 2807 shl v1.8h, v2.8h, #2 2808 shl v2.8h, v4.8h, #2 2809 shl v3.8h, v6.8h, #2 2810 
subs w8, w8, #2 2811 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2812 uaddw v24.4s, v24.4s, v0.4h 2813 uaddw2 v25.4s, v25.4s, v0.8h 2814 uaddw v26.4s, v26.4s, v1.4h 2815 uaddw2 v27.4s, v27.4s, v1.8h 2816 uaddw v24.4s, v24.4s, v2.4h 2817 uaddw2 v25.4s, v25.4s, v2.8h 2818 uaddw v26.4s, v26.4s, v3.4h 2819 uaddw2 v27.4s, v27.4s, v3.8h 2820 b.gt 1b 2821 mov v0.16b, v2.16b 2822 mov v1.16b, v3.16b 2823 b L(ipred_cfl_ac_420_w16_hpad) 2824 2825L(ipred_cfl_ac_422_w16_wpad1): 2826 AARCH64_VALID_JUMP_TARGET 28271: // Copy and subsample input, padding 4 2828 ldr q2, [x1, #32] 2829 ld1 {v0.8h, v1.8h}, [x1], x2 2830 ldr q6, [x10, #32] 2831 ld1 {v4.8h, v5.8h}, [x10], x2 2832 addp v2.8h, v2.8h, v2.8h 2833 addp v0.8h, v0.8h, v1.8h 2834 addp v6.8h, v6.8h, v6.8h 2835 addp v4.8h, v4.8h, v5.8h 2836 shl v1.4h, v2.4h, #2 2837 shl v0.8h, v0.8h, #2 2838 shl v3.4h, v6.4h, #2 2839 shl v2.8h, v4.8h, #2 2840 dup v4.4h, v1.h[3] 2841 dup v5.4h, v3.h[3] 2842 trn1 v1.2d, v1.2d, v4.2d 2843 trn1 v3.2d, v3.2d, v5.2d 2844 subs w8, w8, #2 2845 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2846 uaddw v24.4s, v24.4s, v0.4h 2847 uaddw2 v25.4s, v25.4s, v0.8h 2848 uaddw v26.4s, v26.4s, v1.4h 2849 uaddw2 v27.4s, v27.4s, v1.8h 2850 uaddw v24.4s, v24.4s, v2.4h 2851 uaddw2 v25.4s, v25.4s, v2.8h 2852 uaddw v26.4s, v26.4s, v3.4h 2853 uaddw2 v27.4s, v27.4s, v3.8h 2854 b.gt 1b 2855 mov v0.16b, v2.16b 2856 mov v1.16b, v3.16b 2857 b L(ipred_cfl_ac_420_w16_hpad) 2858 2859L(ipred_cfl_ac_422_w16_wpad2): 2860 AARCH64_VALID_JUMP_TARGET 28611: // Copy and subsample input, padding 8 2862 ld1 {v0.8h, v1.8h}, [x1], x2 2863 ld1 {v2.8h, v3.8h}, [x10], x2 2864 addp v0.8h, v0.8h, v1.8h 2865 addp v2.8h, v2.8h, v3.8h 2866 shl v0.8h, v0.8h, #2 2867 shl v2.8h, v2.8h, #2 2868 dup v1.8h, v0.h[7] 2869 dup v3.8h, v2.h[7] 2870 subs w8, w8, #2 2871 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2872 uaddw v24.4s, v24.4s, v0.4h 2873 uaddw2 v25.4s, v25.4s, v0.8h 2874 uaddw v26.4s, v26.4s, v1.4h 2875 uaddw2 v27.4s, v27.4s, v1.8h 2876 uaddw v24.4s, 
v24.4s, v2.4h 2877 uaddw2 v25.4s, v25.4s, v2.8h 2878 uaddw v26.4s, v26.4s, v3.4h 2879 uaddw2 v27.4s, v27.4s, v3.8h 2880 b.gt 1b 2881 mov v0.16b, v2.16b 2882 mov v1.16b, v3.16b 2883 b L(ipred_cfl_ac_420_w16_hpad) 2884 2885L(ipred_cfl_ac_422_w16_wpad3): 2886 AARCH64_VALID_JUMP_TARGET 28871: // Copy and subsample input, padding 12 2888 ld1 {v0.8h}, [x1], x2 2889 ld1 {v2.8h}, [x10], x2 2890 addp v0.8h, v0.8h, v0.8h 2891 addp v2.8h, v2.8h, v2.8h 2892 shl v0.4h, v0.4h, #2 2893 shl v2.4h, v2.4h, #2 2894 dup v1.8h, v0.h[3] 2895 dup v3.8h, v2.h[3] 2896 trn1 v0.2d, v0.2d, v1.2d 2897 trn1 v2.2d, v2.2d, v3.2d 2898 subs w8, w8, #2 2899 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2900 uaddw v24.4s, v24.4s, v0.4h 2901 uaddw2 v25.4s, v25.4s, v0.8h 2902 uaddw v26.4s, v26.4s, v1.4h 2903 uaddw2 v27.4s, v27.4s, v1.8h 2904 uaddw v24.4s, v24.4s, v2.4h 2905 uaddw2 v25.4s, v25.4s, v2.8h 2906 uaddw v26.4s, v26.4s, v3.4h 2907 uaddw2 v27.4s, v27.4s, v3.8h 2908 b.gt 1b 2909 mov v0.16b, v2.16b 2910 mov v1.16b, v3.16b 2911 b L(ipred_cfl_ac_420_w16_hpad) 2912 2913L(ipred_cfl_ac_422_tbl): 2914 .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16) 2915 .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8) 2916 .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4) 2917 .hword 0 2918 2919L(ipred_cfl_ac_422_w16_tbl): 2920 .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0) 2921 .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1) 2922 .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2) 2923 .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3) 2924endfunc 2925 2926// void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx, 2927// const ptrdiff_t stride, const int w_pad, 2928// const int h_pad, const int cw, const int ch); 2929function ipred_cfl_ac_444_16bpc_neon, export=1 2930 clz w8, w5 2931 lsl w4, w4, #2 2932 adr x7, L(ipred_cfl_ac_444_tbl) 2933 sub w8, w8, #26 2934 ldrh w8, [x7, w8, uxtw #1] 2935 movi v24.4s, #0 2936 
movi v25.4s, #0 2937 movi v26.4s, #0 2938 movi v27.4s, #0 2939 sub x7, x7, w8, uxtw 2940 sub w8, w6, w4 // height - h_pad 2941 rbit w9, w5 // rbit(width) 2942 rbit w10, w6 // rbit(height) 2943 clz w9, w9 // ctz(width) 2944 clz w10, w10 // ctz(height) 2945 add w9, w9, w10 // log2sz 2946 add x10, x1, x2 2947 dup v31.4s, w9 2948 lsl x2, x2, #1 2949 neg v31.4s, v31.4s // -log2sz 2950 br x7 2951 2952L(ipred_cfl_ac_444_w4): 2953 AARCH64_VALID_JUMP_TARGET 29541: // Copy and expand input 2955 ld1 {v0.4h}, [x1], x2 2956 ld1 {v0.d}[1], [x10], x2 2957 ld1 {v1.4h}, [x1], x2 2958 ld1 {v1.d}[1], [x10], x2 2959 shl v0.8h, v0.8h, #3 2960 shl v1.8h, v1.8h, #3 2961 subs w8, w8, #4 2962 st1 {v0.8h, v1.8h}, [x0], #32 2963 uaddw v24.4s, v24.4s, v0.4h 2964 uaddw2 v25.4s, v25.4s, v0.8h 2965 uaddw v26.4s, v26.4s, v1.4h 2966 uaddw2 v27.4s, v27.4s, v1.8h 2967 b.gt 1b 2968 trn2 v0.2d, v1.2d, v1.2d 2969 trn2 v1.2d, v1.2d, v1.2d 2970 b L(ipred_cfl_ac_420_w4_hpad) 2971 2972L(ipred_cfl_ac_444_w8): 2973 AARCH64_VALID_JUMP_TARGET 29741: // Copy and expand input 2975 ld1 {v0.8h}, [x1], x2 2976 ld1 {v1.8h}, [x10], x2 2977 ld1 {v2.8h}, [x1], x2 2978 shl v0.8h, v0.8h, #3 2979 ld1 {v3.8h}, [x10], x2 2980 shl v1.8h, v1.8h, #3 2981 shl v2.8h, v2.8h, #3 2982 shl v3.8h, v3.8h, #3 2983 subs w8, w8, #4 2984 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2985 uaddw v24.4s, v24.4s, v0.4h 2986 uaddw2 v25.4s, v25.4s, v0.8h 2987 uaddw v26.4s, v26.4s, v1.4h 2988 uaddw2 v27.4s, v27.4s, v1.8h 2989 uaddw v24.4s, v24.4s, v2.4h 2990 uaddw2 v25.4s, v25.4s, v2.8h 2991 uaddw v26.4s, v26.4s, v3.4h 2992 uaddw2 v27.4s, v27.4s, v3.8h 2993 b.gt 1b 2994 mov v0.16b, v3.16b 2995 mov v1.16b, v3.16b 2996 b L(ipred_cfl_ac_420_w8_hpad) 2997 2998L(ipred_cfl_ac_444_w16): 2999 AARCH64_VALID_JUMP_TARGET 3000 cbnz w3, L(ipred_cfl_ac_444_w16_wpad) 30011: // Copy and expand input, without padding 3002 ld1 {v0.8h, v1.8h}, [x1], x2 3003 ld1 {v2.8h, v3.8h}, [x10], x2 3004 shl v0.8h, v0.8h, #3 3005 shl v1.8h, v1.8h, #3 3006 shl v2.8h, v2.8h, #3 
3007 shl v3.8h, v3.8h, #3 3008 subs w8, w8, #2 3009 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 3010 uaddw v24.4s, v24.4s, v0.4h 3011 uaddw2 v25.4s, v25.4s, v0.8h 3012 uaddw v26.4s, v26.4s, v1.4h 3013 uaddw2 v27.4s, v27.4s, v1.8h 3014 uaddw v24.4s, v24.4s, v2.4h 3015 uaddw2 v25.4s, v25.4s, v2.8h 3016 uaddw v26.4s, v26.4s, v3.4h 3017 uaddw2 v27.4s, v27.4s, v3.8h 3018 b.gt 1b 3019 mov v0.16b, v2.16b 3020 mov v1.16b, v3.16b 3021 b L(ipred_cfl_ac_420_w16_hpad) 3022 3023L(ipred_cfl_ac_444_w16_wpad): 30241: // Copy and expand input, padding 8 3025 ld1 {v0.8h}, [x1], x2 3026 ld1 {v2.8h}, [x10], x2 3027 shl v0.8h, v0.8h, #3 3028 shl v2.8h, v2.8h, #3 3029 dup v1.8h, v0.h[7] 3030 dup v3.8h, v2.h[7] 3031 subs w8, w8, #2 3032 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 3033 uaddw v24.4s, v24.4s, v0.4h 3034 uaddw2 v25.4s, v25.4s, v0.8h 3035 uaddw v26.4s, v26.4s, v1.4h 3036 uaddw2 v27.4s, v27.4s, v1.8h 3037 uaddw v24.4s, v24.4s, v2.4h 3038 uaddw2 v25.4s, v25.4s, v2.8h 3039 uaddw v26.4s, v26.4s, v3.4h 3040 uaddw2 v27.4s, v27.4s, v3.8h 3041 b.gt 1b 3042 mov v0.16b, v2.16b 3043 mov v1.16b, v3.16b 3044 b L(ipred_cfl_ac_420_w16_hpad) 3045 3046L(ipred_cfl_ac_444_w32): 3047 AARCH64_VALID_JUMP_TARGET 3048 adr x7, L(ipred_cfl_ac_444_w32_tbl) 3049 ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1 3050 lsr x2, x2, #1 // Restore the stride to one line increments 3051 sub x7, x7, w3, uxtw 3052 br x7 3053 3054L(ipred_cfl_ac_444_w32_wpad0): 3055 AARCH64_VALID_JUMP_TARGET 30561: // Copy and expand input, without padding 3057 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 3058 shl v0.8h, v0.8h, #3 3059 shl v1.8h, v1.8h, #3 3060 shl v2.8h, v2.8h, #3 3061 shl v3.8h, v3.8h, #3 3062 subs w8, w8, #1 3063 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 3064 uaddw v24.4s, v24.4s, v0.4h 3065 uaddw2 v25.4s, v25.4s, v0.8h 3066 uaddw v26.4s, v26.4s, v1.4h 3067 uaddw2 v27.4s, v27.4s, v1.8h 3068 uaddw v24.4s, v24.4s, v2.4h 3069 uaddw2 v25.4s, v25.4s, v2.8h 3070 uaddw v26.4s, v26.4s, v3.4h 3071 uaddw2 v27.4s, v27.4s, v3.8h 
3072 b.gt 1b 3073 b L(ipred_cfl_ac_444_w32_hpad) 3074 3075L(ipred_cfl_ac_444_w32_wpad2): 3076 AARCH64_VALID_JUMP_TARGET 30771: // Copy and expand input, padding 8 3078 ld1 {v0.8h, v1.8h, v2.8h}, [x1], x2 3079 shl v2.8h, v2.8h, #3 3080 shl v0.8h, v0.8h, #3 3081 shl v1.8h, v1.8h, #3 3082 dup v3.8h, v2.h[7] 3083 subs w8, w8, #1 3084 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 3085 uaddw v24.4s, v24.4s, v0.4h 3086 uaddw2 v25.4s, v25.4s, v0.8h 3087 uaddw v26.4s, v26.4s, v1.4h 3088 uaddw2 v27.4s, v27.4s, v1.8h 3089 uaddw v24.4s, v24.4s, v2.4h 3090 uaddw2 v25.4s, v25.4s, v2.8h 3091 uaddw v26.4s, v26.4s, v3.4h 3092 uaddw2 v27.4s, v27.4s, v3.8h 3093 b.gt 1b 3094 b L(ipred_cfl_ac_444_w32_hpad) 3095 3096L(ipred_cfl_ac_444_w32_wpad4): 3097 AARCH64_VALID_JUMP_TARGET 30981: // Copy and expand input, padding 16 3099 ld1 {v0.8h, v1.8h}, [x1], x2 3100 shl v1.8h, v1.8h, #3 3101 shl v0.8h, v0.8h, #3 3102 dup v2.8h, v1.h[7] 3103 dup v3.8h, v1.h[7] 3104 subs w8, w8, #1 3105 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 3106 uaddw v24.4s, v24.4s, v0.4h 3107 uaddw2 v25.4s, v25.4s, v0.8h 3108 uaddw v26.4s, v26.4s, v1.4h 3109 uaddw2 v27.4s, v27.4s, v1.8h 3110 uaddw v24.4s, v24.4s, v2.4h 3111 uaddw2 v25.4s, v25.4s, v2.8h 3112 uaddw v26.4s, v26.4s, v3.4h 3113 uaddw2 v27.4s, v27.4s, v3.8h 3114 b.gt 1b 3115 b L(ipred_cfl_ac_444_w32_hpad) 3116 3117L(ipred_cfl_ac_444_w32_wpad6): 3118 AARCH64_VALID_JUMP_TARGET 31191: // Copy and expand input, padding 24 3120 ld1 {v0.8h}, [x1], x2 3121 shl v0.8h, v0.8h, #3 3122 dup v1.8h, v0.h[7] 3123 dup v2.8h, v0.h[7] 3124 dup v3.8h, v0.h[7] 3125 subs w8, w8, #1 3126 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 3127 uaddw v24.4s, v24.4s, v0.4h 3128 uaddw2 v25.4s, v25.4s, v0.8h 3129 uaddw v26.4s, v26.4s, v1.4h 3130 uaddw2 v27.4s, v27.4s, v1.8h 3131 uaddw v24.4s, v24.4s, v2.4h 3132 uaddw2 v25.4s, v25.4s, v2.8h 3133 uaddw v26.4s, v26.4s, v3.4h 3134 uaddw2 v27.4s, v27.4s, v3.8h 3135 b.gt 1b 3136 3137L(ipred_cfl_ac_444_w32_hpad): 3138 cbz w4, 3f 31392: // Vertical 
padding (h_pad > 0) 3140 subs w4, w4, #2 3141 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 3142 uaddw v24.4s, v24.4s, v0.4h 3143 uaddw2 v25.4s, v25.4s, v0.8h 3144 uaddw v26.4s, v26.4s, v1.4h 3145 uaddw2 v27.4s, v27.4s, v1.8h 3146 uaddw v24.4s, v24.4s, v2.4h 3147 uaddw2 v25.4s, v25.4s, v2.8h 3148 uaddw v26.4s, v26.4s, v3.4h 3149 uaddw2 v27.4s, v27.4s, v3.8h 3150 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 3151 uaddw v24.4s, v24.4s, v0.4h 3152 uaddw2 v25.4s, v25.4s, v0.8h 3153 uaddw v26.4s, v26.4s, v1.4h 3154 uaddw2 v27.4s, v27.4s, v1.8h 3155 uaddw v24.4s, v24.4s, v2.4h 3156 uaddw2 v25.4s, v25.4s, v2.8h 3157 uaddw v26.4s, v26.4s, v3.4h 3158 uaddw2 v27.4s, v27.4s, v3.8h 3159 b.gt 2b 31603: 3161 3162 // Multiply the height by eight and reuse the w4 subtracting 3163 lsl w6, w6, #3 3164 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) 3165 3166L(ipred_cfl_ac_444_tbl): 3167 .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32) 3168 .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16) 3169 .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8) 3170 .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4) 3171 3172L(ipred_cfl_ac_444_w32_tbl): 3173 .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0) 3174 .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2) 3175 .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4) 3176 .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6) 3177endfunc 3178