/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "src/arm/asm.S"
#include "util.S"

// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height,
//                              const int bitdepth_max);
//
// Fills the width x height block at dst with (bitdepth_max + 1) >> 1.
// Register use: x0 = dst (rows 0, 2, ...), x6 = dst + stride (rows 1, 3, ...),
// x1 = 2 * stride, w4 = rows left, x5 = entry point selected from the table.
// Every store loop writes four rows per iteration.
function ipred_dc_128_16bpc_neon, export=1
        ldr         w8, [sp]                    // bitdepth_max: ninth argument, passed on the stack
        clz         w3, w3
        adr         x5, L(ipred_dc_128_tbl)
        sub         w3, w3, #25                 // clz(width) - 25: 0 for width 64 ... 4 for width 4
        ldrh        w3, [x5, w3, uxtw #1]
        dup         v0.8h, w8
        sub         x5, x5, w3, uxtw            // table entries hold (table - label), so subtract
        add         x6, x0, x1
        lsl         x1, x1, #1
        urshr       v0.8h, v0.8h, #1            // (bitdepth_max + 1) >> 1
        br          x5
4:
        st1         {v0.4h}, [x0], x1
        st1         {v0.4h}, [x6], x1
        subs        w4, w4, #4
        st1         {v0.4h}, [x0], x1
        st1         {v0.4h}, [x6], x1
        b.gt        4b
        ret
8:
        st1         {v0.8h}, [x0], x1
        st1         {v0.8h}, [x6], x1
        subs        w4, w4, #4
        st1         {v0.8h}, [x0], x1
        st1         {v0.8h}, [x6], x1
        b.gt        8b
        ret
160:
        // Replicate the fill value so a two-register st1 covers 16 pixels.
        mov         v1.16b, v0.16b
16:
        st1         {v0.8h, v1.8h}, [x0], x1
        st1         {v0.8h, v1.8h}, [x6], x1
        subs        w4, w4, #4
        st1         {v0.8h, v1.8h}, [x0], x1
        st1         {v0.8h, v1.8h}, [x6], x1
        b.gt        16b
        ret
320:
        mov         v1.16b, v0.16b
        mov         v2.16b, v0.16b
        mov         v3.16b, v0.16b
32:
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs        w4, w4, #4
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt        32b
        ret
640:
        mov         v1.16b, v0.16b
        mov         v2.16b, v0.16b
        mov         v3.16b, v0.16b
        // Each row takes two 64-byte stores; the first post-increments by 64,
        // so the row step applied by the second store is reduced to match.
        sub         x1, x1, #64
64:
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs        w4, w4, #4
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt        64b
        ret

// Entry points ordered from width 64 down to width 4.
L(ipred_dc_128_tbl):
        .hword L(ipred_dc_128_tbl) - 640b
        .hword L(ipred_dc_128_tbl) - 320b
        .hword L(ipred_dc_128_tbl) - 160b
.hword L(ipred_dc_128_tbl) -   8b
        .hword L(ipred_dc_128_tbl) -   4b
endfunc

// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height);
//
// Copies the `width` pixels that follow topleft in memory into every row of
// the block. Register use: x0 = dst (even rows), x6 = dst + stride (odd rows),
// x1 = 2 * stride, x2 = topleft + 2 bytes (first top pixel), w4 = rows left.
// The NN0 labels load the top row once; the NN labels are the store loops,
// four rows per iteration.
function ipred_v_16bpc_neon, export=1
        clz         w3, w3
        adr         x5, L(ipred_v_tbl)
        sub         w3, w3, #25                 // clz(width) - 25: 0 for width 64 ... 4 for width 4
        ldrh        w3, [x5, w3, uxtw #1]
        add         x2, x2, #2                  // skip the topleft pixel itself
        sub         x5, x5, w3, uxtw            // table entries hold (table - label)
        add         x6, x0, x1
        lsl         x1, x1, #1
        br          x5
40:
        ld1         {v0.4h}, [x2]
4:
        st1         {v0.4h}, [x0], x1
        st1         {v0.4h}, [x6], x1
        subs        w4, w4, #4
        st1         {v0.4h}, [x0], x1
        st1         {v0.4h}, [x6], x1
        b.gt        4b
        ret
80:
        ld1         {v0.8h}, [x2]
8:
        st1         {v0.8h}, [x0], x1
        st1         {v0.8h}, [x6], x1
        subs        w4, w4, #4
        st1         {v0.8h}, [x0], x1
        st1         {v0.8h}, [x6], x1
        b.gt        8b
        ret
160:
        ld1         {v0.8h, v1.8h}, [x2]
16:
        st1         {v0.8h, v1.8h}, [x0], x1
        st1         {v0.8h, v1.8h}, [x6], x1
        subs        w4, w4, #4
        st1         {v0.8h, v1.8h}, [x0], x1
        st1         {v0.8h, v1.8h}, [x6], x1
        b.gt        16b
        ret
320:
        ld1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
32:
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs        w4, w4, #4
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt        32b
        ret
640:
        ld1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        // Rows are written as two 64-byte halves; the first store advances
        // by 64, so the row step used by the second is reduced by 64.
        sub         x1, x1, #64
        ld1         {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
64:
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1         {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        st1         {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
        subs        w4, w4, #4
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1         {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        st1         {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
        b.gt        64b
        ret

// Entry points ordered from width 64 down to width 4.
L(ipred_v_tbl):
        .hword L(ipred_v_tbl) - 640b
        .hword
L(ipred_v_tbl) - 320b
        .hword L(ipred_v_tbl) - 160b
        .hword L(ipred_v_tbl) -  80b
        .hword L(ipred_v_tbl) -  40b
endfunc

// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height);
//
// Fills each row with one pixel taken from the memory just below topleft.
// Register use: x0 = dst (even rows), x6 = dst + stride (odd rows),
// x1 = 2 * stride, w4 = rows left, x2 = read pointer, starting 8 bytes below
// topleft and stepping back 8 bytes (x7 = -8) per iteration.
// Each ld4r fetches four consecutive pixels and broadcasts one per register;
// v3 holds the one at the highest address (nearest topleft) and is stored to
// the first of the four rows, v0 to the last.
function ipred_h_16bpc_neon, export=1
        clz         w3, w3
        adr         x5, L(ipred_h_tbl)
        sub         w3, w3, #25                 // clz(width) - 25: 0 for width 64 ... 4 for width 4
        ldrh        w3, [x5, w3, uxtw #1]
        sub         x2, x2, #8
        sub         x5, x5, w3, uxtw            // table entries hold (table - label)
        mov         x7, #-8
        add         x6, x0, x1
        lsl         x1, x1, #1
        br          x5
4:
        ld4r        {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
        st1         {v3.4h}, [x0], x1
        st1         {v2.4h}, [x6], x1
        subs        w4, w4, #4
        st1         {v1.4h}, [x0], x1
        st1         {v0.4h}, [x6], x1
        b.gt        4b
        ret
8:
        ld4r        {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
        st1         {v3.8h}, [x0], x1
        st1         {v2.8h}, [x6], x1
        subs        w4, w4, #4
        st1         {v1.8h}, [x0], x1
        st1         {v0.8h}, [x6], x1
        b.gt        8b
        ret
16:
        ld4r        {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
        // The upper part of each row is written with offset stores first;
        // the final st1 writes the first 16 bytes and advances the pointer.
        str         q3, [x0, #16]
        str         q2, [x6, #16]
        st1         {v3.8h}, [x0], x1
        st1         {v2.8h}, [x6], x1
        subs        w4, w4, #4
        str         q1, [x0, #16]
        str         q0, [x6, #16]
        st1         {v1.8h}, [x0], x1
        st1         {v0.8h}, [x6], x1
        b.gt        16b
        ret
32:
        ld4r        {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
        str         q3, [x0, #16]
        str         q2, [x6, #16]
        stp         q3, q3, [x0, #32]
        stp         q2, q2, [x6, #32]
        st1         {v3.8h}, [x0], x1
        st1         {v2.8h}, [x6], x1
        subs        w4, w4, #4
        str         q1, [x0, #16]
        str         q0, [x6, #16]
        stp         q1, q1, [x0, #32]
        stp         q0, q0, [x6, #32]
        st1         {v1.8h}, [x0], x1
        st1         {v0.8h}, [x6], x1
        b.gt        32b
        ret
64:
        ld4r        {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
        str         q3, [x0, #16]
        str         q2, [x6, #16]
        stp         q3, q3, [x0, #32]
        stp         q2, q2, [x6, #32]
        stp         q3, q3, [x0, #64]
        stp         q2, q2, [x6, #64]
        stp         q3, q3, [x0, #96]
        stp         q2, q2, [x6, #96]
        st1         {v3.8h}, [x0], x1
        st1         {v2.8h}, [x6], x1
        subs        w4, w4, #4
        str         q1, [x0,
#16]
        str         q0, [x6, #16]
        stp         q1, q1, [x0, #32]
        stp         q0, q0, [x6, #32]
        stp         q1, q1, [x0, #64]
        stp         q0, q0, [x6, #64]
        stp         q1, q1, [x0, #96]
        stp         q0, q0, [x6, #96]
        st1         {v1.8h}, [x0], x1
        st1         {v0.8h}, [x6], x1
        b.gt        64b
        ret

// Entry points ordered from width 64 down to width 4.
L(ipred_h_tbl):
        .hword L(ipred_h_tbl) - 64b
        .hword L(ipred_h_tbl) - 32b
        .hword L(ipred_h_tbl) - 16b
        .hword L(ipred_h_tbl) -  8b
        .hword L(ipred_h_tbl) -  4b
endfunc

// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height);
//
// Fills the block with the rounded average of the `width` pixels that follow
// topleft in memory. Register use: x0 = dst (even rows), x6 = dst + stride
// (odd rows), x1 = 2 * stride, x2 = topleft + 2 bytes, w4 = rows left.
// The NN0 labels compute the average; the NN labels are the store loops,
// four rows per iteration.
function ipred_dc_top_16bpc_neon, export=1
        clz         w3, w3
        adr         x5, L(ipred_dc_top_tbl)
        sub         w3, w3, #25                 // clz(width) - 25: 0 for width 64 ... 4 for width 4
        ldrh        w3, [x5, w3, uxtw #1]
        add         x2, x2, #2                  // skip the topleft pixel itself
        sub         x5, x5, w3, uxtw            // table entries hold (table - label)
        add         x6, x0, x1
        lsl         x1, x1, #1
        br          x5
40:
        ld1         {v0.4h}, [x2]
        addv        h0, v0.4h
        urshr       v0.4h, v0.4h, #2            // (sum + 2) >> 2
        dup         v0.4h, v0.h[0]
4:
        st1         {v0.4h}, [x0], x1
        st1         {v0.4h}, [x6], x1
        subs        w4, w4, #4
        st1         {v0.4h}, [x0], x1
        st1         {v0.4h}, [x6], x1
        b.gt        4b
        ret
80:
        ld1         {v0.8h}, [x2]
        addv        h0, v0.8h
        urshr       v0.4h, v0.4h, #3            // (sum + 4) >> 3
        dup         v0.8h, v0.h[0]
8:
        st1         {v0.8h}, [x0], x1
        st1         {v0.8h}, [x6], x1
        subs        w4, w4, #4
        st1         {v0.8h}, [x0], x1
        st1         {v0.8h}, [x6], x1
        b.gt        8b
        ret
160:
        ld1         {v0.8h, v1.8h}, [x2]
        addp        v0.8h, v0.8h, v1.8h
        addv        h0, v0.8h
        urshr       v2.4h, v0.4h, #4            // (sum + 8) >> 4
        dup         v0.8h, v2.h[0]
        dup         v1.8h, v2.h[0]
16:
        st1         {v0.8h, v1.8h}, [x0], x1
        st1         {v0.8h, v1.8h}, [x6], x1
        subs        w4, w4, #4
        st1         {v0.8h, v1.8h}, [x0], x1
        st1         {v0.8h, v1.8h}, [x6], x1
        b.gt        16b
        ret
320:
        ld1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
        addp        v0.8h, v0.8h, v1.8h
        addp        v2.8h, v2.8h, v3.8h
        addp        v0.8h, v0.8h, v2.8h
        uaddlv      s0, v0.8h                   // final reduction widens to 32 bits
        rshrn       v4.4h, v0.4s, #5            // (sum + 16) >> 5
        dup         v0.8h, v4.h[0]
        dup         v1.8h, v4.h[0]
        dup         v2.8h, v4.h[0]
        dup
v3.8h, v4.h[0]
32:
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs        w4, w4, #4
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt        32b
        ret
640:
        ld1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        addp        v0.8h, v0.8h, v1.8h
        ld1         {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
        addp        v2.8h, v2.8h, v3.8h
        addp        v4.8h, v4.8h, v5.8h
        addp        v6.8h, v6.8h, v7.8h
        addp        v0.8h, v0.8h, v2.8h
        addp        v4.8h, v4.8h, v6.8h
        addp        v0.8h, v0.8h, v4.8h
        uaddlv      s0, v0.8h                   // final reduction widens to 32 bits
        rshrn       v4.4h, v0.4s, #6            // (sum + 32) >> 6
        // Rows are written as two 64-byte halves; the first store advances
        // by 64, so the row step used by the second is reduced by 64.
        sub         x1, x1, #64
        dup         v0.8h, v4.h[0]
        dup         v1.8h, v4.h[0]
        dup         v2.8h, v4.h[0]
        dup         v3.8h, v4.h[0]
64:
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs        w4, w4, #4
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt        64b
        ret

// Entry points ordered from width 64 down to width 4.
L(ipred_dc_top_tbl):
        .hword L(ipred_dc_top_tbl) - 640b
        .hword L(ipred_dc_top_tbl) - 320b
        .hword L(ipred_dc_top_tbl) - 160b
        .hword L(ipred_dc_top_tbl) -  80b
        .hword L(ipred_dc_top_tbl) -  40b
endfunc

// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height, const int a,
//                               const int max_width, const int max_height);
//
// Fills the block with the rounded average of the `height` pixels stored
// immediately below topleft in memory. Dispatch is two-stage: x5 is the
// handler chosen by height (it computes the average) and x3 the handler
// chosen by width (it stores the rows); the height handler ends in `br x3`.
// Both lookups use one table: entries 0-4 are indexed by clz(height) - 25,
// entries 5-9 by clz(width) - 20.
// Register use: x0 = dst (even rows), x6 = dst + stride (odd rows),
// x1 = 2 * stride, x2 = topleft - 2 * height, w4 = rows left.
function ipred_dc_left_16bpc_neon, export=1
        sub         x2, x2, w4, uxtw #1
        clz         w3, w3
        clz         w7, w4
        adr         x5, L(ipred_dc_left_tbl)
        sub         w3, w3, #20                 // 25 leading bits, minus table offset 5
        sub         w7, w7, #25
        ldrh        w3, [x5, w3, uxtw #1]
        ldrh        w7, [x5, w7, uxtw #1]
        sub         x3, x5, w3, uxtw            // width handler
        sub         x5, x5, w7, uxtw            // height handler
        add         x6, x0, x1
        lsl         x1, x1, #1
        br          x5

L(ipred_dc_left_h4):
        ld1
{v0.4h}, [x2]
        addv        h0, v0.4h
        urshr       v0.4h, v0.4h, #2            // (sum + 2) >> 2
        dup         v0.8h, v0.h[0]              // full 8 lanes: the width handler may need them
        br          x3
L(ipred_dc_left_w4):
        st1         {v0.4h}, [x0], x1
        st1         {v0.4h}, [x6], x1
        subs        w4, w4, #4
        st1         {v0.4h}, [x0], x1
        st1         {v0.4h}, [x6], x1
        b.gt        L(ipred_dc_left_w4)
        ret

L(ipred_dc_left_h8):
        ld1         {v0.8h}, [x2]
        addv        h0, v0.8h
        urshr       v0.4h, v0.4h, #3            // (sum + 4) >> 3
        dup         v0.8h, v0.h[0]
        br          x3
L(ipred_dc_left_w8):
        st1         {v0.8h}, [x0], x1
        st1         {v0.8h}, [x6], x1
        subs        w4, w4, #4
        st1         {v0.8h}, [x0], x1
        st1         {v0.8h}, [x6], x1
        b.gt        L(ipred_dc_left_w8)
        ret

L(ipred_dc_left_h16):
        ld1         {v0.8h, v1.8h}, [x2]
        addp        v0.8h, v0.8h, v1.8h
        addv        h0, v0.8h
        urshr       v2.4h, v0.4h, #4            // (sum + 8) >> 4
        dup         v0.8h, v2.h[0]
        dup         v1.8h, v2.h[0]
        br          x3
L(ipred_dc_left_w16):
        // Height handlers leave the result in v0; the width handlers replicate
        // it into as many registers as their stores need.
        mov         v1.16b, v0.16b
1:
        st1         {v0.8h, v1.8h}, [x0], x1
        st1         {v0.8h, v1.8h}, [x6], x1
        subs        w4, w4, #4
        st1         {v0.8h, v1.8h}, [x0], x1
        st1         {v0.8h, v1.8h}, [x6], x1
        b.gt        1b
        ret

L(ipred_dc_left_h32):
        ld1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
        addp        v0.8h, v0.8h, v1.8h
        addp        v2.8h, v2.8h, v3.8h
        addp        v0.8h, v0.8h, v2.8h
        uaddlp      v0.4s, v0.8h                // widen to 32 bits before the final reduction
        addv        s0, v0.4s
        rshrn       v4.4h, v0.4s, #5            // (sum + 16) >> 5
        dup         v0.8h, v4.h[0]
        br          x3
L(ipred_dc_left_w32):
        mov         v1.16b, v0.16b
        mov         v2.16b, v0.16b
        mov         v3.16b, v0.16b
1:
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs        w4, w4, #4
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt        1b
        ret

L(ipred_dc_left_h64):
        ld1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        addp        v0.8h, v0.8h, v1.8h
        ld1         {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
        addp        v2.8h, v2.8h, v3.8h
        addp        v4.8h, v4.8h, v5.8h
        addp        v6.8h, v6.8h, v7.8h
        addp        v0.8h, v0.8h, v2.8h
        addp        v4.8h, v4.8h, v6.8h
        addp        v0.8h, v0.8h, v4.8h
        uaddlv      s0, v0.8h                   // final reduction widens to 32 bits
        rshrn       v4.4h, v0.4s, #6            // (sum + 32) >> 6
        dup         v0.8h, v4.h[0]
        br
x3
L(ipred_dc_left_w64):
        mov         v1.16b, v0.16b
        mov         v2.16b, v0.16b
        mov         v3.16b, v0.16b
        // Rows are written as two 64-byte halves; the first store advances
        // by 64, so the row step used by the second is reduced by 64.
        sub         x1, x1, #64
1:
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs        w4, w4, #4
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt        1b
        ret

// Entries 0-4: height handlers (64 down to 4); entries 5-9: width handlers.
L(ipred_dc_left_tbl):
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
endfunc

// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                          const pixel *const topleft,
//                          const int width, const int height, const int a,
//                          const int max_width, const int max_height);
//
// Prologue of the combined DC predictor. It prepares:
//   x2  = topleft - 2 * height (lowest address of the pixels below topleft)
//   v16 = (width + height) >> 1, the rounding term
//   v17 = -ctz(width + height), used as a (negative) ushl shift amount
//   x5  = handler chosen by height, x3 = handler chosen by width
//   x0 / x6 = even / odd row pointers, x1 = 2 * stride
// Both lookups use one table: entries 0-4 are indexed by clz(height) - 25,
// entries 5-9 by clz(width) - 20.
function ipred_dc_16bpc_neon, export=1
        sub         x2, x2, w4, uxtw #1
        add         w7, w3, w4                  // width + height
        clz         w3, w3
        clz         w6, w4
        dup         v16.4s, w7                  // width + height
        adr         x5, L(ipred_dc_tbl)
        rbit        w7, w7                      // rbit(width + height)
        sub         w3, w3, #20                 // 25 leading bits, minus table offset 5
        sub         w6, w6, #25
        clz         w7, w7                      // ctz(width + height)
        ldrh        w3, [x5, w3, uxtw #1]
        ldrh        w6, [x5, w6, uxtw #1]
        neg         w7, w7                      // -ctz(width + height)
        sub         x3, x5, w3, uxtw
        sub         x5, x5, w6, uxtw
        ushr        v16.4s, v16.4s, #1          // (width + height) >> 1
        dup         v17.4s, w7                  // -ctz(width + height)
        add         x6, x0, x1
        lsl         x1, x1, #1
        br          x5

// Height handlers: sum the `height` pixels at x2 into s0, step x2 past them
// and past the topleft pixel (+2 bytes), then branch to the width handler.
// Width handlers: add the sum of `width` pixels at x2 and the rounding term
// in v16, shift right by ctz(width + height) (ushl by the negative count in
// v17) and, when width != height, multiply by 0xAAAB or 0x6667 and shift
// right by 17 to divide by the remaining factor of 3 or 5.
L(ipred_dc_h4):
        ld1         {v0.4h}, [x2], #8
        uaddlv      s0, v0.4h
        add         x2, x2, #2
        br          x3
L(ipred_dc_w4):
        ld1         {v1.4h}, [x2]
        add         v0.2s, v0.2s, v16.2s
        uaddlv      s1, v1.4h
        cmp         w4, #4
        add         v0.2s, v0.2s, v1.2s
        ushl        v0.2s, v0.2s, v17.2s
        b.eq        1f
        // h = 8/16
        cmp         w4, #16
        mov         w16, #0x6667                // ~2^17 / 5, used when h == 16 (4 + 16 = 4 * 5)
        mov         w17, #0xAAAB                // ~2^17 / 3, used when h == 8  (4 + 8 = 4 * 3)
        csel        w16, w16, w17, eq
        dup         v16.2s, w16
        mul         v0.2s, v0.2s, v16.2s
        ushr        v0.2s, v0.2s, #17
1:
        dup         v0.4h, v0.h[0]
2:
        st1         {v0.4h}, [x0], x1
        st1         {v0.4h}, [x6], x1
        subs        w4, w4, #4
        st1         {v0.4h}, [x0], x1
        st1         {v0.4h}, [x6], x1
        b.gt        2b
        ret

L(ipred_dc_h8):
        ld1         {v0.8h}, [x2], #16
        uaddlv      s0, v0.8h
        add         x2, x2, #2
        br          x3
L(ipred_dc_w8):
        ld1         {v1.8h}, [x2]
        add         v0.2s, v0.2s, v16.2s
        uaddlv      s1, v1.8h
        cmp         w4, #8
        add         v0.2s, v0.2s, v1.2s
        ushl        v0.2s, v0.2s, v17.2s
        b.eq        1f
        // h = 4/16/32
        cmp         w4, #32
        mov         w16, #0x6667                // divide by 5 when h == 32
        mov         w17, #0xAAAB                // divide by 3 when h == 4 or 16
        csel        w16, w16, w17, eq
        dup         v16.2s, w16
        mul         v0.2s, v0.2s, v16.2s
        ushr        v0.2s, v0.2s, #17
1:
        dup         v0.8h, v0.h[0]
2:
        st1         {v0.8h}, [x0], x1
        st1         {v0.8h}, [x6], x1
        subs        w4, w4, #4
        st1         {v0.8h}, [x0], x1
        st1         {v0.8h}, [x6], x1
        b.gt        2b
        ret

L(ipred_dc_h16):
        ld1         {v0.8h, v1.8h}, [x2], #32
        addp        v0.8h, v0.8h, v1.8h
        add         x2, x2, #2
        uaddlv      s0, v0.8h
        br          x3
L(ipred_dc_w16):
        ld1         {v1.8h, v2.8h}, [x2]
        add         v0.2s, v0.2s, v16.2s
        addp        v1.8h, v1.8h, v2.8h
        uaddlv      s1, v1.8h
        cmp         w4, #16
        add         v0.2s, v0.2s, v1.2s
        ushl        v4.2s, v0.2s, v17.2s
        b.eq        1f
        // h = 4/8/32/64
        tst         w4, #(32+16+8)              // 16 added to make a consecutive bitmask
        mov         w16, #0x6667                // divide by 5 when h == 4 or 64 (tst result zero)
        mov         w17, #0xAAAB                // divide by 3 when h == 8 or 32
        csel        w16, w16, w17, eq
        dup         v16.2s, w16
        mul         v4.2s, v4.2s, v16.2s
        ushr        v4.2s, v4.2s, #17
1:
        dup         v0.8h, v4.h[0]
        dup         v1.8h, v4.h[0]
2:
        st1         {v0.8h, v1.8h}, [x0], x1
        st1         {v0.8h, v1.8h}, [x6], x1
        subs        w4, w4, #4
        st1
{v0.8h, v1.8h}, [x0], x1
        st1         {v0.8h, v1.8h}, [x6], x1
        b.gt        2b
        ret

// Height handler: sums 32 pixels at x2 into s0, steps x2 past them and past
// the following pixel (+2 bytes), then branches to the width handler in x3.
L(ipred_dc_h32):
        ld1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        addp        v0.8h, v0.8h, v1.8h
        addp        v2.8h, v2.8h, v3.8h
        addp        v0.8h, v0.8h, v2.8h
        add         x2, x2, #2
        uaddlv      s0, v0.8h
        br          x3
// Width handler: adds the sum of 32 pixels at x2 and the term in v16, applies
// the ushl by v17, and for h != 32 the multiply / >> 17 step below.
L(ipred_dc_w32):
        ld1         {v1.8h, v2.8h, v3.8h, v4.8h}, [x2]
        add         v0.2s, v0.2s, v16.2s
        addp        v1.8h, v1.8h, v2.8h
        addp        v3.8h, v3.8h, v4.8h
        addp        v1.8h, v1.8h, v3.8h
        uaddlv      s1, v1.8h
        cmp         w4, #32
        add         v0.2s, v0.2s, v1.2s
        ushl        v4.2s, v0.2s, v17.2s
        b.eq        1f
        // h = 8/16/64
        cmp         w4, #8
        mov         w16, #0x6667                // selected when h == 8 (32 + 8 = 8 * 5)
        mov         w17, #0xAAAB                // selected when h == 16 or 64
        csel        w16, w16, w17, eq
        dup         v16.2s, w16
        mul         v4.2s, v4.2s, v16.2s
        ushr        v4.2s, v4.2s, #17
1:
        dup         v0.8h, v4.h[0]
        dup         v1.8h, v4.h[0]
        dup         v2.8h, v4.h[0]
        dup         v3.8h, v4.h[0]
2:
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs        w4, w4, #4
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt        2b
        ret

// Height handler for 64 pixels; same contract as the 32-pixel one above.
L(ipred_dc_h64):
        ld1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        addp        v0.8h, v0.8h, v1.8h
        ld1         {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        addp        v2.8h, v2.8h, v3.8h
        addp        v4.8h, v4.8h, v5.8h
        addp        v6.8h, v6.8h, v7.8h
        addp        v0.8h, v0.8h, v2.8h
        addp        v4.8h, v4.8h, v6.8h
        addp        v0.8h, v0.8h, v4.8h
        add         x2, x2, #2
        uaddlv      s0, v0.8h
        br          x3
// Width handler for 64 pixels; v20-v23 hold the second half of the row so
// that the running sum in v0 is not overwritten.
L(ipred_dc_w64):
        ld1         {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64
        add         v0.2s, v0.2s, v16.2s
        addp        v1.8h, v1.8h, v2.8h
        ld1         {v20.8h, v21.8h, v22.8h, v23.8h}, [x2]
        addp        v3.8h, v3.8h, v4.8h
        addp        v20.8h, v20.8h, v21.8h
        addp        v22.8h, v22.8h, v23.8h
        addp        v1.8h, v1.8h, v3.8h
        addp        v20.8h, v20.8h, v22.8h
        addp        v1.8h, v1.8h, v20.8h
        uaddlv      s1, v1.8h
        cmp         w4, #64
        add         v0.2s, v0.2s, v1.2s
        ushl        v4.2s, v0.2s, v17.2s
        b.eq        1f
        // h = 16/32
        cmp         w4, #16
        mov         w16, #0x6667
        mov         w17, #0xAAAB
csel        w16, w16, w17, eq
        dup         v16.2s, w16
        mul         v4.2s, v4.2s, v16.2s
        ushr        v4.2s, v4.2s, #17
1:
        // Rows are written as two 64-byte halves; the first store advances
        // by 64, so the row step used by the second is reduced by 64.
        sub         x1, x1, #64
        dup         v0.8h, v4.h[0]
        dup         v1.8h, v4.h[0]
        dup         v2.8h, v4.h[0]
        dup         v3.8h, v4.h[0]
2:
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs        w4, w4, #4
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt        2b
        ret

// Entries 0-4: height handlers (64 down to 4); entries 5-9: width handlers.
L(ipred_dc_tbl):
        .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
endfunc

// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                             const pixel *const topleft,
//                             const int width, const int height, const int a,
//                             const int max_width, const int max_height);
//
// Paeth predictor: per pixel, base = left + top - topleft, and the output is
// whichever of left / top / topleft is closest to base (see the comments on
// the compare and select instructions).
// Register use: v4 = topleft broadcast, x8 = topleft + 2 bytes (top row),
// x2 = topleft - 8 bytes, stepped back 8 bytes (x7 = -8) per four rows,
// x0 / x6 = even / odd row pointers, x1 = 2 * stride, w4 = rows left.
function ipred_paeth_16bpc_neon, export=1
        clz         w9, w3
        adr         x5, L(ipred_paeth_tbl)
        sub         w9, w9, #25                 // clz(width) - 25: 0 for width 64 ... 4 for width 4
        ldrh        w9, [x5, w9, uxtw #1]
        ld1r        {v4.8h}, [x2]
        add         x8, x2, #2
        sub         x2, x2, #8
        sub         x5, x5, w9, uxtw            // table entries hold (table - label)
        mov         x7, #-8
        add         x6, x0, x1
        lsl         x1, x1, #1
        br          x5
40:
        ld1r        {v5.2d}, [x8]               // four top pixels, repeated in both halves
        sub         v6.8h, v5.8h, v4.8h         // top - topleft
4:
        ld4r        {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7
        // Pack two rows per register: v0 = {v0, v1}, v2 = {v2, v3}.
        zip1        v0.2d, v0.2d, v1.2d
        zip1        v2.2d, v2.2d, v3.2d
        add         v16.8h, v6.8h, v0.8h        // base
        add         v17.8h, v6.8h, v2.8h
        sabd        v20.8h, v5.8h, v16.8h       // tdiff
        sabd        v21.8h, v5.8h, v17.8h
        sabd        v22.8h, v4.8h, v16.8h       // tldiff
sabd        v23.8h, v4.8h, v17.8h
        sabd        v16.8h, v0.8h, v16.8h       // ldiff
        sabd        v17.8h, v2.8h, v17.8h
        umin        v18.8h, v20.8h, v22.8h      // min(tdiff, tldiff)
        umin        v19.8h, v21.8h, v23.8h
        cmge        v20.8h, v22.8h, v20.8h      // tldiff >= tdiff
        cmge        v21.8h, v23.8h, v21.8h
        cmge        v16.8h, v18.8h, v16.8h      // min(tdiff, tldiff) >= ldiff
        cmge        v17.8h, v19.8h, v17.8h
        bsl         v21.16b, v5.16b, v4.16b     // tdiff <= tldiff ? top : topleft
        bsl         v20.16b, v5.16b, v4.16b
        bit         v21.16b, v2.16b, v17.16b    // ldiff <= min ? left : ...
        bit         v20.16b, v0.16b, v16.16b
        // The pixel nearest topleft sits in the high half of v21, so the
        // halves are stored in the order v21[1], v21[0], v20[1], v20[0].
        st1         {v21.d}[1], [x0], x1
        st1         {v21.d}[0], [x6], x1
        subs        w4, w4, #4
        st1         {v20.d}[1], [x0], x1
        st1         {v20.d}[0], [x6], x1
        b.gt        4b
        ret
80:
160:
320:
640:
        // Shared path for widths 8-64: four rows at a time, 8 pixels per
        // inner iteration. w9 keeps the width for rewinding after each row group.
        ld1         {v5.8h}, [x8], #16
        mov         w9, w3
        // Set up pointers for four rows in parallel; x0, x6, x5, x10
        add         x5, x0, x1
        add         x10, x6, x1
        lsl         x1, x1, #1
        sub         x1, x1, w3, uxtw #1         // row step minus the bytes the inner loop advances
1:
        ld4r        {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
2:
        sub         v6.8h, v5.8h, v4.8h         // top - topleft
        add         v16.8h, v6.8h, v0.8h        // base
        add         v17.8h, v6.8h, v1.8h
        add         v18.8h, v6.8h, v2.8h
        add         v19.8h, v6.8h, v3.8h
        sabd        v20.8h, v5.8h, v16.8h       // tdiff
        sabd        v21.8h, v5.8h, v17.8h
        sabd        v22.8h, v5.8h, v18.8h
        sabd        v23.8h, v5.8h, v19.8h
        sabd        v24.8h, v4.8h, v16.8h       // tldiff
        sabd        v25.8h, v4.8h, v17.8h
        sabd        v26.8h, v4.8h, v18.8h
        sabd        v27.8h, v4.8h, v19.8h
        sabd        v16.8h, v0.8h, v16.8h       // ldiff
        sabd        v17.8h, v1.8h, v17.8h
        sabd        v18.8h, v2.8h, v18.8h
        sabd        v19.8h, v3.8h, v19.8h
        umin        v28.8h, v20.8h, v24.8h      // min(tdiff, tldiff)
        umin        v29.8h, v21.8h, v25.8h
        umin        v30.8h, v22.8h, v26.8h
        umin        v31.8h, v23.8h, v27.8h
        cmge        v20.8h, v24.8h, v20.8h      // tldiff >= tdiff
        cmge        v21.8h, v25.8h, v21.8h
        cmge        v22.8h, v26.8h, v22.8h
        cmge        v23.8h, v27.8h, v23.8h
        cmge        v16.8h, v28.8h, v16.8h      // min(tdiff, tldiff) >= ldiff
        cmge        v17.8h, v29.8h, v17.8h
        cmge        v18.8h, v30.8h, v18.8h
        cmge
v19.8h, v31.8h, v19.8h
        bsl         v23.16b, v5.16b, v4.16b     // tdiff <= tldiff ? top : topleft
        bsl         v22.16b, v5.16b, v4.16b
        bsl         v21.16b, v5.16b, v4.16b
        bsl         v20.16b, v5.16b, v4.16b
        bit         v23.16b, v3.16b, v19.16b    // ldiff <= min ? left : ...
        bit         v22.16b, v2.16b, v18.16b
        bit         v21.16b, v1.16b, v17.16b
        bit         v20.16b, v0.16b, v16.16b
        st1         {v23.8h}, [x0], #16
        st1         {v22.8h}, [x6], #16
        subs        w3, w3, #8
        st1         {v21.8h}, [x5], #16
        st1         {v20.8h}, [x10], #16
        b.le        8f
        ld1         {v5.8h}, [x8], #16
        b           2b
8:
        subs        w4, w4, #4
        b.le        9f
        // End of horizontal loop, move pointers to next four rows
        sub         x8, x8, w9, uxtw #1
        add         x0, x0, x1
        add         x6, x6, x1
        // Load the top row as early as possible
        ld1         {v5.8h}, [x8], #16
        add         x5, x5, x1
        add         x10, x10, x1
        mov         w3, w9
        b           1b
9:
        ret

// Entry points ordered from width 64 down to width 4.
L(ipred_paeth_tbl):
        .hword L(ipred_paeth_tbl) - 640b
        .hword L(ipred_paeth_tbl) - 320b
        .hword L(ipred_paeth_tbl) - 160b
        .hword L(ipred_paeth_tbl) -  80b
        .hword L(ipred_paeth_tbl) -  40b
endfunc

// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height);
//
// Two-dimensional smooth predictor. Each output accumulates
//   (bottom + right) * 256 + (left - right) * weights_hor
//                          + (top - bottom) * weights_ver
// in 32 bits and is narrowed with a rounding shift right by 9.
// Register use: x10 = sm_weights + width (weights_hor), x11 = sm_weights +
// height (weights_ver), v4 = "bottom" (pixel at topleft - 2 * height,
// broadcast), x8 = topleft + 2 bytes (top row), x0 / x6 = even / odd row
// pointers, x1 = 2 * stride, w4 = rows left.
function ipred_smooth_16bpc_neon, export=1
        movrel      x10, X(sm_weights)
        add         x11, x10, w4, uxtw
        add         x10, x10, w3, uxtw
        clz         w9, w3
        adr         x5, L(ipred_smooth_tbl)
        sub         x12, x2, w4, uxtw #1
        sub         w9, w9, #25                 // clz(width) - 25: 0 for width 64 ... 4 for width 4
        ldrh        w9, [x5, w9, uxtw #1]
        ld1r        {v4.8h}, [x12]              // bottom
        add         x8, x2, #2
        sub         x5, x5, w9, uxtw            // table entries hold (table - label)
        add         x6, x0, x1
        lsl         x1, x1, #1
        br          x5
40:
        ld1r        {v6.2d}, [x8]               // top
        ld1r        {v7.2s}, [x10]              // weights_hor
        sub         x2, x2, #8
        mov         x7, #-8
        dup         v5.8h, v6.h[3]              // right
        sub         v6.8h, v6.8h, v4.8h         // top-bottom
        uxtl        v7.8h, v7.8b                // weights_hor
        add         v31.4h, v4.4h, v5.4h        // bottom+right
4:
        ld4r        {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left
        ld4r
{v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver
        ushll       v20.4s, v31.4h, #8          // (bottom+right)*256
        ushll       v21.4s, v31.4h, #8
        ushll       v22.4s, v31.4h, #8
        ushll       v23.4s, v31.4h, #8
        zip1        v1.2d, v1.2d, v0.2d         // left, flipped
        zip1        v0.2d, v3.2d, v2.2d
        zip1        v16.2s, v16.2s, v17.2s      // weights_ver
        zip1        v18.2s, v18.2s, v19.2s
        sub         v0.8h, v0.8h, v5.8h         // left-right
        sub         v1.8h, v1.8h, v5.8h
        uxtl        v16.8h, v16.8b              // weights_ver
        uxtl        v18.8h, v18.8b
        smlal       v20.4s, v0.4h, v7.4h        // += (left-right)*weights_hor
        smlal2      v21.4s, v0.8h, v7.8h
        smlal       v22.4s, v1.4h, v7.4h
        smlal2      v23.4s, v1.8h, v7.8h
        smlal       v20.4s, v6.4h, v16.4h       // += (top-bottom)*weights_ver
        smlal2      v21.4s, v6.8h, v16.8h
        smlal       v22.4s, v6.4h, v18.4h
        smlal2      v23.4s, v6.8h, v18.8h
        rshrn       v20.4h, v20.4s, #9          // round and narrow back to 16 bits
        rshrn       v21.4h, v21.4s, #9
        rshrn       v22.4h, v22.4s, #9
        rshrn       v23.4h, v23.4s, #9
        st1         {v20.4h}, [x0], x1
        st1         {v21.4h}, [x6], x1
        subs        w4, w4, #4
        st1         {v22.4h}, [x0], x1
        st1         {v23.4h}, [x6], x1
        b.gt        4b
        ret
80:
        ld1         {v6.8h}, [x8]               // top
        ld1         {v7.8b}, [x10]              // weights_hor
        sub         x2, x2, #8
        mov         x7, #-8
        dup         v5.8h, v6.h[7]              // right
        sub         v6.8h, v6.8h, v4.8h         // top-bottom
        uxtl        v7.8h, v7.8b                // weights_hor
        add         v31.4h, v4.4h, v5.4h        // bottom+right
8:
        ld4r        {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left
        ld4r        {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver
        ushll       v20.4s, v31.4h, #8          // (bottom+right)*256
        ushll       v21.4s, v31.4h, #8
        ushll       v22.4s, v31.4h, #8
        ushll       v23.4s, v31.4h, #8
        ushll       v24.4s, v31.4h, #8
        ushll       v25.4s, v31.4h, #8
        ushll       v26.4s, v31.4h, #8
        ushll       v27.4s, v31.4h, #8
        sub         v0.8h, v0.8h, v5.8h         // left-right
        sub         v1.8h, v1.8h, v5.8h
        sub         v2.8h, v2.8h, v5.8h
        sub         v3.8h, v3.8h, v5.8h
        uxtl        v16.8h, v16.8b              // weights_ver
        uxtl        v17.8h, v17.8b
        uxtl        v18.8h, v18.8b
        uxtl        v19.8h, v19.8b
        smlal       v20.4s, v3.4h, v7.4h        // += (left-right)*weights_hor
        smlal2      v21.4s, v3.8h, v7.8h        // (left flipped)
        smlal       v22.4s, v2.4h, v7.4h
        smlal2      v23.4s, v2.8h, v7.8h
        smlal       v24.4s, v1.4h, v7.4h
        smlal2      v25.4s, v1.8h, v7.8h
        smlal       v26.4s, v0.4h, v7.4h
        smlal2      v27.4s, v0.8h, v7.8h
        smlal       v20.4s, v6.4h, v16.4h       // += (top-bottom)*weights_ver
        smlal2      v21.4s, v6.8h, v16.8h
        smlal       v22.4s, v6.4h, v17.4h
        smlal2      v23.4s, v6.8h, v17.8h
        smlal       v24.4s, v6.4h, v18.4h
        smlal2      v25.4s, v6.8h, v18.8h
        smlal       v26.4s, v6.4h, v19.4h
        smlal2      v27.4s, v6.8h, v19.8h
        rshrn       v20.4h, v20.4s, #9
        rshrn2      v20.8h, v21.4s, #9
        rshrn       v21.4h, v22.4s, #9
        rshrn2      v21.8h, v23.4s, #9
        rshrn       v22.4h, v24.4s, #9
        rshrn2      v22.8h, v25.4s, #9
        rshrn       v23.4h, v26.4s, #9
        rshrn2      v23.8h, v27.4s, #9
        st1         {v20.8h}, [x0], x1
        st1         {v21.8h}, [x6], x1
        subs        w4, w4, #4
        st1         {v22.8h}, [x0], x1
        st1         {v23.8h}, [x6], x1
        b.gt        8b
        ret
160:
320:
640:
        // Shared path for widths 16-64: two rows at a time, 16 pixels per
        // inner iteration. w9 keeps the width for rewinding after each row pair.
        add         x12, x2, w3, uxtw #1
        sub         x1, x1, w3, uxtw #1         // row step minus the bytes the inner loop advances
        ld1r        {v5.8h}, [x12]              // right
        sub         x2, x2, #4
        mov         x7, #-4
        mov         w9, w3
        add         v31.4h, v4.4h, v5.4h        // bottom+right

1:
        ld2r        {v0.8h, v1.8h}, [x2], x7    // left
        ld2r        {v16.8b, v17.8b}, [x11], #2 // weights_ver
        sub         v0.8h, v0.8h, v5.8h         // left-right
        sub         v1.8h, v1.8h, v5.8h
        uxtl        v16.8h, v16.8b              // weights_ver
        uxtl        v17.8h, v17.8b
2:
        ld1         {v7.16b}, [x10], #16        // weights_hor
        ld1         {v2.8h, v3.8h}, [x8], #32   // top
        ushll       v20.4s, v31.4h, #8          // (bottom+right)*256
        ushll       v21.4s, v31.4h, #8
        ushll       v22.4s, v31.4h, #8
        ushll       v23.4s, v31.4h, #8
        ushll       v24.4s, v31.4h, #8
        ushll       v25.4s, v31.4h, #8
        ushll       v26.4s, v31.4h, #8
        ushll       v27.4s, v31.4h, #8
        uxtl        v6.8h, v7.8b                // weights_hor
        uxtl2       v7.8h, v7.16b
        sub         v2.8h, v2.8h, v4.8h         // top-bottom
        sub         v3.8h, v3.8h, v4.8h
        smlal       v20.4s, v1.4h, v6.4h        // += (left-right)*weights_hor
        smlal2
v21.4s, v1.8h, v6.8h        // (left flipped)
        smlal       v22.4s, v1.4h, v7.4h
        smlal2      v23.4s, v1.8h, v7.8h
        smlal       v24.4s, v0.4h, v6.4h
        smlal2      v25.4s, v0.8h, v6.8h
        smlal       v26.4s, v0.4h, v7.4h
        smlal2      v27.4s, v0.8h, v7.8h
        smlal       v20.4s, v2.4h, v16.4h       // += (top-bottom)*weights_ver
        smlal2      v21.4s, v2.8h, v16.8h
        smlal       v22.4s, v3.4h, v16.4h
        smlal2      v23.4s, v3.8h, v16.8h
        smlal       v24.4s, v2.4h, v17.4h
        smlal2      v25.4s, v2.8h, v17.8h
        smlal       v26.4s, v3.4h, v17.4h
        smlal2      v27.4s, v3.8h, v17.8h
        rshrn       v20.4h, v20.4s, #9
        rshrn2      v20.8h, v21.4s, #9
        rshrn       v21.4h, v22.4s, #9
        rshrn2      v21.8h, v23.4s, #9
        rshrn       v22.4h, v24.4s, #9
        rshrn2      v22.8h, v25.4s, #9
        rshrn       v23.4h, v26.4s, #9
        rshrn2      v23.8h, v27.4s, #9
        subs        w3, w3, #16
        st1         {v20.8h, v21.8h}, [x0], #32
        st1         {v22.8h, v23.8h}, [x6], #32
        b.gt        2b
        subs        w4, w4, #2
        b.le        9f
        // Next pair of rows: rewind the top-row and weights_hor pointers by
        // the width and restore the column counter.
        sub         x8, x8, w9, uxtw #1
        sub         x10, x10, w9, uxtw
        add         x0, x0, x1
        add         x6, x6, x1
        mov         w3, w9
        b           1b
9:
        ret

// Entry points ordered from width 64 down to width 4.
L(ipred_smooth_tbl):
        .hword L(ipred_smooth_tbl) - 640b
        .hword L(ipred_smooth_tbl) - 320b
        .hword L(ipred_smooth_tbl) - 160b
        .hword L(ipred_smooth_tbl) -  80b
        .hword L(ipred_smooth_tbl) -  40b
endfunc

// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const topleft,
//                                const int width, const int height, const int a,
//                                const int max_width, const int max_height);
//
// Vertical smooth predictor: each output is
//   bottom + (((top - bottom) * weights_ver + 128) >> 8),
// computed with sqrdmulh on weights pre-shifted left by 7.
// Register use: x7 = sm_weights + height (weights_ver), v4 = "bottom" (pixel
// at topleft - 2 * height, broadcast), x2 = topleft + 2 bytes (top row),
// x0 / x6 = even / odd row pointers, x1 = 2 * stride, w4 = rows left.
function ipred_smooth_v_16bpc_neon, export=1
        movrel      x7, X(sm_weights)
        add         x7, x7, w4, uxtw
        clz         w9, w3
        adr         x5, L(ipred_smooth_v_tbl)
        sub         x8, x2, w4, uxtw #1
        sub         w9, w9, #25                 // clz(width) - 25: 0 for width 64 ... 4 for width 4
        ldrh        w9, [x5, w9, uxtw #1]
        ld1r        {v4.8h}, [x8]               // bottom
        add         x2, x2, #2
        sub         x5, x5, w9, uxtw            // table entries hold (table - label)
        add         x6, x0, x1
        lsl         x1, x1, #1
        br          x5
40:
        ld1r        {v6.2d}, [x2]               // top
        sub         v6.8h, v6.8h, v4.8h         // top-bottom
4:
        ld4r        {v16.8b, v17.8b,
v18.8b, v19.8b}, [x7], #4 // weights_ver
        zip1        v16.2s, v16.2s, v17.2s      // weights_ver
        zip1        v18.2s, v18.2s, v19.2s
        ushll       v16.8h, v16.8b, #7          // weights_ver << 7
        ushll       v18.8h, v18.8b, #7
        sqrdmulh    v20.8h, v6.8h, v16.8h       // ((top-bottom)*weights_ver + 128) >> 8
        sqrdmulh    v21.8h, v6.8h, v18.8h
        add         v20.8h, v20.8h, v4.8h
        add         v21.8h, v21.8h, v4.8h
        // Each register holds two 4-pixel rows, one per 64-bit half.
        st1         {v20.d}[0], [x0], x1
        st1         {v20.d}[1], [x6], x1
        subs        w4, w4, #4
        st1         {v21.d}[0], [x0], x1
        st1         {v21.d}[1], [x6], x1
        b.gt        4b
        ret
80:
        ld1         {v6.8h}, [x2]               // top
        sub         v6.8h, v6.8h, v4.8h         // top-bottom
8:
        ld4r        {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
        ushll       v16.8h, v16.8b, #7          // weights_ver << 7
        ushll       v17.8h, v17.8b, #7
        ushll       v18.8h, v18.8b, #7
        ushll       v19.8h, v19.8b, #7
        sqrdmulh    v20.8h, v6.8h, v16.8h       // ((top-bottom)*weights_ver + 128) >> 8
        sqrdmulh    v21.8h, v6.8h, v17.8h
        sqrdmulh    v22.8h, v6.8h, v18.8h
        sqrdmulh    v23.8h, v6.8h, v19.8h
        add         v20.8h, v20.8h, v4.8h
        add         v21.8h, v21.8h, v4.8h
        add         v22.8h, v22.8h, v4.8h
        add         v23.8h, v23.8h, v4.8h
        st1         {v20.8h}, [x0], x1
        st1         {v21.8h}, [x6], x1
        subs        w4, w4, #4
        st1         {v22.8h}, [x0], x1
        st1         {v23.8h}, [x6], x1
        b.gt        8b
        ret
160:
320:
640:
        // Shared path for widths 16-64: four rows at a time, 16 pixels per
        // inner iteration. w9 keeps the width for rewinding after each row group.
        // Set up pointers for four rows in parallel; x0, x6, x5, x8
        add         x5, x0, x1
        add         x8, x6, x1
        lsl         x1, x1, #1
        sub         x1, x1, w3, uxtw #1         // row step minus the bytes the inner loop advances
        mov         w9, w3

1:
        ld4r        {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
        ushll       v16.8h, v16.8b, #7          // weights_ver << 7
        ushll       v17.8h, v17.8b, #7
        ushll       v18.8h, v18.8b, #7
        ushll       v19.8h, v19.8b, #7
2:
        ld1         {v2.8h, v3.8h}, [x2], #32   // top
        sub         v2.8h, v2.8h, v4.8h         // top-bottom
        sub         v3.8h, v3.8h, v4.8h
        sqrdmulh    v20.8h, v2.8h, v16.8h       // ((top-bottom)*weights_ver + 128) >> 8
        sqrdmulh    v21.8h, v3.8h, v16.8h
        sqrdmulh    v22.8h, v2.8h, v17.8h
sqrdmulh v23.8h, v3.8h, v17.8h 1190 sqrdmulh v24.8h, v2.8h, v18.8h 1191 sqrdmulh v25.8h, v3.8h, v18.8h 1192 sqrdmulh v26.8h, v2.8h, v19.8h 1193 sqrdmulh v27.8h, v3.8h, v19.8h 1194 add v20.8h, v20.8h, v4.8h 1195 add v21.8h, v21.8h, v4.8h 1196 add v22.8h, v22.8h, v4.8h 1197 add v23.8h, v23.8h, v4.8h 1198 add v24.8h, v24.8h, v4.8h 1199 add v25.8h, v25.8h, v4.8h 1200 add v26.8h, v26.8h, v4.8h 1201 add v27.8h, v27.8h, v4.8h 1202 subs w3, w3, #16 1203 st1 {v20.8h, v21.8h}, [x0], #32 1204 st1 {v22.8h, v23.8h}, [x6], #32 1205 st1 {v24.8h, v25.8h}, [x5], #32 1206 st1 {v26.8h, v27.8h}, [x8], #32 1207 b.gt 2b 1208 subs w4, w4, #4 1209 b.le 9f 1210 sub x2, x2, w9, uxtw #1 1211 add x0, x0, x1 1212 add x6, x6, x1 1213 add x5, x5, x1 1214 add x8, x8, x1 1215 mov w3, w9 1216 b 1b 12179: 1218 ret 1219 1220L(ipred_smooth_v_tbl): 1221 .hword L(ipred_smooth_v_tbl) - 640b 1222 .hword L(ipred_smooth_v_tbl) - 320b 1223 .hword L(ipred_smooth_v_tbl) - 160b 1224 .hword L(ipred_smooth_v_tbl) - 80b 1225 .hword L(ipred_smooth_v_tbl) - 40b 1226endfunc 1227 1228// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, 1229// const pixel *const topleft, 1230// const int width, const int height, const int a, 1231// const int max_width, const int max_height); 1232function ipred_smooth_h_16bpc_neon, export=1 1233 movrel x8, X(sm_weights) 1234 add x8, x8, w3, uxtw 1235 clz w9, w3 1236 adr x5, L(ipred_smooth_h_tbl) 1237 add x12, x2, w3, uxtw #1 1238 sub w9, w9, #25 1239 ldrh w9, [x5, w9, uxtw #1] 1240 ld1r {v5.8h}, [x12] // right 1241 sub x5, x5, w9, uxtw 1242 add x6, x0, x1 1243 lsl x1, x1, #1 1244 br x5 124540: 1246 ld1r {v7.2s}, [x8] // weights_hor 1247 sub x2, x2, #8 1248 mov x7, #-8 1249 ushll v7.8h, v7.8b, #7 // weights_hor << 7 12504: 1251 ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left 1252 zip1 v1.2d, v1.2d, v0.2d // left, flipped 1253 zip1 v0.2d, v3.2d, v2.2d 1254 sub v0.8h, v0.8h, v5.8h // left-right 1255 sub v1.8h, v1.8h, v5.8h 1256 sqrdmulh v20.8h, v0.8h, v7.8h // 
((left-right)*weights_hor + 128) >> 8 1257 sqrdmulh v21.8h, v1.8h, v7.8h 1258 add v20.8h, v20.8h, v5.8h 1259 add v21.8h, v21.8h, v5.8h 1260 st1 {v20.d}[0], [x0], x1 1261 st1 {v20.d}[1], [x6], x1 1262 subs w4, w4, #4 1263 st1 {v21.d}[0], [x0], x1 1264 st1 {v21.d}[1], [x6], x1 1265 b.gt 4b 1266 ret 126780: 1268 ld1 {v7.8b}, [x8] // weights_hor 1269 sub x2, x2, #8 1270 mov x7, #-8 1271 ushll v7.8h, v7.8b, #7 // weights_hor << 7 12728: 1273 ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left 1274 sub v3.8h, v3.8h, v5.8h // left-right 1275 sub v2.8h, v2.8h, v5.8h 1276 sub v1.8h, v1.8h, v5.8h 1277 sub v0.8h, v0.8h, v5.8h 1278 sqrdmulh v20.8h, v3.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8 1279 sqrdmulh v21.8h, v2.8h, v7.8h // (left flipped) 1280 sqrdmulh v22.8h, v1.8h, v7.8h 1281 sqrdmulh v23.8h, v0.8h, v7.8h 1282 add v20.8h, v20.8h, v5.8h 1283 add v21.8h, v21.8h, v5.8h 1284 add v22.8h, v22.8h, v5.8h 1285 add v23.8h, v23.8h, v5.8h 1286 st1 {v20.8h}, [x0], x1 1287 st1 {v21.8h}, [x6], x1 1288 subs w4, w4, #4 1289 st1 {v22.8h}, [x0], x1 1290 st1 {v23.8h}, [x6], x1 1291 b.gt 8b 1292 ret 1293160: 1294320: 1295640: 1296 sub x2, x2, #8 1297 mov x7, #-8 1298 // Set up pointers for four rows in parallel; x0, x6, x5, x10 1299 add x5, x0, x1 1300 add x10, x6, x1 1301 lsl x1, x1, #1 1302 sub x1, x1, w3, uxtw #1 1303 mov w9, w3 1304 13051: 1306 ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left 1307 sub v0.8h, v0.8h, v5.8h // left-right 1308 sub v1.8h, v1.8h, v5.8h 1309 sub v2.8h, v2.8h, v5.8h 1310 sub v3.8h, v3.8h, v5.8h 13112: 1312 ld1 {v7.16b}, [x8], #16 // weights_hor 1313 ushll v6.8h, v7.8b, #7 // weights_hor << 7 1314 ushll2 v7.8h, v7.16b, #7 1315 sqrdmulh v20.8h, v3.8h, v6.8h // ((left-right)*weights_hor + 128) >> 8 1316 sqrdmulh v21.8h, v3.8h, v7.8h // (left flipped) 1317 sqrdmulh v22.8h, v2.8h, v6.8h 1318 sqrdmulh v23.8h, v2.8h, v7.8h 1319 sqrdmulh v24.8h, v1.8h, v6.8h 1320 sqrdmulh v25.8h, v1.8h, v7.8h 1321 sqrdmulh v26.8h, v0.8h, v6.8h 1322 sqrdmulh v27.8h, v0.8h, 
v7.8h 1323 add v20.8h, v20.8h, v5.8h 1324 add v21.8h, v21.8h, v5.8h 1325 add v22.8h, v22.8h, v5.8h 1326 add v23.8h, v23.8h, v5.8h 1327 add v24.8h, v24.8h, v5.8h 1328 add v25.8h, v25.8h, v5.8h 1329 add v26.8h, v26.8h, v5.8h 1330 add v27.8h, v27.8h, v5.8h 1331 subs w3, w3, #16 1332 st1 {v20.8h, v21.8h}, [x0], #32 1333 st1 {v22.8h, v23.8h}, [x6], #32 1334 st1 {v24.8h, v25.8h}, [x5], #32 1335 st1 {v26.8h, v27.8h}, [x10], #32 1336 b.gt 2b 1337 subs w4, w4, #4 1338 b.le 9f 1339 sub x8, x8, w9, uxtw 1340 add x0, x0, x1 1341 add x6, x6, x1 1342 add x5, x5, x1 1343 add x10, x10, x1 1344 mov w3, w9 1345 b 1b 13469: 1347 ret 1348 1349L(ipred_smooth_h_tbl): 1350 .hword L(ipred_smooth_h_tbl) - 640b 1351 .hword L(ipred_smooth_h_tbl) - 320b 1352 .hword L(ipred_smooth_h_tbl) - 160b 1353 .hword L(ipred_smooth_h_tbl) - 80b 1354 .hword L(ipred_smooth_h_tbl) - 40b 1355endfunc 1356 1357// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride, 1358// const pixel *const topleft, 1359// const int width, const int height, const int filt_idx, 1360// const int max_width, const int max_height, 1361// const int bitdepth_max); 1362.macro filter_fn bpc 1363function ipred_filter_\bpc\()bpc_neon 1364 and w5, w5, #511 1365 movrel x6, X(filter_intra_taps) 1366 lsl w5, w5, #6 1367 add x6, x6, w5, uxtw 1368 ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32 1369 clz w9, w3 1370 adr x5, L(ipred_filter\bpc\()_tbl) 1371 ld1 {v20.8b, v21.8b, v22.8b}, [x6] 1372 sub w9, w9, #26 1373 ldrh w9, [x5, w9, uxtw #1] 1374 sxtl v16.8h, v16.8b 1375 sxtl v17.8h, v17.8b 1376 sub x5, x5, w9, uxtw 1377 sxtl v18.8h, v18.8b 1378 sxtl v19.8h, v19.8b 1379 add x6, x0, x1 1380 lsl x1, x1, #1 1381 sxtl v20.8h, v20.8b 1382 sxtl v21.8h, v21.8b 1383 sxtl v22.8h, v22.8b 1384 dup v31.8h, w8 1385.if \bpc == 10 1386 movi v30.8h, #0 1387.endif 1388 br x5 138940: 1390 ldur d0, [x2, #2] // top (0-3) 1391 sub x2, x2, #4 1392 mov x7, #-4 13934: 1394 ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2) 1395.if \bpc == 10 1396 mul v2.8h, 
v17.8h, v0.h[0] // p1(top[0]) * filter(1) 1397 mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) 1398 mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) 1399 mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) 1400 mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) 1401 mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) 1402 mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) 1403 srshr v2.8h, v2.8h, #4 1404 smax v2.8h, v2.8h, v30.8h 1405.else 1406 smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1) 1407 smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2) 1408 smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3) 1409 smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4) 1410 smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0) 1411 smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5) 1412 smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6) 1413 smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1) 1414 smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2) 1415 smlal2 v3.4s, v19.8h, v0.h[2] // p3(top[2]) * filter(3) 1416 smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4) 1417 smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0) 1418 smlal2 v3.4s, v21.8h, v1.h[1] // p5(left[0]) * filter(5) 1419 smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6) 1420 sqrshrun v2.4h, v2.4s, #4 1421 sqrshrun2 v2.8h, v3.4s, #4 1422.endif 1423 smin v2.8h, v2.8h, v31.8h 1424 subs w4, w4, #2 1425 st1 {v2.d}[0], [x0], x1 1426 ext v0.16b, v2.16b, v2.16b, #8 // move top from [4-7] to [0-3] 1427 st1 {v2.d}[1], [x6], x1 1428 b.gt 4b 1429 ret 143080: 1431 ldur q0, [x2, #2] // top (0-7) 1432 sub x2, x2, #4 1433 mov x7, #-4 14348: 1435 ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2) 1436.if \bpc == 10 1437 mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) 1438 mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) 1439 mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) 1440 mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) 1441 mla v2.8h, 
v16.8h, v1.h[2] // p0(topleft) * filter(0) 1442 mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) 1443 mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) 1444 mul v3.8h, v17.8h, v0.h[4] // p1(top[0]) * filter(1) 1445 mla v3.8h, v18.8h, v0.h[5] // p2(top[1]) * filter(2) 1446 mla v3.8h, v19.8h, v0.h[6] // p3(top[2]) * filter(3) 1447 srshr v2.8h, v2.8h, #4 1448 smax v2.8h, v2.8h, v30.8h 1449 smin v2.8h, v2.8h, v31.8h 1450 mla v3.8h, v20.8h, v0.h[7] // p4(top[3]) * filter(4) 1451 mla v3.8h, v16.8h, v0.h[3] // p0(topleft) * filter(0) 1452 mla v3.8h, v21.8h, v2.h[3] // p5(left[0]) * filter(5) 1453 mla v3.8h, v22.8h, v2.h[7] // p6(left[1]) * filter(6) 1454 srshr v3.8h, v3.8h, #4 1455 smax v3.8h, v3.8h, v30.8h 1456.else 1457 smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1) 1458 smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2) 1459 smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3) 1460 smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4) 1461 smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0) 1462 smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5) 1463 smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6) 1464 smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1) 1465 smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2) 1466 smlal2 v3.4s, v19.8h, v0.h[2] // p3(top[2]) * filter(3) 1467 smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4) 1468 smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0) 1469 smlal2 v3.4s, v21.8h, v1.h[1] // p5(left[0]) * filter(5) 1470 smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6) 1471 smull v4.4s, v17.4h, v0.h[4] // p1(top[0]) * filter(1) 1472 smlal v4.4s, v18.4h, v0.h[5] // p2(top[1]) * filter(2) 1473 smlal v4.4s, v19.4h, v0.h[6] // p3(top[2]) * filter(3) 1474 sqrshrun v2.4h, v2.4s, #4 1475 sqrshrun2 v2.8h, v3.4s, #4 1476 smin v2.8h, v2.8h, v31.8h 1477 smlal v4.4s, v20.4h, v0.h[7] // p4(top[3]) * filter(4) 1478 smlal v4.4s, v16.4h, v0.h[3] // p0(topleft) * filter(0) 1479 smlal v4.4s, 
v21.4h, v2.h[3] // p5(left[0]) * filter(5) 1480 smlal v4.4s, v22.4h, v2.h[7] // p6(left[1]) * filter(6) 1481 smull2 v5.4s, v17.8h, v0.h[4] // p1(top[0]) * filter(1) 1482 smlal2 v5.4s, v18.8h, v0.h[5] // p2(top[1]) * filter(2) 1483 smlal2 v5.4s, v19.8h, v0.h[6] // p3(top[2]) * filter(3) 1484 smlal2 v5.4s, v20.8h, v0.h[7] // p4(top[3]) * filter(4) 1485 smlal2 v5.4s, v16.8h, v0.h[3] // p0(topleft) * filter(0) 1486 smlal2 v5.4s, v21.8h, v2.h[3] // p5(left[0]) * filter(5) 1487 smlal2 v5.4s, v22.8h, v2.h[7] // p6(left[1]) * filter(6) 1488 sqrshrun v3.4h, v4.4s, #4 1489 sqrshrun2 v3.8h, v5.4s, #4 1490.endif 1491 smin v3.8h, v3.8h, v31.8h 1492 subs w4, w4, #2 1493 st2 {v2.d, v3.d}[0], [x0], x1 1494 zip2 v0.2d, v2.2d, v3.2d 1495 st2 {v2.d, v3.d}[1], [x6], x1 1496 b.gt 8b 1497 ret 1498160: 1499320: 1500 add x8, x2, #2 1501 sub x2, x2, #4 1502 mov x7, #-4 1503 sub x1, x1, w3, uxtw #1 1504 mov w9, w3 1505 15061: 1507 ld1 {v0.4h}, [x2], x7 // left (0-1) + topleft (2) 15082: 1509 ld1 {v1.8h, v2.8h}, [x8], #32 // top(0-15) 1510.if \bpc == 10 1511 mul v3.8h, v16.8h, v0.h[2] // p0(topleft) * filter(0) 1512 mla v3.8h, v21.8h, v0.h[1] // p5(left[0]) * filter(5) 1513 mla v3.8h, v22.8h, v0.h[0] // p6(left[1]) * filter(6) 1514 mla v3.8h, v17.8h, v1.h[0] // p1(top[0]) * filter(1) 1515 mla v3.8h, v18.8h, v1.h[1] // p2(top[1]) * filter(2) 1516 mla v3.8h, v19.8h, v1.h[2] // p3(top[2]) * filter(3) 1517 mla v3.8h, v20.8h, v1.h[3] // p4(top[3]) * filter(4) 1518 1519 mul v4.8h, v17.8h, v1.h[4] // p1(top[0]) * filter(1) 1520 mla v4.8h, v18.8h, v1.h[5] // p2(top[1]) * filter(2) 1521 mla v4.8h, v19.8h, v1.h[6] // p3(top[2]) * filter(3) 1522 srshr v3.8h, v3.8h, #4 1523 smax v3.8h, v3.8h, v30.8h 1524 smin v3.8h, v3.8h, v31.8h 1525 mla v4.8h, v20.8h, v1.h[7] // p4(top[3]) * filter(4) 1526 mla v4.8h, v16.8h, v1.h[3] // p0(topleft) * filter(0) 1527 mla v4.8h, v21.8h, v3.h[3] // p5(left[0]) * filter(5) 1528 mla v4.8h, v22.8h, v3.h[7] // p6(left[1]) * filter(6) 1529 1530 mul v5.8h, v17.8h, v2.h[0] // 
p1(top[0]) * filter(1) 1531 mla v5.8h, v18.8h, v2.h[1] // p2(top[1]) * filter(2) 1532 mla v5.8h, v19.8h, v2.h[2] // p3(top[2]) * filter(3) 1533 srshr v4.8h, v4.8h, #4 1534 smax v4.8h, v4.8h, v30.8h 1535 smin v4.8h, v4.8h, v31.8h 1536 mla v5.8h, v20.8h, v2.h[3] // p4(top[3]) * filter(4) 1537 mla v5.8h, v16.8h, v1.h[7] // p0(topleft) * filter(0) 1538 mla v5.8h, v21.8h, v4.h[3] // p5(left[0]) * filter(5) 1539 mla v5.8h, v22.8h, v4.h[7] // p6(left[1]) * filter(6) 1540 1541 mul v6.8h, v17.8h, v2.h[4] // p1(top[0]) * filter(1) 1542 mla v6.8h, v18.8h, v2.h[5] // p2(top[1]) * filter(2) 1543 mla v6.8h, v19.8h, v2.h[6] // p3(top[2]) * filter(3) 1544 srshr v5.8h, v5.8h, #4 1545 smax v5.8h, v5.8h, v30.8h 1546 smin v5.8h, v5.8h, v31.8h 1547 mla v6.8h, v20.8h, v2.h[7] // p4(top[3]) * filter(4) 1548 mla v6.8h, v16.8h, v2.h[3] // p0(topleft) * filter(0) 1549 mla v6.8h, v21.8h, v5.h[3] // p5(left[0]) * filter(5) 1550 mla v6.8h, v22.8h, v5.h[7] // p6(left[1]) * filter(6) 1551 1552 subs w3, w3, #16 1553 srshr v6.8h, v6.8h, #4 1554 smax v6.8h, v6.8h, v30.8h 1555.else 1556 smull v3.4s, v16.4h, v0.h[2] // p0(topleft) * filter(0) 1557 smlal v3.4s, v21.4h, v0.h[1] // p5(left[0]) * filter(5) 1558 smlal v3.4s, v22.4h, v0.h[0] // p6(left[1]) * filter(6) 1559 smlal v3.4s, v17.4h, v1.h[0] // p1(top[0]) * filter(1) 1560 smlal v3.4s, v18.4h, v1.h[1] // p2(top[1]) * filter(2) 1561 smlal v3.4s, v19.4h, v1.h[2] // p3(top[2]) * filter(3) 1562 smlal v3.4s, v20.4h, v1.h[3] // p4(top[3]) * filter(4) 1563 smull2 v4.4s, v16.8h, v0.h[2] // p0(topleft) * filter(0) 1564 smlal2 v4.4s, v21.8h, v0.h[1] // p5(left[0]) * filter(5) 1565 smlal2 v4.4s, v22.8h, v0.h[0] // p6(left[1]) * filter(6) 1566 smlal2 v4.4s, v17.8h, v1.h[0] // p1(top[0]) * filter(1) 1567 smlal2 v4.4s, v18.8h, v1.h[1] // p2(top[1]) * filter(2) 1568 smlal2 v4.4s, v19.8h, v1.h[2] // p3(top[2]) * filter(3) 1569 smlal2 v4.4s, v20.8h, v1.h[3] // p4(top[3]) * filter(4) 1570 1571 smull v5.4s, v17.4h, v1.h[4] // p1(top[0]) * filter(1) 1572 smlal v5.4s, 
v18.4h, v1.h[5] // p2(top[1]) * filter(2) 1573 smlal v5.4s, v19.4h, v1.h[6] // p3(top[2]) * filter(3) 1574 sqrshrun v3.4h, v3.4s, #4 1575 sqrshrun2 v3.8h, v4.4s, #4 1576 smin v3.8h, v3.8h, v31.8h 1577 smlal v5.4s, v20.4h, v1.h[7] // p4(top[3]) * filter(4) 1578 smlal v5.4s, v16.4h, v1.h[3] // p0(topleft) * filter(0) 1579 smlal v5.4s, v21.4h, v3.h[3] // p5(left[0]) * filter(5) 1580 smlal v5.4s, v22.4h, v3.h[7] // p6(left[1]) * filter(6) 1581 smull2 v6.4s, v17.8h, v1.h[4] // p1(top[0]) * filter(1) 1582 smlal2 v6.4s, v18.8h, v1.h[5] // p2(top[1]) * filter(2) 1583 smlal2 v6.4s, v19.8h, v1.h[6] // p3(top[2]) * filter(3) 1584 smlal2 v6.4s, v20.8h, v1.h[7] // p4(top[3]) * filter(4) 1585 smlal2 v6.4s, v16.8h, v1.h[3] // p0(topleft) * filter(0) 1586 smlal2 v6.4s, v21.8h, v3.h[3] // p5(left[0]) * filter(5) 1587 smlal2 v6.4s, v22.8h, v3.h[7] // p6(left[1]) * filter(6) 1588 1589 smull v24.4s, v17.4h, v2.h[0] // p1(top[0]) * filter(1) 1590 smlal v24.4s, v18.4h, v2.h[1] // p2(top[1]) * filter(2) 1591 smlal v24.4s, v19.4h, v2.h[2] // p3(top[2]) * filter(3) 1592 sqrshrun v4.4h, v5.4s, #4 1593 sqrshrun2 v4.8h, v6.4s, #4 1594 smin v4.8h, v4.8h, v31.8h 1595 smlal v24.4s, v20.4h, v2.h[3] // p4(top[3]) * filter(4) 1596 smlal v24.4s, v16.4h, v1.h[7] // p0(topleft) * filter(0) 1597 smlal v24.4s, v21.4h, v4.h[3] // p5(left[0]) * filter(5) 1598 smlal v24.4s, v22.4h, v4.h[7] // p6(left[1]) * filter(6) 1599 smull2 v25.4s, v17.8h, v2.h[0] // p1(top[0]) * filter(1) 1600 smlal2 v25.4s, v18.8h, v2.h[1] // p2(top[1]) * filter(2) 1601 smlal2 v25.4s, v19.8h, v2.h[2] // p3(top[2]) * filter(3) 1602 smlal2 v25.4s, v20.8h, v2.h[3] // p4(top[3]) * filter(4) 1603 smlal2 v25.4s, v16.8h, v1.h[7] // p0(topleft) * filter(0) 1604 smlal2 v25.4s, v21.8h, v4.h[3] // p5(left[0]) * filter(5) 1605 smlal2 v25.4s, v22.8h, v4.h[7] // p6(left[1]) * filter(6) 1606 1607 smull v26.4s, v17.4h, v2.h[4] // p1(top[0]) * filter(1) 1608 smlal v26.4s, v18.4h, v2.h[5] // p2(top[1]) * filter(2) 1609 smlal v26.4s, v19.4h, v2.h[6] // 
p3(top[2]) * filter(3) 1610 sqrshrun v5.4h, v24.4s, #4 1611 sqrshrun2 v5.8h, v25.4s, #4 1612 smin v5.8h, v5.8h, v31.8h 1613 smlal v26.4s, v20.4h, v2.h[7] // p4(top[3]) * filter(4) 1614 smlal v26.4s, v16.4h, v2.h[3] // p0(topleft) * filter(0) 1615 smlal v26.4s, v21.4h, v5.h[3] // p5(left[0]) * filter(5) 1616 smlal v26.4s, v22.4h, v5.h[7] // p6(left[1]) * filter(6) 1617 smull2 v27.4s, v17.8h, v2.h[4] // p1(top[0]) * filter(1) 1618 smlal2 v27.4s, v18.8h, v2.h[5] // p2(top[1]) * filter(2) 1619 smlal2 v27.4s, v19.8h, v2.h[6] // p3(top[2]) * filter(3) 1620 smlal2 v27.4s, v20.8h, v2.h[7] // p4(top[3]) * filter(4) 1621 smlal2 v27.4s, v16.8h, v2.h[3] // p0(topleft) * filter(0) 1622 smlal2 v27.4s, v21.8h, v5.h[3] // p5(left[0]) * filter(5) 1623 smlal2 v27.4s, v22.8h, v5.h[7] // p6(left[1]) * filter(6) 1624 1625 subs w3, w3, #16 1626 sqrshrun v6.4h, v26.4s, #4 1627 sqrshrun2 v6.8h, v27.4s, #4 1628.endif 1629 smin v6.8h, v6.8h, v31.8h 1630 1631 ins v0.h[2], v2.h[7] 1632 st4 {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32 1633 ins v0.h[0], v6.h[7] 1634 st4 {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32 1635 ins v0.h[1], v6.h[3] 1636 b.gt 2b 1637 subs w4, w4, #2 1638 b.le 9f 1639 sub x8, x6, w9, uxtw #1 1640 add x0, x0, x1 1641 add x6, x6, x1 1642 mov w3, w9 1643 b 1b 16449: 1645 ret 1646 1647L(ipred_filter\bpc\()_tbl): 1648 .hword L(ipred_filter\bpc\()_tbl) - 320b 1649 .hword L(ipred_filter\bpc\()_tbl) - 160b 1650 .hword L(ipred_filter\bpc\()_tbl) - 80b 1651 .hword L(ipred_filter\bpc\()_tbl) - 40b 1652endfunc 1653.endm 1654 1655filter_fn 10 1656filter_fn 12 1657 1658function ipred_filter_16bpc_neon, export=1 1659 ldr w8, [sp] 1660 cmp w8, 0x3ff 1661 b.le ipred_filter_10bpc_neon 1662 b ipred_filter_12bpc_neon 1663endfunc 1664 1665// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride, 1666// const uint16_t *const pal, const uint8_t *idx, 1667// const int w, const int h); 1668function pal_pred_16bpc_neon, export=1 1669 ld1 {v30.8h}, [x2] 1670 clz w9, w4 1671 adr x6, L(pal_pred_tbl) 1672 
sub w9, w9, #25 1673 ldrh w9, [x6, w9, uxtw #1] 1674 movi v31.8h, #1, lsl #8 1675 sub x6, x6, w9, uxtw 1676 br x6 167740: 1678 add x2, x0, x1 1679 lsl x1, x1, #1 16804: 1681 ld1 {v1.16b}, [x3], #16 1682 subs w5, w5, #4 1683 // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ... 1684 add v1.16b, v1.16b, v1.16b 1685 zip1 v0.16b, v1.16b, v1.16b 1686 zip2 v1.16b, v1.16b, v1.16b 1687 add v0.8h, v0.8h, v31.8h 1688 add v1.8h, v1.8h, v31.8h 1689 tbl v0.16b, {v30.16b}, v0.16b 1690 st1 {v0.d}[0], [x0], x1 1691 tbl v1.16b, {v30.16b}, v1.16b 1692 st1 {v0.d}[1], [x2], x1 1693 st1 {v1.d}[0], [x0], x1 1694 st1 {v1.d}[1], [x2], x1 1695 b.gt 4b 1696 ret 169780: 1698 add x2, x0, x1 1699 lsl x1, x1, #1 17008: 1701 ld1 {v2.16b, v3.16b}, [x3], #32 1702 subs w5, w5, #4 1703 add v2.16b, v2.16b, v2.16b 1704 add v3.16b, v3.16b, v3.16b 1705 zip1 v0.16b, v2.16b, v2.16b 1706 zip2 v1.16b, v2.16b, v2.16b 1707 zip1 v2.16b, v3.16b, v3.16b 1708 zip2 v3.16b, v3.16b, v3.16b 1709 add v0.8h, v0.8h, v31.8h 1710 add v1.8h, v1.8h, v31.8h 1711 add v2.8h, v2.8h, v31.8h 1712 add v3.8h, v3.8h, v31.8h 1713 tbl v0.16b, {v30.16b}, v0.16b 1714 tbl v1.16b, {v30.16b}, v1.16b 1715 st1 {v0.8h}, [x0], x1 1716 tbl v2.16b, {v30.16b}, v2.16b 1717 st1 {v1.8h}, [x2], x1 1718 tbl v3.16b, {v30.16b}, v3.16b 1719 st1 {v2.8h}, [x0], x1 1720 st1 {v3.8h}, [x2], x1 1721 b.gt 8b 1722 ret 1723160: 1724 add x2, x0, x1 1725 lsl x1, x1, #1 172616: 1727 ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 1728 subs w5, w5, #4 1729 add v4.16b, v4.16b, v4.16b 1730 add v5.16b, v5.16b, v5.16b 1731 add v6.16b, v6.16b, v6.16b 1732 add v7.16b, v7.16b, v7.16b 1733 zip1 v0.16b, v4.16b, v4.16b 1734 zip2 v1.16b, v4.16b, v4.16b 1735 zip1 v2.16b, v5.16b, v5.16b 1736 zip2 v3.16b, v5.16b, v5.16b 1737 zip1 v4.16b, v6.16b, v6.16b 1738 zip2 v5.16b, v6.16b, v6.16b 1739 zip1 v6.16b, v7.16b, v7.16b 1740 zip2 v7.16b, v7.16b, v7.16b 1741 add v0.8h, v0.8h, v31.8h 1742 add v1.8h, v1.8h, v31.8h 1743 add v2.8h, v2.8h, v31.8h 1744 add v3.8h, 
v3.8h, v31.8h 1745 add v4.8h, v4.8h, v31.8h 1746 tbl v0.16b, {v30.16b}, v0.16b 1747 add v5.8h, v5.8h, v31.8h 1748 tbl v1.16b, {v30.16b}, v1.16b 1749 add v6.8h, v6.8h, v31.8h 1750 tbl v2.16b, {v30.16b}, v2.16b 1751 add v7.8h, v7.8h, v31.8h 1752 tbl v3.16b, {v30.16b}, v3.16b 1753 tbl v4.16b, {v30.16b}, v4.16b 1754 tbl v5.16b, {v30.16b}, v5.16b 1755 st1 {v0.8h, v1.8h}, [x0], x1 1756 tbl v6.16b, {v30.16b}, v6.16b 1757 st1 {v2.8h, v3.8h}, [x2], x1 1758 tbl v7.16b, {v30.16b}, v7.16b 1759 st1 {v4.8h, v5.8h}, [x0], x1 1760 st1 {v6.8h, v7.8h}, [x2], x1 1761 b.gt 16b 1762 ret 1763320: 1764 add x2, x0, x1 1765 lsl x1, x1, #1 176632: 1767 ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 1768 subs w5, w5, #2 1769 add v4.16b, v4.16b, v4.16b 1770 add v5.16b, v5.16b, v5.16b 1771 add v6.16b, v6.16b, v6.16b 1772 add v7.16b, v7.16b, v7.16b 1773 zip1 v0.16b, v4.16b, v4.16b 1774 zip2 v1.16b, v4.16b, v4.16b 1775 zip1 v2.16b, v5.16b, v5.16b 1776 zip2 v3.16b, v5.16b, v5.16b 1777 zip1 v4.16b, v6.16b, v6.16b 1778 zip2 v5.16b, v6.16b, v6.16b 1779 zip1 v6.16b, v7.16b, v7.16b 1780 zip2 v7.16b, v7.16b, v7.16b 1781 add v0.8h, v0.8h, v31.8h 1782 add v1.8h, v1.8h, v31.8h 1783 add v2.8h, v2.8h, v31.8h 1784 add v3.8h, v3.8h, v31.8h 1785 add v4.8h, v4.8h, v31.8h 1786 tbl v0.16b, {v30.16b}, v0.16b 1787 add v5.8h, v5.8h, v31.8h 1788 tbl v1.16b, {v30.16b}, v1.16b 1789 add v6.8h, v6.8h, v31.8h 1790 tbl v2.16b, {v30.16b}, v2.16b 1791 add v7.8h, v7.8h, v31.8h 1792 tbl v3.16b, {v30.16b}, v3.16b 1793 tbl v4.16b, {v30.16b}, v4.16b 1794 tbl v5.16b, {v30.16b}, v5.16b 1795 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 1796 tbl v6.16b, {v30.16b}, v6.16b 1797 tbl v7.16b, {v30.16b}, v7.16b 1798 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1 1799 b.gt 32b 1800 ret 1801640: 1802 add x2, x0, #64 180364: 1804 ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 1805 subs w5, w5, #1 1806 add v4.16b, v4.16b, v4.16b 1807 add v5.16b, v5.16b, v5.16b 1808 add v6.16b, v6.16b, v6.16b 1809 add v7.16b, v7.16b, v7.16b 1810 zip1 v0.16b, 
v4.16b, v4.16b 1811 zip2 v1.16b, v4.16b, v4.16b 1812 zip1 v2.16b, v5.16b, v5.16b 1813 zip2 v3.16b, v5.16b, v5.16b 1814 zip1 v4.16b, v6.16b, v6.16b 1815 zip2 v5.16b, v6.16b, v6.16b 1816 zip1 v6.16b, v7.16b, v7.16b 1817 zip2 v7.16b, v7.16b, v7.16b 1818 add v0.8h, v0.8h, v31.8h 1819 add v1.8h, v1.8h, v31.8h 1820 add v2.8h, v2.8h, v31.8h 1821 add v3.8h, v3.8h, v31.8h 1822 add v4.8h, v4.8h, v31.8h 1823 tbl v0.16b, {v30.16b}, v0.16b 1824 add v5.8h, v5.8h, v31.8h 1825 tbl v1.16b, {v30.16b}, v1.16b 1826 add v6.8h, v6.8h, v31.8h 1827 tbl v2.16b, {v30.16b}, v2.16b 1828 add v7.8h, v7.8h, v31.8h 1829 tbl v3.16b, {v30.16b}, v3.16b 1830 tbl v4.16b, {v30.16b}, v4.16b 1831 tbl v5.16b, {v30.16b}, v5.16b 1832 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 1833 tbl v6.16b, {v30.16b}, v6.16b 1834 tbl v7.16b, {v30.16b}, v7.16b 1835 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1 1836 b.gt 64b 1837 ret 1838 1839L(pal_pred_tbl): 1840 .hword L(pal_pred_tbl) - 640b 1841 .hword L(pal_pred_tbl) - 320b 1842 .hword L(pal_pred_tbl) - 160b 1843 .hword L(pal_pred_tbl) - 80b 1844 .hword L(pal_pred_tbl) - 40b 1845endfunc 1846 1847// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, 1848// const pixel *const topleft, 1849// const int width, const int height, 1850// const int16_t *ac, const int alpha, 1851// const int bitdepth_max); 1852function ipred_cfl_128_16bpc_neon, export=1 1853 dup v31.8h, w7 // bitdepth_max 1854 clz w9, w3 1855 adr x7, L(ipred_cfl_128_tbl) 1856 sub w9, w9, #26 1857 ldrh w9, [x7, w9, uxtw #1] 1858 urshr v0.8h, v31.8h, #1 1859 dup v1.8h, w6 // alpha 1860 sub x7, x7, w9, uxtw 1861 add x6, x0, x1 1862 lsl x1, x1, #1 1863 movi v30.8h, #0 1864 br x7 1865L(ipred_cfl_splat_w4): 1866 ld1 {v4.8h, v5.8h}, [x5], #32 1867 subs w4, w4, #4 1868 smull v2.4s, v4.4h, v1.4h // diff = ac * alpha 1869 smull2 v3.4s, v4.8h, v1.8h 1870 smull v4.4s, v5.4h, v1.4h 1871 smull2 v5.4s, v5.8h, v1.8h 1872 sshr v16.4s, v2.4s, #31 // sign = diff >> 31 1873 sshr v17.4s, v3.4s, #31 1874 sshr v18.4s, 
v4.4s, #31 1875 sshr v19.4s, v5.4s, #31 1876 add v2.4s, v2.4s, v16.4s // diff + sign 1877 add v3.4s, v3.4s, v17.4s 1878 add v4.4s, v4.4s, v18.4s 1879 add v5.4s, v5.4s, v19.4s 1880 rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign() 1881 rshrn2 v2.8h, v3.4s, #6 1882 rshrn v3.4h, v4.4s, #6 1883 rshrn2 v3.8h, v5.4s, #6 1884 add v2.8h, v2.8h, v0.8h // dc + apply_sign() 1885 add v3.8h, v3.8h, v0.8h 1886 smax v2.8h, v2.8h, v30.8h 1887 smax v3.8h, v3.8h, v30.8h 1888 smin v2.8h, v2.8h, v31.8h 1889 smin v3.8h, v3.8h, v31.8h 1890 st1 {v2.d}[0], [x0], x1 1891 st1 {v2.d}[1], [x6], x1 1892 st1 {v3.d}[0], [x0], x1 1893 st1 {v3.d}[1], [x6], x1 1894 b.gt L(ipred_cfl_splat_w4) 1895 ret 1896L(ipred_cfl_splat_w8): 1897 ld1 {v4.8h, v5.8h}, [x5], #32 1898 subs w4, w4, #2 1899 smull v2.4s, v4.4h, v1.4h // diff = ac * alpha 1900 smull2 v3.4s, v4.8h, v1.8h 1901 smull v4.4s, v5.4h, v1.4h 1902 smull2 v5.4s, v5.8h, v1.8h 1903 sshr v16.4s, v2.4s, #31 // sign = diff >> 31 1904 sshr v17.4s, v3.4s, #31 1905 sshr v18.4s, v4.4s, #31 1906 sshr v19.4s, v5.4s, #31 1907 add v2.4s, v2.4s, v16.4s // diff + sign 1908 add v3.4s, v3.4s, v17.4s 1909 add v4.4s, v4.4s, v18.4s 1910 add v5.4s, v5.4s, v19.4s 1911 rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign() 1912 rshrn2 v2.8h, v3.4s, #6 1913 rshrn v3.4h, v4.4s, #6 1914 rshrn2 v3.8h, v5.4s, #6 1915 add v2.8h, v2.8h, v0.8h // dc + apply_sign() 1916 add v3.8h, v3.8h, v0.8h 1917 smax v2.8h, v2.8h, v30.8h 1918 smax v3.8h, v3.8h, v30.8h 1919 smin v2.8h, v2.8h, v31.8h 1920 smin v3.8h, v3.8h, v31.8h 1921 st1 {v2.8h}, [x0], x1 1922 st1 {v3.8h}, [x6], x1 1923 b.gt L(ipred_cfl_splat_w8) 1924 ret 1925L(ipred_cfl_splat_w16): 1926 add x7, x5, w3, uxtw #1 1927 sub x1, x1, w3, uxtw #1 1928 mov w9, w3 19291: 1930 ld1 {v2.8h, v3.8h}, [x5], #32 1931 ld1 {v4.8h, v5.8h}, [x7], #32 1932 subs w3, w3, #16 1933 smull v16.4s, v2.4h, v1.4h // diff = ac * alpha 1934 smull2 v17.4s, v2.8h, v1.8h 1935 smull v18.4s, v3.4h, v1.4h 1936 smull2 v19.4s, v3.8h, v1.8h 
1937 smull v2.4s, v4.4h, v1.4h 1938 smull2 v3.4s, v4.8h, v1.8h 1939 smull v4.4s, v5.4h, v1.4h 1940 smull2 v5.4s, v5.8h, v1.8h 1941 sshr v20.4s, v16.4s, #31 // sign = diff >> 31 1942 sshr v21.4s, v17.4s, #31 1943 sshr v22.4s, v18.4s, #31 1944 sshr v23.4s, v19.4s, #31 1945 sshr v24.4s, v2.4s, #31 1946 sshr v25.4s, v3.4s, #31 1947 sshr v26.4s, v4.4s, #31 1948 sshr v27.4s, v5.4s, #31 1949 add v16.4s, v16.4s, v20.4s // diff + sign 1950 add v17.4s, v17.4s, v21.4s 1951 add v18.4s, v18.4s, v22.4s 1952 add v19.4s, v19.4s, v23.4s 1953 add v2.4s, v2.4s, v24.4s 1954 add v3.4s, v3.4s, v25.4s 1955 add v4.4s, v4.4s, v26.4s 1956 add v5.4s, v5.4s, v27.4s 1957 rshrn v16.4h, v16.4s, #6 // (diff + sign + 32) >> 6 = apply_sign() 1958 rshrn2 v16.8h, v17.4s, #6 1959 rshrn v17.4h, v18.4s, #6 1960 rshrn2 v17.8h, v19.4s, #6 1961 rshrn v6.4h, v2.4s, #6 1962 rshrn2 v6.8h, v3.4s, #6 1963 rshrn v7.4h, v4.4s, #6 1964 rshrn2 v7.8h, v5.4s, #6 1965 add v2.8h, v16.8h, v0.8h // dc + apply_sign() 1966 add v3.8h, v17.8h, v0.8h 1967 add v4.8h, v6.8h, v0.8h 1968 add v5.8h, v7.8h, v0.8h 1969 smax v2.8h, v2.8h, v30.8h 1970 smax v3.8h, v3.8h, v30.8h 1971 smax v4.8h, v4.8h, v30.8h 1972 smax v5.8h, v5.8h, v30.8h 1973 smin v2.8h, v2.8h, v31.8h 1974 smin v3.8h, v3.8h, v31.8h 1975 smin v4.8h, v4.8h, v31.8h 1976 smin v5.8h, v5.8h, v31.8h 1977 st1 {v2.8h, v3.8h}, [x0], #32 1978 st1 {v4.8h, v5.8h}, [x6], #32 1979 b.gt 1b 1980 subs w4, w4, #2 1981 add x5, x5, w9, uxtw #1 1982 add x7, x7, w9, uxtw #1 1983 add x0, x0, x1 1984 add x6, x6, x1 1985 mov w3, w9 1986 b.gt 1b 1987 ret 1988 1989L(ipred_cfl_128_tbl): 1990L(ipred_cfl_splat_tbl): 1991 .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) 1992 .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) 1993 .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8) 1994 .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4) 1995endfunc 1996 1997// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, 1998// const pixel *const topleft, 1999// const int width, const int 
height,
//                               const int16_t *ac, const int alpha,
//                               const int bitdepth_max);
//
// CfL prediction where dc is the rounded average of the `width` pixels that
// follow topleft. After computing dc into every lane of v0 the code branches
// to the shared L(ipred_cfl_splat_w*) loops.
//
// Register use (x0=dst, x1=stride, x2=topleft, w3=width, w4=height, x5=ac,
// w6=alpha, w7=bitdepth_max):
//   v31    bitdepth_max, v30 = 0 (clamp bounds), v1 = alpha
//   x2     topleft + 2, i.e. the first top pixel
//   x6     dst + stride, x1 = 2*stride
//   x7     jump target selected by clz(width) - 26 (32 -> index 0 ... 4 -> 3)
function ipred_cfl_top_16bpc_neon, export=1
        dup             v31.8h, w7 // bitdepth_max
        clz             w9, w3
        adr             x7, L(ipred_cfl_top_tbl)
        sub             w9, w9, #26
        ldrh            w9, [x7, w9, uxtw #1]
        dup             v1.8h, w6 // alpha
        add             x2, x2, #2
        sub             x7, x7, w9, uxtw
        add             x6, x0, x1
        lsl             x1, x1, #1
        movi            v30.8h, #0
        br              x7
4:
        ld1             {v0.4h}, [x2]
        addv            h0, v0.4h
        urshr           v0.4h, v0.4h, #2 // (sum + 2) >> 2
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w4)
8:
        ld1             {v0.8h}, [x2]
        addv            h0, v0.8h
        urshr           v0.4h, v0.4h, #3 // (sum + 4) >> 3
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w8)
16:
        ld1             {v2.8h, v3.8h}, [x2]
        addp            v0.8h, v2.8h, v3.8h
        addv            h0, v0.8h
        urshr           v0.4h, v0.4h, #4 // (sum + 8) >> 4
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w16)
32:
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
        addp            v2.8h, v2.8h, v3.8h
        addp            v4.8h, v4.8h, v5.8h
        addp            v0.8h, v2.8h, v4.8h
        uaddlv          s0, v0.8h // final reduction widened to 32 bits
        rshrn           v0.4h, v0.4s, #5 // (sum + 16) >> 5
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_top_tbl):
        .hword L(ipred_cfl_top_tbl) - 32b
        .hword L(ipred_cfl_top_tbl) - 16b
        .hword L(ipred_cfl_top_tbl) - 8b
        .hword L(ipred_cfl_top_tbl) - 4b
endfunc

// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const topleft,
//                                const int width, const int height,
//                                const int16_t *ac, const int alpha,
//                                const int bitdepth_max);
//
// CfL prediction where dc is the rounded average of the `height` pixels that
// precede topleft. Two table lookups are done up front:
//   x7  handler for the height (L(ipred_cfl_left_h*)), entered first
//   x9  splat loop for the width, looked up in L(ipred_cfl_splat_tbl) and
//       entered by each height handler through "br x9"
function ipred_cfl_left_16bpc_neon, export=1
        dup             v31.8h, w7 // bitdepth_max
        sub             x2, x2, w4, uxtw #1 // x2 = topleft - 2*height
        clz             w9, w3
        clz             w8, w4
        adr             x10, L(ipred_cfl_splat_tbl)
        adr             x7, L(ipred_cfl_left_tbl)
        sub             w9, w9, #26
        sub             w8, w8, #26
        ldrh            w9, [x10, w9, uxtw #1]
        ldrh            w8, [x7, w8, uxtw #1]
        dup             v1.8h, w6 // alpha
        sub             x9, x10, w9, uxtw
        sub             x7, x7, w8, uxtw
        add             x6, x0, x1
        lsl             x1, x1, #1
        movi            v30.8h, #0
        br              x7

L(ipred_cfl_left_h4):
        ld1             {v0.4h}, [x2]
        addv            h0, v0.4h
        urshr           v0.4h, v0.4h, #2
        dup             v0.8h, v0.h[0]
        br              x9

L(ipred_cfl_left_h8):
        ld1             {v0.8h}, [x2]
        addv            h0, v0.8h
        urshr           v0.4h, v0.4h, #3
        dup             v0.8h, v0.h[0]
        br              x9

L(ipred_cfl_left_h16):
        ld1             {v2.8h, v3.8h}, [x2]
        addp            v0.8h, v2.8h, v3.8h
        addv            h0, v0.8h
        urshr           v0.4h, v0.4h, #4
        dup             v0.8h, v0.h[0]
        br              x9

L(ipred_cfl_left_h32):
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
        addp            v2.8h, v2.8h, v3.8h
        addp            v4.8h, v4.8h, v5.8h
        addp            v0.8h, v2.8h, v4.8h
        uaddlv          s0, v0.8h
        rshrn           v0.4h, v0.4s, #5
        dup             v0.8h, v0.h[0]
        br              x9

L(ipred_cfl_left_tbl):
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
endfunc

// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                           const pixel *const topleft,
//                           const int width, const int height,
//                           const int16_t *ac, const int alpha,
//                           const int bitdepth_max);
//
// CfL prediction where dc is the rounded average of both edges:
//   dc = (sum(left) + sum(top) + ((width + height) >> 1)) / (width + height)
// The division is done as a right shift by ctz(width + height); for
// non-square blocks the remaining factor of 3 or 5 is removed by multiplying
// with 0xAAAB (~2^17/3) or 0x6667 (~2^17/5) and shifting right by 17.
//
// L(ipred_cfl_tbl) holds the four height handlers followed by the four width
// handlers, so the width index is biased by 4 (clz(width) - 22):
//   x7  height handler (sums the left column into v0), entered first
//   x9  width handler (adds the top row, finishes dc), entered via "br x9"
function ipred_cfl_16bpc_neon, export=1
        dup             v31.8h, w7 // bitdepth_max
        sub             x2, x2, w4, uxtw #1
        add             w8, w3, w4 // width + height
        dup             v1.8h, w6 // alpha
        clz             w9, w3
        clz             w6, w4
        dup             v16.4s, w8 // width + height
        adr             x7, L(ipred_cfl_tbl)
        rbit            w8, w8 // rbit(width + height)
        sub             w9, w9, #22 // 26 leading bits, minus table offset 4
        sub             w6, w6, #26
        clz             w8, w8 // ctz(width + height)
        ldrh            w9, [x7, w9, uxtw #1]
        ldrh            w6, [x7, w6, uxtw #1]
        neg             w8, w8 // -ctz(width + height)
        sub             x9, x7, w9, uxtw
        sub             x7, x7, w6, uxtw
        ushr            v16.4s, v16.4s, #1 // (width + height) >> 1
        dup             v17.4s, w8 // -ctz(width + height)
        add             x6, x0, x1
        lsl             x1, x1, #1
        movi            v30.8h, #0
        br              x7

L(ipred_cfl_h4):
        ld1             {v0.4h}, [x2], #8
        uaddlv          s0, v0.4h
        add             x2, x2, #2 // skip the topleft pixel; x2 now at the top row
        br              x9
L(ipred_cfl_w4):
        ld1             {v2.4h}, [x2]
        add             v0.2s, v0.2s, v16.2s
        uaddlv          s2, v2.4h
        cmp             w4, #4
        add             v0.2s, v0.2s, v2.2s
        ushl            v0.2s, v0.2s, v17.2s // >> ctz(width + height)
        b.eq            1f
        // h = 8/16
        cmp             w4, #16
        mov             w16, #0x6667 // h == 16: remaining divisor is 5
        mov             w17, #0xAAAB // h == 8: remaining divisor is 3
        csel            w16, w16, w17, eq
        dup             v16.2s, w16
        mul             v0.2s, v0.2s, v16.2s
        ushr            v0.2s, v0.2s, #17
1:
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w4)

L(ipred_cfl_h8):
        ld1             {v0.8h}, [x2], #16
        uaddlv          s0, v0.8h
        add             x2, x2, #2
        br              x9
L(ipred_cfl_w8):
        ld1             {v2.8h}, [x2]
        add             v0.2s, v0.2s, v16.2s
        uaddlv          s2, v2.8h
        cmp             w4, #8
        add             v0.2s, v0.2s, v2.2s
        ushl            v0.2s, v0.2s, v17.2s
        b.eq            1f
        // h = 4/16/32
        cmp             w4, #32
        mov             w16, #0x6667 // h == 32: remaining divisor is 5
        mov             w17, #0xAAAB // h == 4 or 16: remaining divisor is 3
        csel            w16, w16, w17, eq
        dup             v16.2s, w16
        mul             v0.2s, v0.2s, v16.2s
        ushr            v0.2s, v0.2s, #17
1:
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w8)

L(ipred_cfl_h16):
        ld1             {v2.8h, v3.8h}, [x2], #32
        addp            v0.8h, v2.8h, v3.8h
        add             x2, x2, #2
        uaddlv          s0, v0.8h
        br              x9
L(ipred_cfl_w16):
        ld1             {v2.8h, v3.8h}, [x2]
        add             v0.2s, v0.2s, v16.2s
        addp            v2.8h, v2.8h, v3.8h
        uaddlv          s2, v2.8h
        cmp             w4, #16
        add             v0.2s, v0.2s, v2.2s
        ushl            v0.2s, v0.2s, v17.2s
        b.eq            1f
        // h = 4/8/32
        tst             w4, #(32+16+8) // 16 added to make a consecutive bitmask
        mov             w16, #0x6667 // h == 4 (no mask bit set): divisor 5
        mov             w17, #0xAAAB // h == 8 or 32: divisor 3
        csel            w16, w16, w17, eq
        dup             v16.2s, w16
        mul             v0.2s, v0.2s, v16.2s
        ushr            v0.2s, v0.2s, #17
1:
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_h32):
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64
        addp            v2.8h, v2.8h, v3.8h
        addp            v4.8h, v4.8h, v5.8h
        addp            v0.8h, v2.8h, v4.8h
        add             x2, x2, #2
        uaddlv          s0, v0.8h
        br              x9
L(ipred_cfl_w32):
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
        add             v0.4s, v0.4s, v16.4s
        addp            v2.8h, v2.8h, v3.8h
        addp            v4.8h, v4.8h, v5.8h
        addp            v2.8h, v2.8h, v4.8h
        cmp             w4, #32
        uaddlv          s2, v2.8h
        add             v0.2s, v0.2s, v2.2s
        ushl            v0.2s, v0.2s, v17.2s
        b.eq            1f
        // h = 8/16
        cmp             w4, #8
        mov             w16, #0x6667 // h == 8: remaining divisor is 5
        mov             w17, #0xAAAB // h == 16: remaining divisor is 3
        csel            w16, w16, w17, eq
        dup             v16.2s, w16
        mul             v0.2s, v0.2s, v16.2s
        ushr            v0.2s, v0.2s, #17
1:
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_tbl):
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
endfunc

// void ipred_cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx,
//                                  const ptrdiff_t stride, const int w_pad,
//                                  const int h_pad, const int cw, const int ch);
function ipred_cfl_ac_420_16bpc_neon, export=1
        clz             w8, w5
        lsl             w4, w4, #2
        adr             x7, L(ipred_cfl_ac_420_tbl)
        sub             w8, w8, #27
        ldrh            w8, [x7, w8, uxtw #1]
        movi            v24.4s, #0
        movi            v25.4s, #0
        movi            v26.4s, #0
        movi            v27.4s, #0
        sub             x7, x7, w8, uxtw
        sub             w8, w6, w4 // height - h_pad
        rbit            w9, w5 // rbit(width)
        rbit            w10, w6 // rbit(height)
        clz             w9, w9 // ctz(width)
        clz             w10, w10 // ctz(height)
        add             w9, w9, w10 // log2sz
        add             x10, x1, x2
        dup             v31.4s, w9
        lsl             x2, x2, #1
        neg             v31.4s, v31.4s // -log2sz
        br              x7

L(ipred_cfl_ac_420_w4):
1:      // Copy and subsample input
        ld1             {v0.8h}, [x1], x2
        ld1             {v1.8h}, [x10], x2
        ld1             {v2.8h}, [x1], x2
        ld1             {v3.8h}, [x10], x2
        addp            v0.8h, v0.8h, v2.8h
        addp            v1.8h, v1.8h, v3.8h
        add             v0.8h, v0.8h, v1.8h
        shl             v0.8h, v0.8h, #1
        subs            w8, w8, #2
        st1             {v0.8h},
[x0], #16 2301 uaddw v24.4s, v24.4s, v0.4h 2302 uaddw2 v25.4s, v25.4s, v0.8h 2303 b.gt 1b 2304 trn2 v1.2d, v0.2d, v0.2d 2305 trn2 v0.2d, v0.2d, v0.2d 2306L(ipred_cfl_ac_420_w4_hpad): 2307 cbz w4, 3f 23082: // Vertical padding (h_pad > 0) 2309 subs w4, w4, #4 2310 st1 {v0.8h, v1.8h}, [x0], #32 2311 uaddw v24.4s, v24.4s, v0.4h 2312 uaddw2 v25.4s, v25.4s, v0.8h 2313 uaddw v26.4s, v26.4s, v1.4h 2314 uaddw2 v27.4s, v27.4s, v1.8h 2315 b.gt 2b 23163: 2317L(ipred_cfl_ac_420_w4_calc_subtract_dc): 2318 // Aggregate the sums 2319 add v24.4s, v24.4s, v25.4s 2320 add v26.4s, v26.4s, v27.4s 2321 add v0.4s, v24.4s, v26.4s 2322 addv s0, v0.4s // sum 2323 sub x0, x0, w6, uxtw #3 2324 urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz 2325 dup v4.8h, v4.h[0] 23266: // Subtract dc from ac 2327 ld1 {v0.8h, v1.8h}, [x0] 2328 subs w6, w6, #4 2329 sub v0.8h, v0.8h, v4.8h 2330 sub v1.8h, v1.8h, v4.8h 2331 st1 {v0.8h, v1.8h}, [x0], #32 2332 b.gt 6b 2333 ret 2334 2335L(ipred_cfl_ac_420_w8): 2336 cbnz w3, L(ipred_cfl_ac_420_w8_wpad) 23371: // Copy and subsample input, without padding 2338 ld1 {v0.8h, v1.8h}, [x1], x2 2339 ld1 {v2.8h, v3.8h}, [x10], x2 2340 ld1 {v4.8h, v5.8h}, [x1], x2 2341 addp v0.8h, v0.8h, v1.8h 2342 ld1 {v6.8h, v7.8h}, [x10], x2 2343 addp v2.8h, v2.8h, v3.8h 2344 addp v4.8h, v4.8h, v5.8h 2345 addp v6.8h, v6.8h, v7.8h 2346 add v0.8h, v0.8h, v2.8h 2347 add v4.8h, v4.8h, v6.8h 2348 shl v0.8h, v0.8h, #1 2349 shl v1.8h, v4.8h, #1 2350 subs w8, w8, #2 2351 st1 {v0.8h, v1.8h}, [x0], #32 2352 uaddw v24.4s, v24.4s, v0.4h 2353 uaddw2 v25.4s, v25.4s, v0.8h 2354 uaddw v26.4s, v26.4s, v1.4h 2355 uaddw2 v27.4s, v27.4s, v1.8h 2356 b.gt 1b 2357 mov v0.16b, v1.16b 2358 b L(ipred_cfl_ac_420_w8_hpad) 2359 2360L(ipred_cfl_ac_420_w8_wpad): 23611: // Copy and subsample input, padding 4 2362 ld1 {v0.8h}, [x1], x2 2363 ld1 {v1.8h}, [x10], x2 2364 ld1 {v2.8h}, [x1], x2 2365 ld1 {v3.8h}, [x10], x2 2366 addp v0.8h, v0.8h, v2.8h 2367 addp v1.8h, v1.8h, v3.8h 2368 add v0.8h, v0.8h, 
v1.8h 2369 shl v0.8h, v0.8h, #1 2370 dup v1.4h, v0.h[3] 2371 dup v3.4h, v0.h[7] 2372 trn2 v2.2d, v0.2d, v0.2d 2373 subs w8, w8, #2 2374 st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 2375 uaddw v24.4s, v24.4s, v0.4h 2376 uaddw v25.4s, v25.4s, v1.4h 2377 uaddw v26.4s, v26.4s, v2.4h 2378 uaddw v27.4s, v27.4s, v3.4h 2379 b.gt 1b 2380 trn1 v0.2d, v2.2d, v3.2d 2381 trn1 v1.2d, v2.2d, v3.2d 2382 2383L(ipred_cfl_ac_420_w8_hpad): 2384 cbz w4, 3f 23852: // Vertical padding (h_pad > 0) 2386 subs w4, w4, #4 2387 st1 {v0.8h, v1.8h}, [x0], #32 2388 uaddw v24.4s, v24.4s, v0.4h 2389 uaddw2 v25.4s, v25.4s, v0.8h 2390 uaddw v26.4s, v26.4s, v1.4h 2391 uaddw2 v27.4s, v27.4s, v1.8h 2392 st1 {v0.8h, v1.8h}, [x0], #32 2393 uaddw v24.4s, v24.4s, v0.4h 2394 uaddw2 v25.4s, v25.4s, v0.8h 2395 uaddw v26.4s, v26.4s, v1.4h 2396 uaddw2 v27.4s, v27.4s, v1.8h 2397 b.gt 2b 23983: 2399 2400 // Double the height and reuse the w4 summing/subtracting 2401 lsl w6, w6, #1 2402 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) 2403 2404L(ipred_cfl_ac_420_w16): 2405 adr x7, L(ipred_cfl_ac_420_w16_tbl) 2406 ldrh w3, [x7, w3, uxtw #1] 2407 sub x7, x7, w3, uxtw 2408 br x7 2409 2410L(ipred_cfl_ac_420_w16_wpad0): 24111: // Copy and subsample input, without padding 2412 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 2413 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 2414 addp v0.8h, v0.8h, v1.8h 2415 addp v2.8h, v2.8h, v3.8h 2416 addp v4.8h, v4.8h, v5.8h 2417 addp v6.8h, v6.8h, v7.8h 2418 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], x2 2419 add v0.8h, v0.8h, v4.8h 2420 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2 2421 add v2.8h, v2.8h, v6.8h 2422 addp v16.8h, v16.8h, v17.8h 2423 addp v18.8h, v18.8h, v19.8h 2424 addp v20.8h, v20.8h, v21.8h 2425 addp v22.8h, v22.8h, v23.8h 2426 add v16.8h, v16.8h, v20.8h 2427 add v18.8h, v18.8h, v22.8h 2428 shl v0.8h, v0.8h, #1 2429 shl v1.8h, v2.8h, #1 2430 shl v2.8h, v16.8h, #1 2431 shl v3.8h, v18.8h, #1 2432 subs w8, w8, #2 2433 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2434 uaddw v24.4s, 
v24.4s, v0.4h 2435 uaddw2 v25.4s, v25.4s, v0.8h 2436 uaddw v26.4s, v26.4s, v1.4h 2437 uaddw2 v27.4s, v27.4s, v1.8h 2438 uaddw v24.4s, v24.4s, v2.4h 2439 uaddw2 v25.4s, v25.4s, v2.8h 2440 uaddw v26.4s, v26.4s, v3.4h 2441 uaddw2 v27.4s, v27.4s, v3.8h 2442 b.gt 1b 2443 mov v0.16b, v2.16b 2444 mov v1.16b, v3.16b 2445 b L(ipred_cfl_ac_420_w16_hpad) 2446 2447L(ipred_cfl_ac_420_w16_wpad1): 24481: // Copy and subsample input, padding 4 2449 ldr q2, [x1, #32] 2450 ld1 {v0.8h, v1.8h}, [x1], x2 2451 ldr q5, [x10, #32] 2452 ld1 {v3.8h, v4.8h}, [x10], x2 2453 addp v2.8h, v2.8h, v2.8h 2454 addp v0.8h, v0.8h, v1.8h 2455 addp v5.8h, v5.8h, v5.8h 2456 addp v3.8h, v3.8h, v4.8h 2457 ldr q18, [x1, #32] 2458 add v2.4h, v2.4h, v5.4h 2459 ld1 {v16.8h, v17.8h}, [x1], x2 2460 add v0.8h, v0.8h, v3.8h 2461 ldr q21, [x10, #32] 2462 ld1 {v19.8h, v20.8h}, [x10], x2 2463 addp v18.8h, v18.8h, v18.8h 2464 addp v16.8h, v16.8h, v17.8h 2465 addp v21.8h, v21.8h, v21.8h 2466 addp v19.8h, v19.8h, v20.8h 2467 add v18.4h, v18.4h, v21.4h 2468 add v16.8h, v16.8h, v19.8h 2469 shl v1.4h, v2.4h, #1 2470 shl v0.8h, v0.8h, #1 2471 shl v3.4h, v18.4h, #1 2472 shl v2.8h, v16.8h, #1 2473 dup v4.4h, v1.h[3] 2474 dup v5.4h, v3.h[3] 2475 trn1 v1.2d, v1.2d, v4.2d 2476 trn1 v3.2d, v3.2d, v5.2d 2477 subs w8, w8, #2 2478 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2479 uaddw v24.4s, v24.4s, v0.4h 2480 uaddw2 v25.4s, v25.4s, v0.8h 2481 uaddw v26.4s, v26.4s, v1.4h 2482 uaddw2 v27.4s, v27.4s, v1.8h 2483 uaddw v24.4s, v24.4s, v2.4h 2484 uaddw2 v25.4s, v25.4s, v2.8h 2485 uaddw v26.4s, v26.4s, v3.4h 2486 uaddw2 v27.4s, v27.4s, v3.8h 2487 b.gt 1b 2488 mov v0.16b, v2.16b 2489 mov v1.16b, v3.16b 2490 b L(ipred_cfl_ac_420_w16_hpad) 2491 2492L(ipred_cfl_ac_420_w16_wpad2): 24931: // Copy and subsample input, padding 8 2494 ld1 {v0.8h, v1.8h}, [x1], x2 2495 ld1 {v2.8h, v3.8h}, [x10], x2 2496 ld1 {v4.8h, v5.8h}, [x1], x2 2497 addp v0.8h, v0.8h, v1.8h 2498 ld1 {v6.8h, v7.8h}, [x10], x2 2499 addp v2.8h, v2.8h, v3.8h 2500 addp v4.8h, 
v4.8h, v5.8h 2501 addp v6.8h, v6.8h, v7.8h 2502 add v0.8h, v0.8h, v2.8h 2503 add v4.8h, v4.8h, v6.8h 2504 shl v0.8h, v0.8h, #1 2505 shl v2.8h, v4.8h, #1 2506 dup v1.8h, v0.h[7] 2507 dup v3.8h, v2.h[7] 2508 subs w8, w8, #2 2509 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2510 uaddw v24.4s, v24.4s, v0.4h 2511 uaddw2 v25.4s, v25.4s, v0.8h 2512 uaddw v26.4s, v26.4s, v1.4h 2513 uaddw2 v27.4s, v27.4s, v1.8h 2514 uaddw v24.4s, v24.4s, v2.4h 2515 uaddw2 v25.4s, v25.4s, v2.8h 2516 uaddw v26.4s, v26.4s, v3.4h 2517 uaddw2 v27.4s, v27.4s, v3.8h 2518 b.gt 1b 2519 mov v0.16b, v2.16b 2520 mov v1.16b, v3.16b 2521 b L(ipred_cfl_ac_420_w16_hpad) 2522 2523L(ipred_cfl_ac_420_w16_wpad3): 25241: // Copy and subsample input, padding 12 2525 ld1 {v0.8h}, [x1], x2 2526 ld1 {v2.8h}, [x10], x2 2527 ld1 {v4.8h}, [x1], x2 2528 ld1 {v6.8h}, [x10], x2 2529 addp v0.8h, v0.8h, v4.8h 2530 addp v2.8h, v2.8h, v6.8h 2531 add v0.8h, v0.8h, v2.8h 2532 shl v0.8h, v0.8h, #1 2533 dup v1.8h, v0.h[3] 2534 dup v3.8h, v0.h[7] 2535 trn2 v2.2d, v0.2d, v3.2d 2536 trn1 v0.2d, v0.2d, v1.2d 2537 subs w8, w8, #2 2538 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2539 uaddw v24.4s, v24.4s, v0.4h 2540 uaddw2 v25.4s, v25.4s, v0.8h 2541 uaddw v26.4s, v26.4s, v1.4h 2542 uaddw2 v27.4s, v27.4s, v1.8h 2543 uaddw v24.4s, v24.4s, v2.4h 2544 uaddw2 v25.4s, v25.4s, v2.8h 2545 uaddw v26.4s, v26.4s, v3.4h 2546 uaddw2 v27.4s, v27.4s, v3.8h 2547 b.gt 1b 2548 mov v0.16b, v2.16b 2549 mov v1.16b, v3.16b 2550 2551L(ipred_cfl_ac_420_w16_hpad): 2552 cbz w4, 3f 25532: // Vertical padding (h_pad > 0) 2554 subs w4, w4, #4 2555 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2556 uaddw v24.4s, v24.4s, v0.4h 2557 uaddw2 v25.4s, v25.4s, v0.8h 2558 uaddw v26.4s, v26.4s, v1.4h 2559 uaddw2 v27.4s, v27.4s, v1.8h 2560 uaddw v24.4s, v24.4s, v2.4h 2561 uaddw2 v25.4s, v25.4s, v2.8h 2562 uaddw v26.4s, v26.4s, v3.4h 2563 uaddw2 v27.4s, v27.4s, v3.8h 2564 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2565 uaddw v24.4s, v24.4s, v0.4h 2566 uaddw2 v25.4s, v25.4s, 
v0.8h 2567 uaddw v26.4s, v26.4s, v1.4h 2568 uaddw2 v27.4s, v27.4s, v1.8h 2569 uaddw v24.4s, v24.4s, v2.4h 2570 uaddw2 v25.4s, v25.4s, v2.8h 2571 uaddw v26.4s, v26.4s, v3.4h 2572 uaddw2 v27.4s, v27.4s, v3.8h 2573 b.gt 2b 25743: 2575 2576 // Quadruple the height and reuse the w4 summing/subtracting 2577 lsl w6, w6, #2 2578 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) 2579 2580L(ipred_cfl_ac_420_tbl): 2581 .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16) 2582 .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8) 2583 .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4) 2584 .hword 0 2585 2586L(ipred_cfl_ac_420_w16_tbl): 2587 .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0) 2588 .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1) 2589 .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2) 2590 .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3) 2591endfunc 2592 2593// void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx, 2594// const ptrdiff_t stride, const int w_pad, 2595// const int h_pad, const int cw, const int ch); 2596function ipred_cfl_ac_422_16bpc_neon, export=1 2597 clz w8, w5 2598 lsl w4, w4, #2 2599 adr x7, L(ipred_cfl_ac_422_tbl) 2600 sub w8, w8, #27 2601 ldrh w8, [x7, w8, uxtw #1] 2602 movi v24.4s, #0 2603 movi v25.4s, #0 2604 movi v26.4s, #0 2605 movi v27.4s, #0 2606 sub x7, x7, w8, uxtw 2607 sub w8, w6, w4 // height - h_pad 2608 rbit w9, w5 // rbit(width) 2609 rbit w10, w6 // rbit(height) 2610 clz w9, w9 // ctz(width) 2611 clz w10, w10 // ctz(height) 2612 add w9, w9, w10 // log2sz 2613 add x10, x1, x2 2614 dup v31.4s, w9 2615 lsl x2, x2, #1 2616 neg v31.4s, v31.4s // -log2sz 2617 br x7 2618 2619L(ipred_cfl_ac_422_w4): 26201: // Copy and subsample input 2621 ld1 {v0.8h}, [x1], x2 2622 ld1 {v1.8h}, [x10], x2 2623 ld1 {v2.8h}, [x1], x2 2624 ld1 {v3.8h}, [x10], x2 2625 addp v0.8h, v0.8h, v1.8h 2626 addp v2.8h, v2.8h, v3.8h 2627 shl v0.8h, v0.8h, #2 2628 shl v1.8h, v2.8h, #2 
2629 subs w8, w8, #4 2630 st1 {v0.8h, v1.8h}, [x0], #32 2631 uaddw v24.4s, v24.4s, v0.4h 2632 uaddw2 v25.4s, v25.4s, v0.8h 2633 uaddw v26.4s, v26.4s, v1.4h 2634 uaddw2 v27.4s, v27.4s, v1.8h 2635 b.gt 1b 2636 trn2 v0.2d, v1.2d, v1.2d 2637 trn2 v1.2d, v1.2d, v1.2d 2638 b L(ipred_cfl_ac_420_w4_hpad) 2639 2640L(ipred_cfl_ac_422_w8): 2641 cbnz w3, L(ipred_cfl_ac_422_w8_wpad) 26421: // Copy and subsample input, without padding 2643 ld1 {v0.8h, v1.8h}, [x1], x2 2644 ld1 {v2.8h, v3.8h}, [x10], x2 2645 ld1 {v4.8h, v5.8h}, [x1], x2 2646 addp v0.8h, v0.8h, v1.8h 2647 ld1 {v6.8h, v7.8h}, [x10], x2 2648 addp v2.8h, v2.8h, v3.8h 2649 addp v4.8h, v4.8h, v5.8h 2650 addp v6.8h, v6.8h, v7.8h 2651 shl v0.8h, v0.8h, #2 2652 shl v1.8h, v2.8h, #2 2653 shl v2.8h, v4.8h, #2 2654 shl v3.8h, v6.8h, #2 2655 subs w8, w8, #4 2656 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2657 uaddw v24.4s, v24.4s, v0.4h 2658 uaddw2 v25.4s, v25.4s, v0.8h 2659 uaddw v26.4s, v26.4s, v1.4h 2660 uaddw2 v27.4s, v27.4s, v1.8h 2661 uaddw v24.4s, v24.4s, v2.4h 2662 uaddw2 v25.4s, v25.4s, v2.8h 2663 uaddw v26.4s, v26.4s, v3.4h 2664 uaddw2 v27.4s, v27.4s, v3.8h 2665 b.gt 1b 2666 mov v0.16b, v3.16b 2667 mov v1.16b, v3.16b 2668 b L(ipred_cfl_ac_420_w8_hpad) 2669 2670L(ipred_cfl_ac_422_w8_wpad): 26711: // Copy and subsample input, padding 4 2672 ld1 {v0.8h}, [x1], x2 2673 ld1 {v1.8h}, [x10], x2 2674 ld1 {v2.8h}, [x1], x2 2675 ld1 {v3.8h}, [x10], x2 2676 addp v0.8h, v0.8h, v1.8h 2677 addp v2.8h, v2.8h, v3.8h 2678 shl v0.8h, v0.8h, #2 2679 shl v2.8h, v2.8h, #2 2680 dup v4.4h, v0.h[3] 2681 dup v5.8h, v0.h[7] 2682 dup v6.4h, v2.h[3] 2683 dup v7.8h, v2.h[7] 2684 trn2 v1.2d, v0.2d, v5.2d 2685 trn1 v0.2d, v0.2d, v4.2d 2686 trn2 v3.2d, v2.2d, v7.2d 2687 trn1 v2.2d, v2.2d, v6.2d 2688 subs w8, w8, #4 2689 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2690 uaddw v24.4s, v24.4s, v0.4h 2691 uaddw2 v25.4s, v25.4s, v0.8h 2692 uaddw v26.4s, v26.4s, v1.4h 2693 uaddw2 v27.4s, v27.4s, v1.8h 2694 uaddw v24.4s, v24.4s, v2.4h 2695 uaddw2 v25.4s, 
v25.4s, v2.8h 2696 uaddw v26.4s, v26.4s, v3.4h 2697 uaddw2 v27.4s, v27.4s, v3.8h 2698 b.gt 1b 2699 mov v0.16b, v3.16b 2700 mov v1.16b, v3.16b 2701 b L(ipred_cfl_ac_420_w8_hpad) 2702 2703L(ipred_cfl_ac_422_w16): 2704 adr x7, L(ipred_cfl_ac_422_w16_tbl) 2705 ldrh w3, [x7, w3, uxtw #1] 2706 sub x7, x7, w3, uxtw 2707 br x7 2708 2709L(ipred_cfl_ac_422_w16_wpad0): 27101: // Copy and subsample input, without padding 2711 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 2712 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 2713 addp v0.8h, v0.8h, v1.8h 2714 addp v2.8h, v2.8h, v3.8h 2715 addp v4.8h, v4.8h, v5.8h 2716 addp v6.8h, v6.8h, v7.8h 2717 shl v0.8h, v0.8h, #2 2718 shl v1.8h, v2.8h, #2 2719 shl v2.8h, v4.8h, #2 2720 shl v3.8h, v6.8h, #2 2721 subs w8, w8, #2 2722 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2723 uaddw v24.4s, v24.4s, v0.4h 2724 uaddw2 v25.4s, v25.4s, v0.8h 2725 uaddw v26.4s, v26.4s, v1.4h 2726 uaddw2 v27.4s, v27.4s, v1.8h 2727 uaddw v24.4s, v24.4s, v2.4h 2728 uaddw2 v25.4s, v25.4s, v2.8h 2729 uaddw v26.4s, v26.4s, v3.4h 2730 uaddw2 v27.4s, v27.4s, v3.8h 2731 b.gt 1b 2732 mov v0.16b, v2.16b 2733 mov v1.16b, v3.16b 2734 b L(ipred_cfl_ac_420_w16_hpad) 2735 2736L(ipred_cfl_ac_422_w16_wpad1): 27371: // Copy and subsample input, padding 4 2738 ldr q2, [x1, #32] 2739 ld1 {v0.8h, v1.8h}, [x1], x2 2740 ldr q6, [x10, #32] 2741 ld1 {v4.8h, v5.8h}, [x10], x2 2742 addp v2.8h, v2.8h, v2.8h 2743 addp v0.8h, v0.8h, v1.8h 2744 addp v6.8h, v6.8h, v6.8h 2745 addp v4.8h, v4.8h, v5.8h 2746 shl v1.4h, v2.4h, #2 2747 shl v0.8h, v0.8h, #2 2748 shl v3.4h, v6.4h, #2 2749 shl v2.8h, v4.8h, #2 2750 dup v4.4h, v1.h[3] 2751 dup v5.4h, v3.h[3] 2752 trn1 v1.2d, v1.2d, v4.2d 2753 trn1 v3.2d, v3.2d, v5.2d 2754 subs w8, w8, #2 2755 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2756 uaddw v24.4s, v24.4s, v0.4h 2757 uaddw2 v25.4s, v25.4s, v0.8h 2758 uaddw v26.4s, v26.4s, v1.4h 2759 uaddw2 v27.4s, v27.4s, v1.8h 2760 uaddw v24.4s, v24.4s, v2.4h 2761 uaddw2 v25.4s, v25.4s, v2.8h 2762 uaddw v26.4s, 
v26.4s, v3.4h 2763 uaddw2 v27.4s, v27.4s, v3.8h 2764 b.gt 1b 2765 mov v0.16b, v2.16b 2766 mov v1.16b, v3.16b 2767 b L(ipred_cfl_ac_420_w16_hpad) 2768 2769L(ipred_cfl_ac_422_w16_wpad2): 27701: // Copy and subsample input, padding 8 2771 ld1 {v0.8h, v1.8h}, [x1], x2 2772 ld1 {v2.8h, v3.8h}, [x10], x2 2773 addp v0.8h, v0.8h, v1.8h 2774 addp v2.8h, v2.8h, v3.8h 2775 shl v0.8h, v0.8h, #2 2776 shl v2.8h, v2.8h, #2 2777 dup v1.8h, v0.h[7] 2778 dup v3.8h, v2.h[7] 2779 subs w8, w8, #2 2780 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2781 uaddw v24.4s, v24.4s, v0.4h 2782 uaddw2 v25.4s, v25.4s, v0.8h 2783 uaddw v26.4s, v26.4s, v1.4h 2784 uaddw2 v27.4s, v27.4s, v1.8h 2785 uaddw v24.4s, v24.4s, v2.4h 2786 uaddw2 v25.4s, v25.4s, v2.8h 2787 uaddw v26.4s, v26.4s, v3.4h 2788 uaddw2 v27.4s, v27.4s, v3.8h 2789 b.gt 1b 2790 mov v0.16b, v2.16b 2791 mov v1.16b, v3.16b 2792 b L(ipred_cfl_ac_420_w16_hpad) 2793 2794L(ipred_cfl_ac_422_w16_wpad3): 27951: // Copy and subsample input, padding 12 2796 ld1 {v0.8h}, [x1], x2 2797 ld1 {v2.8h}, [x10], x2 2798 addp v0.8h, v0.8h, v0.8h 2799 addp v2.8h, v2.8h, v2.8h 2800 shl v0.4h, v0.4h, #2 2801 shl v2.4h, v2.4h, #2 2802 dup v1.8h, v0.h[3] 2803 dup v3.8h, v2.h[3] 2804 trn1 v0.2d, v0.2d, v1.2d 2805 trn1 v2.2d, v2.2d, v3.2d 2806 subs w8, w8, #2 2807 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2808 uaddw v24.4s, v24.4s, v0.4h 2809 uaddw2 v25.4s, v25.4s, v0.8h 2810 uaddw v26.4s, v26.4s, v1.4h 2811 uaddw2 v27.4s, v27.4s, v1.8h 2812 uaddw v24.4s, v24.4s, v2.4h 2813 uaddw2 v25.4s, v25.4s, v2.8h 2814 uaddw v26.4s, v26.4s, v3.4h 2815 uaddw2 v27.4s, v27.4s, v3.8h 2816 b.gt 1b 2817 mov v0.16b, v2.16b 2818 mov v1.16b, v3.16b 2819 b L(ipred_cfl_ac_420_w16_hpad) 2820 2821L(ipred_cfl_ac_422_tbl): 2822 .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16) 2823 .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8) 2824 .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4) 2825 .hword 0 2826 2827L(ipred_cfl_ac_422_w16_tbl): 2828 .hword 
L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0) 2829 .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1) 2830 .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2) 2831 .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3) 2832endfunc 2833 2834// void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx, 2835// const ptrdiff_t stride, const int w_pad, 2836// const int h_pad, const int cw, const int ch); 2837function ipred_cfl_ac_444_16bpc_neon, export=1 2838 clz w8, w5 2839 lsl w4, w4, #2 2840 adr x7, L(ipred_cfl_ac_444_tbl) 2841 sub w8, w8, #26 2842 ldrh w8, [x7, w8, uxtw #1] 2843 movi v24.4s, #0 2844 movi v25.4s, #0 2845 movi v26.4s, #0 2846 movi v27.4s, #0 2847 sub x7, x7, w8, uxtw 2848 sub w8, w6, w4 // height - h_pad 2849 rbit w9, w5 // rbit(width) 2850 rbit w10, w6 // rbit(height) 2851 clz w9, w9 // ctz(width) 2852 clz w10, w10 // ctz(height) 2853 add w9, w9, w10 // log2sz 2854 add x10, x1, x2 2855 dup v31.4s, w9 2856 lsl x2, x2, #1 2857 neg v31.4s, v31.4s // -log2sz 2858 br x7 2859 2860L(ipred_cfl_ac_444_w4): 28611: // Copy and expand input 2862 ld1 {v0.4h}, [x1], x2 2863 ld1 {v0.d}[1], [x10], x2 2864 ld1 {v1.4h}, [x1], x2 2865 ld1 {v1.d}[1], [x10], x2 2866 shl v0.8h, v0.8h, #3 2867 shl v1.8h, v1.8h, #3 2868 subs w8, w8, #4 2869 st1 {v0.8h, v1.8h}, [x0], #32 2870 uaddw v24.4s, v24.4s, v0.4h 2871 uaddw2 v25.4s, v25.4s, v0.8h 2872 uaddw v26.4s, v26.4s, v1.4h 2873 uaddw2 v27.4s, v27.4s, v1.8h 2874 b.gt 1b 2875 trn2 v0.2d, v1.2d, v1.2d 2876 trn2 v1.2d, v1.2d, v1.2d 2877 b L(ipred_cfl_ac_420_w4_hpad) 2878 2879L(ipred_cfl_ac_444_w8): 28801: // Copy and expand input 2881 ld1 {v0.8h}, [x1], x2 2882 ld1 {v1.8h}, [x10], x2 2883 ld1 {v2.8h}, [x1], x2 2884 shl v0.8h, v0.8h, #3 2885 ld1 {v3.8h}, [x10], x2 2886 shl v1.8h, v1.8h, #3 2887 shl v2.8h, v2.8h, #3 2888 shl v3.8h, v3.8h, #3 2889 subs w8, w8, #4 2890 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2891 uaddw v24.4s, v24.4s, v0.4h 2892 uaddw2 v25.4s, 
v25.4s, v0.8h 2893 uaddw v26.4s, v26.4s, v1.4h 2894 uaddw2 v27.4s, v27.4s, v1.8h 2895 uaddw v24.4s, v24.4s, v2.4h 2896 uaddw2 v25.4s, v25.4s, v2.8h 2897 uaddw v26.4s, v26.4s, v3.4h 2898 uaddw2 v27.4s, v27.4s, v3.8h 2899 b.gt 1b 2900 mov v0.16b, v3.16b 2901 mov v1.16b, v3.16b 2902 b L(ipred_cfl_ac_420_w8_hpad) 2903 2904L(ipred_cfl_ac_444_w16): 2905 cbnz w3, L(ipred_cfl_ac_444_w16_wpad) 29061: // Copy and expand input, without padding 2907 ld1 {v0.8h, v1.8h}, [x1], x2 2908 ld1 {v2.8h, v3.8h}, [x10], x2 2909 shl v0.8h, v0.8h, #3 2910 shl v1.8h, v1.8h, #3 2911 shl v2.8h, v2.8h, #3 2912 shl v3.8h, v3.8h, #3 2913 subs w8, w8, #2 2914 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2915 uaddw v24.4s, v24.4s, v0.4h 2916 uaddw2 v25.4s, v25.4s, v0.8h 2917 uaddw v26.4s, v26.4s, v1.4h 2918 uaddw2 v27.4s, v27.4s, v1.8h 2919 uaddw v24.4s, v24.4s, v2.4h 2920 uaddw2 v25.4s, v25.4s, v2.8h 2921 uaddw v26.4s, v26.4s, v3.4h 2922 uaddw2 v27.4s, v27.4s, v3.8h 2923 b.gt 1b 2924 mov v0.16b, v2.16b 2925 mov v1.16b, v3.16b 2926 b L(ipred_cfl_ac_420_w16_hpad) 2927 2928L(ipred_cfl_ac_444_w16_wpad): 29291: // Copy and expand input, padding 8 2930 ld1 {v0.8h}, [x1], x2 2931 ld1 {v2.8h}, [x10], x2 2932 shl v0.8h, v0.8h, #3 2933 shl v2.8h, v2.8h, #3 2934 dup v1.8h, v0.h[7] 2935 dup v3.8h, v2.h[7] 2936 subs w8, w8, #2 2937 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2938 uaddw v24.4s, v24.4s, v0.4h 2939 uaddw2 v25.4s, v25.4s, v0.8h 2940 uaddw v26.4s, v26.4s, v1.4h 2941 uaddw2 v27.4s, v27.4s, v1.8h 2942 uaddw v24.4s, v24.4s, v2.4h 2943 uaddw2 v25.4s, v25.4s, v2.8h 2944 uaddw v26.4s, v26.4s, v3.4h 2945 uaddw2 v27.4s, v27.4s, v3.8h 2946 b.gt 1b 2947 mov v0.16b, v2.16b 2948 mov v1.16b, v3.16b 2949 b L(ipred_cfl_ac_420_w16_hpad) 2950 2951L(ipred_cfl_ac_444_w32): 2952 adr x7, L(ipred_cfl_ac_444_w32_tbl) 2953 ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1 2954 lsr x2, x2, #1 // Restore the stride to one line increments 2955 sub x7, x7, w3, uxtw 2956 br x7 2957 2958L(ipred_cfl_ac_444_w32_wpad0): 29591: // Copy and 
expand input, without padding 2960 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 2961 shl v0.8h, v0.8h, #3 2962 shl v1.8h, v1.8h, #3 2963 shl v2.8h, v2.8h, #3 2964 shl v3.8h, v3.8h, #3 2965 subs w8, w8, #1 2966 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2967 uaddw v24.4s, v24.4s, v0.4h 2968 uaddw2 v25.4s, v25.4s, v0.8h 2969 uaddw v26.4s, v26.4s, v1.4h 2970 uaddw2 v27.4s, v27.4s, v1.8h 2971 uaddw v24.4s, v24.4s, v2.4h 2972 uaddw2 v25.4s, v25.4s, v2.8h 2973 uaddw v26.4s, v26.4s, v3.4h 2974 uaddw2 v27.4s, v27.4s, v3.8h 2975 b.gt 1b 2976 b L(ipred_cfl_ac_444_w32_hpad) 2977 2978L(ipred_cfl_ac_444_w32_wpad2): 29791: // Copy and expand input, padding 8 2980 ld1 {v0.8h, v1.8h, v2.8h}, [x1], x2 2981 shl v2.8h, v2.8h, #3 2982 shl v0.8h, v0.8h, #3 2983 shl v1.8h, v1.8h, #3 2984 dup v3.8h, v2.h[7] 2985 subs w8, w8, #1 2986 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2987 uaddw v24.4s, v24.4s, v0.4h 2988 uaddw2 v25.4s, v25.4s, v0.8h 2989 uaddw v26.4s, v26.4s, v1.4h 2990 uaddw2 v27.4s, v27.4s, v1.8h 2991 uaddw v24.4s, v24.4s, v2.4h 2992 uaddw2 v25.4s, v25.4s, v2.8h 2993 uaddw v26.4s, v26.4s, v3.4h 2994 uaddw2 v27.4s, v27.4s, v3.8h 2995 b.gt 1b 2996 b L(ipred_cfl_ac_444_w32_hpad) 2997 2998L(ipred_cfl_ac_444_w32_wpad4): 29991: // Copy and expand input, padding 16 3000 ld1 {v0.8h, v1.8h}, [x1], x2 3001 shl v1.8h, v1.8h, #3 3002 shl v0.8h, v0.8h, #3 3003 dup v2.8h, v1.h[7] 3004 dup v3.8h, v1.h[7] 3005 subs w8, w8, #1 3006 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 3007 uaddw v24.4s, v24.4s, v0.4h 3008 uaddw2 v25.4s, v25.4s, v0.8h 3009 uaddw v26.4s, v26.4s, v1.4h 3010 uaddw2 v27.4s, v27.4s, v1.8h 3011 uaddw v24.4s, v24.4s, v2.4h 3012 uaddw2 v25.4s, v25.4s, v2.8h 3013 uaddw v26.4s, v26.4s, v3.4h 3014 uaddw2 v27.4s, v27.4s, v3.8h 3015 b.gt 1b 3016 b L(ipred_cfl_ac_444_w32_hpad) 3017 3018L(ipred_cfl_ac_444_w32_wpad6): 30191: // Copy and expand input, padding 24 3020 ld1 {v0.8h}, [x1], x2 3021 shl v0.8h, v0.8h, #3 3022 dup v1.8h, v0.h[7] 3023 dup v2.8h, v0.h[7] 3024 dup v3.8h, v0.h[7] 
3025 subs w8, w8, #1 3026 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 3027 uaddw v24.4s, v24.4s, v0.4h 3028 uaddw2 v25.4s, v25.4s, v0.8h 3029 uaddw v26.4s, v26.4s, v1.4h 3030 uaddw2 v27.4s, v27.4s, v1.8h 3031 uaddw v24.4s, v24.4s, v2.4h 3032 uaddw2 v25.4s, v25.4s, v2.8h 3033 uaddw v26.4s, v26.4s, v3.4h 3034 uaddw2 v27.4s, v27.4s, v3.8h 3035 b.gt 1b 3036 3037L(ipred_cfl_ac_444_w32_hpad): 3038 cbz w4, 3f 30392: // Vertical padding (h_pad > 0) 3040 subs w4, w4, #2 3041 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 3042 uaddw v24.4s, v24.4s, v0.4h 3043 uaddw2 v25.4s, v25.4s, v0.8h 3044 uaddw v26.4s, v26.4s, v1.4h 3045 uaddw2 v27.4s, v27.4s, v1.8h 3046 uaddw v24.4s, v24.4s, v2.4h 3047 uaddw2 v25.4s, v25.4s, v2.8h 3048 uaddw v26.4s, v26.4s, v3.4h 3049 uaddw2 v27.4s, v27.4s, v3.8h 3050 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 3051 uaddw v24.4s, v24.4s, v0.4h 3052 uaddw2 v25.4s, v25.4s, v0.8h 3053 uaddw v26.4s, v26.4s, v1.4h 3054 uaddw2 v27.4s, v27.4s, v1.8h 3055 uaddw v24.4s, v24.4s, v2.4h 3056 uaddw2 v25.4s, v25.4s, v2.8h 3057 uaddw v26.4s, v26.4s, v3.4h 3058 uaddw2 v27.4s, v27.4s, v3.8h 3059 b.gt 2b 30603: 3061 3062 // Multiply the height by eight and reuse the w4 subtracting 3063 lsl w6, w6, #3 3064 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) 3065 3066L(ipred_cfl_ac_444_tbl): 3067 .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32) 3068 .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16) 3069 .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8) 3070 .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4) 3071 3072L(ipred_cfl_ac_444_w32_tbl): 3073 .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0) 3074 .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2) 3075 .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4) 3076 .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6) 3077endfunc 3078