/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// Syntax: GNU as, AArch64 (A64 + Advanced SIMD). ABI: AAPCS64.
// The function/endfunc, L(), X() and movrel macros are not defined in this
// file; presumably they come from the two includes above.
//
// Register roles on entry, following the C prototypes quoted above each
// function:
//   x0 = dst, x1 = stride (bytes), x2 = topleft, w3 = width, w4 = height,
//   and a ninth argument, where present, is read from [sp].
// Pixels are 16 bits wide, so topleft[1] is at x2 + 2 and topleft[-1] is at
// x2 - 2.
//
// Width dispatch: clz(width) - 25 is 0 for width 64, 1 for 32, ... 4 for 4.
// That index selects a 16-bit entry from the table at the end of each
// function; entries hold (table address - target), so the branch target is
// the table address minus the loaded value.
//
// Row output: unless noted otherwise x6 = dst + stride and x1 is doubled, so
// x0 writes even rows and x6 writes odd rows. All loops consume w4 (height)
// in steps of 4 or 2 rows and exit when it is no longer positive.

// Fill the block with the rounded half of bitdepth_max:
// (bitdepth_max + 1) >> 1, replicated to every pixel.
// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height,
//                              const int bitdepth_max);
function ipred_dc_128_16bpc_neon, export=1
        ldr             w8, [sp]                // bitdepth_max (stack argument)
        clz             w3, w3
        adr             x5, L(ipred_dc_128_tbl)
        sub             w3, w3, #25
        ldrh            w3, [x5, w3, uxtw #1]
        dup             v0.8h, w8
        sub             x5, x5, w3, uxtw
        add             x6, x0, x1
        lsl             x1, x1, #1
        urshr           v0.8h, v0.8h, #1        // rounding shift: (x + 1) >> 1
        br              x5
4:
        st1             {v0.4h}, [x0], x1
        st1             {v0.4h}, [x6], x1
        subs            w4, w4, #4
        st1             {v0.4h}, [x0], x1
        st1             {v0.4h}, [x6], x1
        b.gt            4b
        ret
8:
        st1             {v0.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        subs            w4, w4, #4
        st1             {v0.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        b.gt            8b
        ret
160:
        mov             v1.16b, v0.16b
16:
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        subs            w4, w4, #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            16b
        ret
320:
        mov             v1.16b, v0.16b
        mov             v2.16b, v0.16b
        mov             v3.16b, v0.16b
32:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4, w4, #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            32b
        ret
640:
        mov             v1.16b, v0.16b
        mov             v2.16b, v0.16b
        mov             v3.16b, v0.16b
        sub             x1, x1, #64             // first half-row store already advances by 64
64:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4, w4, #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            64b
        ret

L(ipred_dc_128_tbl):
        .hword L(ipred_dc_128_tbl) - 640b
        .hword L(ipred_dc_128_tbl) - 320b
        .hword L(ipred_dc_128_tbl) - 160b
        .hword L(ipred_dc_128_tbl) - 8b
        .hword L(ipred_dc_128_tbl) - 4b
endfunc

// Vertical prediction: load `width` pixels starting at topleft[1] once and
// store that same row to every output row.
// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height);
function ipred_v_16bpc_neon, export=1
        clz             w3, w3
        adr             x5, L(ipred_v_tbl)
        sub             w3, w3, #25
        ldrh            w3, [x5, w3, uxtw #1]
        add             x2, x2, #2              // x2 = &topleft[1]
        sub             x5, x5, w3, uxtw
        add             x6, x0, x1
        lsl             x1, x1, #1
        br              x5
40:
        ld1             {v0.4h}, [x2]
4:
        st1             {v0.4h}, [x0], x1
        st1             {v0.4h}, [x6], x1
        subs            w4, w4, #4
        st1             {v0.4h}, [x0], x1
        st1             {v0.4h}, [x6], x1
        b.gt            4b
        ret
80:
        ld1             {v0.8h}, [x2]
8:
        st1             {v0.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        subs            w4, w4, #4
        st1             {v0.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        b.gt            8b
        ret
160:
        ld1             {v0.8h, v1.8h}, [x2]
16:
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        subs            w4, w4, #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            16b
        ret
320:
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
32:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4, w4, #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            32b
        ret
640:
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        sub             x1, x1, #64             // first half-row store already advances by 64
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
64:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
        subs            w4, w4, #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
        b.gt            64b
        ret

L(ipred_v_tbl):
        .hword L(ipred_v_tbl) - 640b
        .hword L(ipred_v_tbl) - 320b
        .hword L(ipred_v_tbl) - 160b
        .hword L(ipred_v_tbl) - 80b
        .hword L(ipred_v_tbl) - 40b
endfunc

// Horizontal prediction: each output row is filled with one pixel read from
// below topleft. Every iteration ld4r reads the four pixels at x2 (starting
// at topleft[-4]) and replicates each across a vector; v3 (highest address)
// goes to the first row of the group and v0 to the fourth. x2 then steps
// back by 8 bytes (x7 = -8).
// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height);
function ipred_h_16bpc_neon, export=1
        clz             w3, w3
        adr             x5, L(ipred_h_tbl)
        sub             w3, w3, #25
        ldrh            w3, [x5, w3, uxtw #1]
        sub             x2, x2, #8              // x2 = &topleft[-4]
        sub             x5, x5, w3, uxtw
        mov             x7, #-8
        add             x6, x0, x1
        lsl             x1, x1, #1
        br              x5
4:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
        st1             {v3.4h}, [x0], x1
        st1             {v2.4h}, [x6], x1
        subs            w4, w4, #4
        st1             {v1.4h}, [x0], x1
        st1             {v0.4h}, [x6], x1
        b.gt            4b
        ret
8:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
        st1             {v3.8h}, [x0], x1
        st1             {v2.8h}, [x6], x1
        subs            w4, w4, #4
        st1             {v1.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        b.gt            8b
        ret
16:
        // Offset stores fill the rest of the row before the post-indexed
        // st1 writes the first 16 bytes and advances the row pointer.
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
        str             q3, [x0, #16]
        str             q2, [x6, #16]
        st1             {v3.8h}, [x0], x1
        st1             {v2.8h}, [x6], x1
        subs            w4, w4, #4
        str             q1, [x0, #16]
        str             q0, [x6, #16]
        st1             {v1.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        b.gt            16b
        ret
32:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
        str             q3, [x0, #16]
        str             q2, [x6, #16]
        stp             q3, q3, [x0, #32]
        stp             q2, q2, [x6, #32]
        st1             {v3.8h}, [x0], x1
        st1             {v2.8h}, [x6], x1
        subs            w4, w4, #4
        str             q1, [x0, #16]
        str             q0, [x6, #16]
        stp             q1, q1, [x0, #32]
        stp             q0, q0, [x6, #32]
        st1             {v1.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        b.gt            32b
        ret
64:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
        str             q3, [x0, #16]
        str             q2, [x6, #16]
        stp             q3, q3, [x0, #32]
        stp             q2, q2, [x6, #32]
        stp             q3, q3, [x0, #64]
        stp             q2, q2, [x6, #64]
        stp             q3, q3, [x0, #96]
        stp             q2, q2, [x6, #96]
        st1             {v3.8h}, [x0], x1
        st1             {v2.8h}, [x6], x1
        subs            w4, w4, #4
        str             q1, [x0, #16]
        str             q0, [x6, #16]
        stp             q1, q1, [x0, #32]
        stp             q0, q0, [x6, #32]
        stp             q1, q1, [x0, #64]
        stp             q0, q0, [x6, #64]
        stp             q1, q1, [x0, #96]
        stp             q0, q0, [x6, #96]
        st1             {v1.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        b.gt            64b
        ret

L(ipred_h_tbl):
        .hword L(ipred_h_tbl) - 64b
        .hword L(ipred_h_tbl) - 32b
        .hword L(ipred_h_tbl) - 16b
        .hword L(ipred_h_tbl) - 8b
        .hword L(ipred_h_tbl) - 4b
endfunc

// DC from the top edge only: sum the `width` pixels starting at topleft[1],
// divide by width with rounding (shift by log2(width)), and fill the block
// with the result.
// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height);
function ipred_dc_top_16bpc_neon, export=1
        clz             w3, w3
        adr             x5, L(ipred_dc_top_tbl)
        sub             w3, w3, #25
        ldrh            w3, [x5, w3, uxtw #1]
        add             x2, x2, #2              // x2 = &topleft[1]
        sub             x5, x5, w3, uxtw
        add             x6, x0, x1
        lsl             x1, x1, #1
        br              x5
40:
        ld1             {v0.4h}, [x2]
        addv            h0, v0.4h
        urshr           v0.4h, v0.4h, #2
        dup             v0.4h, v0.h[0]
4:
        st1             {v0.4h}, [x0], x1
        st1             {v0.4h}, [x6], x1
        subs            w4, w4, #4
        st1             {v0.4h}, [x0], x1
        st1             {v0.4h}, [x6], x1
        b.gt            4b
        ret
80:
        ld1             {v0.8h}, [x2]
        addv            h0, v0.8h
        urshr           v0.4h, v0.4h, #3
        dup             v0.8h, v0.h[0]
8:
        st1             {v0.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        subs            w4, w4, #4
        st1             {v0.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        b.gt            8b
        ret
160:
        ld1             {v0.8h, v1.8h}, [x2]
        addp            v0.8h, v0.8h, v1.8h
        addv            h0, v0.8h
        urshr           v2.4h, v0.4h, #4
        dup             v0.8h, v2.h[0]
        dup             v1.8h, v2.h[0]
16:
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        subs            w4, w4, #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            16b
        ret
320:
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
        addp            v0.8h, v0.8h, v1.8h
        addp            v2.8h, v2.8h, v3.8h
        addp            v0.8h, v0.8h, v2.8h
        uaddlv          s0, v0.8h               // final reduction widened to 32 bits
        rshrn           v4.4h, v0.4s, #5
        dup             v0.8h, v4.h[0]
        dup             v1.8h, v4.h[0]
        dup             v2.8h, v4.h[0]
        dup             v3.8h, v4.h[0]
32:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4, w4, #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            32b
        ret
640:
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        addp            v0.8h, v0.8h, v1.8h
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
        addp            v2.8h, v2.8h, v3.8h
        addp            v4.8h, v4.8h, v5.8h
        addp            v6.8h, v6.8h, v7.8h
        addp            v0.8h, v0.8h, v2.8h
        addp            v4.8h, v4.8h, v6.8h
        addp            v0.8h, v0.8h, v4.8h
        uaddlv          s0, v0.8h               // final reduction widened to 32 bits
        rshrn           v4.4h, v0.4s, #6
        sub             x1, x1, #64             // first half-row store already advances by 64
        dup             v0.8h, v4.h[0]
        dup             v1.8h, v4.h[0]
        dup             v2.8h, v4.h[0]
        dup             v3.8h, v4.h[0]
64:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4, w4, #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            64b
        ret

L(ipred_dc_top_tbl):
        .hword L(ipred_dc_top_tbl) - 640b
        .hword L(ipred_dc_top_tbl) - 320b
        .hword L(ipred_dc_top_tbl) - 160b
        .hword L(ipred_dc_top_tbl) - 80b
        .hword L(ipred_dc_top_tbl) - 40b
endfunc

// DC from the left edge only: sum the `height` pixels starting at
// topleft[-height], divide by height with rounding, and fill the block.
// Two-stage dispatch through one table: entries 0-4 (indexed by
// clz(height) - 25) are the hN summing blocks, entries 5-9 (indexed by
// clz(width) - 20) are the wN store loops. x5 holds the hN target and x3 the
// wN target; each hN block ends with "br x3".
// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height, const int a,
//                               const int max_width, const int max_height);
function ipred_dc_left_16bpc_neon, export=1
        sub             x2, x2, w4, uxtw #1     // x2 = &topleft[-height]
        clz             w3, w3
        clz             w7, w4
        adr             x5, L(ipred_dc_left_tbl)
        sub             w3, w3, #20             // 25 leading bits, minus table offset 5
        sub             w7, w7, #25
        ldrh            w3, [x5, w3, uxtw #1]
        ldrh            w7, [x5, w7, uxtw #1]
        sub             x3, x5, w3, uxtw
        sub             x5, x5, w7, uxtw
        add             x6, x0, x1
        lsl             x1, x1, #1
        br              x5

L(ipred_dc_left_h4):
        ld1             {v0.4h}, [x2]
        addv            h0, v0.4h
        urshr           v0.4h, v0.4h, #2
        dup             v0.8h, v0.h[0]
        br              x3
L(ipred_dc_left_w4):
        st1             {v0.4h}, [x0], x1
        st1             {v0.4h}, [x6], x1
        subs            w4, w4, #4
        st1             {v0.4h}, [x0], x1
        st1             {v0.4h}, [x6], x1
        b.gt            L(ipred_dc_left_w4)
        ret

L(ipred_dc_left_h8):
        ld1             {v0.8h}, [x2]
        addv            h0, v0.8h
        urshr           v0.4h, v0.4h, #3
        dup             v0.8h, v0.h[0]
        br              x3
L(ipred_dc_left_w8):
        st1             {v0.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        subs            w4, w4, #4
        st1             {v0.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        b.gt            L(ipred_dc_left_w8)
        ret

L(ipred_dc_left_h16):
        ld1             {v0.8h, v1.8h}, [x2]
        addp            v0.8h, v0.8h, v1.8h
        addv            h0, v0.8h
        urshr           v2.4h, v0.4h, #4
        dup             v0.8h, v2.h[0]
        dup             v1.8h, v2.h[0]
        br              x3
L(ipred_dc_left_w16):
        mov             v1.16b, v0.16b
1:
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        subs            w4, w4, #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_h32):
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
        addp            v0.8h, v0.8h, v1.8h
        addp            v2.8h, v2.8h, v3.8h
        addp            v0.8h, v0.8h, v2.8h
        uaddlp          v0.4s, v0.8h
        addv            s0, v0.4s
        rshrn           v4.4h, v0.4s, #5
        dup             v0.8h, v4.h[0]
        br              x3
L(ipred_dc_left_w32):
        mov             v1.16b, v0.16b
        mov             v2.16b, v0.16b
        mov             v3.16b, v0.16b
1:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4, w4, #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_h64):
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        addp            v0.8h, v0.8h, v1.8h
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
        addp            v2.8h, v2.8h, v3.8h
        addp            v4.8h, v4.8h, v5.8h
        addp            v6.8h, v6.8h, v7.8h
        addp            v0.8h, v0.8h, v2.8h
        addp            v4.8h, v4.8h, v6.8h
        addp            v0.8h, v0.8h, v4.8h
        uaddlv          s0, v0.8h
        rshrn           v4.4h, v0.4s, #6
        dup             v0.8h, v4.h[0]
        br              x3
L(ipred_dc_left_w64):
        mov             v1.16b, v0.16b
        mov             v2.16b, v0.16b
        mov             v3.16b, v0.16b
        sub             x1, x1, #64             // first half-row store already advances by 64
1:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4, w4, #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_tbl):
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
endfunc

// DC from both edges. The hN block sums the `height` left pixels (leaving x2
// at topleft), then "br x3" enters the wN block, which sums the `width` top
// pixels and computes
//   (sum_left + sum_top + ((width + height) >> 1)) >> ctz(width + height)
// v16 holds the rounding term and v17 holds -ctz(width + height); ushl with a
// negative count shifts right. When width != height the shift only removes
// the power-of-two factor, so the result is further multiplied by 0xAAAB or
// 0x6667 and shifted right by 17, i.e. scaled by about 1/3 or 1/5.
// Dispatch table layout matches ipred_dc_left: entries 0-4 by height,
// entries 5-9 by width.
// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                          const pixel *const topleft,
//                          const int width, const int height, const int a,
//                          const int max_width, const int max_height);
function ipred_dc_16bpc_neon, export=1
        sub             x2, x2, w4, uxtw #1
        add             w7, w3, w4              // width + height
        clz             w3, w3
        clz             w6, w4
        dup             v16.4s, w7              // width + height
        adr             x5, L(ipred_dc_tbl)
        rbit            w7, w7                  // rbit(width + height)
        sub             w3, w3, #20             // 25 leading bits, minus table offset 5
        sub             w6, w6, #25
        clz             w7, w7                  // ctz(width + height)
        ldrh            w3, [x5, w3, uxtw #1]
        ldrh            w6, [x5, w6, uxtw #1]
        neg             w7, w7                  // -ctz(width + height)
        sub             x3, x5, w3, uxtw
        sub             x5, x5, w6, uxtw
        ushr            v16.4s, v16.4s, #1      // (width + height) >> 1
        dup             v17.4s, w7              // -ctz(width + height)
        add             x6, x0, x1
        lsl             x1, x1, #1
        br              x5

L(ipred_dc_h4):
        ld1             {v0.4h}, [x2], #8
        uaddlv          s0, v0.4h
        br              x3
L(ipred_dc_w4):
        add             x2, x2, #2
        ld1             {v1.4h}, [x2]
        add             v0.2s, v0.2s, v16.2s
        uaddlv          s1, v1.4h
        cmp             w4, #4
        add             v0.2s, v0.2s, v1.2s
        ushl            v0.2s, v0.2s, v17.2s
        b.eq            1f
        // h = 8/16
        cmp             w4, #16
        mov             w16, #0x6667
        mov             w17, #0xAAAB
        csel            w16, w16, w17, eq
        dup             v16.2s, w16
        mul             v0.2s, v0.2s, v16.2s
        ushr            v0.2s, v0.2s, #17
1:
        dup             v0.4h, v0.h[0]
2:
        st1             {v0.4h}, [x0], x1
        st1             {v0.4h}, [x6], x1
        subs            w4, w4, #4
        st1             {v0.4h}, [x0], x1
        st1             {v0.4h}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h8):
        ld1             {v0.8h}, [x2], #16
        uaddlv          s0, v0.8h
        br              x3
L(ipred_dc_w8):
        add             x2, x2, #2
        ld1             {v1.8h}, [x2]
        add             v0.2s, v0.2s, v16.2s
        uaddlv          s1, v1.8h
        cmp             w4, #8
        add             v0.2s, v0.2s, v1.2s
        ushl            v0.2s, v0.2s, v17.2s
        b.eq            1f
        // h = 4/16/32
        cmp             w4, #32
        mov             w16, #0x6667
        mov             w17, #0xAAAB
        csel            w16, w16, w17, eq
        dup             v16.2s, w16
        mul             v0.2s, v0.2s, v16.2s
        ushr            v0.2s, v0.2s, #17
1:
        dup             v0.8h, v0.h[0]
2:
        st1             {v0.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        subs            w4, w4, #4
        st1             {v0.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h16):
        ld1             {v0.8h, v1.8h}, [x2], #32
        addp            v0.8h, v0.8h, v1.8h
        uaddlv          s0, v0.8h
        br              x3
L(ipred_dc_w16):
        add             x2, x2, #2
        ld1             {v1.8h, v2.8h}, [x2]
        add             v0.2s, v0.2s, v16.2s
        addp            v1.8h, v1.8h, v2.8h
        uaddlv          s1, v1.8h
        cmp             w4, #16
        add             v0.2s, v0.2s, v1.2s
        ushl            v4.2s, v0.2s, v17.2s
        b.eq            1f
        // h = 4/8/32/64
        tst             w4, #(32+16+8)          // 16 added to make a consecutive bitmask
        mov             w16, #0x6667
        mov             w17, #0xAAAB
        csel            w16, w16, w17, eq
        dup             v16.2s, w16
        mul             v4.2s, v4.2s, v16.2s
        ushr            v4.2s, v4.2s, #17
1:
        dup             v0.8h, v4.h[0]
        dup             v1.8h, v4.h[0]
2:
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        subs            w4, w4, #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h32):
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        addp            v0.8h, v0.8h, v1.8h
        addp            v2.8h, v2.8h, v3.8h
        addp            v0.8h, v0.8h, v2.8h
        uaddlv          s0, v0.8h
        br              x3
L(ipred_dc_w32):
        add             x2, x2, #2
        ld1             {v1.8h, v2.8h, v3.8h, v4.8h}, [x2]
        add             v0.2s, v0.2s, v16.2s
        addp            v1.8h, v1.8h, v2.8h
        addp            v3.8h, v3.8h, v4.8h
        addp            v1.8h, v1.8h, v3.8h
        uaddlv          s1, v1.8h
        cmp             w4, #32
        add             v0.2s, v0.2s, v1.2s
        ushl            v4.2s, v0.2s, v17.2s
        b.eq            1f
        // h = 8/16/64
        cmp             w4, #8
        mov             w16, #0x6667
        mov             w17, #0xAAAB
        csel            w16, w16, w17, eq
        dup             v16.2s, w16
        mul             v4.2s, v4.2s, v16.2s
        ushr            v4.2s, v4.2s, #17
1:
        dup             v0.8h, v4.h[0]
        dup             v1.8h, v4.h[0]
        dup             v2.8h, v4.h[0]
        dup             v3.8h, v4.h[0]
2:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4, w4, #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h64):
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        addp            v0.8h, v0.8h, v1.8h
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        addp            v2.8h, v2.8h, v3.8h
        addp            v4.8h, v4.8h, v5.8h
        addp            v6.8h, v6.8h, v7.8h
        addp            v0.8h, v0.8h, v2.8h
        addp            v4.8h, v4.8h, v6.8h
        addp            v0.8h, v0.8h, v4.8h
        uaddlv          s0, v0.8h
        br              x3
L(ipred_dc_w64):
        add             x2, x2, #2
        ld1             {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64
        add             v0.2s, v0.2s, v16.2s
        addp            v1.8h, v1.8h, v2.8h
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2]
        addp            v3.8h, v3.8h, v4.8h
        addp            v20.8h, v20.8h, v21.8h
        addp            v22.8h, v22.8h, v23.8h
        addp            v1.8h, v1.8h, v3.8h
        addp            v20.8h, v20.8h, v22.8h
        addp            v1.8h, v1.8h, v20.8h
        uaddlv          s1, v1.8h
        cmp             w4, #64
        add             v0.2s, v0.2s, v1.2s
        ushl            v4.2s, v0.2s, v17.2s
        b.eq            1f
        // h = 16/32
        cmp             w4, #16
        mov             w16, #0x6667
        mov             w17, #0xAAAB
        csel            w16, w16, w17, eq
        dup             v16.2s, w16
        mul             v4.2s, v4.2s, v16.2s
        ushr            v4.2s, v4.2s, #17
1:
        sub             x1, x1, #64             // first half-row store already advances by 64
        dup             v0.8h, v4.h[0]
        dup             v1.8h, v4.h[0]
        dup             v2.8h, v4.h[0]
        dup             v3.8h, v4.h[0]
2:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4, w4, #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_tbl):
        .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
endfunc

// Paeth prediction. For every pixel, base = left + top - topleft, and the
// output is whichever of left / top / topleft is closest to base (left wins
// ties, then top).
// Register roles: v4 = topleft replicated, v5 = current top pixels,
// x8 = read pointer into the top row, x2 = read pointer into the left
// column (walked backwards, 4 pixels per step via x7 = -8).
// Widths 8 and up share one path that processes four rows by eight columns
// per inner iteration; w9 keeps the original width for rewinding.
// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                             const pixel *const topleft,
//                             const int width, const int height, const int a,
//                             const int max_width, const int max_height);
function ipred_paeth_16bpc_neon, export=1
        clz             w9, w3
        adr             x5, L(ipred_paeth_tbl)
        sub             w9, w9, #25
        ldrh            w9, [x5, w9, uxtw #1]
        ld1r            {v4.8h}, [x2]           // topleft
        add             x8, x2, #2              // x8 = &topleft[1] (top row)
        sub             x2, x2, #8              // x2 = &topleft[-4] (left column)
        sub             x5, x5, w9, uxtw
        mov             x7, #-8
        add             x6, x0, x1
        lsl             x1, x1, #1
        br              x5
40:
        ld1r            {v5.2d}, [x8]           // four top pixels in both halves
        sub             v6.8h, v5.8h, v4.8h     // top - topleft
4:
        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7
        zip1            v0.2d, v0.2d, v1.2d     // two rows per vector
        zip1            v2.2d, v2.2d, v3.2d
        add             v16.8h, v6.8h, v0.8h    // base
        add             v17.8h, v6.8h, v2.8h
        sabd            v20.8h, v5.8h, v16.8h   // tdiff
        sabd            v21.8h, v5.8h, v17.8h
        sabd            v22.8h, v4.8h, v16.8h   // tldiff
        sabd            v23.8h, v4.8h, v17.8h
        sabd            v16.8h, v0.8h, v16.8h   // ldiff
        sabd            v17.8h, v2.8h, v17.8h
        umin            v18.8h, v20.8h, v22.8h  // min(tdiff, tldiff)
        umin            v19.8h, v21.8h, v23.8h
        cmge            v20.8h, v22.8h, v20.8h  // tldiff >= tdiff
        cmge            v21.8h, v23.8h, v21.8h
        cmge            v16.8h, v18.8h, v16.8h  // min(tdiff, tldiff) >= ldiff
        cmge            v17.8h, v19.8h, v17.8h
        bsl             v21.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft
        bsl             v20.16b, v5.16b, v4.16b
        bit             v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ...
        bit             v20.16b, v0.16b, v16.16b
        st1             {v21.d}[1], [x0], x1
        st1             {v21.d}[0], [x6], x1
        subs            w4, w4, #4
        st1             {v20.d}[1], [x0], x1
        st1             {v20.d}[0], [x6], x1
        b.gt            4b
        ret
80:
160:
320:
640:
        ld1             {v5.8h}, [x8], #16
        mov             w9, w3
        // Set up pointers for four rows in parallel; x0, x6, x5, x10
        add             x5, x0, x1
        add             x10, x6, x1
        lsl             x1, x1, #1
        sub             x1, x1, w3, uxtw #1     // 4*stride minus the bytes written per row
1:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
2:
        sub             v6.8h, v5.8h, v4.8h     // top - topleft
        add             v16.8h, v6.8h, v0.8h    // base
        add             v17.8h, v6.8h, v1.8h
        add             v18.8h, v6.8h, v2.8h
        add             v19.8h, v6.8h, v3.8h
        sabd            v20.8h, v5.8h, v16.8h   // tdiff
        sabd            v21.8h, v5.8h, v17.8h
        sabd            v22.8h, v5.8h, v18.8h
        sabd            v23.8h, v5.8h, v19.8h
        sabd            v24.8h, v4.8h, v16.8h   // tldiff
        sabd            v25.8h, v4.8h, v17.8h
        sabd            v26.8h, v4.8h, v18.8h
        sabd            v27.8h, v4.8h, v19.8h
        sabd            v16.8h, v0.8h, v16.8h   // ldiff
        sabd            v17.8h, v1.8h, v17.8h
        sabd            v18.8h, v2.8h, v18.8h
        sabd            v19.8h, v3.8h, v19.8h
        umin            v28.8h, v20.8h, v24.8h  // min(tdiff, tldiff)
        umin            v29.8h, v21.8h, v25.8h
        umin            v30.8h, v22.8h, v26.8h
        umin            v31.8h, v23.8h, v27.8h
        cmge            v20.8h, v24.8h, v20.8h  // tldiff >= tdiff
        cmge            v21.8h, v25.8h, v21.8h
        cmge            v22.8h, v26.8h, v22.8h
        cmge            v23.8h, v27.8h, v23.8h
        cmge            v16.8h, v28.8h, v16.8h  // min(tdiff, tldiff) >= ldiff
        cmge            v17.8h, v29.8h, v17.8h
        cmge            v18.8h, v30.8h, v18.8h
        cmge            v19.8h, v31.8h, v19.8h
        bsl             v23.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft
        bsl             v22.16b, v5.16b, v4.16b
        bsl             v21.16b, v5.16b, v4.16b
        bsl             v20.16b, v5.16b, v4.16b
        bit             v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ...
        bit             v22.16b, v2.16b, v18.16b
        bit             v21.16b, v1.16b, v17.16b
        bit             v20.16b, v0.16b, v16.16b
        st1             {v23.8h}, [x0], #16
        st1             {v22.8h}, [x6], #16
        subs            w3, w3, #8
        st1             {v21.8h}, [x5], #16
        st1             {v20.8h}, [x10], #16
        b.le            8f
        ld1             {v5.8h}, [x8], #16
        b               2b
8:
        subs            w4, w4, #4
        b.le            9f
        // End of horizontal loop, move pointers to next four rows
        sub             x8, x8, w9, uxtw #1
        add             x0, x0, x1
        add             x6, x6, x1
        // Load the top row as early as possible
        ld1             {v5.8h}, [x8], #16
        add             x5, x5, x1
        add             x10, x10, x1
        mov             w3, w9
        b               1b
9:
        ret

L(ipred_paeth_tbl):
        .hword L(ipred_paeth_tbl) - 640b
        .hword L(ipred_paeth_tbl) - 320b
        .hword L(ipred_paeth_tbl) - 160b
        .hword L(ipred_paeth_tbl) - 80b
        .hword L(ipred_paeth_tbl) - 40b
endfunc

// Two-dimensional smooth prediction. Per pixel, as computed below:
//   ((bottom + right) * 256
//     + (left - right)  * weights_hor[x]
//     + (top  - bottom) * weights_ver[y] + 256) >> 9
// bottom is the pixel at topleft[-height], right is the last top pixel
// (topleft[width]). Weights are bytes read from sm_weights + width
// (horizontal, x10) and sm_weights + height (vertical, x11).
// Widths 16 and up share one path that handles two rows by sixteen columns
// per inner iteration.
// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height);
function ipred_smooth_16bpc_neon, export=1
        movrel          x10, X(sm_weights)
        add             x11, x10, w4, uxtw
        add             x10, x10, w3, uxtw
        clz             w9, w3
        adr             x5, L(ipred_smooth_tbl)
        sub             x12, x2, w4, uxtw #1
        sub             w9, w9, #25
        ldrh            w9, [x5, w9, uxtw #1]
        ld1r            {v4.8h}, [x12]          // bottom
        add             x8, x2, #2
        sub             x5, x5, w9, uxtw
        add             x6, x0, x1
        lsl             x1, x1, #1
        br              x5
40:
        ld1r            {v6.2d}, [x8]           // top
        ld1r            {v7.2s}, [x10]          // weights_hor
        sub             x2, x2, #8
        mov             x7, #-8
        dup             v5.8h, v6.h[3]          // right
        sub             v6.8h, v6.8h, v4.8h     // top-bottom
        uxtl            v7.8h, v7.8b            // weights_hor
        add             v31.4h, v4.4h, v5.4h    // bottom+right
4:
        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver
        ushll           v20.4s, v31.4h, #8      // (bottom+right)*256
        ushll           v21.4s, v31.4h, #8
        ushll           v22.4s, v31.4h, #8
        ushll           v23.4s, v31.4h, #8
        zip1            v1.2d, v1.2d, v0.2d     // left, flipped
        zip1            v0.2d, v3.2d, v2.2d
        zip1            v16.2s, v16.2s, v17.2s  // weights_ver
        zip1            v18.2s, v18.2s, v19.2s
        sub             v0.8h, v0.8h, v5.8h     // left-right
        sub             v1.8h, v1.8h, v5.8h
        uxtl            v16.8h, v16.8b          // weights_ver
        uxtl            v18.8h, v18.8b
        smlal           v20.4s, v0.4h, v7.4h    // += (left-right)*weights_hor
        smlal2          v21.4s, v0.8h, v7.8h
        smlal           v22.4s, v1.4h, v7.4h
        smlal2          v23.4s, v1.8h, v7.8h
        smlal           v20.4s, v6.4h, v16.4h   // += (top-bottom)*weights_ver
        smlal2          v21.4s, v6.8h, v16.8h
        smlal           v22.4s, v6.4h, v18.4h
        smlal2          v23.4s, v6.8h, v18.8h
        rshrn           v20.4h, v20.4s, #9
        rshrn           v21.4h, v21.4s, #9
        rshrn           v22.4h, v22.4s, #9
        rshrn           v23.4h, v23.4s, #9
        st1             {v20.4h}, [x0], x1
        st1             {v21.4h}, [x6], x1
        subs            w4, w4, #4
        st1             {v22.4h}, [x0], x1
        st1             {v23.4h}, [x6], x1
        b.gt            4b
        ret
80:
        ld1             {v6.8h}, [x8]           // top
        ld1             {v7.8b}, [x10]          // weights_hor
        sub             x2, x2, #8
        mov             x7, #-8
        dup             v5.8h, v6.h[7]          // right
        sub             v6.8h, v6.8h, v4.8h     // top-bottom
        uxtl            v7.8h, v7.8b            // weights_hor
        add             v31.4h, v4.4h, v5.4h    // bottom+right
8:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver
        ushll           v20.4s, v31.4h, #8      // (bottom+right)*256
        ushll           v21.4s, v31.4h, #8
        ushll           v22.4s, v31.4h, #8
        ushll           v23.4s, v31.4h, #8
        ushll           v24.4s, v31.4h, #8
        ushll           v25.4s, v31.4h, #8
        ushll           v26.4s, v31.4h, #8
        ushll           v27.4s, v31.4h, #8
        sub             v0.8h, v0.8h, v5.8h     // left-right
        sub             v1.8h, v1.8h, v5.8h
        sub             v2.8h, v2.8h, v5.8h
        sub             v3.8h, v3.8h, v5.8h
        uxtl            v16.8h, v16.8b          // weights_ver
        uxtl            v17.8h, v17.8b
        uxtl            v18.8h, v18.8b
        uxtl            v19.8h, v19.8b
        smlal           v20.4s, v3.4h, v7.4h    // += (left-right)*weights_hor
        smlal2          v21.4s, v3.8h, v7.8h    // (left flipped)
        smlal           v22.4s, v2.4h, v7.4h
        smlal2          v23.4s, v2.8h, v7.8h
        smlal           v24.4s, v1.4h, v7.4h
        smlal2          v25.4s, v1.8h, v7.8h
        smlal           v26.4s, v0.4h, v7.4h
        smlal2          v27.4s, v0.8h, v7.8h
        smlal           v20.4s, v6.4h, v16.4h   // += (top-bottom)*weights_ver
        smlal2          v21.4s, v6.8h, v16.8h
        smlal           v22.4s, v6.4h, v17.4h
        smlal2          v23.4s, v6.8h, v17.8h
        smlal           v24.4s, v6.4h, v18.4h
        smlal2          v25.4s, v6.8h, v18.8h
        smlal           v26.4s, v6.4h, v19.4h
        smlal2          v27.4s, v6.8h, v19.8h
        rshrn           v20.4h, v20.4s, #9
        rshrn2          v20.8h, v21.4s, #9
        rshrn           v21.4h, v22.4s, #9
        rshrn2          v21.8h, v23.4s, #9
        rshrn           v22.4h, v24.4s, #9
        rshrn2          v22.8h, v25.4s, #9
        rshrn           v23.4h, v26.4s, #9
        rshrn2          v23.8h, v27.4s, #9
        st1             {v20.8h}, [x0], x1
        st1             {v21.8h}, [x6], x1
        subs            w4, w4, #4
        st1             {v22.8h}, [x0], x1
        st1             {v23.8h}, [x6], x1
        b.gt            8b
        ret
160:
320:
640:
        add             x12, x2, w3, uxtw #1
        sub             x1, x1, w3, uxtw #1     // 2*stride minus the bytes written per row
        ld1r            {v5.8h}, [x12]          // right
        sub             x2, x2, #4
        mov             x7, #-4
        mov             w9, w3
        add             v31.4h, v4.4h, v5.4h    // bottom+right

1:
        ld2r            {v0.8h, v1.8h}, [x2], x7 // left
        ld2r            {v16.8b, v17.8b}, [x11], #2 // weights_ver
        sub             v0.8h, v0.8h, v5.8h     // left-right
        sub             v1.8h, v1.8h, v5.8h
        uxtl            v16.8h, v16.8b          // weights_ver
        uxtl            v17.8h, v17.8b
2:
        ld1             {v7.16b}, [x10], #16    // weights_hor
        ld1             {v2.8h, v3.8h}, [x8], #32 // top
        ushll           v20.4s, v31.4h, #8      // (bottom+right)*256
        ushll           v21.4s, v31.4h, #8
        ushll           v22.4s, v31.4h, #8
        ushll           v23.4s, v31.4h, #8
        ushll           v24.4s, v31.4h, #8
        ushll           v25.4s, v31.4h, #8
        ushll           v26.4s, v31.4h, #8
        ushll           v27.4s, v31.4h, #8
        uxtl            v6.8h, v7.8b            // weights_hor
        uxtl2           v7.8h, v7.16b
        sub             v2.8h, v2.8h, v4.8h     // top-bottom
        sub             v3.8h, v3.8h, v4.8h
        smlal           v20.4s, v1.4h, v6.4h    // += (left-right)*weights_hor
        smlal2          v21.4s, v1.8h, v6.8h    // (left flipped)
        smlal           v22.4s, v1.4h, v7.4h
        smlal2          v23.4s, v1.8h, v7.8h
        smlal           v24.4s, v0.4h, v6.4h
        smlal2          v25.4s, v0.8h, v6.8h
        smlal           v26.4s, v0.4h, v7.4h
        smlal2          v27.4s, v0.8h, v7.8h
        smlal           v20.4s, v2.4h, v16.4h   // += (top-bottom)*weights_ver
        smlal2          v21.4s, v2.8h, v16.8h
        smlal           v22.4s, v3.4h, v16.4h
        smlal2          v23.4s, v3.8h, v16.8h
        smlal           v24.4s, v2.4h, v17.4h
        smlal2          v25.4s, v2.8h, v17.8h
        smlal           v26.4s, v3.4h, v17.4h
        smlal2          v27.4s, v3.8h, v17.8h
        rshrn           v20.4h, v20.4s, #9
        rshrn2          v20.8h, v21.4s, #9
        rshrn           v21.4h, v22.4s, #9
        rshrn2          v21.8h, v23.4s, #9
        rshrn           v22.4h, v24.4s, #9
        rshrn2          v22.8h, v25.4s, #9
        rshrn           v23.4h, v26.4s, #9
        rshrn2          v23.8h, v27.4s, #9
        subs            w3, w3, #16
        st1             {v20.8h, v21.8h}, [x0], #32
        st1             {v22.8h, v23.8h}, [x6], #32
        b.gt            2b
        subs            w4, w4, #2
        b.le            9f
        sub             x8, x8, w9, uxtw #1     // rewind top pointer by width pixels
        sub             x10, x10, w9, uxtw      // rewind weights_hor by width bytes
        add             x0, x0, x1
        add             x6, x6, x1
        mov             w3, w9
        b               1b
9:
        ret

L(ipred_smooth_tbl):
        .hword L(ipred_smooth_tbl) - 640b
        .hword L(ipred_smooth_tbl) - 320b
        .hword L(ipred_smooth_tbl) - 160b
        .hword L(ipred_smooth_tbl) - 80b
        .hword L(ipred_smooth_tbl) - 40b
endfunc

// Vertical-only smooth prediction. Per pixel:
//   bottom + (((top - bottom) * weights_ver[y] + 128) >> 8)
// The multiply/round/shift is done with sqrdmulh on weights pre-shifted left
// by 7. bottom is the pixel at topleft[-height]; weights are bytes read from
// sm_weights + height (x7).
// Widths 16 and up share one path that handles four rows by sixteen columns
// per inner iteration.
// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const topleft,
//                                const int width, const int height, const int a,
//                                const int max_width, const int max_height);
function ipred_smooth_v_16bpc_neon, export=1
        movrel          x7, X(sm_weights)
        add             x7, x7, w4, uxtw
        clz             w9, w3
        adr             x5, L(ipred_smooth_v_tbl)
        sub             x8, x2, w4, uxtw #1
        sub             w9, w9, #25
        ldrh            w9, [x5, w9, uxtw #1]
        ld1r            {v4.8h}, [x8]           // bottom
        add             x2, x2, #2
        sub             x5, x5, w9, uxtw
        add             x6, x0, x1
        lsl             x1, x1, #1
        br              x5
40:
        ld1r            {v6.2d}, [x2]           // top
        sub             v6.8h, v6.8h, v4.8h     // top-bottom
4:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
        zip1            v16.2s, v16.2s, v17.2s  // weights_ver
        zip1            v18.2s, v18.2s, v19.2s
        ushll           v16.8h, v16.8b, #7      // weights_ver << 7
        ushll           v18.8h, v18.8b, #7
        sqrdmulh        v20.8h, v6.8h, v16.8h   // ((top-bottom)*weights_ver + 128) >> 8
        sqrdmulh        v21.8h, v6.8h, v18.8h
        add             v20.8h, v20.8h, v4.8h
        add             v21.8h, v21.8h, v4.8h
        st1             {v20.d}[0], [x0], x1
        st1             {v20.d}[1], [x6], x1
        subs            w4, w4, #4
        st1             {v21.d}[0], [x0], x1
        st1             {v21.d}[1], [x6], x1
        b.gt            4b
        ret
80:
        ld1             {v6.8h}, [x2]           // top
        sub             v6.8h, v6.8h, v4.8h     // top-bottom
8:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
        ushll           v16.8h, v16.8b, #7      // weights_ver << 7
        ushll           v17.8h, v17.8b, #7
        ushll           v18.8h, v18.8b, #7
        ushll           v19.8h, v19.8b, #7
        sqrdmulh        v20.8h, v6.8h, v16.8h   // ((top-bottom)*weights_ver + 128) >> 8
        sqrdmulh        v21.8h, v6.8h, v17.8h
        sqrdmulh        v22.8h, v6.8h, v18.8h
        sqrdmulh        v23.8h, v6.8h, v19.8h
        add             v20.8h, v20.8h, v4.8h
        add             v21.8h, v21.8h, v4.8h
        add             v22.8h, v22.8h, v4.8h
        add             v23.8h, v23.8h, v4.8h
        st1             {v20.8h}, [x0], x1
        st1             {v21.8h}, [x6], x1
        subs            w4, w4, #4
        st1             {v22.8h}, [x0], x1
        st1             {v23.8h}, [x6], x1
        b.gt            8b
        ret
160:
320:
640:
        // Set up pointers for four rows in parallel; x0, x6, x5, x8
        add             x5, x0, x1
        add             x8, x6, x1
        lsl             x1, x1, #1
        sub             x1, x1, w3, uxtw #1     // 4*stride minus the bytes written per row
        mov             w9, w3

1:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
        ushll           v16.8h, v16.8b, #7      // weights_ver << 7
        ushll           v17.8h, v17.8b, #7
        ushll           v18.8h, v18.8b, #7
        ushll           v19.8h, v19.8b, #7
2:
        ld1             {v2.8h, v3.8h}, [x2], #32 // top
        sub             v2.8h, v2.8h, v4.8h     // top-bottom
        sub             v3.8h, v3.8h, v4.8h
        sqrdmulh        v20.8h, v2.8h, v16.8h   // ((top-bottom)*weights_ver + 128) >> 8
        sqrdmulh        v21.8h, v3.8h, v16.8h
        sqrdmulh        v22.8h, v2.8h, v17.8h
        sqrdmulh        v23.8h, v3.8h, v17.8h
        sqrdmulh        v24.8h, v2.8h, v18.8h
        sqrdmulh        v25.8h, v3.8h, v18.8h
        sqrdmulh        v26.8h, v2.8h, v19.8h
        sqrdmulh        v27.8h, v3.8h, v19.8h
        add             v20.8h, v20.8h, v4.8h
        add             v21.8h, v21.8h, v4.8h
        add             v22.8h, v22.8h, v4.8h
        add             v23.8h, v23.8h, v4.8h
        add             v24.8h, v24.8h, v4.8h
        add             v25.8h, v25.8h, v4.8h
        add             v26.8h, v26.8h, v4.8h
        add             v27.8h, v27.8h, v4.8h
        subs            w3, w3, #16
        st1             {v20.8h, v21.8h}, [x0], #32
        st1             {v22.8h, v23.8h}, [x6], #32
        st1             {v24.8h, v25.8h}, [x5], #32
        st1             {v26.8h, v27.8h}, [x8], #32
        b.gt            2b
        subs            w4, w4, #4
        b.le            9f
        sub             x2, x2, w9, uxtw #1     // rewind top pointer by width pixels
        add             x0, x0, x1
        add             x6, x6, x1
        add             x5, x5, x1
        add             x8, x8, x1
        mov             w3, w9
        b               1b
9:
        ret

L(ipred_smooth_v_tbl):
        .hword L(ipred_smooth_v_tbl) - 640b
        .hword L(ipred_smooth_v_tbl) - 320b
        .hword L(ipred_smooth_v_tbl) - 160b
        .hword L(ipred_smooth_v_tbl) - 80b
        .hword L(ipred_smooth_v_tbl) - 40b
endfunc

// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const topleft,
//                                const int width, const int height, const int a,
//                                const int max_width, const int max_height);
function ipred_smooth_h_16bpc_neon, export=1
        movrel          x8, X(sm_weights)
        add             x8, x8, w3, uxtw
        clz             w9, w3
        adr             x5, L(ipred_smooth_h_tbl)
        add             x12, x2, w3, uxtw #1
        sub             w9, w9, #25
        ldrh            w9, [x5, w9, uxtw #1]
        ld1r            {v5.8h}, [x12]          // right
        sub             x5, x5, w9, uxtw
        add             x6, x0, x1
        lsl             x1, x1, #1
        br              x5
40:
        ld1r            {v7.2s}, [x8]           // weights_hor
        sub             x2, x2, #8
        mov             x7, #-8
        ushll           v7.8h, v7.8b, #7        // weights_hor << 7
4:
        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left
        zip1            v1.2d, v1.2d, v0.2d     // left, flipped
        zip1            v0.2d, v3.2d, v2.2d
        sub             v0.8h, v0.8h, v5.8h     // left-right
        sub             v1.8h, v1.8h, v5.8h
        sqrdmulh        v20.8h, v0.8h, v7.8h    //
((left-right)*weights_hor + 128) >> 8 1257 sqrdmulh v21.8h, v1.8h, v7.8h 1258 add v20.8h, v20.8h, v5.8h 1259 add v21.8h, v21.8h, v5.8h 1260 st1 {v20.d}[0], [x0], x1 1261 st1 {v20.d}[1], [x6], x1 1262 subs w4, w4, #4 1263 st1 {v21.d}[0], [x0], x1 1264 st1 {v21.d}[1], [x6], x1 1265 b.gt 4b 1266 ret 126780: 1268 ld1 {v7.8b}, [x8] // weights_hor 1269 sub x2, x2, #8 1270 mov x7, #-8 1271 ushll v7.8h, v7.8b, #7 // weights_hor << 7 12728: 1273 ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left 1274 sub v3.8h, v3.8h, v5.8h // left-right 1275 sub v2.8h, v2.8h, v5.8h 1276 sub v1.8h, v1.8h, v5.8h 1277 sub v0.8h, v0.8h, v5.8h 1278 sqrdmulh v20.8h, v3.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8 1279 sqrdmulh v21.8h, v2.8h, v7.8h // (left flipped) 1280 sqrdmulh v22.8h, v1.8h, v7.8h 1281 sqrdmulh v23.8h, v0.8h, v7.8h 1282 add v20.8h, v20.8h, v5.8h 1283 add v21.8h, v21.8h, v5.8h 1284 add v22.8h, v22.8h, v5.8h 1285 add v23.8h, v23.8h, v5.8h 1286 st1 {v20.8h}, [x0], x1 1287 st1 {v21.8h}, [x6], x1 1288 subs w4, w4, #4 1289 st1 {v22.8h}, [x0], x1 1290 st1 {v23.8h}, [x6], x1 1291 b.gt 8b 1292 ret 1293160: 1294320: 1295640: 1296 sub x2, x2, #8 1297 mov x7, #-8 1298 // Set up pointers for four rows in parallel; x0, x6, x5, x10 1299 add x5, x0, x1 1300 add x10, x6, x1 1301 lsl x1, x1, #1 1302 sub x1, x1, w3, uxtw #1 1303 mov w9, w3 1304 13051: 1306 ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left 1307 sub v0.8h, v0.8h, v5.8h // left-right 1308 sub v1.8h, v1.8h, v5.8h 1309 sub v2.8h, v2.8h, v5.8h 1310 sub v3.8h, v3.8h, v5.8h 13112: 1312 ld1 {v7.16b}, [x8], #16 // weights_hor 1313 ushll v6.8h, v7.8b, #7 // weights_hor << 7 1314 ushll2 v7.8h, v7.16b, #7 1315 sqrdmulh v20.8h, v3.8h, v6.8h // ((left-right)*weights_hor + 128) >> 8 1316 sqrdmulh v21.8h, v3.8h, v7.8h // (left flipped) 1317 sqrdmulh v22.8h, v2.8h, v6.8h 1318 sqrdmulh v23.8h, v2.8h, v7.8h 1319 sqrdmulh v24.8h, v1.8h, v6.8h 1320 sqrdmulh v25.8h, v1.8h, v7.8h 1321 sqrdmulh v26.8h, v0.8h, v6.8h 1322 sqrdmulh v27.8h, v0.8h, 
v7.8h 1323 add v20.8h, v20.8h, v5.8h 1324 add v21.8h, v21.8h, v5.8h 1325 add v22.8h, v22.8h, v5.8h 1326 add v23.8h, v23.8h, v5.8h 1327 add v24.8h, v24.8h, v5.8h 1328 add v25.8h, v25.8h, v5.8h 1329 add v26.8h, v26.8h, v5.8h 1330 add v27.8h, v27.8h, v5.8h 1331 subs w3, w3, #16 1332 st1 {v20.8h, v21.8h}, [x0], #32 1333 st1 {v22.8h, v23.8h}, [x6], #32 1334 st1 {v24.8h, v25.8h}, [x5], #32 1335 st1 {v26.8h, v27.8h}, [x10], #32 1336 b.gt 2b 1337 subs w4, w4, #4 1338 b.le 9f 1339 sub x8, x8, w9, uxtw 1340 add x0, x0, x1 1341 add x6, x6, x1 1342 add x5, x5, x1 1343 add x10, x10, x1 1344 mov w3, w9 1345 b 1b 13469: 1347 ret 1348 1349L(ipred_smooth_h_tbl): 1350 .hword L(ipred_smooth_h_tbl) - 640b 1351 .hword L(ipred_smooth_h_tbl) - 320b 1352 .hword L(ipred_smooth_h_tbl) - 160b 1353 .hword L(ipred_smooth_h_tbl) - 80b 1354 .hword L(ipred_smooth_h_tbl) - 40b 1355endfunc

// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int filt_idx,
//                              const int max_width, const int max_height,
//                              const int bitdepth_max);
//
// filter_fn emits one body per bit depth (ipred_filter_10bpc_neon /
// ipred_filter_12bpc_neon); ipred_filter_16bpc_neon below picks one.
//
// Each step produces a 4x2 block of pixels (8 lanes: row 0 in lanes 0-3,
// row 1 in lanes 4-7) from seven neighbours, each scaled by its own
// 8-lane tap vector:
//   p0 = top-left            * f0 (v16)
//   p1..p4 = the 4 pixels above * f1..f4 (v17..v20)
//   p5 = left, upper row     * f5 (v21)
//   p6 = left, lower row     * f6 (v22)
// The sum is rounded (>> 4) and clamped to [0, bitdepth_max].
//   bpc == 10: 16-bit mul/mla, srshr, then smax(0)/smin(max).
//   otherwise: widening smull/smlal, sqrshrun (unsigned saturation gives
//              the lower clamp), then smin(max).
//
// In:   x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height,
//       w5 = filt_idx, w8 = bitdepth_max (set by the dispatcher)
// Uses: x6 = second row pointer, x7 = step for the left-column pointer,
//       x8 = top pointer (width >= 16), w9 = table index / saved width,
//       v30 = 0, v31 = bitdepth_max
.macro filter_fn bpc
function ipred_filter_\bpc\()bpc_neon
        and             w5,  w5,  #511
        movrel          x6,  X(filter_intra_taps)
        lsl             w5,  w5,  #6              // 64 bytes of taps per filter
        add             x6,  x6,  w5, uxtw
        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32 // f0-f3
        clz             w9,  w3
        adr             x5,  L(ipred_filter\bpc\()_tbl)
        ld1             {v20.8b, v21.8b, v22.8b}, [x6]              // f4-f6
        sub             w9,  w9,  #26             // 0 for width 32 ... 3 for width 4
        ldrh            w9,  [x5, w9, uxtw #1]
        sxtl            v16.8h,  v16.8b           // taps are signed 8 bit
        sxtl            v17.8h,  v17.8b
        sub             x5,  x5,  w9, uxtw
        sxtl            v18.8h,  v18.8b
        sxtl            v19.8h,  v19.8b
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        sxtl            v20.8h,  v20.8b
        sxtl            v21.8h,  v21.8b
        sxtl            v22.8h,  v22.8b
        dup             v31.8h,  w8               // bitdepth_max
        movi            v30.8h,  #0
        br              x5
40:
        ldur            d0,  [x2, #2]             // top (0-3)
        sub             x2,  x2,  #4
        mov             x7,  #-4
4:
        ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
.if \bpc == 10
        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
        srshr           v2.8h,   v2.8h,   #4
        smax            v2.8h,   v2.8h,   v30.8h
.else
        smull           v2.4s,   v17.4h,  v0.h[0] // p1*f1
        smlal           v2.4s,   v18.4h,  v0.h[1] // p2*f2
        smlal           v2.4s,   v19.4h,  v0.h[2] // p3*f3
        smlal           v2.4s,   v20.4h,  v0.h[3] // p4*f4
        smlal           v2.4s,   v16.4h,  v1.h[2] // p0*f0
        smlal           v2.4s,   v21.4h,  v1.h[1] // p5*f5
        smlal           v2.4s,   v22.4h,  v1.h[0] // p6*f6
        smull2          v3.4s,   v17.8h,  v0.h[0] // p1*f1
        smlal2          v3.4s,   v18.8h,  v0.h[1] // p2*f2
        smlal2          v3.4s,   v19.8h,  v0.h[2] // p3*f3
        smlal2          v3.4s,   v20.8h,  v0.h[3] // p4*f4
        smlal2          v3.4s,   v16.8h,  v1.h[2] // p0*f0
        smlal2          v3.4s,   v21.8h,  v1.h[1] // p5*f5
        smlal2          v3.4s,   v22.8h,  v1.h[0] // p6*f6
        sqrshrun        v2.4h,   v2.4s,   #4
        sqrshrun2       v2.8h,   v3.4s,   #4
.endif
        smin            v2.8h,   v2.8h,   v31.8h
        subs            w4,  w4,  #2
        st1             {v2.d}[0], [x0], x1
        // The lower output row becomes the top row of the next 4x2 block;
        // ext rewrites all of v0, so nothing else has to prepare it.
        ext             v0.16b,  v2.16b,  v2.16b, #8 // move top from [4-7] to [0-3]
        st1             {v2.d}[1], [x6], x1
        b.gt            4b
        ret
80:
        ldur            q0,  [x2, #2]             // top (0-7)
        sub             x2,  x2,  #4
        mov             x7,  #-4
8:
        ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
.if \bpc == 10
        // left 4x2 block
        mul             v2.8h,   v17.8h,  v0.h[0] // p1*f1
        mla             v2.8h,   v18.8h,  v0.h[1] // p2*f2
        mla             v2.8h,   v19.8h,  v0.h[2] // p3*f3
        mla             v2.8h,   v20.8h,  v0.h[3] // p4*f4
        mla             v2.8h,   v16.8h,  v1.h[2] // p0*f0
        mla             v2.8h,   v21.8h,  v1.h[1] // p5*f5
        mla             v2.8h,   v22.8h,  v1.h[0] // p6*f6
        // right 4x2 block; its left neighbours are lanes 3/7 of the left block
        mul             v3.8h,   v17.8h,  v0.h[4] // p1*f1
        mla             v3.8h,   v18.8h,  v0.h[5] // p2*f2
        mla             v3.8h,   v19.8h,  v0.h[6] // p3*f3
        srshr           v2.8h,   v2.8h,   #4
        smax            v2.8h,   v2.8h,   v30.8h
        smin            v2.8h,   v2.8h,   v31.8h
        mla             v3.8h,   v20.8h,  v0.h[7] // p4*f4
        mla             v3.8h,   v16.8h,  v0.h[3] // p0*f0
        mla             v3.8h,   v21.8h,  v2.h[3] // p5*f5
        mla             v3.8h,   v22.8h,  v2.h[7] // p6*f6
        srshr           v3.8h,   v3.8h,   #4
        smax            v3.8h,   v3.8h,   v30.8h
.else
        // left 4x2 block
        smull           v2.4s,   v17.4h,  v0.h[0] // p1*f1
        smlal           v2.4s,   v18.4h,  v0.h[1] // p2*f2
        smlal           v2.4s,   v19.4h,  v0.h[2] // p3*f3
        smlal           v2.4s,   v20.4h,  v0.h[3] // p4*f4
        smlal           v2.4s,   v16.4h,  v1.h[2] // p0*f0
        smlal           v2.4s,   v21.4h,  v1.h[1] // p5*f5
        smlal           v2.4s,   v22.4h,  v1.h[0] // p6*f6
        smull2          v3.4s,   v17.8h,  v0.h[0] // p1*f1
        smlal2          v3.4s,   v18.8h,  v0.h[1] // p2*f2
        smlal2          v3.4s,   v19.8h,  v0.h[2] // p3*f3
        smlal2          v3.4s,   v20.8h,  v0.h[3] // p4*f4
        smlal2          v3.4s,   v16.8h,  v1.h[2] // p0*f0
        smlal2          v3.4s,   v21.8h,  v1.h[1] // p5*f5
        smlal2          v3.4s,   v22.8h,  v1.h[0] // p6*f6
        // right 4x2 block; its left neighbours are lanes 3/7 of the left block
        smull           v4.4s,   v17.4h,  v0.h[4] // p1*f1
        smlal           v4.4s,   v18.4h,  v0.h[5] // p2*f2
        smlal           v4.4s,   v19.4h,  v0.h[6] // p3*f3
        sqrshrun        v2.4h,   v2.4s,   #4
        sqrshrun2       v2.8h,   v3.4s,   #4
        smin            v2.8h,   v2.8h,   v31.8h
        smlal           v4.4s,   v20.4h,  v0.h[7] // p4*f4
        smlal           v4.4s,   v16.4h,  v0.h[3] // p0*f0
        smlal           v4.4s,   v21.4h,  v2.h[3] // p5*f5
        smlal           v4.4s,   v22.4h,  v2.h[7] // p6*f6
        smull2          v5.4s,   v17.8h,  v0.h[4] // p1*f1
        smlal2          v5.4s,   v18.8h,  v0.h[5] // p2*f2
        smlal2          v5.4s,   v19.8h,  v0.h[6] // p3*f3
        smlal2          v5.4s,   v20.8h,  v0.h[7] // p4*f4
        smlal2          v5.4s,   v16.8h,  v0.h[3] // p0*f0
        smlal2          v5.4s,   v21.8h,  v2.h[3] // p5*f5
        smlal2          v5.4s,   v22.8h,  v2.h[7] // p6*f6
        sqrshrun        v3.4h,   v4.4s,   #4
        sqrshrun2       v3.8h,   v5.4s,   #4
.endif
        smin            v3.8h,   v3.8h,   v31.8h
        subs            w4,  w4,  #2
        st2             {v2.d, v3.d}[0], [x0], x1
        zip2            v0.2d,   v2.2d,   v3.2d   // lower output row = next top
        st2             {v2.d, v3.d}[1], [x6], x1
        b.gt            8b
        ret
160:
320:
        add             x8,  x2,  #2              // top
        sub             x2,  x2,  #4
        mov             x7,  #-4
        sub             x1,  x1,  w3, uxtw #1     // row step minus the bytes stored per row
        mov             w9,  w3                   // keep width for the next pair of rows

1:
        ld1             {v0.4h}, [x2], x7         // left (0-1) + topleft (2)
2:
        ld1             {v1.8h, v2.8h}, [x8], #32 // top(0-15)
.if \bpc == 10
        // block 0 (columns 0-3)
        mul             v3.8h,   v16.8h,  v0.h[2] // p0*f0
        mla             v3.8h,   v21.8h,  v0.h[1] // p5*f5
        mla             v3.8h,   v22.8h,  v0.h[0] // p6*f6
        mla             v3.8h,   v17.8h,  v1.h[0] // p1*f1
        mla             v3.8h,   v18.8h,  v1.h[1] // p2*f2
        mla             v3.8h,   v19.8h,  v1.h[2] // p3*f3
        mla             v3.8h,   v20.8h,  v1.h[3] // p4*f4

        // block 1 (columns 4-7)
        mul             v4.8h,   v17.8h,  v1.h[4] // p1*f1
        mla             v4.8h,   v18.8h,  v1.h[5] // p2*f2
        mla             v4.8h,   v19.8h,  v1.h[6] // p3*f3
        srshr           v3.8h,   v3.8h,   #4
        smax            v3.8h,   v3.8h,   v30.8h
        smin            v3.8h,   v3.8h,   v31.8h
        mla             v4.8h,   v20.8h,  v1.h[7] // p4*f4
        mla             v4.8h,   v16.8h,  v1.h[3] // p0*f0
        mla             v4.8h,   v21.8h,  v3.h[3] // p5*f5
        mla             v4.8h,   v22.8h,  v3.h[7] // p6*f6

        // block 2 (columns 8-11)
        mul             v5.8h,   v17.8h,  v2.h[0] // p1*f1
        mla             v5.8h,   v18.8h,  v2.h[1] // p2*f2
        mla             v5.8h,   v19.8h,  v2.h[2] // p3*f3
        srshr           v4.8h,   v4.8h,   #4
        smax            v4.8h,   v4.8h,   v30.8h
        smin            v4.8h,   v4.8h,   v31.8h
        mla             v5.8h,   v20.8h,  v2.h[3] // p4*f4
        mla             v5.8h,   v16.8h,  v1.h[7] // p0*f0
        mla             v5.8h,   v21.8h,  v4.h[3] // p5*f5
        mla             v5.8h,   v22.8h,  v4.h[7] // p6*f6

        // block 3 (columns 12-15)
        mul             v6.8h,   v17.8h,  v2.h[4] // p1*f1
        mla             v6.8h,   v18.8h,  v2.h[5] // p2*f2
        mla             v6.8h,   v19.8h,  v2.h[6] // p3*f3
        srshr           v5.8h,   v5.8h,   #4
        smax            v5.8h,   v5.8h,   v30.8h
        smin            v5.8h,   v5.8h,   v31.8h
        mla             v6.8h,   v20.8h,  v2.h[7] // p4*f4
        mla             v6.8h,   v16.8h,  v2.h[3] // p0*f0
        mla             v6.8h,   v21.8h,  v5.h[3] // p5*f5
        mla             v6.8h,   v22.8h,  v5.h[7] // p6*f6

        subs            w3,  w3,  #16
        srshr           v6.8h,   v6.8h,   #4
        smax            v6.8h,   v6.8h,   v30.8h
.else
        // block 0 (columns 0-3)
        smull           v3.4s,   v16.4h,  v0.h[2] // p0*f0
        smlal           v3.4s,   v21.4h,  v0.h[1] // p5*f5
        smlal           v3.4s,   v22.4h,  v0.h[0] // p6*f6
        smlal           v3.4s,   v17.4h,  v1.h[0] // p1*f1
        smlal           v3.4s,   v18.4h,  v1.h[1] // p2*f2
        smlal           v3.4s,   v19.4h,  v1.h[2] // p3*f3
        smlal           v3.4s,   v20.4h,  v1.h[3] // p4*f4
        smull2          v4.4s,   v16.8h,  v0.h[2] // p0*f0
        smlal2          v4.4s,   v21.8h,  v0.h[1] // p5*f5
        smlal2          v4.4s,   v22.8h,  v0.h[0] // p6*f6
        smlal2          v4.4s,   v17.8h,  v1.h[0] // p1*f1
        smlal2          v4.4s,   v18.8h,  v1.h[1] // p2*f2
        smlal2          v4.4s,   v19.8h,  v1.h[2] // p3*f3
        smlal2          v4.4s,   v20.8h,  v1.h[3] // p4*f4

        // block 1 (columns 4-7)
        smull           v5.4s,   v17.4h,  v1.h[4] // p1*f1
        smlal           v5.4s,   v18.4h,  v1.h[5] // p2*f2
        smlal           v5.4s,   v19.4h,  v1.h[6] // p3*f3
        sqrshrun        v3.4h,   v3.4s,   #4
        sqrshrun2       v3.8h,   v4.4s,   #4
        smin            v3.8h,   v3.8h,   v31.8h
        smlal           v5.4s,   v20.4h,  v1.h[7] // p4*f4
        smlal           v5.4s,   v16.4h,  v1.h[3] // p0*f0
        smlal           v5.4s,   v21.4h,  v3.h[3] // p5*f5
        smlal           v5.4s,   v22.4h,  v3.h[7] // p6*f6
        smull2          v6.4s,   v17.8h,  v1.h[4] // p1*f1
        smlal2          v6.4s,   v18.8h,  v1.h[5] // p2*f2
        smlal2          v6.4s,   v19.8h,  v1.h[6] // p3*f3
        smlal2          v6.4s,   v20.8h,  v1.h[7] // p4*f4
        smlal2          v6.4s,   v16.8h,  v1.h[3] // p0*f0
        smlal2          v6.4s,   v21.8h,  v3.h[3] // p5*f5
        smlal2          v6.4s,   v22.8h,  v3.h[7] // p6*f6

        // block 2 (columns 8-11)
        smull           v24.4s,  v17.4h,  v2.h[0] // p1*f1
        smlal           v24.4s,  v18.4h,  v2.h[1] // p2*f2
        smlal           v24.4s,  v19.4h,  v2.h[2] // p3*f3
        sqrshrun        v4.4h,   v5.4s,   #4
        sqrshrun2       v4.8h,   v6.4s,   #4
        smin            v4.8h,   v4.8h,   v31.8h
        smlal           v24.4s,  v20.4h,  v2.h[3] // p4*f4
        smlal           v24.4s,  v16.4h,  v1.h[7] // p0*f0
        smlal           v24.4s,  v21.4h,  v4.h[3] // p5*f5
        smlal           v24.4s,  v22.4h,  v4.h[7] // p6*f6
        smull2          v25.4s,  v17.8h,  v2.h[0] // p1*f1
        smlal2          v25.4s,  v18.8h,  v2.h[1] // p2*f2
        smlal2          v25.4s,  v19.8h,  v2.h[2] // p3*f3
        smlal2          v25.4s,  v20.8h,  v2.h[3] // p4*f4
        smlal2          v25.4s,  v16.8h,  v1.h[7] // p0*f0
        smlal2          v25.4s,  v21.8h,  v4.h[3] // p5*f5
        smlal2          v25.4s,  v22.8h,  v4.h[7] // p6*f6

        // block 3 (columns 12-15)
        smull           v26.4s,  v17.4h,  v2.h[4] // p1*f1
        smlal           v26.4s,  v18.4h,  v2.h[5] // p2*f2
        smlal           v26.4s,  v19.4h,  v2.h[6] // p3*f3
        sqrshrun        v5.4h,   v24.4s,  #4
        sqrshrun2       v5.8h,   v25.4s,  #4
        smin            v5.8h,   v5.8h,   v31.8h
        smlal           v26.4s,  v20.4h,  v2.h[7] // p4*f4
        smlal           v26.4s,  v16.4h,  v2.h[3] // p0*f0
        smlal           v26.4s,  v21.4h,  v5.h[3] // p5*f5
        smlal           v26.4s,  v22.4h,  v5.h[7] // p6*f6
        smull2          v27.4s,  v17.8h,  v2.h[4] // p1*f1
        smlal2          v27.4s,  v18.8h,  v2.h[5] // p2*f2
        smlal2          v27.4s,  v19.8h,  v2.h[6] // p3*f3
        smlal2          v27.4s,  v20.8h,  v2.h[7] // p4*f4
        smlal2          v27.4s,  v16.8h,  v2.h[3] // p0*f0
        smlal2          v27.4s,  v21.8h,  v5.h[3] // p5*f5
        smlal2          v27.4s,  v22.8h,  v5.h[7] // p6*f6

        subs            w3,  w3,  #16
        sqrshrun        v6.4h,   v26.4s,  #4
        sqrshrun2       v6.8h,   v27.4s,  #4
.endif
        smin            v6.8h,   v6.8h,   v31.8h

        // Carry topleft/left for the next 16 columns in v0.h[2]/h[1]/h[0],
        // interleaved with the stores.
        ins             v0.h[2], v2.h[7]
        st4             {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32
        ins             v0.h[0], v6.h[7]
        st4             {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32
        ins             v0.h[1], v6.h[3]
        b.gt            2b
        subs            w4,  w4,  #2
        b.le            9f
        sub             x8,  x6,  w9, uxtw #1     // next top = the lower row just written
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9                   // reload width
        b               1b
9:
        ret

L(ipred_filter\bpc\()_tbl):
        .hword L(ipred_filter\bpc\()_tbl) - 320b
        .hword L(ipred_filter\bpc\()_tbl) - 160b
        .hword L(ipred_filter\bpc\()_tbl) -  80b
        .hword L(ipred_filter\bpc\()_tbl) -  40b
endfunc
.endm

filter_fn 10
filter_fn 12

// Exported entry point. bitdepth_max is the ninth argument, so it is read
// from the stack; values up to 0x3ff select the 10 bpc body, anything
// larger the 12 bpc body. w8 stays live across the tail branch because
// both bodies read it.
function ipred_filter_16bpc_neon, export=1
        ldr             w8,  [sp]
        cmp             w8,  0x3ff
        b.le            ipred_filter_10bpc_neon
        b               ipred_filter_12bpc_neon
endfunc

1664// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride, 1665// const uint16_t *const pal, const uint8_t *idx, 1666// const int w, const int h); 1667function pal_pred_16bpc_neon, export=1 1668 ld1 {v30.8h}, [x2] 1669 clz w9, w4 1670 adr x6, L(pal_pred_tbl) 1671
sub w9, w9, #25 1672 ldrh w9, [x6, w9, uxtw #1] 1673 movi v31.8h, #1, lsl #8 1674 sub x6, x6, w9, uxtw 1675 br x6 167640: 1677 add x2, x0, x1 1678 lsl x1, x1, #1 16794: 1680 ld1 {v1.16b}, [x3], #16 1681 subs w5, w5, #4 1682 // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ... 1683 add v1.16b, v1.16b, v1.16b 1684 zip1 v0.16b, v1.16b, v1.16b 1685 zip2 v1.16b, v1.16b, v1.16b 1686 add v0.8h, v0.8h, v31.8h 1687 add v1.8h, v1.8h, v31.8h 1688 tbl v0.16b, {v30.16b}, v0.16b 1689 st1 {v0.d}[0], [x0], x1 1690 tbl v1.16b, {v30.16b}, v1.16b 1691 st1 {v0.d}[1], [x2], x1 1692 st1 {v1.d}[0], [x0], x1 1693 st1 {v1.d}[1], [x2], x1 1694 b.gt 4b 1695 ret 169680: 1697 add x2, x0, x1 1698 lsl x1, x1, #1 16998: 1700 ld1 {v2.16b, v3.16b}, [x3], #32 1701 subs w5, w5, #4 1702 add v2.16b, v2.16b, v2.16b 1703 add v3.16b, v3.16b, v3.16b 1704 zip1 v0.16b, v2.16b, v2.16b 1705 zip2 v1.16b, v2.16b, v2.16b 1706 zip1 v2.16b, v3.16b, v3.16b 1707 zip2 v3.16b, v3.16b, v3.16b 1708 add v0.8h, v0.8h, v31.8h 1709 add v1.8h, v1.8h, v31.8h 1710 add v2.8h, v2.8h, v31.8h 1711 add v3.8h, v3.8h, v31.8h 1712 tbl v0.16b, {v30.16b}, v0.16b 1713 tbl v1.16b, {v30.16b}, v1.16b 1714 st1 {v0.8h}, [x0], x1 1715 tbl v2.16b, {v30.16b}, v2.16b 1716 st1 {v1.8h}, [x2], x1 1717 tbl v3.16b, {v30.16b}, v3.16b 1718 st1 {v2.8h}, [x0], x1 1719 st1 {v3.8h}, [x2], x1 1720 b.gt 8b 1721 ret 1722160: 1723 add x2, x0, x1 1724 lsl x1, x1, #1 172516: 1726 ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 1727 subs w5, w5, #4 1728 add v4.16b, v4.16b, v4.16b 1729 add v5.16b, v5.16b, v5.16b 1730 add v6.16b, v6.16b, v6.16b 1731 add v7.16b, v7.16b, v7.16b 1732 zip1 v0.16b, v4.16b, v4.16b 1733 zip2 v1.16b, v4.16b, v4.16b 1734 zip1 v2.16b, v5.16b, v5.16b 1735 zip2 v3.16b, v5.16b, v5.16b 1736 zip1 v4.16b, v6.16b, v6.16b 1737 zip2 v5.16b, v6.16b, v6.16b 1738 zip1 v6.16b, v7.16b, v7.16b 1739 zip2 v7.16b, v7.16b, v7.16b 1740 add v0.8h, v0.8h, v31.8h 1741 add v1.8h, v1.8h, v31.8h 1742 add v2.8h, v2.8h, v31.8h 1743 add v3.8h, 
v3.8h, v31.8h 1744 add v4.8h, v4.8h, v31.8h 1745 tbl v0.16b, {v30.16b}, v0.16b 1746 add v5.8h, v5.8h, v31.8h 1747 tbl v1.16b, {v30.16b}, v1.16b 1748 add v6.8h, v6.8h, v31.8h 1749 tbl v2.16b, {v30.16b}, v2.16b 1750 add v7.8h, v7.8h, v31.8h 1751 tbl v3.16b, {v30.16b}, v3.16b 1752 tbl v4.16b, {v30.16b}, v4.16b 1753 tbl v5.16b, {v30.16b}, v5.16b 1754 st1 {v0.8h, v1.8h}, [x0], x1 1755 tbl v6.16b, {v30.16b}, v6.16b 1756 st1 {v2.8h, v3.8h}, [x2], x1 1757 tbl v7.16b, {v30.16b}, v7.16b 1758 st1 {v4.8h, v5.8h}, [x0], x1 1759 st1 {v6.8h, v7.8h}, [x2], x1 1760 b.gt 16b 1761 ret 1762320: 1763 add x2, x0, x1 1764 lsl x1, x1, #1 176532: 1766 ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 1767 subs w5, w5, #2 1768 add v4.16b, v4.16b, v4.16b 1769 add v5.16b, v5.16b, v5.16b 1770 add v6.16b, v6.16b, v6.16b 1771 add v7.16b, v7.16b, v7.16b 1772 zip1 v0.16b, v4.16b, v4.16b 1773 zip2 v1.16b, v4.16b, v4.16b 1774 zip1 v2.16b, v5.16b, v5.16b 1775 zip2 v3.16b, v5.16b, v5.16b 1776 zip1 v4.16b, v6.16b, v6.16b 1777 zip2 v5.16b, v6.16b, v6.16b 1778 zip1 v6.16b, v7.16b, v7.16b 1779 zip2 v7.16b, v7.16b, v7.16b 1780 add v0.8h, v0.8h, v31.8h 1781 add v1.8h, v1.8h, v31.8h 1782 add v2.8h, v2.8h, v31.8h 1783 add v3.8h, v3.8h, v31.8h 1784 add v4.8h, v4.8h, v31.8h 1785 tbl v0.16b, {v30.16b}, v0.16b 1786 add v5.8h, v5.8h, v31.8h 1787 tbl v1.16b, {v30.16b}, v1.16b 1788 add v6.8h, v6.8h, v31.8h 1789 tbl v2.16b, {v30.16b}, v2.16b 1790 add v7.8h, v7.8h, v31.8h 1791 tbl v3.16b, {v30.16b}, v3.16b 1792 tbl v4.16b, {v30.16b}, v4.16b 1793 tbl v5.16b, {v30.16b}, v5.16b 1794 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 1795 tbl v6.16b, {v30.16b}, v6.16b 1796 tbl v7.16b, {v30.16b}, v7.16b 1797 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1 1798 b.gt 32b 1799 ret 1800640: 1801 add x2, x0, #64 180264: 1803 ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 1804 subs w5, w5, #1 1805 add v4.16b, v4.16b, v4.16b 1806 add v5.16b, v5.16b, v5.16b 1807 add v6.16b, v6.16b, v6.16b 1808 add v7.16b, v7.16b, v7.16b 1809 zip1 v0.16b, 
v4.16b, v4.16b 1810 zip2 v1.16b, v4.16b, v4.16b 1811 zip1 v2.16b, v5.16b, v5.16b 1812 zip2 v3.16b, v5.16b, v5.16b 1813 zip1 v4.16b, v6.16b, v6.16b 1814 zip2 v5.16b, v6.16b, v6.16b 1815 zip1 v6.16b, v7.16b, v7.16b 1816 zip2 v7.16b, v7.16b, v7.16b 1817 add v0.8h, v0.8h, v31.8h 1818 add v1.8h, v1.8h, v31.8h 1819 add v2.8h, v2.8h, v31.8h 1820 add v3.8h, v3.8h, v31.8h 1821 add v4.8h, v4.8h, v31.8h 1822 tbl v0.16b, {v30.16b}, v0.16b 1823 add v5.8h, v5.8h, v31.8h 1824 tbl v1.16b, {v30.16b}, v1.16b 1825 add v6.8h, v6.8h, v31.8h 1826 tbl v2.16b, {v30.16b}, v2.16b 1827 add v7.8h, v7.8h, v31.8h 1828 tbl v3.16b, {v30.16b}, v3.16b 1829 tbl v4.16b, {v30.16b}, v4.16b 1830 tbl v5.16b, {v30.16b}, v5.16b 1831 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 1832 tbl v6.16b, {v30.16b}, v6.16b 1833 tbl v7.16b, {v30.16b}, v7.16b 1834 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1 1835 b.gt 64b 1836 ret 1837 1838L(pal_pred_tbl): 1839 .hword L(pal_pred_tbl) - 640b 1840 .hword L(pal_pred_tbl) - 320b 1841 .hword L(pal_pred_tbl) - 160b 1842 .hword L(pal_pred_tbl) - 80b 1843 .hword L(pal_pred_tbl) - 40b 1844endfunc 1845 1846// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, 1847// const pixel *const topleft, 1848// const int width, const int height, 1849// const int16_t *ac, const int alpha, 1850// const int bitdepth_max); 1851function ipred_cfl_128_16bpc_neon, export=1 1852 dup v31.8h, w7 // bitdepth_max 1853 clz w9, w3 1854 adr x7, L(ipred_cfl_128_tbl) 1855 sub w9, w9, #26 1856 ldrh w9, [x7, w9, uxtw #1] 1857 urshr v0.8h, v31.8h, #1 1858 dup v1.8h, w6 // alpha 1859 sub x7, x7, w9, uxtw 1860 add x6, x0, x1 1861 lsl x1, x1, #1 1862 movi v30.8h, #0 1863 br x7 1864L(ipred_cfl_splat_w4): 1865 ld1 {v4.8h, v5.8h}, [x5], #32 1866 subs w4, w4, #4 1867 smull v2.4s, v4.4h, v1.4h // diff = ac * alpha 1868 smull2 v3.4s, v4.8h, v1.8h 1869 smull v4.4s, v5.4h, v1.4h 1870 smull2 v5.4s, v5.8h, v1.8h 1871 sshr v16.4s, v2.4s, #31 // sign = diff >> 31 1872 sshr v17.4s, v3.4s, #31 1873 sshr v18.4s, 
v4.4s, #31 1874 sshr v19.4s, v5.4s, #31 1875 add v2.4s, v2.4s, v16.4s // diff + sign 1876 add v3.4s, v3.4s, v17.4s 1877 add v4.4s, v4.4s, v18.4s 1878 add v5.4s, v5.4s, v19.4s 1879 rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign() 1880 rshrn2 v2.8h, v3.4s, #6 1881 rshrn v3.4h, v4.4s, #6 1882 rshrn2 v3.8h, v5.4s, #6 1883 add v2.8h, v2.8h, v0.8h // dc + apply_sign() 1884 add v3.8h, v3.8h, v0.8h 1885 smax v2.8h, v2.8h, v30.8h 1886 smax v3.8h, v3.8h, v30.8h 1887 smin v2.8h, v2.8h, v31.8h 1888 smin v3.8h, v3.8h, v31.8h 1889 st1 {v2.d}[0], [x0], x1 1890 st1 {v2.d}[1], [x6], x1 1891 st1 {v3.d}[0], [x0], x1 1892 st1 {v3.d}[1], [x6], x1 1893 b.gt L(ipred_cfl_splat_w4) 1894 ret 1895L(ipred_cfl_splat_w8): 1896 ld1 {v4.8h, v5.8h}, [x5], #32 1897 subs w4, w4, #2 1898 smull v2.4s, v4.4h, v1.4h // diff = ac * alpha 1899 smull2 v3.4s, v4.8h, v1.8h 1900 smull v4.4s, v5.4h, v1.4h 1901 smull2 v5.4s, v5.8h, v1.8h 1902 sshr v16.4s, v2.4s, #31 // sign = diff >> 31 1903 sshr v17.4s, v3.4s, #31 1904 sshr v18.4s, v4.4s, #31 1905 sshr v19.4s, v5.4s, #31 1906 add v2.4s, v2.4s, v16.4s // diff + sign 1907 add v3.4s, v3.4s, v17.4s 1908 add v4.4s, v4.4s, v18.4s 1909 add v5.4s, v5.4s, v19.4s 1910 rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign() 1911 rshrn2 v2.8h, v3.4s, #6 1912 rshrn v3.4h, v4.4s, #6 1913 rshrn2 v3.8h, v5.4s, #6 1914 add v2.8h, v2.8h, v0.8h // dc + apply_sign() 1915 add v3.8h, v3.8h, v0.8h 1916 smax v2.8h, v2.8h, v30.8h 1917 smax v3.8h, v3.8h, v30.8h 1918 smin v2.8h, v2.8h, v31.8h 1919 smin v3.8h, v3.8h, v31.8h 1920 st1 {v2.8h}, [x0], x1 1921 st1 {v3.8h}, [x6], x1 1922 b.gt L(ipred_cfl_splat_w8) 1923 ret 1924L(ipred_cfl_splat_w16): 1925 add x7, x5, w3, uxtw #1 1926 sub x1, x1, w3, uxtw #1 1927 mov w9, w3 19281: 1929 ld1 {v2.8h, v3.8h}, [x5], #32 1930 ld1 {v4.8h, v5.8h}, [x7], #32 1931 subs w3, w3, #16 1932 smull v16.4s, v2.4h, v1.4h // diff = ac * alpha 1933 smull2 v17.4s, v2.8h, v1.8h 1934 smull v18.4s, v3.4h, v1.4h 1935 smull2 v19.4s, v3.8h, v1.8h 
1936 smull v2.4s, v4.4h, v1.4h 1937 smull2 v3.4s, v4.8h, v1.8h 1938 smull v4.4s, v5.4h, v1.4h 1939 smull2 v5.4s, v5.8h, v1.8h 1940 sshr v20.4s, v16.4s, #31 // sign = diff >> 31 1941 sshr v21.4s, v17.4s, #31 1942 sshr v22.4s, v18.4s, #31 1943 sshr v23.4s, v19.4s, #31 1944 sshr v24.4s, v2.4s, #31 1945 sshr v25.4s, v3.4s, #31 1946 sshr v26.4s, v4.4s, #31 1947 sshr v27.4s, v5.4s, #31 1948 add v16.4s, v16.4s, v20.4s // diff + sign 1949 add v17.4s, v17.4s, v21.4s 1950 add v18.4s, v18.4s, v22.4s 1951 add v19.4s, v19.4s, v23.4s 1952 add v2.4s, v2.4s, v24.4s 1953 add v3.4s, v3.4s, v25.4s 1954 add v4.4s, v4.4s, v26.4s 1955 add v5.4s, v5.4s, v27.4s 1956 rshrn v16.4h, v16.4s, #6 // (diff + sign + 32) >> 6 = apply_sign() 1957 rshrn2 v16.8h, v17.4s, #6 1958 rshrn v17.4h, v18.4s, #6 1959 rshrn2 v17.8h, v19.4s, #6 1960 rshrn v6.4h, v2.4s, #6 1961 rshrn2 v6.8h, v3.4s, #6 1962 rshrn v7.4h, v4.4s, #6 1963 rshrn2 v7.8h, v5.4s, #6 1964 add v2.8h, v16.8h, v0.8h // dc + apply_sign() 1965 add v3.8h, v17.8h, v0.8h 1966 add v4.8h, v6.8h, v0.8h 1967 add v5.8h, v7.8h, v0.8h 1968 smax v2.8h, v2.8h, v30.8h 1969 smax v3.8h, v3.8h, v30.8h 1970 smax v4.8h, v4.8h, v30.8h 1971 smax v5.8h, v5.8h, v30.8h 1972 smin v2.8h, v2.8h, v31.8h 1973 smin v3.8h, v3.8h, v31.8h 1974 smin v4.8h, v4.8h, v31.8h 1975 smin v5.8h, v5.8h, v31.8h 1976 st1 {v2.8h, v3.8h}, [x0], #32 1977 st1 {v4.8h, v5.8h}, [x6], #32 1978 b.gt 1b 1979 subs w4, w4, #2 1980 add x5, x5, w9, uxtw #1 1981 add x7, x7, w9, uxtw #1 1982 add x0, x0, x1 1983 add x6, x6, x1 1984 mov w3, w9 1985 b.gt 1b 1986 ret 1987 1988L(ipred_cfl_128_tbl): 1989L(ipred_cfl_splat_tbl): 1990 .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) 1991 .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) 1992 .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8) 1993 .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4) 1994endfunc 1995 1996// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, 1997// const pixel *const topleft, 1998// const int width, const int 
height, 1999// const int16_t *ac, const int alpha, 2000// const int bitdepth_max); 2001function ipred_cfl_top_16bpc_neon, export=1 2002 dup v31.8h, w7 // bitdepth_max 2003 clz w9, w3 2004 adr x7, L(ipred_cfl_top_tbl) 2005 sub w9, w9, #26 2006 ldrh w9, [x7, w9, uxtw #1] 2007 dup v1.8h, w6 // alpha 2008 add x2, x2, #2 2009 sub x7, x7, w9, uxtw 2010 add x6, x0, x1 2011 lsl x1, x1, #1 2012 movi v30.8h, #0 2013 br x7 20144: 2015 ld1 {v0.4h}, [x2] 2016 addv h0, v0.4h 2017 urshr v0.4h, v0.4h, #2 2018 dup v0.8h, v0.h[0] 2019 b L(ipred_cfl_splat_w4) 20208: 2021 ld1 {v0.8h}, [x2] 2022 addv h0, v0.8h 2023 urshr v0.4h, v0.4h, #3 2024 dup v0.8h, v0.h[0] 2025 b L(ipred_cfl_splat_w8) 202616: 2027 ld1 {v2.8h, v3.8h}, [x2] 2028 addp v0.8h, v2.8h, v3.8h 2029 addv h0, v0.8h 2030 urshr v0.4h, v0.4h, #4 2031 dup v0.8h, v0.h[0] 2032 b L(ipred_cfl_splat_w16) 203332: 2034 ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] 2035 addp v2.8h, v2.8h, v3.8h 2036 addp v4.8h, v4.8h, v5.8h 2037 addp v0.8h, v2.8h, v4.8h 2038 uaddlv s0, v0.8h 2039 rshrn v0.4h, v0.4s, #5 2040 dup v0.8h, v0.h[0] 2041 b L(ipred_cfl_splat_w16) 2042 2043L(ipred_cfl_top_tbl): 2044 .hword L(ipred_cfl_top_tbl) - 32b 2045 .hword L(ipred_cfl_top_tbl) - 16b 2046 .hword L(ipred_cfl_top_tbl) - 8b 2047 .hword L(ipred_cfl_top_tbl) - 4b 2048endfunc

// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const topleft,
//                                const int width, const int height,
//                                const int16_t *ac, const int alpha,
//                                const int bitdepth_max);
//
// CfL prediction with dc taken from the left edge only: dc is the rounded
// mean of the `height` pixels ending just before topleft. Control then
// continues in the shared splat code selected by width, which expects
// v0 = dc, v1 = alpha, v30 = 0, v31 = bitdepth_max, x6 = dst + stride and
// x1 = 2 * stride.
//
// Two jumps are prepared up front:
//   x7 = averaging code for this height (br x7 below)
//   x9 = splat code for this width     (br x9 at the end of each h* part)
function ipred_cfl_left_16bpc_neon, export=1
        dup             v31.8h,  w7               // bitdepth_max
        sub             x2,  x2,  w4, uxtw #1     // first of the `height` left pixels
        clz             w9,  w3
        clz             w8,  w4
        adr             x10, L(ipred_cfl_splat_tbl)
        adr             x7,  L(ipred_cfl_left_tbl)
        sub             w9,  w9,  #26             // width index:  0 for 32 ... 3 for 4
        sub             w8,  w8,  #26             // height index: 0 for 32 ... 3 for 4
        ldrh            w9,  [x10, w9, uxtw #1]
        ldrh            w8,  [x7, w8, uxtw #1]
        dup             v1.8h,   w6               // alpha
        sub             x9,  x10, w9, uxtw        // splat target
        sub             x7,  x7,  w8, uxtw        // averaging target
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        movi            v30.8h,  #0
        br              x7

L(ipred_cfl_left_h4):
        ld1             {v0.4h},  [x2]
        addv            h0,      v0.4h
        urshr           v0.4h,   v0.4h,   #2      // (sum + 2) >> 2
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h8):
        ld1             {v0.8h},  [x2]
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #3      // (sum + 4) >> 3
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h16):
        ld1             {v2.8h, v3.8h}, [x2]
        addp            v0.8h,   v2.8h,   v3.8h
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #4      // (sum + 8) >> 4
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h32):
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v0.8h,   v2.8h,   v4.8h
        uaddlv          s0,      v0.8h            // final reduction widened to 32 bit
        rshrn           v0.4h,   v0.4s,   #5      // (sum + 16) >> 5
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_tbl):
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
endfunc

2113// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride, 2114// const pixel *const topleft, 2115// const int width, const int height, 2116// const int16_t *ac, const int alpha, 2117// const int bitdepth_max); 2118function ipred_cfl_16bpc_neon, export=1 2119 dup v31.8h, w7 // bitdepth_max 2120 sub x2, x2, w4, uxtw #1 2121 add w8, w3, w4 // width + height 2122 dup v1.8h, w6 // alpha 2123 clz w9, w3 2124 clz w6, w4 2125 dup v16.4s, w8 // width + height 2126 adr x7, L(ipred_cfl_tbl) 2127 rbit w8, w8 // rbit(width + height) 2128 sub w9, w9, #22 // 26 leading bits, minus table offset 4 2129 sub w6, w6, #26 2130 clz w8, w8 // ctz(width + height) 2131 ldrh w9, [x7, w9, uxtw #1] 2132 ldrh w6, [x7, w6, uxtw #1] 2133 neg w8, w8 // -ctz(width + height) 2134 sub x9, x7, w9, uxtw 2135 sub x7, x7, w6, uxtw 2136 ushr v16.4s, v16.4s, #1 // (width + height) >> 1 2137 dup v17.4s, w8 // -ctz(width + height) 2138 add x6, x0, x1 2139 lsl x1, x1, #1 2140 movi v30.8h, #0 2141 br x7 2142 2143L(ipred_cfl_h4): 2144 ld1
{v0.4h}, [x2], #8 2145 uaddlv s0, v0.4h 2146 br x9 2147L(ipred_cfl_w4): 2148 add x2, x2, #2 2149 ld1 {v2.4h}, [x2] 2150 add v0.2s, v0.2s, v16.2s 2151 uaddlv s2, v2.4h 2152 cmp w4, #4 2153 add v0.2s, v0.2s, v2.2s 2154 ushl v0.2s, v0.2s, v17.2s 2155 b.eq 1f 2156 // h = 8/16 2157 cmp w4, #16 2158 mov w16, #0x6667 2159 mov w17, #0xAAAB 2160 csel w16, w16, w17, eq 2161 dup v16.2s, w16 2162 mul v0.2s, v0.2s, v16.2s 2163 ushr v0.2s, v0.2s, #17 21641: 2165 dup v0.8h, v0.h[0] 2166 b L(ipred_cfl_splat_w4) 2167 2168L(ipred_cfl_h8): 2169 ld1 {v0.8h}, [x2], #16 2170 uaddlv s0, v0.8h 2171 br x9 2172L(ipred_cfl_w8): 2173 add x2, x2, #2 2174 ld1 {v2.8h}, [x2] 2175 add v0.2s, v0.2s, v16.2s 2176 uaddlv s2, v2.8h 2177 cmp w4, #8 2178 add v0.2s, v0.2s, v2.2s 2179 ushl v0.2s, v0.2s, v17.2s 2180 b.eq 1f 2181 // h = 4/16/32 2182 cmp w4, #32 2183 mov w16, #0x6667 2184 mov w17, #0xAAAB 2185 csel w16, w16, w17, eq 2186 dup v16.2s, w16 2187 mul v0.2s, v0.2s, v16.2s 2188 ushr v0.2s, v0.2s, #17 21891: 2190 dup v0.8h, v0.h[0] 2191 b L(ipred_cfl_splat_w8) 2192 2193L(ipred_cfl_h16): 2194 ld1 {v2.8h, v3.8h}, [x2], #32 2195 addp v0.8h, v2.8h, v3.8h 2196 uaddlv s0, v0.8h 2197 br x9 2198L(ipred_cfl_w16): 2199 add x2, x2, #2 2200 ld1 {v2.8h, v3.8h}, [x2] 2201 add v0.2s, v0.2s, v16.2s 2202 addp v2.8h, v2.8h, v3.8h 2203 uaddlv s2, v2.8h 2204 cmp w4, #16 2205 add v0.2s, v0.2s, v2.2s 2206 ushl v0.2s, v0.2s, v17.2s 2207 b.eq 1f 2208 // h = 4/8/32 2209 tst w4, #(32+16+8) // 16 added to make a consecutive bitmask 2210 mov w16, #0x6667 2211 mov w17, #0xAAAB 2212 csel w16, w16, w17, eq 2213 dup v16.2s, w16 2214 mul v0.2s, v0.2s, v16.2s 2215 ushr v0.2s, v0.2s, #17 22161: 2217 dup v0.8h, v0.h[0] 2218 b L(ipred_cfl_splat_w16) 2219 2220L(ipred_cfl_h32): 2221 ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64 2222 addp v2.8h, v2.8h, v3.8h 2223 addp v4.8h, v4.8h, v5.8h 2224 addp v0.8h, v2.8h, v4.8h 2225 uaddlv s0, v0.8h 2226 br x9 2227L(ipred_cfl_w32): 2228 add x2, x2, #2 2229 ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] 2230 
add v0.4s, v0.4s, v16.4s 2231 addp v2.8h, v2.8h, v3.8h 2232 addp v4.8h, v4.8h, v5.8h 2233 addp v2.8h, v2.8h, v4.8h 2234 cmp w4, #32 2235 uaddlv s2, v2.8h 2236 add v0.2s, v0.2s, v2.2s 2237 ushl v0.2s, v0.2s, v17.2s 2238 b.eq 1f 2239 // h = 8/16 2240 cmp w4, #8 2241 mov w16, #0x6667 2242 mov w17, #0xAAAB 2243 csel w16, w16, w17, eq 2244 dup v16.2s, w16 2245 mul v0.2s, v0.2s, v16.2s 2246 ushr v0.2s, v0.2s, #17 22471: 2248 dup v0.8h, v0.h[0] 2249 b L(ipred_cfl_splat_w16) 2250 2251L(ipred_cfl_tbl): 2252 .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32) 2253 .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16) 2254 .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8) 2255 .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4) 2256 .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32) 2257 .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16) 2258 .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8) 2259 .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4) 2260endfunc 2261 2262// void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx, 2263// const ptrdiff_t stride, const int w_pad, 2264// const int h_pad, const int cw, const int ch); 2265function ipred_cfl_ac_420_16bpc_neon, export=1 2266 clz w8, w5 2267 lsl w4, w4, #2 2268 adr x7, L(ipred_cfl_ac_420_tbl) 2269 sub w8, w8, #27 2270 ldrh w8, [x7, w8, uxtw #1] 2271 movi v24.4s, #0 2272 movi v25.4s, #0 2273 movi v26.4s, #0 2274 movi v27.4s, #0 2275 sub x7, x7, w8, uxtw 2276 sub w8, w6, w4 // height - h_pad 2277 rbit w9, w5 // rbit(width) 2278 rbit w10, w6 // rbit(height) 2279 clz w9, w9 // ctz(width) 2280 clz w10, w10 // ctz(height) 2281 add w9, w9, w10 // log2sz 2282 add x10, x1, x2 2283 dup v31.4s, w9 2284 lsl x2, x2, #1 2285 neg v31.4s, v31.4s // -log2sz 2286 br x7 2287 2288L(ipred_cfl_ac_420_w4): 22891: // Copy and subsample input 2290 ld1 {v0.8h}, [x1], x2 2291 ld1 {v1.8h}, [x10], x2 2292 ld1 {v2.8h}, [x1], x2 2293 ld1 {v3.8h}, [x10], x2 2294 addp v0.8h, v0.8h, v2.8h 2295 addp v1.8h, v1.8h, v3.8h 2296 add v0.8h, v0.8h, v1.8h 2297 shl v0.8h, v0.8h, #1 2298 subs w8, w8, #2 2299 st1 {v0.8h}, 
[x0], #16 2300 uaddw v24.4s, v24.4s, v0.4h 2301 uaddw2 v25.4s, v25.4s, v0.8h 2302 b.gt 1b 2303 trn2 v1.2d, v0.2d, v0.2d 2304 trn2 v0.2d, v0.2d, v0.2d 2305L(ipred_cfl_ac_420_w4_hpad): 2306 cbz w4, 3f 23072: // Vertical padding (h_pad > 0) 2308 subs w4, w4, #4 2309 st1 {v0.8h, v1.8h}, [x0], #32 2310 uaddw v24.4s, v24.4s, v0.4h 2311 uaddw2 v25.4s, v25.4s, v0.8h 2312 uaddw v26.4s, v26.4s, v1.4h 2313 uaddw2 v27.4s, v27.4s, v1.8h 2314 b.gt 2b 23153: 2316L(ipred_cfl_ac_420_w4_calc_subtract_dc): 2317 // Aggregate the sums 2318 add v24.4s, v24.4s, v25.4s 2319 add v26.4s, v26.4s, v27.4s 2320 add v0.4s, v24.4s, v26.4s 2321 addv s0, v0.4s // sum 2322 sub x0, x0, w6, uxtw #3 2323 urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz 2324 dup v4.8h, v4.h[0] 23256: // Subtract dc from ac 2326 ld1 {v0.8h, v1.8h}, [x0] 2327 subs w6, w6, #4 2328 sub v0.8h, v0.8h, v4.8h 2329 sub v1.8h, v1.8h, v4.8h 2330 st1 {v0.8h, v1.8h}, [x0], #32 2331 b.gt 6b 2332 ret 2333 2334L(ipred_cfl_ac_420_w8): 2335 cbnz w3, L(ipred_cfl_ac_420_w8_wpad) 23361: // Copy and subsample input, without padding 2337 ld1 {v0.8h, v1.8h}, [x1], x2 2338 ld1 {v2.8h, v3.8h}, [x10], x2 2339 ld1 {v4.8h, v5.8h}, [x1], x2 2340 addp v0.8h, v0.8h, v1.8h 2341 ld1 {v6.8h, v7.8h}, [x10], x2 2342 addp v2.8h, v2.8h, v3.8h 2343 addp v4.8h, v4.8h, v5.8h 2344 addp v6.8h, v6.8h, v7.8h 2345 add v0.8h, v0.8h, v2.8h 2346 add v4.8h, v4.8h, v6.8h 2347 shl v0.8h, v0.8h, #1 2348 shl v1.8h, v4.8h, #1 2349 subs w8, w8, #2 2350 st1 {v0.8h, v1.8h}, [x0], #32 2351 uaddw v24.4s, v24.4s, v0.4h 2352 uaddw2 v25.4s, v25.4s, v0.8h 2353 uaddw v26.4s, v26.4s, v1.4h 2354 uaddw2 v27.4s, v27.4s, v1.8h 2355 b.gt 1b 2356 mov v0.16b, v1.16b 2357 b L(ipred_cfl_ac_420_w8_hpad) 2358 2359L(ipred_cfl_ac_420_w8_wpad): 23601: // Copy and subsample input, padding 4 2361 ld1 {v0.8h}, [x1], x2 2362 ld1 {v1.8h}, [x10], x2 2363 ld1 {v2.8h}, [x1], x2 2364 ld1 {v3.8h}, [x10], x2 2365 addp v0.8h, v0.8h, v2.8h 2366 addp v1.8h, v1.8h, v3.8h 2367 add v0.8h, v0.8h, 
v1.8h 2368 shl v0.8h, v0.8h, #1 2369 dup v1.4h, v0.h[3] 2370 dup v3.4h, v0.h[7] 2371 trn2 v2.2d, v0.2d, v0.2d 2372 subs w8, w8, #2 2373 st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 2374 uaddw v24.4s, v24.4s, v0.4h 2375 uaddw v25.4s, v25.4s, v1.4h 2376 uaddw v26.4s, v26.4s, v2.4h 2377 uaddw v27.4s, v27.4s, v3.4h 2378 b.gt 1b 2379 trn1 v0.2d, v2.2d, v3.2d 2380 trn1 v1.2d, v2.2d, v3.2d 2381 2382L(ipred_cfl_ac_420_w8_hpad): 2383 cbz w4, 3f 23842: // Vertical padding (h_pad > 0) 2385 subs w4, w4, #4 2386 st1 {v0.8h, v1.8h}, [x0], #32 2387 uaddw v24.4s, v24.4s, v0.4h 2388 uaddw2 v25.4s, v25.4s, v0.8h 2389 uaddw v26.4s, v26.4s, v1.4h 2390 uaddw2 v27.4s, v27.4s, v1.8h 2391 st1 {v0.8h, v1.8h}, [x0], #32 2392 uaddw v24.4s, v24.4s, v0.4h 2393 uaddw2 v25.4s, v25.4s, v0.8h 2394 uaddw v26.4s, v26.4s, v1.4h 2395 uaddw2 v27.4s, v27.4s, v1.8h 2396 b.gt 2b 23973: 2398 2399 // Double the height and reuse the w4 summing/subtracting 2400 lsl w6, w6, #1 2401 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) 2402 2403L(ipred_cfl_ac_420_w16): 2404 adr x7, L(ipred_cfl_ac_420_w16_tbl) 2405 ldrh w3, [x7, w3, uxtw #1] 2406 sub x7, x7, w3, uxtw 2407 br x7 2408 2409L(ipred_cfl_ac_420_w16_wpad0): 24101: // Copy and subsample input, without padding 2411 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 2412 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 2413 addp v0.8h, v0.8h, v1.8h 2414 addp v2.8h, v2.8h, v3.8h 2415 addp v4.8h, v4.8h, v5.8h 2416 addp v6.8h, v6.8h, v7.8h 2417 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], x2 2418 add v0.8h, v0.8h, v4.8h 2419 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2 2420 add v2.8h, v2.8h, v6.8h 2421 addp v16.8h, v16.8h, v17.8h 2422 addp v18.8h, v18.8h, v19.8h 2423 addp v20.8h, v20.8h, v21.8h 2424 addp v22.8h, v22.8h, v23.8h 2425 add v16.8h, v16.8h, v20.8h 2426 add v18.8h, v18.8h, v22.8h 2427 shl v0.8h, v0.8h, #1 2428 shl v1.8h, v2.8h, #1 2429 shl v2.8h, v16.8h, #1 2430 shl v3.8h, v18.8h, #1 2431 subs w8, w8, #2 2432 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2433 uaddw v24.4s, 
v24.4s, v0.4h 2434 uaddw2 v25.4s, v25.4s, v0.8h 2435 uaddw v26.4s, v26.4s, v1.4h 2436 uaddw2 v27.4s, v27.4s, v1.8h 2437 uaddw v24.4s, v24.4s, v2.4h 2438 uaddw2 v25.4s, v25.4s, v2.8h 2439 uaddw v26.4s, v26.4s, v3.4h 2440 uaddw2 v27.4s, v27.4s, v3.8h 2441 b.gt 1b 2442 mov v0.16b, v2.16b 2443 mov v1.16b, v3.16b 2444 b L(ipred_cfl_ac_420_w16_hpad) 2445 2446L(ipred_cfl_ac_420_w16_wpad1): 24471: // Copy and subsample input, padding 4 2448 ldr q2, [x1, #32] 2449 ld1 {v0.8h, v1.8h}, [x1], x2 2450 ldr q5, [x10, #32] 2451 ld1 {v3.8h, v4.8h}, [x10], x2 2452 addp v2.8h, v2.8h, v2.8h 2453 addp v0.8h, v0.8h, v1.8h 2454 addp v5.8h, v5.8h, v5.8h 2455 addp v3.8h, v3.8h, v4.8h 2456 ldr q18, [x1, #32] 2457 add v2.4h, v2.4h, v5.4h 2458 ld1 {v16.8h, v17.8h}, [x1], x2 2459 add v0.8h, v0.8h, v3.8h 2460 ldr q21, [x10, #32] 2461 ld1 {v19.8h, v20.8h}, [x10], x2 2462 addp v18.8h, v18.8h, v18.8h 2463 addp v16.8h, v16.8h, v17.8h 2464 addp v21.8h, v21.8h, v21.8h 2465 addp v19.8h, v19.8h, v20.8h 2466 add v18.4h, v18.4h, v21.4h 2467 add v16.8h, v16.8h, v19.8h 2468 shl v1.4h, v2.4h, #1 2469 shl v0.8h, v0.8h, #1 2470 shl v3.4h, v18.4h, #1 2471 shl v2.8h, v16.8h, #1 2472 dup v4.4h, v1.h[3] 2473 dup v5.4h, v3.h[3] 2474 trn1 v1.2d, v1.2d, v4.2d 2475 trn1 v3.2d, v3.2d, v5.2d 2476 subs w8, w8, #2 2477 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2478 uaddw v24.4s, v24.4s, v0.4h 2479 uaddw2 v25.4s, v25.4s, v0.8h 2480 uaddw v26.4s, v26.4s, v1.4h 2481 uaddw2 v27.4s, v27.4s, v1.8h 2482 uaddw v24.4s, v24.4s, v2.4h 2483 uaddw2 v25.4s, v25.4s, v2.8h 2484 uaddw v26.4s, v26.4s, v3.4h 2485 uaddw2 v27.4s, v27.4s, v3.8h 2486 b.gt 1b 2487 mov v0.16b, v2.16b 2488 mov v1.16b, v3.16b 2489 b L(ipred_cfl_ac_420_w16_hpad) 2490 2491L(ipred_cfl_ac_420_w16_wpad2): 24921: // Copy and subsample input, padding 8 2493 ld1 {v0.8h, v1.8h}, [x1], x2 2494 ld1 {v2.8h, v3.8h}, [x10], x2 2495 ld1 {v4.8h, v5.8h}, [x1], x2 2496 addp v0.8h, v0.8h, v1.8h 2497 ld1 {v6.8h, v7.8h}, [x10], x2 2498 addp v2.8h, v2.8h, v3.8h 2499 addp v4.8h, 
v4.8h, v5.8h 2500 addp v6.8h, v6.8h, v7.8h 2501 add v0.8h, v0.8h, v2.8h 2502 add v4.8h, v4.8h, v6.8h 2503 shl v0.8h, v0.8h, #1 2504 shl v2.8h, v4.8h, #1 2505 dup v1.8h, v0.h[7] 2506 dup v3.8h, v2.h[7] 2507 subs w8, w8, #2 2508 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2509 uaddw v24.4s, v24.4s, v0.4h 2510 uaddw2 v25.4s, v25.4s, v0.8h 2511 uaddw v26.4s, v26.4s, v1.4h 2512 uaddw2 v27.4s, v27.4s, v1.8h 2513 uaddw v24.4s, v24.4s, v2.4h 2514 uaddw2 v25.4s, v25.4s, v2.8h 2515 uaddw v26.4s, v26.4s, v3.4h 2516 uaddw2 v27.4s, v27.4s, v3.8h 2517 b.gt 1b 2518 mov v0.16b, v2.16b 2519 mov v1.16b, v3.16b 2520 b L(ipred_cfl_ac_420_w16_hpad) 2521 2522L(ipred_cfl_ac_420_w16_wpad3): 25231: // Copy and subsample input, padding 12 2524 ld1 {v0.8h}, [x1], x2 2525 ld1 {v2.8h}, [x10], x2 2526 ld1 {v4.8h}, [x1], x2 2527 ld1 {v6.8h}, [x10], x2 2528 addp v0.8h, v0.8h, v4.8h 2529 addp v2.8h, v2.8h, v6.8h 2530 add v0.8h, v0.8h, v2.8h 2531 shl v0.8h, v0.8h, #1 2532 dup v1.8h, v0.h[3] 2533 dup v3.8h, v0.h[7] 2534 trn2 v2.2d, v0.2d, v3.2d 2535 trn1 v0.2d, v0.2d, v1.2d 2536 subs w8, w8, #2 2537 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2538 uaddw v24.4s, v24.4s, v0.4h 2539 uaddw2 v25.4s, v25.4s, v0.8h 2540 uaddw v26.4s, v26.4s, v1.4h 2541 uaddw2 v27.4s, v27.4s, v1.8h 2542 uaddw v24.4s, v24.4s, v2.4h 2543 uaddw2 v25.4s, v25.4s, v2.8h 2544 uaddw v26.4s, v26.4s, v3.4h 2545 uaddw2 v27.4s, v27.4s, v3.8h 2546 b.gt 1b 2547 mov v0.16b, v2.16b 2548 mov v1.16b, v3.16b 2549 2550L(ipred_cfl_ac_420_w16_hpad): 2551 cbz w4, 3f 25522: // Vertical padding (h_pad > 0) 2553 subs w4, w4, #4 2554 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2555 uaddw v24.4s, v24.4s, v0.4h 2556 uaddw2 v25.4s, v25.4s, v0.8h 2557 uaddw v26.4s, v26.4s, v1.4h 2558 uaddw2 v27.4s, v27.4s, v1.8h 2559 uaddw v24.4s, v24.4s, v2.4h 2560 uaddw2 v25.4s, v25.4s, v2.8h 2561 uaddw v26.4s, v26.4s, v3.4h 2562 uaddw2 v27.4s, v27.4s, v3.8h 2563 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2564 uaddw v24.4s, v24.4s, v0.4h 2565 uaddw2 v25.4s, v25.4s, 
v0.8h 2566 uaddw v26.4s, v26.4s, v1.4h 2567 uaddw2 v27.4s, v27.4s, v1.8h 2568 uaddw v24.4s, v24.4s, v2.4h 2569 uaddw2 v25.4s, v25.4s, v2.8h 2570 uaddw v26.4s, v26.4s, v3.4h 2571 uaddw2 v27.4s, v27.4s, v3.8h 2572 b.gt 2b 25733: 2574 2575 // Quadruple the height and reuse the w4 summing/subtracting 2576 lsl w6, w6, #2 2577 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) 2578 2579L(ipred_cfl_ac_420_tbl): 2580 .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16) 2581 .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8) 2582 .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4) 2583 .hword 0 2584 2585L(ipred_cfl_ac_420_w16_tbl): 2586 .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0) 2587 .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1) 2588 .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2) 2589 .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3) 2590endfunc 2591 2592// void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx, 2593// const ptrdiff_t stride, const int w_pad, 2594// const int h_pad, const int cw, const int ch); 2595function ipred_cfl_ac_422_16bpc_neon, export=1 2596 clz w8, w5 2597 lsl w4, w4, #2 2598 adr x7, L(ipred_cfl_ac_422_tbl) 2599 sub w8, w8, #27 2600 ldrh w8, [x7, w8, uxtw #1] 2601 movi v24.4s, #0 2602 movi v25.4s, #0 2603 movi v26.4s, #0 2604 movi v27.4s, #0 2605 sub x7, x7, w8, uxtw 2606 sub w8, w6, w4 // height - h_pad 2607 rbit w9, w5 // rbit(width) 2608 rbit w10, w6 // rbit(height) 2609 clz w9, w9 // ctz(width) 2610 clz w10, w10 // ctz(height) 2611 add w9, w9, w10 // log2sz 2612 add x10, x1, x2 2613 dup v31.4s, w9 2614 lsl x2, x2, #1 2615 neg v31.4s, v31.4s // -log2sz 2616 br x7 2617 2618L(ipred_cfl_ac_422_w4): 26191: // Copy and subsample input 2620 ld1 {v0.8h}, [x1], x2 2621 ld1 {v1.8h}, [x10], x2 2622 ld1 {v2.8h}, [x1], x2 2623 ld1 {v3.8h}, [x10], x2 2624 addp v0.8h, v0.8h, v1.8h 2625 addp v2.8h, v2.8h, v3.8h 2626 shl v0.8h, v0.8h, #2 2627 shl v1.8h, v2.8h, #2 
2628 subs w8, w8, #4 2629 st1 {v0.8h, v1.8h}, [x0], #32 2630 uaddw v24.4s, v24.4s, v0.4h 2631 uaddw2 v25.4s, v25.4s, v0.8h 2632 uaddw v26.4s, v26.4s, v1.4h 2633 uaddw2 v27.4s, v27.4s, v1.8h 2634 b.gt 1b 2635 trn2 v0.2d, v1.2d, v1.2d 2636 trn2 v1.2d, v1.2d, v1.2d 2637 b L(ipred_cfl_ac_420_w4_hpad) 2638 2639L(ipred_cfl_ac_422_w8): 2640 cbnz w3, L(ipred_cfl_ac_422_w8_wpad) 26411: // Copy and subsample input, without padding 2642 ld1 {v0.8h, v1.8h}, [x1], x2 2643 ld1 {v2.8h, v3.8h}, [x10], x2 2644 ld1 {v4.8h, v5.8h}, [x1], x2 2645 addp v0.8h, v0.8h, v1.8h 2646 ld1 {v6.8h, v7.8h}, [x10], x2 2647 addp v2.8h, v2.8h, v3.8h 2648 addp v4.8h, v4.8h, v5.8h 2649 addp v6.8h, v6.8h, v7.8h 2650 shl v0.8h, v0.8h, #2 2651 shl v1.8h, v2.8h, #2 2652 shl v2.8h, v4.8h, #2 2653 shl v3.8h, v6.8h, #2 2654 subs w8, w8, #4 2655 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2656 uaddw v24.4s, v24.4s, v0.4h 2657 uaddw2 v25.4s, v25.4s, v0.8h 2658 uaddw v26.4s, v26.4s, v1.4h 2659 uaddw2 v27.4s, v27.4s, v1.8h 2660 uaddw v24.4s, v24.4s, v2.4h 2661 uaddw2 v25.4s, v25.4s, v2.8h 2662 uaddw v26.4s, v26.4s, v3.4h 2663 uaddw2 v27.4s, v27.4s, v3.8h 2664 b.gt 1b 2665 mov v0.16b, v3.16b 2666 mov v1.16b, v3.16b 2667 b L(ipred_cfl_ac_420_w8_hpad) 2668 2669L(ipred_cfl_ac_422_w8_wpad): 26701: // Copy and subsample input, padding 4 2671 ld1 {v0.8h}, [x1], x2 2672 ld1 {v1.8h}, [x10], x2 2673 ld1 {v2.8h}, [x1], x2 2674 ld1 {v3.8h}, [x10], x2 2675 addp v0.8h, v0.8h, v1.8h 2676 addp v2.8h, v2.8h, v3.8h 2677 shl v0.8h, v0.8h, #2 2678 shl v2.8h, v2.8h, #2 2679 dup v4.4h, v0.h[3] 2680 dup v5.8h, v0.h[7] 2681 dup v6.4h, v2.h[3] 2682 dup v7.8h, v2.h[7] 2683 trn2 v1.2d, v0.2d, v5.2d 2684 trn1 v0.2d, v0.2d, v4.2d 2685 trn2 v3.2d, v2.2d, v7.2d 2686 trn1 v2.2d, v2.2d, v6.2d 2687 subs w8, w8, #4 2688 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2689 uaddw v24.4s, v24.4s, v0.4h 2690 uaddw2 v25.4s, v25.4s, v0.8h 2691 uaddw v26.4s, v26.4s, v1.4h 2692 uaddw2 v27.4s, v27.4s, v1.8h 2693 uaddw v24.4s, v24.4s, v2.4h 2694 uaddw2 v25.4s, 
v25.4s, v2.8h 2695 uaddw v26.4s, v26.4s, v3.4h 2696 uaddw2 v27.4s, v27.4s, v3.8h 2697 b.gt 1b 2698 mov v0.16b, v3.16b 2699 mov v1.16b, v3.16b 2700 b L(ipred_cfl_ac_420_w8_hpad) 2701 2702L(ipred_cfl_ac_422_w16): 2703 adr x7, L(ipred_cfl_ac_422_w16_tbl) 2704 ldrh w3, [x7, w3, uxtw #1] 2705 sub x7, x7, w3, uxtw 2706 br x7 2707 2708L(ipred_cfl_ac_422_w16_wpad0): 27091: // Copy and subsample input, without padding 2710 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 2711 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 2712 addp v0.8h, v0.8h, v1.8h 2713 addp v2.8h, v2.8h, v3.8h 2714 addp v4.8h, v4.8h, v5.8h 2715 addp v6.8h, v6.8h, v7.8h 2716 shl v0.8h, v0.8h, #2 2717 shl v1.8h, v2.8h, #2 2718 shl v2.8h, v4.8h, #2 2719 shl v3.8h, v6.8h, #2 2720 subs w8, w8, #2 2721 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2722 uaddw v24.4s, v24.4s, v0.4h 2723 uaddw2 v25.4s, v25.4s, v0.8h 2724 uaddw v26.4s, v26.4s, v1.4h 2725 uaddw2 v27.4s, v27.4s, v1.8h 2726 uaddw v24.4s, v24.4s, v2.4h 2727 uaddw2 v25.4s, v25.4s, v2.8h 2728 uaddw v26.4s, v26.4s, v3.4h 2729 uaddw2 v27.4s, v27.4s, v3.8h 2730 b.gt 1b 2731 mov v0.16b, v2.16b 2732 mov v1.16b, v3.16b 2733 b L(ipred_cfl_ac_420_w16_hpad) 2734 2735L(ipred_cfl_ac_422_w16_wpad1): 27361: // Copy and subsample input, padding 4 2737 ldr q2, [x1, #32] 2738 ld1 {v0.8h, v1.8h}, [x1], x2 2739 ldr q6, [x10, #32] 2740 ld1 {v4.8h, v5.8h}, [x10], x2 2741 addp v2.8h, v2.8h, v2.8h 2742 addp v0.8h, v0.8h, v1.8h 2743 addp v6.8h, v6.8h, v6.8h 2744 addp v4.8h, v4.8h, v5.8h 2745 shl v1.4h, v2.4h, #2 2746 shl v0.8h, v0.8h, #2 2747 shl v3.4h, v6.4h, #2 2748 shl v2.8h, v4.8h, #2 2749 dup v4.4h, v1.h[3] 2750 dup v5.4h, v3.h[3] 2751 trn1 v1.2d, v1.2d, v4.2d 2752 trn1 v3.2d, v3.2d, v5.2d 2753 subs w8, w8, #2 2754 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2755 uaddw v24.4s, v24.4s, v0.4h 2756 uaddw2 v25.4s, v25.4s, v0.8h 2757 uaddw v26.4s, v26.4s, v1.4h 2758 uaddw2 v27.4s, v27.4s, v1.8h 2759 uaddw v24.4s, v24.4s, v2.4h 2760 uaddw2 v25.4s, v25.4s, v2.8h 2761 uaddw v26.4s, 
v26.4s, v3.4h 2762 uaddw2 v27.4s, v27.4s, v3.8h 2763 b.gt 1b 2764 mov v0.16b, v2.16b 2765 mov v1.16b, v3.16b 2766 b L(ipred_cfl_ac_420_w16_hpad) 2767 2768L(ipred_cfl_ac_422_w16_wpad2): 27691: // Copy and subsample input, padding 8 2770 ld1 {v0.8h, v1.8h}, [x1], x2 2771 ld1 {v2.8h, v3.8h}, [x10], x2 2772 addp v0.8h, v0.8h, v1.8h 2773 addp v2.8h, v2.8h, v3.8h 2774 shl v0.8h, v0.8h, #2 2775 shl v2.8h, v2.8h, #2 2776 dup v1.8h, v0.h[7] 2777 dup v3.8h, v2.h[7] 2778 subs w8, w8, #2 2779 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2780 uaddw v24.4s, v24.4s, v0.4h 2781 uaddw2 v25.4s, v25.4s, v0.8h 2782 uaddw v26.4s, v26.4s, v1.4h 2783 uaddw2 v27.4s, v27.4s, v1.8h 2784 uaddw v24.4s, v24.4s, v2.4h 2785 uaddw2 v25.4s, v25.4s, v2.8h 2786 uaddw v26.4s, v26.4s, v3.4h 2787 uaddw2 v27.4s, v27.4s, v3.8h 2788 b.gt 1b 2789 mov v0.16b, v2.16b 2790 mov v1.16b, v3.16b 2791 b L(ipred_cfl_ac_420_w16_hpad) 2792 2793L(ipred_cfl_ac_422_w16_wpad3): 27941: // Copy and subsample input, padding 12 2795 ld1 {v0.8h}, [x1], x2 2796 ld1 {v2.8h}, [x10], x2 2797 addp v0.8h, v0.8h, v0.8h 2798 addp v2.8h, v2.8h, v2.8h 2799 shl v0.4h, v0.4h, #2 2800 shl v2.4h, v2.4h, #2 2801 dup v1.8h, v0.h[3] 2802 dup v3.8h, v2.h[3] 2803 trn1 v0.2d, v0.2d, v1.2d 2804 trn1 v2.2d, v2.2d, v3.2d 2805 subs w8, w8, #2 2806 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2807 uaddw v24.4s, v24.4s, v0.4h 2808 uaddw2 v25.4s, v25.4s, v0.8h 2809 uaddw v26.4s, v26.4s, v1.4h 2810 uaddw2 v27.4s, v27.4s, v1.8h 2811 uaddw v24.4s, v24.4s, v2.4h 2812 uaddw2 v25.4s, v25.4s, v2.8h 2813 uaddw v26.4s, v26.4s, v3.4h 2814 uaddw2 v27.4s, v27.4s, v3.8h 2815 b.gt 1b 2816 mov v0.16b, v2.16b 2817 mov v1.16b, v3.16b 2818 b L(ipred_cfl_ac_420_w16_hpad) 2819 2820L(ipred_cfl_ac_422_tbl): 2821 .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16) 2822 .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8) 2823 .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4) 2824 .hword 0 2825 2826L(ipred_cfl_ac_422_w16_tbl): 2827 .hword 
L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0) 2828 .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1) 2829 .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2) 2830 .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3) 2831endfunc 2832 2833// void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx, 2834// const ptrdiff_t stride, const int w_pad, 2835// const int h_pad, const int cw, const int ch); 2836function ipred_cfl_ac_444_16bpc_neon, export=1 2837 clz w8, w5 2838 lsl w4, w4, #2 2839 adr x7, L(ipred_cfl_ac_444_tbl) 2840 sub w8, w8, #26 2841 ldrh w8, [x7, w8, uxtw #1] 2842 movi v24.4s, #0 2843 movi v25.4s, #0 2844 movi v26.4s, #0 2845 movi v27.4s, #0 2846 sub x7, x7, w8, uxtw 2847 sub w8, w6, w4 // height - h_pad 2848 rbit w9, w5 // rbit(width) 2849 rbit w10, w6 // rbit(height) 2850 clz w9, w9 // ctz(width) 2851 clz w10, w10 // ctz(height) 2852 add w9, w9, w10 // log2sz 2853 add x10, x1, x2 2854 dup v31.4s, w9 2855 lsl x2, x2, #1 2856 neg v31.4s, v31.4s // -log2sz 2857 br x7 2858 2859L(ipred_cfl_ac_444_w4): 28601: // Copy and expand input 2861 ld1 {v0.4h}, [x1], x2 2862 ld1 {v0.d}[1], [x10], x2 2863 ld1 {v1.4h}, [x1], x2 2864 ld1 {v1.d}[1], [x10], x2 2865 shl v0.8h, v0.8h, #3 2866 shl v1.8h, v1.8h, #3 2867 subs w8, w8, #4 2868 st1 {v0.8h, v1.8h}, [x0], #32 2869 uaddw v24.4s, v24.4s, v0.4h 2870 uaddw2 v25.4s, v25.4s, v0.8h 2871 uaddw v26.4s, v26.4s, v1.4h 2872 uaddw2 v27.4s, v27.4s, v1.8h 2873 b.gt 1b 2874 trn2 v0.2d, v1.2d, v1.2d 2875 trn2 v1.2d, v1.2d, v1.2d 2876 b L(ipred_cfl_ac_420_w4_hpad) 2877 2878L(ipred_cfl_ac_444_w8): 28791: // Copy and expand input 2880 ld1 {v0.8h}, [x1], x2 2881 ld1 {v1.8h}, [x10], x2 2882 ld1 {v2.8h}, [x1], x2 2883 shl v0.8h, v0.8h, #3 2884 ld1 {v3.8h}, [x10], x2 2885 shl v1.8h, v1.8h, #3 2886 shl v2.8h, v2.8h, #3 2887 shl v3.8h, v3.8h, #3 2888 subs w8, w8, #4 2889 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2890 uaddw v24.4s, v24.4s, v0.4h 2891 uaddw2 v25.4s, 
v25.4s, v0.8h 2892 uaddw v26.4s, v26.4s, v1.4h 2893 uaddw2 v27.4s, v27.4s, v1.8h 2894 uaddw v24.4s, v24.4s, v2.4h 2895 uaddw2 v25.4s, v25.4s, v2.8h 2896 uaddw v26.4s, v26.4s, v3.4h 2897 uaddw2 v27.4s, v27.4s, v3.8h 2898 b.gt 1b 2899 mov v0.16b, v3.16b 2900 mov v1.16b, v3.16b 2901 b L(ipred_cfl_ac_420_w8_hpad) 2902 2903L(ipred_cfl_ac_444_w16): 2904 cbnz w3, L(ipred_cfl_ac_444_w16_wpad) 29051: // Copy and expand input, without padding 2906 ld1 {v0.8h, v1.8h}, [x1], x2 2907 ld1 {v2.8h, v3.8h}, [x10], x2 2908 shl v0.8h, v0.8h, #3 2909 shl v1.8h, v1.8h, #3 2910 shl v2.8h, v2.8h, #3 2911 shl v3.8h, v3.8h, #3 2912 subs w8, w8, #2 2913 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2914 uaddw v24.4s, v24.4s, v0.4h 2915 uaddw2 v25.4s, v25.4s, v0.8h 2916 uaddw v26.4s, v26.4s, v1.4h 2917 uaddw2 v27.4s, v27.4s, v1.8h 2918 uaddw v24.4s, v24.4s, v2.4h 2919 uaddw2 v25.4s, v25.4s, v2.8h 2920 uaddw v26.4s, v26.4s, v3.4h 2921 uaddw2 v27.4s, v27.4s, v3.8h 2922 b.gt 1b 2923 mov v0.16b, v2.16b 2924 mov v1.16b, v3.16b 2925 b L(ipred_cfl_ac_420_w16_hpad) 2926 2927L(ipred_cfl_ac_444_w16_wpad): 29281: // Copy and expand input, padding 8 2929 ld1 {v0.8h}, [x1], x2 2930 ld1 {v2.8h}, [x10], x2 2931 shl v0.8h, v0.8h, #3 2932 shl v2.8h, v2.8h, #3 2933 dup v1.8h, v0.h[7] 2934 dup v3.8h, v2.h[7] 2935 subs w8, w8, #2 2936 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2937 uaddw v24.4s, v24.4s, v0.4h 2938 uaddw2 v25.4s, v25.4s, v0.8h 2939 uaddw v26.4s, v26.4s, v1.4h 2940 uaddw2 v27.4s, v27.4s, v1.8h 2941 uaddw v24.4s, v24.4s, v2.4h 2942 uaddw2 v25.4s, v25.4s, v2.8h 2943 uaddw v26.4s, v26.4s, v3.4h 2944 uaddw2 v27.4s, v27.4s, v3.8h 2945 b.gt 1b 2946 mov v0.16b, v2.16b 2947 mov v1.16b, v3.16b 2948 b L(ipred_cfl_ac_420_w16_hpad) 2949 2950L(ipred_cfl_ac_444_w32): 2951 adr x7, L(ipred_cfl_ac_444_w32_tbl) 2952 ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1 2953 lsr x2, x2, #1 // Restore the stride to one line increments 2954 sub x7, x7, w3, uxtw 2955 br x7 2956 2957L(ipred_cfl_ac_444_w32_wpad0): 29581: // Copy and 
expand input, without padding 2959 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 2960 shl v0.8h, v0.8h, #3 2961 shl v1.8h, v1.8h, #3 2962 shl v2.8h, v2.8h, #3 2963 shl v3.8h, v3.8h, #3 2964 subs w8, w8, #1 2965 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2966 uaddw v24.4s, v24.4s, v0.4h 2967 uaddw2 v25.4s, v25.4s, v0.8h 2968 uaddw v26.4s, v26.4s, v1.4h 2969 uaddw2 v27.4s, v27.4s, v1.8h 2970 uaddw v24.4s, v24.4s, v2.4h 2971 uaddw2 v25.4s, v25.4s, v2.8h 2972 uaddw v26.4s, v26.4s, v3.4h 2973 uaddw2 v27.4s, v27.4s, v3.8h 2974 b.gt 1b 2975 b L(ipred_cfl_ac_444_w32_hpad) 2976 2977L(ipred_cfl_ac_444_w32_wpad2): 29781: // Copy and expand input, padding 8 2979 ld1 {v0.8h, v1.8h, v2.8h}, [x1], x2 2980 shl v2.8h, v2.8h, #3 2981 shl v0.8h, v0.8h, #3 2982 shl v1.8h, v1.8h, #3 2983 dup v3.8h, v2.h[7] 2984 subs w8, w8, #1 2985 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2986 uaddw v24.4s, v24.4s, v0.4h 2987 uaddw2 v25.4s, v25.4s, v0.8h 2988 uaddw v26.4s, v26.4s, v1.4h 2989 uaddw2 v27.4s, v27.4s, v1.8h 2990 uaddw v24.4s, v24.4s, v2.4h 2991 uaddw2 v25.4s, v25.4s, v2.8h 2992 uaddw v26.4s, v26.4s, v3.4h 2993 uaddw2 v27.4s, v27.4s, v3.8h 2994 b.gt 1b 2995 b L(ipred_cfl_ac_444_w32_hpad) 2996 2997L(ipred_cfl_ac_444_w32_wpad4): 29981: // Copy and expand input, padding 16 2999 ld1 {v0.8h, v1.8h}, [x1], x2 3000 shl v1.8h, v1.8h, #3 3001 shl v0.8h, v0.8h, #3 3002 dup v2.8h, v1.h[7] 3003 dup v3.8h, v1.h[7] 3004 subs w8, w8, #1 3005 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 3006 uaddw v24.4s, v24.4s, v0.4h 3007 uaddw2 v25.4s, v25.4s, v0.8h 3008 uaddw v26.4s, v26.4s, v1.4h 3009 uaddw2 v27.4s, v27.4s, v1.8h 3010 uaddw v24.4s, v24.4s, v2.4h 3011 uaddw2 v25.4s, v25.4s, v2.8h 3012 uaddw v26.4s, v26.4s, v3.4h 3013 uaddw2 v27.4s, v27.4s, v3.8h 3014 b.gt 1b 3015 b L(ipred_cfl_ac_444_w32_hpad) 3016 3017L(ipred_cfl_ac_444_w32_wpad6): 30181: // Copy and expand input, padding 24 3019 ld1 {v0.8h}, [x1], x2 3020 shl v0.8h, v0.8h, #3 3021 dup v1.8h, v0.h[7] 3022 dup v2.8h, v0.h[7] 3023 dup v3.8h, v0.h[7] 
3024 subs w8, w8, #1 3025 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 3026 uaddw v24.4s, v24.4s, v0.4h 3027 uaddw2 v25.4s, v25.4s, v0.8h 3028 uaddw v26.4s, v26.4s, v1.4h 3029 uaddw2 v27.4s, v27.4s, v1.8h 3030 uaddw v24.4s, v24.4s, v2.4h 3031 uaddw2 v25.4s, v25.4s, v2.8h 3032 uaddw v26.4s, v26.4s, v3.4h 3033 uaddw2 v27.4s, v27.4s, v3.8h 3034 b.gt 1b 3035 3036L(ipred_cfl_ac_444_w32_hpad): 3037 cbz w4, 3f 30382: // Vertical padding (h_pad > 0) 3039 subs w4, w4, #2 3040 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 3041 uaddw v24.4s, v24.4s, v0.4h 3042 uaddw2 v25.4s, v25.4s, v0.8h 3043 uaddw v26.4s, v26.4s, v1.4h 3044 uaddw2 v27.4s, v27.4s, v1.8h 3045 uaddw v24.4s, v24.4s, v2.4h 3046 uaddw2 v25.4s, v25.4s, v2.8h 3047 uaddw v26.4s, v26.4s, v3.4h 3048 uaddw2 v27.4s, v27.4s, v3.8h 3049 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 3050 uaddw v24.4s, v24.4s, v0.4h 3051 uaddw2 v25.4s, v25.4s, v0.8h 3052 uaddw v26.4s, v26.4s, v1.4h 3053 uaddw2 v27.4s, v27.4s, v1.8h 3054 uaddw v24.4s, v24.4s, v2.4h 3055 uaddw2 v25.4s, v25.4s, v2.8h 3056 uaddw v26.4s, v26.4s, v3.4h 3057 uaddw2 v27.4s, v27.4s, v3.8h 3058 b.gt 2b 30593: 3060 3061 // Multiply the height by eight and reuse the w4 subtracting 3062 lsl w6, w6, #3 3063 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) 3064 3065L(ipred_cfl_ac_444_tbl): 3066 .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32) 3067 .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16) 3068 .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8) 3069 .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4) 3070 3071L(ipred_cfl_ac_444_w32_tbl): 3072 .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0) 3073 .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2) 3074 .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4) 3075 .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6) 3076endfunc 3077