1/* 2 * Copyright © 2018, VideoLAN and dav1d authors 3 * Copyright © 2019, Martin Storsjo 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright notice, this 10 * list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright notice, 13 * this list of conditions and the following disclaimer in the documentation 14 * and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 */ 27 28#include "src/arm/asm.S" 29#include "util.S" 30 31// void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, 32// const pixel *const topleft, 33// const int width, const int height, const int a, 34// const int max_width, const int max_height); 35function ipred_dc_128_8bpc_neon, export=1 36 clz w3, w3 37 adr x5, L(ipred_dc_128_tbl) 38 sub w3, w3, #25 39 ldrh w3, [x5, w3, uxtw #1] 40 movi v0.16b, #128 41 sub x5, x5, w3, uxtw 42 add x6, x0, x1 43 lsl x1, x1, #1 44 br x5 454: 46 st1 {v0.s}[0], [x0], x1 47 st1 {v0.s}[0], [x6], x1 48 subs w4, w4, #4 49 st1 {v0.s}[0], [x0], x1 50 st1 {v0.s}[0], [x6], x1 51 b.gt 4b 52 ret 538: 54 st1 {v0.8b}, [x0], x1 55 st1 {v0.8b}, [x6], x1 56 subs w4, w4, #4 57 st1 {v0.8b}, [x0], x1 58 st1 {v0.8b}, [x6], x1 59 b.gt 8b 60 ret 6116: 62 st1 {v0.16b}, [x0], x1 63 st1 {v0.16b}, [x6], x1 64 subs w4, w4, #4 65 st1 {v0.16b}, [x0], x1 66 st1 {v0.16b}, [x6], x1 67 b.gt 16b 68 ret 69320: 70 movi v1.16b, #128 7132: 72 st1 {v0.16b, v1.16b}, [x0], x1 73 st1 {v0.16b, v1.16b}, [x6], x1 74 subs w4, w4, #4 75 st1 {v0.16b, v1.16b}, [x0], x1 76 st1 {v0.16b, v1.16b}, [x6], x1 77 b.gt 32b 78 ret 79640: 80 movi v1.16b, #128 81 movi v2.16b, #128 82 movi v3.16b, #128 8364: 84 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 85 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 86 subs w4, w4, #4 87 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 88 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 89 b.gt 64b 90 ret 91 92L(ipred_dc_128_tbl): 93 .hword L(ipred_dc_128_tbl) - 640b 94 .hword L(ipred_dc_128_tbl) - 320b 95 .hword L(ipred_dc_128_tbl) - 16b 96 .hword L(ipred_dc_128_tbl) - 8b 97 .hword L(ipred_dc_128_tbl) - 4b 98endfunc 99 100// void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, 101// const pixel *const topleft, 102// const int width, const int height, const int a, 103// const int max_width, const int max_height); 104function ipred_v_8bpc_neon, export=1 105 clz w3, w3 106 adr x5, L(ipred_v_tbl) 107 sub w3, w3, #25 108 ldrh w3, [x5, w3, 
uxtw #1] 109 add x2, x2, #1 110 sub x5, x5, w3, uxtw 111 add x6, x0, x1 112 lsl x1, x1, #1 113 br x5 11440: 115 ld1 {v0.s}[0], [x2] 1164: 117 st1 {v0.s}[0], [x0], x1 118 st1 {v0.s}[0], [x6], x1 119 subs w4, w4, #4 120 st1 {v0.s}[0], [x0], x1 121 st1 {v0.s}[0], [x6], x1 122 b.gt 4b 123 ret 12480: 125 ld1 {v0.8b}, [x2] 1268: 127 st1 {v0.8b}, [x0], x1 128 st1 {v0.8b}, [x6], x1 129 subs w4, w4, #4 130 st1 {v0.8b}, [x0], x1 131 st1 {v0.8b}, [x6], x1 132 b.gt 8b 133 ret 134160: 135 ld1 {v0.16b}, [x2] 13616: 137 st1 {v0.16b}, [x0], x1 138 st1 {v0.16b}, [x6], x1 139 subs w4, w4, #4 140 st1 {v0.16b}, [x0], x1 141 st1 {v0.16b}, [x6], x1 142 b.gt 16b 143 ret 144320: 145 ld1 {v0.16b, v1.16b}, [x2] 14632: 147 st1 {v0.16b, v1.16b}, [x0], x1 148 st1 {v0.16b, v1.16b}, [x6], x1 149 subs w4, w4, #4 150 st1 {v0.16b, v1.16b}, [x0], x1 151 st1 {v0.16b, v1.16b}, [x6], x1 152 b.gt 32b 153 ret 154640: 155 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] 15664: 157 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 158 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 159 subs w4, w4, #4 160 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 161 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 162 b.gt 64b 163 ret 164 165L(ipred_v_tbl): 166 .hword L(ipred_v_tbl) - 640b 167 .hword L(ipred_v_tbl) - 320b 168 .hword L(ipred_v_tbl) - 160b 169 .hword L(ipred_v_tbl) - 80b 170 .hword L(ipred_v_tbl) - 40b 171endfunc 172 173// void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, 174// const pixel *const topleft, 175// const int width, const int height, const int a, 176// const int max_width, const int max_height); 177function ipred_h_8bpc_neon, export=1 178 clz w3, w3 179 adr x5, L(ipred_h_tbl) 180 sub w3, w3, #25 181 ldrh w3, [x5, w3, uxtw #1] 182 sub x2, x2, #4 183 sub x5, x5, w3, uxtw 184 mov x7, #-4 185 add x6, x0, x1 186 lsl x1, x1, #1 187 br x5 1884: 189 ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 190 st1 {v3.s}[0], [x0], x1 191 st1 {v2.s}[0], [x6], x1 192 subs w4, w4, #4 193 st1 {v1.s}[0], [x0], x1 194 
st1 {v0.s}[0], [x6], x1 195 b.gt 4b 196 ret 1978: 198 ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 199 st1 {v3.8b}, [x0], x1 200 st1 {v2.8b}, [x6], x1 201 subs w4, w4, #4 202 st1 {v1.8b}, [x0], x1 203 st1 {v0.8b}, [x6], x1 204 b.gt 8b 205 ret 20616: 207 ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 208 st1 {v3.16b}, [x0], x1 209 st1 {v2.16b}, [x6], x1 210 subs w4, w4, #4 211 st1 {v1.16b}, [x0], x1 212 st1 {v0.16b}, [x6], x1 213 b.gt 16b 214 ret 21532: 216 ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 217 str q3, [x0, #16] 218 str q2, [x6, #16] 219 st1 {v3.16b}, [x0], x1 220 st1 {v2.16b}, [x6], x1 221 subs w4, w4, #4 222 str q1, [x0, #16] 223 str q0, [x6, #16] 224 st1 {v1.16b}, [x0], x1 225 st1 {v0.16b}, [x6], x1 226 b.gt 32b 227 ret 22864: 229 ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 230 str q3, [x0, #16] 231 str q2, [x6, #16] 232 stp q3, q3, [x0, #32] 233 stp q2, q2, [x6, #32] 234 st1 {v3.16b}, [x0], x1 235 st1 {v2.16b}, [x6], x1 236 subs w4, w4, #4 237 str q1, [x0, #16] 238 str q0, [x6, #16] 239 stp q1, q1, [x0, #32] 240 stp q0, q0, [x6, #32] 241 st1 {v1.16b}, [x0], x1 242 st1 {v0.16b}, [x6], x1 243 b.gt 64b 244 ret 245 246L(ipred_h_tbl): 247 .hword L(ipred_h_tbl) - 64b 248 .hword L(ipred_h_tbl) - 32b 249 .hword L(ipred_h_tbl) - 16b 250 .hword L(ipred_h_tbl) - 8b 251 .hword L(ipred_h_tbl) - 4b 252endfunc 253 254// void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, 255// const pixel *const topleft, 256// const int width, const int height, const int a, 257// const int max_width, const int max_height); 258function ipred_dc_top_8bpc_neon, export=1 259 clz w3, w3 260 adr x5, L(ipred_dc_top_tbl) 261 sub w3, w3, #25 262 ldrh w3, [x5, w3, uxtw #1] 263 add x2, x2, #1 264 sub x5, x5, w3, uxtw 265 add x6, x0, x1 266 lsl x1, x1, #1 267 br x5 26840: 269 ld1r {v0.2s}, [x2] 270 uaddlv h0, v0.8b 271 rshrn v0.8b, v0.8h, #3 272 dup v0.8b, v0.b[0] 2734: 274 st1 {v0.s}[0], [x0], x1 275 st1 {v0.s}[0], [x6], x1 276 subs w4, w4, #4 277 st1 {v0.s}[0], [x0], x1 278 
st1 {v0.s}[0], [x6], x1 279 b.gt 4b 280 ret 28180: 282 ld1 {v0.8b}, [x2] 283 uaddlv h0, v0.8b 284 rshrn v0.8b, v0.8h, #3 285 dup v0.8b, v0.b[0] 2868: 287 st1 {v0.8b}, [x0], x1 288 st1 {v0.8b}, [x6], x1 289 subs w4, w4, #4 290 st1 {v0.8b}, [x0], x1 291 st1 {v0.8b}, [x6], x1 292 b.gt 8b 293 ret 294160: 295 ld1 {v0.16b}, [x2] 296 uaddlv h0, v0.16b 297 rshrn v0.8b, v0.8h, #4 298 dup v0.16b, v0.b[0] 29916: 300 st1 {v0.16b}, [x0], x1 301 st1 {v0.16b}, [x6], x1 302 subs w4, w4, #4 303 st1 {v0.16b}, [x0], x1 304 st1 {v0.16b}, [x6], x1 305 b.gt 16b 306 ret 307320: 308 ld1 {v0.16b, v1.16b}, [x2] 309 uaddlv h0, v0.16b 310 uaddlv h1, v1.16b 311 add v2.4h, v0.4h, v1.4h 312 rshrn v2.8b, v2.8h, #5 313 dup v0.16b, v2.b[0] 314 dup v1.16b, v2.b[0] 31532: 316 st1 {v0.16b, v1.16b}, [x0], x1 317 st1 {v0.16b, v1.16b}, [x6], x1 318 subs w4, w4, #4 319 st1 {v0.16b, v1.16b}, [x0], x1 320 st1 {v0.16b, v1.16b}, [x6], x1 321 b.gt 32b 322 ret 323640: 324 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] 325 uaddlv h0, v0.16b 326 uaddlv h1, v1.16b 327 uaddlv h2, v2.16b 328 uaddlv h3, v3.16b 329 add v4.4h, v0.4h, v1.4h 330 add v5.4h, v2.4h, v3.4h 331 add v4.4h, v4.4h, v5.4h 332 rshrn v4.8b, v4.8h, #6 333 dup v0.16b, v4.b[0] 334 dup v1.16b, v4.b[0] 335 dup v2.16b, v4.b[0] 336 dup v3.16b, v4.b[0] 33764: 338 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 339 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 340 subs w4, w4, #4 341 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 342 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 343 b.gt 64b 344 ret 345 346L(ipred_dc_top_tbl): 347 .hword L(ipred_dc_top_tbl) - 640b 348 .hword L(ipred_dc_top_tbl) - 320b 349 .hword L(ipred_dc_top_tbl) - 160b 350 .hword L(ipred_dc_top_tbl) - 80b 351 .hword L(ipred_dc_top_tbl) - 40b 352endfunc 353 354// void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, 355// const pixel *const topleft, 356// const int width, const int height, const int a, 357// const int max_width, const int max_height); 358function 
ipred_dc_left_8bpc_neon, export=1 359 sub x2, x2, w4, uxtw 360 clz w3, w3 361 clz w7, w4 362 adr x5, L(ipred_dc_left_tbl) 363 sub w3, w3, #20 // 25 leading bits, minus table offset 5 364 sub w7, w7, #25 365 ldrh w3, [x5, w3, uxtw #1] 366 ldrh w7, [x5, w7, uxtw #1] 367 sub x3, x5, w3, uxtw 368 sub x5, x5, w7, uxtw 369 add x6, x0, x1 370 lsl x1, x1, #1 371 br x5 372 373L(ipred_dc_left_h4): 374 ld1r {v0.2s}, [x2] 375 uaddlv h0, v0.8b 376 rshrn v0.8b, v0.8h, #3 377 dup v0.16b, v0.b[0] 378 br x3 379L(ipred_dc_left_w4): 380 st1 {v0.s}[0], [x0], x1 381 st1 {v0.s}[0], [x6], x1 382 subs w4, w4, #4 383 st1 {v0.s}[0], [x0], x1 384 st1 {v0.s}[0], [x6], x1 385 b.gt L(ipred_dc_left_w4) 386 ret 387 388L(ipred_dc_left_h8): 389 ld1 {v0.8b}, [x2] 390 uaddlv h0, v0.8b 391 rshrn v0.8b, v0.8h, #3 392 dup v0.16b, v0.b[0] 393 br x3 394L(ipred_dc_left_w8): 395 st1 {v0.8b}, [x0], x1 396 st1 {v0.8b}, [x6], x1 397 subs w4, w4, #4 398 st1 {v0.8b}, [x0], x1 399 st1 {v0.8b}, [x6], x1 400 b.gt L(ipred_dc_left_w8) 401 ret 402 403L(ipred_dc_left_h16): 404 ld1 {v0.16b}, [x2] 405 uaddlv h0, v0.16b 406 rshrn v0.8b, v0.8h, #4 407 dup v0.16b, v0.b[0] 408 br x3 409L(ipred_dc_left_w16): 410 st1 {v0.16b}, [x0], x1 411 st1 {v0.16b}, [x6], x1 412 subs w4, w4, #4 413 st1 {v0.16b}, [x0], x1 414 st1 {v0.16b}, [x6], x1 415 b.gt L(ipred_dc_left_w16) 416 ret 417 418L(ipred_dc_left_h32): 419 ld1 {v0.16b, v1.16b}, [x2] 420 uaddlv h0, v0.16b 421 uaddlv h1, v1.16b 422 add v0.4h, v0.4h, v1.4h 423 rshrn v0.8b, v0.8h, #5 424 dup v0.16b, v0.b[0] 425 br x3 426L(ipred_dc_left_w32): 427 mov v1.16b, v0.16b 4281: 429 st1 {v0.16b, v1.16b}, [x0], x1 430 st1 {v0.16b, v1.16b}, [x6], x1 431 subs w4, w4, #4 432 st1 {v0.16b, v1.16b}, [x0], x1 433 st1 {v0.16b, v1.16b}, [x6], x1 434 b.gt 1b 435 ret 436 437L(ipred_dc_left_h64): 438 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] 439 uaddlv h0, v0.16b 440 uaddlv h1, v1.16b 441 uaddlv h2, v2.16b 442 uaddlv h3, v3.16b 443 add v0.4h, v0.4h, v1.4h 444 add v2.4h, v2.4h, v3.4h 445 add v0.4h, 
v0.4h, v2.4h 446 rshrn v0.8b, v0.8h, #6 447 dup v0.16b, v0.b[0] 448 br x3 449L(ipred_dc_left_w64): 450 mov v1.16b, v0.16b 451 mov v2.16b, v0.16b 452 mov v3.16b, v0.16b 4531: 454 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 455 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 456 subs w4, w4, #4 457 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 458 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 459 b.gt 1b 460 ret 461 462L(ipred_dc_left_tbl): 463 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64) 464 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32) 465 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16) 466 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8) 467 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4) 468 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64) 469 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32) 470 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16) 471 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8) 472 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4) 473endfunc 474 475// void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride, 476// const pixel *const topleft, 477// const int width, const int height, const int a, 478// const int max_width, const int max_height); 479function ipred_dc_8bpc_neon, export=1 480 sub x2, x2, w4, uxtw 481 add w7, w3, w4 // width + height 482 clz w3, w3 483 clz w6, w4 484 dup v16.8h, w7 // width + height 485 adr x5, L(ipred_dc_tbl) 486 rbit w7, w7 // rbit(width + height) 487 sub w3, w3, #20 // 25 leading bits, minus table offset 5 488 sub w6, w6, #25 489 clz w7, w7 // ctz(width + height) 490 ldrh w3, [x5, w3, uxtw #1] 491 ldrh w6, [x5, w6, uxtw #1] 492 neg w7, w7 // -ctz(width + height) 493 sub x3, x5, w3, uxtw 494 sub x5, x5, w6, uxtw 495 ushr v16.8h, v16.8h, #1 // (width + height) >> 1 496 dup v17.8h, w7 // -ctz(width + height) 497 add x6, x0, x1 498 lsl x1, x1, #1 499 br x5 500 501L(ipred_dc_h4): 502 ld1 {v0.s}[0], [x2], #4 503 ins v0.s[1], wzr 504 uaddlv h0, v0.8b 505 br x3 506L(ipred_dc_w4): 507 add x2, 
x2, #1 508 ld1 {v1.s}[0], [x2] 509 ins v1.s[1], wzr 510 add v0.4h, v0.4h, v16.4h 511 uaddlv h1, v1.8b 512 cmp w4, #4 513 add v0.4h, v0.4h, v1.4h 514 ushl v0.4h, v0.4h, v17.4h 515 b.eq 1f 516 // h = 8/16 517 mov w16, #(0x3334/2) 518 movk w16, #(0x5556/2), lsl #16 519 add w17, w4, w4 // w17 = 2*h = 16 or 32 520 lsr w16, w16, w17 521 dup v16.4h, w16 522 sqdmulh v0.4h, v0.4h, v16.4h 5231: 524 dup v0.8b, v0.b[0] 5252: 526 st1 {v0.s}[0], [x0], x1 527 st1 {v0.s}[0], [x6], x1 528 subs w4, w4, #4 529 st1 {v0.s}[0], [x0], x1 530 st1 {v0.s}[0], [x6], x1 531 b.gt 2b 532 ret 533 534L(ipred_dc_h8): 535 ld1 {v0.8b}, [x2], #8 536 uaddlv h0, v0.8b 537 br x3 538L(ipred_dc_w8): 539 add x2, x2, #1 540 ld1 {v1.8b}, [x2] 541 add v0.4h, v0.4h, v16.4h 542 uaddlv h1, v1.8b 543 cmp w4, #8 544 add v0.4h, v0.4h, v1.4h 545 ushl v0.4h, v0.4h, v17.4h 546 b.eq 1f 547 // h = 4/16/32 548 cmp w4, #32 549 mov w16, #(0x3334/2) 550 mov w17, #(0x5556/2) 551 csel w16, w16, w17, eq 552 dup v16.4h, w16 553 sqdmulh v0.4h, v0.4h, v16.4h 5541: 555 dup v0.8b, v0.b[0] 5562: 557 st1 {v0.8b}, [x0], x1 558 st1 {v0.8b}, [x6], x1 559 subs w4, w4, #4 560 st1 {v0.8b}, [x0], x1 561 st1 {v0.8b}, [x6], x1 562 b.gt 2b 563 ret 564 565L(ipred_dc_h16): 566 ld1 {v0.16b}, [x2], #16 567 uaddlv h0, v0.16b 568 br x3 569L(ipred_dc_w16): 570 add x2, x2, #1 571 ld1 {v1.16b}, [x2] 572 add v0.4h, v0.4h, v16.4h 573 uaddlv h1, v1.16b 574 cmp w4, #16 575 add v0.4h, v0.4h, v1.4h 576 ushl v0.4h, v0.4h, v17.4h 577 b.eq 1f 578 // h = 4/8/32/64 579 tst w4, #(32+16+8) // 16 added to make a consecutive bitmask 580 mov w16, #(0x3334/2) 581 mov w17, #(0x5556/2) 582 csel w16, w16, w17, eq 583 dup v16.4h, w16 584 sqdmulh v0.4h, v0.4h, v16.4h 5851: 586 dup v0.16b, v0.b[0] 5872: 588 st1 {v0.16b}, [x0], x1 589 st1 {v0.16b}, [x6], x1 590 subs w4, w4, #4 591 st1 {v0.16b}, [x0], x1 592 st1 {v0.16b}, [x6], x1 593 b.gt 2b 594 ret 595 596L(ipred_dc_h32): 597 ld1 {v0.16b, v1.16b}, [x2], #32 598 uaddlv h0, v0.16b 599 uaddlv h1, v1.16b 600 add v0.4h, v0.4h, 
v1.4h 601 br x3 602L(ipred_dc_w32): 603 add x2, x2, #1 604 ld1 {v1.16b, v2.16b}, [x2] 605 add v0.4h, v0.4h, v16.4h 606 uaddlv h1, v1.16b 607 uaddlv h2, v2.16b 608 cmp w4, #32 609 add v0.4h, v0.4h, v1.4h 610 add v0.4h, v0.4h, v2.4h 611 ushl v4.4h, v0.4h, v17.4h 612 b.eq 1f 613 // h = 8/16/64 614 cmp w4, #8 615 mov w16, #(0x3334/2) 616 mov w17, #(0x5556/2) 617 csel w16, w16, w17, eq 618 dup v16.4h, w16 619 sqdmulh v4.4h, v4.4h, v16.4h 6201: 621 dup v0.16b, v4.b[0] 622 dup v1.16b, v4.b[0] 6232: 624 st1 {v0.16b, v1.16b}, [x0], x1 625 st1 {v0.16b, v1.16b}, [x6], x1 626 subs w4, w4, #4 627 st1 {v0.16b, v1.16b}, [x0], x1 628 st1 {v0.16b, v1.16b}, [x6], x1 629 b.gt 2b 630 ret 631 632L(ipred_dc_h64): 633 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 634 uaddlv h0, v0.16b 635 uaddlv h1, v1.16b 636 uaddlv h2, v2.16b 637 uaddlv h3, v3.16b 638 add v0.4h, v0.4h, v1.4h 639 add v2.4h, v2.4h, v3.4h 640 add v0.4h, v0.4h, v2.4h 641 br x3 642L(ipred_dc_w64): 643 add x2, x2, #1 644 ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x2] 645 add v0.4h, v0.4h, v16.4h 646 uaddlv h1, v1.16b 647 uaddlv h2, v2.16b 648 uaddlv h3, v3.16b 649 uaddlv h4, v4.16b 650 add v1.4h, v1.4h, v2.4h 651 add v3.4h, v3.4h, v4.4h 652 cmp w4, #64 653 add v0.4h, v0.4h, v1.4h 654 add v0.4h, v0.4h, v3.4h 655 ushl v4.4h, v0.4h, v17.4h 656 b.eq 1f 657 // h = 16/32 658 mov w16, #(0x5556/2) 659 movk w16, #(0x3334/2), lsl #16 660 lsr w16, w16, w4 661 dup v16.4h, w16 662 sqdmulh v4.4h, v4.4h, v16.4h 6631: 664 dup v0.16b, v4.b[0] 665 dup v1.16b, v4.b[0] 666 dup v2.16b, v4.b[0] 667 dup v3.16b, v4.b[0] 6682: 669 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 670 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 671 subs w4, w4, #4 672 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 673 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 674 b.gt 2b 675 ret 676 677L(ipred_dc_tbl): 678 .hword L(ipred_dc_tbl) - L(ipred_dc_h64) 679 .hword L(ipred_dc_tbl) - L(ipred_dc_h32) 680 .hword L(ipred_dc_tbl) - L(ipred_dc_h16) 681 .hword L(ipred_dc_tbl) - 
L(ipred_dc_h8) 682 .hword L(ipred_dc_tbl) - L(ipred_dc_h4) 683 .hword L(ipred_dc_tbl) - L(ipred_dc_w64) 684 .hword L(ipred_dc_tbl) - L(ipred_dc_w32) 685 .hword L(ipred_dc_tbl) - L(ipred_dc_w16) 686 .hword L(ipred_dc_tbl) - L(ipred_dc_w8) 687 .hword L(ipred_dc_tbl) - L(ipred_dc_w4) 688endfunc 689 690// void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride, 691// const pixel *const topleft, 692// const int width, const int height, const int a, 693// const int max_width, const int max_height); 694function ipred_paeth_8bpc_neon, export=1 695 clz w9, w3 696 adr x5, L(ipred_paeth_tbl) 697 sub w9, w9, #25 698 ldrh w9, [x5, w9, uxtw #1] 699 ld1r {v4.16b}, [x2] 700 add x8, x2, #1 701 sub x2, x2, #4 702 sub x5, x5, w9, uxtw 703 mov x7, #-4 704 add x6, x0, x1 705 lsl x1, x1, #1 706 br x5 70740: 708 ld1r {v5.4s}, [x8] 709 usubl v6.8h, v5.8b, v4.8b // top - topleft 7104: 711 ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 712 zip1 v0.2s, v0.2s, v1.2s 713 zip1 v2.2s, v2.2s, v3.2s 714 uaddw v16.8h, v6.8h, v0.8b 715 uaddw v17.8h, v6.8h, v2.8b 716 sqxtun v16.8b, v16.8h // base 717 sqxtun2 v16.16b, v17.8h 718 zip1 v0.2d, v0.2d, v2.2d 719 uabd v20.16b, v5.16b, v16.16b // tdiff 720 uabd v22.16b, v4.16b, v16.16b // tldiff 721 uabd v16.16b, v0.16b, v16.16b // ldiff 722 umin v18.16b, v20.16b, v22.16b // min(tdiff, tldiff) 723 cmhs v20.16b, v22.16b, v20.16b // tldiff >= tdiff 724 cmhs v16.16b, v18.16b, v16.16b // min(tdiff, tldiff) >= ldiff 725 bsl v20.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft 726 bit v20.16b, v0.16b, v16.16b // ldiff <= min ? left : ... 
727 st1 {v20.s}[3], [x0], x1 728 st1 {v20.s}[2], [x6], x1 729 subs w4, w4, #4 730 st1 {v20.s}[1], [x0], x1 731 st1 {v20.s}[0], [x6], x1 732 b.gt 4b 733 ret 73480: 735 ld1r {v5.2d}, [x8] 736 usubl v6.8h, v5.8b, v4.8b // top - topleft 7378: 738 ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 739 uaddw v16.8h, v6.8h, v0.8b 740 uaddw v17.8h, v6.8h, v1.8b 741 uaddw v18.8h, v6.8h, v2.8b 742 uaddw v19.8h, v6.8h, v3.8b 743 sqxtun v16.8b, v16.8h // base 744 sqxtun2 v16.16b, v17.8h 745 sqxtun v18.8b, v18.8h 746 sqxtun2 v18.16b, v19.8h 747 zip1 v2.2d, v2.2d, v3.2d 748 zip1 v0.2d, v0.2d, v1.2d 749 uabd v21.16b, v5.16b, v18.16b // tdiff 750 uabd v20.16b, v5.16b, v16.16b 751 uabd v23.16b, v4.16b, v18.16b // tldiff 752 uabd v22.16b, v4.16b, v16.16b 753 uabd v17.16b, v2.16b, v18.16b // ldiff 754 uabd v16.16b, v0.16b, v16.16b 755 umin v19.16b, v21.16b, v23.16b // min(tdiff, tldiff) 756 umin v18.16b, v20.16b, v22.16b 757 cmhs v21.16b, v23.16b, v21.16b // tldiff >= tdiff 758 cmhs v20.16b, v22.16b, v20.16b 759 cmhs v17.16b, v19.16b, v17.16b // min(tdiff, tldiff) >= ldiff 760 cmhs v16.16b, v18.16b, v16.16b 761 bsl v21.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft 762 bsl v20.16b, v5.16b, v4.16b 763 bit v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ... 
764 bit v20.16b, v0.16b, v16.16b 765 st1 {v21.d}[1], [x0], x1 766 st1 {v21.d}[0], [x6], x1 767 subs w4, w4, #4 768 st1 {v20.d}[1], [x0], x1 769 st1 {v20.d}[0], [x6], x1 770 b.gt 8b 771 ret 772160: 773320: 774640: 775 ld1 {v5.16b}, [x8], #16 776 mov w9, w3 777 // Set up pointers for four rows in parallel; x0, x6, x5, x10 778 add x5, x0, x1 779 add x10, x6, x1 780 lsl x1, x1, #1 781 sub x1, x1, w3, uxtw 7821: 783 ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 7842: 785 usubl v6.8h, v5.8b, v4.8b // top - topleft 786 usubl2 v7.8h, v5.16b, v4.16b 787 uaddw v24.8h, v6.8h, v0.8b 788 uaddw v25.8h, v7.8h, v0.8b 789 uaddw v26.8h, v6.8h, v1.8b 790 uaddw v27.8h, v7.8h, v1.8b 791 uaddw v28.8h, v6.8h, v2.8b 792 uaddw v29.8h, v7.8h, v2.8b 793 uaddw v30.8h, v6.8h, v3.8b 794 uaddw v31.8h, v7.8h, v3.8b 795 sqxtun v17.8b, v26.8h // base 796 sqxtun2 v17.16b, v27.8h 797 sqxtun v16.8b, v24.8h 798 sqxtun2 v16.16b, v25.8h 799 sqxtun v19.8b, v30.8h 800 sqxtun2 v19.16b, v31.8h 801 sqxtun v18.8b, v28.8h 802 sqxtun2 v18.16b, v29.8h 803 uabd v23.16b, v5.16b, v19.16b // tdiff 804 uabd v22.16b, v5.16b, v18.16b 805 uabd v21.16b, v5.16b, v17.16b 806 uabd v20.16b, v5.16b, v16.16b 807 uabd v27.16b, v4.16b, v19.16b // tldiff 808 uabd v26.16b, v4.16b, v18.16b 809 uabd v25.16b, v4.16b, v17.16b 810 uabd v24.16b, v4.16b, v16.16b 811 uabd v19.16b, v3.16b, v19.16b // ldiff 812 uabd v18.16b, v2.16b, v18.16b 813 uabd v17.16b, v1.16b, v17.16b 814 uabd v16.16b, v0.16b, v16.16b 815 umin v31.16b, v23.16b, v27.16b // min(tdiff, tldiff) 816 umin v30.16b, v22.16b, v26.16b 817 umin v29.16b, v21.16b, v25.16b 818 umin v28.16b, v20.16b, v24.16b 819 cmhs v23.16b, v27.16b, v23.16b // tldiff >= tdiff 820 cmhs v22.16b, v26.16b, v22.16b 821 cmhs v21.16b, v25.16b, v21.16b 822 cmhs v20.16b, v24.16b, v20.16b 823 cmhs v19.16b, v31.16b, v19.16b // min(tdiff, tldiff) >= ldiff 824 cmhs v18.16b, v30.16b, v18.16b 825 cmhs v17.16b, v29.16b, v17.16b 826 cmhs v16.16b, v28.16b, v16.16b 827 bsl v23.16b, v5.16b, v4.16b // tdiff <= tldiff 
? top : topleft 828 bsl v22.16b, v5.16b, v4.16b 829 bsl v21.16b, v5.16b, v4.16b 830 bsl v20.16b, v5.16b, v4.16b 831 bit v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ... 832 bit v22.16b, v2.16b, v18.16b 833 bit v21.16b, v1.16b, v17.16b 834 bit v20.16b, v0.16b, v16.16b 835 subs w3, w3, #16 836 st1 {v23.16b}, [x0], #16 837 st1 {v22.16b}, [x6], #16 838 st1 {v21.16b}, [x5], #16 839 st1 {v20.16b}, [x10], #16 840 b.le 8f 841 ld1 {v5.16b}, [x8], #16 842 b 2b 8438: 844 subs w4, w4, #4 845 b.le 9f 846 // End of horizontal loop, move pointers to next four rows 847 sub x8, x8, w9, uxtw 848 add x0, x0, x1 849 add x6, x6, x1 850 // Load the top row as early as possible 851 ld1 {v5.16b}, [x8], #16 852 add x5, x5, x1 853 add x10, x10, x1 854 mov w3, w9 855 b 1b 8569: 857 ret 858 859L(ipred_paeth_tbl): 860 .hword L(ipred_paeth_tbl) - 640b 861 .hword L(ipred_paeth_tbl) - 320b 862 .hword L(ipred_paeth_tbl) - 160b 863 .hword L(ipred_paeth_tbl) - 80b 864 .hword L(ipred_paeth_tbl) - 40b 865endfunc 866 867// void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride, 868// const pixel *const topleft, 869// const int width, const int height, const int a, 870// const int max_width, const int max_height); 871function ipred_smooth_8bpc_neon, export=1 872 movrel x10, X(sm_weights) 873 add x11, x10, w4, uxtw 874 add x10, x10, w3, uxtw 875 clz w9, w3 876 adr x5, L(ipred_smooth_tbl) 877 sub x12, x2, w4, uxtw 878 sub w9, w9, #25 879 ldrh w9, [x5, w9, uxtw #1] 880 ld1r {v4.16b}, [x12] // bottom 881 add x8, x2, #1 882 sub x5, x5, w9, uxtw 883 add x6, x0, x1 884 lsl x1, x1, #1 885 br x5 88640: 887 ld1r {v6.2s}, [x8] // top 888 ld1r {v7.2s}, [x10] // weights_hor 889 sub x2, x2, #4 890 mov x7, #-4 891 dup v5.16b, v6.b[3] // right 892 usubl v6.8h, v6.8b, v4.8b // top-bottom 893 uxtl v7.8h, v7.8b // weights_hor 8944: 895 ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left 896 ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver 897 shll v20.8h, v5.8b, #8 // right*256 898 shll v21.8h, 
v5.8b, #8 899 zip1 v1.2s, v1.2s, v0.2s // left, flipped 900 zip1 v0.2s, v3.2s, v2.2s 901 zip1 v16.2s, v16.2s, v17.2s // weights_ver 902 zip1 v18.2s, v18.2s, v19.2s 903 shll v22.8h, v4.8b, #8 // bottom*256 904 shll v23.8h, v4.8b, #8 905 usubl v0.8h, v0.8b, v5.8b // left-right 906 usubl v1.8h, v1.8b, v5.8b 907 uxtl v16.8h, v16.8b // weights_ver 908 uxtl v18.8h, v18.8b 909 mla v20.8h, v0.8h, v7.8h // right*256 + (left-right)*weights_hor 910 mla v21.8h, v1.8h, v7.8h 911 mla v22.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver 912 mla v23.8h, v6.8h, v18.8h 913 uhadd v20.8h, v20.8h, v22.8h 914 uhadd v21.8h, v21.8h, v23.8h 915 rshrn v20.8b, v20.8h, #8 916 rshrn v21.8b, v21.8h, #8 917 st1 {v20.s}[0], [x0], x1 918 st1 {v20.s}[1], [x6], x1 919 subs w4, w4, #4 920 st1 {v21.s}[0], [x0], x1 921 st1 {v21.s}[1], [x6], x1 922 b.gt 4b 923 ret 92480: 925 ld1 {v6.8b}, [x8] // top 926 ld1 {v7.8b}, [x10] // weights_hor 927 sub x2, x2, #4 928 mov x7, #-4 929 dup v5.16b, v6.b[7] // right 930 usubl v6.8h, v6.8b, v4.8b // top-bottom 931 uxtl v7.8h, v7.8b // weights_hor 9328: 933 ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left 934 ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver 935 shll v20.8h, v5.8b, #8 // right*256 936 shll v21.8h, v5.8b, #8 937 shll v22.8h, v5.8b, #8 938 shll v23.8h, v5.8b, #8 939 usubl v0.8h, v0.8b, v5.8b // left-right 940 usubl v1.8h, v1.8b, v5.8b 941 usubl v2.8h, v2.8b, v5.8b 942 usubl v3.8h, v3.8b, v5.8b 943 shll v24.8h, v4.8b, #8 // bottom*256 944 shll v25.8h, v4.8b, #8 945 shll v26.8h, v4.8b, #8 946 shll v27.8h, v4.8b, #8 947 uxtl v16.8h, v16.8b // weights_ver 948 uxtl v17.8h, v17.8b 949 uxtl v18.8h, v18.8b 950 uxtl v19.8h, v19.8b 951 mla v20.8h, v3.8h, v7.8h // right*256 + (left-right)*weights_hor 952 mla v21.8h, v2.8h, v7.8h // (left flipped) 953 mla v22.8h, v1.8h, v7.8h 954 mla v23.8h, v0.8h, v7.8h 955 mla v24.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver 956 mla v25.8h, v6.8h, v17.8h 957 mla v26.8h, v6.8h, v18.8h 958 
mla v27.8h, v6.8h, v19.8h 959 uhadd v20.8h, v20.8h, v24.8h 960 uhadd v21.8h, v21.8h, v25.8h 961 uhadd v22.8h, v22.8h, v26.8h 962 uhadd v23.8h, v23.8h, v27.8h 963 rshrn v20.8b, v20.8h, #8 964 rshrn v21.8b, v21.8h, #8 965 rshrn v22.8b, v22.8h, #8 966 rshrn v23.8b, v23.8h, #8 967 st1 {v20.8b}, [x0], x1 968 st1 {v21.8b}, [x6], x1 969 subs w4, w4, #4 970 st1 {v22.8b}, [x0], x1 971 st1 {v23.8b}, [x6], x1 972 b.gt 8b 973 ret 974160: 975320: 976640: 977 add x12, x2, w3, uxtw 978 sub x2, x2, #2 979 mov x7, #-2 980 ld1r {v5.16b}, [x12] // right 981 sub x1, x1, w3, uxtw 982 mov w9, w3 983 9841: 985 ld2r {v0.8b, v1.8b}, [x2], x7 // left 986 ld2r {v16.8b, v17.8b}, [x11], #2 // weights_ver 987 usubl v0.8h, v0.8b, v5.8b // left-right 988 usubl v1.8h, v1.8b, v5.8b 989 uxtl v16.8h, v16.8b // weights_ver 990 uxtl v17.8h, v17.8b 9912: 992 ld1 {v7.16b}, [x10], #16 // weights_hor 993 ld1 {v3.16b}, [x8], #16 // top 994 shll v20.8h, v5.8b, #8 // right*256 995 shll v21.8h, v5.8b, #8 996 shll v22.8h, v5.8b, #8 997 shll v23.8h, v5.8b, #8 998 uxtl v6.8h, v7.8b // weights_hor 999 uxtl2 v7.8h, v7.16b 1000 usubl v2.8h, v3.8b, v4.8b // top-bottom 1001 usubl2 v3.8h, v3.16b, v4.16b 1002 mla v20.8h, v1.8h, v6.8h // right*256 + (left-right)*weights_hor 1003 mla v21.8h, v1.8h, v7.8h // (left flipped) 1004 mla v22.8h, v0.8h, v6.8h 1005 mla v23.8h, v0.8h, v7.8h 1006 shll v24.8h, v4.8b, #8 // bottom*256 1007 shll v25.8h, v4.8b, #8 1008 shll v26.8h, v4.8b, #8 1009 shll v27.8h, v4.8b, #8 1010 mla v24.8h, v2.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver 1011 mla v25.8h, v3.8h, v16.8h 1012 mla v26.8h, v2.8h, v17.8h 1013 mla v27.8h, v3.8h, v17.8h 1014 uhadd v20.8h, v20.8h, v24.8h 1015 uhadd v21.8h, v21.8h, v25.8h 1016 uhadd v22.8h, v22.8h, v26.8h 1017 uhadd v23.8h, v23.8h, v27.8h 1018 rshrn v20.8b, v20.8h, #8 1019 rshrn2 v20.16b, v21.8h, #8 1020 rshrn v22.8b, v22.8h, #8 1021 rshrn2 v22.16b, v23.8h, #8 1022 subs w3, w3, #16 1023 st1 {v20.16b}, [x0], #16 1024 st1 {v22.16b}, [x6], #16 1025 b.gt 2b 1026 
subs w4, w4, #2 1027 b.le 9f 1028 sub x8, x8, w9, uxtw 1029 sub x10, x10, w9, uxtw 1030 add x0, x0, x1 1031 add x6, x6, x1 1032 mov w3, w9 1033 b 1b 10349: 1035 ret 1036 1037L(ipred_smooth_tbl): 1038 .hword L(ipred_smooth_tbl) - 640b 1039 .hword L(ipred_smooth_tbl) - 320b 1040 .hword L(ipred_smooth_tbl) - 160b 1041 .hword L(ipred_smooth_tbl) - 80b 1042 .hword L(ipred_smooth_tbl) - 40b 1043endfunc 1044 1045// void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, 1046// const pixel *const topleft, 1047// const int width, const int height, const int a, 1048// const int max_width, const int max_height); 1049function ipred_smooth_v_8bpc_neon, export=1 1050 movrel x7, X(sm_weights) 1051 add x7, x7, w4, uxtw 1052 clz w9, w3 1053 adr x5, L(ipred_smooth_v_tbl) 1054 sub x8, x2, w4, uxtw 1055 sub w9, w9, #25 1056 ldrh w9, [x5, w9, uxtw #1] 1057 ld1r {v4.16b}, [x8] // bottom 1058 add x2, x2, #1 1059 sub x5, x5, w9, uxtw 1060 add x6, x0, x1 1061 lsl x1, x1, #1 1062 br x5 106340: 1064 ld1r {v6.2s}, [x2] // top 1065 usubl v6.8h, v6.8b, v4.8b // top-bottom 10664: 1067 ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver 1068 shll v22.8h, v4.8b, #8 // bottom*256 1069 shll v23.8h, v4.8b, #8 1070 zip1 v16.2s, v16.2s, v17.2s // weights_ver 1071 zip1 v18.2s, v18.2s, v19.2s 1072 uxtl v16.8h, v16.8b // weights_ver 1073 uxtl v18.8h, v18.8b 1074 mla v22.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver 1075 mla v23.8h, v6.8h, v18.8h 1076 rshrn v22.8b, v22.8h, #8 1077 rshrn v23.8b, v23.8h, #8 1078 st1 {v22.s}[0], [x0], x1 1079 st1 {v22.s}[1], [x6], x1 1080 subs w4, w4, #4 1081 st1 {v23.s}[0], [x0], x1 1082 st1 {v23.s}[1], [x6], x1 1083 b.gt 4b 1084 ret 108580: 1086 ld1 {v6.8b}, [x2] // top 1087 usubl v6.8h, v6.8b, v4.8b // top-bottom 10888: 1089 ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver 1090 shll v24.8h, v4.8b, #8 // bottom*256 1091 shll v25.8h, v4.8b, #8 1092 shll v26.8h, v4.8b, #8 1093 shll v27.8h, v4.8b, #8 1094 uxtl v16.8h, v16.8b // 
weights_ver 1095 uxtl v17.8h, v17.8b 1096 uxtl v18.8h, v18.8b 1097 uxtl v19.8h, v19.8b 1098 mla v24.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver 1099 mla v25.8h, v6.8h, v17.8h 1100 mla v26.8h, v6.8h, v18.8h 1101 mla v27.8h, v6.8h, v19.8h 1102 rshrn v24.8b, v24.8h, #8 1103 rshrn v25.8b, v25.8h, #8 1104 rshrn v26.8b, v26.8h, #8 1105 rshrn v27.8b, v27.8h, #8 1106 st1 {v24.8b}, [x0], x1 1107 st1 {v25.8b}, [x6], x1 1108 subs w4, w4, #4 1109 st1 {v26.8b}, [x0], x1 1110 st1 {v27.8b}, [x6], x1 1111 b.gt 8b 1112 ret 1113160: 1114320: 1115640: 1116 // Set up pointers for four rows in parallel; x0, x6, x5, x8 1117 add x5, x0, x1 1118 add x8, x6, x1 1119 lsl x1, x1, #1 1120 sub x1, x1, w3, uxtw 1121 mov w9, w3 1122 11231: 1124 ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver 1125 uxtl v16.8h, v16.8b // weights_ver 1126 uxtl v17.8h, v17.8b 1127 uxtl v18.8h, v18.8b 1128 uxtl v19.8h, v19.8b 11292: 1130 ld1 {v3.16b}, [x2], #16 // top 1131 shll v20.8h, v4.8b, #8 // bottom*256 1132 shll v21.8h, v4.8b, #8 1133 shll v22.8h, v4.8b, #8 1134 shll v23.8h, v4.8b, #8 1135 shll v24.8h, v4.8b, #8 1136 shll v25.8h, v4.8b, #8 1137 shll v26.8h, v4.8b, #8 1138 shll v27.8h, v4.8b, #8 1139 usubl v2.8h, v3.8b, v4.8b // top-bottom 1140 usubl2 v3.8h, v3.16b, v4.16b 1141 mla v20.8h, v2.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver 1142 mla v21.8h, v3.8h, v16.8h 1143 mla v22.8h, v2.8h, v17.8h 1144 mla v23.8h, v3.8h, v17.8h 1145 mla v24.8h, v2.8h, v18.8h 1146 mla v25.8h, v3.8h, v18.8h 1147 mla v26.8h, v2.8h, v19.8h 1148 mla v27.8h, v3.8h, v19.8h 1149 rshrn v20.8b, v20.8h, #8 1150 rshrn2 v20.16b, v21.8h, #8 1151 rshrn v22.8b, v22.8h, #8 1152 rshrn2 v22.16b, v23.8h, #8 1153 rshrn v24.8b, v24.8h, #8 1154 rshrn2 v24.16b, v25.8h, #8 1155 rshrn v26.8b, v26.8h, #8 1156 rshrn2 v26.16b, v27.8h, #8 1157 subs w3, w3, #16 1158 st1 {v20.16b}, [x0], #16 1159 st1 {v22.16b}, [x6], #16 1160 st1 {v24.16b}, [x5], #16 1161 st1 {v26.16b}, [x8], #16 1162 b.gt 2b 1163 subs w4, w4, #4 1164 b.le 
9f      // operand of the `b.le` that ends the preceding line of the file
        sub             x2, x2, w9, uxtw
        add             x0, x0, x1
        add             x6, x6, x1
        add             x5, x5, x1
        add             x8, x8, x1
        mov             w3, w9
        b               1b
9:
        ret

L(ipred_smooth_v_tbl):
        .hword L(ipred_smooth_v_tbl) - 640b
        .hword L(ipred_smooth_v_tbl) - 320b
        .hword L(ipred_smooth_v_tbl) - 160b
        .hword L(ipred_smooth_v_tbl) - 80b
        .hword L(ipred_smooth_v_tbl) - 40b
endfunc

// void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height, const int a,
//                               const int max_width, const int max_height);
//
// Every output pixel is
//   (right*256 + (left - right)*weights_hor + 128) >> 8
// (rshrn #8 supplies the rounding), where "right" is the byte at
// topleft[width] and "left" is read backwards from topleft[-1], four rows
// per ld4r.
//
// Registers on entry: x0 = dst, x1 = stride, x2 = topleft, w3 = width,
// w4 = height.
//   x8 = X(sm_weights) + width   (weights_hor)
//   v5 = right, replicated over all lanes
//   x6 = dst + stride; x1 is doubled so that x0/x6 address alternating rows
// The width-specific path is reached through L(ipred_smooth_h_tbl), indexed
// by clz(width) - 25 (width 64 -> entry 0, ..., width 4 -> entry 4). Each
// entry holds (table address - label), hence the subtraction before `br`.
function ipred_smooth_h_8bpc_neon, export=1
        movrel          x8, X(sm_weights)
        add             x8, x8, w3, uxtw
        clz             w9, w3
        adr             x5, L(ipred_smooth_h_tbl)
        add             x12, x2, w3, uxtw
        sub             w9, w9, #25
        ldrh            w9, [x5, w9, uxtw #1]
        ld1r            {v5.16b}, [x12] // right
        sub             x5, x5, w9, uxtw
        add             x6, x0, x1
        lsl             x1, x1, #1
        br              x5
40:
        ld1r            {v7.2s}, [x8] // weights_hor
        sub             x2, x2, #4
        mov             x7, #-4
        uxtl            v7.8h, v7.8b // weights_hor
4:      // 4 pixels wide, 4 rows per iteration
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
        shll            v20.8h, v5.8b, #8 // right*256
        shll            v21.8h, v5.8b, #8
        zip1            v1.2s, v1.2s, v0.2s // left, flipped
        zip1            v0.2s, v3.2s, v2.2s
        usubl           v0.8h, v0.8b, v5.8b // left-right
        usubl           v1.8h, v1.8b, v5.8b
        mla             v20.8h, v0.8h, v7.8h // right*256 + (left-right)*weights_hor
        mla             v21.8h, v1.8h, v7.8h
        rshrn           v20.8b, v20.8h, #8
        rshrn           v21.8b, v21.8h, #8
        st1             {v20.s}[0], [x0], x1
        st1             {v20.s}[1], [x6], x1
        subs            w4, w4, #4
        st1             {v21.s}[0], [x0], x1
        st1             {v21.s}[1], [x6], x1
        b.gt            4b
        ret
80:
        ld1             {v7.8b}, [x8] // weights_hor
        sub             x2, x2, #4
        mov             x7, #-4
        uxtl            v7.8h, v7.8b // weights_hor
8:      // 8 pixels wide, 4 rows per iteration
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
        shll            v20.8h, v5.8b, #8 // right*256
        shll            v21.8h, v5.8b, #8
        shll            v22.8h, v5.8b, #8
        shll            v23.8h, v5.8b, #8
        usubl           v3.8h, v3.8b, v5.8b // left-right
        usubl           v2.8h, v2.8b, v5.8b
        usubl           v1.8h, v1.8b, v5.8b
        usubl           v0.8h, v0.8b, v5.8b
        mla             v20.8h, v3.8h, v7.8h // right*256 + (left-right)*weights_hor
        mla             v21.8h, v2.8h, v7.8h // (left flipped)
        mla             v22.8h, v1.8h, v7.8h
        mla             v23.8h, v0.8h, v7.8h
        rshrn           v20.8b, v20.8h, #8
        rshrn           v21.8b, v21.8h, #8
        rshrn           v22.8b, v22.8h, #8
        rshrn           v23.8b, v23.8h, #8
        st1             {v20.8b}, [x0], x1
        st1             {v21.8b}, [x6], x1
        subs            w4, w4, #4
        st1             {v22.8b}, [x0], x1
        st1             {v23.8b}, [x6], x1
        b.gt            8b
        ret
160:
320:
640:
        sub             x2, x2, #4
        mov             x7, #-4
        // Set up pointers for four rows in parallel; x0, x6, x5, x10
        add             x5, x0, x1
        add             x10, x6, x1
        lsl             x1, x1, #1
        sub             x1, x1, w3, uxtw // x1 = 4*stride - width: row-group step after a full row
        mov             w9, w3 // keep width; w3 is the column counter below

1:      // Per group of four rows
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
        usubl           v0.8h, v0.8b, v5.8b // left-right
        usubl           v1.8h, v1.8b, v5.8b
        usubl           v2.8h, v2.8b, v5.8b
        usubl           v3.8h, v3.8b, v5.8b
2:      // Per 16 columns
        ld1             {v7.16b}, [x8], #16 // weights_hor
        shll            v20.8h, v5.8b, #8 // right*256
        shll            v21.8h, v5.8b, #8
        shll            v22.8h, v5.8b, #8
        shll            v23.8h, v5.8b, #8
        shll            v24.8h, v5.8b, #8
        shll            v25.8h, v5.8b, #8
        shll            v26.8h, v5.8b, #8
        shll            v27.8h, v5.8b, #8
        uxtl            v6.8h, v7.8b // weights_hor
        uxtl2           v7.8h, v7.16b
        mla             v20.8h, v3.8h, v6.8h // right*256 + (left-right)*weights_hor
        mla             v21.8h, v3.8h, v7.8h // (left flipped)
        mla             v22.8h, v2.8h, v6.8h
        mla             v23.8h, v2.8h, v7.8h
        mla             v24.8h, v1.8h, v6.8h
        mla             v25.8h, v1.8h, v7.8h
        mla             v26.8h, v0.8h, v6.8h
        mla             v27.8h, v0.8h, v7.8h
        rshrn           v20.8b, v20.8h, #8
        rshrn2          v20.16b, v21.8h, #8
        rshrn           v22.8b, v22.8h, #8
        rshrn2          v22.16b, v23.8h, #8
        rshrn           v24.8b, v24.8h, #8
        rshrn2          v24.16b, v25.8h, #8
        rshrn           v26.8b, v26.8h, #8
        rshrn2          v26.16b, v27.8h, #8
        subs            w3, w3, #16
        st1             {v20.16b}, [x0], #16
        st1             {v22.16b}, [x6], #16
        st1             {v24.16b}, [x5], #16
        st1             {v26.16b}, [x10], #16
        b.gt            2b
        subs            w4, w4, #4
        b.le            9f
        sub             x8, x8, w9, uxtw // rewind weights_hor to column 0
        add             x0, x0, x1
        add             x6, x6, x1
        add             x5, x5, x1
        add             x10, x10, x1
        mov             w3, w9
        b               1b
9:
        ret

L(ipred_smooth_h_tbl):
        .hword L(ipred_smooth_h_tbl) - 640b
        .hword L(ipred_smooth_h_tbl) - 320b
        .hword L(ipred_smooth_h_tbl) - 160b
        .hword L(ipred_smooth_h_tbl) - 80b
        .hword L(ipred_smooth_h_tbl) - 40b
endfunc

// void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                             const pixel *const topleft,
//                             const int width, const int height, const int filt_idx,
//                             const int max_width, const int max_height);
//
// Filter intra prediction, produced in 4x2 blocks. Each block is a sum of
// seven tap vectors, each scaled by one neighbouring pixel (p0 = topleft,
// p1-p4 = top, p5-p6 = left), then rounded, shifted right by 4 and
// saturated to unsigned 8 bit (sqrshrun #4).
//
// Registers on entry: x0 = dst, x1 = stride, x2 = topleft, w3 = width,
// w4 = height, w5 = filt_idx.
//   v16-v22 = filter(0)..filter(6) for the selected filter, sign-extended
//             to 16 bit. They are read from
//             X(filter_intra_taps) + ((filt_idx & 511) << 6).
//   x6      = dst + stride; x1 is doubled so x0/x6 address alternating rows
// Dispatch goes through L(ipred_filter_tbl), indexed by clz(width) - 26
// (width 32 -> entry 0, ..., width 4 -> entry 3).
function ipred_filter_8bpc_neon, export=1
        and             w5, w5, #511
        movrel          x6, X(filter_intra_taps)
        lsl             w5, w5, #6
        add             x6, x6, w5, uxtw
        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
        clz             w9, w3
        adr             x5, L(ipred_filter_tbl)
        ld1             {v20.8b, v21.8b, v22.8b}, [x6]
        sub             w9, w9, #26
        ldrh            w9, [x5, w9, uxtw #1]
        sxtl            v16.8h, v16.8b
        sxtl            v17.8h, v17.8b
        sub             x5, x5, w9, uxtw
        sxtl            v18.8h, v18.8b
        sxtl            v19.8h, v19.8b
        add             x6, x0, x1
        lsl             x1, x1, #1
        sxtl            v20.8h, v20.8b
        sxtl            v21.8h, v21.8b
        sxtl            v22.8h, v22.8b
        br              x5
40:
        ldur            s0, [x2, #1] // top (0-3)
        sub             x2, x2, #2
        mov             x7, #-2
        uxtl            v0.8h, v0.8b // top (0-3)
4:      // One 4x2 block per iteration; its second row becomes the next top
        ld1             {v1.s}[0], [x2], x7 // left (0-1) + topleft (2)
        mul             v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
        mla             v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
        mla             v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
        uxtl            v1.8h, v1.8b // left (0-1) + topleft (2)
        mla             v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
        mla             v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0)
        mla             v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
        mla             v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
        sqrshrun        v2.8b, v2.8h, #4
        subs            w4, w4, #2
        st1             {v2.s}[0], [x0], x1
        uxtl            v0.8h, v2.8b
        st1             {v2.s}[1], [x6], x1
        ext             v0.16b, v0.16b, v0.16b, #8 // move top from [4-7] to [0-3]
        b.gt            4b
        ret
80:
        ldur            d0, [x2, #1] // top (0-7)
        sub             x2, x2, #2
        mov             x7, #-2
        uxtl            v0.8h, v0.8b // top (0-7)
8:      // Two 4x2 blocks per iteration; the second uses the first as its left
        ld1             {v1.s}[0], [x2], x7 // left (0-1) + topleft (2)
        mul             v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
        mla             v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
        mla             v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
        uxtl            v1.8h, v1.8b // left (0-1) + topleft (2)
        mla             v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
        mla             v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0)
        mla             v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
        mla             v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
        mul             v3.8h, v17.8h, v0.h[4] // p1(top[0]) * filter(1)
        mla             v3.8h, v18.8h, v0.h[5] // p2(top[1]) * filter(2)
        mla             v3.8h, v19.8h, v0.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v2.8b, v2.8h, #4
        uxtl            v1.8h, v2.8b // first block, in 16 bit
        mla             v3.8h, v20.8h, v0.h[7] // p4(top[3]) * filter(4)
        mla             v3.8h, v16.8h, v0.h[3] // p0(topleft) * filter(0)
        mla             v3.8h, v21.8h, v1.h[3] // p5(left[0]) * filter(5)
        mla             v3.8h, v22.8h, v1.h[7] // p6(left[1]) * filter(6)
        sqrshrun        v3.8b, v3.8h, #4
        subs            w4, w4, #2
        st2             {v2.s, v3.s}[0], [x0], x1
        zip2            v0.2s, v2.2s, v3.2s
        st2             {v2.s, v3.s}[1], [x6], x1
        uxtl            v0.8h, v0.8b
        b.gt            8b
        ret
160:
320:
        add             x8, x2, #1 // x8 = top row pointer
        sub             x2, x2, #2
        mov             x7, #-2
        sub             x1, x1, w3, uxtw // x1 = 2*stride - width
        mov             w9, w3 // keep width; w3 is the column counter below

1:      // Per pair of rows
        ld1             {v0.s}[0], [x2], x7 // left (0-1) + topleft (2)
        uxtl            v0.8h, v0.8b // left (0-1) + topleft (2)
2:      // Per 16 columns: four chained 4x2 blocks
        ld1             {v2.16b}, [x8], #16 // top(0-15)
        mul             v3.8h, v16.8h, v0.h[2] // p0(topleft) * filter(0)
        mla             v3.8h, v21.8h, v0.h[1] // p5(left[0]) * filter(5)
        uxtl            v1.8h, v2.8b // top(0-7)
        uxtl2           v2.8h, v2.16b // top(8-15)
        mla             v3.8h, v22.8h, v0.h[0] // p6(left[1]) * filter(6)
        mla             v3.8h, v17.8h, v1.h[0] // p1(top[0]) * filter(1)
        mla             v3.8h, v18.8h, v1.h[1] // p2(top[1]) * filter(2)
        mla             v3.8h, v19.8h, v1.h[2] // p3(top[2]) * filter(3)
        mla             v3.8h, v20.8h, v1.h[3] // p4(top[3]) * filter(4)

        mul             v4.8h, v17.8h, v1.h[4] // p1(top[0]) * filter(1)
        mla             v4.8h, v18.8h, v1.h[5] // p2(top[1]) * filter(2)
        mla             v4.8h, v19.8h, v1.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v3.8b, v3.8h, #4
        uxtl            v0.8h, v3.8b // first block, in 16 bit
        mla             v4.8h, v20.8h, v1.h[7] // p4(top[3]) * filter(4)
        mla             v4.8h, v16.8h, v1.h[3] // p0(topleft) * filter(0)
        mla             v4.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5)
        mla             v4.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6)

        mul             v5.8h, v17.8h, v2.h[0] // p1(top[0]) * filter(1)
        mla             v5.8h, v18.8h, v2.h[1] // p2(top[1]) * filter(2)
        mla             v5.8h, v19.8h, v2.h[2] // p3(top[2]) * filter(3)
        sqrshrun        v4.8b, v4.8h, #4
        uxtl            v0.8h, v4.8b // second block, in 16 bit
        mla             v5.8h, v20.8h, v2.h[3] // p4(top[3]) * filter(4)
        mla             v5.8h, v16.8h, v1.h[7] // p0(topleft) * filter(0)
        mla             v5.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5)
        mla             v5.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6)

        mul             v6.8h, v17.8h, v2.h[4] // p1(top[0]) * filter(1)
        mla             v6.8h, v18.8h, v2.h[5] // p2(top[1]) * filter(2)
        mla             v6.8h, v19.8h, v2.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v5.8b, v5.8h, #4
        uxtl            v0.8h, v5.8b // third block, in 16 bit
        mla             v6.8h, v20.8h, v2.h[7] // p4(top[3]) * filter(4)
        mla             v6.8h, v16.8h, v2.h[3] // p0(topleft) * filter(0)
        mla             v6.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5)
        mla             v6.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6)

        subs            w3, w3, #16
        sqrshrun        v6.8b, v6.8h, #4

        st4             {v3.s, v4.s, v5.s, v6.s}[0], [x0], #16
        st4             {v3.s, v4.s, v5.s, v6.s}[1], [x6], #16
        b.le            8f
        // Rebuild left/topleft for the next 16 columns from this chunk's
        // last top pixel and the fourth block's right-hand column.
        ins             v0.h[2], v2.h[7]
        ins             v0.b[0], v6.b[7]
        ins             v0.b[2], v6.b[3]
        b               2b
8:
        subs            w4, w4, #2
        b.le            9f
        sub             x8, x6, w9, uxtw // next top row = the second row just written
        add             x0, x0, x1
        add             x6, x6, x1
        mov             w3, w9
        b               1b
9:
        ret

L(ipred_filter_tbl):
        .hword L(ipred_filter_tbl) - 320b
        .hword L(ipred_filter_tbl) - 160b
        .hword L(ipred_filter_tbl) - 80b
        .hword L(ipred_filter_tbl) - 40b
endfunc

// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                         const uint16_t *const pal, const uint8_t *idx,
//                         const int w, const int h);
//
// Maps each index byte read from idx to a palette byte with `tbl`.
//
// Registers on entry: x0 = dst, x1 = stride, x2 = pal, x3 = idx, w4 = w,
// w5 = h.
//   v0 = the eight 16 bit palette entries narrowed to bytes (xtn)
//   x2 = reused as dst + stride once the palette is loaded; x1 is doubled
// Dispatch goes through L(pal_pred_tbl), indexed by clz(w) - 25
// (w 64 -> entry 0, ..., w 4 -> entry 4).
function pal_pred_8bpc_neon, export=1
        ld1             {v0.8h}, [x2]
        clz             w9, w4
        adr             x6, L(pal_pred_tbl)
        sub             w9, w9, #25
        ldrh            w9, [x6, w9, uxtw #1]
        xtn             v0.8b, v0.8h
        sub             x6, x6, w9, uxtw
        add             x2, x0, x1
        lsl             x1, x1, #1
        br              x6
4:      // 4 rows of 4 pixels per iteration
        ld1             {v1.16b}, [x3], #16
        subs            w5, w5, #4
        tbl             v1.16b, {v0.16b}, v1.16b
        st1             {v1.s}[0], [x0], x1
        st1             {v1.s}[1], [x2], x1
        st1             {v1.s}[2], [x0], x1
        st1             {v1.s}[3], [x2], x1
        b.gt            4b
        ret
8:      // 4 rows of 8 pixels per iteration
        ld1             {v1.16b, v2.16b}, [x3], #32
        subs            w5, w5, #4
        tbl             v1.16b, {v0.16b}, v1.16b
        st1             {v1.d}[0], [x0], x1
        tbl             v2.16b, {v0.16b}, v2.16b
        st1             {v1.d}[1], [x2], x1
        st1             {v2.d}[0], [x0], x1
        st1             {v2.d}[1], [x2], x1
        b.gt            8b
        ret
16:     // 4 rows of 16 pixels per iteration
        ld1             {v1.16b, v2.16b, v3.16b, v4.16b}, [x3], #64
        subs            w5, w5, #4
        tbl             v1.16b, {v0.16b}, v1.16b
        tbl             v2.16b, {v0.16b}, v2.16b
        st1             {v1.16b}, [x0], x1
        tbl             v3.16b, {v0.16b}, v3.16b
        st1             {v2.16b}, [x2], x1
        tbl             v4.16b, {v0.16b}, v4.16b
        st1             {v3.16b}, [x0], x1
        st1             {v4.16b}, [x2], x1
        b.gt            16b
        ret
32:     // 4 rows of 32 pixels per iteration
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64
        subs            w5, w5, #4
        tbl             v16.16b, {v0.16b}, v16.16b
        tbl             v17.16b, {v0.16b}, v17.16b
        tbl             v18.16b, {v0.16b}, v18.16b
        tbl             v19.16b, {v0.16b}, v19.16b
        tbl             v20.16b, {v0.16b}, v20.16b
        st1             {v16.16b, v17.16b}, [x0], x1
        tbl             v21.16b, {v0.16b}, v21.16b
        st1             {v18.16b, v19.16b}, [x2], x1
        tbl             v22.16b, {v0.16b}, v22.16b
        st1             {v20.16b, v21.16b}, [x0], x1
        tbl             v23.16b, {v0.16b}, v23.16b
        st1             {v22.16b, v23.16b}, [x2], x1
        b.gt            32b
        ret
64:     // 2 rows of 64 pixels per iteration
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64
        subs            w5, w5, #2
        tbl             v16.16b, {v0.16b}, v16.16b
        tbl             v17.16b, {v0.16b}, v17.16b
        tbl             v18.16b, {v0.16b}, v18.16b
        tbl             v19.16b, {v0.16b}, v19.16b
        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
        tbl             v20.16b, {v0.16b}, v20.16b
        tbl             v21.16b, {v0.16b}, v21.16b
        tbl             v22.16b, {v0.16b}, v22.16b
        tbl             v23.16b, {v0.16b}, v23.16b
        st1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x1
        b.gt            64b
        ret

L(pal_pred_tbl):
        .hword L(pal_pred_tbl) - 64b
        .hword L(pal_pred_tbl) - 32b
        .hword L(pal_pred_tbl) - 16b
        .hword L(pal_pred_tbl) - 8b
        .hword L(pal_pred_tbl) - 4b
endfunc

// void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height,
//                              const int16_t *ac, const int alpha);
//
// Chroma-from-luma with a fixed dc of 128. Each output pixel is
//   iclip_pixel(dc + apply_sign((ac * alpha), 6-bit rounding))
// as spelled out by the per-instruction comments below.
//
// Registers on entry: x0 = dst, x1 = stride, w3 = width, w4 = height,
// x5 = ac, w6 = alpha.
//   v0 = dc, v1 = alpha (both as 8 x 16 bit)
//   x6 = dst + stride (w6 is consumed first); x1 is doubled
// The L(ipred_cfl_splat_w*) loops are shared: ipred_cfl_top, ipred_cfl_left
// and ipred_cfl branch into them with their own dc in v0. The table is
// indexed by clz(width) - 26; widths 32 and 16 both use the w16 loop.
function ipred_cfl_128_8bpc_neon, export=1
        clz             w9, w3
        adr             x7, L(ipred_cfl_128_tbl)
        sub             w9, w9, #26
        ldrh            w9, [x7, w9, uxtw #1]
        movi            v0.8h, #128 // dc
        dup             v1.8h, w6 // alpha
        sub             x7, x7, w9, uxtw
        add             x6, x0, x1
        lsl             x1, x1, #1
        br              x7
L(ipred_cfl_splat_w4):
        ld1             {v2.8h, v3.8h}, [x5], #32
        mul             v2.8h, v2.8h, v1.8h // diff = ac * alpha
        mul             v3.8h, v3.8h, v1.8h
        sshr            v4.8h, v2.8h, #15 // sign = diff >> 15
        sshr            v5.8h, v3.8h, #15
        add             v2.8h, v2.8h, v4.8h // diff + sign
        add             v3.8h, v3.8h, v5.8h
        srshr           v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign()
        srshr           v3.8h, v3.8h, #6
        add             v2.8h, v2.8h, v0.8h // dc + apply_sign()
        add             v3.8h, v3.8h, v0.8h
        sqxtun          v2.8b, v2.8h // iclip_pixel(dc + apply_sign())
        sqxtun          v3.8b, v3.8h
        st1             {v2.s}[0], [x0], x1
        st1             {v2.s}[1], [x6], x1
        subs            w4, w4, #4
        st1             {v3.s}[0], [x0], x1
        st1             {v3.s}[1], [x6], x1
        b.gt            L(ipred_cfl_splat_w4)
        ret
L(ipred_cfl_splat_w8):
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x5], #64
        mul             v2.8h, v2.8h, v1.8h // diff = ac * alpha
        mul             v3.8h, v3.8h, v1.8h
        mul             v4.8h, v4.8h, v1.8h
        mul             v5.8h, v5.8h, v1.8h
        sshr            v16.8h, v2.8h, #15 // sign = diff >> 15
        sshr            v17.8h, v3.8h, #15
        sshr            v18.8h, v4.8h, #15
        sshr            v19.8h, v5.8h, #15
        add             v2.8h, v2.8h, v16.8h // diff + sign
        add             v3.8h, v3.8h, v17.8h
        add             v4.8h, v4.8h, v18.8h
        add             v5.8h, v5.8h, v19.8h
        srshr           v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign()
        srshr           v3.8h, v3.8h, #6
        srshr           v4.8h, v4.8h, #6
        srshr           v5.8h, v5.8h, #6
        add             v2.8h, v2.8h, v0.8h // dc + apply_sign()
        add             v3.8h, v3.8h, v0.8h
        add             v4.8h, v4.8h, v0.8h
        add             v5.8h, v5.8h, v0.8h
        sqxtun          v2.8b, v2.8h // iclip_pixel(dc + apply_sign())
        sqxtun          v3.8b, v3.8h
        sqxtun          v4.8b, v4.8h
        sqxtun          v5.8b, v5.8h
        st1             {v2.8b}, [x0], x1
        st1             {v3.8b}, [x6], x1
        subs            w4, w4, #4
        st1             {v4.8b}, [x0], x1
        st1             {v5.8b}, [x6], x1
        b.gt            L(ipred_cfl_splat_w8)
        ret
L(ipred_cfl_splat_w16):
        add             x7, x5, w3, uxtw #1 // x7 = ac for the second row (width int16 entries further)
        sub             x1, x1, w3, uxtw // x1 = 2*stride - width
        mov             w9, w3 // keep width; w3 is the column counter below
1:      // 16 columns of two rows per iteration
        ld1             {v2.8h, v3.8h}, [x5], #32
        ld1             {v4.8h, v5.8h}, [x7], #32
        mul             v2.8h, v2.8h, v1.8h // diff = ac * alpha
        mul             v3.8h, v3.8h, v1.8h
        mul             v4.8h, v4.8h, v1.8h
        mul             v5.8h, v5.8h, v1.8h
        sshr            v16.8h, v2.8h, #15 // sign = diff >> 15
        sshr            v17.8h, v3.8h, #15
        sshr            v18.8h, v4.8h, #15
        sshr            v19.8h, v5.8h, #15
        add             v2.8h, v2.8h, v16.8h // diff + sign
        add             v3.8h, v3.8h, v17.8h
        add             v4.8h, v4.8h, v18.8h
        add             v5.8h, v5.8h, v19.8h
        srshr           v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign()
        srshr           v3.8h, v3.8h, #6
        srshr           v4.8h, v4.8h, #6
        srshr           v5.8h, v5.8h, #6
        add             v2.8h, v2.8h, v0.8h // dc + apply_sign()
        add             v3.8h, v3.8h, v0.8h
        add             v4.8h, v4.8h, v0.8h
        add             v5.8h, v5.8h, v0.8h
        sqxtun          v2.8b, v2.8h // iclip_pixel(dc + apply_sign())
        sqxtun          v3.8b, v3.8h
        sqxtun          v4.8b, v4.8h
        sqxtun          v5.8b, v5.8h
        subs            w3, w3, #16
        st1             {v2.8b, v3.8b}, [x0], #16
        st1             {v4.8b, v5.8b}, [x6], #16
        b.gt            1b
        subs            w4, w4, #2
        // The add/mov instructions below leave the flags untouched, so the
        // final b.gt still tests the result of the `subs w4` above.
        add             x5, x5, w9, uxtw #1
        add             x7, x7, w9, uxtw #1
        add             x0, x0, x1
        add             x6, x6, x1
        mov             w3, w9
        b.gt            1b
        ret

L(ipred_cfl_128_tbl):
L(ipred_cfl_splat_tbl):
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
endfunc

// void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height,
//                              const int16_t *ac, const int alpha);
//
// Same as ipred_cfl_128, but dc is the rounded average of the `width`
// pixels starting at topleft + 1. The average is left replicated in v0,
// then control passes to the shared L(ipred_cfl_splat_w*) loops.
// The table is indexed by clz(width) - 26 (width 32 -> entry 0).
function ipred_cfl_top_8bpc_neon, export=1
        clz             w9, w3
        adr             x7, L(ipred_cfl_top_tbl)
        sub             w9, w9, #26
        ldrh            w9, [x7, w9, uxtw #1]
        dup             v1.8h, w6 // alpha
        add             x2, x2, #1
        sub             x7, x7, w9, uxtw
        add             x6, x0, x1
        lsl             x1, x1, #1
        br              x7
4:
        // The 4 pixels are loaded twice (ld1r .2s), so the 8-byte sum is
        // doubled and the rounding shift is 3 rather than 2.
        ld1r            {v0.2s}, [x2]
        uaddlv          h0, v0.8b
        urshr           v0.4h, v0.4h, #3
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w4)
8:
        ld1             {v0.8b}, [x2]
        uaddlv          h0, v0.8b
        urshr           v0.4h, v0.4h, #3
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w8)
16:
        ld1             {v0.16b}, [x2]
        uaddlv          h0, v0.16b
        urshr           v0.4h, v0.4h, #4
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w16)
32:
        ld1             {v2.16b, v3.16b}, [x2]
        uaddlv          h2, v2.16b
        uaddlv          h3, v3.16b
        add             v2.4h, v2.4h, v3.4h
        urshr           v2.4h, v2.4h, #5
        dup             v0.8h, v2.h[0]
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_top_tbl):
        .hword L(ipred_cfl_top_tbl) - 32b
        .hword L(ipred_cfl_top_tbl) - 16b
        .hword L(ipred_cfl_top_tbl) - 8b
        .hword L(ipred_cfl_top_tbl) - 4b
endfunc

// void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height,
//                               const int16_t *ac, const int alpha);
//
// Same as ipred_cfl_128, but dc is the rounded average of the `height`
// pixels starting at topleft - height.
//   x7 = averaging routine, chosen by height via L(ipred_cfl_left_tbl)
//   x9 = splat loop, chosen by width via L(ipred_cfl_splat_tbl)
// Both tables are indexed by clz(dimension) - 26.
function ipred_cfl_left_8bpc_neon, export=1
        sub             x2, x2, w4, uxtw
        clz             w9, w3
        clz             w8, w4
        adr             x10, L(ipred_cfl_splat_tbl)
        adr             x7, L(ipred_cfl_left_tbl)
        sub             w9, w9, #26
        sub             w8, w8, #26
        ldrh            w9, [x10, w9, uxtw #1]
        ldrh            w8, [x7, w8, uxtw #1]
        dup             v1.8h, w6 // alpha
        sub             x9, x10, w9, uxtw
        sub             x7, x7, w8, uxtw
        add             x6, x0, x1
        lsl             x1, x1, #1
        br              x7

L(ipred_cfl_left_h4):
        // 4 pixels loaded twice: doubled sum, hence the rounding shift of 3
        ld1r            {v0.2s}, [x2]
        uaddlv          h0, v0.8b
        urshr           v0.4h, v0.4h, #3
        dup             v0.8h, v0.h[0]
        br              x9

L(ipred_cfl_left_h8):
        ld1             {v0.8b}, [x2]
        uaddlv          h0, v0.8b
        urshr           v0.4h, v0.4h, #3
        dup             v0.8h, v0.h[0]
        br              x9

L(ipred_cfl_left_h16):
        ld1             {v0.16b}, [x2]
        uaddlv          h0, v0.16b
        urshr           v0.4h, v0.4h, #4
        dup             v0.8h, v0.h[0]
        br              x9

L(ipred_cfl_left_h32):
        ld1             {v2.16b, v3.16b}, [x2]
        uaddlv          h2, v2.16b
        uaddlv          h3, v3.16b
        add             v2.4h, v2.4h, v3.4h
        urshr           v2.4h, v2.4h, #5
        dup             v0.8h, v2.h[0]
        br              x9

L(ipred_cfl_left_tbl):
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
endfunc

// void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                          const pixel *const topleft,
//                          const int width, const int height,
//                          const int16_t *ac, const int alpha);
//
// Same as ipred_cfl_128, but dc is the average of the left column and the
// top row:
//   dc = (sum_left + sum_top + ((width + height) >> 1)) >> ctz(width + height)
// When width != height, dc is then multiplied with sqdmulh by 0x5556/2 or
// 0x3334/2 (roughly /3 and /5) to finish the division by width + height.
//
//   x7  = L(ipred_cfl_h*) routine summing the left column (by height)
//   x9  = L(ipred_cfl_w*) routine adding the top row (by width); these
//         entries sit 4 slots further into L(ipred_cfl_tbl), hence
//         the "- 22" instead of "- 26"
//   v16 = (width + height) >> 1, v17 = -ctz(width + height)
function ipred_cfl_8bpc_neon, export=1
        sub             x2, x2, w4, uxtw
        add             w8, w3, w4 // width + height
        dup             v1.8h, w6 // alpha
        clz             w9, w3
        clz             w6, w4
        dup             v16.8h, w8 // width + height
        adr             x7, L(ipred_cfl_tbl)
        rbit            w8, w8 // rbit(width + height)
        sub             w9, w9, #22 // 26 leading bits, minus table offset 4
        sub             w6, w6, #26
        clz             w8, w8 // ctz(width + height)
        ldrh            w9, [x7, w9, uxtw #1]
        ldrh            w6, [x7, w6, uxtw #1]
        neg             w8, w8 // -ctz(width + height)
        sub             x9, x7, w9, uxtw
        sub             x7, x7, w6, uxtw
        ushr            v16.8h, v16.8h, #1 // (width + height) >> 1
        dup             v17.8h, w8 // -ctz(width + height)
        add             x6, x0, x1
        lsl             x1, x1, #1
        br              x7

L(ipred_cfl_h4):
        ld1             {v0.s}[0], [x2], #4
        ins             v0.s[1], wzr
        uaddlv          h0, v0.8b
        br              x9
L(ipred_cfl_w4):
        add             x2, x2, #1
        ld1             {v2.s}[0], [x2]
        ins             v2.s[1], wzr
        add             v0.4h, v0.4h, v16.4h
        uaddlv          h2, v2.8b
        cmp             w4, #4
        add             v0.4h, v0.4h, v2.4h
        ushl            v0.4h, v0.4h, v17.4h
        b.eq            1f
        // h = 8/16
        // Both multipliers are packed into w16 and selected by shifting
        // right by 2*h (16 or 32); a 32-bit lsr only uses the shift
        // amount modulo 32, so 32 selects the low half.
        mov             w16, #(0x3334/2)
        movk            w16, #(0x5556/2), lsl #16
        add             w17, w4, w4 // w17 = 2*h = 16 or 32
        lsr             w16, w16, w17
        dup             v16.4h, w16
        sqdmulh         v0.4h, v0.4h, v16.4h
1:
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w4)

L(ipred_cfl_h8):
        ld1             {v0.8b}, [x2], #8
        uaddlv          h0, v0.8b
        br              x9
L(ipred_cfl_w8):
        add             x2, x2, #1
        ld1             {v2.8b}, [x2]
        add             v0.4h, v0.4h, v16.4h
        uaddlv          h2, v2.8b
        cmp             w4, #8
        add             v0.4h, v0.4h, v2.4h
        ushl            v0.4h, v0.4h, v17.4h
        b.eq            1f
        // h = 4/16/32
        cmp             w4, #32
        mov             w16, #(0x3334/2)
        mov             w17, #(0x5556/2)
        csel            w16, w16, w17, eq
        dup             v16.4h, w16
        sqdmulh         v0.4h, v0.4h, v16.4h
1:
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w8)

L(ipred_cfl_h16):
        ld1             {v0.16b}, [x2], #16
        uaddlv          h0, v0.16b
        br              x9
L(ipred_cfl_w16):
        add             x2, x2, #1
        ld1             {v2.16b}, [x2]
        add             v0.4h, v0.4h, v16.4h
        uaddlv          h2, v2.16b
        cmp             w4, #16
        add             v0.4h, v0.4h, v2.4h
        ushl            v0.4h, v0.4h, v17.4h
        b.eq            1f
        // h = 4/8/32
        cmp             w4, #4
        mov             w16, #(0x3334/2)
        mov             w17, #(0x5556/2)
        csel            w16, w16, w17, eq
        dup             v16.4h, w16
        sqdmulh         v0.4h, v0.4h, v16.4h
1:
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_h32):
        ld1             {v2.16b, v3.16b}, [x2], #32
        uaddlv          h2, v2.16b
        uaddlv          h3, v3.16b
        add             v0.4h, v2.4h, v3.4h
        br              x9
L(ipred_cfl_w32):
        add             x2, x2, #1
        ld1             {v2.16b, v3.16b}, [x2]
        add             v0.4h, v0.4h, v16.4h
        uaddlv          h2, v2.16b
        uaddlv          h3, v3.16b
        cmp             w4, #32
        add             v0.4h, v0.4h, v2.4h
        add             v0.4h, v0.4h, v3.4h
        ushl            v0.4h, v0.4h, v17.4h
        b.eq            1f
        // h = 8/16
        // Same packed-constant selection as in L(ipred_cfl_w4), with the
        // two multipliers swapped.
        mov             w16, #(0x5556/2)
        movk            w16, #(0x3334/2), lsl #16
        add             w17, w4, w4 // w17 = 2*h = 16 or 32
        lsr             w16, w16, w17
        dup             v16.4h, w16
        sqdmulh         v0.4h, v0.4h, v16.4h
1:
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_tbl):
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
endfunc

// void ipred_cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx,
//                                 const ptrdiff_t stride, const int w_pad,
//                                 const int h_pad, const int cw, const int ch);
//
// Builds the ac buffer from 4:2:0 luma. Each entry is the sum of a 2x2
// luma square, shifted left by 1. Padding columns/rows repeat the last
// computed column/row, and finally the rounded average of the whole buffer
// is subtracted from every entry.
//
// Registers on entry: x0 = ac, x1 = ypx, x2 = stride, w3 = w_pad,
// w4 = h_pad, w5 = cw, w6 = ch.
//   w4      = h_pad * 4 (number of padded output rows)
//   w8      = ch - w4   (output rows computed from input)
//   x10     = ypx + stride; x2 is doubled so x1/x10 read alternating rows
//   v16-v19 = running sums of everything stored to ac
//   v31     = -(ctz(cw) + ctz(ch)) = -log2sz, the shift used for the average
// Dispatch goes through L(ipred_cfl_ac_420_tbl), indexed by clz(cw) - 27
// (cw 16 -> entry 0, 8 -> entry 1, 4 -> entry 2). The w16 path dispatches
// again on w_pad through L(ipred_cfl_ac_420_w16_tbl).
function ipred_cfl_ac_420_8bpc_neon, export=1
        clz             w8, w5
        lsl             w4, w4, #2
        adr             x7, L(ipred_cfl_ac_420_tbl)
        sub             w8, w8, #27
        ldrh            w8, [x7, w8, uxtw #1]
        movi            v16.8h, #0
        movi            v17.8h, #0
        movi            v18.8h, #0
        movi            v19.8h, #0
        sub             x7, x7, w8, uxtw
        sub             w8, w6, w4 // height - h_pad
        rbit            w9, w5 // rbit(width)
        rbit            w10, w6 // rbit(height)
        clz             w9, w9 // ctz(width)
        clz             w10, w10 // ctz(height)
        add             w9, w9, w10 // log2sz
        add             x10, x1, x2
        dup             v31.4s, w9
        lsl             x2, x2, #1
        neg             v31.4s, v31.4s // -log2sz
        br              x7

L(ipred_cfl_ac_420_w4):
1:      // Copy and subsample input
        ld1             {v0.8b}, [x1], x2
        ld1             {v1.8b}, [x10], x2
        ld1             {v0.d}[1], [x1], x2
        ld1             {v1.d}[1], [x10], x2
        uaddlp          v0.8h, v0.16b
        uaddlp          v1.8h, v1.16b
        add             v0.8h, v0.8h, v1.8h
        shl             v0.8h, v0.8h, #1
        subs            w8, w8, #2
        st1             {v0.8h}, [x0], #16
        add             v16.8h, v16.8h, v0.8h
        b.gt            1b
        // Replicate the last output row (upper half of v0) into v0 and v1
        trn2            v1.2d, v0.2d, v0.2d
        trn2            v0.2d, v0.2d, v0.2d
L(ipred_cfl_ac_420_w4_hpad):
        cbz             w4, 3f
2:      // Vertical padding (h_pad > 0)
        subs            w4, w4, #4
        st1             {v0.8h, v1.8h}, [x0], #32
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        b.gt            2b
3:
        // Aggregate the sums
        add             v0.8h, v16.8h, v17.8h
        uaddlv          s0, v0.8h // sum
        sub             x0, x0, w6, uxtw #3 // rewind ac: ch rows * 8 bytes
        urshl           v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz
        dup             v4.8h, v4.h[0]
6:      // Subtract dc from ac
        ld1             {v0.8h, v1.8h}, [x0]
        subs            w6, w6, #4
        sub             v0.8h, v0.8h, v4.8h
        sub             v1.8h, v1.8h, v4.8h
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            6b
        ret

L(ipred_cfl_ac_420_w8):
        cbnz            w3, L(ipred_cfl_ac_420_w8_wpad)
1:      // Copy and subsample input, without padding
        ld1             {v0.16b}, [x1], x2
        ld1             {v1.16b}, [x10], x2
        ld1             {v2.16b}, [x1], x2
        uaddlp          v0.8h, v0.16b
        ld1             {v3.16b}, [x10], x2
        uaddlp          v1.8h, v1.16b
        uaddlp          v2.8h, v2.16b
        uaddlp          v3.8h, v3.16b
        add             v0.8h, v0.8h, v1.8h
        add             v2.8h, v2.8h, v3.8h
        shl             v0.8h, v0.8h, #1
        shl             v1.8h, v2.8h, #1
        subs            w8, w8, #2
        st1             {v0.8h, v1.8h}, [x0], #32
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        b.gt            1b
        mov             v0.16b, v1.16b // v0 = v1 = last output row, for the padding loop
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_420_w8_wpad):
1:      // Copy and subsample input, padding 4
        ld1             {v0.8b}, [x1], x2
        ld1             {v1.8b}, [x10], x2
        ld1             {v0.d}[1], [x1], x2
        ld1             {v1.d}[1], [x10], x2
        uaddlp          v0.8h, v0.16b
        uaddlp          v1.8h, v1.16b
        add             v0.8h, v0.8h, v1.8h
        shl             v0.8h, v0.8h, #1
        dup             v1.4h, v0.h[3]
        dup             v3.4h, v0.h[7]
        trn2            v2.2d, v0.2d, v0.2d
        subs            w8, w8, #2
        st1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
        add             v16.4h, v16.4h, v0.4h
        add             v17.4h, v17.4h, v1.4h
        add             v18.4h, v18.4h, v2.4h
        add             v19.4h, v19.4h, v3.4h
        b.gt            1b
        // v0 = v1 = last output row (4 computed + 4 padded entries)
        trn1            v0.2d, v2.2d, v3.2d
        trn1            v1.2d, v2.2d, v3.2d

L(ipred_cfl_ac_420_w8_hpad):
        cbz             w4, 3f
2:      // Vertical padding (h_pad > 0)
        subs            w4, w4, #4
        st1             {v0.8h, v1.8h}, [x0], #32
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        st1             {v0.8h, v1.8h}, [x0], #32
        add             v18.8h, v18.8h, v0.8h
        add             v19.8h, v19.8h, v1.8h
        b.gt            2b
3:

L(ipred_cfl_ac_420_w8_calc_subtract_dc):
        // Aggregate the sums
        add             v0.8h, v16.8h, v17.8h
        add             v2.8h, v18.8h, v19.8h
        uaddlp          v0.4s, v0.8h
        uaddlp          v2.4s, v2.8h
        add             v0.4s, v0.4s, v2.4s
        addv            s0, v0.4s // sum
        sub             x0, x0, w6, uxtw #4 // rewind ac: w6 rows * 16 bytes
        urshl           v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz
        dup             v4.8h, v4.h[0]
L(ipred_cfl_ac_420_w8_subtract_dc):
6:      // Subtract dc from ac
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
        subs            w6, w6, #4
        sub             v0.8h, v0.8h, v4.8h
        sub             v1.8h, v1.8h, v4.8h
        sub             v2.8h, v2.8h, v4.8h
        sub             v3.8h, v3.8h, v4.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            6b
        ret

L(ipred_cfl_ac_420_w16):
        adr             x7, L(ipred_cfl_ac_420_w16_tbl)
        ldrh            w3, [x7, w3, uxtw #1]
        sub             x7, x7, w3, uxtw
        br              x7

L(ipred_cfl_ac_420_w16_wpad0):
1:      // Copy and subsample input, without padding
        ld1             {v0.16b, v1.16b}, [x1], x2
        ld1             {v2.16b, v3.16b}, [x10], x2
        uaddlp          v0.8h, v0.16b
        ld1             {v4.16b, v5.16b}, [x1], x2
        uaddlp          v1.8h, v1.16b
        ld1             {v6.16b, v7.16b}, [x10], x2
        uaddlp          v2.8h, v2.16b
        uaddlp          v3.8h, v3.16b
        uaddlp          v4.8h, v4.16b
        uaddlp          v5.8h, v5.16b
        uaddlp          v6.8h, v6.16b
        uaddlp          v7.8h, v7.16b
        add             v0.8h, v0.8h, v2.8h
        add             v1.8h, v1.8h, v3.8h
        add             v4.8h, v4.8h, v6.8h
        add             v5.8h, v5.8h, v7.8h
        shl             v0.8h, v0.8h, #1
        shl             v1.8h, v1.8h, #1
        shl             v2.8h, v4.8h, #1
        shl             v3.8h, v5.8h, #1
        subs            w8, w8, #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        b.gt            1b
        mov             v0.16b, v2.16b
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad1):
1:      // Copy and subsample input, padding 4
        ldr             d1, [x1, #16]
        ld1             {v0.16b}, [x1], x2
        ldr             d3, [x10, #16]
        ld1             {v2.16b}, [x10], x2
        uaddlp          v1.4h, v1.8b
        ldr             d5, [x1, #16]
        uaddlp          v0.8h, v0.16b
        ld1             {v4.16b}, [x1], x2
        uaddlp          v3.4h, v3.8b
        ldr             d7, [x10, #16]
        uaddlp          v2.8h, v2.16b
        ld1             {v6.16b}, [x10], x2
        uaddlp          v5.4h, v5.8b
        uaddlp          v4.8h, v4.16b
        uaddlp          v7.4h, v7.8b
        uaddlp          v6.8h, v6.16b
        add             v1.4h, v1.4h, v3.4h
        add             v0.8h, v0.8h, v2.8h
        add             v5.4h, v5.4h, v7.4h
        add             v4.8h, v4.8h, v6.8h
        shl             v1.4h, v1.4h, #1
        shl             v0.8h, v0.8h, #1
        shl             v3.4h, v5.4h, #1
        shl             v2.8h, v4.8h, #1
        dup             v4.4h, v1.h[3]
        dup             v5.4h, v3.h[3]
        trn1            v1.2d, v1.2d, v4.2d
        trn1            v3.2d, v3.2d, v5.2d
        subs            w8, w8, #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        b.gt            1b
        mov             v0.16b, v2.16b
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad2):
1:      // Copy and subsample input, padding 8
        ld1             {v0.16b}, [x1], x2
        ld1             {v2.16b}, [x10], x2
        ld1             {v4.16b}, [x1], x2
        uaddlp          v0.8h, v0.16b
        ld1             {v6.16b}, [x10], x2
        uaddlp          v2.8h, v2.16b
        uaddlp          v4.8h, v4.16b
        uaddlp          v6.8h, v6.16b
        add             v0.8h, v0.8h, v2.8h
        add             v4.8h, v4.8h, v6.8h
        shl             v0.8h, v0.8h, #1
        shl             v2.8h, v4.8h, #1
        dup             v1.8h, v0.h[7]
        dup             v3.8h, v2.h[7]
        subs            w8, w8, #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        b.gt            1b
        mov             v0.16b, v2.16b
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad3):
1:      // Copy and subsample input, padding 12
        ld1             {v0.8b}, [x1], x2
        ld1             {v2.8b}, [x10], x2
        ld1             {v4.8b}, [x1], x2
        uaddlp          v0.4h, v0.8b
        ld1             {v6.8b}, [x10], x2
        uaddlp          v2.4h, v2.8b
        uaddlp          v4.4h, v4.8b
        uaddlp          v6.4h, v6.8b
        add             v0.4h, v0.4h, v2.4h
        add             v4.4h, v4.4h, v6.4h
        shl             v0.4h, v0.4h, #1
        shl             v2.4h, v4.4h, #1
        dup             v1.8h, v0.h[3]
        dup             v3.8h, v2.h[3]
        trn1            v0.2d, v0.2d, v1.2d
        trn1            v2.2d, v2.2d, v3.2d
        subs            w8, w8, #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        b.gt            1b
        mov             v0.16b, v2.16b
        mov             v1.16b, v3.16b

L(ipred_cfl_ac_420_w16_hpad):
        // On entry v0/v1 and v2/v3 both hold the last output row, so every
        // store below writes two padded rows.
        cbz             w4, 3f
2:      // Vertical padding (h_pad > 0)
        subs            w4, w4, #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        b.gt            2b
3:

        // Double the height and reuse the w8 summing/subtracting
        lsl             w6, w6, #1
        b               L(ipred_cfl_ac_420_w8_calc_subtract_dc)

L(ipred_cfl_ac_420_tbl):
        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
        .hword 0

L(ipred_cfl_ac_420_w16_tbl):
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
endfunc

// void ipred_cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx,
//                                 const ptrdiff_t stride, const int w_pad,
//                                 const int h_pad, const int cw, const int ch);
function ipred_cfl_ac_422_8bpc_neon, export=1
        clz             w8, w5
        lsl             w4, w4, #2
        adr             x7, L(ipred_cfl_ac_422_tbl)
        sub             w8, w8, #27
        ldrh            w8, [x7, w8, uxtw #1]
        movi            v16.8h, #0
        movi            v17.8h, #0
        movi            v18.8h, #0
        movi            v19.8h, #0
        sub             x7, x7, w8, uxtw
        sub             w8, w6, w4 // height - h_pad
        rbit            w9, w5 // rbit(width)
        rbit            w10, w6 // rbit(height)
        clz             w9, w9 // ctz(width)
        clz             w10, w10 // ctz(height)
        add             w9, w9, w10 // log2sz
        add             x10, x1, x2
        dup             v31.4s, w9
        lsl             x2, x2, #1
        neg             v31.4s, v31.4s // -log2sz
        br              x7

L(ipred_cfl_ac_422_w4):
1:      // Copy and subsample input
        ld1             {v0.8b}, [x1], x2
        ld1             {v0.d}[1], [x10], x2
        ld1             {v1.8b}, [x1], x2
        ld1             {v1.d}[1], [x10], x2
        uaddlp          v0.8h, v0.16b
        uaddlp          v1.8h, v1.16b
        shl             v0.8h, v0.8h, #2
        shl             v1.8h, v1.8h, #2
        subs            w8, w8, #4
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            1b
        trn2            v0.2d, v1.2d, v1.2d
        trn2            v1.2d, v1.2d, v1.2d
        b               L(ipred_cfl_ac_420_w4_hpad)

L(ipred_cfl_ac_422_w8):
        cbnz            w3, L(ipred_cfl_ac_422_w8_wpad)
1:      // Copy and subsample input, without padding
        ld1             {v0.16b}, [x1], x2
        ld1             {v1.16b}, [x10], x2
        ld1             {v2.16b}, [x1], x2
uaddlp v0.8h, v0.16b 2316 ld1 {v3.16b}, [x10], x2 2317 uaddlp v1.8h, v1.16b 2318 uaddlp v2.8h, v2.16b 2319 uaddlp v3.8h, v3.16b 2320 shl v0.8h, v0.8h, #2 2321 shl v1.8h, v1.8h, #2 2322 shl v2.8h, v2.8h, #2 2323 shl v3.8h, v3.8h, #2 2324 subs w8, w8, #4 2325 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2326 add v16.8h, v16.8h, v0.8h 2327 add v17.8h, v17.8h, v1.8h 2328 add v18.8h, v18.8h, v2.8h 2329 add v19.8h, v19.8h, v3.8h 2330 b.gt 1b 2331 mov v0.16b, v3.16b 2332 mov v1.16b, v3.16b 2333 b L(ipred_cfl_ac_420_w8_hpad) 2334 2335L(ipred_cfl_ac_422_w8_wpad): 23361: // Copy and subsample input, padding 4 2337 ld1 {v0.8b}, [x1], x2 2338 ld1 {v0.d}[1], [x10], x2 2339 ld1 {v2.8b}, [x1], x2 2340 ld1 {v2.d}[1], [x10], x2 2341 uaddlp v0.8h, v0.16b 2342 uaddlp v2.8h, v2.16b 2343 shl v0.8h, v0.8h, #2 2344 shl v2.8h, v2.8h, #2 2345 dup v4.4h, v0.h[3] 2346 dup v5.8h, v0.h[7] 2347 dup v6.4h, v2.h[3] 2348 dup v7.8h, v2.h[7] 2349 trn2 v1.2d, v0.2d, v5.2d 2350 trn1 v0.2d, v0.2d, v4.2d 2351 trn2 v3.2d, v2.2d, v7.2d 2352 trn1 v2.2d, v2.2d, v6.2d 2353 subs w8, w8, #4 2354 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2355 add v16.8h, v16.8h, v0.8h 2356 add v17.8h, v17.8h, v1.8h 2357 add v18.8h, v18.8h, v2.8h 2358 add v19.8h, v19.8h, v3.8h 2359 b.gt 1b 2360 mov v0.16b, v3.16b 2361 mov v1.16b, v3.16b 2362 b L(ipred_cfl_ac_420_w8_hpad) 2363 2364L(ipred_cfl_ac_422_w16): 2365 adr x7, L(ipred_cfl_ac_422_w16_tbl) 2366 ldrh w3, [x7, w3, uxtw #1] 2367 sub x7, x7, w3, uxtw 2368 br x7 2369 2370L(ipred_cfl_ac_422_w16_wpad0): 23711: // Copy and subsample input, without padding 2372 ld1 {v0.16b, v1.16b}, [x1], x2 2373 ld1 {v2.16b, v3.16b}, [x10], x2 2374 uaddlp v0.8h, v0.16b 2375 uaddlp v1.8h, v1.16b 2376 uaddlp v2.8h, v2.16b 2377 uaddlp v3.8h, v3.16b 2378 shl v0.8h, v0.8h, #2 2379 shl v1.8h, v1.8h, #2 2380 shl v2.8h, v2.8h, #2 2381 shl v3.8h, v3.8h, #2 2382 subs w8, w8, #2 2383 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2384 add v16.8h, v16.8h, v0.8h 2385 add v17.8h, v17.8h, v1.8h 2386 add v18.8h, 
v18.8h, v2.8h 2387 add v19.8h, v19.8h, v3.8h 2388 b.gt 1b 2389 mov v0.16b, v2.16b 2390 mov v1.16b, v3.16b 2391 b L(ipred_cfl_ac_420_w16_hpad) 2392 2393L(ipred_cfl_ac_422_w16_wpad1): 23941: // Copy and subsample input, padding 4 2395 ldr d1, [x1, #16] 2396 ld1 {v0.16b}, [x1], x2 2397 ldr d3, [x10, #16] 2398 ld1 {v2.16b}, [x10], x2 2399 uaddlp v1.4h, v1.8b 2400 uaddlp v0.8h, v0.16b 2401 uaddlp v3.4h, v3.8b 2402 uaddlp v2.8h, v2.16b 2403 shl v1.4h, v1.4h, #2 2404 shl v0.8h, v0.8h, #2 2405 shl v3.4h, v3.4h, #2 2406 shl v2.8h, v2.8h, #2 2407 dup v4.4h, v1.h[3] 2408 dup v5.4h, v3.h[3] 2409 trn1 v1.2d, v1.2d, v4.2d 2410 trn1 v3.2d, v3.2d, v5.2d 2411 subs w8, w8, #2 2412 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2413 add v16.8h, v16.8h, v0.8h 2414 add v17.8h, v17.8h, v1.8h 2415 add v18.8h, v18.8h, v2.8h 2416 add v19.8h, v19.8h, v3.8h 2417 b.gt 1b 2418 mov v0.16b, v2.16b 2419 mov v1.16b, v3.16b 2420 b L(ipred_cfl_ac_420_w16_hpad) 2421 2422L(ipred_cfl_ac_422_w16_wpad2): 24231: // Copy and subsample input, padding 8 2424 ld1 {v0.16b}, [x1], x2 2425 ld1 {v2.16b}, [x10], x2 2426 uaddlp v0.8h, v0.16b 2427 uaddlp v2.8h, v2.16b 2428 shl v0.8h, v0.8h, #2 2429 shl v2.8h, v2.8h, #2 2430 dup v1.8h, v0.h[7] 2431 dup v3.8h, v2.h[7] 2432 subs w8, w8, #2 2433 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2434 add v16.8h, v16.8h, v0.8h 2435 add v17.8h, v17.8h, v1.8h 2436 add v18.8h, v18.8h, v2.8h 2437 add v19.8h, v19.8h, v3.8h 2438 b.gt 1b 2439 mov v0.16b, v2.16b 2440 mov v1.16b, v3.16b 2441 b L(ipred_cfl_ac_420_w16_hpad) 2442 2443L(ipred_cfl_ac_422_w16_wpad3): 24441: // Copy and subsample input, padding 12 2445 ld1 {v0.8b}, [x1], x2 2446 ld1 {v2.8b}, [x10], x2 2447 uaddlp v0.4h, v0.8b 2448 uaddlp v2.4h, v2.8b 2449 shl v0.4h, v0.4h, #2 2450 shl v2.4h, v2.4h, #2 2451 dup v1.8h, v0.h[3] 2452 dup v3.8h, v2.h[3] 2453 trn1 v0.2d, v0.2d, v1.2d 2454 trn1 v2.2d, v2.2d, v3.2d 2455 subs w8, w8, #2 2456 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2457 add v16.8h, v16.8h, v0.8h 2458 add v17.8h, 
v17.8h, v1.8h 2459 add v18.8h, v18.8h, v2.8h 2460 add v19.8h, v19.8h, v3.8h 2461 b.gt 1b 2462 mov v0.16b, v2.16b 2463 mov v1.16b, v3.16b 2464 b L(ipred_cfl_ac_420_w16_hpad) 2465 2466L(ipred_cfl_ac_422_tbl): 2467 .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16) 2468 .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8) 2469 .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4) 2470 .hword 0 2471 2472L(ipred_cfl_ac_422_w16_tbl): 2473 .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0) 2474 .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1) 2475 .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2) 2476 .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3) 2477endfunc 2478 2479// void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx, 2480// const ptrdiff_t stride, const int w_pad, 2481// const int h_pad, const int cw, const int ch); 2482function ipred_cfl_ac_444_8bpc_neon, export=1 2483 clz w8, w5 2484 lsl w4, w4, #2 2485 adr x7, L(ipred_cfl_ac_444_tbl) 2486 sub w8, w8, #26 2487 ldrh w8, [x7, w8, uxtw #1] 2488 movi v16.8h, #0 2489 movi v17.8h, #0 2490 movi v18.8h, #0 2491 movi v19.8h, #0 2492 sub x7, x7, w8, uxtw 2493 sub w8, w6, w4 // height - h_pad 2494 rbit w9, w5 // rbit(width) 2495 rbit w10, w6 // rbit(height) 2496 clz w9, w9 // ctz(width) 2497 clz w10, w10 // ctz(height) 2498 add w9, w9, w10 // log2sz 2499 add x10, x1, x2 2500 dup v31.4s, w9 2501 lsl x2, x2, #1 2502 neg v31.4s, v31.4s // -log2sz 2503 br x7 2504 2505L(ipred_cfl_ac_444_w4): 25061: // Copy and expand input 2507 ld1 {v0.s}[0], [x1], x2 2508 ld1 {v0.s}[1], [x10], x2 2509 ld1 {v1.s}[0], [x1], x2 2510 ld1 {v1.s}[1], [x10], x2 2511 ushll v0.8h, v0.8b, #3 2512 ushll v1.8h, v1.8b, #3 2513 subs w8, w8, #4 2514 add v16.8h, v16.8h, v0.8h 2515 add v17.8h, v17.8h, v1.8h 2516 st1 {v0.8h, v1.8h}, [x0], #32 2517 b.gt 1b 2518 trn2 v0.2d, v1.2d, v1.2d 2519 trn2 v1.2d, v1.2d, v1.2d 2520 b L(ipred_cfl_ac_420_w4_hpad) 2521 
2522L(ipred_cfl_ac_444_w8): 25231: // Copy and expand input 2524 ld1 {v0.8b}, [x1], x2 2525 ld1 {v1.8b}, [x10], x2 2526 ld1 {v2.8b}, [x1], x2 2527 ushll v0.8h, v0.8b, #3 2528 ld1 {v3.8b}, [x10], x2 2529 ushll v1.8h, v1.8b, #3 2530 ushll v2.8h, v2.8b, #3 2531 ushll v3.8h, v3.8b, #3 2532 subs w8, w8, #4 2533 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2534 add v16.8h, v16.8h, v0.8h 2535 add v17.8h, v17.8h, v1.8h 2536 add v18.8h, v18.8h, v2.8h 2537 add v19.8h, v19.8h, v3.8h 2538 b.gt 1b 2539 mov v0.16b, v3.16b 2540 mov v1.16b, v3.16b 2541 b L(ipred_cfl_ac_420_w8_hpad) 2542 2543L(ipred_cfl_ac_444_w16): 2544 cbnz w3, L(ipred_cfl_ac_444_w16_wpad) 25451: // Copy and expand input, without padding 2546 ld1 {v0.16b}, [x1], x2 2547 ld1 {v2.16b}, [x10], x2 2548 ld1 {v4.16b}, [x1], x2 2549 ushll2 v1.8h, v0.16b, #3 2550 ushll v0.8h, v0.8b, #3 2551 ld1 {v6.16b}, [x10], x2 2552 ushll2 v3.8h, v2.16b, #3 2553 ushll v2.8h, v2.8b, #3 2554 ushll2 v5.8h, v4.16b, #3 2555 ushll v4.8h, v4.8b, #3 2556 ushll2 v7.8h, v6.16b, #3 2557 ushll v6.8h, v6.8b, #3 2558 subs w8, w8, #4 2559 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2560 add v16.8h, v16.8h, v0.8h 2561 add v17.8h, v17.8h, v1.8h 2562 add v18.8h, v18.8h, v2.8h 2563 add v19.8h, v19.8h, v3.8h 2564 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 2565 add v16.8h, v16.8h, v4.8h 2566 add v17.8h, v17.8h, v5.8h 2567 add v18.8h, v18.8h, v6.8h 2568 add v19.8h, v19.8h, v7.8h 2569 b.gt 1b 2570 mov v0.16b, v6.16b 2571 mov v1.16b, v7.16b 2572 mov v2.16b, v6.16b 2573 mov v3.16b, v7.16b 2574 b L(ipred_cfl_ac_420_w16_hpad) 2575 2576L(ipred_cfl_ac_444_w16_wpad): 25771: // Copy and expand input, padding 8 2578 ld1 {v0.8b}, [x1], x2 2579 ld1 {v2.8b}, [x10], x2 2580 ld1 {v4.8b}, [x1], x2 2581 ld1 {v6.8b}, [x10], x2 2582 ushll v0.8h, v0.8b, #3 2583 ushll v2.8h, v2.8b, #3 2584 ushll v4.8h, v4.8b, #3 2585 ushll v6.8h, v6.8b, #3 2586 dup v1.8h, v0.h[7] 2587 dup v3.8h, v2.h[7] 2588 dup v5.8h, v4.h[7] 2589 dup v7.8h, v6.h[7] 2590 subs w8, w8, #4 2591 st1 {v0.8h, 
v1.8h, v2.8h, v3.8h}, [x0], #64 2592 add v16.8h, v16.8h, v0.8h 2593 add v17.8h, v17.8h, v1.8h 2594 add v18.8h, v18.8h, v2.8h 2595 add v19.8h, v19.8h, v3.8h 2596 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 2597 add v16.8h, v16.8h, v4.8h 2598 add v17.8h, v17.8h, v5.8h 2599 add v18.8h, v18.8h, v6.8h 2600 add v19.8h, v19.8h, v7.8h 2601 b.gt 1b 2602 mov v0.16b, v6.16b 2603 mov v1.16b, v7.16b 2604 mov v2.16b, v6.16b 2605 mov v3.16b, v7.16b 2606 b L(ipred_cfl_ac_420_w16_hpad) 2607 2608L(ipred_cfl_ac_444_w32): 2609 adr x7, L(ipred_cfl_ac_444_w32_tbl) 2610 ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1 2611 sub x7, x7, w3, uxtw 2612 br x7 2613 2614L(ipred_cfl_ac_444_w32_wpad0): 26151: // Copy and expand input, without padding 2616 ld1 {v2.16b, v3.16b}, [x1], x2 2617 ld1 {v6.16b, v7.16b}, [x10], x2 2618 ushll v0.8h, v2.8b, #3 2619 ushll2 v1.8h, v2.16b, #3 2620 ushll v2.8h, v3.8b, #3 2621 ushll2 v3.8h, v3.16b, #3 2622 ushll v4.8h, v6.8b, #3 2623 ushll2 v5.8h, v6.16b, #3 2624 ushll v6.8h, v7.8b, #3 2625 ushll2 v7.8h, v7.16b, #3 2626 subs w8, w8, #2 2627 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2628 add v16.8h, v16.8h, v0.8h 2629 add v17.8h, v17.8h, v1.8h 2630 add v18.8h, v18.8h, v2.8h 2631 add v19.8h, v19.8h, v3.8h 2632 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 2633 add v16.8h, v16.8h, v4.8h 2634 add v17.8h, v17.8h, v5.8h 2635 add v18.8h, v18.8h, v6.8h 2636 add v19.8h, v19.8h, v7.8h 2637 b.gt 1b 2638 b L(ipred_cfl_ac_444_w32_hpad) 2639 2640L(ipred_cfl_ac_444_w32_wpad2): 26411: // Copy and expand input, padding 8 2642 ldr d2, [x1, #16] 2643 ld1 {v1.16b}, [x1], x2 2644 ldr d6, [x10, #16] 2645 ld1 {v5.16b}, [x10], x2 2646 ushll v2.8h, v2.8b, #3 2647 ushll v0.8h, v1.8b, #3 2648 ushll2 v1.8h, v1.16b, #3 2649 ushll v6.8h, v6.8b, #3 2650 ushll v4.8h, v5.8b, #3 2651 ushll2 v5.8h, v5.16b, #3 2652 dup v3.8h, v2.h[7] 2653 dup v7.8h, v6.h[7] 2654 subs w8, w8, #2 2655 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2656 add v16.8h, v16.8h, v0.8h 2657 add v17.8h, v17.8h, v1.8h 2658 add v18.8h, 
v18.8h, v2.8h 2659 add v19.8h, v19.8h, v3.8h 2660 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 2661 add v16.8h, v16.8h, v4.8h 2662 add v17.8h, v17.8h, v5.8h 2663 add v18.8h, v18.8h, v6.8h 2664 add v19.8h, v19.8h, v7.8h 2665 b.gt 1b 2666 b L(ipred_cfl_ac_444_w32_hpad) 2667 2668L(ipred_cfl_ac_444_w32_wpad4): 26691: // Copy and expand input, padding 16 2670 ld1 {v1.16b}, [x1], x2 2671 ld1 {v5.16b}, [x10], x2 2672 ushll v0.8h, v1.8b, #3 2673 ushll2 v1.8h, v1.16b, #3 2674 ushll v4.8h, v5.8b, #3 2675 ushll2 v5.8h, v5.16b, #3 2676 dup v2.8h, v1.h[7] 2677 dup v3.8h, v1.h[7] 2678 dup v6.8h, v5.h[7] 2679 dup v7.8h, v5.h[7] 2680 subs w8, w8, #2 2681 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2682 add v16.8h, v16.8h, v0.8h 2683 add v17.8h, v17.8h, v1.8h 2684 add v18.8h, v18.8h, v2.8h 2685 add v19.8h, v19.8h, v3.8h 2686 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 2687 add v16.8h, v16.8h, v4.8h 2688 add v17.8h, v17.8h, v5.8h 2689 add v18.8h, v18.8h, v6.8h 2690 add v19.8h, v19.8h, v7.8h 2691 b.gt 1b 2692 b L(ipred_cfl_ac_444_w32_hpad) 2693 2694L(ipred_cfl_ac_444_w32_wpad6): 26951: // Copy and expand input, padding 24 2696 ld1 {v0.8b}, [x1], x2 2697 ld1 {v4.8b}, [x10], x2 2698 ushll v0.8h, v0.8b, #3 2699 ushll v4.8h, v4.8b, #3 2700 dup v1.8h, v0.h[7] 2701 dup v2.8h, v0.h[7] 2702 dup v3.8h, v0.h[7] 2703 dup v5.8h, v4.h[7] 2704 dup v6.8h, v4.h[7] 2705 dup v7.8h, v4.h[7] 2706 subs w8, w8, #2 2707 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 2708 add v16.8h, v16.8h, v0.8h 2709 add v17.8h, v17.8h, v1.8h 2710 add v18.8h, v18.8h, v2.8h 2711 add v19.8h, v19.8h, v3.8h 2712 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 2713 add v16.8h, v16.8h, v4.8h 2714 add v17.8h, v17.8h, v5.8h 2715 add v18.8h, v18.8h, v6.8h 2716 add v19.8h, v19.8h, v7.8h 2717 b.gt 1b 2718 2719L(ipred_cfl_ac_444_w32_hpad): 2720 cbz w4, 3f 27212: // Vertical padding (h_pad > 0) 2722 subs w4, w4, #2 2723 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 2724 add v16.8h, v16.8h, v4.8h 2725 add v17.8h, v17.8h, v5.8h 2726 add v18.8h, 
v18.8h, v6.8h 2727 add v19.8h, v19.8h, v7.8h 2728 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 2729 add v16.8h, v16.8h, v4.8h 2730 add v17.8h, v17.8h, v5.8h 2731 add v18.8h, v18.8h, v6.8h 2732 add v19.8h, v19.8h, v7.8h 2733 b.gt 2b 27343: 2735 2736 // Quadruple the height and reuse the w8 subtracting 2737 lsl w6, w6, #2 2738 // Aggregate the sums, with wider intermediates earlier than in 2739 // ipred_cfl_ac_420_w8_calc_subtract_dc. 2740 uaddlp v0.4s, v16.8h 2741 uaddlp v1.4s, v17.8h 2742 uaddlp v2.4s, v18.8h 2743 uaddlp v3.4s, v19.8h 2744 add v0.4s, v0.4s, v1.4s 2745 add v2.4s, v2.4s, v3.4s 2746 add v0.4s, v0.4s, v2.4s 2747 addv s0, v0.4s // sum 2748 sub x0, x0, w6, uxtw #4 2749 urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz 2750 dup v4.8h, v4.h[0] 2751 b L(ipred_cfl_ac_420_w8_subtract_dc) 2752 2753L(ipred_cfl_ac_444_tbl): 2754 .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32) 2755 .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16) 2756 .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8) 2757 .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4) 2758 2759L(ipred_cfl_ac_444_w32_tbl): 2760 .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0) 2761 .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2) 2762 .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4) 2763 .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6) 2764endfunc 2765