1/* 2 * Copyright © 2018, VideoLAN and dav1d authors 3 * Copyright © 2018, Janne Grunau 4 * Copyright © 2018, Martin Storsjo 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, this 11 * list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 21 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29#include "src/arm/asm.S" 30#include "util.S" 31 32.macro avg dst, t0, t1, t2, t3 33 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 34 ld1 {\t2\().8h,\t3\().8h}, [x3], 32 35 add \t0\().8h, \t0\().8h, \t2\().8h 36 add \t1\().8h, \t1\().8h, \t3\().8h 37 sqrshrun \dst\().8b, \t0\().8h, #5 38 sqrshrun2 \dst\().16b, \t1\().8h, #5 39.endm 40 41.macro w_avg dst, t0, t1, t2, t3 42 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 43 ld1 {\t2\().8h,\t3\().8h}, [x3], 32 44 sub \t0\().8h, \t2\().8h, \t0\().8h 45 sub \t1\().8h, \t3\().8h, \t1\().8h 46 sqdmulh \t0\().8h, \t0\().8h, v30.8h 47 sqdmulh \t1\().8h, \t1\().8h, v30.8h 48 add \t0\().8h, \t2\().8h, \t0\().8h 49 add \t1\().8h, \t3\().8h, \t1\().8h 50 sqrshrun \dst\().8b, \t0\().8h, #4 51 sqrshrun2 \dst\().16b, \t1\().8h, #4 52.endm 53 54.macro mask dst, t0, t1, t2, t3 55 ld1 {v30.16b}, [x6], 16 56 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 57 mul v30.16b, v30.16b, v31.16b 58 ld1 {\t2\().8h,\t3\().8h}, [x3], 32 59 shll v28.8h, v30.8b, #8 60 shll2 v29.8h, v30.16b, #8 61 sub \t0\().8h, \t2\().8h, \t0\().8h 62 sub \t1\().8h, \t3\().8h, \t1\().8h 63 sqdmulh \t0\().8h, \t0\().8h, v28.8h 64 sqdmulh \t1\().8h, \t1\().8h, v29.8h 65 add \t0\().8h, \t2\().8h, \t0\().8h 66 add \t1\().8h, \t3\().8h, \t1\().8h 67 sqrshrun \dst\().8b, \t0\().8h, #4 68 sqrshrun2 \dst\().16b, \t1\().8h, #4 69.endm 70 71.macro bidir_fn type 72function \type\()_8bpc_neon, export=1 73 clz w4, w4 74.ifc \type, w_avg 75 dup v30.8h, w6 76 neg v30.8h, v30.8h 77 shl v30.8h, v30.8h, #11 78.endif 79.ifc \type, mask 80 movi v31.16b, #256-2 81.endif 82 adr x7, L(\type\()_tbl) 83 sub w4, w4, #24 84 ldrh w4, [x7, x4, lsl #1] 85 \type v4, v0, v1, v2, v3 86 sub x7, x7, w4, uxtw 87 br x7 8840: 89 AARCH64_VALID_JUMP_TARGET 90 add x7, x0, x1 91 lsl x1, x1, #1 924: 93 cmp w5, #4 94 st1 {v4.s}[0], [x0], x1 95 st1 {v4.s}[1], [x7], x1 96 st1 {v4.s}[2], [x0], x1 97 st1 {v4.s}[3], [x7], x1 98 b.eq 0f 99 \type v5, v0, v1, v2, v3 100 cmp w5, #8 101 st1 {v5.s}[0], [x0], x1 102 st1 {v5.s}[1], [x7], x1 103 st1 {v5.s}[2], [x0], x1 104 st1 {v5.s}[3], [x7], x1 105 b.eq 0f 106 \type v4, v0, v1, v2, v3 107 st1 {v4.s}[0], [x0], x1 108 st1 {v4.s}[1], [x7], x1 109 \type v5, v0, v1, v2, v3 110 st1 {v4.s}[2], [x0], x1 111 st1 {v4.s}[3], [x7], x1 112 st1 {v5.s}[0], [x0], x1 113 st1 {v5.s}[1], [x7], x1 114 st1 {v5.s}[2], [x0], x1 115 st1 {v5.s}[3], [x7], x1 116 ret 11780: 118 AARCH64_VALID_JUMP_TARGET 119 add x7, x0, x1 120 lsl x1, x1, #1 1218: 122 st1 {v4.d}[0], [x0], x1 123 \type v5, v0, v1, v2, v3 124 st1 {v4.d}[1], [x7], x1 125 st1 {v5.d}[0], [x0], x1 126 subs w5, w5, #4 127 st1 {v5.d}[1], [x7], x1 128 b.le 0f 129 \type v4, v0, v1, v2, v3 130 b 8b 13116: 132 AARCH64_VALID_JUMP_TARGET 133 \type v5, v0, v1, v2, v3 134 st1 {v4.16b}, [x0], x1 135 \type v6, v0, v1, v2, v3 136 st1 {v5.16b}, [x0], x1 137 \type v7, v0, v1, v2, v3 138 st1 {v6.16b}, [x0], x1 139 subs w5, w5, #4 140 st1 {v7.16b}, [x0], x1 141 b.le 0f 142 \type v4, v0, v1, v2, v3 143 b 16b 144320: 145 AARCH64_VALID_JUMP_TARGET 146 add x7, x0, x1 147 lsl x1, x1, #1 14832: 149 \type v5, v0, v1, v2, v3 150 \type v6, v0, v1, v2, v3 151 st1 {v4.16b,v5.16b}, [x0], x1 152 \type v7, v0, v1, v2, v3 153 subs w5, w5, #2 154 st1 {v6.16b,v7.16b}, [x7], x1 155 b.le 0f 156 \type v4, v0, v1, v2, v3 157 b 32b 158640: 159 AARCH64_VALID_JUMP_TARGET 160 add x7, x0, x1 161 lsl x1, x1, #1 16264: 163 \type v5, v0, v1, v2, v3 164 \type v6, v0, v1, v2, v3 165 \type v7, v0, v1, v2, v3 166 \type v16, v0, v1, v2, v3 167 \type v17, v0, v1, v2, v3 168 st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1 169 \type v18, v0, v1, v2, v3 170 \type v19, v0, v1, v2, v3 171 subs w5, w5, #2 172 st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1 173 b.le 0f 174 \type v4, v0, v1, v2, v3 175 b 64b 1761280: 177 AARCH64_VALID_JUMP_TARGET 178 add x7, x0, #64 179128: 180 \type v5, v0, v1, v2, v3 181 \type v6, v0, v1, v2, v3 182 \type v7, v0, v1, v2, v3 183 \type v16, v0, v1, v2, v3 184 \type v17, v0, v1, v2, v3 185 st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1 186 \type v18, v0, v1, v2, v3 187 \type v19, v0, v1, v2, v3 188 subs w5, w5, #1 189 st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1 190 b.le 0f 191 \type v4, v0, v1, v2, v3 192 b 128b 1930: 194 ret 195L(\type\()_tbl): 196 .hword L(\type\()_tbl) - 1280b 197 .hword L(\type\()_tbl) - 640b 198 .hword L(\type\()_tbl) - 320b 199 .hword L(\type\()_tbl) - 16b 200 .hword L(\type\()_tbl) - 80b 201 .hword L(\type\()_tbl) - 40b 202endfunc 203.endm 204 205bidir_fn avg 206bidir_fn w_avg 207bidir_fn mask 208 209 210.macro w_mask_fn type 211function w_mask_\type\()_8bpc_neon, export=1 212 clz w8, w4 213 adr x9, L(w_mask_\type\()_tbl) 214 sub w8, w8, #24 215 ldrh w8, [x9, x8, lsl #1] 216 sub x9, x9, w8, uxtw 217 mov w10, #6903 218 dup v0.8h, w10 219.if \type == 444 220 movi v1.16b, #64 221.elseif \type == 422 222 dup v2.8b, w7 223 movi v3.8b, #129 224 sub v3.8b, v3.8b, v2.8b 225.elseif \type == 420 226 dup v2.8h, w7 227 movi v3.8h, #1, lsl #8 228 sub v3.8h, v3.8h, v2.8h 229.endif 230 add x12, x0, x1 231 lsl x1, x1, #1 232 br x9 2334: 234 AARCH64_VALID_JUMP_TARGET 235 ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once) 236 ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once) 237 subs w5, w5, #4 238 sub v16.8h, v6.8h, v4.8h 239 sub v17.8h, v7.8h, v5.8h 240 sabd v18.8h, v4.8h, v6.8h 241 sabd v19.8h, v5.8h, v7.8h 242 uqsub v18.8h, v0.8h, v18.8h 243 uqsub v19.8h, v0.8h, v19.8h 244 ushr v18.8h, v18.8h, #8 245 ushr v19.8h, v19.8h, #8 246 shl v20.8h, v18.8h, #9 247 shl v21.8h, v19.8h, #9 248 sqdmulh v20.8h, v20.8h, v16.8h 249 sqdmulh v21.8h, v21.8h, v17.8h 250 add v20.8h, v20.8h, v4.8h 251 add v21.8h, v21.8h, v5.8h 252 sqrshrun v22.8b, v20.8h, #4 253 sqrshrun v23.8b, v21.8h, #4 254.if \type == 444 255 uzp1 v18.16b, v18.16b, v19.16b // Same as xtn, xtn2 256 sub v18.16b, v1.16b, v18.16b 257 st1 {v18.16b}, [x6], #16 258.elseif \type == 422 259 addp v18.8h, v18.8h, v19.8h 260 xtn v18.8b, v18.8h 261 uhsub v18.8b, v3.8b, v18.8b 262 st1 {v18.8b}, [x6], #8 263.elseif \type == 420 264 trn1 v24.2d, v18.2d, v19.2d 265 trn2 v25.2d, v18.2d, v19.2d 266 add v24.8h, v24.8h, v25.8h 267 addp v18.8h, v24.8h, v24.8h 268 sub v18.4h, v3.4h, v18.4h 269 rshrn v18.8b, v18.8h, #2 270 st1 {v18.s}[0], [x6], #4 271.endif 272 st1 {v22.s}[0], [x0], x1 273 st1 {v22.s}[1], [x12], x1 274 st1 {v23.s}[0], [x0], x1 275 st1 {v23.s}[1], [x12], x1 276 b.gt 4b 277 ret 2788: 279 AARCH64_VALID_JUMP_TARGET 280 ld1 {v4.8h, v5.8h}, [x2], #32 281 ld1 {v6.8h, v7.8h}, [x3], #32 282 subs w5, w5, #2 283 sub v16.8h, v6.8h, v4.8h 284 sub v17.8h, v7.8h, v5.8h 285 sabd v18.8h, v4.8h, v6.8h 286 sabd v19.8h, v5.8h, v7.8h 287 uqsub v18.8h, v0.8h, v18.8h 288 uqsub v19.8h, v0.8h, v19.8h 289 ushr v18.8h, v18.8h, #8 290 ushr v19.8h, v19.8h, #8 291 shl v20.8h, v18.8h, #9 292 shl v21.8h, v19.8h, #9 293 sqdmulh v20.8h, v20.8h, v16.8h 294 sqdmulh v21.8h, v21.8h, v17.8h 295 add v20.8h, v20.8h, v4.8h 296 add v21.8h, v21.8h, v5.8h 297 sqrshrun v22.8b, v20.8h, #4 298 sqrshrun v23.8b, v21.8h, #4 299.if \type == 444 300 uzp1 v18.16b, v18.16b, v19.16b // Same as xtn, xtn2 301 sub v18.16b, v1.16b, v18.16b 302 st1 {v18.16b}, [x6], #16 303.elseif \type == 422 304 addp v18.8h, v18.8h, v19.8h 305 xtn v18.8b, v18.8h 306 uhsub v18.8b, v3.8b, v18.8b 307 st1 {v18.8b}, [x6], #8 308.elseif \type == 420 309 add v18.8h, v18.8h, v19.8h 310 addp v18.8h, v18.8h, v18.8h 311 sub v18.4h, v3.4h, v18.4h 312 rshrn v18.8b, v18.8h, #2 313 st1 {v18.s}[0], [x6], #4 314.endif 315 st1 {v22.8b}, [x0], x1 316 st1 {v23.8b}, [x12], x1 317 b.gt 8b 318 ret 3191280: 320640: 321320: 322160: 323 AARCH64_VALID_JUMP_TARGET 324 mov w11, w4 325 sub x1, x1, w4, uxtw 326.if \type == 444 327 add x10, x6, w4, uxtw 328.elseif \type == 422 329 add x10, x6, x11, lsr #1 330.endif 331 add x9, x3, w4, uxtw #1 332 add x7, x2, w4, uxtw #1 333161: 334 mov w8, w4 33516: 336 ld1 {v4.8h, v5.8h}, [x2], #32 337 ld1 {v6.8h, v7.8h}, [x3], #32 338 ld1 {v16.8h, v17.8h}, [x7], #32 339 ld1 {v18.8h, v19.8h}, [x9], #32 340 subs w8, w8, #16 341 sub v6.8h, v6.8h, v4.8h 342 sub v7.8h, v7.8h, v5.8h 343 sub v18.8h, v18.8h, v16.8h 344 sub v19.8h, v19.8h, v17.8h 345 abs v20.8h, v6.8h 346 abs v21.8h, v7.8h 347 abs v22.8h, v18.8h 348 abs v23.8h, v19.8h 349 uqsub v20.8h, v0.8h, v20.8h 350 uqsub v21.8h, v0.8h, v21.8h 351 uqsub v22.8h, v0.8h, v22.8h 352 uqsub v23.8h, v0.8h, v23.8h 353 ushr v20.8h, v20.8h, #8 354 ushr v21.8h, v21.8h, #8 355 ushr v22.8h, v22.8h, #8 356 ushr v23.8h, v23.8h, #8 357 shl v24.8h, v20.8h, #9 358 shl v25.8h, v21.8h, #9 359 shl v26.8h, v22.8h, #9 360 shl v27.8h, v23.8h, #9 361 sqdmulh v24.8h, v24.8h, v6.8h 362 sqdmulh v25.8h, v25.8h, v7.8h 363 sqdmulh v26.8h, v26.8h, v18.8h 364 sqdmulh v27.8h, v27.8h, v19.8h 365 add v24.8h, v24.8h, v4.8h 366 add v25.8h, v25.8h, v5.8h 367 add v26.8h, v26.8h, v16.8h 368 add v27.8h, v27.8h, v17.8h 369 sqrshrun v24.8b, v24.8h, #4 370 sqrshrun v25.8b, v25.8h, #4 371 sqrshrun v26.8b, v26.8h, #4 372 sqrshrun v27.8b, v27.8h, #4 373.if \type == 444 374 uzp1 v20.16b, v20.16b, v21.16b // Same as xtn, xtn2 375 uzp1 v21.16b, v22.16b, v23.16b // Ditto 376 sub v20.16b, v1.16b, v20.16b 377 sub v21.16b, v1.16b, v21.16b 378 st1 {v20.16b}, [x6], #16 379 st1 {v21.16b}, [x10], #16 380.elseif \type == 422 381 addp v20.8h, v20.8h, v21.8h 382 addp v21.8h, v22.8h, v23.8h 383 xtn v20.8b, v20.8h 384 xtn v21.8b, v21.8h 385 uhsub v20.8b, v3.8b, v20.8b 386 uhsub v21.8b, v3.8b, v21.8b 387 st1 {v20.8b}, [x6], #8 388 st1 {v21.8b}, [x10], #8 389.elseif \type == 420 390 add v20.8h, v20.8h, v22.8h 391 add v21.8h, v21.8h, v23.8h 392 addp v20.8h, v20.8h, v21.8h 393 sub v20.8h, v3.8h, v20.8h 394 rshrn v20.8b, v20.8h, #2 395 st1 {v20.8b}, [x6], #8 396.endif 397 st1 {v24.8b, v25.8b}, [x0], #16 398 st1 {v26.8b, v27.8b}, [x12], #16 399 b.gt 16b 400 subs w5, w5, #2 401 add x2, x2, w4, uxtw #1 402 add x3, x3, w4, uxtw #1 403 add x7, x7, w4, uxtw #1 404 add x9, x9, w4, uxtw #1 405.if \type == 444 406 add x6, x6, w4, uxtw 407 add x10, x10, w4, uxtw 408.elseif \type == 422 409 add x6, x6, x11, lsr #1 410 add x10, x10, x11, lsr #1 411.endif 412 add x0, x0, x1 413 add x12, x12, x1 414 b.gt 161b 415 ret 416L(w_mask_\type\()_tbl): 417 .hword L(w_mask_\type\()_tbl) - 1280b 418 .hword L(w_mask_\type\()_tbl) - 640b 419 .hword L(w_mask_\type\()_tbl) - 320b 420 .hword L(w_mask_\type\()_tbl) - 160b 421 .hword L(w_mask_\type\()_tbl) - 8b 422 .hword L(w_mask_\type\()_tbl) - 4b 423endfunc 424.endm 425 426w_mask_fn 444 427w_mask_fn 422 428w_mask_fn 420 429 430 431function blend_8bpc_neon, export=1 432 adr x6, L(blend_tbl) 433 clz w3, w3 434 sub w3, w3, #26 435 ldrh w3, [x6, x3, lsl #1] 436 sub x6, x6, w3, uxtw 437 movi v4.16b, #64 438 add x8, x0, x1 439 lsl x1, x1, #1 440 br x6 4414: 442 AARCH64_VALID_JUMP_TARGET 443 ld1 {v2.8b}, [x5], #8 444 ld1 {v1.d}[0], [x2], #8 445 ld1 {v0.s}[0], [x0] 446 subs w4, w4, #2 447 ld1 {v0.s}[1], [x8] 448 sub v3.8b, v4.8b, v2.8b 449 umull v5.8h, v1.8b, v2.8b 450 umlal v5.8h, v0.8b, v3.8b 451 rshrn v6.8b, v5.8h, #6 452 st1 {v6.s}[0], [x0], x1 453 st1 {v6.s}[1], [x8], x1 454 b.gt 4b 455 ret 4568: 457 AARCH64_VALID_JUMP_TARGET 458 ld1 {v2.16b}, [x5], #16 459 ld1 {v1.16b}, [x2], #16 460 ld1 {v0.d}[0], [x0] 461 ld1 {v0.d}[1], [x8] 462 sub v3.16b, v4.16b, v2.16b 463 subs w4, w4, #2 464 umull v5.8h, v1.8b, v2.8b 465 umlal v5.8h, v0.8b, v3.8b 466 umull2 v6.8h, v1.16b, v2.16b 467 umlal2 v6.8h, v0.16b, v3.16b 468 rshrn v7.8b, v5.8h, #6 469 rshrn2 v7.16b, v6.8h, #6 470 st1 {v7.d}[0], [x0], x1 471 st1 {v7.d}[1], [x8], x1 472 b.gt 8b 473 ret 47416: 475 AARCH64_VALID_JUMP_TARGET 476 ld1 {v1.16b, v2.16b}, [x5], #32 477 ld1 {v5.16b, v6.16b}, [x2], #32 478 ld1 {v0.16b}, [x0] 479 subs w4, w4, #2 480 sub v7.16b, v4.16b, v1.16b 481 sub v20.16b, v4.16b, v2.16b 482 ld1 {v3.16b}, [x8] 483 umull v16.8h, v5.8b, v1.8b 484 umlal v16.8h, v0.8b, v7.8b 485 umull2 v17.8h, v5.16b, v1.16b 486 umlal2 v17.8h, v0.16b, v7.16b 487 umull v21.8h, v6.8b, v2.8b 488 umlal v21.8h, v3.8b, v20.8b 489 umull2 v22.8h, v6.16b, v2.16b 490 umlal2 v22.8h, v3.16b, v20.16b 491 rshrn v18.8b, v16.8h, #6 492 rshrn2 v18.16b, v17.8h, #6 493 rshrn v19.8b, v21.8h, #6 494 rshrn2 v19.16b, v22.8h, #6 495 st1 {v18.16b}, [x0], x1 496 st1 {v19.16b}, [x8], x1 497 b.gt 16b 498 ret 49932: 500 AARCH64_VALID_JUMP_TARGET 501 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], #64 502 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64 503 ld1 {v20.16b, v21.16b}, [x0] 504 subs w4, w4, #2 505 ld1 {v22.16b, v23.16b}, [x8] 506 sub v5.16b, v4.16b, v0.16b 507 sub v6.16b, v4.16b, v1.16b 508 sub v30.16b, v4.16b, v2.16b 509 sub v31.16b, v4.16b, v3.16b 510 umull v24.8h, v16.8b, v0.8b 511 umlal v24.8h, v20.8b, v5.8b 512 umull2 v26.8h, v16.16b, v0.16b 513 umlal2 v26.8h, v20.16b, v5.16b 514 umull v28.8h, v17.8b, v1.8b 515 umlal v28.8h, v21.8b, v6.8b 516 umull2 v7.8h, v17.16b, v1.16b 517 umlal2 v7.8h, v21.16b, v6.16b 518 umull v27.8h, v18.8b, v2.8b 519 umlal v27.8h, v22.8b, v30.8b 520 umull2 v1.8h, v18.16b, v2.16b 521 umlal2 v1.8h, v22.16b, v30.16b 522 umull v29.8h, v19.8b, v3.8b 523 umlal v29.8h, v23.8b, v31.8b 524 umull2 v21.8h, v19.16b, v3.16b 525 umlal2 v21.8h, v23.16b, v31.16b 526 rshrn v24.8b, v24.8h, #6 527 rshrn2 v24.16b, v26.8h, #6 528 rshrn v25.8b, v28.8h, #6 529 rshrn2 v25.16b, v7.8h, #6 530 rshrn v27.8b, v27.8h, #6 531 rshrn2 v27.16b, v1.8h, #6 532 rshrn v28.8b, v29.8h, #6 533 rshrn2 v28.16b, v21.8h, #6 534 st1 {v24.16b, v25.16b}, [x0], x1 535 st1 {v27.16b, v28.16b}, [x8], x1 536 b.gt 32b 537 ret 538L(blend_tbl): 539 .hword L(blend_tbl) - 32b 540 .hword L(blend_tbl) - 16b 541 .hword L(blend_tbl) - 8b 542 .hword L(blend_tbl) - 4b 543endfunc 544 545function blend_h_8bpc_neon, export=1 546 adr x6, L(blend_h_tbl) 547 movrel x5, X(obmc_masks) 548 add x5, x5, w4, uxtw 549 sub w4, w4, w4, lsr #2 550 clz w7, w3 551 movi v4.16b, #64 552 add x8, x0, x1 553 lsl x1, x1, #1 554 sub w7, w7, #24 555 ldrh w7, [x6, x7, lsl #1] 556 sub x6, x6, w7, uxtw 557 br x6 5582: 559 AARCH64_VALID_JUMP_TARGET 560 ld1 {v0.h}[0], [x5], #2 561 ld1 {v1.s}[0], [x2], #4 562 subs w4, w4, #2 563 ld1 {v2.h}[0], [x0] 564 zip1 v0.8b, v0.8b, v0.8b 565 sub v3.8b, v4.8b, v0.8b 566 ld1 {v2.h}[1], [x8] 567 umull v5.8h, v1.8b, v0.8b 568 umlal v5.8h, v2.8b, v3.8b 569 rshrn v5.8b, v5.8h, #6 570 st1 {v5.h}[0], [x0], x1 571 st1 {v5.h}[1], [x8], x1 572 b.gt 2b 573 ret 5744: 575 AARCH64_VALID_JUMP_TARGET 576 ld2r {v0.8b, v1.8b}, [x5], #2 577 ld1 {v2.8b}, [x2], #8 578 subs w4, w4, #2 579 ext v0.8b, v0.8b, v1.8b, #4 580 ld1 {v3.s}[0], [x0] 581 sub v5.8b, v4.8b, v0.8b 582 ld1 {v3.s}[1], [x8] 583 umull v6.8h, v2.8b, v0.8b 584 umlal v6.8h, v3.8b, v5.8b 585 rshrn v6.8b, v6.8h, #6 586 st1 {v6.s}[0], [x0], x1 587 st1 {v6.s}[1], [x8], x1 588 b.gt 4b 589 ret 5908: 591 AARCH64_VALID_JUMP_TARGET 592 ld2r {v0.16b, v1.16b}, [x5], #2 593 ld1 {v2.16b}, [x2], #16 594 ld1 {v3.d}[0], [x0] 595 ext v0.16b, v0.16b, v1.16b, #8 596 sub v5.16b, v4.16b, v0.16b 597 ld1 {v3.d}[1], [x8] 598 subs w4, w4, #2 599 umull v6.8h, v0.8b, v2.8b 600 umlal v6.8h, v3.8b, v5.8b 601 umull2 v7.8h, v0.16b, v2.16b 602 umlal2 v7.8h, v3.16b, v5.16b 603 rshrn v16.8b, v6.8h, #6 604 rshrn2 v16.16b, v7.8h, #6 605 st1 {v16.d}[0], [x0], x1 606 st1 {v16.d}[1], [x8], x1 607 b.gt 8b 608 ret 60916: 610 AARCH64_VALID_JUMP_TARGET 611 ld2r {v0.16b, v1.16b}, [x5], #2 612 ld1 {v2.16b, v3.16b}, [x2], #32 613 ld1 {v5.16b}, [x0] 614 sub v7.16b, v4.16b, v0.16b 615 sub v16.16b, v4.16b, v1.16b 616 ld1 {v6.16b}, [x8] 617 subs w4, w4, #2 618 umull v17.8h, v0.8b, v2.8b 619 umlal v17.8h, v5.8b, v7.8b 620 umull2 v18.8h, v0.16b, v2.16b 621 umlal2 v18.8h, v5.16b, v7.16b 622 umull v19.8h, v1.8b, v3.8b 623 umlal v19.8h, v6.8b, v16.8b 624 umull2 v20.8h, v1.16b, v3.16b 625 umlal2 v20.8h, v6.16b, v16.16b 626 rshrn v21.8b, v17.8h, #6 627 rshrn2 v21.16b, v18.8h, #6 628 rshrn v22.8b, v19.8h, #6 629 rshrn2 v22.16b, v20.8h, #6 630 st1 {v21.16b}, [x0], x1 631 st1 {v22.16b}, [x8], x1 632 b.gt 16b 633 ret 6341280: 635640: 636320: 637 AARCH64_VALID_JUMP_TARGET 638 sub x1, x1, w3, uxtw 639 add x7, x2, w3, uxtw 640321: 641 ld2r {v0.16b, v1.16b}, [x5], #2 642 mov w6, w3 643 sub v20.16b, v4.16b, v0.16b 644 sub v21.16b, v4.16b, v1.16b 64532: 646 ld1 {v16.16b, v17.16b}, [x2], #32 647 ld1 {v2.16b, v3.16b}, [x0] 648 subs w6, w6, #32 649 umull v23.8h, v0.8b, v16.8b 650 umlal v23.8h, v2.8b, v20.8b 651 ld1 {v18.16b, v19.16b}, [x7], #32 652 umull2 v27.8h, v0.16b, v16.16b 653 umlal2 v27.8h, v2.16b, v20.16b 654 ld1 {v6.16b, v7.16b}, [x8] 655 umull v24.8h, v0.8b, v17.8b 656 umlal v24.8h, v3.8b, v20.8b 657 umull2 v28.8h, v0.16b, v17.16b 658 umlal2 v28.8h, v3.16b, v20.16b 659 umull v25.8h, v1.8b, v18.8b 660 umlal v25.8h, v6.8b, v21.8b 661 umull2 v5.8h, v1.16b, v18.16b 662 umlal2 v5.8h, v6.16b, v21.16b 663 rshrn v29.8b, v23.8h, #6 664 rshrn2 v29.16b, v27.8h, #6 665 umull v26.8h, v1.8b, v19.8b 666 umlal v26.8h, v7.8b, v21.8b 667 umull2 v31.8h, v1.16b, v19.16b 668 umlal2 v31.8h, v7.16b, v21.16b 669 rshrn v30.8b, v24.8h, #6 670 rshrn2 v30.16b, v28.8h, #6 671 rshrn v23.8b, v25.8h, #6 672 rshrn2 v23.16b, v5.8h, #6 673 rshrn v24.8b, v26.8h, #6 674 st1 {v29.16b, v30.16b}, [x0], #32 675 rshrn2 v24.16b, v31.8h, #6 676 st1 {v23.16b, v24.16b}, [x8], #32 677 b.gt 32b 678 subs w4, w4, #2 679 add x0, x0, x1 680 add x8, x8, x1 681 add x2, x2, w3, uxtw 682 add x7, x7, w3, uxtw 683 b.gt 321b 684 ret 685L(blend_h_tbl): 686 .hword L(blend_h_tbl) - 1280b 687 .hword L(blend_h_tbl) - 640b 688 .hword L(blend_h_tbl) - 320b 689 .hword L(blend_h_tbl) - 16b 690 .hword L(blend_h_tbl) - 8b 691 .hword L(blend_h_tbl) - 4b 692 .hword L(blend_h_tbl) - 2b 693endfunc 694 695function blend_v_8bpc_neon, export=1 696 adr x6, L(blend_v_tbl) 697 movrel x5, X(obmc_masks) 698 add x5, x5, w3, uxtw 699 clz w3, w3 700 movi v4.16b, #64 701 add x8, x0, x1 702 lsl x1, x1, #1 703 sub w3, w3, #26 704 ldrh w3, [x6, x3, lsl #1] 705 sub x6, x6, w3, uxtw 706 br x6 70720: 708 AARCH64_VALID_JUMP_TARGET 709 ld1r {v0.8b}, [x5] 710 sub v1.8b, v4.8b, v0.8b 7112: 712 ld1 {v2.h}[0], [x2], #2 713 ld1 {v3.b}[0], [x0] 714 subs w4, w4, #2 715 ld1 {v2.b}[1], [x2] 716 ld1 {v3.b}[1], [x8] 717 umull v5.8h, v2.8b, v0.8b 718 umlal v5.8h, v3.8b, v1.8b 719 rshrn v5.8b, v5.8h, #6 720 add x2, x2, #2 721 st1 {v5.b}[0], [x0], x1 722 st1 {v5.b}[1], [x8], x1 723 b.gt 2b 724 ret 72540: 726 AARCH64_VALID_JUMP_TARGET 727 ld1r {v0.2s}, [x5] 728 sub x1, x1, #2 729 sub v1.8b, v4.8b, v0.8b 7304: 731 ld1 {v2.8b}, [x2], #8 732 ld1 {v3.s}[0], [x0] 733 ld1 {v3.s}[1], [x8] 734 subs w4, w4, #2 735 umull v5.8h, v2.8b, v0.8b 736 umlal v5.8h, v3.8b, v1.8b 737 rshrn v5.8b, v5.8h, #6 738 st1 {v5.h}[0], [x0], #2 739 st1 {v5.h}[2], [x8], #2 740 st1 {v5.b}[2], [x0], x1 741 st1 {v5.b}[6], [x8], x1 742 b.gt 4b 743 ret 74480: 745 AARCH64_VALID_JUMP_TARGET 746 ld1r {v0.2d}, [x5] 747 sub x1, x1, #4 748 sub v1.16b, v4.16b, v0.16b 7498: 750 ld1 {v2.16b}, [x2], #16 751 ld1 {v3.d}[0], [x0] 752 ld1 {v3.d}[1], [x8] 753 subs w4, w4, #2 754 umull v5.8h, v0.8b, v2.8b 755 umlal v5.8h, v3.8b, v1.8b 756 umull2 v6.8h, v0.16b, v2.16b 757 umlal2 v6.8h, v3.16b, v1.16b 758 rshrn v7.8b, v5.8h, #6 759 rshrn2 v7.16b, v6.8h, #6 760 st1 {v7.s}[0], [x0], #4 761 st1 {v7.s}[2], [x8], #4 762 st1 {v7.h}[2], [x0], x1 763 st1 {v7.h}[6], [x8], x1 764 b.gt 8b 765 ret 766160: 767 AARCH64_VALID_JUMP_TARGET 768 ld1 {v0.16b}, [x5] 769 sub x1, x1, #8 770 sub v2.16b, v4.16b, v0.16b 77116: 772 ld1 {v5.16b, v6.16b}, [x2], #32 773 ld1 {v7.16b}, [x0] 774 subs w4, w4, #2 775 ld1 {v16.16b}, [x8] 776 umull v17.8h, v5.8b, v0.8b 777 umlal v17.8h, v7.8b, v2.8b 778 umull2 v18.8h, v5.16b, v0.16b 779 umlal2 v18.8h, v7.16b, v2.16b 780 umull v20.8h, v6.8b, v0.8b 781 umlal v20.8h, v16.8b, v2.8b 782 umull2 v21.8h, v6.16b, v0.16b 783 umlal2 v21.8h, v16.16b, v2.16b 784 rshrn v19.8b, v17.8h, #6 785 rshrn2 v19.16b, v18.8h, #6 786 rshrn v22.8b, v20.8h, #6 787 rshrn2 v22.16b, v21.8h, #6 788 st1 {v19.8b}, [x0], #8 789 st1 {v22.8b}, [x8], #8 790 st1 {v19.s}[2], [x0], x1 791 st1 {v22.s}[2], [x8], x1 792 b.gt 16b 793 ret 794320: 795 AARCH64_VALID_JUMP_TARGET 796 ld1 {v0.16b, v1.16b}, [x5] 797 sub x1, x1, #16 798 sub v2.16b, v4.16b, v0.16b 799 sub v3.8b, v4.8b, v1.8b 80032: 801 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64 802 ld1 {v5.16b, v6.16b}, [x0] 803 subs w4, w4, #2 804 ld1 {v20.16b, v21.16b}, [x8] 805 umull v22.8h, v16.8b, v0.8b 806 umlal v22.8h, v5.8b, v2.8b 807 umull2 v23.8h, v16.16b, v0.16b 808 umlal2 v23.8h, v5.16b, v2.16b 809 umull v28.8h, v17.8b, v1.8b 810 umlal v28.8h, v6.8b, v3.8b 811 umull v30.8h, v18.8b, v0.8b 812 umlal v30.8h, v20.8b, v2.8b 813 umull2 v31.8h, v18.16b, v0.16b 814 umlal2 v31.8h, v20.16b, v2.16b 815 umull v25.8h, v19.8b, v1.8b 816 umlal v25.8h, v21.8b, v3.8b 817 rshrn v24.8b, v22.8h, #6 818 rshrn2 v24.16b, v23.8h, #6 819 rshrn v28.8b, v28.8h, #6 820 rshrn v30.8b, v30.8h, #6 821 rshrn2 v30.16b, v31.8h, #6 822 rshrn v27.8b, v25.8h, #6 823 st1 {v24.16b}, [x0], #16 824 st1 {v30.16b}, [x8], #16 825 st1 {v28.8b}, [x0], x1 826 st1 {v27.8b}, [x8], x1 827 b.gt 32b 828 ret 829L(blend_v_tbl): 830 .hword L(blend_v_tbl) - 320b 831 .hword L(blend_v_tbl) - 160b 832 .hword L(blend_v_tbl) - 80b 833 .hword L(blend_v_tbl) - 40b 834 .hword L(blend_v_tbl) - 20b 835endfunc 836 837 838// This has got the same signature as the put_8tap functions, 839// and assumes that x8 is set to (clz(w)-24). 840function put_neon 841 adr x9, L(put_tbl) 842 ldrh w8, [x9, x8, lsl #1] 843 sub x9, x9, w8, uxtw 844 br x9 845 8462: 847 AARCH64_VALID_JUMP_TARGET 848 ld1 {v0.h}[0], [x2], x3 849 ld1 {v1.h}[0], [x2], x3 850 subs w5, w5, #2 851 st1 {v0.h}[0], [x0], x1 852 st1 {v1.h}[0], [x0], x1 853 b.gt 2b 854 ret 8554: 856 AARCH64_VALID_JUMP_TARGET 857 ld1 {v0.s}[0], [x2], x3 858 ld1 {v1.s}[0], [x2], x3 859 subs w5, w5, #2 860 st1 {v0.s}[0], [x0], x1 861 st1 {v1.s}[0], [x0], x1 862 b.gt 4b 863 ret 8648: 865 AARCH64_VALID_JUMP_TARGET 866 ld1 {v0.8b}, [x2], x3 867 ld1 {v1.8b}, [x2], x3 868 subs w5, w5, #2 869 st1 {v0.8b}, [x0], x1 870 st1 {v1.8b}, [x0], x1 871 b.gt 8b 872 ret 873160: 874 AARCH64_VALID_JUMP_TARGET 875 add x8, x0, x1 876 lsl x1, x1, #1 877 add x9, x2, x3 878 lsl x3, x3, #1 87916: 880 ld1 {v0.16b}, [x2], x3 881 ld1 {v1.16b}, [x9], x3 882 subs w5, w5, #2 883 st1 {v0.16b}, [x0], x1 884 st1 {v1.16b}, [x8], x1 885 b.gt 16b 886 ret 88732: 888 AARCH64_VALID_JUMP_TARGET 889 ldp x6, x7, [x2] 890 ldp x8, x9, [x2, #16] 891 stp x6, x7, [x0] 892 subs w5, w5, #1 893 stp x8, x9, [x0, #16] 894 add x2, x2, x3 895 add x0, x0, x1 896 b.gt 32b 897 ret 89864: 899 AARCH64_VALID_JUMP_TARGET 900 ldp x6, x7, [x2] 901 ldp x8, x9, [x2, #16] 902 stp x6, x7, [x0] 903 ldp x10, x11, [x2, #32] 904 stp x8, x9, [x0, #16] 905 subs w5, w5, #1 906 ldp x12, x13, [x2, #48] 907 stp x10, x11, [x0, #32] 908 stp x12, x13, [x0, #48] 909 add x2, x2, x3 910 add x0, x0, x1 911 b.gt 64b 912 ret 913128: 914 AARCH64_VALID_JUMP_TARGET 915 ldp q0, q1, [x2] 916 ldp q2, q3, [x2, #32] 917 stp q0, q1, [x0] 918 ldp q4, q5, [x2, #64] 919 stp q2, q3, [x0, #32] 920 ldp q6, q7, [x2, #96] 921 subs w5, w5, #1 922 stp q4, q5, [x0, #64] 923 stp q6, q7, [x0, #96] 924 add x2, x2, x3 925 add x0, x0, x1 926 b.gt 128b 927 ret 928 929L(put_tbl): 930 .hword L(put_tbl) - 128b 931 .hword L(put_tbl) - 64b 932 .hword L(put_tbl) - 32b 933 .hword L(put_tbl) - 160b 934 .hword L(put_tbl) - 8b 935 .hword L(put_tbl) - 4b 936 .hword L(put_tbl) - 2b 937endfunc 938 939 940// This has got the same signature as the prep_8tap functions, 941// and assumes that x8 is set to (clz(w)-24), and x7 to w*2. 942function prep_neon 943 adr x9, L(prep_tbl) 944 ldrh w8, [x9, x8, lsl #1] 945 sub x9, x9, w8, uxtw 946 br x9 947 9484: 949 AARCH64_VALID_JUMP_TARGET 950 ld1 {v0.s}[0], [x1], x2 951 ld1 {v1.s}[0], [x1], x2 952 subs w4, w4, #2 953 ushll v0.8h, v0.8b, #4 954 ushll v1.8h, v1.8b, #4 955 st1 {v0.4h, v1.4h}, [x0], #16 956 b.gt 4b 957 ret 9588: 959 AARCH64_VALID_JUMP_TARGET 960 ld1 {v0.8b}, [x1], x2 961 ld1 {v1.8b}, [x1], x2 962 subs w4, w4, #2 963 ushll v0.8h, v0.8b, #4 964 ushll v1.8h, v1.8b, #4 965 st1 {v0.8h, v1.8h}, [x0], #32 966 b.gt 8b 967 ret 968160: 969 AARCH64_VALID_JUMP_TARGET 970 add x9, x1, x2 971 lsl x2, x2, #1 97216: 973 ld1 {v0.16b}, [x1], x2 974 ld1 {v1.16b}, [x9], x2 975 subs w4, w4, #2 976 ushll v4.8h, v0.8b, #4 977 ushll2 v5.8h, v0.16b, #4 978 ushll v6.8h, v1.8b, #4 979 ushll2 v7.8h, v1.16b, #4 980 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 981 b.gt 16b 982 ret 983320: 984 AARCH64_VALID_JUMP_TARGET 985 add x8, x0, w3, uxtw 98632: 987 ld1 {v0.16b, v1.16b}, [x1], x2 988 subs w4, w4, #2 989 ushll v4.8h, v0.8b, #4 990 ushll2 v5.8h, v0.16b, #4 991 ld1 {v2.16b, v3.16b}, [x1], x2 992 ushll v6.8h, v1.8b, #4 993 ushll2 v7.8h, v1.16b, #4 994 ushll v16.8h, v2.8b, #4 995 st1 {v4.8h, v5.8h}, [x0], x7 996 ushll2 v17.8h, v2.16b, #4 997 st1 {v6.8h, v7.8h}, [x8], x7 998 ushll v18.8h, v3.8b, #4 999 st1 {v16.8h, v17.8h}, [x0], x7 1000 ushll2 v19.8h, v3.16b, #4 1001 st1 {v18.8h, v19.8h}, [x8], x7 1002 b.gt 32b 1003 ret 1004640: 1005 AARCH64_VALID_JUMP_TARGET 1006 add x8, x0, #32 1007 mov x6, #64 100864: 1009 ldp q0, q1, [x1] 1010 subs w4, w4, #1 1011 ushll v4.8h, v0.8b, #4 1012 ushll2 v5.8h, v0.16b, #4 1013 ldp q2, q3, [x1, #32] 1014 ushll v6.8h, v1.8b, #4 1015 ushll2 v7.8h, v1.16b, #4 1016 add x1, x1, x2 1017 ushll v16.8h, v2.8b, #4 1018 st1 {v4.8h, v5.8h}, [x0], x6 1019 ushll2 v17.8h, v2.16b, #4 1020 ushll v18.8h, v3.8b, #4 1021 st1 {v6.8h, v7.8h}, [x8], x6 1022 ushll2 v19.8h, v3.16b, #4 1023 st1 {v16.8h, v17.8h}, [x0], x6 1024 st1 {v18.8h, v19.8h}, [x8], x6 1025 b.gt 64b 1026 ret 10271280: 1028 AARCH64_VALID_JUMP_TARGET 1029 add x8, x0, #64 1030 mov x6, #128 1031128: 1032 ldp q0, q1, [x1] 1033 ldp q2, q3, [x1, #32] 1034 ushll v16.8h, v0.8b, #4 1035 ushll2 v17.8h, v0.16b, #4 1036 ushll v18.8h, v1.8b, #4 1037 ushll2 v19.8h, v1.16b, #4 1038 ushll v20.8h, v2.8b, #4 1039 ushll2 v21.8h, v2.16b, #4 1040 ldp q4, q5, [x1, #64] 1041 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x6 1042 ushll v22.8h, v3.8b, #4 1043 ushll2 v23.8h, v3.16b, #4 1044 ushll v24.8h, v4.8b, #4 1045 ushll2 v25.8h, v4.16b, #4 1046 ushll v26.8h, v5.8b, #4 1047 ushll2 v27.8h, v5.16b, #4 1048 ldp q6, q7, [x1, #96] 1049 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x8], x6 1050 ushll v28.8h, v6.8b, #4 1051 ushll2 v29.8h, v6.16b, #4 1052 ushll v30.8h, v7.8b, #4 1053 ushll2 v31.8h, v7.16b, #4 1054 subs w4, w4, #1 1055 add x1, x1, x2 1056 st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x6 1057 st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x6 1058 b.gt 128b 1059 ret 1060 1061L(prep_tbl): 1062 .hword L(prep_tbl) - 1280b 1063 .hword L(prep_tbl) - 640b 1064 .hword L(prep_tbl) - 320b 1065 .hword L(prep_tbl) - 160b 1066 .hword L(prep_tbl) - 8b 1067 .hword L(prep_tbl) - 4b 1068endfunc 1069 1070 1071.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 1072 ld1 {\d0\wd}[0], [\s0], \strd 1073 ld1 {\d1\wd}[0], [\s1], \strd 1074.ifnb \d2 1075 ld1 {\d2\wd}[0], [\s0], \strd 1076 ld1 {\d3\wd}[0], [\s1], \strd 1077.endif 1078.ifnb \d4 1079 ld1 {\d4\wd}[0], [\s0], \strd 1080.endif 1081.ifnb \d5 1082 ld1 {\d5\wd}[0], [\s1], \strd 1083.endif 1084.ifnb \d6 1085 ld1 {\d6\wd}[0], [\s0], \strd 1086.endif 1087.endm 1088.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 1089 ld1 {\d0\wd}, [\s0], \strd 1090 ld1 {\d1\wd}, [\s1], \strd 1091.ifnb \d2 1092 ld1 {\d2\wd}, [\s0], \strd 1093 ld1 {\d3\wd}, [\s1], \strd 1094.endif 1095.ifnb \d4 1096 ld1 {\d4\wd}, [\s0], \strd 1097.endif 1098.ifnb \d5 1099 ld1 {\d5\wd}, [\s1], \strd 1100.endif 1101.ifnb \d6 1102 ld1 {\d6\wd}, [\s0], \strd 1103.endif 1104.endm 1105.macro load_h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 1106 load_slice \s0, \s1, \strd, .h, \d0, \d1, \d2, \d3, \d4, \d5, \d6 1107.endm 1108.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 1109 load_slice \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6 1110.endm 1111.macro load_8b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 1112 load_reg \s0, \s1, \strd, .8b, \d0, \d1, \d2, \d3, \d4, \d5, \d6 1113.endm 1114.macro load_16b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 1115 load_reg \s0, \s1, \strd, .16b, \d0, \d1, \d2, \d3, \d4, \d5, \d6 1116.endm 1117.macro interleave_1 wd, r0, r1, r2, r3, r4 1118 trn1 \r0\wd, \r0\wd, \r1\wd 1119 trn1 \r1\wd, \r1\wd, \r2\wd 1120.ifnb \r3 1121 trn1 \r2\wd, \r2\wd, \r3\wd 1122 trn1 \r3\wd, \r3\wd, \r4\wd 1123.endif 1124.endm 1125.macro interleave_1_h r0, r1, r2, r3, r4 1126 interleave_1 .4h, \r0, \r1, \r2, \r3, \r4 1127.endm 1128.macro interleave_1_s r0, r1, r2, r3, r4 1129 interleave_1 .2s, \r0, \r1, \r2, \r3, \r4 1130.endm 1131.macro interleave_2 wd, r0, r1, r2, r3, r4, r5 1132 trn1 \r0\wd, \r0\wd, \r2\wd 1133 trn1 \r1\wd, \r1\wd, \r3\wd 1134 trn1 \r2\wd, \r2\wd, \r4\wd 1135 trn1 \r3\wd, \r3\wd, \r5\wd 1136.endm 1137.macro interleave_2_s r0, r1, r2, r3, r4, r5 1138 interleave_2 .2s, \r0, \r1, \r2, \r3, \r4, \r5 1139.endm 1140.macro uxtl_b r0, r1, r2, r3, r4, r5, r6 1141 uxtl \r0\().8h, \r0\().8b 1142 uxtl \r1\().8h, \r1\().8b 1143.ifnb \r2 1144 uxtl \r2\().8h, \r2\().8b 1145 uxtl \r3\().8h, \r3\().8b 1146.endif 1147.ifnb \r4 1148 uxtl \r4\().8h, \r4\().8b 1149.endif 1150.ifnb \r5 1151 uxtl \r5\().8h, \r5\().8b 1152.endif 1153.ifnb \r6 1154 uxtl \r6\().8h, \r6\().8b 1155.endif 1156.endm 1157.macro mul_mla_4 d, s0, s1, s2, s3, wd 1158 mul \d\wd, \s0\wd, v0.h[0] 1159 mla \d\wd, \s1\wd, v0.h[1] 1160 mla \d\wd, \s2\wd, v0.h[2] 1161 mla \d\wd, \s3\wd, v0.h[3] 1162.endm 1163// Interleaving the mul/mla chains actually hurts performance 1164// significantly on Cortex A53, thus keeping mul/mla tightly 1165// chained like this. 1166.macro mul_mla_8_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7 1167 mul \d0\().4h, \s0\().4h, v0.h[0] 1168 mla \d0\().4h, \s1\().4h, v0.h[1] 1169 mla \d0\().4h, \s2\().4h, v0.h[2] 1170 mla \d0\().4h, \s3\().4h, v0.h[3] 1171 mla \d0\().4h, \s4\().4h, v0.h[4] 1172 mla \d0\().4h, \s5\().4h, v0.h[5] 1173 mla \d0\().4h, \s6\().4h, v0.h[6] 1174 mla \d0\().4h, \s7\().4h, v0.h[7] 1175.endm 1176.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7 1177 mul \d0\().8h, \s0\().8h, v0.h[0] 1178 mla \d0\().8h, \s1\().8h, v0.h[1] 1179 mla \d0\().8h, \s2\().8h, v0.h[2] 1180 mla \d0\().8h, \s3\().8h, v0.h[3] 1181 mla \d0\().8h, \s4\().8h, v0.h[4] 1182 mla \d0\().8h, \s5\().8h, v0.h[5] 1183 mla \d0\().8h, \s6\().8h, v0.h[6] 1184 mla \d0\().8h, \s7\().8h, v0.h[7] 1185.endm 1186.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 1187 mul \d0\().8h, \s0\().8h, v0.h[0] 1188 mla \d0\().8h, \s1\().8h, v0.h[1] 1189 mla \d0\().8h, \s2\().8h, v0.h[2] 1190 mla \d0\().8h, \s3\().8h, v0.h[3] 1191 mla \d0\().8h, \s4\().8h, v0.h[4] 1192 mla \d0\().8h, \s5\().8h, v0.h[5] 1193 mla \d0\().8h, \s6\().8h, v0.h[6] 1194 mla \d0\().8h, \s7\().8h, v0.h[7] 1195 mul \d1\().8h, \s1\().8h, v0.h[0] 1196 mla \d1\().8h, \s2\().8h, v0.h[1] 1197 mla \d1\().8h, \s3\().8h, v0.h[2] 1198 mla \d1\().8h, \s4\().8h, v0.h[3] 1199 mla \d1\().8h, \s5\().8h, v0.h[4] 1200 mla \d1\().8h, \s6\().8h, v0.h[5] 1201 mla \d1\().8h, \s7\().8h, v0.h[6] 1202 mla \d1\().8h, \s8\().8h, v0.h[7] 1203.endm 1204.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 1205 mul \d0\().8h, \s0\().8h, v0.h[0] 1206 mla \d0\().8h, \s1\().8h, v0.h[1] 1207 mla \d0\().8h, \s2\().8h, v0.h[2] 1208 mla \d0\().8h, \s3\().8h, v0.h[3] 1209 mla \d0\().8h, \s4\().8h, v0.h[4] 1210 mla \d0\().8h, \s5\().8h, v0.h[5] 1211 mla \d0\().8h, \s6\().8h, v0.h[6] 1212 mla \d0\().8h, \s7\().8h, v0.h[7] 1213 mul \d1\().8h, \s2\().8h, v0.h[0] 1214 mla \d1\().8h, \s3\().8h, v0.h[1] 1215 mla \d1\().8h, \s4\().8h, v0.h[2] 1216 mla \d1\().8h, \s5\().8h, v0.h[3] 1217 mla \d1\().8h, \s6\().8h, v0.h[4] 1218 mla \d1\().8h, \s7\().8h, v0.h[5] 1219 mla \d1\().8h, \s8\().8h, v0.h[6] 1220 mla \d1\().8h, \s9\().8h, v0.h[7] 1221.endm 1222.macro sqrshrun_b shift, r0, r1, r2, r3 1223 sqrshrun \r0\().8b, \r0\().8h, #\shift 1224.ifnb \r1 1225 sqrshrun \r1\().8b, \r1\().8h, #\shift 1226.endif 1227.ifnb \r2 1228 sqrshrun \r2\().8b, \r2\().8h, #\shift 1229 sqrshrun \r3\().8b, \r3\().8h, #\shift 1230.endif 1231.endm 1232.macro srshr_h shift, r0, r1, r2, r3 1233 srshr \r0\().8h, \r0\().8h, #\shift 1234.ifnb \r1 1235 srshr \r1\().8h, \r1\().8h, #\shift 1236.endif 1237.ifnb \r2 1238 srshr \r2\().8h, \r2\().8h, #\shift 1239 srshr \r3\().8h, \r3\().8h, #\shift 1240.endif 1241.endm 1242.macro st_h strd, reg, lanes 1243 st1 {\reg\().h}[0], [x0], \strd 1244 st1 {\reg\().h}[1], [x8], \strd 1245.if \lanes > 2 1246 st1 {\reg\().h}[2], [x0], \strd 1247 st1 {\reg\().h}[3], [x8], \strd 1248.endif 1249.endm 1250.macro st_s strd, r0, r1 1251 st1 {\r0\().s}[0], [x0], \strd 1252 st1 {\r0\().s}[1], [x8], \strd 1253.ifnb \r1 1254 st1 {\r1\().s}[0], [x0], \strd 1255 st1 {\r1\().s}[1], [x8], \strd 1256.endif 1257.endm 1258.macro st_d strd, r0, r1 1259 st1 {\r0\().d}[0], [x0], \strd 1260 st1 {\r0\().d}[1], [x8], \strd 1261.ifnb \r1 1262 st1 {\r1\().d}[0], [x0], \strd 1263 st1 {\r1\().d}[1], [x8], \strd 1264.endif 1265.endm 1266.macro shift_store_4 type, strd, r0, r1 1267.ifc \type, put 1268 sqrshrun_b 6, \r0, \r1 1269 st_s \strd, \r0, \r1 1270.else 1271 srshr_h 2, \r0, \r1 1272 st_d \strd, \r0, \r1 1273.endif 1274.endm 1275.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7 1276 st1 {\r0\wd}, [x0], \strd 1277 st1 {\r1\wd}, [x8], \strd 1278.ifnb \r2 1279 st1 {\r2\wd}, [x0], \strd 1280 st1 {\r3\wd}, [x8], \strd 1281.endif 1282.ifnb \r4 1283 st1 {\r4\wd}, [x0], \strd 1284 st1 {\r5\wd}, [x8], \strd 1285 st1 {\r6\wd}, [x0], \strd 1286 st1 {\r7\wd}, [x8], \strd 1287.endif 1288.endm 1289.macro st_8b strd, r0, r1, r2, r3, r4, r5, r6, r7 1290 st_reg \strd, .8b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 1291.endm 1292.macro st_16b strd, r0, r1, r2, r3, r4, r5, r6, r7 1293 st_reg \strd, .16b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 1294.endm 1295.macro shift_store_8 type, strd, r0, r1, r2, r3 1296.ifc \type, put 1297 sqrshrun_b 6, \r0, \r1, \r2, \r3 1298 st_8b \strd, \r0, \r1, \r2, \r3 1299.else 1300 srshr_h 2, \r0, \r1, \r2, \r3 1301 st_16b \strd, \r0, \r1, \r2, \r3 1302.endif 1303.endm 1304.macro shift_store_16 type, strd, r0, r1, r2, r3 1305.ifc \type, put 1306 sqrshrun \r0\().8b, \r0\().8h, #6 1307 sqrshrun2 \r0\().16b, \r1\().8h, #6 1308 sqrshrun \r2\().8b, \r2\().8h, #6 1309 sqrshrun2 \r2\().16b, \r3\().8h, #6 1310 st_16b \strd, \r0, \r2 1311.else 1312 srshr_h 2, \r0, \r1, \r2, \r3 1313 st1 {\r0\().8h, \r1\().8h}, [x0], \strd 1314 st1 {\r2\().8h, \r3\().8h}, [x8], \strd 1315.endif 1316.endm 1317 1318.macro make_8tap_fn op, type, type_h, type_v 1319function \op\()_8tap_\type\()_8bpc_neon, export=1 1320 mov x8, \type_h 1321 mov x9, \type_v 1322 b \op\()_8tap_neon 1323endfunc 1324.endm 1325 1326// No spaces in these expressions, due to gas-preprocessor. 1327#define REGULAR ((0*15<<7)|3*15) 1328#define SMOOTH ((1*15<<7)|4*15) 1329#define SHARP ((2*15<<7)|3*15) 1330 1331.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv 1332make_8tap_fn \type, regular, REGULAR, REGULAR 1333make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH 1334make_8tap_fn \type, regular_sharp, REGULAR, SHARP 1335make_8tap_fn \type, smooth, SMOOTH, SMOOTH 1336make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR 1337make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP 1338make_8tap_fn \type, sharp, SHARP, SHARP 1339make_8tap_fn \type, sharp_regular, SHARP, REGULAR 1340make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH 1341 1342function \type\()_8tap_neon 1343 mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) 1344 mul \mx, \mx, w10 1345 mul \my, \my, w10 1346 add \mx, \mx, w8 // mx, 8tap_h, 4tap_h 1347 add \my, \my, w9 // my, 8tap_v, 4tap_v 1348.ifc \type, prep 1349 uxtw \d_strd, \w 1350 lsl \d_strd, \d_strd, #1 1351.endif 1352 1353 clz w8, \w 1354 tst \mx, #(0x7f << 14) 1355 sub w8, w8, #24 1356 movrel x10, X(mc_subpel_filters), -8 1357 b.ne L(\type\()_8tap_h) 1358 tst \my, #(0x7f << 14) 1359 b.ne L(\type\()_8tap_v) 1360 b \type\()_neon 1361 1362L(\type\()_8tap_h): 1363 cmp \w, #4 1364 ubfx w9, \mx, #7, #7 1365 and \mx, \mx, #0x7f 1366 b.le 4f 1367 mov \mx, w9 13684: 1369 tst \my, #(0x7f << 14) 1370 add \xmx, x10, \mx, uxtw #3 1371 b.ne L(\type\()_8tap_hv) 1372 1373 adr x9, L(\type\()_8tap_h_tbl) 1374 ldrh w8, [x9, x8, lsl #1] 1375 sub x9, x9, w8, uxtw 1376 br x9 1377 137820: // 2xN h 1379 AARCH64_VALID_JUMP_TARGET 1380.ifc \type, put 1381 add \xmx, \xmx, #2 1382 ld1 {v0.s}[0], [\xmx] 1383 sub \src, \src, #1 1384 add \ds2, \dst, \d_strd 1385 add \sr2, \src, \s_strd 1386 lsl \d_strd, \d_strd, #1 1387 lsl \s_strd, \s_strd, #1 1388 sxtl v0.8h, v0.8b 13892: 1390 ld1 {v4.8b}, [\src], \s_strd 1391 ld1 {v6.8b}, [\sr2], \s_strd 1392 uxtl v4.8h, v4.8b 1393 uxtl v6.8h, v6.8b 1394 ext v5.16b, v4.16b, v4.16b, #2 1395 ext v7.16b, v6.16b, v6.16b, #2 1396 subs \h, \h, #2 1397 trn1 v3.2s, v4.2s, v6.2s 1398 trn2 v6.2s, v4.2s, v6.2s 1399 trn1 v4.2s, v5.2s, v7.2s 1400 trn2 v7.2s, v5.2s, v7.2s 1401 mul v3.4h, v3.4h, v0.h[0] 1402 mla v3.4h, v4.4h, v0.h[1] 1403 mla v3.4h, v6.4h, v0.h[2] 1404 mla v3.4h, v7.4h, v0.h[3] 1405 srshr v3.4h, v3.4h, #2 1406 sqrshrun v3.8b, v3.8h, #4 1407 st1 {v3.h}[0], [\dst], \d_strd 1408 st1 {v3.h}[1], [\ds2], \d_strd 1409 b.gt 2b 1410 ret 1411.endif 1412 141340: // 4xN h 1414 AARCH64_VALID_JUMP_TARGET 1415 add \xmx, \xmx, #2 1416 ld1 {v0.s}[0], [\xmx] 1417 sub \src, \src, #1 1418 add \ds2, \dst, \d_strd 1419 add \sr2, \src, \s_strd 1420 lsl \d_strd, \d_strd, #1 1421 lsl \s_strd, \s_strd, #1 1422 sxtl v0.8h, v0.8b 14234: 1424 ld1 {v16.8b}, [\src], \s_strd 1425 ld1 {v20.8b}, [\sr2], \s_strd 1426 uxtl v16.8h, v16.8b 1427 uxtl v20.8h, v20.8b 1428 ext v17.16b, v16.16b, v16.16b, #2 1429 ext v18.16b, v16.16b, v16.16b, #4 1430 ext v19.16b, v16.16b, v16.16b, #6 1431 ext v21.16b, v20.16b, v20.16b, #2 1432 ext v22.16b, v20.16b, v20.16b, #4 1433 ext v23.16b, v20.16b, v20.16b, #6 1434 subs \h, \h, #2 1435 mul v16.4h, v16.4h, v0.h[0] 1436 mla v16.4h, v17.4h, v0.h[1] 1437 mla v16.4h, v18.4h, v0.h[2] 1438 mla v16.4h, v19.4h, v0.h[3] 1439 mul v20.4h, v20.4h, v0.h[0] 1440 mla v20.4h, v21.4h, v0.h[1] 1441 mla v20.4h, v22.4h, v0.h[2] 1442 mla v20.4h, v23.4h, v0.h[3] 1443 srshr v16.4h, v16.4h, #2 1444 srshr v20.4h, v20.4h, #2 1445.ifc \type, put 1446 sqrshrun v16.8b, v16.8h, #4 1447 sqrshrun v20.8b, v20.8h, #4 1448 st1 {v16.s}[0], [\dst], \d_strd 1449 st1 {v20.s}[0], [\ds2], \d_strd 1450.else 1451 st1 {v16.4h}, [\dst], \d_strd 1452 st1 {v20.4h}, [\ds2], \d_strd 1453.endif 1454 b.gt 4b 1455 ret 1456 145780: // 8xN h 1458 AARCH64_VALID_JUMP_TARGET 1459 ld1 {v0.8b}, [\xmx] 1460 sub \src, \src, #3 1461 add \ds2, \dst, \d_strd 1462 add \sr2, \src, \s_strd 1463 lsl \d_strd, \d_strd, #1 1464 lsl \s_strd, \s_strd, #1 1465 sxtl v0.8h, v0.8b 14668: 1467 ld1 {v16.8b, v17.8b}, [\src], \s_strd 1468 ld1 {v20.8b, v21.8b}, [\sr2], \s_strd 1469 uxtl v16.8h, v16.8b 1470 uxtl v17.8h, v17.8b 1471 uxtl v20.8h, v20.8b 1472 uxtl v21.8h, v21.8b 1473 1474 mul v18.8h, v16.8h, v0.h[0] 1475 mul v22.8h, v20.8h, v0.h[0] 1476.irpc i, 1234567 1477 ext v19.16b, v16.16b, v17.16b, #(2*\i) 1478 ext v23.16b, v20.16b, v21.16b, #(2*\i) 1479 mla v18.8h, v19.8h, v0.h[\i] 1480 mla v22.8h, v23.8h, v0.h[\i] 1481.endr 1482 subs \h, \h, #2 1483 srshr v18.8h, v18.8h, #2 1484 srshr v22.8h, v22.8h, #2 1485.ifc \type, put 1486 sqrshrun v18.8b, v18.8h, #4 1487 sqrshrun v22.8b, v22.8h, #4 1488 st1 {v18.8b}, [\dst], \d_strd 1489 st1 {v22.8b}, [\ds2], \d_strd 1490.else 1491 st1 {v18.8h}, [\dst], \d_strd 1492 st1 {v22.8h}, [\ds2], \d_strd 1493.endif 1494 b.gt 8b 1495 ret 1496160: 1497320: 1498640: 14991280: // 16xN, 32xN, ... h 1500 AARCH64_VALID_JUMP_TARGET 1501 ld1 {v0.8b}, [\xmx] 1502 sub \src, \src, #3 1503 add \ds2, \dst, \d_strd 1504 add \sr2, \src, \s_strd 1505 lsl \s_strd, \s_strd, #1 1506 sxtl v0.8h, v0.8b 1507 1508 sub \s_strd, \s_strd, \w, uxtw 1509 sub \s_strd, \s_strd, #8 1510.ifc \type, put 1511 lsl \d_strd, \d_strd, #1 1512 sub \d_strd, \d_strd, \w, uxtw 1513.endif 1514161: 1515 ld1 {v16.8b, v17.8b, v18.8b}, [\src], #24 1516 ld1 {v20.8b, v21.8b, v22.8b}, [\sr2], #24 1517 mov \mx, \w 1518 uxtl v16.8h, v16.8b 1519 uxtl v17.8h, v17.8b 1520 uxtl v18.8h, v18.8b 1521 uxtl v20.8h, v20.8b 1522 uxtl v21.8h, v21.8b 1523 uxtl v22.8h, v22.8b 1524 152516: 1526 mul v24.8h, v16.8h, v0.h[0] 1527 mul v25.8h, v17.8h, v0.h[0] 1528 mul v26.8h, v20.8h, v0.h[0] 1529 mul v27.8h, v21.8h, v0.h[0] 1530.irpc i, 1234567 1531 ext v28.16b, v16.16b, v17.16b, #(2*\i) 1532 ext v29.16b, v17.16b, v18.16b, #(2*\i) 1533 ext v30.16b, v20.16b, v21.16b, #(2*\i) 1534 ext v31.16b, v21.16b, v22.16b, #(2*\i) 1535 mla v24.8h, v28.8h, v0.h[\i] 1536 mla v25.8h, v29.8h, v0.h[\i] 1537 mla v26.8h, v30.8h, v0.h[\i] 1538 mla v27.8h, v31.8h, v0.h[\i] 1539.endr 1540 srshr v24.8h, v24.8h, #2 1541 srshr v25.8h, v25.8h, #2 1542 srshr v26.8h, v26.8h, #2 1543 srshr v27.8h, v27.8h, #2 1544 subs \mx, \mx, #16 1545.ifc \type, put 1546 sqrshrun v24.8b, v24.8h, #4 1547 sqrshrun2 v24.16b, v25.8h, #4 1548 sqrshrun v26.8b, v26.8h, #4 1549 sqrshrun2 v26.16b, v27.8h, #4 1550 st1 {v24.16b}, [\dst], #16 1551 st1 {v26.16b}, [\ds2], #16 1552.else 1553 st1 {v24.8h, v25.8h}, [\dst], #32 1554 st1 {v26.8h, v27.8h}, [\ds2], #32 1555.endif 1556 b.le 9f 1557 1558 mov v16.16b, v18.16b 1559 mov v20.16b, v22.16b 1560 ld1 {v17.8b, v18.8b}, [\src], #16 1561 ld1 {v21.8b, v22.8b}, [\sr2], #16 1562 uxtl v17.8h, v17.8b 1563 uxtl v18.8h, v18.8b 1564 uxtl v21.8h, v21.8b 1565 uxtl v22.8h, v22.8b 1566 b 16b 1567 15689: 1569 add \dst, \dst, \d_strd 1570 add \ds2, \ds2, \d_strd 1571 add \src, \src, \s_strd 1572 add \sr2, \sr2, \s_strd 1573 1574 subs \h, \h, #2 1575 b.gt 161b 1576 ret 1577 1578L(\type\()_8tap_h_tbl): 1579 .hword L(\type\()_8tap_h_tbl) - 1280b 1580 .hword L(\type\()_8tap_h_tbl) - 640b 1581 .hword L(\type\()_8tap_h_tbl) - 320b 1582 .hword L(\type\()_8tap_h_tbl) - 160b 1583 .hword L(\type\()_8tap_h_tbl) - 80b 1584 .hword L(\type\()_8tap_h_tbl) - 40b 1585 .hword L(\type\()_8tap_h_tbl) - 20b 1586 .hword 0 1587 1588 1589L(\type\()_8tap_v): 1590 cmp \h, #4 1591 ubfx w9, \my, #7, #7 1592 and \my, \my, #0x7f 1593 b.le 4f 1594 mov \my, w9 15954: 1596 add \xmy, x10, \my, uxtw #3 1597 1598 adr x9, L(\type\()_8tap_v_tbl) 1599 ldrh w8, [x9, x8, lsl #1] 1600 sub x9, x9, w8, uxtw 1601 br x9 1602 160320: // 2xN v 1604 AARCH64_VALID_JUMP_TARGET 1605.ifc \type, put 1606 b.gt 28f 1607 1608 cmp \h, #2 1609 add \xmy, \xmy, #2 1610 ld1 {v0.s}[0], [\xmy] 1611 sub \src, \src, \s_strd 1612 add \ds2, \dst, \d_strd 1613 add \sr2, \src, \s_strd 1614 lsl \s_strd, \s_strd, #1 1615 lsl \d_strd, \d_strd, #1 1616 sxtl v0.8h, v0.8b 1617 1618 // 2x2 v 1619 load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 1620 interleave_1_h v1, v2, v3, v4, v5 1621 b.gt 24f 1622 uxtl_b v1, v2, v3, v4 1623 mul_mla_4 v6, v1, v2, v3, v4, .4h 1624 sqrshrun_b 6, v6 1625 st_h \d_strd, v6, 2 1626 ret 1627 162824: // 2x4 v 1629 load_h \sr2, \src, \s_strd, v6, v7 1630 interleave_1_h v5, v6, v7 1631 interleave_2_s v1, v2, v3, v4, v5, v6 1632 uxtl_b v1, v2, v3, v4 1633 mul_mla_4 v6, v1, v2, v3, v4, .8h 1634 sqrshrun_b 6, v6 1635 st_h \d_strd, v6, 4 1636 ret 1637 163828: // 2x6, 2x8, 2x12, 2x16 v 1639 ld1 {v0.8b}, [\xmy] 1640 sub \sr2, \src, \s_strd, lsl #1 1641 add \ds2, \dst, \d_strd 1642 sub \src, \sr2, \s_strd 1643 lsl \d_strd, \d_strd, #1 1644 lsl \s_strd, \s_strd, #1 1645 sxtl v0.8h, v0.8b 1646 1647 load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7 1648 interleave_1_h v1, v2, v3, v4, v5 1649 interleave_1_h v5, v6, v7 1650 interleave_2_s v1, v2, v3, v4, v5, v6 1651 uxtl_b v1, v2, v3, v4 1652216: 1653 subs \h, \h, #4 1654 load_h \sr2, \src, \s_strd, v16, v17, v18, v19 1655 interleave_1_h v7, v16, v17, v18, v19 1656 interleave_2_s v5, v6, v7, v16, v17, v18 1657 uxtl_b v5, v6, v7, v16 1658 mul_mla_8_0 v30, v1, v2, v3, v4, v5, v6, v7, v16 1659 sqrshrun_b 6, v30 1660 st_h \d_strd, v30, 4 1661 b.le 0f 1662 cmp \h, #2 1663 mov v1.16b, v5.16b 1664 mov v2.16b, v6.16b 1665 mov v3.16b, v7.16b 1666 mov v4.16b, v16.16b 1667 mov v5.16b, v17.16b 1668 mov v6.16b, v18.16b 1669 mov v7.16b, v19.16b 1670 b.eq 26f 1671 b 216b 167226: 1673 load_h \sr2, \src, \s_strd, v16, v17 1674 interleave_1_h v7, v16, v17 1675 uxtl_b v5, v6, v7, v16 1676 mul_mla_8_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16 1677 sqrshrun_b 6, v30 1678 st_h \d_strd, v30, 2 16790: 1680 ret 1681.endif 1682 168340: 1684 AARCH64_VALID_JUMP_TARGET 1685 b.gt 480f 1686 1687 // 4x2, 4x4 v 1688 cmp \h, #2 1689 add \xmy, \xmy, #2 1690 ld1 {v0.s}[0], [\xmy] 1691 sub \src, \src, \s_strd 1692 add \ds2, \dst, \d_strd 1693 add \sr2, \src, \s_strd 1694 lsl \s_strd, \s_strd, #1 1695 lsl \d_strd, \d_strd, #1 1696 sxtl v0.8h, v0.8b 1697 1698 load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5 1699 interleave_1_s v1, v2, v3, v4, v5 1700 uxtl_b v1, v2, v3, v4 1701 mul_mla_4 v6, v1, v2, v3, v4, .8h 1702 shift_store_4 \type, \d_strd, v6 1703 b.le 0f 1704 load_s \sr2, \src, \s_strd, v6, v7 1705 interleave_1_s v5, v6, v7 1706 uxtl_b v5, v6 1707 mul_mla_4 v7, v3, v4, v5, v6, .8h 1708 shift_store_4 \type, \d_strd, v7 17090: 1710 ret 1711 1712480: // 4x6, 4x8, 4x12, 4x16 v 1713 ld1 {v0.8b}, [\xmy] 1714 sub \sr2, \src, \s_strd, lsl #1 1715 add \ds2, \dst, \d_strd 1716 sub \src, \sr2, \s_strd 1717 lsl \s_strd, \s_strd, #1 1718 lsl \d_strd, \d_strd, #1 1719 sxtl v0.8h, v0.8b 1720 1721 load_s \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 1722 interleave_1_s v16, v17, v18 1723 interleave_1_s v18, v19, v20, v21, v22 1724 uxtl_b v16, v17 1725 uxtl_b v18, v19, v20, v21 1726 172748: 1728 subs \h, \h, #4 1729 load_s \sr2, \src, \s_strd, v23, v24, v25, v26 1730 interleave_1_s v22, v23, v24, v25, v26 1731 uxtl_b v22, v23, v24, v25 1732 mul_mla_8_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 1733 shift_store_4 \type, \d_strd, v1, v2 1734 b.le 0f 1735 load_s \sr2, \src, \s_strd, v27, v16 1736 subs \h, \h, #2 1737 interleave_1_s v26, v27, v16 1738 uxtl_b v26, v27 1739 mul_mla_8_0 v1, v20, v21, v22, v23, v24, v25, v26, v27 1740 shift_store_4 \type, \d_strd, v1 1741 b.le 0f 1742 load_s \sr2, \src, \s_strd, v17, v18 1743 subs \h, \h, #2 1744 interleave_1_s v16, v17, v18 1745 uxtl_b v16, v17 1746 mul_mla_8_0 v2, v22, v23, v24, v25, v26, v27, v16, v17 1747 shift_store_4 \type, \d_strd, v2 1748 b.le 0f 1749 subs \h, \h, #4 1750 load_s \sr2, \src, \s_strd, v19, v20, v21, v22 1751 interleave_1_s v18, v19, v20, v21, v22 1752 uxtl_b v18, v19, v20, v21 1753 mul_mla_8_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21 1754 shift_store_4 \type, \d_strd, v1, v2 1755 b.gt 48b 17560: 1757 ret 1758 175980: 1760 AARCH64_VALID_JUMP_TARGET 1761 b.gt 880f 1762 1763 // 8x2, 8x4 v 1764 cmp \h, #2 1765 add \xmy, \xmy, #2 1766 ld1 {v0.s}[0], [\xmy] 1767 sub \src, \src, \s_strd 1768 add \ds2, \dst, \d_strd 1769 add \sr2, \src, \s_strd 1770 lsl \s_strd, \s_strd, #1 1771 lsl \d_strd, \d_strd, #1 1772 sxtl v0.8h, v0.8b 1773 1774 load_8b \src, \sr2, \s_strd, v1, v2, v3, v4, v5 1775 uxtl_b v1, v2, v3, v4, v5 1776 mul_mla_4 v6, v1, v2, v3, v4, .8h 1777 mul_mla_4 v7, v2, v3, v4, v5, .8h 1778 shift_store_8 \type, \d_strd, v6, v7 1779 b.le 0f 1780 load_8b \sr2, \src, \s_strd, v6, v7 1781 uxtl_b v6, v7 1782 mul_mla_4 v1, v3, v4, v5, v6, .8h 1783 mul_mla_4 v2, v4, v5, v6, v7, .8h 1784 shift_store_8 \type, \d_strd, v1, v2 17850: 1786 ret 1787 1788880: // 8x6, 8x8, 8x16, 8x32 v 17891680: // 16x8, 16x16, ... 1790320: // 32x8, 32x16, ... 1791640: 17921280: 1793 AARCH64_VALID_JUMP_TARGET 1794 ld1 {v0.8b}, [\xmy] 1795 sub \src, \src, \s_strd 1796 sub \src, \src, \s_strd, lsl #1 1797 sxtl v0.8h, v0.8b 1798 mov \my, \h 1799168: 1800 add \ds2, \dst, \d_strd 1801 add \sr2, \src, \s_strd 1802 lsl \s_strd, \s_strd, #1 1803 lsl \d_strd, \d_strd, #1 1804 1805 load_8b \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 1806 uxtl_b v16, v17, v18, v19, v20, v21, v22 1807 180888: 1809 subs \h, \h, #2 1810 load_8b \sr2, \src, \s_strd, v23, v24 1811 uxtl_b v23, v24 1812 mul_mla_8_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24 1813 shift_store_8 \type, \d_strd, v1, v2 1814 b.le 9f 1815 subs \h, \h, #2 1816 load_8b \sr2, \src, \s_strd, v25, v26 1817 uxtl_b v25, v26 1818 mul_mla_8_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26 1819 shift_store_8 \type, \d_strd, v3, v4 1820 b.le 9f 1821 subs \h, \h, #2 1822 load_8b \sr2, \src, \s_strd, v27, v16 1823 uxtl_b v27, v16 1824 mul_mla_8_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16 1825 shift_store_8 \type, \d_strd, v1, v2 1826 b.le 9f 1827 subs \h, \h, #2 1828 load_8b \sr2, \src, \s_strd, v17, v18 1829 uxtl_b v17, v18 1830 mul_mla_8_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18 1831 shift_store_8 \type, \d_strd, v3, v4 1832 b.le 9f 1833 subs \h, \h, #4 1834 load_8b \sr2, \src, \s_strd, v19, v20, v21, v22 1835 uxtl_b v19, v20, v21, v22 1836 mul_mla_8_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20 1837 mul_mla_8_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22 1838 shift_store_8 \type, \d_strd, v1, v2, v3, v4 1839 b.gt 88b 18409: 1841 subs \w, \w, #8 1842 b.le 0f 1843 asr \s_strd, \s_strd, #1 1844 asr \d_strd, \d_strd, #1 1845 msub \src, \s_strd, \xmy, \src 1846 msub \dst, \d_strd, \xmy, \dst 1847 sub \src, \src, \s_strd, lsl #3 1848 mov \h, \my 1849 add \src, \src, #8 1850.ifc \type, put 1851 add \dst, \dst, #8 1852.else 1853 add \dst, \dst, #16 1854.endif 1855 b 168b 18560: 1857 ret 1858 1859160: 1860 AARCH64_VALID_JUMP_TARGET 1861 b.gt 1680b 1862 1863 // 16x2, 16x4 v 1864 add \xmy, \xmy, #2 1865 ld1 {v0.s}[0], [\xmy] 1866 sub \src, \src, \s_strd 1867 add \ds2, \dst, \d_strd 1868 add \sr2, \src, \s_strd 1869 lsl \s_strd, \s_strd, #1 1870 lsl \d_strd, \d_strd, #1 1871 sxtl v0.8h, v0.8b 1872 1873 cmp \h, #2 1874 load_16b \src, \sr2, \s_strd, v1, v2, v3, v4, v5 1875 uxtl v16.8h, v1.8b 1876 uxtl v17.8h, v2.8b 1877 uxtl v18.8h, v3.8b 1878 uxtl v19.8h, v4.8b 1879 uxtl v20.8h, v5.8b 1880 uxtl2 v23.8h, v1.16b 1881 uxtl2 v24.8h, v2.16b 1882 uxtl2 v25.8h, v3.16b 1883 uxtl2 v26.8h, v4.16b 1884 uxtl2 v27.8h, v5.16b 1885 mul_mla_4 v1, v16, v17, v18, v19, .8h 1886 mul_mla_4 v16, v17, v18, v19, v20, .8h 1887 mul_mla_4 v2, v23, v24, v25, v26, .8h 1888 mul_mla_4 v17, v24, v25, v26, v27, .8h 1889 shift_store_16 \type, \d_strd, v1, v2, v16, v17 1890 b.le 0f 1891 load_16b \sr2, \src, \s_strd, v6, v7 1892 uxtl v21.8h, v6.8b 1893 uxtl v22.8h, v7.8b 1894 uxtl2 v28.8h, v6.16b 1895 uxtl2 v29.8h, v7.16b 1896 mul_mla_4 v1, v18, v19, v20, v21, .8h 1897 mul_mla_4 v3, v19, v20, v21, v22, .8h 1898 mul_mla_4 v2, v25, v26, v27, v28, .8h 1899 mul_mla_4 v4, v26, v27, v28, v29, .8h 1900 shift_store_16 \type, \d_strd, v1, v2, v3, v4 19010: 1902 ret 1903 1904L(\type\()_8tap_v_tbl): 1905 .hword L(\type\()_8tap_v_tbl) - 1280b 1906 .hword L(\type\()_8tap_v_tbl) - 640b 1907 .hword L(\type\()_8tap_v_tbl) - 320b 1908 .hword L(\type\()_8tap_v_tbl) - 160b 1909 .hword L(\type\()_8tap_v_tbl) - 80b 1910 .hword L(\type\()_8tap_v_tbl) - 40b 1911 .hword L(\type\()_8tap_v_tbl) - 20b 1912 .hword 0 1913 1914L(\type\()_8tap_hv): 1915 cmp \h, #4 1916 ubfx w9, \my, #7, #7 1917 and \my, \my, #0x7f 1918 b.le 4f 1919 mov \my, w9 19204: 1921 add \xmy, x10, \my, uxtw #3 1922 1923 adr x9, L(\type\()_8tap_hv_tbl) 1924 ldrh w8, [x9, x8, lsl #1] 1925 sub x9, x9, w8, uxtw 1926 br x9 1927 192820: 1929 AARCH64_VALID_JUMP_TARGET 1930.ifc \type, put 1931 add \xmx, \xmx, #2 1932 ld1 {v0.s}[0], [\xmx] 1933 b.gt 280f 1934 add \xmy, \xmy, #2 1935 ld1 {v1.s}[0], [\xmy] 1936 1937 // 2x2, 2x4 hv 1938 sub \sr2, \src, #1 1939 sub \src, \sr2, \s_strd 1940 add \ds2, \dst, \d_strd 1941 lsl \s_strd, \s_strd, #1 1942 lsl \d_strd, \d_strd, #1 1943 sxtl v0.8h, v0.8b 1944 sxtl v1.8h, v1.8b 1945 mov x15, x30 1946 1947 ld1 {v28.8b}, [\src], \s_strd 1948 uxtl v28.8h, v28.8b 1949 ext v29.16b, v28.16b, v28.16b, #2 1950 mul v28.4h, v28.4h, v0.4h 1951 mul v29.4h, v29.4h, v0.4h 1952 addp v28.4h, v28.4h, v29.4h 1953 addp v16.4h, v28.4h, v28.4h 1954 srshr v16.4h, v16.4h, #2 1955 bl L(\type\()_8tap_filter_2) 1956 1957 trn1 v16.2s, v16.2s, v28.2s 1958 mov v17.8b, v28.8b 1959 19602: 1961 bl L(\type\()_8tap_filter_2) 1962 1963 ext v18.8b, v17.8b, v28.8b, #4 1964 smull v2.4s, v16.4h, v1.h[0] 1965 smlal v2.4s, v17.4h, v1.h[1] 1966 smlal v2.4s, v18.4h, v1.h[2] 1967 smlal v2.4s, v28.4h, v1.h[3] 1968 1969 sqrshrn v2.4h, v2.4s, #\shift_hv 1970 sqxtun v2.8b, v2.8h 1971 subs \h, \h, #2 1972 st1 {v2.h}[0], [\dst], \d_strd 1973 st1 {v2.h}[1], [\ds2], \d_strd 1974 b.le 0f 1975 mov v16.8b, v18.8b 1976 mov v17.8b, v28.8b 1977 b 2b 1978 1979280: // 2x8, 2x16, 2x32 hv 1980 ld1 {v1.8b}, [\xmy] 1981 sub \src, \src, #1 1982 sub \sr2, \src, \s_strd, lsl #1 1983 sub \src, \sr2, \s_strd 1984 add \ds2, \dst, \d_strd 1985 lsl \s_strd, \s_strd, #1 1986 lsl \d_strd, \d_strd, #1 1987 sxtl v0.8h, v0.8b 1988 sxtl v1.8h, v1.8b 1989 mov x15, x30 1990 1991 ld1 {v28.8b}, [\src], \s_strd 1992 uxtl v28.8h, v28.8b 1993 ext v29.16b, v28.16b, v28.16b, #2 1994 mul v28.4h, v28.4h, v0.4h 1995 mul v29.4h, v29.4h, v0.4h 1996 addp v28.4h, v28.4h, v29.4h 1997 addp v16.4h, v28.4h, v28.4h 1998 srshr v16.4h, v16.4h, #2 1999 2000 bl L(\type\()_8tap_filter_2) 2001 trn1 v16.2s, v16.2s, v28.2s 2002 mov v17.8b, v28.8b 2003 bl L(\type\()_8tap_filter_2) 2004 ext v18.8b, v17.8b, v28.8b, #4 2005 mov v19.8b, v28.8b 2006 bl L(\type\()_8tap_filter_2) 2007 ext v20.8b, v19.8b, v28.8b, #4 2008 mov v21.8b, v28.8b 2009 201028: 2011 bl L(\type\()_8tap_filter_2) 2012 ext v22.8b, v21.8b, v28.8b, #4 2013 smull v2.4s, v16.4h, v1.h[0] 2014 smlal v2.4s, v17.4h, v1.h[1] 2015 smlal v2.4s, v18.4h, v1.h[2] 2016 smlal v2.4s, v19.4h, v1.h[3] 2017 smlal v2.4s, v20.4h, v1.h[4] 2018 smlal v2.4s, v21.4h, v1.h[5] 2019 smlal v2.4s, v22.4h, v1.h[6] 2020 smlal v2.4s, v28.4h, v1.h[7] 2021 2022 sqrshrn v2.4h, v2.4s, #\shift_hv 2023 sqxtun v2.8b, v2.8h 2024 subs \h, \h, #2 2025 st1 {v2.h}[0], [\dst], \d_strd 2026 st1 {v2.h}[1], [\ds2], \d_strd 2027 b.le 0f 2028 mov v16.8b, v18.8b 2029 mov v17.8b, v19.8b 2030 mov v18.8b, v20.8b 2031 mov v19.8b, v21.8b 2032 mov v20.8b, v22.8b 2033 mov v21.8b, v28.8b 2034 b 28b 2035 20360: 2037 ret x15 2038 2039L(\type\()_8tap_filter_2): 2040 ld1 {v28.8b}, [\sr2], \s_strd 2041 ld1 {v30.8b}, [\src], \s_strd 2042 uxtl v28.8h, v28.8b 2043 uxtl v30.8h, v30.8b 2044 ext v29.16b, v28.16b, v28.16b, #2 2045 ext v31.16b, v30.16b, v30.16b, #2 2046 trn1 v27.2s, v28.2s, v30.2s 2047 trn2 v30.2s, v28.2s, v30.2s 2048 trn1 v28.2s, v29.2s, v31.2s 2049 trn2 v31.2s, v29.2s, v31.2s 2050 mul v27.4h, v27.4h, v0.h[0] 2051 mla v27.4h, v28.4h, v0.h[1] 2052 mla v27.4h, v30.4h, v0.h[2] 2053 mla v27.4h, v31.4h, v0.h[3] 2054 srshr v28.4h, v27.4h, #2 2055 ret 2056.endif 2057 205840: 2059 AARCH64_VALID_JUMP_TARGET 2060 add \xmx, \xmx, #2 2061 ld1 {v0.s}[0], [\xmx] 2062 b.gt 480f 2063 add \xmy, \xmy, #2 2064 ld1 {v1.s}[0], [\xmy] 2065 sub \sr2, \src, #1 2066 sub \src, \sr2, \s_strd 2067 add \ds2, \dst, \d_strd 2068 lsl \s_strd, \s_strd, #1 2069 lsl \d_strd, \d_strd, #1 2070 sxtl v0.8h, v0.8b 2071 sxtl v1.8h, v1.8b 2072 mov x15, x30 2073 2074 // 4x2, 4x4 hv 2075 ld1 {v26.8b}, [\src], \s_strd 2076 uxtl v26.8h, v26.8b 2077 ext v28.16b, v26.16b, v26.16b, #2 2078 ext v29.16b, v26.16b, v26.16b, #4 2079 ext v30.16b, v26.16b, v26.16b, #6 2080 mul v31.4h, v26.4h, v0.h[0] 2081 mla v31.4h, v28.4h, v0.h[1] 2082 mla v31.4h, v29.4h, v0.h[2] 2083 mla v31.4h, v30.4h, v0.h[3] 2084 srshr v16.4h, v31.4h, #2 2085 2086 bl L(\type\()_8tap_filter_4) 2087 mov v17.8b, v28.8b 2088 mov v18.8b, v29.8b 2089 20904: 2091 bl L(\type\()_8tap_filter_4) 2092 // Interleaving the mul/mla chains actually hurts performance 2093 // significantly on Cortex A53, thus keeping mul/mla tightly 2094 // chained like this. 2095 smull v2.4s, v16.4h, v1.h[0] 2096 smlal v2.4s, v17.4h, v1.h[1] 2097 smlal v2.4s, v18.4h, v1.h[2] 2098 smlal v2.4s, v28.4h, v1.h[3] 2099 smull v3.4s, v17.4h, v1.h[0] 2100 smlal v3.4s, v18.4h, v1.h[1] 2101 smlal v3.4s, v28.4h, v1.h[2] 2102 smlal v3.4s, v29.4h, v1.h[3] 2103 sqrshrn v2.4h, v2.4s, #\shift_hv 2104 sqrshrn v3.4h, v3.4s, #\shift_hv 2105 subs \h, \h, #2 2106.ifc \type, put 2107 sqxtun v2.8b, v2.8h 2108 sqxtun v3.8b, v3.8h 2109 st1 {v2.s}[0], [\dst], \d_strd 2110 st1 {v3.s}[0], [\ds2], \d_strd 2111.else 2112 st1 {v2.4h}, [\dst], \d_strd 2113 st1 {v3.4h}, [\ds2], \d_strd 2114.endif 2115 b.le 0f 2116 mov v16.8b, v18.8b 2117 mov v17.8b, v28.8b 2118 mov v18.8b, v29.8b 2119 b 4b 2120 2121480: // 4x8, 4x16, 4x32 hv 2122 ld1 {v1.8b}, [\xmy] 2123 sub \src, \src, #1 2124 sub \sr2, \src, \s_strd, lsl #1 2125 sub \src, \sr2, \s_strd 2126 add \ds2, \dst, \d_strd 2127 lsl \s_strd, \s_strd, #1 2128 lsl \d_strd, \d_strd, #1 2129 sxtl v0.8h, v0.8b 2130 sxtl v1.8h, v1.8b 2131 mov x15, x30 2132 2133 ld1 {v26.8b}, [\src], \s_strd 2134 uxtl v26.8h, v26.8b 2135 ext v28.16b, v26.16b, v26.16b, #2 2136 ext v29.16b, v26.16b, v26.16b, #4 2137 ext v30.16b, v26.16b, v26.16b, #6 2138 mul v31.4h, v26.4h, v0.h[0] 2139 mla v31.4h, v28.4h, v0.h[1] 2140 mla v31.4h, v29.4h, v0.h[2] 2141 mla v31.4h, v30.4h, v0.h[3] 2142 srshr v16.4h, v31.4h, #2 2143 2144 bl L(\type\()_8tap_filter_4) 2145 mov v17.8b, v28.8b 2146 mov v18.8b, v29.8b 2147 bl L(\type\()_8tap_filter_4) 2148 mov v19.8b, v28.8b 2149 mov v20.8b, v29.8b 2150 bl L(\type\()_8tap_filter_4) 2151 mov v21.8b, v28.8b 2152 mov v22.8b, v29.8b 2153 215448: 2155 bl L(\type\()_8tap_filter_4) 2156 smull v2.4s, v16.4h, v1.h[0] 2157 smlal v2.4s, v17.4h, v1.h[1] 2158 smlal v2.4s, v18.4h, v1.h[2] 2159 smlal v2.4s, v19.4h, v1.h[3] 2160 smlal v2.4s, v20.4h, v1.h[4] 2161 smlal v2.4s, v21.4h, v1.h[5] 2162 smlal v2.4s, v22.4h, v1.h[6] 2163 smlal v2.4s, v28.4h, v1.h[7] 2164 smull v3.4s, v17.4h, v1.h[0] 2165 smlal v3.4s, v18.4h, v1.h[1] 2166 smlal v3.4s, v19.4h, v1.h[2] 2167 smlal v3.4s, v20.4h, v1.h[3] 2168 smlal v3.4s, v21.4h, v1.h[4] 2169 smlal v3.4s, v22.4h, v1.h[5] 2170 smlal v3.4s, v28.4h, v1.h[6] 2171 smlal v3.4s, v29.4h, v1.h[7] 2172 sqrshrn v2.4h, v2.4s, #\shift_hv 2173 sqrshrn v3.4h, v3.4s, #\shift_hv 2174 subs \h, \h, #2 2175.ifc \type, put 2176 sqxtun v2.8b, v2.8h 2177 sqxtun v3.8b, v3.8h 2178 st1 {v2.s}[0], [\dst], \d_strd 2179 st1 {v3.s}[0], [\ds2], \d_strd 2180.else 2181 st1 {v2.4h}, [\dst], \d_strd 2182 st1 {v3.4h}, [\ds2], \d_strd 2183.endif 2184 b.le 0f 2185 mov v16.8b, v18.8b 2186 mov v17.8b, v19.8b 2187 mov v18.8b, v20.8b 2188 mov v19.8b, v21.8b 2189 mov v20.8b, v22.8b 2190 mov v21.8b, v28.8b 2191 mov v22.8b, v29.8b 2192 b 48b 21930: 2194 ret x15 2195 2196L(\type\()_8tap_filter_4): 2197 ld1 {v26.8b}, [\sr2], \s_strd 2198 ld1 {v27.8b}, [\src], \s_strd 2199 uxtl v26.8h, v26.8b 2200 uxtl v27.8h, v27.8b 2201 ext v28.16b, v26.16b, v26.16b, #2 2202 ext v29.16b, v26.16b, v26.16b, #4 2203 ext v30.16b, v26.16b, v26.16b, #6 2204 mul v31.4h, v26.4h, v0.h[0] 2205 mla v31.4h, v28.4h, v0.h[1] 2206 mla v31.4h, v29.4h, v0.h[2] 2207 mla v31.4h, v30.4h, v0.h[3] 2208 ext v28.16b, v27.16b, v27.16b, #2 2209 ext v29.16b, v27.16b, v27.16b, #4 2210 ext v30.16b, v27.16b, v27.16b, #6 2211 mul v27.4h, v27.4h, v0.h[0] 2212 mla v27.4h, v28.4h, v0.h[1] 2213 mla v27.4h, v29.4h, v0.h[2] 2214 mla v27.4h, v30.4h, v0.h[3] 2215 srshr v28.4h, v31.4h, #2 2216 srshr v29.4h, v27.4h, #2 2217 ret 2218 221980: 2220160: 2221320: 2222 AARCH64_VALID_JUMP_TARGET 2223 b.gt 880f 2224 add \xmy, \xmy, #2 2225 ld1 {v0.8b}, [\xmx] 2226 ld1 {v1.s}[0], [\xmy] 2227 sub \src, \src, #3 2228 sub \src, \src, \s_strd 2229 sxtl v0.8h, v0.8b 2230 sxtl v1.8h, v1.8b 2231 mov x15, x30 2232 mov \my, \h 2233 2234164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv 2235 add \ds2, \dst, \d_strd 2236 add \sr2, \src, \s_strd 2237 lsl \d_strd, \d_strd, #1 2238 lsl \s_strd, \s_strd, #1 2239 2240 bl L(\type\()_8tap_filter_8_first) 2241 bl L(\type\()_8tap_filter_8) 2242 mov v17.16b, v24.16b 2243 mov v18.16b, v25.16b 2244 22458: 2246 smull v2.4s, v16.4h, v1.h[0] 2247 smull2 v3.4s, v16.8h, v1.h[0] 2248 bl L(\type\()_8tap_filter_8) 2249 smull v4.4s, v17.4h, v1.h[0] 2250 smull2 v5.4s, v17.8h, v1.h[0] 2251 smlal v2.4s, v17.4h, v1.h[1] 2252 smlal2 v3.4s, v17.8h, v1.h[1] 2253 smlal v4.4s, v18.4h, v1.h[1] 2254 smlal2 v5.4s, v18.8h, v1.h[1] 2255 smlal v2.4s, v18.4h, v1.h[2] 2256 smlal2 v3.4s, v18.8h, v1.h[2] 2257 smlal v4.4s, v24.4h, v1.h[2] 2258 smlal2 v5.4s, v24.8h, v1.h[2] 2259 smlal v2.4s, v24.4h, v1.h[3] 2260 smlal2 v3.4s, v24.8h, v1.h[3] 2261 smlal v4.4s, v25.4h, v1.h[3] 2262 smlal2 v5.4s, v25.8h, v1.h[3] 2263 sqrshrn v2.4h, v2.4s, #\shift_hv 2264 sqrshrn2 v2.8h, v3.4s, #\shift_hv 2265 sqrshrn v4.4h, v4.4s, #\shift_hv 2266 sqrshrn2 v4.8h, v5.4s, #\shift_hv 2267 subs \h, \h, #2 2268.ifc \type, put 2269 sqxtun v2.8b, v2.8h 2270 sqxtun v4.8b, v4.8h 2271 st1 {v2.8b}, [\dst], \d_strd 2272 st1 {v4.8b}, [\ds2], \d_strd 2273.else 2274 st1 {v2.8h}, [\dst], \d_strd 2275 st1 {v4.8h}, [\ds2], \d_strd 2276.endif 2277 b.le 9f 2278 mov v16.16b, v18.16b 2279 mov v17.16b, v24.16b 2280 mov v18.16b, v25.16b 2281 b 8b 22829: 2283 subs \w, \w, #8 2284 b.le 0f 2285 asr \s_strd, \s_strd, #1 2286 asr \d_strd, \d_strd, #1 2287 msub \src, \s_strd, \xmy, \src 2288 msub \dst, \d_strd, \xmy, \dst 2289 sub \src, \src, \s_strd, lsl #2 2290 mov \h, \my 2291 add \src, \src, #8 2292.ifc \type, put 2293 add \dst, \dst, #8 2294.else 2295 add \dst, \dst, #16 2296.endif 2297 b 164b 2298 2299880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv 2300640: 23011280: 2302 AARCH64_VALID_JUMP_TARGET 2303 ld1 {v0.8b}, [\xmx] 2304 ld1 {v1.8b}, [\xmy] 2305 sub \src, \src, #3 2306 sub \src, \src, \s_strd 2307 sub \src, \src, \s_strd, lsl #1 2308 sxtl v0.8h, v0.8b 2309 sxtl v1.8h, v1.8b 2310 mov x15, x30 2311 mov \my, \h 2312 2313168: 2314 add \ds2, \dst, \d_strd 2315 add \sr2, \src, \s_strd 2316 lsl \d_strd, \d_strd, #1 2317 lsl \s_strd, \s_strd, #1 2318 2319 bl L(\type\()_8tap_filter_8_first) 2320 bl L(\type\()_8tap_filter_8) 2321 mov v17.16b, v24.16b 2322 mov v18.16b, v25.16b 2323 bl L(\type\()_8tap_filter_8) 2324 mov v19.16b, v24.16b 2325 mov v20.16b, v25.16b 2326 bl L(\type\()_8tap_filter_8) 2327 mov v21.16b, v24.16b 2328 mov v22.16b, v25.16b 2329 233088: 2331 smull v2.4s, v16.4h, v1.h[0] 2332 smull2 v3.4s, v16.8h, v1.h[0] 2333 bl L(\type\()_8tap_filter_8) 2334 smull v4.4s, v17.4h, v1.h[0] 2335 smull2 v5.4s, v17.8h, v1.h[0] 2336 smlal v2.4s, v17.4h, v1.h[1] 2337 smlal2 v3.4s, v17.8h, v1.h[1] 2338 smlal v4.4s, v18.4h, v1.h[1] 2339 smlal2 v5.4s, v18.8h, v1.h[1] 2340 smlal v2.4s, v18.4h, v1.h[2] 2341 smlal2 v3.4s, v18.8h, v1.h[2] 2342 smlal v4.4s, v19.4h, v1.h[2] 2343 smlal2 v5.4s, v19.8h, v1.h[2] 2344 smlal v2.4s, v19.4h, v1.h[3] 2345 smlal2 v3.4s, v19.8h, v1.h[3] 2346 smlal v4.4s, v20.4h, v1.h[3] 2347 smlal2 v5.4s, v20.8h, v1.h[3] 2348 smlal v2.4s, v20.4h, v1.h[4] 2349 smlal2 v3.4s, v20.8h, v1.h[4] 2350 smlal v4.4s, v21.4h, v1.h[4] 2351 smlal2 v5.4s, v21.8h, v1.h[4] 2352 smlal v2.4s, v21.4h, v1.h[5] 2353 smlal2 v3.4s, v21.8h, v1.h[5] 2354 smlal v4.4s, v22.4h, v1.h[5] 2355 smlal2 v5.4s, v22.8h, v1.h[5] 2356 smlal v2.4s, v22.4h, v1.h[6] 2357 smlal2 v3.4s, v22.8h, v1.h[6] 2358 smlal v4.4s, v24.4h, v1.h[6] 2359 smlal2 v5.4s, v24.8h, v1.h[6] 2360 smlal v2.4s, v24.4h, v1.h[7] 2361 smlal2 v3.4s, v24.8h, v1.h[7] 2362 smlal v4.4s, v25.4h, v1.h[7] 2363 smlal2 v5.4s, v25.8h, v1.h[7] 2364 sqrshrn v2.4h, v2.4s, #\shift_hv 2365 sqrshrn2 v2.8h, v3.4s, #\shift_hv 2366 sqrshrn v4.4h, v4.4s, #\shift_hv 2367 sqrshrn2 v4.8h, v5.4s, #\shift_hv 2368 subs \h, \h, #2 2369.ifc \type, put 2370 sqxtun v2.8b, v2.8h 2371 sqxtun v4.8b, v4.8h 2372 st1 {v2.8b}, [\dst], \d_strd 2373 st1 {v4.8b}, [\ds2], \d_strd 2374.else 2375 st1 {v2.8h}, [\dst], \d_strd 2376 st1 {v4.8h}, [\ds2], \d_strd 2377.endif 2378 b.le 9f 2379 mov v16.16b, v18.16b 2380 mov v17.16b, v19.16b 2381 mov v18.16b, v20.16b 2382 mov v19.16b, v21.16b 2383 mov v20.16b, v22.16b 2384 mov v21.16b, v24.16b 2385 mov v22.16b, v25.16b 2386 b 88b 23879: 2388 subs \w, \w, #8 2389 b.le 0f 2390 asr \s_strd, \s_strd, #1 2391 asr \d_strd, \d_strd, #1 2392 msub \src, \s_strd, \xmy, \src 2393 msub \dst, \d_strd, \xmy, \dst 2394 sub \src, \src, \s_strd, lsl #3 2395 mov \h, \my 2396 add \src, \src, #8 2397.ifc \type, put 2398 add \dst, \dst, #8 2399.else 2400 add \dst, \dst, #16 2401.endif 2402 b 168b 24030: 2404 ret x15 2405 2406L(\type\()_8tap_filter_8_first): 2407 ld1 {v28.8b, v29.8b}, [\src], \s_strd 2408 uxtl v28.8h, v28.8b 2409 uxtl v29.8h, v29.8b 2410 mul v16.8h, v28.8h, v0.h[0] 2411 ext v24.16b, v28.16b, v29.16b, #(2*1) 2412 ext v25.16b, v28.16b, v29.16b, #(2*2) 2413 ext v26.16b, v28.16b, v29.16b, #(2*3) 2414 ext v27.16b, v28.16b, v29.16b, #(2*4) 2415 mla v16.8h, v24.8h, v0.h[1] 2416 mla v16.8h, v25.8h, v0.h[2] 2417 mla v16.8h, v26.8h, v0.h[3] 2418 mla v16.8h, v27.8h, v0.h[4] 2419 ext v24.16b, v28.16b, v29.16b, #(2*5) 2420 ext v25.16b, v28.16b, v29.16b, #(2*6) 2421 ext v26.16b, v28.16b, v29.16b, #(2*7) 2422 mla v16.8h, v24.8h, v0.h[5] 2423 mla v16.8h, v25.8h, v0.h[6] 2424 mla v16.8h, v26.8h, v0.h[7] 2425 srshr v16.8h, v16.8h, #2 2426 ret 2427 2428L(\type\()_8tap_filter_8): 2429 ld1 {v28.8b, v29.8b}, [\sr2], \s_strd 2430 ld1 {v30.8b, v31.8b}, [\src], \s_strd 2431 uxtl v28.8h, v28.8b 2432 uxtl v29.8h, v29.8b 2433 uxtl v30.8h, v30.8b 2434 uxtl v31.8h, v31.8b 2435 mul v24.8h, v28.8h, v0.h[0] 2436 mul v25.8h, v30.8h, v0.h[0] 2437.irpc i, 1234567 2438 ext v26.16b, v28.16b, v29.16b, #(2*\i) 2439 ext v27.16b, v30.16b, v31.16b, #(2*\i) 2440 mla v24.8h, v26.8h, v0.h[\i] 2441 mla v25.8h, v27.8h, v0.h[\i] 2442.endr 2443 srshr v24.8h, v24.8h, #2 2444 srshr v25.8h, v25.8h, #2 2445 ret 2446 2447L(\type\()_8tap_hv_tbl): 2448 .hword L(\type\()_8tap_hv_tbl) - 1280b 2449 .hword L(\type\()_8tap_hv_tbl) - 640b 2450 .hword L(\type\()_8tap_hv_tbl) - 320b 2451 .hword L(\type\()_8tap_hv_tbl) - 160b 2452 .hword L(\type\()_8tap_hv_tbl) - 80b 2453 .hword L(\type\()_8tap_hv_tbl) - 40b 2454 .hword L(\type\()_8tap_hv_tbl) - 20b 2455 .hword 0 2456endfunc 2457 2458 2459function \type\()_bilin_8bpc_neon, export=1 2460 dup v1.16b, \mx 2461 dup v3.16b, \my 2462 mov w9, #16 2463 sub w8, w9, \mx 2464 sub w9, w9, \my 2465 dup v0.16b, w8 2466 dup v2.16b, w9 2467.ifc \type, prep 2468 uxtw \d_strd, \w 2469 lsl \d_strd, \d_strd, #1 2470.endif 2471 2472 clz w8, \w 2473 sub w8, w8, #24 2474 cbnz \mx, L(\type\()_bilin_h) 2475 cbnz \my, L(\type\()_bilin_v) 2476 b \type\()_neon 2477 2478L(\type\()_bilin_h): 2479 cbnz \my, L(\type\()_bilin_hv) 2480 2481 adr x9, L(\type\()_bilin_h_tbl) 2482 ldrh w8, [x9, x8, lsl #1] 2483 sub x9, x9, w8, uxtw 2484 br x9 2485 248620: // 2xN h 2487 AARCH64_VALID_JUMP_TARGET 2488.ifc \type, put 2489 add \ds2, \dst, \d_strd 2490 add \sr2, \src, \s_strd 2491 lsl \d_strd, \d_strd, #1 2492 lsl \s_strd, \s_strd, #1 24932: 2494 ld1 {v4.s}[0], [\src], \s_strd 2495 ld1 {v6.s}[0], [\sr2], \s_strd 2496 ext v5.8b, v4.8b, v4.8b, #1 2497 ext v7.8b, v6.8b, v6.8b, #1 2498 trn1 v4.4h, v4.4h, v6.4h 2499 trn1 v5.4h, v5.4h, v7.4h 2500 subs \h, \h, #2 2501 umull v4.8h, v4.8b, v0.8b 2502 umlal v4.8h, v5.8b, v1.8b 2503 uqrshrn v4.8b, v4.8h, #4 2504 st1 {v4.h}[0], [\dst], \d_strd 2505 st1 {v4.h}[1], [\ds2], \d_strd 2506 b.gt 2b 2507 ret 2508.endif 2509 251040: // 4xN h 2511 AARCH64_VALID_JUMP_TARGET 2512 add \ds2, \dst, \d_strd 2513 add \sr2, \src, \s_strd 2514 lsl \d_strd, \d_strd, #1 2515 lsl \s_strd, \s_strd, #1 25164: 2517 ld1 {v4.8b}, [\src], \s_strd 2518 ld1 {v6.8b}, [\sr2], \s_strd 2519 ext v5.8b, v4.8b, v4.8b, #1 2520 ext v7.8b, v6.8b, v6.8b, #1 2521 trn1 v4.2s, v4.2s, v6.2s 2522 trn1 v5.2s, v5.2s, v7.2s 2523 subs \h, \h, #2 2524 umull v4.8h, v4.8b, v0.8b 2525 umlal v4.8h, v5.8b, v1.8b 2526.ifc \type, put 2527 uqrshrn v4.8b, v4.8h, #4 2528 st1 {v4.s}[0], [\dst], \d_strd 2529 st1 {v4.s}[1], [\ds2], \d_strd 2530.else 2531 st1 {v4.d}[0], [\dst], \d_strd 2532 st1 {v4.d}[1], [\ds2], \d_strd 2533.endif 2534 b.gt 4b 2535 ret 2536 253780: // 8xN h 2538 AARCH64_VALID_JUMP_TARGET 2539 add \ds2, \dst, \d_strd 2540 add \sr2, \src, \s_strd 2541 lsl \d_strd, \d_strd, #1 2542 lsl \s_strd, \s_strd, #1 25438: 2544 ld1 {v4.16b}, [\src], \s_strd 2545 ld1 {v6.16b}, [\sr2], \s_strd 2546 ext v5.16b, v4.16b, v4.16b, #1 2547 ext v7.16b, v6.16b, v6.16b, #1 2548 subs \h, \h, #2 2549 umull v4.8h, v4.8b, v0.8b 2550 umull v6.8h, v6.8b, v0.8b 2551 umlal v4.8h, v5.8b, v1.8b 2552 umlal v6.8h, v7.8b, v1.8b 2553.ifc \type, put 2554 uqrshrn v4.8b, v4.8h, #4 2555 uqrshrn v6.8b, v6.8h, #4 2556 st1 {v4.8b}, [\dst], \d_strd 2557 st1 {v6.8b}, [\ds2], \d_strd 2558.else 2559 st1 {v4.8h}, [\dst], \d_strd 2560 st1 {v6.8h}, [\ds2], \d_strd 2561.endif 2562 b.gt 8b 2563 ret 2564160: 2565320: 2566640: 25671280: // 16xN, 32xN, ... h 2568 AARCH64_VALID_JUMP_TARGET 2569 add \ds2, \dst, \d_strd 2570 add \sr2, \src, \s_strd 2571 lsl \s_strd, \s_strd, #1 2572 2573 sub \s_strd, \s_strd, \w, uxtw 2574 sub \s_strd, \s_strd, #8 2575.ifc \type, put 2576 lsl \d_strd, \d_strd, #1 2577 sub \d_strd, \d_strd, \w, uxtw 2578.endif 2579161: 2580 ld1 {v16.d}[1], [\src], #8 2581 ld1 {v20.d}[1], [\sr2], #8 2582 mov \mx, \w 2583 258416: 2585 ld1 {v18.16b}, [\src], #16 2586 ld1 {v22.16b}, [\sr2], #16 2587 ext v17.16b, v16.16b, v18.16b, #8 2588 ext v19.16b, v16.16b, v18.16b, #9 2589 ext v21.16b, v20.16b, v22.16b, #8 2590 ext v23.16b, v20.16b, v22.16b, #9 2591 umull v16.8h, v17.8b, v0.8b 2592 umull2 v17.8h, v17.16b, v0.16b 2593 umull v20.8h, v21.8b, v0.8b 2594 umull2 v21.8h, v21.16b, v0.16b 2595 umlal v16.8h, v19.8b, v1.8b 2596 umlal2 v17.8h, v19.16b, v1.16b 2597 umlal v20.8h, v23.8b, v1.8b 2598 umlal2 v21.8h, v23.16b, v1.16b 2599 subs \mx, \mx, #16 2600.ifc \type, put 2601 uqrshrn v16.8b, v16.8h, #4 2602 uqrshrn2 v16.16b, v17.8h, #4 2603 uqrshrn v20.8b, v20.8h, #4 2604 uqrshrn2 v20.16b, v21.8h, #4 2605 st1 {v16.16b}, [\dst], #16 2606 st1 {v20.16b}, [\ds2], #16 2607.else 2608 st1 {v16.8h, v17.8h}, [\dst], #32 2609 st1 {v20.8h, v21.8h}, [\ds2], #32 2610.endif 2611 b.le 9f 2612 2613 mov v16.16b, v18.16b 2614 mov v20.16b, v22.16b 2615 b 16b 2616 26179: 2618 add \dst, \dst, \d_strd 2619 add \ds2, \ds2, \d_strd 2620 add \src, \src, \s_strd 2621 add \sr2, \sr2, \s_strd 2622 2623 subs \h, \h, #2 2624 b.gt 161b 2625 ret 2626 2627L(\type\()_bilin_h_tbl): 2628 .hword L(\type\()_bilin_h_tbl) - 1280b 2629 .hword L(\type\()_bilin_h_tbl) - 640b 2630 .hword L(\type\()_bilin_h_tbl) - 320b 2631 .hword L(\type\()_bilin_h_tbl) - 160b 2632 .hword L(\type\()_bilin_h_tbl) - 80b 2633 .hword L(\type\()_bilin_h_tbl) - 40b 2634 .hword L(\type\()_bilin_h_tbl) - 20b 2635 .hword 0 2636 2637 2638L(\type\()_bilin_v): 2639 cmp \h, #4 2640 adr x9, L(\type\()_bilin_v_tbl) 2641 ldrh w8, [x9, x8, lsl #1] 2642 sub x9, x9, w8, uxtw 2643 br x9 2644 264520: // 2xN v 2646 AARCH64_VALID_JUMP_TARGET 2647.ifc \type, put 2648 cmp \h, #2 2649 add \ds2, \dst, \d_strd 2650 add \sr2, \src, \s_strd 2651 lsl \s_strd, \s_strd, #1 2652 lsl \d_strd, \d_strd, #1 2653 2654 // 2x2 v 2655 ld1 {v16.h}[0], [\src], \s_strd 2656 b.gt 24f 265722: 2658 ld1 {v17.h}[0], [\sr2], \s_strd 2659 ld1 {v18.h}[0], [\src], \s_strd 2660 trn1 v16.4h, v16.4h, v17.4h 2661 trn1 v17.4h, v17.4h, v18.4h 2662 umull v4.8h, v16.8b, v2.8b 2663 umlal v4.8h, v17.8b, v3.8b 2664 uqrshrn v4.8b, v4.8h, #4 2665 st1 {v4.h}[0], [\dst] 2666 st1 {v4.h}[1], [\ds2] 2667 ret 266824: // 2x4, 2x6, 2x8, ... v 2669 ld1 {v17.h}[0], [\sr2], \s_strd 2670 ld1 {v18.h}[0], [\src], \s_strd 2671 ld1 {v19.h}[0], [\sr2], \s_strd 2672 ld1 {v20.h}[0], [\src], \s_strd 2673 sub \h, \h, #4 2674 trn1 v16.4h, v16.4h, v17.4h 2675 trn1 v17.4h, v17.4h, v18.4h 2676 trn1 v18.4h, v18.4h, v19.4h 2677 trn1 v19.4h, v19.4h, v20.4h 2678 trn1 v16.2s, v16.2s, v18.2s 2679 trn1 v17.2s, v17.2s, v19.2s 2680 umull v4.8h, v16.8b, v2.8b 2681 umlal v4.8h, v17.8b, v3.8b 2682 cmp \h, #2 2683 uqrshrn v4.8b, v4.8h, #4 2684 st1 {v4.h}[0], [\dst], \d_strd 2685 st1 {v4.h}[1], [\ds2], \d_strd 2686 st1 {v4.h}[2], [\dst], \d_strd 2687 st1 {v4.h}[3], [\ds2], \d_strd 2688 b.lt 0f 2689 mov v16.8b, v20.8b 2690 b.eq 22b 2691 b 24b 26920: 2693 ret 2694.endif 2695 269640: // 4xN v 2697 AARCH64_VALID_JUMP_TARGET 2698 add \ds2, \dst, \d_strd 2699 add \sr2, \src, \s_strd 2700 lsl \s_strd, \s_strd, #1 2701 lsl \d_strd, \d_strd, #1 2702 ld1 {v16.s}[0], [\src], \s_strd 27034: 2704 ld1 {v17.s}[0], [\sr2], \s_strd 2705 ld1 {v18.s}[0], [\src], \s_strd 2706 trn1 v16.2s, v16.2s, v17.2s 2707 trn1 v17.2s, v17.2s, v18.2s 2708 umull v4.8h, v16.8b, v2.8b 2709 umlal v4.8h, v17.8b, v3.8b 2710 subs \h, \h, #2 2711.ifc \type, put 2712 uqrshrn v4.8b, v4.8h, #4 2713 st1 {v4.s}[0], [\dst], \d_strd 2714 st1 {v4.s}[1], [\ds2], \d_strd 2715.else 2716 st1 {v4.d}[0], [\dst], \d_strd 2717 st1 {v4.d}[1], [\ds2], \d_strd 2718.endif 2719 b.le 0f 2720 mov v16.8b, v18.8b 2721 b 4b 27220: 2723 ret 2724 272580: // 8xN v 2726 AARCH64_VALID_JUMP_TARGET 2727 add \ds2, \dst, \d_strd 2728 add \sr2, \src, \s_strd 2729 lsl \s_strd, \s_strd, #1 2730 lsl \d_strd, \d_strd, #1 2731 ld1 {v16.8b}, [\src], \s_strd 27328: 2733 ld1 {v17.8b}, [\sr2], \s_strd 2734 ld1 {v18.8b}, [\src], \s_strd 2735 umull v4.8h, v16.8b, v2.8b 2736 umull v5.8h, v17.8b, v2.8b 2737 umlal v4.8h, v17.8b, v3.8b 2738 umlal v5.8h, v18.8b, v3.8b 2739 subs \h, \h, #2 2740.ifc \type, put 2741 uqrshrn v4.8b, v4.8h, #4 2742 uqrshrn v5.8b, v5.8h, #4 2743 st1 {v4.8b}, [\dst], \d_strd 2744 st1 {v5.8b}, [\ds2], \d_strd 2745.else 2746 st1 {v4.8h}, [\dst], \d_strd 2747 st1 {v5.8h}, [\ds2], \d_strd 2748.endif 2749 b.le 0f 2750 mov v16.8b, v18.8b 2751 b 8b 27520: 2753 ret 2754 2755160: // 16xN, 32xN, ... 2756320: 2757640: 27581280: 2759 AARCH64_VALID_JUMP_TARGET 2760 mov \my, \h 27611: 2762 add \ds2, \dst, \d_strd 2763 add \sr2, \src, \s_strd 2764 lsl \s_strd, \s_strd, #1 2765 lsl \d_strd, \d_strd, #1 2766 2767 ld1 {v16.16b}, [\src], \s_strd 27682: 2769 ld1 {v17.16b}, [\sr2], \s_strd 2770 ld1 {v18.16b}, [\src], \s_strd 2771 umull v4.8h, v16.8b, v2.8b 2772 umull2 v5.8h, v16.16b, v2.16b 2773 umull v6.8h, v17.8b, v2.8b 2774 umull2 v7.8h, v17.16b, v2.16b 2775 umlal v4.8h, v17.8b, v3.8b 2776 umlal2 v5.8h, v17.16b, v3.16b 2777 umlal v6.8h, v18.8b, v3.8b 2778 umlal2 v7.8h, v18.16b, v3.16b 2779 subs \h, \h, #2 2780.ifc \type, put 2781 uqrshrn v4.8b, v4.8h, #4 2782 uqrshrn2 v4.16b, v5.8h, #4 2783 uqrshrn v6.8b, v6.8h, #4 2784 uqrshrn2 v6.16b, v7.8h, #4 2785 st1 {v4.16b}, [\dst], \d_strd 2786 st1 {v6.16b}, [\ds2], \d_strd 2787.else 2788 st1 {v4.8h, v5.8h}, [\dst], \d_strd 2789 st1 {v6.8h, v7.8h}, [\ds2], \d_strd 2790.endif 2791 b.le 9f 2792 mov v16.16b, v18.16b 2793 b 2b 27949: 2795 subs \w, \w, #16 2796 b.le 0f 2797 asr \s_strd, \s_strd, #1 2798 asr \d_strd, \d_strd, #1 2799 msub \src, \s_strd, \xmy, \src 2800 msub \dst, \d_strd, \xmy, \dst 2801 sub \src, \src, \s_strd, lsl #1 2802 mov \h, \my 2803 add \src, \src, #16 2804.ifc \type, put 2805 add \dst, \dst, #16 2806.else 2807 add \dst, \dst, #32 2808.endif 2809 b 1b 28100: 2811 ret 2812 2813L(\type\()_bilin_v_tbl): 2814 .hword L(\type\()_bilin_v_tbl) - 1280b 2815 .hword L(\type\()_bilin_v_tbl) - 640b 2816 .hword L(\type\()_bilin_v_tbl) - 320b 2817 .hword L(\type\()_bilin_v_tbl) - 160b 2818 .hword L(\type\()_bilin_v_tbl) - 80b 2819 .hword L(\type\()_bilin_v_tbl) - 40b 2820 .hword L(\type\()_bilin_v_tbl) - 20b 2821 .hword 0 2822 2823L(\type\()_bilin_hv): 2824 uxtl v2.8h, v2.8b 2825 uxtl v3.8h, v3.8b 2826 adr x9, L(\type\()_bilin_hv_tbl) 2827 ldrh w8, [x9, x8, lsl #1] 2828 sub x9, x9, w8, uxtw 2829 br x9 2830 283120: // 2xN hv 2832 AARCH64_VALID_JUMP_TARGET 2833.ifc \type, put 2834 add \sr2, \src, \s_strd 2835 add \ds2, \dst, \d_strd 2836 lsl \s_strd, \s_strd, #1 2837 lsl \d_strd, \d_strd, #1 2838 2839 ld1 {v28.s}[0], [\src], \s_strd 2840 ext v29.8b, v28.8b, v28.8b, #1 2841 umull v16.8h, v28.8b, v0.8b 2842 umlal v16.8h, v29.8b, v1.8b 2843 28442: 2845 ld1 {v28.s}[0], [\sr2], \s_strd 2846 ld1 {v30.s}[0], [\src], \s_strd 2847 ext v29.8b, v28.8b, v28.8b, #1 2848 ext v31.8b, v30.8b, v30.8b, #1 2849 trn1 v28.4h, v28.4h, v30.4h 2850 trn1 v29.4h, v29.4h, v31.4h 2851 umull v17.8h, v28.8b, v0.8b 2852 umlal v17.8h, v29.8b, v1.8b 2853 2854 trn1 v16.2s, v16.2s, v17.2s 2855 2856 mul v4.4h, v16.4h, v2.4h 2857 mla v4.4h, v17.4h, v3.4h 2858 uqrshrn v4.8b, v4.8h, #8 2859 subs \h, \h, #2 2860 st1 {v4.h}[0], [\dst], \d_strd 2861 st1 {v4.h}[1], [\ds2], \d_strd 2862 b.le 0f 2863 trn2 v16.2s, v17.2s, v17.2s 2864 b 2b 28650: 2866 ret 2867.endif 2868 286940: // 4xN hv 2870 AARCH64_VALID_JUMP_TARGET 2871 add \sr2, \src, \s_strd 2872 add \ds2, \dst, \d_strd 2873 lsl \s_strd, \s_strd, #1 2874 lsl \d_strd, \d_strd, #1 2875 2876 ld1 {v28.8b}, [\src], \s_strd 2877 ext v29.8b, v28.8b, v28.8b, #1 2878 umull v16.8h, v28.8b, v0.8b 2879 umlal v16.8h, v29.8b, v1.8b 2880 28814: 2882 ld1 {v28.8b}, [\sr2], \s_strd 2883 ld1 {v30.8b}, [\src], \s_strd 2884 ext v29.8b, v28.8b, v28.8b, #1 2885 ext v31.8b, v30.8b, v30.8b, #1 2886 trn1 v28.2s, v28.2s, v30.2s 2887 trn1 v29.2s, v29.2s, v31.2s 2888 umull v17.8h, v28.8b, v0.8b 2889 umlal v17.8h, v29.8b, v1.8b 2890 2891 trn1 v16.2d, v16.2d, v17.2d 2892 2893 mul v4.8h, v16.8h, v2.8h 2894 mla v4.8h, v17.8h, v3.8h 2895 subs \h, \h, #2 2896.ifc \type, put 2897 uqrshrn v4.8b, v4.8h, #8 2898 st1 {v4.s}[0], [\dst], \d_strd 2899 st1 {v4.s}[1], [\ds2], \d_strd 2900.else 2901 urshr v4.8h, v4.8h, #4 2902 st1 {v4.d}[0], [\dst], \d_strd 2903 st1 {v4.d}[1], [\ds2], \d_strd 2904.endif 2905 b.le 0f 2906 trn2 v16.2d, v17.2d, v17.2d 2907 b 4b 29080: 2909 ret 2910 291180: // 8xN, 16xN, ... hv 2912160: 2913320: 2914640: 29151280: 2916 AARCH64_VALID_JUMP_TARGET 2917 mov \my, \h 2918 29191: 2920 add \sr2, \src, \s_strd 2921 add \ds2, \dst, \d_strd 2922 lsl \s_strd, \s_strd, #1 2923 lsl \d_strd, \d_strd, #1 2924 2925 ld1 {v28.16b}, [\src], \s_strd 2926 ext v29.16b, v28.16b, v28.16b, #1 2927 umull v16.8h, v28.8b, v0.8b 2928 umlal v16.8h, v29.8b, v1.8b 2929 29302: 2931 ld1 {v28.16b}, [\sr2], \s_strd 2932 ld1 {v30.16b}, [\src], \s_strd 2933 ext v29.16b, v28.16b, v28.16b, #1 2934 ext v31.16b, v30.16b, v30.16b, #1 2935 umull v17.8h, v28.8b, v0.8b 2936 umlal v17.8h, v29.8b, v1.8b 2937 umull v18.8h, v30.8b, v0.8b 2938 umlal v18.8h, v31.8b, v1.8b 2939 2940 mul v4.8h, v16.8h, v2.8h 2941 mla v4.8h, v17.8h, v3.8h 2942 mul v5.8h, v17.8h, v2.8h 2943 mla v5.8h, v18.8h, v3.8h 2944 subs \h, \h, #2 2945.ifc \type, put 2946 uqrshrn v4.8b, v4.8h, #8 2947 uqrshrn v5.8b, v5.8h, #8 2948 st1 {v4.8b}, [\dst], \d_strd 2949 st1 {v5.8b}, [\ds2], \d_strd 2950.else 2951 urshr v4.8h, v4.8h, #4 2952 urshr v5.8h, v5.8h, #4 2953 st1 {v4.8h}, [\dst], \d_strd 2954 st1 {v5.8h}, [\ds2], \d_strd 2955.endif 2956 b.le 9f 2957 mov v16.16b, v18.16b 2958 b 2b 29599: 2960 subs \w, \w, #8 2961 b.le 0f 2962 asr \s_strd, \s_strd, #1 2963 asr \d_strd, \d_strd, #1 2964 msub \src, \s_strd, \xmy, \src 2965 msub \dst, \d_strd, \xmy, \dst 2966 sub \src, \src, \s_strd, lsl #1 2967 mov \h, \my 2968 add \src, \src, #8 2969.ifc \type, put 2970 add \dst, \dst, #8 2971.else 2972 add \dst, \dst, #16 2973.endif 2974 b 1b 29750: 2976 ret 2977 2978L(\type\()_bilin_hv_tbl): 2979 .hword L(\type\()_bilin_hv_tbl) - 1280b 2980 .hword L(\type\()_bilin_hv_tbl) - 640b 2981 .hword L(\type\()_bilin_hv_tbl) - 320b 2982 .hword L(\type\()_bilin_hv_tbl) - 160b 2983 .hword L(\type\()_bilin_hv_tbl) - 80b 2984 .hword L(\type\()_bilin_hv_tbl) - 40b 2985 .hword L(\type\()_bilin_hv_tbl) - 20b 2986 .hword 0 2987endfunc 2988.endm 2989 2990filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10 2991filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6 2992 2993.macro load_filter_row dst, src, inc 2994 asr w13, \src, #10 2995 add \src, \src, \inc 2996 ldr \dst, [x11, w13, sxtw #3] 2997.endm 2998 2999function warp_filter_horz_neon 3000 add w12, w5, #512 3001 3002 ld1 {v16.8b, v17.8b}, [x2], x3 3003 3004 load_filter_row d0, w12, w7 3005 load_filter_row d1, w12, w7 3006 load_filter_row d2, w12, w7 3007 load_filter_row d3, w12, w7 3008 load_filter_row d4, w12, w7 3009 load_filter_row d5, w12, w7 3010 load_filter_row d6, w12, w7 3011 // subtract by 128 to allow using smull 3012 eor v16.8b, v16.8b, v22.8b 3013 eor v17.8b, v17.8b, v22.8b 3014 load_filter_row d7, w12, w7 3015 3016 ext v18.8b, v16.8b, v17.8b, #1 3017 ext v19.8b, v16.8b, v17.8b, #2 3018 smull v0.8h, v0.8b, v16.8b 3019 smull v1.8h, v1.8b, v18.8b 3020 ext v18.8b, v16.8b, v17.8b, #3 3021 ext v20.8b, v16.8b, v17.8b, #4 3022 smull v2.8h, v2.8b, v19.8b 3023 smull v3.8h, v3.8b, v18.8b 3024 ext v18.8b, v16.8b, v17.8b, #5 3025 ext v19.8b, v16.8b, v17.8b, #6 3026 smull v4.8h, v4.8b, v20.8b 3027 smull v5.8h, v5.8b, v18.8b 3028 ext v18.8b, v16.8b, v17.8b, #7 3029 smull v6.8h, v6.8b, v19.8b 3030 smull v7.8h, v7.8b, v18.8b 3031 3032 addp v0.8h, v0.8h, v1.8h 3033 addp v2.8h, v2.8h, v3.8h 3034 addp v4.8h, v4.8h, v5.8h 3035 addp v6.8h, v6.8h, v7.8h 3036 3037 addp v0.8h, v0.8h, v2.8h 3038 addp v4.8h, v4.8h, v6.8h 3039 3040 addp v0.8h, v0.8h, v4.8h 3041 3042 add w5, w5, w8 3043 3044 ret 3045endfunc 3046 3047// void dav1d_warp_affine_8x8_8bpc_neon( 3048// pixel *dst, const ptrdiff_t dst_stride, 3049// const pixel *src, const ptrdiff_t src_stride, 3050// const int16_t *const abcd, int mx, int my) 3051.macro warp t, shift 3052function warp_affine_8x8\t\()_8bpc_neon, export=1 3053 ldr x4, [x4] 3054 sbfx x7, x4, #0, #16 3055 sbfx x8, x4, #16, #16 3056 sbfx x9, x4, #32, #16 3057 sbfx x4, x4, #48, #16 3058 mov w10, #8 3059 sub x2, x2, x3, lsl #1 3060 sub x2, x2, x3 3061 sub x2, x2, #3 3062 movrel x11, X(mc_warp_filter), 64*8 3063 mov x15, x30 3064.ifnb \t 3065 lsl x1, x1, #1 3066.endif 3067 3068 movi v22.8b, #128 3069.ifb \t 3070 movi v23.8h, #128 3071.else 3072 movi v23.8h, #8, lsl #8 3073.endif 3074 3075 bl warp_filter_horz_neon 3076 srshr v24.8h, v0.8h, #3 3077 bl warp_filter_horz_neon 3078 srshr v25.8h, v0.8h, #3 3079 bl warp_filter_horz_neon 3080 srshr v26.8h, v0.8h, #3 3081 bl warp_filter_horz_neon 3082 srshr v27.8h, v0.8h, #3 3083 bl warp_filter_horz_neon 3084 srshr v28.8h, v0.8h, #3 3085 bl warp_filter_horz_neon 3086 srshr v29.8h, v0.8h, #3 3087 bl warp_filter_horz_neon 3088 srshr v30.8h, v0.8h, #3 3089 30901: 3091 add w14, w6, #512 3092 bl warp_filter_horz_neon 3093 srshr v31.8h, v0.8h, #3 3094 3095 load_filter_row d0, w14, w9 3096 load_filter_row d1, w14, w9 3097 load_filter_row d2, w14, w9 3098 load_filter_row d3, w14, w9 3099 load_filter_row d4, w14, w9 3100 load_filter_row d5, w14, w9 3101 load_filter_row d6, w14, w9 3102 load_filter_row d7, w14, w9 3103 transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl 3104 3105 // This ordering of smull/smlal/smull2/smlal2 is highly 3106 // beneficial for Cortex A53 here. 3107 smull v16.4s, v24.4h, v0.4h 3108 smlal v16.4s, v25.4h, v1.4h 3109 smlal v16.4s, v26.4h, v2.4h 3110 smlal v16.4s, v27.4h, v3.4h 3111 smlal v16.4s, v28.4h, v4.4h 3112 smlal v16.4s, v29.4h, v5.4h 3113 smlal v16.4s, v30.4h, v6.4h 3114 smlal v16.4s, v31.4h, v7.4h 3115 smull2 v17.4s, v24.8h, v0.8h 3116 smlal2 v17.4s, v25.8h, v1.8h 3117 smlal2 v17.4s, v26.8h, v2.8h 3118 smlal2 v17.4s, v27.8h, v3.8h 3119 smlal2 v17.4s, v28.8h, v4.8h 3120 smlal2 v17.4s, v29.8h, v5.8h 3121 smlal2 v17.4s, v30.8h, v6.8h 3122 smlal2 v17.4s, v31.8h, v7.8h 3123 3124 mov v24.16b, v25.16b 3125 mov v25.16b, v26.16b 3126 sqrshrn v16.4h, v16.4s, #\shift 3127 mov v26.16b, v27.16b 3128 sqrshrn2 v16.8h, v17.4s, #\shift 3129 mov v27.16b, v28.16b 3130 mov v28.16b, v29.16b 3131 add v16.8h, v16.8h, v23.8h 3132.ifb \t 3133 sqxtun v16.8b, v16.8h 3134.endif 3135 mov v29.16b, v30.16b 3136 mov v30.16b, v31.16b 3137 subs w10, w10, #1 3138.ifnb \t 3139 st1 {v16.8h}, [x0], x1 3140.else 3141 st1 {v16.8b}, [x0], x1 3142.endif 3143 3144 add w6, w6, w4 3145 b.gt 1b 3146 3147 ret x15 3148endfunc 3149.endm 3150 3151warp , 11 3152warp t, 7 3153 3154// void dav1d_emu_edge_8bpc_neon( 3155// const intptr_t bw, const intptr_t bh, 3156// const intptr_t iw, const intptr_t ih, 3157// const intptr_t x, const intptr_t y, 3158// pixel *dst, const ptrdiff_t dst_stride, 3159// const pixel *ref, const ptrdiff_t ref_stride) 3160function emu_edge_8bpc_neon, export=1 3161 ldp x8, x9, [sp] 3162 3163 // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) 3164 // ref += iclip(x, 0, iw - 1) 3165 sub x12, x3, #1 // ih - 1 3166 cmp x5, x3 3167 sub x13, x2, #1 // iw - 1 3168 csel x12, x12, x5, ge // min(y, ih - 1) 3169 cmp x4, x2 3170 bic x12, x12, x12, asr #63 // max(min(y, ih - 1), 0) 3171 csel x13, x13, x4, ge // min(x, iw - 1) 3172 bic x13, x13, x13, asr #63 // max(min(x, iw - 1), 0) 3173 madd x8, x12, x9, x8 // ref += iclip() * stride 3174 add x8, x8, x13 // ref += iclip() 3175 3176 // bottom_ext = iclip(y + bh - ih, 0, bh - 1) 3177 // top_ext = iclip(-y, 0, bh - 1) 3178 add x10, x5, x1 // y + bh 3179 neg x5, x5 // -y 3180 sub x10, x10, x3 // y + bh - ih 3181 sub x12, x1, #1 // bh - 1 3182 cmp x10, x1 3183 bic x5, x5, x5, asr #63 // max(-y, 0) 3184 csel x10, x10, x12, lt // min(y + bh - ih, bh-1) 3185 cmp x5, x1 3186 bic x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0) 3187 csel x5, x5, x12, lt // min(max(-y, 0), bh-1) 3188 3189 // right_ext = iclip(x + bw - iw, 0, bw - 1) 3190 // left_ext = iclip(-x, 0, bw - 1) 3191 add x11, x4, x0 // x + bw 3192 neg x4, x4 // -x 3193 sub x11, x11, x2 // x + bw - iw 3194 sub x13, x0, #1 // bw - 1 3195 cmp x11, x0 3196 bic x4, x4, x4, asr #63 // max(-x, 0) 3197 csel x11, x11, x13, lt // min(x + bw - iw, bw-1) 3198 cmp x4, x0 3199 bic x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0) 3200 csel x4, x4, x13, lt // min(max(-x, 0), bw - 1) 3201 3202 // center_h = bh - top_ext - bottom_ext 3203 // dst += top_ext * PXSTRIDE(dst_stride) 3204 // center_w = bw - left_ext - right_ext 3205 sub x1, x1, x5 // bh - top_ext 3206 madd x6, x5, x7, x6 3207 sub x2, x0, x4 // bw - left_ext 3208 sub x1, x1, x10 // center_h = bh - top_ext - bottom_ext 3209 sub x2, x2, x11 // center_w = bw - left_ext - right_ext 3210 3211 mov x14, x6 // backup of dst 3212 3213.macro v_loop need_left, need_right 32140: 3215.if \need_left 3216 ld1r {v0.16b}, [x8] 3217 mov x12, x6 // out = dst 3218 mov x3, x4 32191: 3220 subs x3, x3, #16 3221 st1 {v0.16b}, [x12], #16 3222 b.gt 1b 3223.endif 3224 mov x13, x8 3225 add x12, x6, x4 // out = dst + left_ext 3226 mov x3, x2 32271: 3228 ld1 {v0.16b, v1.16b}, [x13], #32 3229 subs x3, x3, #32 3230 st1 {v0.16b, v1.16b}, [x12], #32 3231 b.gt 1b 3232.if \need_right 3233 add x3, x8, x2 // in + center_w 3234 sub x3, x3, #1 // in + center_w - 1 3235 add x12, x6, x4 // dst + left_ext 3236 ld1r {v0.16b}, [x3] 3237 add x12, x12, x2 // out = dst + left_ext + center_w 3238 mov x3, x11 32391: 3240 subs x3, x3, #16 3241 st1 {v0.16b}, [x12], #16 3242 b.gt 1b 3243.endif 3244 3245 subs x1, x1, #1 // center_h-- 3246 add x6, x6, x7 3247 add x8, x8, x9 3248 b.gt 0b 3249.endm 3250 3251 cbz x4, 2f 3252 // need_left 3253 cbz x11, 3f 3254 // need_left + need_right 3255 v_loop 1, 1 3256 b 5f 3257 32582: 3259 // !need_left 3260 cbz x11, 4f 3261 // !need_left + need_right 3262 v_loop 0, 1 3263 b 5f 3264 32653: 3266 // need_left + !need_right 3267 v_loop 1, 0 3268 b 5f 3269 32704: 3271 // !need_left + !need_right 3272 v_loop 0, 0 3273 32745: 3275 3276 cbz x10, 3f 3277 // need_bottom 3278 sub x8, x6, x7 // ref = dst - stride 3279 mov x4, x0 32801: 3281 ld1 {v0.16b, v1.16b}, [x8], #32 3282 mov x3, x10 32832: 3284 subs x3, x3, #1 3285 st1 {v0.16b, v1.16b}, [x6], x7 3286 b.gt 2b 3287 msub x6, x7, x10, x6 // dst -= bottom_ext * stride 3288 subs x4, x4, #32 // bw -= 32 3289 add x6, x6, #32 // dst += 32 3290 b.gt 1b 3291 32923: 3293 cbz x5, 3f 3294 // need_top 3295 msub x6, x7, x5, x14 // dst = stored_dst - top_ext * stride 32961: 3297 ld1 {v0.16b, v1.16b}, [x14], #32 3298 mov x3, x5 32992: 3300 subs x3, x3, #1 3301 st1 {v0.16b, v1.16b}, [x6], x7 3302 b.gt 2b 3303 msub x6, x7, x5, x6 // dst -= top_ext * stride 3304 subs x0, x0, #32 // bw -= 32 3305 add x6, x6, #32 // dst += 32 3306 b.gt 1b 3307 33083: 3309 ret 3310endfunc 3311