/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Janne Grunau
 * Copyright © 2018, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */ 28 29#include "src/arm/asm.S" 30#include "util.S" 31 32.macro avg dst, t0, t1, t2, t3 33 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 34 ld1 {\t2\().8h,\t3\().8h}, [x3], 32 35 add \t0\().8h, \t0\().8h, \t2\().8h 36 add \t1\().8h, \t1\().8h, \t3\().8h 37 sqrshrun \dst\().8b, \t0\().8h, #5 38 sqrshrun2 \dst\().16b, \t1\().8h, #5 39.endm 40 41.macro w_avg dst, t0, t1, t2, t3 42 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 43 ld1 {\t2\().8h,\t3\().8h}, [x3], 32 44 sub \t0\().8h, \t2\().8h, \t0\().8h 45 sub \t1\().8h, \t3\().8h, \t1\().8h 46 sqdmulh \t0\().8h, \t0\().8h, v30.8h 47 sqdmulh \t1\().8h, \t1\().8h, v30.8h 48 add \t0\().8h, \t2\().8h, \t0\().8h 49 add \t1\().8h, \t3\().8h, \t1\().8h 50 sqrshrun \dst\().8b, \t0\().8h, #4 51 sqrshrun2 \dst\().16b, \t1\().8h, #4 52.endm 53 54.macro mask dst, t0, t1, t2, t3 55 ld1 {v30.16b}, [x6], 16 56 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 57 mul v30.16b, v30.16b, v31.16b 58 ld1 {\t2\().8h,\t3\().8h}, [x3], 32 59 shll v28.8h, v30.8b, #8 60 shll2 v29.8h, v30.16b, #8 61 sub \t0\().8h, \t2\().8h, \t0\().8h 62 sub \t1\().8h, \t3\().8h, \t1\().8h 63 sqdmulh \t0\().8h, \t0\().8h, v28.8h 64 sqdmulh \t1\().8h, \t1\().8h, v29.8h 65 add \t0\().8h, \t2\().8h, \t0\().8h 66 add \t1\().8h, \t3\().8h, \t1\().8h 67 sqrshrun \dst\().8b, \t0\().8h, #4 68 sqrshrun2 \dst\().16b, \t1\().8h, #4 69.endm 70 71.macro bidir_fn type 72function \type\()_8bpc_neon, export=1 73 clz w4, w4 74.ifc \type, w_avg 75 dup v30.8h, w6 76 neg v30.8h, v30.8h 77 shl v30.8h, v30.8h, #11 78.endif 79.ifc \type, mask 80 movi v31.16b, #256-2 81.endif 82 adr x7, L(\type\()_tbl) 83 sub w4, w4, #24 84 ldrh w4, [x7, x4, lsl #1] 85 \type v4, v0, v1, v2, v3 86 sub x7, x7, w4, uxtw 87 br x7 8840: 89 add x7, x0, x1 90 lsl x1, x1, #1 914: 92 cmp w5, #4 93 st1 {v4.s}[0], [x0], x1 94 st1 {v4.s}[1], [x7], x1 95 st1 {v4.s}[2], [x0], x1 96 st1 {v4.s}[3], [x7], x1 97 b.eq 0f 98 \type v5, v0, v1, v2, v3 99 cmp w5, #8 100 st1 {v5.s}[0], [x0], x1 101 st1 {v5.s}[1], [x7], x1 102 st1 {v5.s}[2], [x0], x1 103 
st1 {v5.s}[3], [x7], x1 104 b.eq 0f 105 \type v4, v0, v1, v2, v3 106 st1 {v4.s}[0], [x0], x1 107 st1 {v4.s}[1], [x7], x1 108 \type v5, v0, v1, v2, v3 109 st1 {v4.s}[2], [x0], x1 110 st1 {v4.s}[3], [x7], x1 111 st1 {v5.s}[0], [x0], x1 112 st1 {v5.s}[1], [x7], x1 113 st1 {v5.s}[2], [x0], x1 114 st1 {v5.s}[3], [x7], x1 115 ret 11680: 117 add x7, x0, x1 118 lsl x1, x1, #1 1198: 120 st1 {v4.d}[0], [x0], x1 121 \type v5, v0, v1, v2, v3 122 st1 {v4.d}[1], [x7], x1 123 st1 {v5.d}[0], [x0], x1 124 subs w5, w5, #4 125 st1 {v5.d}[1], [x7], x1 126 b.le 0f 127 \type v4, v0, v1, v2, v3 128 b 8b 12916: 130 \type v5, v0, v1, v2, v3 131 st1 {v4.16b}, [x0], x1 132 \type v6, v0, v1, v2, v3 133 st1 {v5.16b}, [x0], x1 134 \type v7, v0, v1, v2, v3 135 st1 {v6.16b}, [x0], x1 136 subs w5, w5, #4 137 st1 {v7.16b}, [x0], x1 138 b.le 0f 139 \type v4, v0, v1, v2, v3 140 b 16b 141320: 142 add x7, x0, x1 143 lsl x1, x1, #1 14432: 145 \type v5, v0, v1, v2, v3 146 \type v6, v0, v1, v2, v3 147 st1 {v4.16b,v5.16b}, [x0], x1 148 \type v7, v0, v1, v2, v3 149 subs w5, w5, #2 150 st1 {v6.16b,v7.16b}, [x7], x1 151 b.le 0f 152 \type v4, v0, v1, v2, v3 153 b 32b 154640: 155 add x7, x0, x1 156 lsl x1, x1, #1 15764: 158 \type v5, v0, v1, v2, v3 159 \type v6, v0, v1, v2, v3 160 \type v7, v0, v1, v2, v3 161 \type v16, v0, v1, v2, v3 162 \type v17, v0, v1, v2, v3 163 st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1 164 \type v18, v0, v1, v2, v3 165 \type v19, v0, v1, v2, v3 166 subs w5, w5, #2 167 st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1 168 b.le 0f 169 \type v4, v0, v1, v2, v3 170 b 64b 1711280: 172 add x7, x0, #64 173128: 174 \type v5, v0, v1, v2, v3 175 \type v6, v0, v1, v2, v3 176 \type v7, v0, v1, v2, v3 177 \type v16, v0, v1, v2, v3 178 \type v17, v0, v1, v2, v3 179 st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1 180 \type v18, v0, v1, v2, v3 181 \type v19, v0, v1, v2, v3 182 subs w5, w5, #1 183 st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1 184 b.le 0f 185 \type v4, v0, v1, v2, v3 186 b 128b 1870: 188 
ret 189L(\type\()_tbl): 190 .hword L(\type\()_tbl) - 1280b 191 .hword L(\type\()_tbl) - 640b 192 .hword L(\type\()_tbl) - 320b 193 .hword L(\type\()_tbl) - 16b 194 .hword L(\type\()_tbl) - 80b 195 .hword L(\type\()_tbl) - 40b 196endfunc 197.endm 198 199bidir_fn avg 200bidir_fn w_avg 201bidir_fn mask 202 203 204.macro w_mask_fn type 205function w_mask_\type\()_8bpc_neon, export=1 206 clz w8, w4 207 adr x9, L(w_mask_\type\()_tbl) 208 sub w8, w8, #24 209 ldrh w8, [x9, x8, lsl #1] 210 sub x9, x9, w8, uxtw 211 mov w10, #6903 212 dup v0.8h, w10 213.if \type == 444 214 movi v1.16b, #64 215.elseif \type == 422 216 dup v2.8b, w7 217 movi v3.8b, #129 218 sub v3.8b, v3.8b, v2.8b 219.elseif \type == 420 220 dup v2.8h, w7 221 movi v3.8h, #1, lsl #8 222 sub v3.8h, v3.8h, v2.8h 223.endif 224 add x12, x0, x1 225 lsl x1, x1, #1 226 br x9 2274: 228 ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once) 229 ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once) 230 subs w5, w5, #4 231 sub v16.8h, v6.8h, v4.8h 232 sub v17.8h, v7.8h, v5.8h 233 sabd v18.8h, v4.8h, v6.8h 234 sabd v19.8h, v5.8h, v7.8h 235 uqsub v18.8h, v0.8h, v18.8h 236 uqsub v19.8h, v0.8h, v19.8h 237 ushr v18.8h, v18.8h, #8 238 ushr v19.8h, v19.8h, #8 239 shl v20.8h, v18.8h, #9 240 shl v21.8h, v19.8h, #9 241 sqdmulh v20.8h, v20.8h, v16.8h 242 sqdmulh v21.8h, v21.8h, v17.8h 243 add v20.8h, v20.8h, v4.8h 244 add v21.8h, v21.8h, v5.8h 245 sqrshrun v22.8b, v20.8h, #4 246 sqrshrun v23.8b, v21.8h, #4 247.if \type == 444 248 xtn v18.8b, v18.8h 249 xtn2 v18.16b, v19.8h 250 sub v18.16b, v1.16b, v18.16b 251 st1 {v18.16b}, [x6], #16 252.elseif \type == 422 253 addp v18.8h, v18.8h, v19.8h 254 xtn v18.8b, v18.8h 255 uhsub v18.8b, v3.8b, v18.8b 256 st1 {v18.8b}, [x6], #8 257.elseif \type == 420 258 trn1 v24.2d, v18.2d, v19.2d 259 trn2 v25.2d, v18.2d, v19.2d 260 add v24.8h, v24.8h, v25.8h 261 addp v18.8h, v24.8h, v24.8h 262 sub v18.4h, v3.4h, v18.4h 263 rshrn v18.8b, v18.8h, #2 264 st1 {v18.s}[0], [x6], #4 265.endif 266 st1 
{v22.s}[0], [x0], x1 267 st1 {v22.s}[1], [x12], x1 268 st1 {v23.s}[0], [x0], x1 269 st1 {v23.s}[1], [x12], x1 270 b.gt 4b 271 ret 2728: 273 ld1 {v4.8h, v5.8h}, [x2], #32 274 ld1 {v6.8h, v7.8h}, [x3], #32 275 subs w5, w5, #2 276 sub v16.8h, v6.8h, v4.8h 277 sub v17.8h, v7.8h, v5.8h 278 sabd v18.8h, v4.8h, v6.8h 279 sabd v19.8h, v5.8h, v7.8h 280 uqsub v18.8h, v0.8h, v18.8h 281 uqsub v19.8h, v0.8h, v19.8h 282 ushr v18.8h, v18.8h, #8 283 ushr v19.8h, v19.8h, #8 284 shl v20.8h, v18.8h, #9 285 shl v21.8h, v19.8h, #9 286 sqdmulh v20.8h, v20.8h, v16.8h 287 sqdmulh v21.8h, v21.8h, v17.8h 288 add v20.8h, v20.8h, v4.8h 289 add v21.8h, v21.8h, v5.8h 290 sqrshrun v22.8b, v20.8h, #4 291 sqrshrun v23.8b, v21.8h, #4 292.if \type == 444 293 xtn v18.8b, v18.8h 294 xtn2 v18.16b, v19.8h 295 sub v18.16b, v1.16b, v18.16b 296 st1 {v18.16b}, [x6], #16 297.elseif \type == 422 298 addp v18.8h, v18.8h, v19.8h 299 xtn v18.8b, v18.8h 300 uhsub v18.8b, v3.8b, v18.8b 301 st1 {v18.8b}, [x6], #8 302.elseif \type == 420 303 add v18.8h, v18.8h, v19.8h 304 addp v18.8h, v18.8h, v18.8h 305 sub v18.4h, v3.4h, v18.4h 306 rshrn v18.8b, v18.8h, #2 307 st1 {v18.s}[0], [x6], #4 308.endif 309 st1 {v22.8b}, [x0], x1 310 st1 {v23.8b}, [x12], x1 311 b.gt 8b 312 ret 3131280: 314640: 315320: 316160: 317 mov w11, w4 318 sub x1, x1, w4, uxtw 319.if \type == 444 320 add x10, x6, w4, uxtw 321.elseif \type == 422 322 add x10, x6, x11, lsr #1 323.endif 324 add x9, x3, w4, uxtw #1 325 add x7, x2, w4, uxtw #1 326161: 327 mov w8, w4 32816: 329 ld1 {v4.8h, v5.8h}, [x2], #32 330 ld1 {v6.8h, v7.8h}, [x3], #32 331 ld1 {v16.8h, v17.8h}, [x7], #32 332 ld1 {v18.8h, v19.8h}, [x9], #32 333 subs w8, w8, #16 334 sub v6.8h, v6.8h, v4.8h 335 sub v7.8h, v7.8h, v5.8h 336 sub v18.8h, v18.8h, v16.8h 337 sub v19.8h, v19.8h, v17.8h 338 abs v20.8h, v6.8h 339 abs v21.8h, v7.8h 340 abs v22.8h, v18.8h 341 abs v23.8h, v19.8h 342 uqsub v20.8h, v0.8h, v20.8h 343 uqsub v21.8h, v0.8h, v21.8h 344 uqsub v22.8h, v0.8h, v22.8h 345 uqsub v23.8h, v0.8h, 
v23.8h 346 ushr v20.8h, v20.8h, #8 347 ushr v21.8h, v21.8h, #8 348 ushr v22.8h, v22.8h, #8 349 ushr v23.8h, v23.8h, #8 350 shl v24.8h, v20.8h, #9 351 shl v25.8h, v21.8h, #9 352 shl v26.8h, v22.8h, #9 353 shl v27.8h, v23.8h, #9 354 sqdmulh v24.8h, v24.8h, v6.8h 355 sqdmulh v25.8h, v25.8h, v7.8h 356 sqdmulh v26.8h, v26.8h, v18.8h 357 sqdmulh v27.8h, v27.8h, v19.8h 358 add v24.8h, v24.8h, v4.8h 359 add v25.8h, v25.8h, v5.8h 360 add v26.8h, v26.8h, v16.8h 361 add v27.8h, v27.8h, v17.8h 362 sqrshrun v24.8b, v24.8h, #4 363 sqrshrun v25.8b, v25.8h, #4 364 sqrshrun v26.8b, v26.8h, #4 365 sqrshrun v27.8b, v27.8h, #4 366.if \type == 444 367 xtn v20.8b, v20.8h 368 xtn2 v20.16b, v21.8h 369 xtn v21.8b, v22.8h 370 xtn2 v21.16b, v23.8h 371 sub v20.16b, v1.16b, v20.16b 372 sub v21.16b, v1.16b, v21.16b 373 st1 {v20.16b}, [x6], #16 374 st1 {v21.16b}, [x10], #16 375.elseif \type == 422 376 addp v20.8h, v20.8h, v21.8h 377 addp v21.8h, v22.8h, v23.8h 378 xtn v20.8b, v20.8h 379 xtn v21.8b, v21.8h 380 uhsub v20.8b, v3.8b, v20.8b 381 uhsub v21.8b, v3.8b, v21.8b 382 st1 {v20.8b}, [x6], #8 383 st1 {v21.8b}, [x10], #8 384.elseif \type == 420 385 add v20.8h, v20.8h, v22.8h 386 add v21.8h, v21.8h, v23.8h 387 addp v20.8h, v20.8h, v21.8h 388 sub v20.8h, v3.8h, v20.8h 389 rshrn v20.8b, v20.8h, #2 390 st1 {v20.8b}, [x6], #8 391.endif 392 st1 {v24.8b, v25.8b}, [x0], #16 393 st1 {v26.8b, v27.8b}, [x12], #16 394 b.gt 16b 395 subs w5, w5, #2 396 add x2, x2, w4, uxtw #1 397 add x3, x3, w4, uxtw #1 398 add x7, x7, w4, uxtw #1 399 add x9, x9, w4, uxtw #1 400.if \type == 444 401 add x6, x6, w4, uxtw 402 add x10, x10, w4, uxtw 403.elseif \type == 422 404 add x6, x6, x11, lsr #1 405 add x10, x10, x11, lsr #1 406.endif 407 add x0, x0, x1 408 add x12, x12, x1 409 b.gt 161b 410 ret 411L(w_mask_\type\()_tbl): 412 .hword L(w_mask_\type\()_tbl) - 1280b 413 .hword L(w_mask_\type\()_tbl) - 640b 414 .hword L(w_mask_\type\()_tbl) - 320b 415 .hword L(w_mask_\type\()_tbl) - 160b 416 .hword L(w_mask_\type\()_tbl) - 8b 
417 .hword L(w_mask_\type\()_tbl) - 4b 418endfunc 419.endm 420 421w_mask_fn 444 422w_mask_fn 422 423w_mask_fn 420 424 425 426function blend_8bpc_neon, export=1 427 adr x6, L(blend_tbl) 428 clz w3, w3 429 sub w3, w3, #26 430 ldrh w3, [x6, x3, lsl #1] 431 sub x6, x6, w3, uxtw 432 movi v4.16b, #64 433 add x8, x0, x1 434 lsl x1, x1, #1 435 br x6 4364: 437 ld1 {v2.8b}, [x5], #8 438 ld1 {v1.d}[0], [x2], #8 439 ld1 {v0.s}[0], [x0] 440 subs w4, w4, #2 441 ld1 {v0.s}[1], [x8] 442 sub v3.8b, v4.8b, v2.8b 443 umull v5.8h, v1.8b, v2.8b 444 umlal v5.8h, v0.8b, v3.8b 445 rshrn v6.8b, v5.8h, #6 446 st1 {v6.s}[0], [x0], x1 447 st1 {v6.s}[1], [x8], x1 448 b.gt 4b 449 ret 4508: 451 ld1 {v2.16b}, [x5], #16 452 ld1 {v1.16b}, [x2], #16 453 ld1 {v0.d}[0], [x0] 454 ld1 {v0.d}[1], [x8] 455 sub v3.16b, v4.16b, v2.16b 456 subs w4, w4, #2 457 umull v5.8h, v1.8b, v2.8b 458 umlal v5.8h, v0.8b, v3.8b 459 umull2 v6.8h, v1.16b, v2.16b 460 umlal2 v6.8h, v0.16b, v3.16b 461 rshrn v7.8b, v5.8h, #6 462 rshrn2 v7.16b, v6.8h, #6 463 st1 {v7.d}[0], [x0], x1 464 st1 {v7.d}[1], [x8], x1 465 b.gt 8b 466 ret 46716: 468 ld1 {v1.16b, v2.16b}, [x5], #32 469 ld1 {v5.16b, v6.16b}, [x2], #32 470 ld1 {v0.16b}, [x0] 471 subs w4, w4, #2 472 sub v7.16b, v4.16b, v1.16b 473 sub v20.16b, v4.16b, v2.16b 474 ld1 {v3.16b}, [x8] 475 umull v16.8h, v5.8b, v1.8b 476 umlal v16.8h, v0.8b, v7.8b 477 umull2 v17.8h, v5.16b, v1.16b 478 umlal2 v17.8h, v0.16b, v7.16b 479 umull v21.8h, v6.8b, v2.8b 480 umlal v21.8h, v3.8b, v20.8b 481 umull2 v22.8h, v6.16b, v2.16b 482 umlal2 v22.8h, v3.16b, v20.16b 483 rshrn v18.8b, v16.8h, #6 484 rshrn2 v18.16b, v17.8h, #6 485 rshrn v19.8b, v21.8h, #6 486 rshrn2 v19.16b, v22.8h, #6 487 st1 {v18.16b}, [x0], x1 488 st1 {v19.16b}, [x8], x1 489 b.gt 16b 490 ret 49132: 492 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], #64 493 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64 494 ld1 {v20.16b, v21.16b}, [x0] 495 subs w4, w4, #2 496 ld1 {v22.16b, v23.16b}, [x8] 497 sub v5.16b, v4.16b, v0.16b 498 sub v6.16b, 
v4.16b, v1.16b 499 sub v30.16b, v4.16b, v2.16b 500 sub v31.16b, v4.16b, v3.16b 501 umull v24.8h, v16.8b, v0.8b 502 umlal v24.8h, v20.8b, v5.8b 503 umull2 v26.8h, v16.16b, v0.16b 504 umlal2 v26.8h, v20.16b, v5.16b 505 umull v28.8h, v17.8b, v1.8b 506 umlal v28.8h, v21.8b, v6.8b 507 umull2 v7.8h, v17.16b, v1.16b 508 umlal2 v7.8h, v21.16b, v6.16b 509 umull v27.8h, v18.8b, v2.8b 510 umlal v27.8h, v22.8b, v30.8b 511 umull2 v1.8h, v18.16b, v2.16b 512 umlal2 v1.8h, v22.16b, v30.16b 513 umull v29.8h, v19.8b, v3.8b 514 umlal v29.8h, v23.8b, v31.8b 515 umull2 v21.8h, v19.16b, v3.16b 516 umlal2 v21.8h, v23.16b, v31.16b 517 rshrn v24.8b, v24.8h, #6 518 rshrn2 v24.16b, v26.8h, #6 519 rshrn v25.8b, v28.8h, #6 520 rshrn2 v25.16b, v7.8h, #6 521 rshrn v27.8b, v27.8h, #6 522 rshrn2 v27.16b, v1.8h, #6 523 rshrn v28.8b, v29.8h, #6 524 rshrn2 v28.16b, v21.8h, #6 525 st1 {v24.16b, v25.16b}, [x0], x1 526 st1 {v27.16b, v28.16b}, [x8], x1 527 b.gt 32b 528 ret 529L(blend_tbl): 530 .hword L(blend_tbl) - 32b 531 .hword L(blend_tbl) - 16b 532 .hword L(blend_tbl) - 8b 533 .hword L(blend_tbl) - 4b 534endfunc 535 536function blend_h_8bpc_neon, export=1 537 adr x6, L(blend_h_tbl) 538 movrel x5, X(obmc_masks) 539 add x5, x5, w4, uxtw 540 sub w4, w4, w4, lsr #2 541 clz w7, w3 542 movi v4.16b, #64 543 add x8, x0, x1 544 lsl x1, x1, #1 545 sub w7, w7, #24 546 ldrh w7, [x6, x7, lsl #1] 547 sub x6, x6, w7, uxtw 548 br x6 5492: 550 ld1 {v0.h}[0], [x5], #2 551 ld1 {v1.s}[0], [x2], #4 552 subs w4, w4, #2 553 ld1 {v2.h}[0], [x0] 554 zip1 v0.8b, v0.8b, v0.8b 555 sub v3.8b, v4.8b, v0.8b 556 ld1 {v2.h}[1], [x8] 557 umull v5.8h, v1.8b, v0.8b 558 umlal v5.8h, v2.8b, v3.8b 559 rshrn v5.8b, v5.8h, #6 560 st1 {v5.h}[0], [x0], x1 561 st1 {v5.h}[1], [x8], x1 562 b.gt 2b 563 ret 5644: 565 ld2r {v0.8b, v1.8b}, [x5], #2 566 ld1 {v2.8b}, [x2], #8 567 subs w4, w4, #2 568 ext v0.8b, v0.8b, v1.8b, #4 569 ld1 {v3.s}[0], [x0] 570 sub v5.8b, v4.8b, v0.8b 571 ld1 {v3.s}[1], [x8] 572 umull v6.8h, v2.8b, v0.8b 573 umlal v6.8h, 
v3.8b, v5.8b 574 rshrn v6.8b, v6.8h, #6 575 st1 {v6.s}[0], [x0], x1 576 st1 {v6.s}[1], [x8], x1 577 b.gt 4b 578 ret 5798: 580 ld2r {v0.16b, v1.16b}, [x5], #2 581 ld1 {v2.16b}, [x2], #16 582 ld1 {v3.d}[0], [x0] 583 ext v0.16b, v0.16b, v1.16b, #8 584 sub v5.16b, v4.16b, v0.16b 585 ld1 {v3.d}[1], [x8] 586 subs w4, w4, #2 587 umull v6.8h, v0.8b, v2.8b 588 umlal v6.8h, v3.8b, v5.8b 589 umull2 v7.8h, v0.16b, v2.16b 590 umlal2 v7.8h, v3.16b, v5.16b 591 rshrn v16.8b, v6.8h, #6 592 rshrn2 v16.16b, v7.8h, #6 593 st1 {v16.d}[0], [x0], x1 594 st1 {v16.d}[1], [x8], x1 595 b.gt 8b 596 ret 59716: 598 ld2r {v0.16b, v1.16b}, [x5], #2 599 ld1 {v2.16b, v3.16b}, [x2], #32 600 ld1 {v5.16b}, [x0] 601 sub v7.16b, v4.16b, v0.16b 602 sub v16.16b, v4.16b, v1.16b 603 ld1 {v6.16b}, [x8] 604 subs w4, w4, #2 605 umull v17.8h, v0.8b, v2.8b 606 umlal v17.8h, v5.8b, v7.8b 607 umull2 v18.8h, v0.16b, v2.16b 608 umlal2 v18.8h, v5.16b, v7.16b 609 umull v19.8h, v1.8b, v3.8b 610 umlal v19.8h, v6.8b, v16.8b 611 umull2 v20.8h, v1.16b, v3.16b 612 umlal2 v20.8h, v6.16b, v16.16b 613 rshrn v21.8b, v17.8h, #6 614 rshrn2 v21.16b, v18.8h, #6 615 rshrn v22.8b, v19.8h, #6 616 rshrn2 v22.16b, v20.8h, #6 617 st1 {v21.16b}, [x0], x1 618 st1 {v22.16b}, [x8], x1 619 b.gt 16b 620 ret 6211280: 622640: 623320: 624 sub x1, x1, w3, uxtw 625 add x7, x2, w3, uxtw 626321: 627 ld2r {v0.16b, v1.16b}, [x5], #2 628 mov w6, w3 629 sub v20.16b, v4.16b, v0.16b 630 sub v21.16b, v4.16b, v1.16b 63132: 632 ld1 {v16.16b, v17.16b}, [x2], #32 633 ld1 {v2.16b, v3.16b}, [x0] 634 subs w6, w6, #32 635 umull v23.8h, v0.8b, v16.8b 636 umlal v23.8h, v2.8b, v20.8b 637 ld1 {v18.16b, v19.16b}, [x7], #32 638 umull2 v27.8h, v0.16b, v16.16b 639 umlal2 v27.8h, v2.16b, v20.16b 640 ld1 {v6.16b, v7.16b}, [x8] 641 umull v24.8h, v0.8b, v17.8b 642 umlal v24.8h, v3.8b, v20.8b 643 umull2 v28.8h, v0.16b, v17.16b 644 umlal2 v28.8h, v3.16b, v20.16b 645 umull v25.8h, v1.8b, v18.8b 646 umlal v25.8h, v6.8b, v21.8b 647 umull2 v5.8h, v1.16b, v18.16b 648 umlal2 v5.8h, 
v6.16b, v21.16b 649 rshrn v29.8b, v23.8h, #6 650 rshrn2 v29.16b, v27.8h, #6 651 umull v26.8h, v1.8b, v19.8b 652 umlal v26.8h, v7.8b, v21.8b 653 umull2 v31.8h, v1.16b, v19.16b 654 umlal2 v31.8h, v7.16b, v21.16b 655 rshrn v30.8b, v24.8h, #6 656 rshrn2 v30.16b, v28.8h, #6 657 rshrn v23.8b, v25.8h, #6 658 rshrn2 v23.16b, v5.8h, #6 659 rshrn v24.8b, v26.8h, #6 660 st1 {v29.16b, v30.16b}, [x0], #32 661 rshrn2 v24.16b, v31.8h, #6 662 st1 {v23.16b, v24.16b}, [x8], #32 663 b.gt 32b 664 subs w4, w4, #2 665 add x0, x0, x1 666 add x8, x8, x1 667 add x2, x2, w3, uxtw 668 add x7, x7, w3, uxtw 669 b.gt 321b 670 ret 671L(blend_h_tbl): 672 .hword L(blend_h_tbl) - 1280b 673 .hword L(blend_h_tbl) - 640b 674 .hword L(blend_h_tbl) - 320b 675 .hword L(blend_h_tbl) - 16b 676 .hword L(blend_h_tbl) - 8b 677 .hword L(blend_h_tbl) - 4b 678 .hword L(blend_h_tbl) - 2b 679endfunc 680 681function blend_v_8bpc_neon, export=1 682 adr x6, L(blend_v_tbl) 683 movrel x5, X(obmc_masks) 684 add x5, x5, w3, uxtw 685 clz w3, w3 686 movi v4.16b, #64 687 add x8, x0, x1 688 lsl x1, x1, #1 689 sub w3, w3, #26 690 ldrh w3, [x6, x3, lsl #1] 691 sub x6, x6, w3, uxtw 692 br x6 69320: 694 ld1r {v0.8b}, [x5] 695 sub v1.8b, v4.8b, v0.8b 6962: 697 ld1 {v2.h}[0], [x2], #2 698 ld1 {v3.b}[0], [x0] 699 subs w4, w4, #2 700 ld1 {v2.b}[1], [x2] 701 ld1 {v3.b}[1], [x8] 702 umull v5.8h, v2.8b, v0.8b 703 umlal v5.8h, v3.8b, v1.8b 704 rshrn v5.8b, v5.8h, #6 705 add x2, x2, #2 706 st1 {v5.b}[0], [x0], x1 707 st1 {v5.b}[1], [x8], x1 708 b.gt 2b 709 ret 71040: 711 ld1r {v0.2s}, [x5] 712 sub x1, x1, #2 713 sub v1.8b, v4.8b, v0.8b 7144: 715 ld1 {v2.8b}, [x2], #8 716 ld1 {v3.s}[0], [x0] 717 ld1 {v3.s}[1], [x8] 718 subs w4, w4, #2 719 umull v5.8h, v2.8b, v0.8b 720 umlal v5.8h, v3.8b, v1.8b 721 rshrn v5.8b, v5.8h, #6 722 st1 {v5.h}[0], [x0], #2 723 st1 {v5.h}[2], [x8], #2 724 st1 {v5.b}[2], [x0], x1 725 st1 {v5.b}[6], [x8], x1 726 b.gt 4b 727 ret 72880: 729 ld1r {v0.2d}, [x5] 730 sub x1, x1, #4 731 sub v1.16b, v4.16b, v0.16b 7328: 733 
ld1 {v2.16b}, [x2], #16 734 ld1 {v3.d}[0], [x0] 735 ld1 {v3.d}[1], [x8] 736 subs w4, w4, #2 737 umull v5.8h, v0.8b, v2.8b 738 umlal v5.8h, v3.8b, v1.8b 739 umull2 v6.8h, v0.16b, v2.16b 740 umlal2 v6.8h, v3.16b, v1.16b 741 rshrn v7.8b, v5.8h, #6 742 rshrn2 v7.16b, v6.8h, #6 743 st1 {v7.s}[0], [x0], #4 744 st1 {v7.s}[2], [x8], #4 745 st1 {v7.h}[2], [x0], x1 746 st1 {v7.h}[6], [x8], x1 747 b.gt 8b 748 ret 749160: 750 ld1 {v0.16b}, [x5] 751 sub x1, x1, #8 752 sub v2.16b, v4.16b, v0.16b 75316: 754 ld1 {v5.16b, v6.16b}, [x2], #32 755 ld1 {v7.16b}, [x0] 756 subs w4, w4, #2 757 ld1 {v16.16b}, [x8] 758 umull v17.8h, v5.8b, v0.8b 759 umlal v17.8h, v7.8b, v2.8b 760 umull2 v18.8h, v5.16b, v0.16b 761 umlal2 v18.8h, v7.16b, v2.16b 762 umull v20.8h, v6.8b, v0.8b 763 umlal v20.8h, v16.8b, v2.8b 764 umull2 v21.8h, v6.16b, v0.16b 765 umlal2 v21.8h, v16.16b, v2.16b 766 rshrn v19.8b, v17.8h, #6 767 rshrn2 v19.16b, v18.8h, #6 768 rshrn v22.8b, v20.8h, #6 769 rshrn2 v22.16b, v21.8h, #6 770 st1 {v19.8b}, [x0], #8 771 st1 {v22.8b}, [x8], #8 772 st1 {v19.s}[2], [x0], x1 773 st1 {v22.s}[2], [x8], x1 774 b.gt 16b 775 ret 776320: 777 ld1 {v0.16b, v1.16b}, [x5] 778 sub x1, x1, #16 779 sub v2.16b, v4.16b, v0.16b 780 sub v3.8b, v4.8b, v1.8b 78132: 782 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64 783 ld1 {v5.16b, v6.16b}, [x0] 784 subs w4, w4, #2 785 ld1 {v20.16b, v21.16b}, [x8] 786 umull v22.8h, v16.8b, v0.8b 787 umlal v22.8h, v5.8b, v2.8b 788 umull2 v23.8h, v16.16b, v0.16b 789 umlal2 v23.8h, v5.16b, v2.16b 790 umull v28.8h, v17.8b, v1.8b 791 umlal v28.8h, v6.8b, v3.8b 792 umull v30.8h, v18.8b, v0.8b 793 umlal v30.8h, v20.8b, v2.8b 794 umull2 v31.8h, v18.16b, v0.16b 795 umlal2 v31.8h, v20.16b, v2.16b 796 umull v25.8h, v19.8b, v1.8b 797 umlal v25.8h, v21.8b, v3.8b 798 rshrn v24.8b, v22.8h, #6 799 rshrn2 v24.16b, v23.8h, #6 800 rshrn v28.8b, v28.8h, #6 801 rshrn v30.8b, v30.8h, #6 802 rshrn2 v30.16b, v31.8h, #6 803 rshrn v27.8b, v25.8h, #6 804 st1 {v24.16b}, [x0], #16 805 st1 {v30.16b}, 
[x8], #16 806 st1 {v28.8b}, [x0], x1 807 st1 {v27.8b}, [x8], x1 808 b.gt 32b 809 ret 810L(blend_v_tbl): 811 .hword L(blend_v_tbl) - 320b 812 .hword L(blend_v_tbl) - 160b 813 .hword L(blend_v_tbl) - 80b 814 .hword L(blend_v_tbl) - 40b 815 .hword L(blend_v_tbl) - 20b 816endfunc 817 818 819// This has got the same signature as the put_8tap functions, 820// and assumes that x8 is set to (clz(w)-24). 821function put_neon 822 adr x9, L(put_tbl) 823 ldrh w8, [x9, x8, lsl #1] 824 sub x9, x9, w8, uxtw 825 br x9 826 8272: 828 ld1 {v0.h}[0], [x2], x3 829 ld1 {v1.h}[0], [x2], x3 830 subs w5, w5, #2 831 st1 {v0.h}[0], [x0], x1 832 st1 {v1.h}[0], [x0], x1 833 b.gt 2b 834 ret 8354: 836 ld1 {v0.s}[0], [x2], x3 837 ld1 {v1.s}[0], [x2], x3 838 subs w5, w5, #2 839 st1 {v0.s}[0], [x0], x1 840 st1 {v1.s}[0], [x0], x1 841 b.gt 4b 842 ret 8438: 844 ld1 {v0.8b}, [x2], x3 845 ld1 {v1.8b}, [x2], x3 846 subs w5, w5, #2 847 st1 {v0.8b}, [x0], x1 848 st1 {v1.8b}, [x0], x1 849 b.gt 8b 850 ret 851160: 852 add x8, x0, x1 853 lsl x1, x1, #1 854 add x9, x2, x3 855 lsl x3, x3, #1 85616: 857 ld1 {v0.16b}, [x2], x3 858 ld1 {v1.16b}, [x9], x3 859 subs w5, w5, #2 860 st1 {v0.16b}, [x0], x1 861 st1 {v1.16b}, [x8], x1 862 b.gt 16b 863 ret 86432: 865 ldp x6, x7, [x2] 866 ldp x8, x9, [x2, #16] 867 stp x6, x7, [x0] 868 subs w5, w5, #1 869 stp x8, x9, [x0, #16] 870 add x2, x2, x3 871 add x0, x0, x1 872 b.gt 32b 873 ret 87464: 875 ldp x6, x7, [x2] 876 ldp x8, x9, [x2, #16] 877 stp x6, x7, [x0] 878 ldp x10, x11, [x2, #32] 879 stp x8, x9, [x0, #16] 880 subs w5, w5, #1 881 ldp x12, x13, [x2, #48] 882 stp x10, x11, [x0, #32] 883 stp x12, x13, [x0, #48] 884 add x2, x2, x3 885 add x0, x0, x1 886 b.gt 64b 887 ret 888128: 889 ldp q0, q1, [x2] 890 ldp q2, q3, [x2, #32] 891 stp q0, q1, [x0] 892 ldp q4, q5, [x2, #64] 893 stp q2, q3, [x0, #32] 894 ldp q6, q7, [x2, #96] 895 subs w5, w5, #1 896 stp q4, q5, [x0, #64] 897 stp q6, q7, [x0, #96] 898 add x2, x2, x3 899 add x0, x0, x1 900 b.gt 128b 901 ret 902 903L(put_tbl): 904 
.hword L(put_tbl) - 128b 905 .hword L(put_tbl) - 64b 906 .hword L(put_tbl) - 32b 907 .hword L(put_tbl) - 160b 908 .hword L(put_tbl) - 8b 909 .hword L(put_tbl) - 4b 910 .hword L(put_tbl) - 2b 911endfunc 912 913 914// This has got the same signature as the prep_8tap functions, 915// and assumes that x8 is set to (clz(w)-24), and x7 to w*2. 916function prep_neon 917 adr x9, L(prep_tbl) 918 ldrh w8, [x9, x8, lsl #1] 919 sub x9, x9, w8, uxtw 920 br x9 921 9224: 923 ld1 {v0.s}[0], [x1], x2 924 ld1 {v1.s}[0], [x1], x2 925 subs w4, w4, #2 926 ushll v0.8h, v0.8b, #4 927 ushll v1.8h, v1.8b, #4 928 st1 {v0.4h, v1.4h}, [x0], #16 929 b.gt 4b 930 ret 9318: 932 ld1 {v0.8b}, [x1], x2 933 ld1 {v1.8b}, [x1], x2 934 subs w4, w4, #2 935 ushll v0.8h, v0.8b, #4 936 ushll v1.8h, v1.8b, #4 937 st1 {v0.8h, v1.8h}, [x0], #32 938 b.gt 8b 939 ret 940160: 941 add x9, x1, x2 942 lsl x2, x2, #1 94316: 944 ld1 {v0.16b}, [x1], x2 945 ld1 {v1.16b}, [x9], x2 946 subs w4, w4, #2 947 ushll v4.8h, v0.8b, #4 948 ushll2 v5.8h, v0.16b, #4 949 ushll v6.8h, v1.8b, #4 950 ushll2 v7.8h, v1.16b, #4 951 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 952 b.gt 16b 953 ret 954320: 955 add x8, x0, w3, uxtw 95632: 957 ld1 {v0.16b, v1.16b}, [x1], x2 958 subs w4, w4, #2 959 ushll v4.8h, v0.8b, #4 960 ushll2 v5.8h, v0.16b, #4 961 ld1 {v2.16b, v3.16b}, [x1], x2 962 ushll v6.8h, v1.8b, #4 963 ushll2 v7.8h, v1.16b, #4 964 ushll v16.8h, v2.8b, #4 965 st1 {v4.8h, v5.8h}, [x0], x7 966 ushll2 v17.8h, v2.16b, #4 967 st1 {v6.8h, v7.8h}, [x8], x7 968 ushll v18.8h, v3.8b, #4 969 st1 {v16.8h, v17.8h}, [x0], x7 970 ushll2 v19.8h, v3.16b, #4 971 st1 {v18.8h, v19.8h}, [x8], x7 972 b.gt 32b 973 ret 974640: 975 add x8, x0, #32 976 mov x6, #64 97764: 978 ldp q0, q1, [x1] 979 subs w4, w4, #1 980 ushll v4.8h, v0.8b, #4 981 ushll2 v5.8h, v0.16b, #4 982 ldp q2, q3, [x1, #32] 983 ushll v6.8h, v1.8b, #4 984 ushll2 v7.8h, v1.16b, #4 985 add x1, x1, x2 986 ushll v16.8h, v2.8b, #4 987 st1 {v4.8h, v5.8h}, [x0], x6 988 ushll2 v17.8h, v2.16b, #4 989 
ushll v18.8h, v3.8b, #4 990 st1 {v6.8h, v7.8h}, [x8], x6 991 ushll2 v19.8h, v3.16b, #4 992 st1 {v16.8h, v17.8h}, [x0], x6 993 st1 {v18.8h, v19.8h}, [x8], x6 994 b.gt 64b 995 ret 9961280: 997 add x8, x0, #64 998 mov x6, #128 999128: 1000 ldp q0, q1, [x1] 1001 ldp q2, q3, [x1, #32] 1002 ushll v16.8h, v0.8b, #4 1003 ushll2 v17.8h, v0.16b, #4 1004 ushll v18.8h, v1.8b, #4 1005 ushll2 v19.8h, v1.16b, #4 1006 ushll v20.8h, v2.8b, #4 1007 ushll2 v21.8h, v2.16b, #4 1008 ldp q4, q5, [x1, #64] 1009 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x6 1010 ushll v22.8h, v3.8b, #4 1011 ushll2 v23.8h, v3.16b, #4 1012 ushll v24.8h, v4.8b, #4 1013 ushll2 v25.8h, v4.16b, #4 1014 ushll v26.8h, v5.8b, #4 1015 ushll2 v27.8h, v5.16b, #4 1016 ldp q6, q7, [x1, #96] 1017 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x8], x6 1018 ushll v28.8h, v6.8b, #4 1019 ushll2 v29.8h, v6.16b, #4 1020 ushll v30.8h, v7.8b, #4 1021 ushll2 v31.8h, v7.16b, #4 1022 subs w4, w4, #1 1023 add x1, x1, x2 1024 st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x6 1025 st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x6 1026 b.gt 128b 1027 ret 1028 1029L(prep_tbl): 1030 .hword L(prep_tbl) - 1280b 1031 .hword L(prep_tbl) - 640b 1032 .hword L(prep_tbl) - 320b 1033 .hword L(prep_tbl) - 160b 1034 .hword L(prep_tbl) - 8b 1035 .hword L(prep_tbl) - 4b 1036endfunc 1037 1038 1039.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 1040 ld1 {\d0\wd}[0], [\s0], \strd 1041 ld1 {\d1\wd}[0], [\s1], \strd 1042.ifnb \d2 1043 ld1 {\d2\wd}[0], [\s0], \strd 1044 ld1 {\d3\wd}[0], [\s1], \strd 1045.endif 1046.ifnb \d4 1047 ld1 {\d4\wd}[0], [\s0], \strd 1048.endif 1049.ifnb \d5 1050 ld1 {\d5\wd}[0], [\s1], \strd 1051.endif 1052.ifnb \d6 1053 ld1 {\d6\wd}[0], [\s0], \strd 1054.endif 1055.endm 1056.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 1057 ld1 {\d0\wd}, [\s0], \strd 1058 ld1 {\d1\wd}, [\s1], \strd 1059.ifnb \d2 1060 ld1 {\d2\wd}, [\s0], \strd 1061 ld1 {\d3\wd}, [\s1], \strd 1062.endif 1063.ifnb \d4 1064 ld1 {\d4\wd}, [\s0], 
\strd 1065.endif 1066.ifnb \d5 1067 ld1 {\d5\wd}, [\s1], \strd 1068.endif 1069.ifnb \d6 1070 ld1 {\d6\wd}, [\s0], \strd 1071.endif 1072.endm 1073.macro load_h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 1074 load_slice \s0, \s1, \strd, .h, \d0, \d1, \d2, \d3, \d4, \d5, \d6 1075.endm 1076.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 1077 load_slice \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6 1078.endm 1079.macro load_8b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 1080 load_reg \s0, \s1, \strd, .8b, \d0, \d1, \d2, \d3, \d4, \d5, \d6 1081.endm 1082.macro load_16b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 1083 load_reg \s0, \s1, \strd, .16b, \d0, \d1, \d2, \d3, \d4, \d5, \d6 1084.endm 1085.macro interleave_1 wd, r0, r1, r2, r3, r4 1086 trn1 \r0\wd, \r0\wd, \r1\wd 1087 trn1 \r1\wd, \r1\wd, \r2\wd 1088.ifnb \r3 1089 trn1 \r2\wd, \r2\wd, \r3\wd 1090 trn1 \r3\wd, \r3\wd, \r4\wd 1091.endif 1092.endm 1093.macro interleave_1_h r0, r1, r2, r3, r4 1094 interleave_1 .4h, \r0, \r1, \r2, \r3, \r4 1095.endm 1096.macro interleave_1_s r0, r1, r2, r3, r4 1097 interleave_1 .2s, \r0, \r1, \r2, \r3, \r4 1098.endm 1099.macro interleave_2 wd, r0, r1, r2, r3, r4, r5 1100 trn1 \r0\wd, \r0\wd, \r2\wd 1101 trn1 \r1\wd, \r1\wd, \r3\wd 1102 trn1 \r2\wd, \r2\wd, \r4\wd 1103 trn1 \r3\wd, \r3\wd, \r5\wd 1104.endm 1105.macro interleave_2_s r0, r1, r2, r3, r4, r5 1106 interleave_2 .2s, \r0, \r1, \r2, \r3, \r4, \r5 1107.endm 1108.macro uxtl_b r0, r1, r2, r3, r4, r5, r6 1109 uxtl \r0\().8h, \r0\().8b 1110 uxtl \r1\().8h, \r1\().8b 1111.ifnb \r2 1112 uxtl \r2\().8h, \r2\().8b 1113 uxtl \r3\().8h, \r3\().8b 1114.endif 1115.ifnb \r4 1116 uxtl \r4\().8h, \r4\().8b 1117.endif 1118.ifnb \r5 1119 uxtl \r5\().8h, \r5\().8b 1120.endif 1121.ifnb \r6 1122 uxtl \r6\().8h, \r6\().8b 1123.endif 1124.endm 1125.macro mul_mla_4 d, s0, s1, s2, s3, wd 1126 mul \d\wd, \s0\wd, v0.h[0] 1127 mla \d\wd, \s1\wd, v0.h[1] 1128 mla \d\wd, \s2\wd, v0.h[2] 1129 mla \d\wd, \s3\wd, v0.h[3] 1130.endm 1131// Interleaving 
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.

// 8-tap FIR of two overlapping 8-register windows, coefficients in the
// halfword lanes of v0:
//   \d0 = sum(\s0..\s7 * v0.h[0..7]),  \d1 = sum(\s1..\s8 * v0.h[0..7])
// i.e. the second output window is offset by 1 source register.
.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
        mul             \d0\().8h, \s0\().8h, v0.h[0]
        mla             \d0\().8h, \s1\().8h, v0.h[1]
        mla             \d0\().8h, \s2\().8h, v0.h[2]
        mla             \d0\().8h, \s3\().8h, v0.h[3]
        mla             \d0\().8h, \s4\().8h, v0.h[4]
        mla             \d0\().8h, \s5\().8h, v0.h[5]
        mla             \d0\().8h, \s6\().8h, v0.h[6]
        mla             \d0\().8h, \s7\().8h, v0.h[7]
        mul             \d1\().8h, \s1\().8h, v0.h[0]
        mla             \d1\().8h, \s2\().8h, v0.h[1]
        mla             \d1\().8h, \s3\().8h, v0.h[2]
        mla             \d1\().8h, \s4\().8h, v0.h[3]
        mla             \d1\().8h, \s5\().8h, v0.h[4]
        mla             \d1\().8h, \s6\().8h, v0.h[5]
        mla             \d1\().8h, \s7\().8h, v0.h[6]
        mla             \d1\().8h, \s8\().8h, v0.h[7]
.endm

// As mul_mla_8_1, but the second window is offset by 2 registers:
//   \d1 = sum(\s2..\s9 * v0.h[0..7])
.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
        mul             \d0\().8h, \s0\().8h, v0.h[0]
        mla             \d0\().8h, \s1\().8h, v0.h[1]
        mla             \d0\().8h, \s2\().8h, v0.h[2]
        mla             \d0\().8h, \s3\().8h, v0.h[3]
        mla             \d0\().8h, \s4\().8h, v0.h[4]
        mla             \d0\().8h, \s5\().8h, v0.h[5]
        mla             \d0\().8h, \s6\().8h, v0.h[6]
        mla             \d0\().8h, \s7\().8h, v0.h[7]
        mul             \d1\().8h, \s2\().8h, v0.h[0]
        mla             \d1\().8h, \s3\().8h, v0.h[1]
        mla             \d1\().8h, \s4\().8h, v0.h[2]
        mla             \d1\().8h, \s5\().8h, v0.h[3]
        mla             \d1\().8h, \s6\().8h, v0.h[4]
        mla             \d1\().8h, \s7\().8h, v0.h[5]
        mla             \d1\().8h, \s8\().8h, v0.h[6]
        mla             \d1\().8h, \s9\().8h, v0.h[7]
.endm

// As mul_mla_8_1, but the second window is offset by 4 registers:
//   \d1 = sum(\s4..\s11 * v0.h[0..7])
.macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11
        mul             \d0\().8h, \s0\().8h, v0.h[0]
        mla             \d0\().8h, \s1\().8h, v0.h[1]
        mla             \d0\().8h, \s2\().8h, v0.h[2]
        mla             \d0\().8h, \s3\().8h, v0.h[3]
        mla             \d0\().8h, \s4\().8h, v0.h[4]
        mla             \d0\().8h, \s5\().8h, v0.h[5]
        mla             \d0\().8h, \s6\().8h, v0.h[6]
        mla             \d0\().8h, \s7\().8h, v0.h[7]
        mul             \d1\().8h, \s4\().8h, v0.h[0]
        mla             \d1\().8h, \s5\().8h, v0.h[1]
        mla             \d1\().8h, \s6\().8h, v0.h[2]
        mla             \d1\().8h, \s7\().8h, v0.h[3]
        mla             \d1\().8h, \s8\().8h, v0.h[4]
        mla             \d1\().8h, \s9\().8h, v0.h[5]
        mla             \d1\().8h, \s10\().8h, v0.h[6]
        mla             \d1\().8h, \s11\().8h, v0.h[7]
.endm

// In-place saturating rounding narrowing right shift (.8h -> .8b) of
// 1, 2 or 4 registers. \r1 and \r2 are optional; if \r2 is given, \r3
// must be given as well (they are handled as a pair).
.macro sqrshrun_b shift, r0, r1, r2, r3
        sqrshrun        \r0\().8b, \r0\().8h, #\shift
.ifnb \r1
        sqrshrun        \r1\().8b, \r1\().8h, #\shift
.endif
.ifnb \r2
        sqrshrun        \r2\().8b, \r2\().8h, #\shift
        sqrshrun        \r3\().8b, \r3\().8h, #\shift
.endif
.endm

// In-place rounding right shift of 1, 2 or 4 .8h registers; optional
// arguments follow the same pairing convention as sqrshrun_b.
.macro srshr_h shift, r0, r1, r2, r3
        srshr           \r0\().8h, \r0\().8h, #\shift
.ifnb \r1
        srshr           \r1\().8h, \r1\().8h, #\shift
.endif
.ifnb \r2
        srshr           \r2\().8h, \r2\().8h, #\shift
        srshr           \r3\().8h, \r3\().8h, #\shift
.endif
.endm

// Store 2 or 4 halfword lanes of \reg to two interleaved destination
// rows (pointers x0 and x8), each pointer advanced by \strd per store.
.macro st_h strd, reg, lanes
        st1             {\reg\().h}[0], [x0], \strd
        st1             {\reg\().h}[1], [x8], \strd
.if \lanes > 2
        st1             {\reg\().h}[2], [x0], \strd
        st1             {\reg\().h}[3], [x8], \strd
.endif
.endm

// Store the two 32-bit lanes of \r0 (and optionally \r1) to the
// interleaved x0/x8 destination rows, advancing by \strd.
.macro st_s strd, r0, r1
        st1             {\r0\().s}[0], [x0], \strd
        st1             {\r0\().s}[1], [x8], \strd
.ifnb \r1
        st1             {\r1\().s}[0], [x0], \strd
        st1             {\r1\().s}[1], [x8], \strd
.endif
.endm

// Store the two 64-bit lanes of \r0 (and optionally \r1) to the
// interleaved x0/x8 destination rows, advancing by \strd.
.macro st_d strd, r0, r1
        st1             {\r0\().d}[0], [x0], \strd
        st1             {\r0\().d}[1], [x8], \strd
.ifnb \r1
        st1             {\r1\().d}[0], [x0], \strd
        st1             {\r1\().d}[1], [x8], \strd
.endif
.endm

// Final shift + store for 4-pixel-wide rows:
//  put:  narrow to 8 bit pixels (sqrshrun #6) and store 32-bit chunks
//  prep: round to the 16-bit intermediate (srshr #2), store 64-bit chunks
.macro shift_store_4 type, strd, r0, r1
.ifc \type, put
        sqrshrun_b      6, \r0, \r1
        st_s            \strd, \r0, \r1
.else
        srshr_h         2, \r0, \r1
        st_d            \strd, \r0, \r1
.endif
.endm

// Store 2, 4 or 8 full registers of element arrangement \wd (.8b/.16b)
// to the interleaved x0/x8 destination rows, advancing by \strd.
// \r2/\r3 and \r4..\r7 are optional groups.
.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
        st1             {\r0\wd}, [x0], \strd
        st1             {\r1\wd}, [x8], \strd
.ifnb \r2
        st1             {\r2\wd}, [x0], \strd
        st1             {\r3\wd}, [x8], \strd
.endif
.ifnb \r4
        st1             {\r4\wd}, [x0], \strd
        st1             {\r5\wd}, [x8], \strd
        st1             {\r6\wd}, [x0], \strd
        st1             {\r7\wd}, [x8], \strd
.endif
.endm

// st_reg specialized for 8-byte registers.
.macro st_8b strd, r0, r1, r2, r3, r4, r5, r6, r7
        st_reg          \strd, .8b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endm

// st_reg specialized for 16-byte registers.
.macro st_16b strd, r0, r1, r2, r3, r4, r5, r6, r7
        st_reg          \strd, .16b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endm

// Final shift + store for 8-pixel-wide rows:
//  put:  narrow each .8h to .8b pixels and store 8 bytes per row
//  prep: round to 16-bit intermediates and store 16 bytes per row
.macro shift_store_8 type, strd, r0, r1, r2, r3
.ifc \type, put
        sqrshrun_b      6, \r0, \r1, \r2, \r3
        st_8b           \strd, \r0, \r1, \r2, \r3
.else
        srshr_h         2, \r0, \r1, \r2, \r3
        st_16b          \strd, \r0, \r1, \r2, \r3
.endif
.endm

// Final shift + store for 16-pixel-wide rows. For put, each pair
// (\r0,\r1) and (\r2,\r3) is narrowed into one 16-byte pixel register;
// for prep, all four .8h registers are stored as intermediates.
.macro shift_store_16 type, strd, r0, r1, r2, r3
.ifc \type, put
        sqrshrun        \r0\().8b,  \r0\().8h, #6
        sqrshrun2       \r0\().16b, \r1\().8h, #6
        sqrshrun        \r2\().8b,  \r2\().8h, #6
        sqrshrun2       \r2\().16b, \r3\().8h, #6
        st_16b          \strd, \r0, \r2
.else
        srshr_h         2, \r0, \r1, \r2, \r3
        st1             {\r0\().8h, \r1\().8h}, [x0], \strd
        st1             {\r2\().8h, \r3\().8h}, [x8], \strd
.endif
.endm

// Emit the exported entry point for one horizontal/vertical filter-type
// combination: x8/x9 carry the packed h/v filter selectors into the
// shared \op\()_8tap_neon implementation.
.macro make_8tap_fn op, type, type_h, type_v
function \op\()_8tap_\type\()_8bpc_neon, export=1
        mov             x8,  \type_h
        mov             x9,  \type_v
        b               \op\()_8tap_neon
endfunc
.endm

// No spaces in these expressions, due to gas-preprocessor.
1293#define REGULAR ((0*15<<7)|3*15) 1294#define SMOOTH ((1*15<<7)|4*15) 1295#define SHARP ((2*15<<7)|3*15) 1296 1297.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv 1298make_8tap_fn \type, regular, REGULAR, REGULAR 1299make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH 1300make_8tap_fn \type, regular_sharp, REGULAR, SHARP 1301make_8tap_fn \type, smooth, SMOOTH, SMOOTH 1302make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR 1303make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP 1304make_8tap_fn \type, sharp, SHARP, SHARP 1305make_8tap_fn \type, sharp_regular, SHARP, REGULAR 1306make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH 1307 1308function \type\()_8tap_neon 1309 mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) 1310 mul \mx, \mx, w10 1311 mul \my, \my, w10 1312 add \mx, \mx, w8 // mx, 8tap_h, 4tap_h 1313 add \my, \my, w9 // my, 8tap_v, 4tap_v 1314.ifc \type, prep 1315 uxtw \d_strd, \w 1316 lsl \d_strd, \d_strd, #1 1317.endif 1318 1319 clz w8, \w 1320 tst \mx, #(0x7f << 14) 1321 sub w8, w8, #24 1322 movrel x10, X(mc_subpel_filters), -8 1323 b.ne L(\type\()_8tap_h) 1324 tst \my, #(0x7f << 14) 1325 b.ne L(\type\()_8tap_v) 1326 b \type\()_neon 1327 1328L(\type\()_8tap_h): 1329 cmp \w, #4 1330 ubfx w9, \mx, #7, #7 1331 and \mx, \mx, #0x7f 1332 b.le 4f 1333 mov \mx, w9 13344: 1335 tst \my, #(0x7f << 14) 1336 add \xmx, x10, \mx, uxtw #3 1337 b.ne L(\type\()_8tap_hv) 1338 1339 adr x9, L(\type\()_8tap_h_tbl) 1340 ldrh w8, [x9, x8, lsl #1] 1341 sub x9, x9, w8, uxtw 1342 br x9 1343 134420: // 2xN h 1345.ifc \type, put 1346 add \xmx, \xmx, #2 1347 ld1 {v0.s}[0], [\xmx] 1348 sub \src, \src, #1 1349 add \ds2, \dst, \d_strd 1350 add \sr2, \src, \s_strd 1351 lsl \d_strd, \d_strd, #1 1352 lsl \s_strd, \s_strd, #1 1353 sxtl v0.8h, v0.8b 13542: 1355 ld1 {v4.8b}, [\src], \s_strd 1356 ld1 {v6.8b}, [\sr2], \s_strd 1357 uxtl v4.8h, v4.8b 1358 uxtl v6.8h, v6.8b 1359 ext v5.16b, v4.16b, v4.16b, #2 1360 ext v7.16b, v6.16b, v6.16b, #2 1361 subs 
\h, \h, #2 1362 trn1 v3.2s, v4.2s, v6.2s 1363 trn2 v6.2s, v4.2s, v6.2s 1364 trn1 v4.2s, v5.2s, v7.2s 1365 trn2 v7.2s, v5.2s, v7.2s 1366 mul v3.4h, v3.4h, v0.h[0] 1367 mla v3.4h, v4.4h, v0.h[1] 1368 mla v3.4h, v6.4h, v0.h[2] 1369 mla v3.4h, v7.4h, v0.h[3] 1370 srshr v3.4h, v3.4h, #2 1371 sqrshrun v3.8b, v3.8h, #4 1372 st1 {v3.h}[0], [\dst], \d_strd 1373 st1 {v3.h}[1], [\ds2], \d_strd 1374 b.gt 2b 1375 ret 1376.endif 1377 137840: // 4xN h 1379 add \xmx, \xmx, #2 1380 ld1 {v0.s}[0], [\xmx] 1381 sub \src, \src, #1 1382 add \ds2, \dst, \d_strd 1383 add \sr2, \src, \s_strd 1384 lsl \d_strd, \d_strd, #1 1385 lsl \s_strd, \s_strd, #1 1386 sxtl v0.8h, v0.8b 13874: 1388 ld1 {v16.8b}, [\src], \s_strd 1389 ld1 {v20.8b}, [\sr2], \s_strd 1390 uxtl v16.8h, v16.8b 1391 uxtl v20.8h, v20.8b 1392 ext v17.16b, v16.16b, v16.16b, #2 1393 ext v18.16b, v16.16b, v16.16b, #4 1394 ext v19.16b, v16.16b, v16.16b, #6 1395 ext v21.16b, v20.16b, v20.16b, #2 1396 ext v22.16b, v20.16b, v20.16b, #4 1397 ext v23.16b, v20.16b, v20.16b, #6 1398 subs \h, \h, #2 1399 mul v16.4h, v16.4h, v0.h[0] 1400 mla v16.4h, v17.4h, v0.h[1] 1401 mla v16.4h, v18.4h, v0.h[2] 1402 mla v16.4h, v19.4h, v0.h[3] 1403 mul v20.4h, v20.4h, v0.h[0] 1404 mla v20.4h, v21.4h, v0.h[1] 1405 mla v20.4h, v22.4h, v0.h[2] 1406 mla v20.4h, v23.4h, v0.h[3] 1407 srshr v16.4h, v16.4h, #2 1408 srshr v20.4h, v20.4h, #2 1409.ifc \type, put 1410 sqrshrun v16.8b, v16.8h, #4 1411 sqrshrun v20.8b, v20.8h, #4 1412 st1 {v16.s}[0], [\dst], \d_strd 1413 st1 {v20.s}[0], [\ds2], \d_strd 1414.else 1415 st1 {v16.4h}, [\dst], \d_strd 1416 st1 {v20.4h}, [\ds2], \d_strd 1417.endif 1418 b.gt 4b 1419 ret 1420 142180: // 8xN h 1422 ld1 {v0.8b}, [\xmx] 1423 sub \src, \src, #3 1424 add \ds2, \dst, \d_strd 1425 add \sr2, \src, \s_strd 1426 lsl \d_strd, \d_strd, #1 1427 lsl \s_strd, \s_strd, #1 1428 sxtl v0.8h, v0.8b 14298: 1430 ld1 {v16.8b, v17.8b}, [\src], \s_strd 1431 ld1 {v20.8b, v21.8b}, [\sr2], \s_strd 1432 uxtl v16.8h, v16.8b 1433 uxtl v17.8h, v17.8b 1434 
uxtl v20.8h, v20.8b 1435 uxtl v21.8h, v21.8b 1436 1437 mul v18.8h, v16.8h, v0.h[0] 1438 mul v22.8h, v20.8h, v0.h[0] 1439.irpc i, 1234567 1440 ext v19.16b, v16.16b, v17.16b, #(2*\i) 1441 ext v23.16b, v20.16b, v21.16b, #(2*\i) 1442 mla v18.8h, v19.8h, v0.h[\i] 1443 mla v22.8h, v23.8h, v0.h[\i] 1444.endr 1445 subs \h, \h, #2 1446 srshr v18.8h, v18.8h, #2 1447 srshr v22.8h, v22.8h, #2 1448.ifc \type, put 1449 sqrshrun v18.8b, v18.8h, #4 1450 sqrshrun v22.8b, v22.8h, #4 1451 st1 {v18.8b}, [\dst], \d_strd 1452 st1 {v22.8b}, [\ds2], \d_strd 1453.else 1454 st1 {v18.8h}, [\dst], \d_strd 1455 st1 {v22.8h}, [\ds2], \d_strd 1456.endif 1457 b.gt 8b 1458 ret 1459160: 1460320: 1461640: 14621280: // 16xN, 32xN, ... h 1463 ld1 {v0.8b}, [\xmx] 1464 sub \src, \src, #3 1465 add \ds2, \dst, \d_strd 1466 add \sr2, \src, \s_strd 1467 lsl \s_strd, \s_strd, #1 1468 sxtl v0.8h, v0.8b 1469 1470 sub \s_strd, \s_strd, \w, uxtw 1471 sub \s_strd, \s_strd, #8 1472.ifc \type, put 1473 lsl \d_strd, \d_strd, #1 1474 sub \d_strd, \d_strd, \w, uxtw 1475.endif 1476161: 1477 ld1 {v16.8b, v17.8b, v18.8b}, [\src], #24 1478 ld1 {v20.8b, v21.8b, v22.8b}, [\sr2], #24 1479 mov \mx, \w 1480 uxtl v16.8h, v16.8b 1481 uxtl v17.8h, v17.8b 1482 uxtl v18.8h, v18.8b 1483 uxtl v20.8h, v20.8b 1484 uxtl v21.8h, v21.8b 1485 uxtl v22.8h, v22.8b 1486 148716: 1488 mul v24.8h, v16.8h, v0.h[0] 1489 mul v25.8h, v17.8h, v0.h[0] 1490 mul v26.8h, v20.8h, v0.h[0] 1491 mul v27.8h, v21.8h, v0.h[0] 1492.irpc i, 1234567 1493 ext v28.16b, v16.16b, v17.16b, #(2*\i) 1494 ext v29.16b, v17.16b, v18.16b, #(2*\i) 1495 ext v30.16b, v20.16b, v21.16b, #(2*\i) 1496 ext v31.16b, v21.16b, v22.16b, #(2*\i) 1497 mla v24.8h, v28.8h, v0.h[\i] 1498 mla v25.8h, v29.8h, v0.h[\i] 1499 mla v26.8h, v30.8h, v0.h[\i] 1500 mla v27.8h, v31.8h, v0.h[\i] 1501.endr 1502 srshr v24.8h, v24.8h, #2 1503 srshr v25.8h, v25.8h, #2 1504 srshr v26.8h, v26.8h, #2 1505 srshr v27.8h, v27.8h, #2 1506 subs \mx, \mx, #16 1507.ifc \type, put 1508 sqrshrun v24.8b, v24.8h, #4 1509 
sqrshrun2 v24.16b, v25.8h, #4 1510 sqrshrun v26.8b, v26.8h, #4 1511 sqrshrun2 v26.16b, v27.8h, #4 1512 st1 {v24.16b}, [\dst], #16 1513 st1 {v26.16b}, [\ds2], #16 1514.else 1515 st1 {v24.8h, v25.8h}, [\dst], #32 1516 st1 {v26.8h, v27.8h}, [\ds2], #32 1517.endif 1518 b.le 9f 1519 1520 mov v16.16b, v18.16b 1521 mov v20.16b, v22.16b 1522 ld1 {v17.8b, v18.8b}, [\src], #16 1523 ld1 {v21.8b, v22.8b}, [\sr2], #16 1524 uxtl v17.8h, v17.8b 1525 uxtl v18.8h, v18.8b 1526 uxtl v21.8h, v21.8b 1527 uxtl v22.8h, v22.8b 1528 b 16b 1529 15309: 1531 add \dst, \dst, \d_strd 1532 add \ds2, \ds2, \d_strd 1533 add \src, \src, \s_strd 1534 add \sr2, \sr2, \s_strd 1535 1536 subs \h, \h, #2 1537 b.gt 161b 1538 ret 1539 1540L(\type\()_8tap_h_tbl): 1541 .hword L(\type\()_8tap_h_tbl) - 1280b 1542 .hword L(\type\()_8tap_h_tbl) - 640b 1543 .hword L(\type\()_8tap_h_tbl) - 320b 1544 .hword L(\type\()_8tap_h_tbl) - 160b 1545 .hword L(\type\()_8tap_h_tbl) - 80b 1546 .hword L(\type\()_8tap_h_tbl) - 40b 1547 .hword L(\type\()_8tap_h_tbl) - 20b 1548 .hword 0 1549 1550 1551L(\type\()_8tap_v): 1552 cmp \h, #4 1553 ubfx w9, \my, #7, #7 1554 and \my, \my, #0x7f 1555 b.le 4f 1556 mov \my, w9 15574: 1558 add \xmy, x10, \my, uxtw #3 1559 1560 adr x9, L(\type\()_8tap_v_tbl) 1561 ldrh w8, [x9, x8, lsl #1] 1562 sub x9, x9, w8, uxtw 1563 br x9 1564 156520: // 2xN v 1566.ifc \type, put 1567 b.gt 28f 1568 1569 cmp \h, #2 1570 add \xmy, \xmy, #2 1571 ld1 {v0.s}[0], [\xmy] 1572 sub \src, \src, \s_strd 1573 add \ds2, \dst, \d_strd 1574 add \sr2, \src, \s_strd 1575 lsl \s_strd, \s_strd, #1 1576 lsl \d_strd, \d_strd, #1 1577 sxtl v0.8h, v0.8b 1578 1579 // 2x2 v 1580 load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 1581 interleave_1_h v1, v2, v3, v4, v5 1582 b.gt 24f 1583 uxtl_b v1, v2, v3, v4 1584 mul_mla_4 v6, v1, v2, v3, v4, .4h 1585 sqrshrun_b 6, v6 1586 st_h \d_strd, v6, 2 1587 ret 1588 158924: // 2x4 v 1590 load_h \sr2, \src, \s_strd, v6, v7 1591 interleave_1_h v5, v6, v7 1592 interleave_2_s v1, v2, v3, v4, v5, v6 
1593 uxtl_b v1, v2, v3, v4 1594 mul_mla_4 v6, v1, v2, v3, v4, .8h 1595 sqrshrun_b 6, v6 1596 st_h \d_strd, v6, 4 1597 ret 1598 159928: // 2x8, 2x16 v 1600 ld1 {v0.8b}, [\xmy] 1601 sub \sr2, \src, \s_strd, lsl #1 1602 add \ds2, \dst, \d_strd 1603 sub \src, \sr2, \s_strd 1604 lsl \d_strd, \d_strd, #1 1605 lsl \s_strd, \s_strd, #1 1606 sxtl v0.8h, v0.8b 1607 1608 load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7 1609 interleave_1_h v1, v2, v3, v4, v5 1610 interleave_1_h v5, v6, v7 1611 interleave_2_s v1, v2, v3, v4, v5, v6 1612 uxtl_b v1, v2, v3, v4 1613216: 1614 subs \h, \h, #8 1615 load_h \sr2, \src, \s_strd, v16, v17, v18, v19 1616 load_h \sr2, \src, \s_strd, v20, v21, v22, v23 1617 interleave_1_h v7, v16, v17, v18, v19 1618 interleave_1_h v19, v20, v21, v22, v23 1619 interleave_2_s v5, v6, v7, v16, v17, v18 1620 interleave_2_s v17, v18, v19, v20, v21, v22 1621 uxtl_b v5, v6, v7, v16 1622 uxtl_b v17, v18, v19, v20 1623 mul_mla_8_4 v30, v31, v1, v2, v3, v4, v5, v6, v7, v16, v17, v18, v19, v20 1624 sqrshrun_b 6, v30, v31 1625 st_h \d_strd, v30, 4 1626 st_h \d_strd, v31, 4 1627 b.le 0f 1628 mov v1.16b, v17.16b 1629 mov v2.16b, v18.16b 1630 mov v3.16b, v19.16b 1631 mov v4.16b, v20.16b 1632 mov v5.16b, v21.16b 1633 mov v6.16b, v22.16b 1634 mov v7.16b, v23.16b 1635 b 216b 16360: 1637 ret 1638.endif 1639 164040: 1641 b.gt 480f 1642 1643 // 4x2, 4x4 v 1644 cmp \h, #2 1645 add \xmy, \xmy, #2 1646 ld1 {v0.s}[0], [\xmy] 1647 sub \src, \src, \s_strd 1648 add \ds2, \dst, \d_strd 1649 add \sr2, \src, \s_strd 1650 lsl \s_strd, \s_strd, #1 1651 lsl \d_strd, \d_strd, #1 1652 sxtl v0.8h, v0.8b 1653 1654 load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5 1655 interleave_1_s v1, v2, v3, v4, v5 1656 uxtl_b v1, v2, v3, v4 1657 mul_mla_4 v6, v1, v2, v3, v4, .8h 1658 shift_store_4 \type, \d_strd, v6 1659 b.le 0f 1660 load_s \sr2, \src, \s_strd, v6, v7 1661 interleave_1_s v5, v6, v7 1662 uxtl_b v5, v6 1663 mul_mla_4 v7, v3, v4, v5, v6, .8h 1664 shift_store_4 \type, \d_strd, v7 16650: 1666 
ret 1667 1668480: // 4x8, 4x16 v 1669 ld1 {v0.8b}, [\xmy] 1670 sub \sr2, \src, \s_strd, lsl #1 1671 add \ds2, \dst, \d_strd 1672 sub \src, \sr2, \s_strd 1673 lsl \s_strd, \s_strd, #1 1674 lsl \d_strd, \d_strd, #1 1675 sxtl v0.8h, v0.8b 1676 1677 load_s \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 1678 interleave_1_s v16, v17, v18 1679 interleave_1_s v18, v19, v20, v21, v22 1680 uxtl_b v16, v17 1681 uxtl_b v18, v19, v20, v21 1682 168348: 1684 subs \h, \h, #4 1685 load_s \sr2, \src, \s_strd, v23, v24, v25, v26 1686 interleave_1_s v22, v23, v24, v25, v26 1687 uxtl_b v22, v23, v24, v25 1688 mul_mla_8_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 1689 shift_store_4 \type, \d_strd, v1, v2 1690 b.le 0f 1691 subs \h, \h, #4 1692 load_s \sr2, \src, \s_strd, v27, v16, v17, v18 1693 interleave_1_s v26, v27, v16, v17, v18 1694 uxtl_b v26, v27, v16, v17 1695 mul_mla_8_2 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16, v17 1696 shift_store_4 \type, \d_strd, v1, v2 1697 b.le 0f 1698 subs \h, \h, #4 1699 load_s \sr2, \src, \s_strd, v19, v20, v21, v22 1700 interleave_1_s v18, v19, v20, v21, v22 1701 uxtl_b v18, v19, v20, v21 1702 mul_mla_8_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21 1703 shift_store_4 \type, \d_strd, v1, v2 1704 b.gt 48b 17050: 1706 ret 1707 170880: 1709 b.gt 880f 1710 1711 // 8x2, 8x4 v 1712 cmp \h, #2 1713 add \xmy, \xmy, #2 1714 ld1 {v0.s}[0], [\xmy] 1715 sub \src, \src, \s_strd 1716 add \ds2, \dst, \d_strd 1717 add \sr2, \src, \s_strd 1718 lsl \s_strd, \s_strd, #1 1719 lsl \d_strd, \d_strd, #1 1720 sxtl v0.8h, v0.8b 1721 1722 load_8b \src, \sr2, \s_strd, v1, v2, v3, v4, v5 1723 uxtl_b v1, v2, v3, v4, v5 1724 mul_mla_4 v6, v1, v2, v3, v4, .8h 1725 mul_mla_4 v7, v2, v3, v4, v5, .8h 1726 shift_store_8 \type, \d_strd, v6, v7 1727 b.le 0f 1728 load_8b \sr2, \src, \s_strd, v6, v7 1729 uxtl_b v6, v7 1730 mul_mla_4 v1, v3, v4, v5, v6, .8h 1731 mul_mla_4 v2, v4, v5, v6, v7, .8h 1732 shift_store_8 \type, \d_strd, v1, v2 17330: 
1734 ret 1735 1736880: // 8x6, 8x8, 8x16, 8x32 v 17371680: // 16x8, 16x16, ... 1738320: // 32x8, 32x16, ... 1739640: 17401280: 1741 ld1 {v0.8b}, [\xmy] 1742 sub \src, \src, \s_strd 1743 sub \src, \src, \s_strd, lsl #1 1744 sxtl v0.8h, v0.8b 1745 mov \my, \h 1746168: 1747 add \ds2, \dst, \d_strd 1748 add \sr2, \src, \s_strd 1749 lsl \s_strd, \s_strd, #1 1750 lsl \d_strd, \d_strd, #1 1751 1752 load_8b \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 1753 uxtl_b v16, v17, v18, v19, v20, v21, v22 1754 175588: 1756 subs \h, \h, #2 1757 load_8b \sr2, \src, \s_strd, v23, v24 1758 uxtl_b v23, v24 1759 mul_mla_8_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24 1760 shift_store_8 \type, \d_strd, v1, v2 1761 b.le 9f 1762 subs \h, \h, #2 1763 load_8b \sr2, \src, \s_strd, v25, v26 1764 uxtl_b v25, v26 1765 mul_mla_8_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26 1766 shift_store_8 \type, \d_strd, v3, v4 1767 b.le 9f 1768 subs \h, \h, #2 1769 load_8b \sr2, \src, \s_strd, v27, v16 1770 uxtl_b v27, v16 1771 mul_mla_8_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16 1772 shift_store_8 \type, \d_strd, v1, v2 1773 b.le 9f 1774 subs \h, \h, #2 1775 load_8b \sr2, \src, \s_strd, v17, v18 1776 uxtl_b v17, v18 1777 mul_mla_8_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18 1778 shift_store_8 \type, \d_strd, v3, v4 1779 b.le 9f 1780 subs \h, \h, #4 1781 load_8b \sr2, \src, \s_strd, v19, v20, v21, v22 1782 uxtl_b v19, v20, v21, v22 1783 mul_mla_8_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20 1784 mul_mla_8_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22 1785 shift_store_8 \type, \d_strd, v1, v2, v3, v4 1786 b.gt 88b 17879: 1788 subs \w, \w, #8 1789 b.le 0f 1790 asr \s_strd, \s_strd, #1 1791 asr \d_strd, \d_strd, #1 1792 msub \src, \s_strd, \xmy, \src 1793 msub \dst, \d_strd, \xmy, \dst 1794 sub \src, \src, \s_strd, lsl #3 1795 mov \h, \my 1796 add \src, \src, #8 1797.ifc \type, put 1798 add \dst, \dst, #8 1799.else 1800 add \dst, \dst, #16 
1801.endif 1802 b 168b 18030: 1804 ret 1805 1806160: 1807 b.gt 1680b 1808 1809 // 16x2, 16x4 v 1810 add \xmy, \xmy, #2 1811 ld1 {v0.s}[0], [\xmy] 1812 sub \src, \src, \s_strd 1813 add \ds2, \dst, \d_strd 1814 add \sr2, \src, \s_strd 1815 lsl \s_strd, \s_strd, #1 1816 lsl \d_strd, \d_strd, #1 1817 sxtl v0.8h, v0.8b 1818 1819 cmp \h, #2 1820 load_16b \src, \sr2, \s_strd, v1, v2, v3, v4, v5 1821 uxtl v16.8h, v1.8b 1822 uxtl v17.8h, v2.8b 1823 uxtl v18.8h, v3.8b 1824 uxtl v19.8h, v4.8b 1825 uxtl v20.8h, v5.8b 1826 uxtl2 v23.8h, v1.16b 1827 uxtl2 v24.8h, v2.16b 1828 uxtl2 v25.8h, v3.16b 1829 uxtl2 v26.8h, v4.16b 1830 uxtl2 v27.8h, v5.16b 1831 mul_mla_4 v1, v16, v17, v18, v19, .8h 1832 mul_mla_4 v16, v17, v18, v19, v20, .8h 1833 mul_mla_4 v2, v23, v24, v25, v26, .8h 1834 mul_mla_4 v17, v24, v25, v26, v27, .8h 1835 shift_store_16 \type, \d_strd, v1, v2, v16, v17 1836 b.le 0f 1837 load_16b \sr2, \src, \s_strd, v6, v7 1838 uxtl v21.8h, v6.8b 1839 uxtl v22.8h, v7.8b 1840 uxtl2 v28.8h, v6.16b 1841 uxtl2 v29.8h, v7.16b 1842 mul_mla_4 v1, v18, v19, v20, v21, .8h 1843 mul_mla_4 v3, v19, v20, v21, v22, .8h 1844 mul_mla_4 v2, v25, v26, v27, v28, .8h 1845 mul_mla_4 v4, v26, v27, v28, v29, .8h 1846 shift_store_16 \type, \d_strd, v1, v2, v3, v4 18470: 1848 ret 1849 1850L(\type\()_8tap_v_tbl): 1851 .hword L(\type\()_8tap_v_tbl) - 1280b 1852 .hword L(\type\()_8tap_v_tbl) - 640b 1853 .hword L(\type\()_8tap_v_tbl) - 320b 1854 .hword L(\type\()_8tap_v_tbl) - 160b 1855 .hword L(\type\()_8tap_v_tbl) - 80b 1856 .hword L(\type\()_8tap_v_tbl) - 40b 1857 .hword L(\type\()_8tap_v_tbl) - 20b 1858 .hword 0 1859 1860L(\type\()_8tap_hv): 1861 cmp \h, #4 1862 ubfx w9, \my, #7, #7 1863 and \my, \my, #0x7f 1864 b.le 4f 1865 mov \my, w9 18664: 1867 add \xmy, x10, \my, uxtw #3 1868 1869 adr x9, L(\type\()_8tap_hv_tbl) 1870 ldrh w8, [x9, x8, lsl #1] 1871 sub x9, x9, w8, uxtw 1872 br x9 1873 187420: 1875.ifc \type, put 1876 add \xmx, \xmx, #2 1877 ld1 {v0.s}[0], [\xmx] 1878 b.gt 280f 1879 add \xmy, \xmy, 
#2 1880 ld1 {v1.s}[0], [\xmy] 1881 1882 // 2x2, 2x4 hv 1883 sub \sr2, \src, #1 1884 sub \src, \sr2, \s_strd 1885 add \ds2, \dst, \d_strd 1886 lsl \s_strd, \s_strd, #1 1887 lsl \d_strd, \d_strd, #1 1888 sxtl v0.8h, v0.8b 1889 sxtl v1.8h, v1.8b 1890 mov x15, x30 1891 1892 ld1 {v28.8b}, [\src], \s_strd 1893 uxtl v28.8h, v28.8b 1894 ext v29.16b, v28.16b, v28.16b, #2 1895 mul v28.4h, v28.4h, v0.4h 1896 mul v29.4h, v29.4h, v0.4h 1897 addp v28.4h, v28.4h, v29.4h 1898 addp v16.4h, v28.4h, v28.4h 1899 srshr v16.4h, v16.4h, #2 1900 bl L(\type\()_8tap_filter_2) 1901 1902 trn1 v16.2s, v16.2s, v28.2s 1903 mov v17.8b, v28.8b 1904 19052: 1906 bl L(\type\()_8tap_filter_2) 1907 1908 ext v18.8b, v17.8b, v28.8b, #4 1909 smull v2.4s, v16.4h, v1.h[0] 1910 smlal v2.4s, v17.4h, v1.h[1] 1911 smlal v2.4s, v18.4h, v1.h[2] 1912 smlal v2.4s, v28.4h, v1.h[3] 1913 1914 sqrshrn v2.4h, v2.4s, #\shift_hv 1915 sqxtun v2.8b, v2.8h 1916 subs \h, \h, #2 1917 st1 {v2.h}[0], [\dst], \d_strd 1918 st1 {v2.h}[1], [\ds2], \d_strd 1919 b.le 0f 1920 mov v16.8b, v18.8b 1921 mov v17.8b, v28.8b 1922 b 2b 1923 1924280: // 2x8, 2x16, 2x32 hv 1925 ld1 {v1.8b}, [\xmy] 1926 sub \src, \src, #1 1927 sub \sr2, \src, \s_strd, lsl #1 1928 sub \src, \sr2, \s_strd 1929 add \ds2, \dst, \d_strd 1930 lsl \s_strd, \s_strd, #1 1931 lsl \d_strd, \d_strd, #1 1932 sxtl v0.8h, v0.8b 1933 sxtl v1.8h, v1.8b 1934 mov x15, x30 1935 1936 ld1 {v28.8b}, [\src], \s_strd 1937 uxtl v28.8h, v28.8b 1938 ext v29.16b, v28.16b, v28.16b, #2 1939 mul v28.4h, v28.4h, v0.4h 1940 mul v29.4h, v29.4h, v0.4h 1941 addp v28.4h, v28.4h, v29.4h 1942 addp v16.4h, v28.4h, v28.4h 1943 srshr v16.4h, v16.4h, #2 1944 1945 bl L(\type\()_8tap_filter_2) 1946 trn1 v16.2s, v16.2s, v28.2s 1947 mov v17.8b, v28.8b 1948 bl L(\type\()_8tap_filter_2) 1949 ext v18.8b, v17.8b, v28.8b, #4 1950 mov v19.8b, v28.8b 1951 bl L(\type\()_8tap_filter_2) 1952 ext v20.8b, v19.8b, v28.8b, #4 1953 mov v21.8b, v28.8b 1954 195528: 1956 bl L(\type\()_8tap_filter_2) 1957 ext v22.8b, v21.8b, 
v28.8b, #4 1958 smull v2.4s, v16.4h, v1.h[0] 1959 smlal v2.4s, v17.4h, v1.h[1] 1960 smlal v2.4s, v18.4h, v1.h[2] 1961 smlal v2.4s, v19.4h, v1.h[3] 1962 smlal v2.4s, v20.4h, v1.h[4] 1963 smlal v2.4s, v21.4h, v1.h[5] 1964 smlal v2.4s, v22.4h, v1.h[6] 1965 smlal v2.4s, v28.4h, v1.h[7] 1966 1967 sqrshrn v2.4h, v2.4s, #\shift_hv 1968 sqxtun v2.8b, v2.8h 1969 subs \h, \h, #2 1970 st1 {v2.h}[0], [\dst], \d_strd 1971 st1 {v2.h}[1], [\ds2], \d_strd 1972 b.le 0f 1973 mov v16.8b, v18.8b 1974 mov v17.8b, v19.8b 1975 mov v18.8b, v20.8b 1976 mov v19.8b, v21.8b 1977 mov v20.8b, v22.8b 1978 mov v21.8b, v28.8b 1979 b 28b 1980 19810: 1982 br x15 1983 1984L(\type\()_8tap_filter_2): 1985 ld1 {v28.8b}, [\sr2], \s_strd 1986 ld1 {v30.8b}, [\src], \s_strd 1987 uxtl v28.8h, v28.8b 1988 uxtl v30.8h, v30.8b 1989 ext v29.16b, v28.16b, v28.16b, #2 1990 ext v31.16b, v30.16b, v30.16b, #2 1991 trn1 v27.2s, v28.2s, v30.2s 1992 trn2 v30.2s, v28.2s, v30.2s 1993 trn1 v28.2s, v29.2s, v31.2s 1994 trn2 v31.2s, v29.2s, v31.2s 1995 mul v27.4h, v27.4h, v0.h[0] 1996 mla v27.4h, v28.4h, v0.h[1] 1997 mla v27.4h, v30.4h, v0.h[2] 1998 mla v27.4h, v31.4h, v0.h[3] 1999 srshr v28.4h, v27.4h, #2 2000 ret 2001.endif 2002 200340: 2004 add \xmx, \xmx, #2 2005 ld1 {v0.s}[0], [\xmx] 2006 b.gt 480f 2007 add \xmy, \xmy, #2 2008 ld1 {v1.s}[0], [\xmy] 2009 sub \sr2, \src, #1 2010 sub \src, \sr2, \s_strd 2011 add \ds2, \dst, \d_strd 2012 lsl \s_strd, \s_strd, #1 2013 lsl \d_strd, \d_strd, #1 2014 sxtl v0.8h, v0.8b 2015 sxtl v1.8h, v1.8b 2016 mov x15, x30 2017 2018 // 4x2, 4x4 hv 2019 ld1 {v26.8b}, [\src], \s_strd 2020 uxtl v26.8h, v26.8b 2021 ext v28.16b, v26.16b, v26.16b, #2 2022 ext v29.16b, v26.16b, v26.16b, #4 2023 ext v30.16b, v26.16b, v26.16b, #6 2024 mul v31.4h, v26.4h, v0.h[0] 2025 mla v31.4h, v28.4h, v0.h[1] 2026 mla v31.4h, v29.4h, v0.h[2] 2027 mla v31.4h, v30.4h, v0.h[3] 2028 srshr v16.4h, v31.4h, #2 2029 2030 bl L(\type\()_8tap_filter_4) 2031 mov v17.8b, v28.8b 2032 mov v18.8b, v29.8b 2033 20344: 2035 bl 
L(\type\()_8tap_filter_4) 2036 // Interleaving the mul/mla chains actually hurts performance 2037 // significantly on Cortex A53, thus keeping mul/mla tightly 2038 // chained like this. 2039 smull v2.4s, v16.4h, v1.h[0] 2040 smlal v2.4s, v17.4h, v1.h[1] 2041 smlal v2.4s, v18.4h, v1.h[2] 2042 smlal v2.4s, v28.4h, v1.h[3] 2043 smull v3.4s, v17.4h, v1.h[0] 2044 smlal v3.4s, v18.4h, v1.h[1] 2045 smlal v3.4s, v28.4h, v1.h[2] 2046 smlal v3.4s, v29.4h, v1.h[3] 2047 sqrshrn v2.4h, v2.4s, #\shift_hv 2048 sqrshrn v3.4h, v3.4s, #\shift_hv 2049 subs \h, \h, #2 2050.ifc \type, put 2051 sqxtun v2.8b, v2.8h 2052 sqxtun v3.8b, v3.8h 2053 st1 {v2.s}[0], [\dst], \d_strd 2054 st1 {v3.s}[0], [\ds2], \d_strd 2055.else 2056 st1 {v2.4h}, [\dst], \d_strd 2057 st1 {v3.4h}, [\ds2], \d_strd 2058.endif 2059 b.le 0f 2060 mov v16.8b, v18.8b 2061 mov v17.8b, v28.8b 2062 mov v18.8b, v29.8b 2063 b 4b 2064 2065480: // 4x8, 4x16, 4x32 hv 2066 ld1 {v1.8b}, [\xmy] 2067 sub \src, \src, #1 2068 sub \sr2, \src, \s_strd, lsl #1 2069 sub \src, \sr2, \s_strd 2070 add \ds2, \dst, \d_strd 2071 lsl \s_strd, \s_strd, #1 2072 lsl \d_strd, \d_strd, #1 2073 sxtl v0.8h, v0.8b 2074 sxtl v1.8h, v1.8b 2075 mov x15, x30 2076 2077 ld1 {v26.8b}, [\src], \s_strd 2078 uxtl v26.8h, v26.8b 2079 ext v28.16b, v26.16b, v26.16b, #2 2080 ext v29.16b, v26.16b, v26.16b, #4 2081 ext v30.16b, v26.16b, v26.16b, #6 2082 mul v31.4h, v26.4h, v0.h[0] 2083 mla v31.4h, v28.4h, v0.h[1] 2084 mla v31.4h, v29.4h, v0.h[2] 2085 mla v31.4h, v30.4h, v0.h[3] 2086 srshr v16.4h, v31.4h, #2 2087 2088 bl L(\type\()_8tap_filter_4) 2089 mov v17.8b, v28.8b 2090 mov v18.8b, v29.8b 2091 bl L(\type\()_8tap_filter_4) 2092 mov v19.8b, v28.8b 2093 mov v20.8b, v29.8b 2094 bl L(\type\()_8tap_filter_4) 2095 mov v21.8b, v28.8b 2096 mov v22.8b, v29.8b 2097 209848: 2099 bl L(\type\()_8tap_filter_4) 2100 smull v2.4s, v16.4h, v1.h[0] 2101 smlal v2.4s, v17.4h, v1.h[1] 2102 smlal v2.4s, v18.4h, v1.h[2] 2103 smlal v2.4s, v19.4h, v1.h[3] 2104 smlal v2.4s, v20.4h, v1.h[4] 
2105 smlal v2.4s, v21.4h, v1.h[5] 2106 smlal v2.4s, v22.4h, v1.h[6] 2107 smlal v2.4s, v28.4h, v1.h[7] 2108 smull v3.4s, v17.4h, v1.h[0] 2109 smlal v3.4s, v18.4h, v1.h[1] 2110 smlal v3.4s, v19.4h, v1.h[2] 2111 smlal v3.4s, v20.4h, v1.h[3] 2112 smlal v3.4s, v21.4h, v1.h[4] 2113 smlal v3.4s, v22.4h, v1.h[5] 2114 smlal v3.4s, v28.4h, v1.h[6] 2115 smlal v3.4s, v29.4h, v1.h[7] 2116 sqrshrn v2.4h, v2.4s, #\shift_hv 2117 sqrshrn v3.4h, v3.4s, #\shift_hv 2118 subs \h, \h, #2 2119.ifc \type, put 2120 sqxtun v2.8b, v2.8h 2121 sqxtun v3.8b, v3.8h 2122 st1 {v2.s}[0], [\dst], \d_strd 2123 st1 {v3.s}[0], [\ds2], \d_strd 2124.else 2125 st1 {v2.4h}, [\dst], \d_strd 2126 st1 {v3.4h}, [\ds2], \d_strd 2127.endif 2128 b.le 0f 2129 mov v16.8b, v18.8b 2130 mov v17.8b, v19.8b 2131 mov v18.8b, v20.8b 2132 mov v19.8b, v21.8b 2133 mov v20.8b, v22.8b 2134 mov v21.8b, v28.8b 2135 mov v22.8b, v29.8b 2136 b 48b 21370: 2138 br x15 2139 2140L(\type\()_8tap_filter_4): 2141 ld1 {v26.8b}, [\sr2], \s_strd 2142 ld1 {v27.8b}, [\src], \s_strd 2143 uxtl v26.8h, v26.8b 2144 uxtl v27.8h, v27.8b 2145 ext v28.16b, v26.16b, v26.16b, #2 2146 ext v29.16b, v26.16b, v26.16b, #4 2147 ext v30.16b, v26.16b, v26.16b, #6 2148 mul v31.4h, v26.4h, v0.h[0] 2149 mla v31.4h, v28.4h, v0.h[1] 2150 mla v31.4h, v29.4h, v0.h[2] 2151 mla v31.4h, v30.4h, v0.h[3] 2152 ext v28.16b, v27.16b, v27.16b, #2 2153 ext v29.16b, v27.16b, v27.16b, #4 2154 ext v30.16b, v27.16b, v27.16b, #6 2155 mul v27.4h, v27.4h, v0.h[0] 2156 mla v27.4h, v28.4h, v0.h[1] 2157 mla v27.4h, v29.4h, v0.h[2] 2158 mla v27.4h, v30.4h, v0.h[3] 2159 srshr v28.4h, v31.4h, #2 2160 srshr v29.4h, v27.4h, #2 2161 ret 2162 216380: 2164160: 2165320: 2166 b.gt 880f 2167 add \xmy, \xmy, #2 2168 ld1 {v0.8b}, [\xmx] 2169 ld1 {v1.s}[0], [\xmy] 2170 sub \src, \src, #3 2171 sub \src, \src, \s_strd 2172 sxtl v0.8h, v0.8b 2173 sxtl v1.8h, v1.8b 2174 mov x15, x30 2175 mov \my, \h 2176 2177164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv 2178 add \ds2, \dst, \d_strd 2179 add \sr2, \src, 
\s_strd 2180 lsl \d_strd, \d_strd, #1 2181 lsl \s_strd, \s_strd, #1 2182 2183 bl L(\type\()_8tap_filter_8_first) 2184 bl L(\type\()_8tap_filter_8) 2185 mov v17.16b, v24.16b 2186 mov v18.16b, v25.16b 2187 21888: 2189 smull v2.4s, v16.4h, v1.h[0] 2190 smull2 v3.4s, v16.8h, v1.h[0] 2191 bl L(\type\()_8tap_filter_8) 2192 smull v4.4s, v17.4h, v1.h[0] 2193 smull2 v5.4s, v17.8h, v1.h[0] 2194 smlal v2.4s, v17.4h, v1.h[1] 2195 smlal2 v3.4s, v17.8h, v1.h[1] 2196 smlal v4.4s, v18.4h, v1.h[1] 2197 smlal2 v5.4s, v18.8h, v1.h[1] 2198 smlal v2.4s, v18.4h, v1.h[2] 2199 smlal2 v3.4s, v18.8h, v1.h[2] 2200 smlal v4.4s, v24.4h, v1.h[2] 2201 smlal2 v5.4s, v24.8h, v1.h[2] 2202 smlal v2.4s, v24.4h, v1.h[3] 2203 smlal2 v3.4s, v24.8h, v1.h[3] 2204 smlal v4.4s, v25.4h, v1.h[3] 2205 smlal2 v5.4s, v25.8h, v1.h[3] 2206 sqrshrn v2.4h, v2.4s, #\shift_hv 2207 sqrshrn2 v2.8h, v3.4s, #\shift_hv 2208 sqrshrn v4.4h, v4.4s, #\shift_hv 2209 sqrshrn2 v4.8h, v5.4s, #\shift_hv 2210 subs \h, \h, #2 2211.ifc \type, put 2212 sqxtun v2.8b, v2.8h 2213 sqxtun v4.8b, v4.8h 2214 st1 {v2.8b}, [\dst], \d_strd 2215 st1 {v4.8b}, [\ds2], \d_strd 2216.else 2217 st1 {v2.8h}, [\dst], \d_strd 2218 st1 {v4.8h}, [\ds2], \d_strd 2219.endif 2220 b.le 9f 2221 mov v16.16b, v18.16b 2222 mov v17.16b, v24.16b 2223 mov v18.16b, v25.16b 2224 b 8b 22259: 2226 subs \w, \w, #8 2227 b.le 0f 2228 asr \s_strd, \s_strd, #1 2229 asr \d_strd, \d_strd, #1 2230 msub \src, \s_strd, \xmy, \src 2231 msub \dst, \d_strd, \xmy, \dst 2232 sub \src, \src, \s_strd, lsl #2 2233 mov \h, \my 2234 add \src, \src, #8 2235.ifc \type, put 2236 add \dst, \dst, #8 2237.else 2238 add \dst, \dst, #16 2239.endif 2240 b 164b 2241 2242880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... 
hv 2243640: 22441280: 2245 ld1 {v0.8b}, [\xmx] 2246 ld1 {v1.8b}, [\xmy] 2247 sub \src, \src, #3 2248 sub \src, \src, \s_strd 2249 sub \src, \src, \s_strd, lsl #1 2250 sxtl v0.8h, v0.8b 2251 sxtl v1.8h, v1.8b 2252 mov x15, x30 2253 mov \my, \h 2254 2255168: 2256 add \ds2, \dst, \d_strd 2257 add \sr2, \src, \s_strd 2258 lsl \d_strd, \d_strd, #1 2259 lsl \s_strd, \s_strd, #1 2260 2261 bl L(\type\()_8tap_filter_8_first) 2262 bl L(\type\()_8tap_filter_8) 2263 mov v17.16b, v24.16b 2264 mov v18.16b, v25.16b 2265 bl L(\type\()_8tap_filter_8) 2266 mov v19.16b, v24.16b 2267 mov v20.16b, v25.16b 2268 bl L(\type\()_8tap_filter_8) 2269 mov v21.16b, v24.16b 2270 mov v22.16b, v25.16b 2271 227288: 2273 smull v2.4s, v16.4h, v1.h[0] 2274 smull2 v3.4s, v16.8h, v1.h[0] 2275 bl L(\type\()_8tap_filter_8) 2276 smull v4.4s, v17.4h, v1.h[0] 2277 smull2 v5.4s, v17.8h, v1.h[0] 2278 smlal v2.4s, v17.4h, v1.h[1] 2279 smlal2 v3.4s, v17.8h, v1.h[1] 2280 smlal v4.4s, v18.4h, v1.h[1] 2281 smlal2 v5.4s, v18.8h, v1.h[1] 2282 smlal v2.4s, v18.4h, v1.h[2] 2283 smlal2 v3.4s, v18.8h, v1.h[2] 2284 smlal v4.4s, v19.4h, v1.h[2] 2285 smlal2 v5.4s, v19.8h, v1.h[2] 2286 smlal v2.4s, v19.4h, v1.h[3] 2287 smlal2 v3.4s, v19.8h, v1.h[3] 2288 smlal v4.4s, v20.4h, v1.h[3] 2289 smlal2 v5.4s, v20.8h, v1.h[3] 2290 smlal v2.4s, v20.4h, v1.h[4] 2291 smlal2 v3.4s, v20.8h, v1.h[4] 2292 smlal v4.4s, v21.4h, v1.h[4] 2293 smlal2 v5.4s, v21.8h, v1.h[4] 2294 smlal v2.4s, v21.4h, v1.h[5] 2295 smlal2 v3.4s, v21.8h, v1.h[5] 2296 smlal v4.4s, v22.4h, v1.h[5] 2297 smlal2 v5.4s, v22.8h, v1.h[5] 2298 smlal v2.4s, v22.4h, v1.h[6] 2299 smlal2 v3.4s, v22.8h, v1.h[6] 2300 smlal v4.4s, v24.4h, v1.h[6] 2301 smlal2 v5.4s, v24.8h, v1.h[6] 2302 smlal v2.4s, v24.4h, v1.h[7] 2303 smlal2 v3.4s, v24.8h, v1.h[7] 2304 smlal v4.4s, v25.4h, v1.h[7] 2305 smlal2 v5.4s, v25.8h, v1.h[7] 2306 sqrshrn v2.4h, v2.4s, #\shift_hv 2307 sqrshrn2 v2.8h, v3.4s, #\shift_hv 2308 sqrshrn v4.4h, v4.4s, #\shift_hv 2309 sqrshrn2 v4.8h, v5.4s, #\shift_hv 2310 subs \h, 
\h, #2 2311.ifc \type, put 2312 sqxtun v2.8b, v2.8h 2313 sqxtun v4.8b, v4.8h 2314 st1 {v2.8b}, [\dst], \d_strd 2315 st1 {v4.8b}, [\ds2], \d_strd 2316.else 2317 st1 {v2.8h}, [\dst], \d_strd 2318 st1 {v4.8h}, [\ds2], \d_strd 2319.endif 2320 b.le 9f 2321 mov v16.16b, v18.16b 2322 mov v17.16b, v19.16b 2323 mov v18.16b, v20.16b 2324 mov v19.16b, v21.16b 2325 mov v20.16b, v22.16b 2326 mov v21.16b, v24.16b 2327 mov v22.16b, v25.16b 2328 b 88b 23299: 2330 subs \w, \w, #8 2331 b.le 0f 2332 asr \s_strd, \s_strd, #1 2333 asr \d_strd, \d_strd, #1 2334 msub \src, \s_strd, \xmy, \src 2335 msub \dst, \d_strd, \xmy, \dst 2336 sub \src, \src, \s_strd, lsl #3 2337 mov \h, \my 2338 add \src, \src, #8 2339.ifc \type, put 2340 add \dst, \dst, #8 2341.else 2342 add \dst, \dst, #16 2343.endif 2344 b 168b 23450: 2346 br x15 2347 2348L(\type\()_8tap_filter_8_first): 2349 ld1 {v28.8b, v29.8b}, [\src], \s_strd 2350 uxtl v28.8h, v28.8b 2351 uxtl v29.8h, v29.8b 2352 mul v16.8h, v28.8h, v0.h[0] 2353 ext v24.16b, v28.16b, v29.16b, #(2*1) 2354 ext v25.16b, v28.16b, v29.16b, #(2*2) 2355 ext v26.16b, v28.16b, v29.16b, #(2*3) 2356 ext v27.16b, v28.16b, v29.16b, #(2*4) 2357 mla v16.8h, v24.8h, v0.h[1] 2358 mla v16.8h, v25.8h, v0.h[2] 2359 mla v16.8h, v26.8h, v0.h[3] 2360 mla v16.8h, v27.8h, v0.h[4] 2361 ext v24.16b, v28.16b, v29.16b, #(2*5) 2362 ext v25.16b, v28.16b, v29.16b, #(2*6) 2363 ext v26.16b, v28.16b, v29.16b, #(2*7) 2364 mla v16.8h, v24.8h, v0.h[5] 2365 mla v16.8h, v25.8h, v0.h[6] 2366 mla v16.8h, v26.8h, v0.h[7] 2367 srshr v16.8h, v16.8h, #2 2368 ret 2369 2370L(\type\()_8tap_filter_8): 2371 ld1 {v28.8b, v29.8b}, [\sr2], \s_strd 2372 ld1 {v30.8b, v31.8b}, [\src], \s_strd 2373 uxtl v28.8h, v28.8b 2374 uxtl v29.8h, v29.8b 2375 uxtl v30.8h, v30.8b 2376 uxtl v31.8h, v31.8b 2377 mul v24.8h, v28.8h, v0.h[0] 2378 mul v25.8h, v30.8h, v0.h[0] 2379.irpc i, 1234567 2380 ext v26.16b, v28.16b, v29.16b, #(2*\i) 2381 ext v27.16b, v30.16b, v31.16b, #(2*\i) 2382 mla v24.8h, v26.8h, v0.h[\i] 2383 mla v25.8h, 
v27.8h, v0.h[\i] 2384.endr 2385 srshr v24.8h, v24.8h, #2 2386 srshr v25.8h, v25.8h, #2 2387 ret 2388 2389L(\type\()_8tap_hv_tbl): 2390 .hword L(\type\()_8tap_hv_tbl) - 1280b 2391 .hword L(\type\()_8tap_hv_tbl) - 640b 2392 .hword L(\type\()_8tap_hv_tbl) - 320b 2393 .hword L(\type\()_8tap_hv_tbl) - 160b 2394 .hword L(\type\()_8tap_hv_tbl) - 80b 2395 .hword L(\type\()_8tap_hv_tbl) - 40b 2396 .hword L(\type\()_8tap_hv_tbl) - 20b 2397 .hword 0 2398endfunc 2399 2400 2401function \type\()_bilin_8bpc_neon, export=1 2402 dup v1.16b, \mx 2403 dup v3.16b, \my 2404 mov w9, #16 2405 sub w8, w9, \mx 2406 sub w9, w9, \my 2407 dup v0.16b, w8 2408 dup v2.16b, w9 2409.ifc \type, prep 2410 uxtw \d_strd, \w 2411 lsl \d_strd, \d_strd, #1 2412.endif 2413 2414 clz w8, \w 2415 sub w8, w8, #24 2416 cbnz \mx, L(\type\()_bilin_h) 2417 cbnz \my, L(\type\()_bilin_v) 2418 b \type\()_neon 2419 2420L(\type\()_bilin_h): 2421 cbnz \my, L(\type\()_bilin_hv) 2422 2423 adr x9, L(\type\()_bilin_h_tbl) 2424 ldrh w8, [x9, x8, lsl #1] 2425 sub x9, x9, w8, uxtw 2426 br x9 2427 242820: // 2xN h 2429.ifc \type, put 2430 add \ds2, \dst, \d_strd 2431 add \sr2, \src, \s_strd 2432 lsl \d_strd, \d_strd, #1 2433 lsl \s_strd, \s_strd, #1 24342: 2435 ld1 {v4.s}[0], [\src], \s_strd 2436 ld1 {v6.s}[0], [\sr2], \s_strd 2437 ext v5.8b, v4.8b, v4.8b, #1 2438 ext v7.8b, v6.8b, v6.8b, #1 2439 trn1 v4.4h, v4.4h, v6.4h 2440 trn1 v5.4h, v5.4h, v7.4h 2441 subs \h, \h, #2 2442 umull v4.8h, v4.8b, v0.8b 2443 umlal v4.8h, v5.8b, v1.8b 2444 uqrshrn v4.8b, v4.8h, #4 2445 st1 {v4.h}[0], [\dst], \d_strd 2446 st1 {v4.h}[1], [\ds2], \d_strd 2447 b.gt 2b 2448 ret 2449.endif 2450 245140: // 4xN h 2452 add \ds2, \dst, \d_strd 2453 add \sr2, \src, \s_strd 2454 lsl \d_strd, \d_strd, #1 2455 lsl \s_strd, \s_strd, #1 24564: 2457 ld1 {v4.8b}, [\src], \s_strd 2458 ld1 {v6.8b}, [\sr2], \s_strd 2459 ext v5.8b, v4.8b, v4.8b, #1 2460 ext v7.8b, v6.8b, v6.8b, #1 2461 trn1 v4.2s, v4.2s, v6.2s 2462 trn1 v5.2s, v5.2s, v7.2s 2463 subs \h, \h, #2 2464 
umull v4.8h, v4.8b, v0.8b 2465 umlal v4.8h, v5.8b, v1.8b 2466.ifc \type, put 2467 uqrshrn v4.8b, v4.8h, #4 2468 st1 {v4.s}[0], [\dst], \d_strd 2469 st1 {v4.s}[1], [\ds2], \d_strd 2470.else 2471 st1 {v4.d}[0], [\dst], \d_strd 2472 st1 {v4.d}[1], [\ds2], \d_strd 2473.endif 2474 b.gt 4b 2475 ret 2476 247780: // 8xN h 2478 add \ds2, \dst, \d_strd 2479 add \sr2, \src, \s_strd 2480 lsl \d_strd, \d_strd, #1 2481 lsl \s_strd, \s_strd, #1 24828: 2483 ld1 {v4.16b}, [\src], \s_strd 2484 ld1 {v6.16b}, [\sr2], \s_strd 2485 ext v5.16b, v4.16b, v4.16b, #1 2486 ext v7.16b, v6.16b, v6.16b, #1 2487 subs \h, \h, #2 2488 umull v4.8h, v4.8b, v0.8b 2489 umull v6.8h, v6.8b, v0.8b 2490 umlal v4.8h, v5.8b, v1.8b 2491 umlal v6.8h, v7.8b, v1.8b 2492.ifc \type, put 2493 uqrshrn v4.8b, v4.8h, #4 2494 uqrshrn v6.8b, v6.8h, #4 2495 st1 {v4.8b}, [\dst], \d_strd 2496 st1 {v6.8b}, [\ds2], \d_strd 2497.else 2498 st1 {v4.8h}, [\dst], \d_strd 2499 st1 {v6.8h}, [\ds2], \d_strd 2500.endif 2501 b.gt 8b 2502 ret 2503160: 2504320: 2505640: 25061280: // 16xN, 32xN, ... 
h 2507 add \ds2, \dst, \d_strd 2508 add \sr2, \src, \s_strd 2509 lsl \s_strd, \s_strd, #1 2510 2511 sub \s_strd, \s_strd, \w, uxtw 2512 sub \s_strd, \s_strd, #8 2513.ifc \type, put 2514 lsl \d_strd, \d_strd, #1 2515 sub \d_strd, \d_strd, \w, uxtw 2516.endif 2517161: 2518 ld1 {v16.d}[1], [\src], #8 2519 ld1 {v20.d}[1], [\sr2], #8 2520 mov \mx, \w 2521 252216: 2523 ld1 {v18.16b}, [\src], #16 2524 ld1 {v22.16b}, [\sr2], #16 2525 ext v17.16b, v16.16b, v18.16b, #8 2526 ext v19.16b, v16.16b, v18.16b, #9 2527 ext v21.16b, v20.16b, v22.16b, #8 2528 ext v23.16b, v20.16b, v22.16b, #9 2529 umull v16.8h, v17.8b, v0.8b 2530 umull2 v17.8h, v17.16b, v0.16b 2531 umull v20.8h, v21.8b, v0.8b 2532 umull2 v21.8h, v21.16b, v0.16b 2533 umlal v16.8h, v19.8b, v1.8b 2534 umlal2 v17.8h, v19.16b, v1.16b 2535 umlal v20.8h, v23.8b, v1.8b 2536 umlal2 v21.8h, v23.16b, v1.16b 2537 subs \mx, \mx, #16 2538.ifc \type, put 2539 uqrshrn v16.8b, v16.8h, #4 2540 uqrshrn2 v16.16b, v17.8h, #4 2541 uqrshrn v20.8b, v20.8h, #4 2542 uqrshrn2 v20.16b, v21.8h, #4 2543 st1 {v16.16b}, [\dst], #16 2544 st1 {v20.16b}, [\ds2], #16 2545.else 2546 st1 {v16.8h, v17.8h}, [\dst], #32 2547 st1 {v20.8h, v21.8h}, [\ds2], #32 2548.endif 2549 b.le 9f 2550 2551 mov v16.16b, v18.16b 2552 mov v20.16b, v22.16b 2553 b 16b 2554 25559: 2556 add \dst, \dst, \d_strd 2557 add \ds2, \ds2, \d_strd 2558 add \src, \src, \s_strd 2559 add \sr2, \sr2, \s_strd 2560 2561 subs \h, \h, #2 2562 b.gt 161b 2563 ret 2564 2565L(\type\()_bilin_h_tbl): 2566 .hword L(\type\()_bilin_h_tbl) - 1280b 2567 .hword L(\type\()_bilin_h_tbl) - 640b 2568 .hword L(\type\()_bilin_h_tbl) - 320b 2569 .hword L(\type\()_bilin_h_tbl) - 160b 2570 .hword L(\type\()_bilin_h_tbl) - 80b 2571 .hword L(\type\()_bilin_h_tbl) - 40b 2572 .hword L(\type\()_bilin_h_tbl) - 20b 2573 .hword 0 2574 2575 2576L(\type\()_bilin_v): 2577 cmp \h, #4 2578 adr x9, L(\type\()_bilin_v_tbl) 2579 ldrh w8, [x9, x8, lsl #1] 2580 sub x9, x9, w8, uxtw 2581 br x9 2582 258320: // 2xN v 2584.ifc \type, put 
2585 cmp \h, #2 2586 add \ds2, \dst, \d_strd 2587 add \sr2, \src, \s_strd 2588 lsl \s_strd, \s_strd, #1 2589 lsl \d_strd, \d_strd, #1 2590 2591 // 2x2 v 2592 ld1 {v16.h}[0], [\src], \s_strd 2593 b.gt 24f 2594 ld1 {v17.h}[0], [\sr2], \s_strd 2595 ld1 {v18.h}[0], [\src], \s_strd 2596 trn1 v16.4h, v16.4h, v17.4h 2597 trn1 v17.4h, v17.4h, v18.4h 2598 umull v4.8h, v16.8b, v2.8b 2599 umlal v4.8h, v17.8b, v3.8b 2600 uqrshrn v4.8b, v4.8h, #4 2601 st1 {v4.h}[0], [\dst] 2602 st1 {v4.h}[1], [\ds2] 2603 ret 260424: // 2x4, 2x8, ... v 2605 ld1 {v17.h}[0], [\sr2], \s_strd 2606 ld1 {v18.h}[0], [\src], \s_strd 2607 ld1 {v19.h}[0], [\sr2], \s_strd 2608 ld1 {v20.h}[0], [\src], \s_strd 2609 trn1 v16.4h, v16.4h, v17.4h 2610 trn1 v17.4h, v17.4h, v18.4h 2611 trn1 v18.4h, v18.4h, v19.4h 2612 trn1 v19.4h, v19.4h, v20.4h 2613 trn1 v16.2s, v16.2s, v18.2s 2614 trn1 v17.2s, v17.2s, v19.2s 2615 umull v4.8h, v16.8b, v2.8b 2616 umlal v4.8h, v17.8b, v3.8b 2617 subs \h, \h, #4 2618 uqrshrn v4.8b, v4.8h, #4 2619 st1 {v4.h}[0], [\dst], \d_strd 2620 st1 {v4.h}[1], [\ds2], \d_strd 2621 st1 {v4.h}[2], [\dst], \d_strd 2622 st1 {v4.h}[3], [\ds2], \d_strd 2623 b.le 0f 2624 mov v16.8b, v20.8b 2625 b 24b 26260: 2627 ret 2628.endif 2629 263040: // 4xN v 2631 add \ds2, \dst, \d_strd 2632 add \sr2, \src, \s_strd 2633 lsl \s_strd, \s_strd, #1 2634 lsl \d_strd, \d_strd, #1 2635 ld1 {v16.s}[0], [\src], \s_strd 26364: 2637 ld1 {v17.s}[0], [\sr2], \s_strd 2638 ld1 {v18.s}[0], [\src], \s_strd 2639 trn1 v16.2s, v16.2s, v17.2s 2640 trn1 v17.2s, v17.2s, v18.2s 2641 umull v4.8h, v16.8b, v2.8b 2642 umlal v4.8h, v17.8b, v3.8b 2643 subs \h, \h, #2 2644.ifc \type, put 2645 uqrshrn v4.8b, v4.8h, #4 2646 st1 {v4.s}[0], [\dst], \d_strd 2647 st1 {v4.s}[1], [\ds2], \d_strd 2648.else 2649 st1 {v4.d}[0], [\dst], \d_strd 2650 st1 {v4.d}[1], [\ds2], \d_strd 2651.endif 2652 b.le 0f 2653 mov v16.8b, v18.8b 2654 b 4b 26550: 2656 ret 2657 265880: // 8xN v 2659 add \ds2, \dst, \d_strd 2660 add \sr2, \src, \s_strd 2661 lsl \s_strd, 
\s_strd, #1 2662 lsl \d_strd, \d_strd, #1 2663 ld1 {v16.8b}, [\src], \s_strd 26648: 2665 ld1 {v17.8b}, [\sr2], \s_strd 2666 ld1 {v18.8b}, [\src], \s_strd 2667 umull v4.8h, v16.8b, v2.8b 2668 umull v5.8h, v17.8b, v2.8b 2669 umlal v4.8h, v17.8b, v3.8b 2670 umlal v5.8h, v18.8b, v3.8b 2671 subs \h, \h, #2 2672.ifc \type, put 2673 uqrshrn v4.8b, v4.8h, #4 2674 uqrshrn v5.8b, v5.8h, #4 2675 st1 {v4.8b}, [\dst], \d_strd 2676 st1 {v5.8b}, [\ds2], \d_strd 2677.else 2678 st1 {v4.8h}, [\dst], \d_strd 2679 st1 {v5.8h}, [\ds2], \d_strd 2680.endif 2681 b.le 0f 2682 mov v16.8b, v18.8b 2683 b 8b 26840: 2685 ret 2686 2687160: // 16xN, 32xN, ... 2688320: 2689640: 26901280: 2691 mov \my, \h 26921: 2693 add \ds2, \dst, \d_strd 2694 add \sr2, \src, \s_strd 2695 lsl \s_strd, \s_strd, #1 2696 lsl \d_strd, \d_strd, #1 2697 2698 ld1 {v16.16b}, [\src], \s_strd 26992: 2700 ld1 {v17.16b}, [\sr2], \s_strd 2701 ld1 {v18.16b}, [\src], \s_strd 2702 umull v4.8h, v16.8b, v2.8b 2703 umull2 v5.8h, v16.16b, v2.16b 2704 umull v6.8h, v17.8b, v2.8b 2705 umull2 v7.8h, v17.16b, v2.16b 2706 umlal v4.8h, v17.8b, v3.8b 2707 umlal2 v5.8h, v17.16b, v3.16b 2708 umlal v6.8h, v18.8b, v3.8b 2709 umlal2 v7.8h, v18.16b, v3.16b 2710 subs \h, \h, #2 2711.ifc \type, put 2712 uqrshrn v4.8b, v4.8h, #4 2713 uqrshrn2 v4.16b, v5.8h, #4 2714 uqrshrn v6.8b, v6.8h, #4 2715 uqrshrn2 v6.16b, v7.8h, #4 2716 st1 {v4.16b}, [\dst], \d_strd 2717 st1 {v6.16b}, [\ds2], \d_strd 2718.else 2719 st1 {v4.8h, v5.8h}, [\dst], \d_strd 2720 st1 {v6.8h, v7.8h}, [\ds2], \d_strd 2721.endif 2722 b.le 9f 2723 mov v16.16b, v18.16b 2724 b 2b 27259: 2726 subs \w, \w, #16 2727 b.le 0f 2728 asr \s_strd, \s_strd, #1 2729 asr \d_strd, \d_strd, #1 2730 msub \src, \s_strd, \xmy, \src 2731 msub \dst, \d_strd, \xmy, \dst 2732 sub \src, \src, \s_strd, lsl #1 2733 mov \h, \my 2734 add \src, \src, #16 2735.ifc \type, put 2736 add \dst, \dst, #16 2737.else 2738 add \dst, \dst, #32 2739.endif 2740 b 1b 27410: 2742 ret 2743 2744L(\type\()_bilin_v_tbl): 2745 .hword 
L(\type\()_bilin_v_tbl) - 1280b 2746 .hword L(\type\()_bilin_v_tbl) - 640b 2747 .hword L(\type\()_bilin_v_tbl) - 320b 2748 .hword L(\type\()_bilin_v_tbl) - 160b 2749 .hword L(\type\()_bilin_v_tbl) - 80b 2750 .hword L(\type\()_bilin_v_tbl) - 40b 2751 .hword L(\type\()_bilin_v_tbl) - 20b 2752 .hword 0 2753 2754L(\type\()_bilin_hv): 2755 uxtl v2.8h, v2.8b 2756 uxtl v3.8h, v3.8b 2757 adr x9, L(\type\()_bilin_hv_tbl) 2758 ldrh w8, [x9, x8, lsl #1] 2759 sub x9, x9, w8, uxtw 2760 br x9 2761 276220: // 2xN hv 2763.ifc \type, put 2764 add \sr2, \src, \s_strd 2765 add \ds2, \dst, \d_strd 2766 lsl \s_strd, \s_strd, #1 2767 lsl \d_strd, \d_strd, #1 2768 2769 ld1 {v28.s}[0], [\src], \s_strd 2770 ext v29.8b, v28.8b, v28.8b, #1 2771 umull v16.8h, v28.8b, v0.8b 2772 umlal v16.8h, v29.8b, v1.8b 2773 27742: 2775 ld1 {v28.s}[0], [\sr2], \s_strd 2776 ld1 {v30.s}[0], [\src], \s_strd 2777 ext v29.8b, v28.8b, v28.8b, #1 2778 ext v31.8b, v30.8b, v30.8b, #1 2779 trn1 v28.4h, v28.4h, v30.4h 2780 trn1 v29.4h, v29.4h, v31.4h 2781 umull v17.8h, v28.8b, v0.8b 2782 umlal v17.8h, v29.8b, v1.8b 2783 2784 trn1 v16.2s, v16.2s, v17.2s 2785 2786 mul v4.4h, v16.4h, v2.4h 2787 mla v4.4h, v17.4h, v3.4h 2788 uqrshrn v4.8b, v4.8h, #8 2789 subs \h, \h, #2 2790 st1 {v4.h}[0], [\dst], \d_strd 2791 st1 {v4.h}[1], [\ds2], \d_strd 2792 b.le 0f 2793 trn2 v16.2s, v17.2s, v17.2s 2794 b 2b 27950: 2796 ret 2797.endif 2798 279940: // 4xN hv 2800 add \sr2, \src, \s_strd 2801 add \ds2, \dst, \d_strd 2802 lsl \s_strd, \s_strd, #1 2803 lsl \d_strd, \d_strd, #1 2804 2805 ld1 {v28.8b}, [\src], \s_strd 2806 ext v29.8b, v28.8b, v28.8b, #1 2807 umull v16.8h, v28.8b, v0.8b 2808 umlal v16.8h, v29.8b, v1.8b 2809 28104: 2811 ld1 {v28.8b}, [\sr2], \s_strd 2812 ld1 {v30.8b}, [\src], \s_strd 2813 ext v29.8b, v28.8b, v28.8b, #1 2814 ext v31.8b, v30.8b, v30.8b, #1 2815 trn1 v28.2s, v28.2s, v30.2s 2816 trn1 v29.2s, v29.2s, v31.2s 2817 umull v17.8h, v28.8b, v0.8b 2818 umlal v17.8h, v29.8b, v1.8b 2819 2820 trn1 v16.2d, v16.2d, v17.2d 2821 
2822 mul v4.8h, v16.8h, v2.8h 2823 mla v4.8h, v17.8h, v3.8h 2824 subs \h, \h, #2 2825.ifc \type, put 2826 uqrshrn v4.8b, v4.8h, #8 2827 st1 {v4.s}[0], [\dst], \d_strd 2828 st1 {v4.s}[1], [\ds2], \d_strd 2829.else 2830 urshr v4.8h, v4.8h, #4 2831 st1 {v4.d}[0], [\dst], \d_strd 2832 st1 {v4.d}[1], [\ds2], \d_strd 2833.endif 2834 b.le 0f 2835 trn2 v16.2d, v17.2d, v17.2d 2836 b 4b 28370: 2838 ret 2839 284080: // 8xN, 16xN, ... hv 2841160: 2842320: 2843640: 28441280: 2845 mov \my, \h 2846 28471: 2848 add \sr2, \src, \s_strd 2849 add \ds2, \dst, \d_strd 2850 lsl \s_strd, \s_strd, #1 2851 lsl \d_strd, \d_strd, #1 2852 2853 ld1 {v28.16b}, [\src], \s_strd 2854 ext v29.16b, v28.16b, v28.16b, #1 2855 umull v16.8h, v28.8b, v0.8b 2856 umlal v16.8h, v29.8b, v1.8b 2857 28582: 2859 ld1 {v28.16b}, [\sr2], \s_strd 2860 ld1 {v30.16b}, [\src], \s_strd 2861 ext v29.16b, v28.16b, v28.16b, #1 2862 ext v31.16b, v30.16b, v30.16b, #1 2863 umull v17.8h, v28.8b, v0.8b 2864 umlal v17.8h, v29.8b, v1.8b 2865 umull v18.8h, v30.8b, v0.8b 2866 umlal v18.8h, v31.8b, v1.8b 2867 2868 mul v4.8h, v16.8h, v2.8h 2869 mla v4.8h, v17.8h, v3.8h 2870 mul v5.8h, v17.8h, v2.8h 2871 mla v5.8h, v18.8h, v3.8h 2872 subs \h, \h, #2 2873.ifc \type, put 2874 uqrshrn v4.8b, v4.8h, #8 2875 uqrshrn v5.8b, v5.8h, #8 2876 st1 {v4.8b}, [\dst], \d_strd 2877 st1 {v5.8b}, [\ds2], \d_strd 2878.else 2879 urshr v4.8h, v4.8h, #4 2880 urshr v5.8h, v5.8h, #4 2881 st1 {v4.8h}, [\dst], \d_strd 2882 st1 {v5.8h}, [\ds2], \d_strd 2883.endif 2884 b.le 9f 2885 mov v16.16b, v18.16b 2886 b 2b 28879: 2888 subs \w, \w, #8 2889 b.le 0f 2890 asr \s_strd, \s_strd, #1 2891 asr \d_strd, \d_strd, #1 2892 msub \src, \s_strd, \xmy, \src 2893 msub \dst, \d_strd, \xmy, \dst 2894 sub \src, \src, \s_strd, lsl #1 2895 mov \h, \my 2896 add \src, \src, #8 2897.ifc \type, put 2898 add \dst, \dst, #8 2899.else 2900 add \dst, \dst, #16 2901.endif 2902 b 1b 29030: 2904 ret 2905 2906L(\type\()_bilin_hv_tbl): 2907 .hword L(\type\()_bilin_hv_tbl) - 1280b 2908 
        .hword L(\type\()_bilin_hv_tbl) -  640b
        .hword L(\type\()_bilin_hv_tbl) -  320b
        .hword L(\type\()_bilin_hv_tbl) -  160b
        .hword L(\type\()_bilin_hv_tbl) -   80b
        .hword L(\type\()_bilin_hv_tbl) -   40b
        .hword L(\type\()_bilin_hv_tbl) -   20b
        .hword 0
endfunc
.endm

// Instantiate the whole filter family for both output interfaces:
// put (8-bit pixel output) and prep (16-bit intermediate output).
// The operand lists map each variant's argument registers onto the
// common parameter names used inside the macro body.
filter_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10
filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6

// Load one 8-byte filter row from the table at x11:
// row index = \src >> 10 (each entry is 8 bytes, hence sxtw #3),
// then step \src by \inc towards the next position. Clobbers w13.
.macro load_filter_row dst, src, inc
        asr             w13, \src, #10
        add             \src, \src, \inc
        ldr             \dst, [x11, w13, sxtw #3]
.endm

// Horizontal 8-tap filtering of one source row for warp_affine.
// In:  x2 = src (advanced by x3 on load), w5 = horizontal position
//      accumulator, w7 = per-pixel step, w8 = per-row step,
//      x11 = filter table base, v22 = #128 splat.
// Out: v0.8h = 8 horizontally filtered (not yet shifted) sums;
//      w5 advanced by w8.
// Clobbers v0-v7, v16-v20, w12, w13.
function warp_filter_horz_neon
        add             w12, w5, #512

        ld1             {v16.8b, v17.8b}, [x2], x3

        // One filter row per output pixel; w12 steps by w7 each time.
        load_filter_row d0, w12, w7
        load_filter_row d1, w12, w7
        load_filter_row d2, w12, w7
        load_filter_row d3, w12, w7
        load_filter_row d4, w12, w7
        load_filter_row d5, w12, w7
        load_filter_row d6, w12, w7
        // subtract by 128 to allow using smull
        eor             v16.8b, v16.8b, v22.8b
        eor             v17.8b, v17.8b, v22.8b
        load_filter_row d7, w12, w7

        // Multiply each 8-pixel window (shifted by 0..7) by its filter.
        ext             v18.8b, v16.8b, v17.8b, #1
        ext             v19.8b, v16.8b, v17.8b, #2
        smull           v0.8h,  v0.8b,  v16.8b
        smull           v1.8h,  v1.8b,  v18.8b
        ext             v18.8b, v16.8b, v17.8b, #3
        ext             v20.8b, v16.8b, v17.8b, #4
        smull           v2.8h,  v2.8b,  v19.8b
        smull           v3.8h,  v3.8b,  v18.8b
        ext             v18.8b, v16.8b, v17.8b, #5
        ext             v19.8b, v16.8b, v17.8b, #6
        smull           v4.8h,  v4.8b,  v20.8b
        smull           v5.8h,  v5.8b,  v18.8b
        ext             v18.8b, v16.8b, v17.8b, #7
        smull           v6.8h,  v6.8b,  v19.8b
        smull           v7.8h,  v7.8b,  v18.8b

        // Pairwise-add reduction tree: collapse the 8 partial products
        // per output pixel into one 16-bit sum per lane of v0.8h.
        addp            v0.8h,  v0.8h,  v1.8h
        addp            v2.8h,  v2.8h,  v3.8h
        addp            v4.8h,  v4.8h,  v5.8h
        addp            v6.8h,  v6.8h,  v7.8h

        addp            v0.8h,  v0.8h,  v2.8h
        addp            v4.8h,  v4.8h,  v6.8h

        addp            v0.8h,  v0.8h,  v4.8h

        add             w5,  w5,  w8    // advance position for the next row

        ret
endfunc

// void dav1d_warp_affine_8x8_8bpc_neon(
//         pixel *dst, const ptrdiff_t dst_stride,
//         const pixel *src, const ptrdiff_t src_stride,
//         const int16_t *const abcd, int mx, int my)
.macro warp t, shift
function warp_affine_8x8\t\()_8bpc_neon, export=1
        // Load all four packed 16-bit warp coefficients in one go and
        // unpack them: x7=abcd[0], x8=abcd[1], x9=abcd[2], x4=abcd[3].
        ldr             x4,  [x4]
        sbfx            x7,  x4, #0,  #16
        sbfx            x8,  x4, #16, #16
        sbfx            x9,  x4, #32, #16
        sbfx            x4,  x4, #48, #16
        mov             w10, #8                    // 8 output rows
        // Step src back 3 rows and 3 columns to centre the 8-tap window.
        sub             x2,  x2,  x3, lsl #1
        sub             x2,  x2,  x3
        sub             x2,  x2,  #3
        movrel          x11, X(mc_warp_filter), 64*8
        mov             x15, x30                   // save lr; bl below clobbers x30
.ifnb \t
        lsl             x1,  x1,  #1               // 16-bit output: double the stride
.endif

        movi            v22.8b,  #128              // bias pixels to signed range for smull
.ifb \t
        movi            v23.8h,  #128              // output offset, 8-bit variant
.else
        movi            v23.8h,  #8, lsl #8        // output offset (2048), 16-bit variant
.endif

        // Prime the vertical filter: horizontally filter the first 7
        // rows into the sliding history v24-v30 (rounded down by 3).
        bl              warp_filter_horz_neon
        srshr           v24.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v25.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v26.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v27.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v28.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v29.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v30.8h,  v0.8h,  #3

1:
        add             w14, w6,  #512
        bl              warp_filter_horz_neon
        srshr           v31.8h,  v0.8h,  #3        // 8th row completes the window

        // Fetch the 8 vertical filter rows (w14 stepped by abcd[2]),
        // then transpose + sign-extend them into per-tap vectors v0-v7.
        load_filter_row d0, w14, w9
        load_filter_row d1, w14, w9
        load_filter_row d2, w14, w9
        load_filter_row d3, w14, w9
        load_filter_row d4, w14, w9
        load_filter_row d5, w14, w9
        load_filter_row d6, w14, w9
        load_filter_row d7, w14, w9
        transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl

        // This ordering of smull/smlal/smull2/smlal2 is highly
        // beneficial for Cortex A53 here.
        // 8-tap vertical filter across the row history v24-v31,
        // accumulating into 32 bits (low halves in v16, high in v17).
        smull           v16.4s,  v24.4h, v0.4h
        smlal           v16.4s,  v25.4h, v1.4h
        smlal           v16.4s,  v26.4h, v2.4h
        smlal           v16.4s,  v27.4h, v3.4h
        smlal           v16.4s,  v28.4h, v4.4h
        smlal           v16.4s,  v29.4h, v5.4h
        smlal           v16.4s,  v30.4h, v6.4h
        smlal           v16.4s,  v31.4h, v7.4h
        smull2          v17.4s,  v24.8h, v0.8h
        smlal2          v17.4s,  v25.8h, v1.8h
        smlal2          v17.4s,  v26.8h, v2.8h
        smlal2          v17.4s,  v27.8h, v3.8h
        smlal2          v17.4s,  v28.8h, v4.8h
        smlal2          v17.4s,  v29.8h, v5.8h
        smlal2          v17.4s,  v30.8h, v6.8h
        smlal2          v17.4s,  v31.8h, v7.8h

        // Slide the row history up by one, interleaved with narrowing
        // the 32-bit sums back to 16 bits (rounded by \shift).
        mov             v24.16b, v25.16b
        mov             v25.16b, v26.16b
        sqrshrn         v16.4h,  v16.4s, #\shift
        mov             v26.16b, v27.16b
        sqrshrn2        v16.8h,  v17.4s, #\shift
        mov             v27.16b, v28.16b
        mov             v28.16b, v29.16b
        add             v16.8h,  v16.8h, v23.8h    // apply output offset
.ifb \t
        sqxtun          v16.8b,  v16.8h            // 8-bit variant: narrow to pixels
.endif
        mov             v29.16b, v30.16b
        mov             v30.16b, v31.16b
        subs            w10, w10, #1
.ifnb \t
        st1             {v16.8h}, [x0], x1
.else
        st1             {v16.8b}, [x0], x1
.endif

        add             w6,  w6,  w4               // advance by abcd[3] for next row
        b.gt            1b

        br              x15                        // return via the saved lr
endfunc
.endm

// Instantiate: 8-bit "put" output (shift 11) and 16-bit "t" output (shift 7).
warp  , 11
warp t, 7

// void dav1d_emu_edge_8bpc_neon(
//         const intptr_t bw, const intptr_t bh,
//         const intptr_t iw, const intptr_t ih,
//         const intptr_t x, const intptr_t y,
//         pixel *dst, const ptrdiff_t dst_stride,
//         const pixel *ref, const ptrdiff_t ref_stride)
function emu_edge_8bpc_neon, export=1
        ldp             x8,  x9,  [sp]             // ref, ref_stride (stack args)

        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
        // ref += iclip(x, 0, iw - 1)
        // (bic reg, reg, reg, asr #63 clamps a negative value to 0)
        sub             x12, x3,  #1               // ih - 1
        cmp             x5,  x3
        sub             x13, x2,  #1               // iw - 1
        csel            x12, x12, x5,  ge          // min(y, ih - 1)
        cmp             x4,  x2
        bic             x12, x12, x12, asr #63     // max(min(y, ih - 1), 0)
        csel            x13, x13, x4,  ge          // min(x, iw - 1)
        bic             x13, x13, x13, asr #63     // max(min(x, iw - 1), 0)
        madd            x8,  x12, x9,  x8          // ref += iclip() * stride
        add             x8,  x8,  x13              // ref += iclip()

        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
        //
// top_ext = iclip(-y, 0, bh - 1)
        add             x10, x5,  x1               // y + bh
        neg             x5,  x5                    // -y
        sub             x10, x10, x3               // y + bh - ih
        sub             x12, x1,  #1               // bh - 1
        cmp             x10, x1
        bic             x5,  x5,  x5,  asr #63     // max(-y, 0)
        csel            x10, x10, x12, lt          // min(y + bh - ih, bh-1)
        cmp             x5,  x1
        bic             x10, x10, x10, asr #63     // max(min(y + bh - ih, bh-1), 0)
        csel            x5,  x5,  x12, lt          // min(max(-y, 0), bh-1)

        // right_ext = iclip(x + bw - iw, 0, bw - 1)
        // left_ext = iclip(-x, 0, bw - 1)
        add             x11, x4,  x0               // x + bw
        neg             x4,  x4                    // -x
        sub             x11, x11, x2               // x + bw - iw
        sub             x13, x0,  #1               // bw - 1
        cmp             x11, x0
        bic             x4,  x4,  x4,  asr #63     // max(-x, 0)
        csel            x11, x11, x13, lt          // min(x + bw - iw, bw-1)
        cmp             x4,  x0
        bic             x11, x11, x11, asr #63     // max(min(x + bw - iw, bw-1), 0)
        csel            x4,  x4,  x13, lt          // min(max(-x, 0), bw - 1)

        // center_h = bh - top_ext - bottom_ext
        // dst += top_ext * PXSTRIDE(dst_stride)
        // center_w = bw - left_ext - right_ext
        sub             x1,  x1,  x5               // bh - top_ext
        madd            x6,  x5,  x7,  x6
        sub             x2,  x0,  x4               // bw - left_ext
        sub             x1,  x1,  x10              // center_h = bh - top_ext - bottom_ext
        sub             x2,  x2,  x11              // center_w = bw - left_ext - right_ext

        mov             x14, x6                    // backup of dst

// Copy center_h source rows, optionally replicating the first/last
// source pixel of each row across the left_ext/right_ext margins.
// Compiled in four variants so the per-row branches disappear.
.macro v_loop need_left, need_right
0:
.if \need_left
        // Splat the row's first source pixel across the left extension.
        ld1r            {v0.16b}, [x8]
        mov             x12, x6                    // out = dst
        mov             x3,  x4
1:
        subs            x3,  x3,  #16
        st1             {v0.16b}, [x12], #16
        b.gt            1b
.endif
        // Copy center_w pixels of real source data, 32 at a time.
        mov             x13, x8
        add             x12, x6,  x4               // out = dst + left_ext
        mov             x3,  x2
1:
        ld1             {v0.16b, v1.16b}, [x13], #32
        subs            x3,  x3,  #32
        st1             {v0.16b, v1.16b}, [x12], #32
        b.gt            1b
.if \need_right
        // Splat the row's last source pixel across the right extension.
        add             x3,  x8,  x2               // in + center_w
        sub             x3,  x3,  #1               // in + center_w - 1
        add             x12, x6,  x4               // dst + left_ext
        ld1r            {v0.16b}, [x3]
        add             x12, x12, x2               // out = dst + left_ext + center_w
        mov             x3,  x11
1:
        subs            x3,  x3,  #16
        st1             {v0.16b}, [x12], #16
        b.gt            1b
.endif

        subs            x1,  x1,  #1               // center_h--
        add             x6,  x6,  x7               // dst += dst_stride
        add             x8,  x8,  x9               // ref += ref_stride
        b.gt            0b
.endm

        // Dispatch to the loop variant matching which horizontal
        // extensions are needed (left_ext != 0, right_ext != 0).
        cbz             x4,  2f
        // need_left
        cbz             x11, 3f
        // need_left + need_right
        v_loop          1,   1
        b               5f

2:
        // !need_left
        cbz             x11, 4f
        // !need_left + need_right
        v_loop          0,   1
        b               5f

3:
        // need_left + !need_right
        v_loop          1,   0
        b               5f

4:
        // !need_left + !need_right
        v_loop          0,   0

5:

        cbz             x10, 3f
        // need_bottom
        // Replicate the last written row downwards bottom_ext times,
        // processed in 32-pixel-wide column strips.
        sub             x8,  x6,  x7               // ref = dst - stride
        mov             x4,  x0
1:
        ld1             {v0.16b, v1.16b}, [x8], #32
        mov             x3,  x10
2:
        subs            x3,  x3,  #1
        st1             {v0.16b, v1.16b}, [x6],  x7
        b.gt            2b
        msub            x6,  x7,  x10, x6          // dst -= bottom_ext * stride
        subs            x4,  x4,  #32              // bw -= 32
        add             x6,  x6,  #32              // dst += 32
        b.gt            1b

3:
        cbz             x5,  3f
        // need_top
        // Replicate the first center row upwards top_ext times,
        // again in 32-pixel-wide column strips (x14 = saved dst).
        msub            x6,  x7,  x5,  x14         // dst = stored_dst - top_ext * stride
1:
        ld1             {v0.16b, v1.16b}, [x14], #32
        mov             x3,  x5
2:
        subs            x3,  x3,  #1
        st1             {v0.16b, v1.16b}, [x6],  x7
        b.gt            2b
        msub            x6,  x7,  x5,  x6          // dst -= top_ext * stride
        subs            x0,  x0,  #32              // bw -= 32
        add             x6,  x6,  #32              // dst += 32
        b.gt            1b

3:
        ret
endfunc