1/* 2 * Copyright © 2018, VideoLAN and dav1d authors 3 * Copyright © 2018, Janne Grunau 4 * Copyright © 2018, Martin Storsjo 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, this 11 * list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 21 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

#include "src/arm/asm.S"
#include "util.S"

// Plain bidirectional average: reads 16 int16 intermediates from each of
// x2 (tmp1) and x3 (tmp2), adds them and narrows to 16 rounded bytes in \dst.
// The #5 rounding shift folds the two 4-bit intermediate scalings plus the
// averaging /2 into one saturating narrow.
.macro avg dst, t0, t1, t2, t3
        ld1             {\t0\().8h,\t1\().8h}, [x2], 32
        ld1             {\t2\().8h,\t3\().8h}, [x3], 32
        add             \t0\().8h, \t0\().8h, \t2\().8h
        add             \t1\().8h, \t1\().8h, \t3\().8h
        sqrshrun        \dst\().8b,  \t0\().8h, #5
        sqrshrun2       \dst\().16b, \t1\().8h, #5
.endm

// Weighted average: v30.8h holds -weight << 11 (set up in bidir_fn), so
// sqdmulh((tmp2-tmp1) << 0, v30) implements the fractional weighting and
// tmp2 + weighted difference is then narrowed with a #4 rounding shift.
.macro w_avg dst, t0, t1, t2, t3
        ld1             {\t0\().8h,\t1\().8h}, [x2], 32
        ld1             {\t2\().8h,\t3\().8h}, [x3], 32
        sub             \t0\().8h, \t2\().8h, \t0\().8h
        sub             \t1\().8h, \t3\().8h, \t1\().8h
        sqdmulh         \t0\().8h, \t0\().8h, v30.8h
        sqdmulh         \t1\().8h, \t1\().8h, v30.8h
        add             \t0\().8h, \t2\().8h, \t0\().8h
        add             \t1\().8h, \t3\().8h, \t1\().8h
        sqrshrun        \dst\().8b,  \t0\().8h, #4
        sqrshrun2       \dst\().16b, \t1\().8h, #4
.endm

// Per-pixel masked average: 16 mask bytes are read from x6, scaled by
// v31 (#254, set up in bidir_fn) and widened via shll #8 so that sqdmulh
// applies mask/64 to the (tmp2-tmp1) difference, same structure as w_avg.
.macro mask dst, t0, t1, t2, t3
        ld1             {v30.16b}, [x6], 16
        ld1             {\t0\().8h,\t1\().8h}, [x2], 32
        mul             v30.16b, v30.16b, v31.16b
        ld1             {\t2\().8h,\t3\().8h}, [x3], 32
        shll            v28.8h, v30.8b,  #8
        shll2           v29.8h, v30.16b, #8
        sub             \t0\().8h, \t2\().8h, \t0\().8h
        sub             \t1\().8h, \t3\().8h, \t1\().8h
        sqdmulh         \t0\().8h, \t0\().8h, v28.8h
        sqdmulh         \t1\().8h, \t1\().8h, v29.8h
        add             \t0\().8h, \t2\().8h, \t0\().8h
        add             \t1\().8h, \t3\().8h, \t1\().8h
        sqrshrun        \dst\().8b,  \t0\().8h, #4
        sqrshrun2       \dst\().16b, \t1\().8h, #4
.endm

// Emits one of avg/w_avg/mask as an exported function.
// Arguments (inferred from register use): x0 = dst, x1 = dst stride,
// x2 = tmp1, x3 = tmp2, w4 = width, w5 = height; w6 = weight (w_avg) or
// x6 = mask pointer (mask). Dispatches on log2(width) via a local .hword
// offset table; each \type invocation produces 16 output pixels.
.macro bidir_fn type
function \type\()_8bpc_neon, export=1
        clz             w4,  w4
.ifc \type, w_avg
        // v30.8h = -weight << 11 (constant for the whole call)
        dup             v30.8h, w6
        neg             v30.8h, v30.8h
        shl             v30.8h, v30.8h, #11
.endif
.ifc \type, mask
        movi            v31.16b, #256-2         // mask scale factor (254)
.endif
        adr             x7,  L(\type\()_tbl)
        sub             w4,  w4,  #24           // index = clz(w) - 24
        ldrh            w4,  [x7, x4, lsl #1]
        \type           v4,  v0,  v1,  v2,  v3  // first 16 pixels up front
        sub             x7,  x7,  w4, uxtw
        br              x7
40:     // width 4: two rows interleaved over x0/x7
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
4:
        cmp             w5,  #4
        st1             {v4.s}[0], [x0], x1
        st1             {v4.s}[1], [x7], x1
        st1             {v4.s}[2], [x0], x1
        st1             {v4.s}[3], [x7], x1
        b.eq            0f
        \type           v5,  v0,  v1,  v2,  v3
        cmp             w5,  #8
        st1             {v5.s}[0], [x0], x1
        st1             {v5.s}[1], [x7], x1
        st1             {v5.s}[2], [x0], x1
        st1             {v5.s}[3], [x7], x1
        b.eq            0f
        // height 16: produce and store the remaining 8 rows
        \type           v4,  v0,  v1,  v2,  v3
        st1             {v4.s}[0], [x0], x1
        st1             {v4.s}[1], [x7], x1
        \type           v5,  v0,  v1,  v2,  v3
        st1             {v4.s}[2], [x0], x1
        st1             {v4.s}[3], [x7], x1
        st1             {v5.s}[0], [x0], x1
        st1             {v5.s}[1], [x7], x1
        st1             {v5.s}[2], [x0], x1
        st1             {v5.s}[3], [x7], x1
        ret
80:     // width 8: 16 pixels = 2 rows per \type invocation
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
8:
        st1             {v4.d}[0], [x0], x1
        \type           v5,  v0,  v1,  v2,  v3
        st1             {v4.d}[1], [x7], x1
        st1             {v5.d}[0], [x0], x1
        subs            w5,  w5,  #4
        st1             {v5.d}[1], [x7], x1
        b.le            0f
        \type           v4,  v0,  v1,  v2,  v3
        b               8b
16:     // width 16: one row per invocation, 4 rows per loop iteration
        \type           v5,  v0,  v1,  v2,  v3
        st1             {v4.16b}, [x0], x1
        \type           v6,  v0,  v1,  v2,  v3
        st1             {v5.16b}, [x0], x1
        \type           v7,  v0,  v1,  v2,  v3
        st1             {v6.16b}, [x0], x1
        subs            w5,  w5,  #4
        st1             {v7.16b}, [x0], x1
        b.le            0f
        \type           v4,  v0,  v1,  v2,  v3
        b               16b
320:    // width 32: two rows per iteration via x0/x7
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
32:
        \type           v5,  v0,  v1,  v2,  v3
        \type           v6,  v0,  v1,  v2,  v3
        st1             {v4.16b,v5.16b}, [x0], x1
        \type           v7,  v0,  v1,  v2,  v3
        subs            w5,  w5,  #2
        st1             {v6.16b,v7.16b}, [x7], x1
        b.le            0f
        \type           v4,  v0,  v1,  v2,  v3
        b               32b
640:    // width 64: two rows per iteration, 64 bytes per store
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
64:
        \type           v5,  v0,  v1,  v2,  v3
        \type           v6,  v0,  v1,  v2,  v3
        \type           v7,  v0,  v1,  v2,  v3
        \type           v16, v0,  v1,  v2,  v3
        \type           v17, v0,  v1,  v2,  v3
        st1             {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
        \type           v18, v0,  v1,  v2,  v3
        \type           v19, v0,  v1,  v2,  v3
        subs            w5,  w5,  #2
        st1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
        b.le            0f
        \type           v4,  v0,  v1,  v2,  v3
        b               64b
1280:   // width 128: one row split into two 64-byte halves (x0 / x0+64)
        add             x7,  x0,  #64
128:
        \type           v5,  v0,  v1,  v2,  v3
        \type           v6,  v0,  v1,  v2,  v3
        \type           v7,  v0,  v1,  v2,  v3
        \type           v16, v0,  v1,  v2,  v3
        \type           v17, v0,  v1,  v2,  v3
        st1             {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
        \type           v18, v0,  v1,  v2,  v3
        \type           v19, v0,  v1,  v2,  v3
        subs            w5,  w5,  #1
        st1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
        b.le            0f
        \type           v4,  v0,  v1,  v2,  v3
        b               128b
0:
        ret
L(\type\()_tbl):
        // Offsets back from the table label; indexed by clz(w)-24,
        // i.e. largest width first.
        .hword L(\type\()_tbl) - 1280b
        .hword L(\type\()_tbl) -  640b
        .hword L(\type\()_tbl) -  320b
        .hword L(\type\()_tbl) -   16b
        .hword L(\type\()_tbl) -   80b
        .hword L(\type\()_tbl) -   40b
endfunc
.endm

bidir_fn avg
bidir_fn w_avg
bidir_fn mask


// w_mask blend with mask output. Registers (inferred from use):
// x0 = dst, x1 = dst stride, x2 = tmp1, x3 = tmp2, w4 = width, w5 = height,
// x6 = mask output pointer, w7 = sign (422/420 variants only).
// \type selects the mask subsampling (444 = full res, 422 = horizontal,
// 420 = horizontal+vertical); the mask value per pixel is derived from
// |tmp1-tmp2| as ((6903 -sat |d|) >> 8) and applied via sqdmulh.
.macro w_mask_fn type
function w_mask_\type\()_8bpc_neon, export=1
        clz             w8,  w4
        adr             x9,  L(w_mask_\type\()_tbl)
        sub             w8,  w8,  #24
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        mov             w10, #6903              // mask bias constant
        dup             v0.8h,  w10
.if \type == 444
        movi            v1.16b, #64             // mask is stored as 64 - m
.elseif \type == 422
        dup             v2.8b,  w7
        movi            v3.8b,  #129
        sub             v3.8b,  v3.8b,  v2.8b   // 129 - sign
.elseif \type == 420
        dup             v2.8h,  w7
        movi            v3.8h,  #1, lsl #8      // 256
        sub             v3.8h,  v3.8h,  v2.8h   // 256 - sign
.endif
        add             x12, x0,  x1
        lsl             x1,  x1,  #1            // two dst rows per pass
        br              x9
4:
        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
        ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
        subs            w5,  w5,  #4
        sub             v16.8h, v6.8h,  v4.8h
        sub             v17.8h, v7.8h,  v5.8h
        sabd            v18.8h, v4.8h,  v6.8h
        sabd            v19.8h, v5.8h,  v7.8h
        uqsub           v18.8h, v0.8h,  v18.8h  // saturating 6903 - |d|
        uqsub           v19.8h, v0.8h,  v19.8h
        ushr            v18.8h, v18.8h, #8      // mask m in 0..26
        ushr            v19.8h, v19.8h, #8
        shl             v20.8h, v18.8h, #9      // scale for sqdmulh (m/64)
        shl             v21.8h, v19.8h, #9
        sqdmulh         v20.8h, v20.8h, v16.8h
        sqdmulh         v21.8h, v21.8h, v17.8h
        add             v20.8h, v20.8h, v4.8h
        add             v21.8h, v21.8h, v5.8h
        sqrshrun        v22.8b, v20.8h, #4
        sqrshrun        v23.8b, v21.8h, #4
.if \type == 444
        xtn             v18.8b,  v18.8h
        xtn2            v18.16b, v19.8h
        sub             v18.16b, v1.16b, v18.16b
        st1             {v18.16b}, [x6], #16
.elseif \type == 422
        addp            v18.8h, v18.8h, v19.8h  // sum horizontal pairs
        xtn             v18.8b, v18.8h
        uhsub           v18.8b, v3.8b,  v18.8b  // (bias - sum) / 2
        st1             {v18.8b}, [x6], #8
.elseif \type == 420
        // sum 2x2 blocks: rows are packed two per q register here
        trn1            v24.2d, v18.2d, v19.2d
        trn2            v25.2d, v18.2d, v19.2d
        add             v24.8h, v24.8h, v25.8h
        addp            v18.8h, v24.8h, v24.8h
        sub             v18.4h, v3.4h,  v18.4h
        rshrn           v18.8b, v18.8h, #2
        st1             {v18.s}[0], [x6], #4
.endif
        st1             {v22.s}[0], [x0],  x1
        st1             {v22.s}[1], [x12], x1
        st1             {v23.s}[0], [x0],  x1
        st1             {v23.s}[1], [x12], x1
        b.gt            4b
        ret
8:
        ld1             {v4.8h, v5.8h}, [x2], #32
        ld1             {v6.8h, v7.8h}, [x3], #32
        subs            w5,  w5,  #2
        sub             v16.8h, v6.8h,  v4.8h
        sub             v17.8h, v7.8h,  v5.8h
        sabd            v18.8h, v4.8h,  v6.8h
        sabd            v19.8h, v5.8h,  v7.8h
        uqsub           v18.8h, v0.8h,  v18.8h
        uqsub           v19.8h, v0.8h,  v19.8h
        ushr            v18.8h, v18.8h, #8
        ushr            v19.8h, v19.8h, #8
        shl             v20.8h, v18.8h, #9
        shl             v21.8h, v19.8h, #9
        sqdmulh         v20.8h, v20.8h, v16.8h
        sqdmulh         v21.8h, v21.8h, v17.8h
        add             v20.8h, v20.8h, v4.8h
        add             v21.8h, v21.8h, v5.8h
        sqrshrun        v22.8b, v20.8h, #4
        sqrshrun        v23.8b, v21.8h, #4
.if \type == 444
        xtn             v18.8b,  v18.8h
        xtn2            v18.16b, v19.8h
        sub             v18.16b, v1.16b, v18.16b
        st1             {v18.16b}, [x6], #16
.elseif \type == 422
        addp            v18.8h, v18.8h, v19.8h
        xtn             v18.8b, v18.8h
        uhsub           v18.8b, v3.8b,  v18.8b
        st1             {v18.8b}, [x6], #8
.elseif \type == 420
        add             v18.8h, v18.8h, v19.8h  // v18/v19 are one row each
        addp            v18.8h, v18.8h, v18.8h
        sub             v18.4h, v3.4h,  v18.4h
        rshrn           v18.8b, v18.8h, #2
        st1             {v18.s}[0], [x6], #4
.endif
        st1             {v22.8b}, [x0],  x1
        st1             {v23.8b}, [x12], x1
        b.gt            8b
        ret
1280:
640:
320:
160:    // widths >= 16 share one inner loop; x7/x9 read the second row
        mov             w11, w4
        sub             x1,  x1,  w4,  uxtw     // stride minus bytes written
.if \type == 444
        add             x10, x6,  w4,  uxtw     // second mask row pointer
.elseif \type == 422
        add             x10, x6,  x11, lsr #1
.endif
        add             x9,  x3,  w4,  uxtw #1  // tmp2, next row
        add             x7,  x2,  w4,  uxtw #1  // tmp1, next row
161:    // per-two-row loop
        mov             w8,  w4
16:     // per-16-pixel loop
        ld1             {v4.8h,  v5.8h},  [x2], #32
        ld1             {v6.8h,  v7.8h},  [x3], #32
        ld1             {v16.8h, v17.8h}, [x7], #32
        ld1             {v18.8h, v19.8h}, [x9], #32
        subs            w8,  w8,  #16
        sub             v6.8h,  v6.8h,  v4.8h
        sub             v7.8h,  v7.8h,  v5.8h
        sub             v18.8h, v18.8h, v16.8h
        sub             v19.8h, v19.8h, v17.8h
        abs             v20.8h, v6.8h
        abs             v21.8h, v7.8h
        abs             v22.8h, v18.8h
        abs             v23.8h, v19.8h
        uqsub           v20.8h, v0.8h,  v20.8h
        uqsub           v21.8h, v0.8h,  v21.8h
        uqsub           v22.8h, v0.8h,  v22.8h
        uqsub           v23.8h, v0.8h,  v23.8h
        ushr            v20.8h, v20.8h, #8
        ushr            v21.8h, v21.8h, #8
        ushr            v22.8h, v22.8h, #8
        ushr            v23.8h, v23.8h, #8
        shl             v24.8h, v20.8h, #9
        shl             v25.8h, v21.8h, #9
        shl             v26.8h, v22.8h, #9
        shl             v27.8h, v23.8h, #9
        sqdmulh         v24.8h, v24.8h, v6.8h
        sqdmulh         v25.8h, v25.8h, v7.8h
        sqdmulh         v26.8h, v26.8h, v18.8h
        sqdmulh         v27.8h, v27.8h, v19.8h
        add             v24.8h, v24.8h, v4.8h
        add             v25.8h, v25.8h, v5.8h
        add             v26.8h, v26.8h, v16.8h
        add             v27.8h, v27.8h, v17.8h
        sqrshrun        v24.8b, v24.8h, #4
        sqrshrun        v25.8b, v25.8h, #4
        sqrshrun        v26.8b, v26.8h, #4
        sqrshrun        v27.8b, v27.8h, #4
.if \type == 444
        xtn             v20.8b,  v20.8h
        xtn2            v20.16b, v21.8h
        xtn             v21.8b,  v22.8h
        xtn2            v21.16b, v23.8h
        sub             v20.16b, v1.16b, v20.16b
        sub             v21.16b, v1.16b, v21.16b
        st1             {v20.16b}, [x6],  #16
        st1             {v21.16b}, [x10], #16
.elseif \type == 422
        addp            v20.8h, v20.8h, v21.8h
        addp            v21.8h, v22.8h, v23.8h
        xtn             v20.8b, v20.8h
        xtn             v21.8b, v21.8h
        uhsub           v20.8b, v3.8b,  v20.8b
        uhsub           v21.8b, v3.8b,  v21.8b
        st1             {v20.8b}, [x6],  #8
        st1             {v21.8b}, [x10], #8
.elseif \type == 420
        add             v20.8h, v20.8h, v22.8h  // sum the two rows
        add             v21.8h, v21.8h, v23.8h
        addp            v20.8h, v20.8h, v21.8h  // and horizontal pairs
        sub             v20.8h, v3.8h,  v20.8h
        rshrn           v20.8b, v20.8h, #2
        st1             {v20.8b}, [x6], #8
.endif
        st1             {v24.8b, v25.8b}, [x0],  #16
        st1             {v26.8b, v27.8b}, [x12], #16
        b.gt            16b
        subs            w5,  w5,  #2
        // skip the row each pointer's partner already consumed
        add             x2,  x2,  w4,  uxtw #1
        add             x3,  x3,  w4,  uxtw #1
        add             x7,  x7,  w4,  uxtw #1
        add             x9,  x9,  w4,  uxtw #1
.if \type == 444
        add             x6,  x6,  w4,  uxtw
        add             x10, x10, w4,  uxtw
.elseif \type == 422
        add             x6,  x6,  x11, lsr #1
        add             x10, x10, x11, lsr #1
.endif
        add             x0,  x0,  x1
        add             x12, x12, x1
        b.gt            161b
        ret
L(w_mask_\type\()_tbl):
        .hword L(w_mask_\type\()_tbl) - 1280b
        .hword L(w_mask_\type\()_tbl) -  640b
        .hword L(w_mask_\type\()_tbl) -  320b
        .hword L(w_mask_\type\()_tbl) -  160b
        .hword L(w_mask_\type\()_tbl) -    8b
        .hword L(w_mask_\type\()_tbl) -    4b
endfunc
.endm

w_mask_fn 444
w_mask_fn 422
w_mask_fn 420


// blend: dst = (tmp*m + dst*(64-m) + 32) >> 6 with a per-pixel mask.
// Registers (inferred from use): x0 = dst, x1 = dst stride, x2 = tmp,
// w3 = width, w4 = height, x5 = mask. Two rows per iteration (x0/x8).
function blend_8bpc_neon, export=1
        adr             x6,  L(blend_tbl)
        clz             w3,  w3
        sub             w3,  w3,  #26           // index = clz(w)-26, w in 4..32
        ldrh            w3,  [x6, x3, lsl #1]
        sub             x6,  x6,  w3, uxtw
        movi            v4.16b, #64             // weight total
        add             x8,  x0,  x1
        lsl             x1,  x1,  #1
        br              x6
4:
        ld1             {v2.8b},   [x5], #8
        ld1             {v1.d}[0], [x2], #8
        ld1             {v0.s}[0], [x0]
        subs            w4,  w4,  #2
        ld1             {v0.s}[1], [x8]
        sub             v3.8b, v4.8b, v2.8b     // 64 - m
        umull           v5.8h, v1.8b, v2.8b
        umlal           v5.8h, v0.8b, v3.8b
        rshrn           v6.8b, v5.8h, #6
        st1             {v6.s}[0], [x0], x1
        st1             {v6.s}[1], [x8], x1
        b.gt            4b
        ret
8:
        ld1             {v2.16b},  [x5], #16
        ld1             {v1.16b},  [x2], #16
        ld1             {v0.d}[0], [x0]
        ld1             {v0.d}[1], [x8]
        sub             v3.16b, v4.16b, v2.16b
        subs            w4,  w4,  #2
        umull           v5.8h,  v1.8b,  v2.8b
        umlal           v5.8h,  v0.8b,  v3.8b
        umull2          v6.8h,  v1.16b, v2.16b
        umlal2          v6.8h,  v0.16b, v3.16b
        rshrn           v7.8b,  v5.8h,  #6
        rshrn2          v7.16b, v6.8h,  #6
        st1             {v7.d}[0], [x0], x1
        st1             {v7.d}[1], [x8], x1
        b.gt            8b
        ret
16:
        ld1             {v1.16b, v2.16b}, [x5], #32
        ld1             {v5.16b, v6.16b}, [x2], #32
        ld1             {v0.16b}, [x0]
        subs            w4,  w4,  #2
        sub             v7.16b,  v4.16b, v1.16b
        sub             v20.16b, v4.16b, v2.16b
        ld1             {v3.16b}, [x8]
        umull           v16.8h,  v5.8b,   v1.8b
        umlal           v16.8h,  v0.8b,   v7.8b
        umull2          v17.8h,  v5.16b,  v1.16b
        umlal2          v17.8h,  v0.16b,  v7.16b
        umull           v21.8h,  v6.8b,   v2.8b
        umlal           v21.8h,  v3.8b,   v20.8b
        umull2          v22.8h,  v6.16b,  v2.16b
        umlal2          v22.8h,  v3.16b,  v20.16b
        rshrn           v18.8b,  v16.8h,  #6
        rshrn2          v18.16b, v17.8h,  #6
        rshrn           v19.8b,  v21.8h,  #6
        rshrn2          v19.16b, v22.8h,  #6
        st1             {v18.16b}, [x0], x1
        st1             {v19.16b}, [x8], x1
        b.gt            16b
        ret
32:
        ld1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x5], #64
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
        ld1             {v20.16b, v21.16b}, [x0]
        subs            w4,  w4,  #2
        ld1             {v22.16b, v23.16b}, [x8]
        sub             v5.16b,  v4.16b, v0.16b
        sub             v6.16b,  v4.16b, v1.16b
        sub             v30.16b, v4.16b, v2.16b
        sub             v31.16b, v4.16b, v3.16b
        umull           v24.8h,  v16.8b,  v0.8b
        umlal           v24.8h,  v20.8b,  v5.8b
        umull2          v26.8h,  v16.16b, v0.16b
        umlal2          v26.8h,  v20.16b, v5.16b
        umull           v28.8h,  v17.8b,  v1.8b
        umlal           v28.8h,  v21.8b,  v6.8b
        umull2          v7.8h,   v17.16b, v1.16b
        umlal2          v7.8h,   v21.16b, v6.16b
        umull           v27.8h,  v18.8b,  v2.8b
        umlal           v27.8h,  v22.8b,  v30.8b
        umull2          v1.8h,   v18.16b, v2.16b
        umlal2          v1.8h,   v22.16b, v30.16b
        umull           v29.8h,  v19.8b,  v3.8b
        umlal           v29.8h,  v23.8b,  v31.8b
        umull2          v21.8h,  v19.16b, v3.16b
        umlal2          v21.8h,  v23.16b, v31.16b
        rshrn           v24.8b,  v24.8h,  #6
        rshrn2          v24.16b, v26.8h,  #6
        rshrn           v25.8b,  v28.8h,  #6
        rshrn2          v25.16b, v7.8h,   #6
        rshrn           v27.8b,  v27.8h,  #6
        rshrn2          v27.16b, v1.8h,   #6
        rshrn           v28.8b,  v29.8h,  #6
        rshrn2          v28.16b, v21.8h,  #6
        st1             {v24.16b, v25.16b}, [x0], x1
        st1             {v27.16b, v28.16b}, [x8], x1
        b.gt            32b
        ret
L(blend_tbl):
        .hword L(blend_tbl) - 32b
        .hword L(blend_tbl) - 16b
        .hword L(blend_tbl) -  8b
        .hword L(blend_tbl) -  4b
endfunc

// blend_h: OBMC blend from above; the mask is one value per row, taken
// from obmc_masks[] indexed by height, and only the top 3/4 of the rows
// are blended (w4 = h*3/4 below). x0 = dst, x1 = stride, x2 = tmp,
// w3 = width, w4 = height.
function blend_h_8bpc_neon, export=1
        adr             x6,  L(blend_h_tbl)
        movrel          x5,  X(obmc_masks)
        add             x5,  x5,  w4, uxtw      // mask row for this height
        sub             w4,  w4,  w4, lsr #2    // process h - h/4 rows
        clz             w7,  w3
        movi            v4.16b, #64
        add             x8,  x0,  x1
        lsl             x1,  x1,  #1
        sub             w7,  w7,  #24
        ldrh            w7,  [x6, x7, lsl #1]
        sub             x6,  x6,  w7, uxtw
        br              x6
2:
        ld1             {v0.h}[0], [x5], #2     // two row masks
        ld1             {v1.s}[0], [x2], #4
        subs            w4,  w4,  #2
        ld1             {v2.h}[0], [x0]
        zip1            v0.8b, v0.8b, v0.8b     // duplicate mask per pixel
        sub             v3.8b, v4.8b, v0.8b
        ld1             {v2.h}[1], [x8]
        umull           v5.8h, v1.8b, v0.8b
        umlal           v5.8h, v2.8b, v3.8b
        rshrn           v5.8b, v5.8h, #6
        st1             {v5.h}[0], [x0], x1
        st1             {v5.h}[1], [x8], x1
        b.gt            2b
        ret
4:
        ld2r            {v0.8b, v1.8b}, [x5], #2
        ld1             {v2.8b},   [x2], #8
        subs            w4,  w4,  #2
        ext             v0.8b, v0.8b, v1.8b, #4 // row0 mask | row1 mask
        ld1             {v3.s}[0], [x0]
        sub             v5.8b, v4.8b, v0.8b
        ld1             {v3.s}[1], [x8]
        umull           v6.8h, v2.8b, v0.8b
        umlal           v6.8h, v3.8b, v5.8b
        rshrn           v6.8b, v6.8h, #6
        st1             {v6.s}[0], [x0], x1
        st1             {v6.s}[1], [x8], x1
        b.gt            4b
        ret
8:
        ld2r            {v0.16b, v1.16b}, [x5], #2
        ld1             {v2.16b},  [x2], #16
        ld1             {v3.d}[0], [x0]
        ext             v0.16b, v0.16b, v1.16b, #8
        sub             v5.16b, v4.16b, v0.16b
        ld1             {v3.d}[1], [x8]
        subs            w4,  w4,  #2
        umull           v6.8h,   v0.8b,  v2.8b
        umlal           v6.8h,   v3.8b,  v5.8b
        umull2          v7.8h,   v0.16b, v2.16b
        umlal2          v7.8h,   v3.16b, v5.16b
        rshrn           v16.8b,  v6.8h,  #6
        rshrn2          v16.16b, v7.8h,  #6
        st1             {v16.d}[0], [x0], x1
        st1             {v16.d}[1], [x8], x1
        b.gt            8b
        ret
16:
        ld2r            {v0.16b, v1.16b}, [x5], #2
        ld1             {v2.16b, v3.16b}, [x2], #32
        ld1             {v5.16b}, [x0]
        sub             v7.16b,  v4.16b, v0.16b
        sub             v16.16b, v4.16b, v1.16b
        ld1             {v6.16b}, [x8]
        subs            w4,  w4,  #2
        umull           v17.8h,  v0.8b,   v2.8b
        umlal           v17.8h,  v5.8b,   v7.8b
        umull2          v18.8h,  v0.16b,  v2.16b
        umlal2          v18.8h,  v5.16b,  v7.16b
        umull           v19.8h,  v1.8b,   v3.8b
        umlal           v19.8h,  v6.8b,   v16.8b
        umull2          v20.8h,  v1.16b,  v3.16b
        umlal2          v20.8h,  v6.16b,  v16.16b
        rshrn           v21.8b,  v17.8h,  #6
        rshrn2          v21.16b, v18.8h,  #6
        rshrn           v22.8b,  v19.8h,  #6
        rshrn2          v22.16b, v20.8h,  #6
        st1             {v21.16b}, [x0], x1
        st1             {v22.16b}, [x8], x1
        b.gt            16b
        ret
1280:
640:
320:    // wide cases: inner loop over 32-pixel chunks, two rows at a time
        sub             x1,  x1,  w3, uxtw      // stride minus width written
        add             x7,  x2,  w3, uxtw      // tmp pointer for second row
321:
        ld2r            {v0.16b, v1.16b}, [x5], #2
        mov             w6,  w3
        sub             v20.16b, v4.16b, v0.16b
        sub             v21.16b, v4.16b, v1.16b
32:
        ld1             {v16.16b, v17.16b}, [x2], #32
        ld1             {v2.16b,  v3.16b},  [x0]
        subs            w6,  w6,  #32
        umull           v23.8h,  v0.8b,   v16.8b
        umlal           v23.8h,  v2.8b,   v20.8b
        ld1             {v18.16b, v19.16b}, [x7], #32
        umull2          v27.8h,  v0.16b,  v16.16b
        umlal2          v27.8h,  v2.16b,  v20.16b
        ld1             {v6.16b,  v7.16b},  [x8]
        umull           v24.8h,  v0.8b,   v17.8b
        umlal           v24.8h,  v3.8b,   v20.8b
        umull2          v28.8h,  v0.16b,  v17.16b
        umlal2          v28.8h,  v3.16b,  v20.16b
        umull           v25.8h,  v1.8b,   v18.8b
        umlal           v25.8h,  v6.8b,   v21.8b
        umull2          v5.8h,   v1.16b,  v18.16b
        umlal2          v5.8h,   v6.16b,  v21.16b
        rshrn           v29.8b,  v23.8h,  #6
        rshrn2          v29.16b, v27.8h,  #6
        umull           v26.8h,  v1.8b,   v19.8b
        umlal           v26.8h,  v7.8b,   v21.8b
        umull2          v31.8h,  v1.16b,  v19.16b
        umlal2          v31.8h,  v7.16b,  v21.16b
        rshrn           v30.8b,  v24.8h,  #6
        rshrn2          v30.16b, v28.8h,  #6
        rshrn           v23.8b,  v25.8h,  #6
        rshrn2          v23.16b, v5.8h,   #6
        rshrn           v24.8b,  v26.8h,  #6
        st1             {v29.16b, v30.16b}, [x0], #32
        rshrn2          v24.16b, v31.8h,  #6
        st1             {v23.16b, v24.16b}, [x8], #32
        b.gt            32b
        subs            w4,  w4,  #2
        add             x0,  x0,  x1
        add             x8,  x8,  x1
        add             x2,  x2,  w3, uxtw      // skip the other row
        add             x7,  x7,  w3, uxtw
        b.gt            321b
        ret
L(blend_h_tbl):
        .hword L(blend_h_tbl) - 1280b
        .hword L(blend_h_tbl) -  640b
        .hword L(blend_h_tbl) -  320b
        .hword L(blend_h_tbl) -   16b
        .hword L(blend_h_tbl) -    8b
        .hword L(blend_h_tbl) -    4b
        .hword L(blend_h_tbl) -    2b
endfunc

// blend_v: OBMC blend from the left; the mask is one value per column
// (obmc_masks[] indexed by width) and only the left 3/4 of each row is
// blended — the per-width cases below store w-w/4 pixels per row.
// x0 = dst, x1 = stride, x2 = tmp, w3 = width, w4 = height.
function blend_v_8bpc_neon, export=1
        adr             x6,  L(blend_v_tbl)
        movrel          x5,  X(obmc_masks)
        add             x5,  x5,  w3, uxtw      // mask column set for width
        clz             w3,  w3
        movi            v4.16b, #64
        add             x8,  x0,  x1
        lsl             x1,  x1,  #1
        sub             w3,  w3,  #26
        ldrh            w3,  [x6, x3, lsl #1]
        sub             x6,  x6,  w3, uxtw
        br              x6
20:     // width 2: only the first column is blended
        ld1r            {v0.8b}, [x5]
        sub             v1.8b, v4.8b, v0.8b
2:
        ld1             {v2.h}[0], [x2], #2
        ld1             {v3.b}[0], [x0]
        subs            w4,  w4,  #2
        ld1             {v2.b}[1], [x2]
        ld1             {v3.b}[1], [x8]
        umull           v5.8h, v2.8b, v0.8b
        umlal           v5.8h, v3.8b, v1.8b
        rshrn           v5.8b, v5.8h, #6
        add             x2,  x2,  #2
        st1             {v5.b}[0], [x0], x1
        st1             {v5.b}[1], [x8], x1
        b.gt            2b
        ret
40:     // width 4: store 3 pixels per row (h lane + b lane)
        ld1r            {v0.2s}, [x5]
        sub             x1,  x1,  #2
        sub             v1.8b, v4.8b, v0.8b
4:
        ld1             {v2.8b},   [x2], #8
        ld1             {v3.s}[0], [x0]
        ld1             {v3.s}[1], [x8]
        subs            w4,  w4,  #2
        umull           v5.8h, v2.8b, v0.8b
        umlal           v5.8h, v3.8b, v1.8b
        rshrn           v5.8b, v5.8h, #6
        st1             {v5.h}[0], [x0], #2
        st1             {v5.h}[2], [x8], #2
        st1             {v5.b}[2], [x0], x1
        st1             {v5.b}[6], [x8], x1
        b.gt            4b
        ret
80:     // width 8: store 6 pixels per row (s lane + h lane)
        ld1r            {v0.2d}, [x5]
        sub             x1,  x1,  #4
        sub             v1.16b, v4.16b, v0.16b
8:
        ld1             {v2.16b},  [x2], #16
        ld1             {v3.d}[0], [x0]
        ld1             {v3.d}[1], [x8]
        subs            w4,  w4,  #2
        umull           v5.8h,  v0.8b,  v2.8b
        umlal           v5.8h,  v3.8b,  v1.8b
        umull2          v6.8h,  v0.16b, v2.16b
        umlal2          v6.8h,  v3.16b, v1.16b
        rshrn           v7.8b,  v5.8h,  #6
        rshrn2          v7.16b, v6.8h,  #6
        // 6 pixels per row: a 4-byte store followed by a 2-byte store
        st1             {v7.s}[0], [x0], #4
        st1             {v7.s}[2], [x8], #4
        st1             {v7.h}[2], [x0], x1
        st1             {v7.h}[6], [x8], x1
        b.gt            8b
        ret
160:    // width 16: 12 pixels per row (8-byte + 4-byte stores)
        ld1             {v0.16b}, [x5]
        sub             x1,  x1,  #8
        sub             v2.16b, v4.16b, v0.16b
16:
        ld1             {v5.16b, v6.16b}, [x2], #32
        ld1             {v7.16b},  [x0]
        subs            w4,  w4,  #2
        ld1             {v16.16b}, [x8]
        umull           v17.8h,  v5.8b,   v0.8b
        umlal           v17.8h,  v7.8b,   v2.8b
        umull2          v18.8h,  v5.16b,  v0.16b
        umlal2          v18.8h,  v7.16b,  v2.16b
        umull           v20.8h,  v6.8b,   v0.8b
        umlal           v20.8h,  v16.8b,  v2.8b
        umull2          v21.8h,  v6.16b,  v0.16b
        umlal2          v21.8h,  v16.16b, v2.16b
        rshrn           v19.8b,  v17.8h,  #6
        rshrn2          v19.16b, v18.8h,  #6
        rshrn           v22.8b,  v20.8h,  #6
        rshrn2          v22.16b, v21.8h,  #6
        st1             {v19.8b},   [x0], #8
        st1             {v22.8b},   [x8], #8
        st1             {v19.s}[2], [x0], x1
        st1             {v22.s}[2], [x8], x1
        b.gt            16b
        ret
320:    // width 32: 24 pixels per row (16-byte + 8-byte stores)
        ld1             {v0.16b, v1.16b}, [x5]
        sub             x1,  x1,  #16
        sub             v2.16b, v4.16b, v0.16b
        sub             v3.8b,  v4.8b,  v1.8b   // only low half of v1 needed
32:
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
        ld1             {v5.16b,  v6.16b},  [x0]
        subs            w4,  w4,  #2
        ld1             {v20.16b, v21.16b}, [x8]
        umull           v22.8h,  v16.8b,  v0.8b
        umlal           v22.8h,  v5.8b,   v2.8b
        umull2          v23.8h,  v16.16b, v0.16b
        umlal2          v23.8h,  v5.16b,  v2.16b
        umull           v28.8h,  v17.8b,  v1.8b
        umlal           v28.8h,  v6.8b,   v3.8b
        umull           v30.8h,  v18.8b,  v0.8b
        umlal           v30.8h,  v20.8b,  v2.8b
        umull2          v31.8h,  v18.16b, v0.16b
        umlal2          v31.8h,  v20.16b, v2.16b
        umull           v25.8h,  v19.8b,  v1.8b
        umlal           v25.8h,  v21.8b,  v3.8b
        rshrn           v24.8b,  v22.8h,  #6
        rshrn2          v24.16b, v23.8h,  #6
        rshrn           v28.8b,  v28.8h,  #6
        rshrn           v30.8b,  v30.8h,  #6
        rshrn2          v30.16b, v31.8h,  #6
        rshrn           v27.8b,  v25.8h,  #6
        st1             {v24.16b}, [x0], #16
        st1             {v30.16b}, [x8], #16
        st1             {v28.8b},  [x0], x1
        st1             {v27.8b},  [x8], x1
        b.gt            32b
        ret
L(blend_v_tbl):
        .hword L(blend_v_tbl) - 320b
        .hword L(blend_v_tbl) - 160b
        .hword L(blend_v_tbl) -  80b
        .hword L(blend_v_tbl) -  40b
        .hword L(blend_v_tbl) -  20b
endfunc


// This has got the same signature as the put_8tap functions,
// and assumes that x8 is set to (clz(w)-24).
// Plain copy: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
// w5 = height. Dispatched on width via L(put_tbl).
function put_neon
        adr             x9,  L(put_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        br              x9

2:
        ld1             {v0.h}[0], [x2], x3
        ld1             {v1.h}[0], [x2], x3
        subs            w5,  w5,  #2
        st1             {v0.h}[0], [x0], x1
        st1             {v1.h}[0], [x0], x1
        b.gt            2b
        ret
4:
        ld1             {v0.s}[0], [x2], x3
        ld1             {v1.s}[0], [x2], x3
        subs            w5,  w5,  #2
        st1             {v0.s}[0], [x0], x1
        st1             {v1.s}[0], [x0], x1
        b.gt            4b
        ret
8:
        ld1             {v0.8b}, [x2], x3
        ld1             {v1.8b}, [x2], x3
        subs            w5,  w5,  #2
        st1             {v0.8b}, [x0], x1
        st1             {v1.8b}, [x0], x1
        b.gt            8b
        ret
160:    // width 16: two row pointers to dual-issue loads/stores
        add             x8,  x0,  x1
        lsl             x1,  x1,  #1
        add             x9,  x2,  x3
        lsl             x3,  x3,  #1
16:
        ld1             {v0.16b}, [x2], x3
        ld1             {v1.16b}, [x9], x3
        subs            w5,  w5,  #2
        st1             {v0.16b}, [x0], x1
        st1             {v1.16b}, [x8], x1
        b.gt            16b
        ret
32:     // widths >= 32 copy through GPR/q pairs, one row per iteration
        ldp             x6,  x7,  [x2]
        ldp             x8,  x9,  [x2, #16]
        stp             x6,  x7,  [x0]
        subs            w5,  w5,  #1
        stp             x8,  x9,  [x0, #16]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            32b
        ret
64:
        ldp             x6,  x7,  [x2]
        ldp             x8,  x9,  [x2, #16]
        stp             x6,  x7,  [x0]
        ldp             x10, x11, [x2, #32]
        stp             x8,  x9,  [x0, #16]
        subs            w5,  w5,  #1
        ldp             x12, x13, [x2, #48]
        stp             x10, x11, [x0, #32]
        stp             x12, x13, [x0, #48]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            64b
        ret
128:
        ldp             q0,  q1,  [x2]
        ldp             q2,  q3,  [x2, #32]
        stp             q0,  q1,  [x0]
        ldp             q4,  q5,  [x2, #64]
        stp             q2,  q3,  [x0, #32]
        ldp             q6,  q7,  [x2, #96]
        subs            w5,  w5,  #1
        stp             q4,  q5,  [x0, #64]
        stp             q6,  q7,  [x0, #96]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            128b
        ret

L(put_tbl):
        .hword L(put_tbl) - 128b
        .hword L(put_tbl) -  64b
        .hword L(put_tbl) -  32b
        .hword L(put_tbl) - 160b
        .hword L(put_tbl) -   8b
        .hword L(put_tbl) -   4b
        .hword L(put_tbl) -   2b
endfunc


// This has got the same signature as the prep_8tap functions,
// and assumes that x8 is set to (clz(w)-24), and x7 to w*2.
// Copies 8-bit source into the int16 intermediate buffer, scaling each
// pixel by << 4. x0 = tmp out, x1 = src, x2 = src stride, w4 = height.
function prep_neon
        adr             x9,  L(prep_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        br              x9

4:
        ld1             {v0.s}[0], [x1], x2
        ld1             {v1.s}[0], [x1], x2
        subs            w4,  w4,  #2
        ushll           v0.8h, v0.8b, #4        // widen to int16, scale by 16
        ushll           v1.8h, v1.8b, #4
        st1             {v0.4h, v1.4h}, [x0], #16
        b.gt            4b
        ret
8:
        ld1             {v0.8b}, [x1], x2
        ld1             {v1.8b}, [x1], x2
        subs            w4,  w4,  #2
        ushll           v0.8h, v0.8b, #4
        ushll           v1.8h, v1.8b, #4
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            8b
        ret
160:    // width 16: second row pointer, double stride
        add             x9,  x1,  x2
        lsl             x2,  x2,  #1
16:
        ld1             {v0.16b}, [x1], x2
        ld1             {v1.16b}, [x9], x2
        subs            w4,  w4,  #2
        ushll           v4.8h, v0.8b,  #4
        ushll2          v5.8h, v0.16b, #4
        ushll           v6.8h, v1.8b,  #4
        ushll2          v7.8h, v1.16b, #4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        b.gt            16b
        ret
320:    // width 32: x8 writes the second output row, stride x7 (= w*2)
        add             x8,  x0,  w3, uxtw
32:
        ld1             {v0.16b, v1.16b}, [x1], x2
        subs            w4,  w4,  #2
        ushll           v4.8h,  v0.8b,  #4
        ushll2          v5.8h,  v0.16b, #4
        ld1             {v2.16b, v3.16b}, [x1], x2
        ushll           v6.8h,  v1.8b,  #4
        ushll2          v7.8h,  v1.16b, #4
        ushll           v16.8h, v2.8b,  #4
        st1             {v4.8h, v5.8h},   [x0], x7
        ushll2          v17.8h, v2.16b, #4
        st1             {v6.8h, v7.8h},   [x8], x7
        ushll           v18.8h, v3.8b,  #4
        st1             {v16.8h, v17.8h}, [x0], x7
        ushll2          v19.8h, v3.16b, #4
        st1             {v18.8h, v19.8h}, [x8], x7
        b.gt            32b
        ret
640:    // width 64: one row per iteration, split across x0 / x0+32
        add             x8,  x0,  #32
        mov             x6,  #64
64:
        ldp             q0,  q1,  [x1]
        subs            w4,  w4,  #1
        ushll           v4.8h,  v0.8b,  #4
        ushll2          v5.8h,  v0.16b, #4
        ldp             q2,  q3,  [x1, #32]
        ushll           v6.8h,  v1.8b,  #4
        ushll2          v7.8h,  v1.16b, #4
        add             x1,  x1,  x2
        ushll           v16.8h, v2.8b,  #4
        st1             {v4.8h, v5.8h}, [x0], x6
        ushll2          v17.8h, v2.16b, #4
        ushll           v18.8h, v3.8b,  #4
        st1             {v6.8h, v7.8h},   [x8], x6
        ushll2          v19.8h, v3.16b, #4
        st1             {v16.8h, v17.8h}, [x0], x6
        st1             {v18.8h, v19.8h}, [x8], x6
        b.gt            64b
        ret
1280:   // width 128: one row per iteration, split across x0 / x0+64
        add             x8,  x0,  #64
        mov             x6,  #128
128:
        ldp             q0,  q1,  [x1]
        ldp             q2,  q3,  [x1, #32]
        ushll           v16.8h, v0.8b,  #4
        ushll2          v17.8h, v0.16b, #4
        ushll           v18.8h, v1.8b,  #4
        ushll2          v19.8h, v1.16b, #4
        ushll           v20.8h, v2.8b,  #4
        ushll2          v21.8h, v2.16b, #4
        ldp             q4,  q5,  [x1, #64]
        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x6
        ushll           v22.8h, v3.8b,  #4
        ushll2          v23.8h, v3.16b, #4
        ushll           v24.8h, v4.8b,  #4
        ushll2          v25.8h, v4.16b, #4
        ushll           v26.8h, v5.8b,  #4
        ushll2          v27.8h, v5.16b, #4
        ldp             q6,  q7,  [x1, #96]
        st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x8], x6
        ushll           v28.8h, v6.8b,  #4
        ushll2          v29.8h, v6.16b, #4
        ushll           v30.8h, v7.8b,  #4
        ushll2          v31.8h, v7.16b, #4
        subs            w4,  w4,  #1
        add             x1,  x1,  x2
        st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x6
        st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x6
        b.gt            128b
        ret

L(prep_tbl):
        .hword L(prep_tbl) - 1280b
        .hword L(prep_tbl) -  640b
        .hword L(prep_tbl) -  320b
        .hword L(prep_tbl) -  160b
        .hword L(prep_tbl) -    8b
        .hword L(prep_tbl) -    4b
endfunc


// Load one lane per register, alternating between the two source
// pointers \s0/\s1. Trailing register arguments are optional (.ifnb).
.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        ld1             {\d0\wd}[0], [\s0], \strd
        ld1             {\d1\wd}[0], [\s1], \strd
.ifnb \d2
        ld1             {\d2\wd}[0], [\s0], \strd
        ld1             {\d3\wd}[0], [\s1], \strd
.endif
.ifnb \d4
        ld1             {\d4\wd}[0], [\s0], \strd
.endif
.ifnb \d5
        ld1             {\d5\wd}[0], [\s1], \strd
.endif
.ifnb \d6
        ld1             {\d6\wd}[0], [\s0], \strd
.endif
.endm
// Same as load_slice but loading whole registers instead of lanes.
.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        ld1             {\d0\wd}, [\s0], \strd
        ld1             {\d1\wd}, [\s1], \strd
.ifnb \d2
        ld1             {\d2\wd}, [\s0], \strd
        ld1             {\d3\wd}, [\s1], \strd
.endif
.ifnb \d4
        ld1             {\d4\wd}, [\s0], \strd
.endif
.ifnb \d5
        ld1             {\d5\wd}, [\s1], \strd
.endif
.ifnb \d6
        ld1             {\d6\wd}, [\s0], \strd
.endif
.endm
.macro load_h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_slice      \s0, \s1, \strd, .h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_slice      \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_8b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_reg        \s0, \s1, \strd, .8b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_16b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_reg        \s0, \s1, \strd, .16b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
// Pair up consecutive rows: r0 = r0|r1, r1 = r1|r2, ... (1-row step).
.macro interleave_1 wd, r0, r1, r2, r3, r4
        trn1            \r0\wd, \r0\wd, \r1\wd
        trn1            \r1\wd, \r1\wd, \r2\wd
.ifnb \r3
        trn1            \r2\wd, \r2\wd, \r3\wd
        trn1            \r3\wd, \r3\wd, \r4\wd
.endif
.endm
.macro interleave_1_h r0, r1, r2, r3, r4
        interleave_1    .4h, \r0, \r1, \r2, \r3, \r4
.endm
.macro interleave_1_s r0, r1, r2, r3, r4
        interleave_1    .2s, \r0, \r1, \r2, \r3, \r4
.endm
// Pair up rows with a 2-row step: r0 = r0|r2, r1 = r1|r3, ...
.macro interleave_2 wd, r0, r1, r2, r3, r4, r5
        trn1            \r0\wd, \r0\wd, \r2\wd
        trn1            \r1\wd, \r1\wd, \r3\wd
        trn1            \r2\wd, \r2\wd, \r4\wd
        trn1            \r3\wd, \r3\wd, \r5\wd
.endm
.macro interleave_2_s r0, r1, r2, r3, r4, r5
        interleave_2    .2s, \r0, \r1, \r2, \r3, \r4, \r5
.endm
// Widen up to 7 byte registers to int16 in place.
.macro uxtl_b r0, r1, r2, r3, r4, r5, r6
        uxtl            \r0\().8h, \r0\().8b
        uxtl            \r1\().8h, \r1\().8b
.ifnb \r2
        uxtl            \r2\().8h, \r2\().8b
        uxtl            \r3\().8h, \r3\().8b
.endif
.ifnb \r4
        uxtl            \r4\().8h, \r4\().8b
.endif
.ifnb \r5
        uxtl            \r5\().8h, \r5\().8b
.endif
.ifnb \r6
        uxtl            \r6\().8h, \r6\().8b
.endif
.endm
// 4-tap filter: \d = sum(si * coeff[i]), coefficients in v0.h[0..3].
.macro mul_mla_4 d, s0, s1, s2, s3, wd
        mul             \d\wd, \s0\wd, v0.h[0]
        mla             \d\wd, \s1\wd, v0.h[1]
        mla             \d\wd, \s2\wd, v0.h[2]
        mla             \d\wd, \s3\wd, v0.h[3]
.endm
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
// 8-tap filter producing two outputs whose source windows are offset by
// one register (coefficients in v0.h[0..7]).
.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
        mul             \d0\().8h, \s0\().8h, v0.h[0]
        mla             \d0\().8h, \s1\().8h, v0.h[1]
        mla             \d0\().8h, \s2\().8h, v0.h[2]
        mla             \d0\().8h, \s3\().8h, v0.h[3]
        mla             \d0\().8h, \s4\().8h, v0.h[4]
        mla             \d0\().8h, \s5\().8h, v0.h[5]
        mla             \d0\().8h, \s6\().8h, v0.h[6]
        mla             \d0\().8h, \s7\().8h, v0.h[7]
        mul             \d1\().8h, \s1\().8h, v0.h[0]
        mla             \d1\().8h, \s2\().8h, v0.h[1]
        mla             \d1\().8h, \s3\().8h, v0.h[2]
        mla             \d1\().8h, \s4\().8h, v0.h[3]
        mla             \d1\().8h, \s5\().8h, v0.h[4]
        mla             \d1\().8h, \s6\().8h, v0.h[5]
        mla             \d1\().8h, \s7\().8h, v0.h[6]
        mla             \d1\().8h, \s8\().8h, v0.h[7]
.endm
// As mul_mla_8_1 but with the two windows offset by two registers.
.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
        mul             \d0\().8h, \s0\().8h, v0.h[0]
        mla             \d0\().8h, \s1\().8h, v0.h[1]
        mla             \d0\().8h, \s2\().8h, v0.h[2]
        mla             \d0\().8h, \s3\().8h, v0.h[3]
        mla             \d0\().8h, \s4\().8h, v0.h[4]
        mla             \d0\().8h, \s5\().8h, v0.h[5]
        mla             \d0\().8h, \s6\().8h, v0.h[6]
        mla             \d0\().8h, \s7\().8h, v0.h[7]
        mul             \d1\().8h, \s2\().8h, v0.h[0]
        mla             \d1\().8h, \s3\().8h, v0.h[1]
        mla             \d1\().8h, \s4\().8h, v0.h[2]
        mla             \d1\().8h, \s5\().8h, v0.h[3]
        mla             \d1\().8h, \s6\().8h, v0.h[4]
        mla             \d1\().8h, \s7\().8h, v0.h[5]
        mla             \d1\().8h, \s8\().8h, v0.h[6]
        mla             \d1\().8h, \s9\().8h, v0.h[7]
.endm
// As mul_mla_8_1 but with the two windows offset by four registers.
.macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11
        mul             \d0\().8h, \s0\().8h,  v0.h[0]
        mla             \d0\().8h, \s1\().8h,  v0.h[1]
        mla             \d0\().8h, \s2\().8h,  v0.h[2]
        mla             \d0\().8h, \s3\().8h,  v0.h[3]
        mla             \d0\().8h, \s4\().8h,  v0.h[4]
        mla             \d0\().8h, \s5\().8h,  v0.h[5]
        mla             \d0\().8h, \s6\().8h,  v0.h[6]
        mla             \d0\().8h, \s7\().8h,  v0.h[7]
        mul             \d1\().8h, \s4\().8h,  v0.h[0]
        mla             \d1\().8h, \s5\().8h,  v0.h[1]
        mla             \d1\().8h, \s6\().8h,  v0.h[2]
        mla             \d1\().8h, \s7\().8h,  v0.h[3]
        mla             \d1\().8h, \s8\().8h,  v0.h[4]
        mla             \d1\().8h, \s9\().8h,  v0.h[5]
        mla             \d1\().8h, \s10\().8h, v0.h[6]
        mla             \d1\().8h, \s11\().8h, v0.h[7]
.endm
// Rounding saturating narrow of up to 4 registers by #\shift.
.macro sqrshrun_b shift, r0, r1, r2, r3
        sqrshrun        \r0\().8b, \r0\().8h, #\shift
.ifnb \r1
        sqrshrun        \r1\().8b, \r1\().8h, #\shift
.endif
.ifnb \r2
        sqrshrun        \r2\().8b, \r2\().8h, #\shift
        sqrshrun        \r3\().8b, \r3\().8h, #\shift
.endif
.endm
// Rounding shift-right of up to 4 int16 registers by #\shift.
.macro srshr_h shift, r0, r1, r2, r3
        srshr           \r0\().8h, \r0\().8h, #\shift
.ifnb \r1
        srshr           \r1\().8h, \r1\().8h, #\shift
.endif
.ifnb \r2
        srshr           \r2\().8h, \r2\().8h, #\shift
        srshr           \r3\().8h, \r3\().8h, #\shift
.endif
.endm
// Lane stores alternating between x0 and x8 row pointers.
.macro st_h strd, reg, lanes
        st1             {\reg\().h}[0], [x0], \strd
        st1             {\reg\().h}[1], [x8], \strd
.if \lanes > 2
        st1             {\reg\().h}[2], [x0], \strd
        st1             {\reg\().h}[3], [x8], \strd
.endif
.endm
.macro st_s strd, r0, r1
        st1             {\r0\().s}[0], [x0], \strd
        st1             {\r0\().s}[1], [x8], \strd
.ifnb \r1
        st1             {\r1\().s}[0], [x0], \strd
        st1             {\r1\().s}[1], [x8], \strd
.endif
.endm
.macro st_d strd, r0, r1
        st1             {\r0\().d}[0], [x0], \strd
        st1             {\r0\().d}[1], [x8], \strd
.ifnb \r1
        st1             {\r1\().d}[0], [x0], \strd
        st1             {\r1\().d}[1], [x8], \strd
.endif
.endm
// Width-4 epilogue: put narrows to bytes (>> 6), prep stores int16 (>> 2).
.macro shift_store_4 type, strd, r0, r1
.ifc \type, put
        sqrshrun_b      6,     \r0, \r1
        st_s            \strd, \r0, \r1
.else
        srshr_h         2,     \r0, \r1
        st_d            \strd, \r0, \r1
.endif
.endm
.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
        st1             {\r0\wd}, [x0], \strd
        st1             {\r1\wd}, [x8], \strd
.ifnb \r2
        st1             {\r2\wd}, [x0], \strd
        st1             {\r3\wd}, [x8], \strd
.endif
.ifnb \r4
        st1             {\r4\wd}, [x0], \strd
        st1             {\r5\wd}, [x8], \strd
        st1             {\r6\wd}, [x0], \strd
        st1             {\r7\wd}, [x8], \strd
.endif
.endm
.macro st_8b strd, r0, r1, r2, r3, r4, r5, r6, r7
        st_reg          \strd, .8b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endm
.macro st_16b strd, r0, r1, r2, r3, r4, r5, r6, r7
        st_reg          \strd, .16b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endm
// Width-8 epilogue, same put/prep split as shift_store_4.
.macro shift_store_8 type, strd, r0, r1, r2, r3
.ifc \type, put
        sqrshrun_b      6,     \r0, \r1, \r2, \r3
        st_8b           \strd, \r0, \r1, \r2, \r3
.else
        srshr_h         2,     \r0, \r1, \r2, \r3
        st_16b          \strd, \r0, \r1, \r2, \r3
.endif
.endm
// Width-16 epilogue: put packs two int16 regs into one byte reg.
.macro shift_store_16 type, strd, r0, r1, r2, r3
.ifc \type, put
        sqrshrun        \r0\().8b,  \r0\().8h, #6
        sqrshrun2       \r0\().16b, \r1\().8h, #6
        sqrshrun        \r2\().8b,  \r2\().8h, #6
        sqrshrun2       \r2\().16b, \r3\().8h, #6
        st_16b          \strd, \r0, \r2
.else
        srshr_h         2, \r0, \r1, \r2, \r3
        st1             {\r0\().8h, \r1\().8h}, [x0], \strd
        st1             {\r2\().8h, \r3\().8h}, [x8], \strd
.endif
.endm

// Entry-point stub: packs the horizontal/vertical filter types into
// x8/x9 and tail-calls the shared \op()_8tap_neon implementation.
.macro make_8tap_fn op, type, type_h, type_v
function \op\()_8tap_\type\()_8bpc_neon, export=1
        mov             x8,  \type_h
        mov             x9,  \type_v
        b               \op\()_8tap_neon
endfunc
.endm

// No spaces in these expressions, due to gas-preprocessor.
1293#define REGULAR ((0*15<<7)|3*15) 1294#define SMOOTH ((1*15<<7)|4*15) 1295#define SHARP ((2*15<<7)|3*15) 1296 1297.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv 1298make_8tap_fn \type, regular, REGULAR, REGULAR 1299make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH 1300make_8tap_fn \type, regular_sharp, REGULAR, SHARP 1301make_8tap_fn \type, smooth, SMOOTH, SMOOTH 1302make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR 1303make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP 1304make_8tap_fn \type, sharp, SHARP, SHARP 1305make_8tap_fn \type, sharp_regular, SHARP, REGULAR 1306make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH 1307 1308function \type\()_8tap_neon 1309 mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) 1310 mul \mx, \mx, w10 1311 mul \my, \my, w10 1312 add \mx, \mx, w8 // mx, 8tap_h, 4tap_h 1313 add \my, \my, w9 // my, 8tap_v, 4tap_v 1314.ifc \type, prep 1315 uxtw \d_strd, \w 1316 lsl \d_strd, \d_strd, #1 1317.endif 1318 1319 clz w8, \w 1320 tst \mx, #(0x7f << 14) 1321 sub w8, w8, #24 1322 movrel x10, X(mc_subpel_filters), -8 1323 b.ne L(\type\()_8tap_h) 1324 tst \my, #(0x7f << 14) 1325 b.ne L(\type\()_8tap_v) 1326 b \type\()_neon 1327 1328L(\type\()_8tap_h): 1329 cmp \w, #4 1330 ubfx w9, \mx, #7, #7 1331 and \mx, \mx, #0x7f 1332 b.le 4f 1333 mov \mx, w9 13344: 1335 tst \my, #(0x7f << 14) 1336 add \xmx, x10, \mx, uxtw #3 1337 b.ne L(\type\()_8tap_hv) 1338 1339 adr x9, L(\type\()_8tap_h_tbl) 1340 ldrh w8, [x9, x8, lsl #1] 1341 sub x9, x9, w8, uxtw 1342 br x9 1343 134420: // 2xN h 1345.ifc \type, put 1346 add \xmx, \xmx, #2 1347 ld1 {v0.s}[0], [\xmx] 1348 sub \src, \src, #1 1349 add \ds2, \dst, \d_strd 1350 add \sr2, \src, \s_strd 1351 lsl \d_strd, \d_strd, #1 1352 lsl \s_strd, \s_strd, #1 1353 sxtl v0.8h, v0.8b 13542: 1355 ld1 {v4.8b}, [\src], \s_strd 1356 ld1 {v6.8b}, [\sr2], \s_strd 1357 uxtl v4.8h, v4.8b 1358 uxtl v6.8h, v6.8b 1359 ext v5.16b, v4.16b, v4.16b, #2 1360 ext v7.16b, v6.16b, v6.16b, #2 1361 subs 
\h, \h, #2 1362 trn1 v3.2s, v4.2s, v6.2s 1363 trn2 v6.2s, v4.2s, v6.2s 1364 trn1 v4.2s, v5.2s, v7.2s 1365 trn2 v7.2s, v5.2s, v7.2s 1366 mul v3.4h, v3.4h, v0.h[0] 1367 mla v3.4h, v4.4h, v0.h[1] 1368 mla v3.4h, v6.4h, v0.h[2] 1369 mla v3.4h, v7.4h, v0.h[3] 1370 srshr v3.4h, v3.4h, #2 1371 sqrshrun v3.8b, v3.8h, #4 1372 st1 {v3.h}[0], [\dst], \d_strd 1373 st1 {v3.h}[1], [\ds2], \d_strd 1374 b.gt 2b 1375 ret 1376.endif 1377 137840: // 4xN h 1379 add \xmx, \xmx, #2 1380 ld1 {v0.s}[0], [\xmx] 1381 sub \src, \src, #1 1382 add \ds2, \dst, \d_strd 1383 add \sr2, \src, \s_strd 1384 lsl \d_strd, \d_strd, #1 1385 lsl \s_strd, \s_strd, #1 1386 sxtl v0.8h, v0.8b 13874: 1388 ld1 {v16.8b}, [\src], \s_strd 1389 ld1 {v20.8b}, [\sr2], \s_strd 1390 uxtl v16.8h, v16.8b 1391 uxtl v20.8h, v20.8b 1392 ext v17.16b, v16.16b, v16.16b, #2 1393 ext v18.16b, v16.16b, v16.16b, #4 1394 ext v19.16b, v16.16b, v16.16b, #6 1395 ext v21.16b, v20.16b, v20.16b, #2 1396 ext v22.16b, v20.16b, v20.16b, #4 1397 ext v23.16b, v20.16b, v20.16b, #6 1398 subs \h, \h, #2 1399 mul v16.4h, v16.4h, v0.h[0] 1400 mla v16.4h, v17.4h, v0.h[1] 1401 mla v16.4h, v18.4h, v0.h[2] 1402 mla v16.4h, v19.4h, v0.h[3] 1403 mul v20.4h, v20.4h, v0.h[0] 1404 mla v20.4h, v21.4h, v0.h[1] 1405 mla v20.4h, v22.4h, v0.h[2] 1406 mla v20.4h, v23.4h, v0.h[3] 1407 srshr v16.4h, v16.4h, #2 1408 srshr v20.4h, v20.4h, #2 1409.ifc \type, put 1410 sqrshrun v16.8b, v16.8h, #4 1411 sqrshrun v20.8b, v20.8h, #4 1412 st1 {v16.s}[0], [\dst], \d_strd 1413 st1 {v20.s}[0], [\ds2], \d_strd 1414.else 1415 st1 {v16.4h}, [\dst], \d_strd 1416 st1 {v20.4h}, [\ds2], \d_strd 1417.endif 1418 b.gt 4b 1419 ret 1420 142180: // 8xN h 1422 ld1 {v0.8b}, [\xmx] 1423 sub \src, \src, #3 1424 add \ds2, \dst, \d_strd 1425 add \sr2, \src, \s_strd 1426 lsl \d_strd, \d_strd, #1 1427 lsl \s_strd, \s_strd, #1 1428 sxtl v0.8h, v0.8b 14298: 1430 ld1 {v16.8b, v17.8b}, [\src], \s_strd 1431 ld1 {v20.8b, v21.8b}, [\sr2], \s_strd 1432 uxtl v16.8h, v16.8b 1433 uxtl v17.8h, v17.8b 1434 
uxtl v20.8h, v20.8b 1435 uxtl v21.8h, v21.8b 1436 1437 mul v18.8h, v16.8h, v0.h[0] 1438 mul v22.8h, v20.8h, v0.h[0] 1439.irpc i, 1234567 1440 ext v19.16b, v16.16b, v17.16b, #(2*\i) 1441 ext v23.16b, v20.16b, v21.16b, #(2*\i) 1442 mla v18.8h, v19.8h, v0.h[\i] 1443 mla v22.8h, v23.8h, v0.h[\i] 1444.endr 1445 subs \h, \h, #2 1446 srshr v18.8h, v18.8h, #2 1447 srshr v22.8h, v22.8h, #2 1448.ifc \type, put 1449 sqrshrun v18.8b, v18.8h, #4 1450 sqrshrun v22.8b, v22.8h, #4 1451 st1 {v18.8b}, [\dst], \d_strd 1452 st1 {v22.8b}, [\ds2], \d_strd 1453.else 1454 st1 {v18.8h}, [\dst], \d_strd 1455 st1 {v22.8h}, [\ds2], \d_strd 1456.endif 1457 b.gt 8b 1458 ret 1459160: 1460320: 1461640: 14621280: // 16xN, 32xN, ... h 1463 ld1 {v0.8b}, [\xmx] 1464 sub \src, \src, #3 1465 add \ds2, \dst, \d_strd 1466 add \sr2, \src, \s_strd 1467 lsl \s_strd, \s_strd, #1 1468 sxtl v0.8h, v0.8b 1469 1470 sub \s_strd, \s_strd, \w, uxtw 1471 sub \s_strd, \s_strd, #8 1472.ifc \type, put 1473 lsl \d_strd, \d_strd, #1 1474 sub \d_strd, \d_strd, \w, uxtw 1475.endif 1476161: 1477 ld1 {v16.8b, v17.8b, v18.8b}, [\src], #24 1478 ld1 {v20.8b, v21.8b, v22.8b}, [\sr2], #24 1479 mov \mx, \w 1480 uxtl v16.8h, v16.8b 1481 uxtl v17.8h, v17.8b 1482 uxtl v18.8h, v18.8b 1483 uxtl v20.8h, v20.8b 1484 uxtl v21.8h, v21.8b 1485 uxtl v22.8h, v22.8b 1486 148716: 1488 mul v24.8h, v16.8h, v0.h[0] 1489 mul v25.8h, v17.8h, v0.h[0] 1490 mul v26.8h, v20.8h, v0.h[0] 1491 mul v27.8h, v21.8h, v0.h[0] 1492.irpc i, 1234567 1493 ext v28.16b, v16.16b, v17.16b, #(2*\i) 1494 ext v29.16b, v17.16b, v18.16b, #(2*\i) 1495 ext v30.16b, v20.16b, v21.16b, #(2*\i) 1496 ext v31.16b, v21.16b, v22.16b, #(2*\i) 1497 mla v24.8h, v28.8h, v0.h[\i] 1498 mla v25.8h, v29.8h, v0.h[\i] 1499 mla v26.8h, v30.8h, v0.h[\i] 1500 mla v27.8h, v31.8h, v0.h[\i] 1501.endr 1502 srshr v24.8h, v24.8h, #2 1503 srshr v25.8h, v25.8h, #2 1504 srshr v26.8h, v26.8h, #2 1505 srshr v27.8h, v27.8h, #2 1506 subs \mx, \mx, #16 1507.ifc \type, put 1508 sqrshrun v24.8b, v24.8h, #4 1509 
sqrshrun2 v24.16b, v25.8h, #4 1510 sqrshrun v26.8b, v26.8h, #4 1511 sqrshrun2 v26.16b, v27.8h, #4 1512 st1 {v24.16b}, [\dst], #16 1513 st1 {v26.16b}, [\ds2], #16 1514.else 1515 st1 {v24.8h, v25.8h}, [\dst], #32 1516 st1 {v26.8h, v27.8h}, [\ds2], #32 1517.endif 1518 b.le 9f 1519 1520 mov v16.16b, v18.16b 1521 mov v20.16b, v22.16b 1522 ld1 {v17.8b, v18.8b}, [\src], #16 1523 ld1 {v21.8b, v22.8b}, [\sr2], #16 1524 uxtl v17.8h, v17.8b 1525 uxtl v18.8h, v18.8b 1526 uxtl v21.8h, v21.8b 1527 uxtl v22.8h, v22.8b 1528 b 16b 1529 15309: 1531 add \dst, \dst, \d_strd 1532 add \ds2, \ds2, \d_strd 1533 add \src, \src, \s_strd 1534 add \sr2, \sr2, \s_strd 1535 1536 subs \h, \h, #2 1537 b.gt 161b 1538 ret 1539 1540L(\type\()_8tap_h_tbl): 1541 .hword L(\type\()_8tap_h_tbl) - 1280b 1542 .hword L(\type\()_8tap_h_tbl) - 640b 1543 .hword L(\type\()_8tap_h_tbl) - 320b 1544 .hword L(\type\()_8tap_h_tbl) - 160b 1545 .hword L(\type\()_8tap_h_tbl) - 80b 1546 .hword L(\type\()_8tap_h_tbl) - 40b 1547 .hword L(\type\()_8tap_h_tbl) - 20b 1548 .hword 0 1549 1550 1551L(\type\()_8tap_v): 1552 cmp \h, #4 1553 ubfx w9, \my, #7, #7 1554 and \my, \my, #0x7f 1555 b.le 4f 1556 mov \my, w9 15574: 1558 add \xmy, x10, \my, uxtw #3 1559 1560 adr x9, L(\type\()_8tap_v_tbl) 1561 ldrh w8, [x9, x8, lsl #1] 1562 sub x9, x9, w8, uxtw 1563 br x9 1564 156520: // 2xN v 1566.ifc \type, put 1567 b.gt 28f 1568 1569 cmp \h, #2 1570 add \xmy, \xmy, #2 1571 ld1 {v0.s}[0], [\xmy] 1572 sub \src, \src, \s_strd 1573 add \ds2, \dst, \d_strd 1574 add \sr2, \src, \s_strd 1575 lsl \s_strd, \s_strd, #1 1576 lsl \d_strd, \d_strd, #1 1577 sxtl v0.8h, v0.8b 1578 1579 // 2x2 v 1580 load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 1581 interleave_1_h v1, v2, v3, v4, v5 1582 b.gt 24f 1583 uxtl_b v1, v2, v3, v4 1584 mul_mla_4 v6, v1, v2, v3, v4, .4h 1585 sqrshrun_b 6, v6 1586 st_h \d_strd, v6, 2 1587 ret 1588 158924: // 2x4 v 1590 load_h \sr2, \src, \s_strd, v6, v7 1591 interleave_1_h v5, v6, v7 1592 interleave_2_s v1, v2, v3, v4, v5, v6 
1593 uxtl_b v1, v2, v3, v4 1594 mul_mla_4 v6, v1, v2, v3, v4, .8h 1595 sqrshrun_b 6, v6 1596 st_h \d_strd, v6, 4 1597 ret 1598 159928: // 2x8, 2x16 v 1600 ld1 {v0.8b}, [\xmy] 1601 sub \sr2, \src, \s_strd, lsl #1 1602 add \ds2, \dst, \d_strd 1603 sub \src, \sr2, \s_strd 1604 lsl \d_strd, \d_strd, #1 1605 lsl \s_strd, \s_strd, #1 1606 sxtl v0.8h, v0.8b 1607 1608 load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7 1609 interleave_1_h v1, v2, v3, v4, v5 1610 interleave_1_h v5, v6, v7 1611 interleave_2_s v1, v2, v3, v4, v5, v6 1612 uxtl_b v1, v2, v3, v4 1613216: 1614 subs \h, \h, #8 1615 load_h \sr2, \src, \s_strd, v16, v17, v18, v19 1616 load_h \sr2, \src, \s_strd, v20, v21, v22, v23 1617 interleave_1_h v7, v16, v17, v18, v19 1618 interleave_1_h v19, v20, v21, v22, v23 1619 interleave_2_s v5, v6, v7, v16, v17, v18 1620 interleave_2_s v17, v18, v19, v20, v21, v22 1621 uxtl_b v5, v6, v7, v16 1622 uxtl_b v17, v18, v19, v20 1623 mul_mla_8_4 v30, v31, v1, v2, v3, v4, v5, v6, v7, v16, v17, v18, v19, v20 1624 sqrshrun_b 6, v30, v31 1625 st_h \d_strd, v30, 4 1626 st_h \d_strd, v31, 4 1627 b.le 0f 1628 mov v1.16b, v17.16b 1629 mov v2.16b, v18.16b 1630 mov v3.16b, v19.16b 1631 mov v4.16b, v20.16b 1632 mov v5.16b, v21.16b 1633 mov v6.16b, v22.16b 1634 mov v7.16b, v23.16b 1635 b 216b 16360: 1637 ret 1638.endif 1639 164040: 1641 b.gt 480f 1642 1643 // 4x2, 4x4 v 1644 cmp \h, #2 1645 add \xmy, \xmy, #2 1646 ld1 {v0.s}[0], [\xmy] 1647 sub \src, \src, \s_strd 1648 add \ds2, \dst, \d_strd 1649 add \sr2, \src, \s_strd 1650 lsl \s_strd, \s_strd, #1 1651 lsl \d_strd, \d_strd, #1 1652 sxtl v0.8h, v0.8b 1653 1654 load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5 1655 interleave_1_s v1, v2, v3, v4, v5 1656 uxtl_b v1, v2, v3, v4 1657 mul_mla_4 v6, v1, v2, v3, v4, .8h 1658 shift_store_4 \type, \d_strd, v6 1659 b.le 0f 1660 load_s \sr2, \src, \s_strd, v6, v7 1661 interleave_1_s v5, v6, v7 1662 uxtl_b v5, v6 1663 mul_mla_4 v7, v3, v4, v5, v6, .8h 1664 shift_store_4 \type, \d_strd, v7 16650: 1666 
ret 1667 1668480: // 4x8, 4x16 v 1669 ld1 {v0.8b}, [\xmy] 1670 sub \sr2, \src, \s_strd, lsl #1 1671 add \ds2, \dst, \d_strd 1672 sub \src, \sr2, \s_strd 1673 lsl \s_strd, \s_strd, #1 1674 lsl \d_strd, \d_strd, #1 1675 sxtl v0.8h, v0.8b 1676 1677 load_s \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 1678 interleave_1_s v16, v17, v18 1679 interleave_1_s v18, v19, v20, v21, v22 1680 uxtl_b v16, v17 1681 uxtl_b v18, v19, v20, v21 1682 168348: 1684 subs \h, \h, #4 1685 load_s \sr2, \src, \s_strd, v23, v24, v25, v26 1686 interleave_1_s v22, v23, v24, v25, v26 1687 uxtl_b v22, v23, v24, v25 1688 mul_mla_8_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 1689 shift_store_4 \type, \d_strd, v1, v2 1690 b.le 0f 1691 subs \h, \h, #4 1692 load_s \sr2, \src, \s_strd, v27, v16, v17, v18 1693 interleave_1_s v26, v27, v16, v17, v18 1694 uxtl_b v26, v27, v16, v17 1695 mul_mla_8_2 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16, v17 1696 shift_store_4 \type, \d_strd, v1, v2 1697 b.le 0f 1698 subs \h, \h, #4 1699 load_s \sr2, \src, \s_strd, v19, v20, v21, v22 1700 interleave_1_s v18, v19, v20, v21, v22 1701 uxtl_b v18, v19, v20, v21 1702 mul_mla_8_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21 1703 shift_store_4 \type, \d_strd, v1, v2 1704 b.gt 48b 17050: 1706 ret 1707 170880: 1709 b.gt 880f 1710 1711 // 8x2, 8x4 v 1712 cmp \h, #2 1713 add \xmy, \xmy, #2 1714 ld1 {v0.s}[0], [\xmy] 1715 sub \src, \src, \s_strd 1716 add \ds2, \dst, \d_strd 1717 add \sr2, \src, \s_strd 1718 lsl \s_strd, \s_strd, #1 1719 lsl \d_strd, \d_strd, #1 1720 sxtl v0.8h, v0.8b 1721 1722 load_8b \src, \sr2, \s_strd, v1, v2, v3, v4, v5 1723 uxtl_b v1, v2, v3, v4, v5 1724 mul_mla_4 v6, v1, v2, v3, v4, .8h 1725 mul_mla_4 v7, v2, v3, v4, v5, .8h 1726 shift_store_8 \type, \d_strd, v6, v7 1727 b.le 0f 1728 load_8b \sr2, \src, \s_strd, v6, v7 1729 uxtl_b v6, v7 1730 mul_mla_4 v1, v3, v4, v5, v6, .8h 1731 mul_mla_4 v2, v4, v5, v6, v7, .8h 1732 shift_store_8 \type, \d_strd, v1, v2 17330: 
1734 ret 1735 1736880: // 8x6, 8x8, 8x16, 8x32 v 17371680: // 16x8, 16x16, ... 1738320: // 32x8, 32x16, ... 1739640: 17401280: 1741 ld1 {v0.8b}, [\xmy] 1742 sub \src, \src, \s_strd 1743 sub \src, \src, \s_strd, lsl #1 1744 sxtl v0.8h, v0.8b 1745 mov \my, \h 1746168: 1747 add \ds2, \dst, \d_strd 1748 add \sr2, \src, \s_strd 1749 lsl \s_strd, \s_strd, #1 1750 lsl \d_strd, \d_strd, #1 1751 1752 load_8b \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 1753 uxtl_b v16, v17, v18, v19, v20, v21, v22 1754 175588: 1756 subs \h, \h, #2 1757 load_8b \sr2, \src, \s_strd, v23, v24 1758 uxtl_b v23, v24 1759 mul_mla_8_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24 1760 shift_store_8 \type, \d_strd, v1, v2 1761 b.le 9f 1762 subs \h, \h, #2 1763 load_8b \sr2, \src, \s_strd, v25, v26 1764 uxtl_b v25, v26 1765 mul_mla_8_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26 1766 shift_store_8 \type, \d_strd, v3, v4 1767 b.le 9f 1768 subs \h, \h, #2 1769 load_8b \sr2, \src, \s_strd, v27, v16 1770 uxtl_b v27, v16 1771 mul_mla_8_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16 1772 shift_store_8 \type, \d_strd, v1, v2 1773 b.le 9f 1774 subs \h, \h, #2 1775 load_8b \sr2, \src, \s_strd, v17, v18 1776 uxtl_b v17, v18 1777 mul_mla_8_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18 1778 shift_store_8 \type, \d_strd, v3, v4 1779 b.le 9f 1780 subs \h, \h, #4 1781 load_8b \sr2, \src, \s_strd, v19, v20, v21, v22 1782 uxtl_b v19, v20, v21, v22 1783 mul_mla_8_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20 1784 mul_mla_8_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22 1785 shift_store_8 \type, \d_strd, v1, v2, v3, v4 1786 b.gt 88b 17879: 1788 subs \w, \w, #8 1789 b.le 0f 1790 asr \s_strd, \s_strd, #1 1791 asr \d_strd, \d_strd, #1 1792 msub \src, \s_strd, \xmy, \src 1793 msub \dst, \d_strd, \xmy, \dst 1794 sub \src, \src, \s_strd, lsl #3 1795 mov \h, \my 1796 add \src, \src, #8 1797.ifc \type, put 1798 add \dst, \dst, #8 1799.else 1800 add \dst, \dst, #16 
1801.endif 1802 b 168b 18030: 1804 ret 1805 1806160: 1807 b.gt 1680b 1808 1809 // 16x2, 16x4 v 1810 add \xmy, \xmy, #2 1811 ld1 {v0.s}[0], [\xmy] 1812 sub \src, \src, \s_strd 1813 add \ds2, \dst, \d_strd 1814 add \sr2, \src, \s_strd 1815 lsl \s_strd, \s_strd, #1 1816 lsl \d_strd, \d_strd, #1 1817 sxtl v0.8h, v0.8b 1818 1819 cmp \h, #2 1820 load_16b \src, \sr2, \s_strd, v1, v2, v3, v4, v5 1821 uxtl v16.8h, v1.8b 1822 uxtl v17.8h, v2.8b 1823 uxtl v18.8h, v3.8b 1824 uxtl v19.8h, v4.8b 1825 uxtl v20.8h, v5.8b 1826 uxtl2 v23.8h, v1.16b 1827 uxtl2 v24.8h, v2.16b 1828 uxtl2 v25.8h, v3.16b 1829 uxtl2 v26.8h, v4.16b 1830 uxtl2 v27.8h, v5.16b 1831 mul_mla_4 v1, v16, v17, v18, v19, .8h 1832 mul_mla_4 v16, v17, v18, v19, v20, .8h 1833 mul_mla_4 v2, v23, v24, v25, v26, .8h 1834 mul_mla_4 v17, v24, v25, v26, v27, .8h 1835 shift_store_16 \type, \d_strd, v1, v2, v16, v17 1836 b.le 0f 1837 load_16b \sr2, \src, \s_strd, v6, v7 1838 uxtl v21.8h, v6.8b 1839 uxtl v22.8h, v7.8b 1840 uxtl2 v28.8h, v6.16b 1841 uxtl2 v29.8h, v7.16b 1842 mul_mla_4 v1, v18, v19, v20, v21, .8h 1843 mul_mla_4 v3, v19, v20, v21, v22, .8h 1844 mul_mla_4 v2, v25, v26, v27, v28, .8h 1845 mul_mla_4 v4, v26, v27, v28, v29, .8h 1846 shift_store_16 \type, \d_strd, v1, v2, v3, v4 18470: 1848 ret 1849 1850L(\type\()_8tap_v_tbl): 1851 .hword L(\type\()_8tap_v_tbl) - 1280b 1852 .hword L(\type\()_8tap_v_tbl) - 640b 1853 .hword L(\type\()_8tap_v_tbl) - 320b 1854 .hword L(\type\()_8tap_v_tbl) - 160b 1855 .hword L(\type\()_8tap_v_tbl) - 80b 1856 .hword L(\type\()_8tap_v_tbl) - 40b 1857 .hword L(\type\()_8tap_v_tbl) - 20b 1858 .hword 0 1859 1860L(\type\()_8tap_hv): 1861 cmp \h, #4 1862 ubfx w9, \my, #7, #7 1863 and \my, \my, #0x7f 1864 b.le 4f 1865 mov \my, w9 18664: 1867 add \xmy, x10, \my, uxtw #3 1868 1869 adr x9, L(\type\()_8tap_hv_tbl) 1870 ldrh w8, [x9, x8, lsl #1] 1871 sub x9, x9, w8, uxtw 1872 br x9 1873 187420: 1875.ifc \type, put 1876 add \xmx, \xmx, #2 1877 ld1 {v0.s}[0], [\xmx] 1878 b.gt 280f 1879 add \xmy, \xmy, 
#2 1880 ld1 {v1.s}[0], [\xmy] 1881 1882 // 2x2, 2x4 hv 1883 sub \sr2, \src, #1 1884 sub \src, \sr2, \s_strd 1885 add \ds2, \dst, \d_strd 1886 lsl \s_strd, \s_strd, #1 1887 lsl \d_strd, \d_strd, #1 1888 sxtl v0.8h, v0.8b 1889 sxtl v1.8h, v1.8b 1890 mov x15, x30 1891 1892 ld1 {v28.8b}, [\src], \s_strd 1893 uxtl v28.8h, v28.8b 1894 ext v29.16b, v28.16b, v28.16b, #2 1895 mul v28.4h, v28.4h, v0.4h 1896 mul v29.4h, v29.4h, v0.4h 1897 addp v28.4h, v28.4h, v29.4h 1898 addp v16.4h, v28.4h, v28.4h 1899 srshr v16.4h, v16.4h, #2 1900 bl L(\type\()_8tap_filter_2) 1901 1902 trn1 v16.2s, v16.2s, v28.2s 1903 mov v17.8b, v28.8b 1904 19052: 1906 bl L(\type\()_8tap_filter_2) 1907 1908 ext v18.8b, v17.8b, v28.8b, #4 1909 smull v2.4s, v16.4h, v1.h[0] 1910 smlal v2.4s, v17.4h, v1.h[1] 1911 smlal v2.4s, v18.4h, v1.h[2] 1912 smlal v2.4s, v28.4h, v1.h[3] 1913 1914 sqrshrn v2.4h, v2.4s, #\shift_hv 1915 sqxtun v2.8b, v2.8h 1916 subs \h, \h, #2 1917 st1 {v2.h}[0], [\dst], \d_strd 1918 st1 {v2.h}[1], [\ds2], \d_strd 1919 b.le 0f 1920 mov v16.8b, v18.8b 1921 mov v17.8b, v28.8b 1922 b 2b 1923 1924280: // 2x8, 2x16, 2x32 hv 1925 ld1 {v1.8b}, [\xmy] 1926 sub \src, \src, #1 1927 sub \sr2, \src, \s_strd, lsl #1 1928 sub \src, \sr2, \s_strd 1929 add \ds2, \dst, \d_strd 1930 lsl \s_strd, \s_strd, #1 1931 lsl \d_strd, \d_strd, #1 1932 sxtl v0.8h, v0.8b 1933 sxtl v1.8h, v1.8b 1934 mov x15, x30 1935 1936 ld1 {v28.8b}, [\src], \s_strd 1937 uxtl v28.8h, v28.8b 1938 ext v29.16b, v28.16b, v28.16b, #2 1939 mul v28.4h, v28.4h, v0.4h 1940 mul v29.4h, v29.4h, v0.4h 1941 addp v28.4h, v28.4h, v29.4h 1942 addp v16.4h, v28.4h, v28.4h 1943 srshr v16.4h, v16.4h, #2 1944 1945 bl L(\type\()_8tap_filter_2) 1946 trn1 v16.2s, v16.2s, v28.2s 1947 mov v17.8b, v28.8b 1948 bl L(\type\()_8tap_filter_2) 1949 ext v18.8b, v17.8b, v28.8b, #4 1950 mov v19.8b, v28.8b 1951 bl L(\type\()_8tap_filter_2) 1952 ext v20.8b, v19.8b, v28.8b, #4 1953 mov v21.8b, v28.8b 1954 195528: 1956 bl L(\type\()_8tap_filter_2) 1957 ext v22.8b, v21.8b, 
v28.8b, #4 1958 smull v2.4s, v16.4h, v1.h[0] 1959 smlal v2.4s, v17.4h, v1.h[1] 1960 smlal v2.4s, v18.4h, v1.h[2] 1961 smlal v2.4s, v19.4h, v1.h[3] 1962 smlal v2.4s, v20.4h, v1.h[4] 1963 smlal v2.4s, v21.4h, v1.h[5] 1964 smlal v2.4s, v22.4h, v1.h[6] 1965 smlal v2.4s, v28.4h, v1.h[7] 1966 1967 sqrshrn v2.4h, v2.4s, #\shift_hv 1968 sqxtun v2.8b, v2.8h 1969 subs \h, \h, #2 1970 st1 {v2.h}[0], [\dst], \d_strd 1971 st1 {v2.h}[1], [\ds2], \d_strd 1972 b.le 0f 1973 mov v16.8b, v18.8b 1974 mov v17.8b, v19.8b 1975 mov v18.8b, v20.8b 1976 mov v19.8b, v21.8b 1977 mov v20.8b, v22.8b 1978 mov v21.8b, v28.8b 1979 b 28b 1980 19810: 1982 br x15 1983 1984L(\type\()_8tap_filter_2): 1985 ld1 {v28.8b}, [\sr2], \s_strd 1986 ld1 {v30.8b}, [\src], \s_strd 1987 uxtl v28.8h, v28.8b 1988 uxtl v30.8h, v30.8b 1989 ext v29.16b, v28.16b, v28.16b, #2 1990 ext v31.16b, v30.16b, v30.16b, #2 1991 trn1 v27.2s, v28.2s, v30.2s 1992 trn2 v30.2s, v28.2s, v30.2s 1993 trn1 v28.2s, v29.2s, v31.2s 1994 trn2 v31.2s, v29.2s, v31.2s 1995 mul v27.4h, v27.4h, v0.h[0] 1996 mla v27.4h, v28.4h, v0.h[1] 1997 mla v27.4h, v30.4h, v0.h[2] 1998 mla v27.4h, v31.4h, v0.h[3] 1999 srshr v28.4h, v27.4h, #2 2000 ret 2001.endif 2002 200340: 2004 add \xmx, \xmx, #2 2005 ld1 {v0.s}[0], [\xmx] 2006 b.gt 480f 2007 add \xmy, \xmy, #2 2008 ld1 {v1.s}[0], [\xmy] 2009 sub \sr2, \src, #1 2010 sub \src, \sr2, \s_strd 2011 add \ds2, \dst, \d_strd 2012 lsl \s_strd, \s_strd, #1 2013 lsl \d_strd, \d_strd, #1 2014 sxtl v0.8h, v0.8b 2015 sxtl v1.8h, v1.8b 2016 mov x15, x30 2017 2018 // 4x2, 4x4 hv 2019 ld1 {v26.8b}, [\src], \s_strd 2020 uxtl v26.8h, v26.8b 2021 ext v28.16b, v26.16b, v26.16b, #2 2022 ext v29.16b, v26.16b, v26.16b, #4 2023 ext v30.16b, v26.16b, v26.16b, #6 2024 mul v31.4h, v26.4h, v0.h[0] 2025 mla v31.4h, v28.4h, v0.h[1] 2026 mla v31.4h, v29.4h, v0.h[2] 2027 mla v31.4h, v30.4h, v0.h[3] 2028 srshr v16.4h, v31.4h, #2 2029 2030 bl L(\type\()_8tap_filter_4) 2031 mov v17.8b, v28.8b 2032 mov v18.8b, v29.8b 2033 20344: 2035 bl 
L(\type\()_8tap_filter_4) 2036 // Interleaving the mul/mla chains actually hurts performance 2037 // significantly on Cortex A53, thus keeping mul/mla tightly 2038 // chained like this. 2039 smull v2.4s, v16.4h, v1.h[0] 2040 smlal v2.4s, v17.4h, v1.h[1] 2041 smlal v2.4s, v18.4h, v1.h[2] 2042 smlal v2.4s, v28.4h, v1.h[3] 2043 smull v3.4s, v17.4h, v1.h[0] 2044 smlal v3.4s, v18.4h, v1.h[1] 2045 smlal v3.4s, v28.4h, v1.h[2] 2046 smlal v3.4s, v29.4h, v1.h[3] 2047 sqrshrn v2.4h, v2.4s, #\shift_hv 2048 sqrshrn v3.4h, v3.4s, #\shift_hv 2049 subs \h, \h, #2 2050.ifc \type, put 2051 sqxtun v2.8b, v2.8h 2052 sqxtun v3.8b, v3.8h 2053 st1 {v2.s}[0], [\dst], \d_strd 2054 st1 {v3.s}[0], [\ds2], \d_strd 2055.else 2056 st1 {v2.4h}, [\dst], \d_strd 2057 st1 {v3.4h}, [\ds2], \d_strd 2058.endif 2059 b.le 0f 2060 mov v16.8b, v18.8b 2061 mov v17.8b, v28.8b 2062 mov v18.8b, v29.8b 2063 b 4b 2064 2065480: // 4x8, 4x16, 4x32 hv 2066 ld1 {v1.8b}, [\xmy] 2067 sub \src, \src, #1 2068 sub \sr2, \src, \s_strd, lsl #1 2069 sub \src, \sr2, \s_strd 2070 add \ds2, \dst, \d_strd 2071 lsl \s_strd, \s_strd, #1 2072 lsl \d_strd, \d_strd, #1 2073 sxtl v0.8h, v0.8b 2074 sxtl v1.8h, v1.8b 2075 mov x15, x30 2076 2077 ld1 {v26.8b}, [\src], \s_strd 2078 uxtl v26.8h, v26.8b 2079 ext v28.16b, v26.16b, v26.16b, #2 2080 ext v29.16b, v26.16b, v26.16b, #4 2081 ext v30.16b, v26.16b, v26.16b, #6 2082 mul v31.4h, v26.4h, v0.h[0] 2083 mla v31.4h, v28.4h, v0.h[1] 2084 mla v31.4h, v29.4h, v0.h[2] 2085 mla v31.4h, v30.4h, v0.h[3] 2086 srshr v16.4h, v31.4h, #2 2087 2088 bl L(\type\()_8tap_filter_4) 2089 mov v17.8b, v28.8b 2090 mov v18.8b, v29.8b 2091 bl L(\type\()_8tap_filter_4) 2092 mov v19.8b, v28.8b 2093 mov v20.8b, v29.8b 2094 bl L(\type\()_8tap_filter_4) 2095 mov v21.8b, v28.8b 2096 mov v22.8b, v29.8b 2097 209848: 2099 bl L(\type\()_8tap_filter_4) 2100 smull v2.4s, v16.4h, v1.h[0] 2101 smlal v2.4s, v17.4h, v1.h[1] 2102 smlal v2.4s, v18.4h, v1.h[2] 2103 smlal v2.4s, v19.4h, v1.h[3] 2104 smlal v2.4s, v20.4h, v1.h[4] 
2105 smlal v2.4s, v21.4h, v1.h[5] 2106 smlal v2.4s, v22.4h, v1.h[6] 2107 smlal v2.4s, v28.4h, v1.h[7] 2108 smull v3.4s, v17.4h, v1.h[0] 2109 smlal v3.4s, v18.4h, v1.h[1] 2110 smlal v3.4s, v19.4h, v1.h[2] 2111 smlal v3.4s, v20.4h, v1.h[3] 2112 smlal v3.4s, v21.4h, v1.h[4] 2113 smlal v3.4s, v22.4h, v1.h[5] 2114 smlal v3.4s, v28.4h, v1.h[6] 2115 smlal v3.4s, v29.4h, v1.h[7] 2116 sqrshrn v2.4h, v2.4s, #\shift_hv 2117 sqrshrn v3.4h, v3.4s, #\shift_hv 2118 subs \h, \h, #2 2119.ifc \type, put 2120 sqxtun v2.8b, v2.8h 2121 sqxtun v3.8b, v3.8h 2122 st1 {v2.s}[0], [\dst], \d_strd 2123 st1 {v3.s}[0], [\ds2], \d_strd 2124.else 2125 st1 {v2.4h}, [\dst], \d_strd 2126 st1 {v3.4h}, [\ds2], \d_strd 2127.endif 2128 b.le 0f 2129 mov v16.8b, v18.8b 2130 mov v17.8b, v19.8b 2131 mov v18.8b, v20.8b 2132 mov v19.8b, v21.8b 2133 mov v20.8b, v22.8b 2134 mov v21.8b, v28.8b 2135 mov v22.8b, v29.8b 2136 b 48b 21370: 2138 br x15 2139 2140L(\type\()_8tap_filter_4): 2141 ld1 {v26.8b}, [\sr2], \s_strd 2142 ld1 {v27.8b}, [\src], \s_strd 2143 uxtl v26.8h, v26.8b 2144 uxtl v27.8h, v27.8b 2145 ext v28.16b, v26.16b, v26.16b, #2 2146 ext v29.16b, v26.16b, v26.16b, #4 2147 ext v30.16b, v26.16b, v26.16b, #6 2148 mul v31.4h, v26.4h, v0.h[0] 2149 mla v31.4h, v28.4h, v0.h[1] 2150 mla v31.4h, v29.4h, v0.h[2] 2151 mla v31.4h, v30.4h, v0.h[3] 2152 ext v28.16b, v27.16b, v27.16b, #2 2153 ext v29.16b, v27.16b, v27.16b, #4 2154 ext v30.16b, v27.16b, v27.16b, #6 2155 mul v27.4h, v27.4h, v0.h[0] 2156 mla v27.4h, v28.4h, v0.h[1] 2157 mla v27.4h, v29.4h, v0.h[2] 2158 mla v27.4h, v30.4h, v0.h[3] 2159 srshr v28.4h, v31.4h, #2 2160 srshr v29.4h, v27.4h, #2 2161 ret 2162 216380: 2164160: 2165320: 2166 b.gt 880f 2167 add \xmy, \xmy, #2 2168 ld1 {v0.8b}, [\xmx] 2169 ld1 {v1.s}[0], [\xmy] 2170 sub \src, \src, #3 2171 sub \src, \src, \s_strd 2172 sxtl v0.8h, v0.8b 2173 sxtl v1.8h, v1.8b 2174 mov x15, x30 2175 mov \my, \h 2176 2177164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv 2178 add \ds2, \dst, \d_strd 2179 add \sr2, \src, 
\s_strd 2180 lsl \d_strd, \d_strd, #1 2181 lsl \s_strd, \s_strd, #1 2182 2183 ld1 {v28.8b, v29.8b}, [\src], \s_strd 2184 uxtl v28.8h, v28.8b 2185 uxtl v29.8h, v29.8b 2186 mul v24.8h, v28.8h, v0.h[0] 2187.irpc i, 1234567 2188 ext v26.16b, v28.16b, v29.16b, #(2*\i) 2189 mla v24.8h, v26.8h, v0.h[\i] 2190.endr 2191 srshr v16.8h, v24.8h, #2 2192 2193 bl L(\type\()_8tap_filter_8) 2194 mov v17.16b, v24.16b 2195 mov v18.16b, v25.16b 2196 21978: 2198 smull v2.4s, v16.4h, v1.h[0] 2199 smull2 v3.4s, v16.8h, v1.h[0] 2200 bl L(\type\()_8tap_filter_8) 2201 smull v4.4s, v17.4h, v1.h[0] 2202 smull2 v5.4s, v17.8h, v1.h[0] 2203 smlal v2.4s, v17.4h, v1.h[1] 2204 smlal2 v3.4s, v17.8h, v1.h[1] 2205 smlal v4.4s, v18.4h, v1.h[1] 2206 smlal2 v5.4s, v18.8h, v1.h[1] 2207 smlal v2.4s, v18.4h, v1.h[2] 2208 smlal2 v3.4s, v18.8h, v1.h[2] 2209 smlal v4.4s, v24.4h, v1.h[2] 2210 smlal2 v5.4s, v24.8h, v1.h[2] 2211 smlal v2.4s, v24.4h, v1.h[3] 2212 smlal2 v3.4s, v24.8h, v1.h[3] 2213 smlal v4.4s, v25.4h, v1.h[3] 2214 smlal2 v5.4s, v25.8h, v1.h[3] 2215 sqrshrn v2.4h, v2.4s, #\shift_hv 2216 sqrshrn2 v2.8h, v3.4s, #\shift_hv 2217 sqrshrn v4.4h, v4.4s, #\shift_hv 2218 sqrshrn2 v4.8h, v5.4s, #\shift_hv 2219 subs \h, \h, #2 2220.ifc \type, put 2221 sqxtun v2.8b, v2.8h 2222 sqxtun v4.8b, v4.8h 2223 st1 {v2.8b}, [\dst], \d_strd 2224 st1 {v4.8b}, [\ds2], \d_strd 2225.else 2226 st1 {v2.8h}, [\dst], \d_strd 2227 st1 {v4.8h}, [\ds2], \d_strd 2228.endif 2229 b.le 9f 2230 mov v16.16b, v18.16b 2231 mov v17.16b, v24.16b 2232 mov v18.16b, v25.16b 2233 b 8b 22349: 2235 subs \w, \w, #8 2236 b.le 0f 2237 asr \s_strd, \s_strd, #1 2238 asr \d_strd, \d_strd, #1 2239 msub \src, \s_strd, \xmy, \src 2240 msub \dst, \d_strd, \xmy, \dst 2241 sub \src, \src, \s_strd, lsl #2 2242 mov \h, \my 2243 add \src, \src, #8 2244.ifc \type, put 2245 add \dst, \dst, #8 2246.else 2247 add \dst, \dst, #16 2248.endif 2249 b 164b 2250 2251880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... 
hv 2252640: 22531280: 2254 ld1 {v0.8b}, [\xmx] 2255 ld1 {v1.8b}, [\xmy] 2256 sub \src, \src, #3 2257 sub \src, \src, \s_strd 2258 sub \src, \src, \s_strd, lsl #1 2259 sxtl v0.8h, v0.8b 2260 sxtl v1.8h, v1.8b 2261 mov x15, x30 2262 mov \my, \h 2263 2264168: 2265 add \ds2, \dst, \d_strd 2266 add \sr2, \src, \s_strd 2267 lsl \d_strd, \d_strd, #1 2268 lsl \s_strd, \s_strd, #1 2269 2270 ld1 {v28.8b, v29.8b}, [\src], \s_strd 2271 uxtl v28.8h, v28.8b 2272 uxtl v29.8h, v29.8b 2273 mul v24.8h, v28.8h, v0.h[0] 2274.irpc i, 1234567 2275 ext v26.16b, v28.16b, v29.16b, #(2*\i) 2276 mla v24.8h, v26.8h, v0.h[\i] 2277.endr 2278 srshr v16.8h, v24.8h, #2 2279 2280 bl L(\type\()_8tap_filter_8) 2281 mov v17.16b, v24.16b 2282 mov v18.16b, v25.16b 2283 bl L(\type\()_8tap_filter_8) 2284 mov v19.16b, v24.16b 2285 mov v20.16b, v25.16b 2286 bl L(\type\()_8tap_filter_8) 2287 mov v21.16b, v24.16b 2288 mov v22.16b, v25.16b 2289 229088: 2291 smull v2.4s, v16.4h, v1.h[0] 2292 smull2 v3.4s, v16.8h, v1.h[0] 2293 bl L(\type\()_8tap_filter_8) 2294 smull v4.4s, v17.4h, v1.h[0] 2295 smull2 v5.4s, v17.8h, v1.h[0] 2296 smlal v2.4s, v17.4h, v1.h[1] 2297 smlal2 v3.4s, v17.8h, v1.h[1] 2298 smlal v4.4s, v18.4h, v1.h[1] 2299 smlal2 v5.4s, v18.8h, v1.h[1] 2300 smlal v2.4s, v18.4h, v1.h[2] 2301 smlal2 v3.4s, v18.8h, v1.h[2] 2302 smlal v4.4s, v19.4h, v1.h[2] 2303 smlal2 v5.4s, v19.8h, v1.h[2] 2304 smlal v2.4s, v19.4h, v1.h[3] 2305 smlal2 v3.4s, v19.8h, v1.h[3] 2306 smlal v4.4s, v20.4h, v1.h[3] 2307 smlal2 v5.4s, v20.8h, v1.h[3] 2308 smlal v2.4s, v20.4h, v1.h[4] 2309 smlal2 v3.4s, v20.8h, v1.h[4] 2310 smlal v4.4s, v21.4h, v1.h[4] 2311 smlal2 v5.4s, v21.8h, v1.h[4] 2312 smlal v2.4s, v21.4h, v1.h[5] 2313 smlal2 v3.4s, v21.8h, v1.h[5] 2314 smlal v4.4s, v22.4h, v1.h[5] 2315 smlal2 v5.4s, v22.8h, v1.h[5] 2316 smlal v2.4s, v22.4h, v1.h[6] 2317 smlal2 v3.4s, v22.8h, v1.h[6] 2318 smlal v4.4s, v24.4h, v1.h[6] 2319 smlal2 v5.4s, v24.8h, v1.h[6] 2320 smlal v2.4s, v24.4h, v1.h[7] 2321 smlal2 v3.4s, v24.8h, v1.h[7] 2322 
smlal v4.4s, v25.4h, v1.h[7] 2323 smlal2 v5.4s, v25.8h, v1.h[7] 2324 sqrshrn v2.4h, v2.4s, #\shift_hv 2325 sqrshrn2 v2.8h, v3.4s, #\shift_hv 2326 sqrshrn v4.4h, v4.4s, #\shift_hv 2327 sqrshrn2 v4.8h, v5.4s, #\shift_hv 2328 subs \h, \h, #2 2329.ifc \type, put 2330 sqxtun v2.8b, v2.8h 2331 sqxtun v4.8b, v4.8h 2332 st1 {v2.8b}, [\dst], \d_strd 2333 st1 {v4.8b}, [\ds2], \d_strd 2334.else 2335 st1 {v2.8h}, [\dst], \d_strd 2336 st1 {v4.8h}, [\ds2], \d_strd 2337.endif 2338 b.le 9f 2339 mov v16.16b, v18.16b 2340 mov v17.16b, v19.16b 2341 mov v18.16b, v20.16b 2342 mov v19.16b, v21.16b 2343 mov v20.16b, v22.16b 2344 mov v21.16b, v24.16b 2345 mov v22.16b, v25.16b 2346 b 88b 23479: 2348 subs \w, \w, #8 2349 b.le 0f 2350 asr \s_strd, \s_strd, #1 2351 asr \d_strd, \d_strd, #1 2352 msub \src, \s_strd, \xmy, \src 2353 msub \dst, \d_strd, \xmy, \dst 2354 sub \src, \src, \s_strd, lsl #3 2355 mov \h, \my 2356 add \src, \src, #8 2357.ifc \type, put 2358 add \dst, \dst, #8 2359.else 2360 add \dst, \dst, #16 2361.endif 2362 b 168b 23630: 2364 br x15 2365 2366L(\type\()_8tap_filter_8): 2367 ld1 {v28.8b, v29.8b}, [\sr2], \s_strd 2368 ld1 {v30.8b, v31.8b}, [\src], \s_strd 2369 uxtl v28.8h, v28.8b 2370 uxtl v29.8h, v29.8b 2371 uxtl v30.8h, v30.8b 2372 uxtl v31.8h, v31.8b 2373 mul v24.8h, v28.8h, v0.h[0] 2374 mul v25.8h, v30.8h, v0.h[0] 2375.irpc i, 1234567 2376 ext v26.16b, v28.16b, v29.16b, #(2*\i) 2377 ext v27.16b, v30.16b, v31.16b, #(2*\i) 2378 mla v24.8h, v26.8h, v0.h[\i] 2379 mla v25.8h, v27.8h, v0.h[\i] 2380.endr 2381 srshr v24.8h, v24.8h, #2 2382 srshr v25.8h, v25.8h, #2 2383 ret 2384 2385L(\type\()_8tap_hv_tbl): 2386 .hword L(\type\()_8tap_hv_tbl) - 1280b 2387 .hword L(\type\()_8tap_hv_tbl) - 640b 2388 .hword L(\type\()_8tap_hv_tbl) - 320b 2389 .hword L(\type\()_8tap_hv_tbl) - 160b 2390 .hword L(\type\()_8tap_hv_tbl) - 80b 2391 .hword L(\type\()_8tap_hv_tbl) - 40b 2392 .hword L(\type\()_8tap_hv_tbl) - 20b 2393 .hword 0 2394endfunc 2395 2396 2397function \type\()_bilin_8bpc_neon, 
export=1 2398 dup v1.16b, \mx 2399 dup v3.16b, \my 2400 mov w9, #16 2401 sub w8, w9, \mx 2402 sub w9, w9, \my 2403 dup v0.16b, w8 2404 dup v2.16b, w9 2405.ifc \type, prep 2406 uxtw \d_strd, \w 2407 lsl \d_strd, \d_strd, #1 2408.endif 2409 2410 clz w8, \w 2411 sub w8, w8, #24 2412 cbnz \mx, L(\type\()_bilin_h) 2413 cbnz \my, L(\type\()_bilin_v) 2414 b \type\()_neon 2415 2416L(\type\()_bilin_h): 2417 cbnz \my, L(\type\()_bilin_hv) 2418 2419 adr x9, L(\type\()_bilin_h_tbl) 2420 ldrh w8, [x9, x8, lsl #1] 2421 sub x9, x9, w8, uxtw 2422 br x9 2423 242420: // 2xN h 2425.ifc \type, put 2426 add \ds2, \dst, \d_strd 2427 add \sr2, \src, \s_strd 2428 lsl \d_strd, \d_strd, #1 2429 lsl \s_strd, \s_strd, #1 24302: 2431 ld1 {v4.s}[0], [\src], \s_strd 2432 ld1 {v6.s}[0], [\sr2], \s_strd 2433 ext v5.8b, v4.8b, v4.8b, #1 2434 ext v7.8b, v6.8b, v6.8b, #1 2435 trn1 v4.4h, v4.4h, v6.4h 2436 trn1 v5.4h, v5.4h, v7.4h 2437 subs \h, \h, #2 2438 umull v4.8h, v4.8b, v0.8b 2439 umlal v4.8h, v5.8b, v1.8b 2440 uqrshrn v4.8b, v4.8h, #4 2441 st1 {v4.h}[0], [\dst], \d_strd 2442 st1 {v4.h}[1], [\ds2], \d_strd 2443 b.gt 2b 2444 ret 2445.endif 2446 244740: // 4xN h 2448 add \ds2, \dst, \d_strd 2449 add \sr2, \src, \s_strd 2450 lsl \d_strd, \d_strd, #1 2451 lsl \s_strd, \s_strd, #1 24524: 2453 ld1 {v4.8b}, [\src], \s_strd 2454 ld1 {v6.8b}, [\sr2], \s_strd 2455 ext v5.8b, v4.8b, v4.8b, #1 2456 ext v7.8b, v6.8b, v6.8b, #1 2457 trn1 v4.2s, v4.2s, v6.2s 2458 trn1 v5.2s, v5.2s, v7.2s 2459 subs \h, \h, #2 2460 umull v4.8h, v4.8b, v0.8b 2461 umlal v4.8h, v5.8b, v1.8b 2462.ifc \type, put 2463 uqrshrn v4.8b, v4.8h, #4 2464 st1 {v4.s}[0], [\dst], \d_strd 2465 st1 {v4.s}[1], [\ds2], \d_strd 2466.else 2467 st1 {v4.d}[0], [\dst], \d_strd 2468 st1 {v4.d}[1], [\ds2], \d_strd 2469.endif 2470 b.gt 4b 2471 ret 2472 247380: // 8xN h 2474 add \ds2, \dst, \d_strd 2475 add \sr2, \src, \s_strd 2476 lsl \d_strd, \d_strd, #1 2477 lsl \s_strd, \s_strd, #1 24788: 2479 ld1 {v4.16b}, [\src], \s_strd 2480 ld1 {v6.16b}, [\sr2], 
\s_strd 2481 ext v5.16b, v4.16b, v4.16b, #1 2482 ext v7.16b, v6.16b, v6.16b, #1 2483 subs \h, \h, #2 2484 umull v4.8h, v4.8b, v0.8b 2485 umull v6.8h, v6.8b, v0.8b 2486 umlal v4.8h, v5.8b, v1.8b 2487 umlal v6.8h, v7.8b, v1.8b 2488.ifc \type, put 2489 uqrshrn v4.8b, v4.8h, #4 2490 uqrshrn v6.8b, v6.8h, #4 2491 st1 {v4.8b}, [\dst], \d_strd 2492 st1 {v6.8b}, [\ds2], \d_strd 2493.else 2494 st1 {v4.8h}, [\dst], \d_strd 2495 st1 {v6.8h}, [\ds2], \d_strd 2496.endif 2497 b.gt 8b 2498 ret 2499160: 2500320: 2501640: 25021280: // 16xN, 32xN, ... h 2503 add \ds2, \dst, \d_strd 2504 add \sr2, \src, \s_strd 2505 lsl \s_strd, \s_strd, #1 2506 2507 sub \s_strd, \s_strd, \w, uxtw 2508 sub \s_strd, \s_strd, #8 2509.ifc \type, put 2510 lsl \d_strd, \d_strd, #1 2511 sub \d_strd, \d_strd, \w, uxtw 2512.endif 2513161: 2514 ld1 {v16.d}[1], [\src], #8 2515 ld1 {v20.d}[1], [\sr2], #8 2516 mov \mx, \w 2517 251816: 2519 ld1 {v18.16b}, [\src], #16 2520 ld1 {v22.16b}, [\sr2], #16 2521 ext v17.16b, v16.16b, v18.16b, #8 2522 ext v19.16b, v16.16b, v18.16b, #9 2523 ext v21.16b, v20.16b, v22.16b, #8 2524 ext v23.16b, v20.16b, v22.16b, #9 2525 umull v16.8h, v17.8b, v0.8b 2526 umull2 v17.8h, v17.16b, v0.16b 2527 umull v20.8h, v21.8b, v0.8b 2528 umull2 v21.8h, v21.16b, v0.16b 2529 umlal v16.8h, v19.8b, v1.8b 2530 umlal2 v17.8h, v19.16b, v1.16b 2531 umlal v20.8h, v23.8b, v1.8b 2532 umlal2 v21.8h, v23.16b, v1.16b 2533 subs \mx, \mx, #16 2534.ifc \type, put 2535 uqrshrn v16.8b, v16.8h, #4 2536 uqrshrn2 v16.16b, v17.8h, #4 2537 uqrshrn v20.8b, v20.8h, #4 2538 uqrshrn2 v20.16b, v21.8h, #4 2539 st1 {v16.16b}, [\dst], #16 2540 st1 {v20.16b}, [\ds2], #16 2541.else 2542 st1 {v16.8h, v17.8h}, [\dst], #32 2543 st1 {v20.8h, v21.8h}, [\ds2], #32 2544.endif 2545 b.le 9f 2546 2547 mov v16.16b, v18.16b 2548 mov v20.16b, v22.16b 2549 b 16b 2550 25519: 2552 add \dst, \dst, \d_strd 2553 add \ds2, \ds2, \d_strd 2554 add \src, \src, \s_strd 2555 add \sr2, \sr2, \s_strd 2556 2557 subs \h, \h, #2 2558 b.gt 161b 2559 ret 2560 
2561L(\type\()_bilin_h_tbl): 2562 .hword L(\type\()_bilin_h_tbl) - 1280b 2563 .hword L(\type\()_bilin_h_tbl) - 640b 2564 .hword L(\type\()_bilin_h_tbl) - 320b 2565 .hword L(\type\()_bilin_h_tbl) - 160b 2566 .hword L(\type\()_bilin_h_tbl) - 80b 2567 .hword L(\type\()_bilin_h_tbl) - 40b 2568 .hword L(\type\()_bilin_h_tbl) - 20b 2569 .hword 0 2570 2571 2572L(\type\()_bilin_v): 2573 cmp \h, #4 2574 adr x9, L(\type\()_bilin_v_tbl) 2575 ldrh w8, [x9, x8, lsl #1] 2576 sub x9, x9, w8, uxtw 2577 br x9 2578 257920: // 2xN v 2580.ifc \type, put 2581 cmp \h, #2 2582 add \ds2, \dst, \d_strd 2583 add \sr2, \src, \s_strd 2584 lsl \s_strd, \s_strd, #1 2585 lsl \d_strd, \d_strd, #1 2586 2587 // 2x2 v 2588 ld1 {v16.h}[0], [\src], \s_strd 2589 b.gt 24f 2590 ld1 {v17.h}[0], [\sr2], \s_strd 2591 ld1 {v18.h}[0], [\src], \s_strd 2592 trn1 v16.4h, v16.4h, v17.4h 2593 trn1 v17.4h, v17.4h, v18.4h 2594 umull v4.8h, v16.8b, v2.8b 2595 umlal v4.8h, v17.8b, v3.8b 2596 uqrshrn v4.8b, v4.8h, #4 2597 st1 {v4.h}[0], [\dst] 2598 st1 {v4.h}[1], [\ds2] 2599 ret 260024: // 2x4, 2x8, ... 
v 2601 ld1 {v17.h}[0], [\sr2], \s_strd 2602 ld1 {v18.h}[0], [\src], \s_strd 2603 ld1 {v19.h}[0], [\sr2], \s_strd 2604 ld1 {v20.h}[0], [\src], \s_strd 2605 trn1 v16.4h, v16.4h, v17.4h 2606 trn1 v17.4h, v17.4h, v18.4h 2607 trn1 v18.4h, v18.4h, v19.4h 2608 trn1 v19.4h, v19.4h, v20.4h 2609 trn1 v16.2s, v16.2s, v18.2s 2610 trn1 v17.2s, v17.2s, v19.2s 2611 umull v4.8h, v16.8b, v2.8b 2612 umlal v4.8h, v17.8b, v3.8b 2613 subs \h, \h, #4 2614 uqrshrn v4.8b, v4.8h, #4 2615 st1 {v4.h}[0], [\dst], \d_strd 2616 st1 {v4.h}[1], [\ds2], \d_strd 2617 st1 {v4.h}[2], [\dst], \d_strd 2618 st1 {v4.h}[3], [\ds2], \d_strd 2619 b.le 0f 2620 mov v16.8b, v20.8b 2621 b 24b 26220: 2623 ret 2624.endif 2625 262640: // 4xN v 2627 add \ds2, \dst, \d_strd 2628 add \sr2, \src, \s_strd 2629 lsl \s_strd, \s_strd, #1 2630 lsl \d_strd, \d_strd, #1 2631 ld1 {v16.s}[0], [\src], \s_strd 26324: 2633 ld1 {v17.s}[0], [\sr2], \s_strd 2634 ld1 {v18.s}[0], [\src], \s_strd 2635 trn1 v16.2s, v16.2s, v17.2s 2636 trn1 v17.2s, v17.2s, v18.2s 2637 umull v4.8h, v16.8b, v2.8b 2638 umlal v4.8h, v17.8b, v3.8b 2639 subs \h, \h, #2 2640.ifc \type, put 2641 uqrshrn v4.8b, v4.8h, #4 2642 st1 {v4.s}[0], [\dst], \d_strd 2643 st1 {v4.s}[1], [\ds2], \d_strd 2644.else 2645 st1 {v4.d}[0], [\dst], \d_strd 2646 st1 {v4.d}[1], [\ds2], \d_strd 2647.endif 2648 b.le 0f 2649 mov v16.8b, v18.8b 2650 b 4b 26510: 2652 ret 2653 265480: // 8xN v 2655 add \ds2, \dst, \d_strd 2656 add \sr2, \src, \s_strd 2657 lsl \s_strd, \s_strd, #1 2658 lsl \d_strd, \d_strd, #1 2659 ld1 {v16.8b}, [\src], \s_strd 26608: 2661 ld1 {v17.8b}, [\sr2], \s_strd 2662 ld1 {v18.8b}, [\src], \s_strd 2663 umull v4.8h, v16.8b, v2.8b 2664 umull v5.8h, v17.8b, v2.8b 2665 umlal v4.8h, v17.8b, v3.8b 2666 umlal v5.8h, v18.8b, v3.8b 2667 subs \h, \h, #2 2668.ifc \type, put 2669 uqrshrn v4.8b, v4.8h, #4 2670 uqrshrn v5.8b, v5.8h, #4 2671 st1 {v4.8b}, [\dst], \d_strd 2672 st1 {v5.8b}, [\ds2], \d_strd 2673.else 2674 st1 {v4.8h}, [\dst], \d_strd 2675 st1 {v5.8h}, [\ds2], \d_strd 
2676.endif 2677 b.le 0f 2678 mov v16.8b, v18.8b 2679 b 8b 26800: 2681 ret 2682 2683160: // 16xN, 32xN, ... 2684320: 2685640: 26861280: 2687 mov \my, \h 26881: 2689 add \ds2, \dst, \d_strd 2690 add \sr2, \src, \s_strd 2691 lsl \s_strd, \s_strd, #1 2692 lsl \d_strd, \d_strd, #1 2693 2694 ld1 {v16.16b}, [\src], \s_strd 26952: 2696 ld1 {v17.16b}, [\sr2], \s_strd 2697 ld1 {v18.16b}, [\src], \s_strd 2698 umull v4.8h, v16.8b, v2.8b 2699 umull2 v5.8h, v16.16b, v2.16b 2700 umull v6.8h, v17.8b, v2.8b 2701 umull2 v7.8h, v17.16b, v2.16b 2702 umlal v4.8h, v17.8b, v3.8b 2703 umlal2 v5.8h, v17.16b, v3.16b 2704 umlal v6.8h, v18.8b, v3.8b 2705 umlal2 v7.8h, v18.16b, v3.16b 2706 subs \h, \h, #2 2707.ifc \type, put 2708 uqrshrn v4.8b, v4.8h, #4 2709 uqrshrn2 v4.16b, v5.8h, #4 2710 uqrshrn v6.8b, v6.8h, #4 2711 uqrshrn2 v6.16b, v7.8h, #4 2712 st1 {v4.16b}, [\dst], \d_strd 2713 st1 {v6.16b}, [\ds2], \d_strd 2714.else 2715 st1 {v4.8h, v5.8h}, [\dst], \d_strd 2716 st1 {v6.8h, v7.8h}, [\ds2], \d_strd 2717.endif 2718 b.le 9f 2719 mov v16.16b, v18.16b 2720 b 2b 27219: 2722 subs \w, \w, #16 2723 b.le 0f 2724 asr \s_strd, \s_strd, #1 2725 asr \d_strd, \d_strd, #1 2726 msub \src, \s_strd, \xmy, \src 2727 msub \dst, \d_strd, \xmy, \dst 2728 sub \src, \src, \s_strd, lsl #1 2729 mov \h, \my 2730 add \src, \src, #16 2731.ifc \type, put 2732 add \dst, \dst, #16 2733.else 2734 add \dst, \dst, #32 2735.endif 2736 b 1b 27370: 2738 ret 2739 2740L(\type\()_bilin_v_tbl): 2741 .hword L(\type\()_bilin_v_tbl) - 1280b 2742 .hword L(\type\()_bilin_v_tbl) - 640b 2743 .hword L(\type\()_bilin_v_tbl) - 320b 2744 .hword L(\type\()_bilin_v_tbl) - 160b 2745 .hword L(\type\()_bilin_v_tbl) - 80b 2746 .hword L(\type\()_bilin_v_tbl) - 40b 2747 .hword L(\type\()_bilin_v_tbl) - 20b 2748 .hword 0 2749 2750L(\type\()_bilin_hv): 2751 uxtl v2.8h, v2.8b 2752 uxtl v3.8h, v3.8b 2753 adr x9, L(\type\()_bilin_hv_tbl) 2754 ldrh w8, [x9, x8, lsl #1] 2755 sub x9, x9, w8, uxtw 2756 br x9 2757 275820: // 2xN hv 2759.ifc \type, put 2760 
add \sr2, \src, \s_strd 2761 add \ds2, \dst, \d_strd 2762 lsl \s_strd, \s_strd, #1 2763 lsl \d_strd, \d_strd, #1 2764 2765 ld1 {v28.s}[0], [\src], \s_strd 2766 ext v29.8b, v28.8b, v28.8b, #1 2767 umull v16.8h, v28.8b, v0.8b 2768 umlal v16.8h, v29.8b, v1.8b 2769 27702: 2771 ld1 {v28.s}[0], [\sr2], \s_strd 2772 ld1 {v30.s}[0], [\src], \s_strd 2773 ext v29.8b, v28.8b, v28.8b, #1 2774 ext v31.8b, v30.8b, v30.8b, #1 2775 trn1 v28.4h, v28.4h, v30.4h 2776 trn1 v29.4h, v29.4h, v31.4h 2777 umull v17.8h, v28.8b, v0.8b 2778 umlal v17.8h, v29.8b, v1.8b 2779 2780 trn1 v16.2s, v16.2s, v17.2s 2781 2782 mul v4.4h, v16.4h, v2.4h 2783 mla v4.4h, v17.4h, v3.4h 2784 uqrshrn v4.8b, v4.8h, #8 2785 subs \h, \h, #2 2786 st1 {v4.h}[0], [\dst], \d_strd 2787 st1 {v4.h}[1], [\ds2], \d_strd 2788 b.le 0f 2789 trn2 v16.2s, v17.2s, v17.2s 2790 b 2b 27910: 2792 ret 2793.endif 2794 279540: // 4xN hv 2796 add \sr2, \src, \s_strd 2797 add \ds2, \dst, \d_strd 2798 lsl \s_strd, \s_strd, #1 2799 lsl \d_strd, \d_strd, #1 2800 2801 ld1 {v28.8b}, [\src], \s_strd 2802 ext v29.8b, v28.8b, v28.8b, #1 2803 umull v16.8h, v28.8b, v0.8b 2804 umlal v16.8h, v29.8b, v1.8b 2805 28064: 2807 ld1 {v28.8b}, [\sr2], \s_strd 2808 ld1 {v30.8b}, [\src], \s_strd 2809 ext v29.8b, v28.8b, v28.8b, #1 2810 ext v31.8b, v30.8b, v30.8b, #1 2811 trn1 v28.2s, v28.2s, v30.2s 2812 trn1 v29.2s, v29.2s, v31.2s 2813 umull v17.8h, v28.8b, v0.8b 2814 umlal v17.8h, v29.8b, v1.8b 2815 2816 trn1 v16.2d, v16.2d, v17.2d 2817 2818 mul v4.8h, v16.8h, v2.8h 2819 mla v4.8h, v17.8h, v3.8h 2820 subs \h, \h, #2 2821.ifc \type, put 2822 uqrshrn v4.8b, v4.8h, #8 2823 st1 {v4.s}[0], [\dst], \d_strd 2824 st1 {v4.s}[1], [\ds2], \d_strd 2825.else 2826 urshr v4.8h, v4.8h, #4 2827 st1 {v4.d}[0], [\dst], \d_strd 2828 st1 {v4.d}[1], [\ds2], \d_strd 2829.endif 2830 b.le 0f 2831 trn2 v16.2d, v17.2d, v17.2d 2832 b 4b 28330: 2834 ret 2835 283680: // 8xN, 16xN, ... 
hv 2837160: 2838320: 2839640: 28401280: 2841 mov \my, \h 2842 28431: 2844 add \sr2, \src, \s_strd 2845 add \ds2, \dst, \d_strd 2846 lsl \s_strd, \s_strd, #1 2847 lsl \d_strd, \d_strd, #1 2848 2849 ld1 {v28.16b}, [\src], \s_strd 2850 ext v29.16b, v28.16b, v28.16b, #1 2851 umull v16.8h, v28.8b, v0.8b 2852 umlal v16.8h, v29.8b, v1.8b 2853 28542: 2855 ld1 {v28.16b}, [\sr2], \s_strd 2856 ld1 {v30.16b}, [\src], \s_strd 2857 ext v29.16b, v28.16b, v28.16b, #1 2858 ext v31.16b, v30.16b, v30.16b, #1 2859 umull v17.8h, v28.8b, v0.8b 2860 umlal v17.8h, v29.8b, v1.8b 2861 umull v18.8h, v30.8b, v0.8b 2862 umlal v18.8h, v31.8b, v1.8b 2863 2864 mul v4.8h, v16.8h, v2.8h 2865 mla v4.8h, v17.8h, v3.8h 2866 mul v5.8h, v17.8h, v2.8h 2867 mla v5.8h, v18.8h, v3.8h 2868 subs \h, \h, #2 2869.ifc \type, put 2870 uqrshrn v4.8b, v4.8h, #8 2871 uqrshrn v5.8b, v5.8h, #8 2872 st1 {v4.8b}, [\dst], \d_strd 2873 st1 {v5.8b}, [\ds2], \d_strd 2874.else 2875 urshr v4.8h, v4.8h, #4 2876 urshr v5.8h, v5.8h, #4 2877 st1 {v4.8h}, [\dst], \d_strd 2878 st1 {v5.8h}, [\ds2], \d_strd 2879.endif 2880 b.le 9f 2881 mov v16.16b, v18.16b 2882 b 2b 28839: 2884 subs \w, \w, #8 2885 b.le 0f 2886 asr \s_strd, \s_strd, #1 2887 asr \d_strd, \d_strd, #1 2888 msub \src, \s_strd, \xmy, \src 2889 msub \dst, \d_strd, \xmy, \dst 2890 sub \src, \src, \s_strd, lsl #1 2891 mov \h, \my 2892 add \src, \src, #8 2893.ifc \type, put 2894 add \dst, \dst, #8 2895.else 2896 add \dst, \dst, #16 2897.endif 2898 b 1b 28990: 2900 ret 2901 2902L(\type\()_bilin_hv_tbl): 2903 .hword L(\type\()_bilin_hv_tbl) - 1280b 2904 .hword L(\type\()_bilin_hv_tbl) - 640b 2905 .hword L(\type\()_bilin_hv_tbl) - 320b 2906 .hword L(\type\()_bilin_hv_tbl) - 160b 2907 .hword L(\type\()_bilin_hv_tbl) - 80b 2908 .hword L(\type\()_bilin_hv_tbl) - 40b 2909 .hword L(\type\()_bilin_hv_tbl) - 20b 2910 .hword 0 2911endfunc 2912.endm 2913 2914filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10 2915filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, 
x9, 6 2916 2917.macro load_filter_row dst, src, inc 2918 asr w13, \src, #10 2919 ldr \dst, [x11, w13, sxtw #3] 2920 add \src, \src, \inc 2921.endm 2922 2923function warp_filter_horz_neon 2924 add w12, w5, #512 2925 2926 ld1 {v16.8b, v17.8b}, [x2], x3 2927 2928 load_filter_row d0, w12, w7 2929 uxtl v16.8h, v16.8b 2930 load_filter_row d1, w12, w7 2931 uxtl v17.8h, v17.8b 2932 load_filter_row d2, w12, w7 2933 sxtl v0.8h, v0.8b 2934 load_filter_row d3, w12, w7 2935 sxtl v1.8h, v1.8b 2936 load_filter_row d4, w12, w7 2937 sxtl v2.8h, v2.8b 2938 load_filter_row d5, w12, w7 2939 sxtl v3.8h, v3.8b 2940 load_filter_row d6, w12, w7 2941 sxtl v4.8h, v4.8b 2942 load_filter_row d7, w12, w7 2943 sxtl v5.8h, v5.8b 2944 ext v18.16b, v16.16b, v17.16b, #2*1 2945 mul v23.8h, v16.8h, v0.8h 2946 sxtl v6.8h, v6.8b 2947 ext v19.16b, v16.16b, v17.16b, #2*2 2948 mul v18.8h, v18.8h, v1.8h 2949 sxtl v7.8h, v7.8b 2950 ext v20.16b, v16.16b, v17.16b, #2*3 2951 mul v19.8h, v19.8h, v2.8h 2952 ext v21.16b, v16.16b, v17.16b, #2*4 2953 saddlp v23.4s, v23.8h 2954 mul v20.8h, v20.8h, v3.8h 2955 ext v22.16b, v16.16b, v17.16b, #2*5 2956 saddlp v18.4s, v18.8h 2957 mul v21.8h, v21.8h, v4.8h 2958 saddlp v19.4s, v19.8h 2959 mul v22.8h, v22.8h, v5.8h 2960 saddlp v20.4s, v20.8h 2961 saddlp v21.4s, v21.8h 2962 saddlp v22.4s, v22.8h 2963 addp v18.4s, v23.4s, v18.4s 2964 ext v23.16b, v16.16b, v17.16b, #2*6 2965 addp v19.4s, v19.4s, v20.4s 2966 mul v23.8h, v23.8h, v6.8h 2967 ext v20.16b, v16.16b, v17.16b, #2*7 2968 mul v20.8h, v20.8h, v7.8h 2969 saddlp v23.4s, v23.8h 2970 addp v21.4s, v21.4s, v22.4s 2971 saddlp v20.4s, v20.8h 2972 addp v20.4s, v23.4s, v20.4s 2973 addp v18.4s, v18.4s, v19.4s 2974 addp v20.4s, v21.4s, v20.4s 2975 2976 add w5, w5, w8 2977 2978 rshrn v16.4h, v18.4s, #3 2979 rshrn2 v16.8h, v20.4s, #3 2980 2981 ret 2982endfunc 2983 2984// void dav1d_warp_affine_8x8_8bpc_neon( 2985// pixel *dst, const ptrdiff_t dst_stride, 2986// const pixel *src, const ptrdiff_t src_stride, 2987// const int16_t *const 
abcd, int mx, int my) 2988.macro warp t, shift 2989function warp_affine_8x8\t\()_8bpc_neon, export=1 2990 ldr x4, [x4] 2991 sbfx x7, x4, #0, #16 2992 sbfx x8, x4, #16, #16 2993 sbfx x9, x4, #32, #16 2994 sbfx x4, x4, #48, #16 2995 mov w10, #8 2996 sub x2, x2, x3, lsl #1 2997 sub x2, x2, x3 2998 sub x2, x2, #3 2999 movrel x11, X(mc_warp_filter), 64*8 3000 mov x15, x30 3001.ifnb \t 3002 lsl x1, x1, #1 3003.endif 3004 3005 bl warp_filter_horz_neon 3006 mov v24.16b, v16.16b 3007 bl warp_filter_horz_neon 3008 mov v25.16b, v16.16b 3009 bl warp_filter_horz_neon 3010 mov v26.16b, v16.16b 3011 bl warp_filter_horz_neon 3012 mov v27.16b, v16.16b 3013 bl warp_filter_horz_neon 3014 mov v28.16b, v16.16b 3015 bl warp_filter_horz_neon 3016 mov v29.16b, v16.16b 3017 bl warp_filter_horz_neon 3018 mov v30.16b, v16.16b 3019 30201: 3021 add w14, w6, #512 3022 bl warp_filter_horz_neon 3023 mov v31.16b, v16.16b 3024 3025 load_filter_row d0, w14, w9 3026 load_filter_row d1, w14, w9 3027 load_filter_row d2, w14, w9 3028 load_filter_row d3, w14, w9 3029 load_filter_row d4, w14, w9 3030 load_filter_row d5, w14, w9 3031 load_filter_row d6, w14, w9 3032 load_filter_row d7, w14, w9 3033 transpose_8x8b v0, v1, v2, v3, v4, v5, v6, v7, v16, v17 3034 sxtl v0.8h, v0.8b 3035 sxtl v1.8h, v1.8b 3036 sxtl v2.8h, v2.8b 3037 sxtl v3.8h, v3.8b 3038 sxtl v4.8h, v4.8b 3039 sxtl v5.8h, v5.8b 3040 sxtl v6.8h, v6.8b 3041 sxtl v7.8h, v7.8b 3042 3043 // This ordering of smull/smlal/smull2/smlal2 is highly 3044 // beneficial for Cortex A53 here. 
3045 smull v16.4s, v24.4h, v0.4h 3046 smlal v16.4s, v25.4h, v1.4h 3047 smlal v16.4s, v26.4h, v2.4h 3048 smlal v16.4s, v27.4h, v3.4h 3049 smlal v16.4s, v28.4h, v4.4h 3050 smlal v16.4s, v29.4h, v5.4h 3051 smlal v16.4s, v30.4h, v6.4h 3052 smlal v16.4s, v31.4h, v7.4h 3053 smull2 v17.4s, v24.8h, v0.8h 3054 smlal2 v17.4s, v25.8h, v1.8h 3055 smlal2 v17.4s, v26.8h, v2.8h 3056 smlal2 v17.4s, v27.8h, v3.8h 3057 smlal2 v17.4s, v28.8h, v4.8h 3058 smlal2 v17.4s, v29.8h, v5.8h 3059 smlal2 v17.4s, v30.8h, v6.8h 3060 smlal2 v17.4s, v31.8h, v7.8h 3061 3062 mov v24.16b, v25.16b 3063 mov v25.16b, v26.16b 3064 sqrshrn v16.4h, v16.4s, #\shift 3065 mov v26.16b, v27.16b 3066 sqrshrn2 v16.8h, v17.4s, #\shift 3067 mov v27.16b, v28.16b 3068 mov v28.16b, v29.16b 3069.ifb \t 3070 sqxtun v16.8b, v16.8h 3071.endif 3072 mov v29.16b, v30.16b 3073 mov v30.16b, v31.16b 3074 subs w10, w10, #1 3075.ifnb \t 3076 st1 {v16.8h}, [x0], x1 3077.else 3078 st1 {v16.8b}, [x0], x1 3079.endif 3080 3081 add w6, w6, w4 3082 b.gt 1b 3083 3084 br x15 3085endfunc 3086.endm 3087 3088warp , 11 3089warp t, 7 3090 3091// void dav1d_emu_edge_8bpc_neon( 3092// const intptr_t bw, const intptr_t bh, 3093// const intptr_t iw, const intptr_t ih, 3094// const intptr_t x, const intptr_t y, 3095// pixel *dst, const ptrdiff_t dst_stride, 3096// const pixel *ref, const ptrdiff_t ref_stride) 3097function emu_edge_8bpc_neon, export=1 3098 ldp x8, x9, [sp] 3099 3100 // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) 3101 // ref += iclip(x, 0, iw - 1) 3102 sub x12, x3, #1 // ih - 1 3103 cmp x5, x3 3104 sub x13, x2, #1 // iw - 1 3105 csel x12, x12, x5, ge // min(y, ih - 1) 3106 cmp x4, x2 3107 bic x12, x12, x12, asr #63 // max(min(y, ih - 1), 0) 3108 csel x13, x13, x4, ge // min(x, iw - 1) 3109 bic x13, x13, x13, asr #63 // max(min(x, iw - 1), 0) 3110 madd x8, x12, x9, x8 // ref += iclip() * stride 3111 add x8, x8, x13 // ref += iclip() 3112 3113 // bottom_ext = iclip(y + bh - ih, 0, bh - 1) 3114 // top_ext = iclip(-y, 0, bh - 1) 
3115 add x10, x5, x1 // y + bh 3116 neg x5, x5 // -y 3117 sub x10, x10, x3 // y + bh - ih 3118 sub x12, x1, #1 // bh - 1 3119 cmp x10, x1 3120 bic x5, x5, x5, asr #63 // max(-y, 0) 3121 csel x10, x10, x12, lt // min(y + bh - ih, bh-1) 3122 cmp x5, x1 3123 bic x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0) 3124 csel x5, x5, x12, lt // min(max(-y, 0), bh-1) 3125 3126 // right_ext = iclip(x + bw - iw, 0, bw - 1) 3127 // left_ext = iclip(-x, 0, bw - 1) 3128 add x11, x4, x0 // x + bw 3129 neg x4, x4 // -x 3130 sub x11, x11, x2 // x + bw - iw 3131 sub x13, x0, #1 // bw - 1 3132 cmp x11, x0 3133 bic x4, x4, x4, asr #63 // max(-x, 0) 3134 csel x11, x11, x13, lt // min(x + bw - iw, bw-1) 3135 cmp x4, x0 3136 bic x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0) 3137 csel x4, x4, x13, lt // min(max(-x, 0), bw - 1) 3138 3139 // center_h = bh - top_ext - bottom_ext 3140 // dst += top_ext * PXSTRIDE(dst_stride) 3141 // center_w = bw - left_ext - right_ext 3142 sub x1, x1, x5 // bh - top_ext 3143 madd x6, x5, x7, x6 3144 sub x2, x0, x4 // bw - left_ext 3145 sub x1, x1, x10 // center_h = bh - top_ext - bottom_ext 3146 sub x2, x2, x11 // center_w = bw - left_ext - right_ext 3147 3148 mov x14, x6 // backup of dst 3149 3150.macro v_loop need_left, need_right 31510: 3152.if \need_left 3153 ld1r {v0.16b}, [x8] 3154 mov x12, x6 // out = dst 3155 mov x3, x4 31561: 3157 subs x3, x3, #16 3158 st1 {v0.16b}, [x12], #16 3159 b.gt 1b 3160.endif 3161 mov x13, x8 3162 add x12, x6, x4 // out = dst + left_ext 3163 mov x3, x2 31641: 3165 ld1 {v0.16b, v1.16b}, [x13], #32 3166 subs x3, x3, #32 3167 st1 {v0.16b, v1.16b}, [x12], #32 3168 b.gt 1b 3169.if \need_right 3170 add x3, x8, x2 // in + center_w 3171 sub x3, x3, #1 // in + center_w - 1 3172 add x12, x6, x4 // dst + left_ext 3173 ld1r {v0.16b}, [x3] 3174 add x12, x12, x2 // out = dst + left_ext + center_w 3175 mov x3, x11 31761: 3177 subs x3, x3, #16 3178 st1 {v0.16b}, [x12], #16 3179 b.gt 1b 3180.endif 3181 3182 subs x1, x1, #1 
// center_h-- 3183 add x6, x6, x7 3184 add x8, x8, x9 3185 b.gt 0b 3186.endm 3187 3188 cbz x4, 2f 3189 // need_left 3190 cbz x11, 3f 3191 // need_left + need_right 3192 v_loop 1, 1 3193 b 5f 3194 31952: 3196 // !need_left 3197 cbz x11, 4f 3198 // !need_left + need_right 3199 v_loop 0, 1 3200 b 5f 3201 32023: 3203 // need_left + !need_right 3204 v_loop 1, 0 3205 b 5f 3206 32074: 3208 // !need_left + !need_right 3209 v_loop 0, 0 3210 32115: 3212 3213 cbz x10, 3f 3214 // need_bottom 3215 sub x8, x6, x7 // ref = dst - stride 3216 mov x4, x0 32171: 3218 ld1 {v0.16b, v1.16b}, [x8], #32 3219 mov x3, x10 32202: 3221 subs x3, x3, #1 3222 st1 {v0.16b, v1.16b}, [x6], x7 3223 b.gt 2b 3224 msub x6, x7, x10, x6 // dst -= bottom_ext * stride 3225 subs x4, x4, #32 // bw -= 32 3226 add x6, x6, #32 // dst += 32 3227 b.gt 1b 3228 32293: 3230 cbz x5, 3f 3231 // need_top 3232 msub x6, x7, x5, x14 // dst = stored_dst - top_ext * stride 32331: 3234 ld1 {v0.16b, v1.16b}, [x14], #32 3235 mov x3, x5 32362: 3237 subs x3, x3, #1 3238 st1 {v0.16b, v1.16b}, [x6], x7 3239 b.gt 2b 3240 msub x6, x7, x5, x6 // dst -= top_ext * stride 3241 subs x0, x0, #32 // bw -= 32 3242 add x6, x6, #32 // dst += 32 3243 b.gt 1b 3244 32453: 3246 ret 3247endfunc 3248