/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Janne Grunau
 * Copyright © 2018, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// avg dst, t0, t1, t2, t3
// Produces 16 output bytes in dst. Loads sixteen 16-bit values from [x2] and
// sixteen from [x3] (each pointer post-incremented by 32 bytes), adds them
// element-wise, and narrows to bytes with a rounding, unsigned-saturating
// right shift by 5. t0-t3 are scratch vector registers.
.macro avg dst, t0, t1, t2, t3
        ld1             {\t0\().8h,\t1\().8h}, [x2], 32
        ld1             {\t2\().8h,\t3\().8h}, [x3], 32
        add             \t0\().8h, \t0\().8h, \t2\().8h
        add             \t1\().8h, \t1\().8h, \t3\().8h
        sqrshrun        \dst\().8b, \t0\().8h, #5
        sqrshrun2       \dst\().16b, \t1\().8h, #5
.endm

// w_avg dst, t0, t1, t2, t3
// Same load pattern as avg. With a = values from [x2] and b = values from
// [x3], computes b + sqdmulh(b - a, v30) and narrows to bytes with a rounding,
// unsigned-saturating right shift by 4. v30 must be preloaded by the user of
// this macro (bidir_fn sets it to (-w6) << 11).
.macro w_avg dst, t0, t1, t2, t3
        ld1             {\t0\().8h,\t1\().8h}, [x2], 32
        ld1             {\t2\().8h,\t3\().8h}, [x3], 32
        sub             \t0\().8h, \t2\().8h, \t0\().8h
        sub             \t1\().8h, \t3\().8h, \t1\().8h
        sqdmulh         \t0\().8h, \t0\().8h, v30.8h
        sqdmulh         \t1\().8h, \t1\().8h, v30.8h
        add             \t0\().8h, \t2\().8h, \t0\().8h
        add             \t1\().8h, \t3\().8h, \t1\().8h
        sqrshrun        \dst\().8b, \t0\().8h, #4
        sqrshrun2       \dst\().16b, \t1\().8h, #4
.endm

// mask dst, t0, t1, t2, t3
// Like w_avg, but with a per-pixel weight: 16 mask bytes are loaded from [x6]
// (post-incremented by 16), multiplied by v31 (bidir_fn sets every byte to
// 256-2, i.e. -2 modulo 256) and widened with a left shift by 8 into v28/v29,
// which then take the place of v30 in the sqdmulh step.
// Clobbers v28, v29 and v30 in addition to t0-t3.
.macro mask dst, t0, t1, t2, t3
        ld1             {v30.16b}, [x6], 16
        ld1             {\t0\().8h,\t1\().8h}, [x2], 32
        mul             v30.16b, v30.16b, v31.16b
        ld1             {\t2\().8h,\t3\().8h}, [x3], 32
        shll            v28.8h, v30.8b, #8
        shll2           v29.8h, v30.16b, #8
        sub             \t0\().8h, \t2\().8h, \t0\().8h
        sub             \t1\().8h, \t3\().8h, \t1\().8h
        sqdmulh         \t0\().8h, \t0\().8h, v28.8h
        sqdmulh         \t1\().8h, \t1\().8h, v29.8h
        add             \t0\().8h, \t2\().8h, \t0\().8h
        add             \t1\().8h, \t3\().8h, \t1\().8h
        sqrshrun        \dst\().8b, \t0\().8h, #4
        sqrshrun2       \dst\().16b, \t1\().8h, #4
.endm

// bidir_fn type
// Emits the exported function <type>_8bpc_neon, where type names one of the
// macros above (avg, w_avg or mask); that macro is invoked to produce each
// group of 16 output bytes.
// Register usage, as read from the code below:
//   x0      output pointer
//   x1      output stride in bytes (added to the output pointer per row)
//   x2, x3  the two 16-bit input streams consumed by the type macro
//   w4      clz(w4) - 24 indexes the jump table; the table targets are
//           labelled after the widths 128, 64, 32, 16, 8 and 4
//   w5      row counter
//   w6      w_avg only: weight, negated and shifted left by 11 into v30
//   x6      mask only: pointer the mask macro reads its mask bytes from
//   x7      jump table base, then a second output pointer
// The first 16 output bytes (v4) are computed before the indirect branch, so
// every width-specific path starts with v4 already valid.
.macro bidir_fn type
function \type\()_8bpc_neon, export=1
        clz             w4, w4
.ifc \type, w_avg
        dup             v30.8h, w6
        neg             v30.8h, v30.8h
        shl             v30.8h, v30.8h, #11
.endif
.ifc \type, mask
        movi            v31.16b, #256-2
.endif
        adr             x7, L(\type\()_tbl)
        sub             w4, w4, #24
        // Table entries are 16-bit distances from the table back to the target.
        ldrh            w4, [x7, x4, lsl #1]
        \type           v4, v0, v1, v2, v3
        sub             x7, x7, w4, uxtw
        br              x7
40:
        // x0 walks the even rows and x7 the odd rows, both with a doubled stride.
        add             x7, x0, x1
        lsl             x1, x1, #1
4:
        // Each vector holds four 4-byte rows. Straight-line code: returns
        // after 4 rows if w5 == 4, after 8 rows if w5 == 8, and otherwise
        // stores 16 rows in total.
        cmp             w5, #4
        st1             {v4.s}[0], [x0], x1
        st1             {v4.s}[1], [x7], x1
        st1             {v4.s}[2], [x0], x1
        st1             {v4.s}[3], [x7], x1
        b.eq            0f
        \type           v5, v0, v1, v2, v3
        cmp             w5, #8
        st1             {v5.s}[0], [x0], x1
        st1             {v5.s}[1], [x7], x1
        st1             {v5.s}[2], [x0], x1
        st1             {v5.s}[3], [x7], x1
        b.eq            0f
        \type           v4, v0, v1, v2, v3
        st1             {v4.s}[0], [x0], x1
        st1             {v4.s}[1], [x7], x1
        \type           v5, v0, v1, v2, v3
        st1             {v4.s}[2], [x0], x1
        st1             {v4.s}[3], [x7], x1
        st1             {v5.s}[0], [x0], x1
        st1             {v5.s}[1], [x7], x1
        st1             {v5.s}[2], [x0], x1
        st1             {v5.s}[3], [x7], x1
        ret
80:
        add             x7, x0, x1
        lsl             x1, x1, #1
8:
        // Each vector holds two 8-byte rows; four rows per iteration.
        st1             {v4.d}[0], [x0], x1
        \type           v5, v0, v1, v2, v3
        st1             {v4.d}[1], [x7], x1
        st1             {v5.d}[0], [x0], x1
        subs            w5, w5, #4
        st1             {v5.d}[1], [x7], x1
        b.le            0f
        \type           v4, v0, v1, v2, v3
        b               8b
16:
        // One vector per row, single output pointer; four rows per iteration.
        \type           v5, v0, v1, v2, v3
        st1             {v4.16b}, [x0], x1
        \type           v6, v0, v1, v2, v3
        st1             {v5.16b}, [x0], x1
        \type           v7, v0, v1, v2, v3
        st1             {v6.16b}, [x0], x1
        subs            w5, w5, #4
        st1             {v7.16b}, [x0], x1
        b.le            0f
        \type           v4, v0, v1, v2, v3
        b               16b
320:
        add             x7, x0, x1
        lsl             x1, x1, #1
32:
        // Two vectors per row; two rows per iteration (x0 and x7).
        \type           v5, v0, v1, v2, v3
        \type           v6, v0, v1, v2, v3
        st1             {v4.16b,v5.16b}, [x0], x1
        \type           v7, v0, v1, v2, v3
        subs            w5, w5, #2
        st1             {v6.16b,v7.16b}, [x7], x1
        b.le            0f
        \type           v4, v0, v1, v2, v3
        b               32b
640:
        add             x7, x0, x1
        lsl             x1, x1, #1
64:
        // Four vectors per row; two rows per iteration (x0 and x7).
        \type           v5, v0, v1, v2, v3
        \type           v6, v0, v1, v2, v3
        \type           v7, v0, v1, v2, v3
        \type           v16, v0, v1, v2, v3
        \type           v17, v0, v1, v2, v3
        st1             {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
        \type           v18, v0, v1, v2, v3
        \type           v19, v0, v1, v2, v3
        subs            w5, w5, #2
        st1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
        b.le            0f
        \type           v4, v0, v1, v2, v3
        b               64b
1280:
        // x7 addresses the second 64 bytes of the same row; stride not doubled.
        add             x7, x0, #64
128:
        // Eight vectors per row; one row per iteration.
        \type           v5, v0, v1, v2, v3
        \type           v6, v0, v1, v2, v3
        \type           v7, v0, v1, v2, v3
        \type           v16, v0, v1, v2, v3
        \type           v17, v0, v1, v2, v3
        st1             {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
        \type           v18, v0, v1, v2, v3
        \type           v19, v0, v1, v2, v3
        subs            w5, w5, #1
        st1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
        b.le            0f
        \type           v4, v0, v1, v2, v3
        b               128b
0:
        ret
        // Jump table, ordered by clz(w4) - 24: index 0 is the 128 wide path.
L(\type\()_tbl):
        .hword L(\type\()_tbl) - 1280b
        .hword L(\type\()_tbl) - 640b
        .hword L(\type\()_tbl) - 320b
        .hword L(\type\()_tbl) - 16b
        .hword L(\type\()_tbl) - 80b
        .hword L(\type\()_tbl) - 40b
endfunc
.endm

bidir_fn avg
bidir_fn w_avg
bidir_fn mask


// w_mask_fn type
// Emits the exported function w_mask_<type>_8bpc_neon for type 444, 422 or
// 420. It blends two 16-bit inputs with a weight derived from their absolute
// difference and also writes that weight out as a mask.
// Register usage, as read from the code below:
//   x0      output pointer, x1 output stride in bytes
//   x2, x3  16-bit inputs (the existing comments call them tmp1 and tmp2)
//   w4      clz(w4) - 24 indexes the jump table; also the inner loop width
//           in the 16 and wider path
//   w5      row counter
//   x6      mask output pointer
//   w7      422/420 only: value subtracted from the rounding constant in v3
//   x12     second output pointer (odd rows)
// Per pixel: v = (6903 - |tmp1 - tmp2|) >> 8 with the subtraction saturating
// at zero; pixel = tmp1 + sqdmulh(v << 9, tmp2 - tmp1), narrowed with a
// rounding, unsigned-saturating right shift by 4.
// Mask output: 444 stores 64 - v per pixel; 422 stores the halving
// subtraction of (129 - w7) and the sum of two horizontally adjacent v;
// 420 stores ((256 - w7) - sum of a 2x2 group of v) with a rounding right
// shift by 2.
.macro w_mask_fn type
function w_mask_\type\()_8bpc_neon, export=1
        clz             w8, w4
        adr             x9, L(w_mask_\type\()_tbl)
        sub             w8, w8, #24
        ldrh            w8, [x9, x8, lsl #1]
        sub             x9, x9, w8, uxtw
        mov             w10, #6903
        dup             v0.8h, w10
.if \type == 444
        movi            v1.16b, #64
.elseif \type == 422
        dup             v2.8b, w7
        movi            v3.8b, #129
        sub             v3.8b, v3.8b, v2.8b
.elseif \type == 420
        dup             v2.8h, w7
        movi            v3.8h, #1, lsl #8
        sub             v3.8h, v3.8h, v2.8h
.endif
        // x0 walks the even rows and x12 the odd rows, with a doubled stride.
        add             x12, x0, x1
        lsl             x1, x1, #1
        br              x9
4:
        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
        ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
        subs            w5, w5, #4
        sub             v16.8h, v6.8h, v4.8h
        sub             v17.8h, v7.8h, v5.8h
        sabd            v18.8h, v4.8h, v6.8h
        sabd            v19.8h, v5.8h, v7.8h
        uqsub           v18.8h, v0.8h, v18.8h
        uqsub           v19.8h, v0.8h, v19.8h
        ushr            v18.8h, v18.8h, #8
        ushr            v19.8h, v19.8h, #8
        shl             v20.8h, v18.8h, #9
        shl             v21.8h, v19.8h, #9
        sqdmulh         v20.8h, v20.8h, v16.8h
        sqdmulh         v21.8h, v21.8h, v17.8h
        add             v20.8h, v20.8h, v4.8h
        add             v21.8h, v21.8h, v5.8h
        sqrshrun        v22.8b, v20.8h, #4
        sqrshrun        v23.8b, v21.8h, #4
.if \type == 444
        xtn             v18.8b, v18.8h
        xtn2            v18.16b, v19.8h
        sub             v18.16b, v1.16b, v18.16b
        st1             {v18.16b}, [x6], #16
.elseif \type == 422
        addp            v18.8h, v18.8h, v19.8h
        xtn             v18.8b, v18.8h
        uhsub           v18.8b, v3.8b, v18.8b
        st1             {v18.8b}, [x6], #8
.elseif \type == 420
        // Pair up the two 64-bit halves of v18 and v19 so that vertically
        // adjacent rows can be added before the horizontal pairwise add.
        trn1            v24.2d, v18.2d, v19.2d
        trn2            v25.2d, v18.2d, v19.2d
        add             v24.8h, v24.8h, v25.8h
        addp            v18.8h, v24.8h, v24.8h
        sub             v18.4h, v3.4h, v18.4h
        rshrn           v18.8b, v18.8h, #2
        st1             {v18.s}[0], [x6], #4
.endif
        st1             {v22.s}[0], [x0], x1
        st1             {v22.s}[1], [x12], x1
        st1             {v23.s}[0], [x0], x1
        st1             {v23.s}[1], [x12], x1
        b.gt            4b
        ret
8:
        // Two 8 pixel rows per iteration: v4/v6 hold one row, v5/v7 the next.
        ld1             {v4.8h, v5.8h}, [x2], #32
        ld1             {v6.8h, v7.8h}, [x3], #32
        subs            w5, w5, #2
        sub             v16.8h, v6.8h, v4.8h
        sub             v17.8h, v7.8h, v5.8h
        sabd            v18.8h, v4.8h, v6.8h
        sabd            v19.8h, v5.8h, v7.8h
        uqsub           v18.8h, v0.8h, v18.8h
        uqsub           v19.8h, v0.8h, v19.8h
        ushr            v18.8h, v18.8h, #8
        ushr            v19.8h, v19.8h, #8
        shl             v20.8h, v18.8h, #9
        shl             v21.8h, v19.8h, #9
        sqdmulh         v20.8h, v20.8h, v16.8h
        sqdmulh         v21.8h, v21.8h, v17.8h
        add             v20.8h, v20.8h, v4.8h
        add             v21.8h, v21.8h, v5.8h
        sqrshrun        v22.8b, v20.8h, #4
        sqrshrun        v23.8b, v21.8h, #4
.if \type == 444
        xtn             v18.8b, v18.8h
        xtn2            v18.16b, v19.8h
        sub             v18.16b, v1.16b, v18.16b
        st1             {v18.16b}, [x6], #16
.elseif \type == 422
        addp            v18.8h, v18.8h, v19.8h
        xtn             v18.8b, v18.8h
        uhsub           v18.8b, v3.8b, v18.8b
        st1             {v18.8b}, [x6], #8
.elseif \type == 420
        add             v18.8h, v18.8h, v19.8h
        addp            v18.8h, v18.8h, v18.8h
        sub             v18.4h, v3.4h, v18.4h
        rshrn           v18.8b, v18.8h, #2
        st1             {v18.s}[0], [x6], #4
.endif
        st1             {v22.8b}, [x0], x1
        st1             {v23.8b}, [x12], x1
        b.gt            8b
        ret
1280:
640:
320:
160:
        // Shared path for the four widest table entries. Two rows are
        // processed in parallel: x2/x3 read the first row and x7/x9 the row
        // w4 elements further on; x6/x10 are the matching mask pointers.
        mov             w11, w4
        // The inner loop advances x0/x12 by w4 bytes, so remove that from
        // the (already doubled) stride.
        sub             x1, x1, w4, uxtw
.if \type == 444
        add             x10, x6, w4, uxtw
.elseif \type == 422
        add             x10, x6, x11, lsr #1
.endif
        add             x9, x3, w4, uxtw #1
        add             x7, x2, w4, uxtw #1
161:
        mov             w8, w4
16:
        ld1             {v4.8h, v5.8h}, [x2], #32
        ld1             {v6.8h, v7.8h}, [x3], #32
        ld1             {v16.8h, v17.8h}, [x7], #32
        ld1             {v18.8h, v19.8h}, [x9], #32
        // Flags from this subs are consumed by the b.gt 16b below; nothing
        // in between sets flags.
        subs            w8, w8, #16
        sub             v6.8h, v6.8h, v4.8h
        sub             v7.8h, v7.8h, v5.8h
        sub             v18.8h, v18.8h, v16.8h
        sub             v19.8h, v19.8h, v17.8h
        abs             v20.8h, v6.8h
        abs             v21.8h, v7.8h
        abs             v22.8h, v18.8h
        abs             v23.8h, v19.8h
        uqsub           v20.8h, v0.8h, v20.8h
        uqsub           v21.8h, v0.8h, v21.8h
        uqsub           v22.8h, v0.8h, v22.8h
        uqsub           v23.8h, v0.8h, v23.8h
        ushr            v20.8h, v20.8h, #8
        ushr            v21.8h, v21.8h, #8
        ushr            v22.8h, v22.8h, #8
        ushr            v23.8h, v23.8h, #8
        shl             v24.8h, v20.8h, #9
        shl             v25.8h, v21.8h, #9
        shl             v26.8h, v22.8h, #9
        shl             v27.8h, v23.8h, #9
        sqdmulh         v24.8h, v24.8h, v6.8h
        sqdmulh         v25.8h, v25.8h, v7.8h
        sqdmulh         v26.8h, v26.8h, v18.8h
        sqdmulh         v27.8h, v27.8h, v19.8h
        add             v24.8h, v24.8h, v4.8h
        add             v25.8h, v25.8h, v5.8h
        add             v26.8h, v26.8h, v16.8h
        add             v27.8h, v27.8h, v17.8h
        sqrshrun        v24.8b, v24.8h, #4
        sqrshrun        v25.8b, v25.8h, #4
        sqrshrun        v26.8b, v26.8h, #4
        sqrshrun        v27.8b, v27.8h, #4
.if \type == 444
        xtn             v20.8b, v20.8h
        xtn2            v20.16b, v21.8h
        xtn             v21.8b, v22.8h
        xtn2            v21.16b, v23.8h
        sub             v20.16b, v1.16b, v20.16b
        sub             v21.16b, v1.16b, v21.16b
        st1             {v20.16b}, [x6], #16
        st1             {v21.16b}, [x10], #16
.elseif \type == 422
        addp            v20.8h, v20.8h, v21.8h
        addp            v21.8h, v22.8h, v23.8h
        xtn             v20.8b, v20.8h
        xtn             v21.8b, v21.8h
        uhsub           v20.8b, v3.8b, v20.8b
        uhsub           v21.8b, v3.8b, v21.8b
        st1             {v20.8b}, [x6], #8
        st1             {v21.8b}, [x10], #8
.elseif \type == 420
        add             v20.8h, v20.8h, v22.8h
        add             v21.8h, v21.8h, v23.8h
        addp            v20.8h, v20.8h, v21.8h
        sub             v20.8h, v3.8h, v20.8h
        rshrn           v20.8b, v20.8h, #2
        st1             {v20.8b}, [x6], #8
.endif
        st1             {v24.8b, v25.8b}, [x0], #16
        st1             {v26.8b, v27.8b}, [x12], #16
        b.gt            16b
        subs            w5, w5, #2
        // Skip the row the other pointer of each pair has just processed.
        add             x2, x2, w4, uxtw #1
        add             x3, x3, w4, uxtw #1
        add             x7, x7, w4, uxtw #1
        add             x9, x9, w4, uxtw #1
.if \type == 444
        add             x6, x6, w4, uxtw
        add             x10, x10, w4, uxtw
.elseif \type == 422
        add             x6, x6, x11, lsr #1
        add             x10, x10, x11, lsr #1
.endif
        add             x0, x0, x1
        add             x12, x12, x1
        b.gt            161b
        ret
        // Jump table, ordered by clz(w4) - 24: index 0 is the 128 wide entry.
L(w_mask_\type\()_tbl):
        .hword L(w_mask_\type\()_tbl) - 1280b
        .hword L(w_mask_\type\()_tbl) - 640b
        .hword L(w_mask_\type\()_tbl) - 320b
        .hword L(w_mask_\type\()_tbl) - 160b
        .hword L(w_mask_\type\()_tbl) - 8b
        .hword L(w_mask_\type\()_tbl) - 4b
endfunc
.endm

w_mask_fn 444
w_mask_fn 422
w_mask_fn 420


// blend_8bpc_neon
// In-place blend of the bytes at x0 with the bytes at x2, using a per-pixel
// mask m read from x5:
//   dst = (tmp * m + dst * (64 - m) + 32) >> 6
// Register usage, as read from the code below:
//   x0  destination pointer (read and written), x1 its stride in bytes
//   x2  second input (tmp), read contiguously
//   w3  clz(w3) - 26 indexes the jump table (targets labelled 32, 16, 8, 4)
//   w4  row counter; every path handles two rows per iteration
//   x5  mask pointer, read contiguously
//   x8  pointer to the odd rows
function blend_8bpc_neon, export=1
        adr             x6, L(blend_tbl)
        clz             w3, w3
        sub             w3, w3, #26
        ldrh            w3, [x6, x3, lsl #1]
        sub             x6, x6, w3, uxtw
        movi            v4.16b, #64
        add             x8, x0, x1
        lsl             x1, x1, #1
        br              x6
4:
        ld1             {v2.8b}, [x5], #8
        ld1             {v1.d}[0], [x2], #8
        ld1             {v0.s}[0], [x0]
        subs            w4, w4, #2
        ld1             {v0.s}[1], [x8]
        sub             v3.8b, v4.8b, v2.8b
        umull           v5.8h, v1.8b, v2.8b
        umlal           v5.8h, v0.8b, v3.8b
        rshrn           v6.8b, v5.8h, #6
        st1             {v6.s}[0], [x0], x1
        st1             {v6.s}[1], [x8], x1
        b.gt            4b
        ret
8:
        ld1             {v2.16b}, [x5], #16
        ld1             {v1.16b}, [x2], #16
        ld1             {v0.d}[0], [x0]
        ld1             {v0.d}[1], [x8]
        sub             v3.16b, v4.16b, v2.16b
        subs            w4, w4, #2
        umull           v5.8h, v1.8b, v2.8b
        umlal           v5.8h, v0.8b, v3.8b
        umull2          v6.8h, v1.16b, v2.16b
        umlal2          v6.8h, v0.16b, v3.16b
        rshrn           v7.8b, v5.8h, #6
        rshrn2          v7.16b, v6.8h, #6
        st1             {v7.d}[0], [x0], x1
        st1             {v7.d}[1], [x8], x1
        b.gt            8b
        ret
16:
        ld1             {v1.16b, v2.16b}, [x5], #32
        ld1             {v5.16b, v6.16b}, [x2], #32
        ld1             {v0.16b}, [x0]
        subs            w4, w4, #2
        sub             v7.16b, v4.16b, v1.16b
        sub             v20.16b, v4.16b, v2.16b
        ld1             {v3.16b}, [x8]
        umull           v16.8h, v5.8b, v1.8b
        umlal           v16.8h, v0.8b, v7.8b
        umull2          v17.8h, v5.16b, v1.16b
        umlal2          v17.8h, v0.16b, v7.16b
        umull           v21.8h, v6.8b, v2.8b
        umlal           v21.8h, v3.8b, v20.8b
        umull2          v22.8h, v6.16b, v2.16b
        umlal2          v22.8h, v3.16b, v20.16b
        rshrn           v18.8b, v16.8h, #6
        rshrn2          v18.16b, v17.8h, #6
        rshrn           v19.8b, v21.8h, #6
        rshrn2          v19.16b, v22.8h, #6
        st1             {v18.16b}, [x0], x1
        st1             {v19.16b}, [x8], x1
        b.gt            16b
        ret
32:
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], #64
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
        ld1             {v20.16b, v21.16b}, [x0]
        subs            w4, w4, #2
        ld1             {v22.16b, v23.16b}, [x8]
        sub             v5.16b, v4.16b, v0.16b
        sub             v6.16b, v4.16b, v1.16b
        sub             v30.16b, v4.16b, v2.16b
        sub             v31.16b, v4.16b, v3.16b
        umull           v24.8h, v16.8b, v0.8b
        umlal           v24.8h, v20.8b, v5.8b
        umull2          v26.8h, v16.16b, v0.16b
        umlal2          v26.8h, v20.16b, v5.16b
        umull           v28.8h, v17.8b, v1.8b
        umlal           v28.8h, v21.8b, v6.8b
        // From here on v7, v1 and v21 are reused as accumulators once their
        // previous contents have been consumed.
        umull2          v7.8h, v17.16b, v1.16b
        umlal2          v7.8h, v21.16b, v6.16b
        umull           v27.8h, v18.8b, v2.8b
        umlal           v27.8h, v22.8b, v30.8b
        umull2          v1.8h, v18.16b, v2.16b
        umlal2          v1.8h, v22.16b, v30.16b
        umull           v29.8h, v19.8b, v3.8b
        umlal           v29.8h, v23.8b, v31.8b
        umull2          v21.8h, v19.16b, v3.16b
        umlal2          v21.8h, v23.16b, v31.16b
        rshrn           v24.8b, v24.8h, #6
        rshrn2          v24.16b, v26.8h, #6
        rshrn           v25.8b, v28.8h, #6
        rshrn2          v25.16b, v7.8h, #6
        rshrn           v27.8b, v27.8h, #6
        rshrn2          v27.16b, v1.8h, #6
        rshrn           v28.8b, v29.8h, #6
        rshrn2          v28.16b, v21.8h, #6
        st1             {v24.16b, v25.16b}, [x0], x1
        st1             {v27.16b, v28.16b}, [x8], x1
        b.gt            32b
        ret
        // Jump table, ordered by clz(w3) - 26: index 0 is the 32 wide path.
L(blend_tbl):
        .hword L(blend_tbl) - 32b
        .hword L(blend_tbl) - 16b
        .hword L(blend_tbl) - 8b
        .hword L(blend_tbl) - 4b
endfunc

// blend_h_8bpc_neon
// Same blend formula as blend_8bpc_neon, but with one mask value per row,
// read from obmc_masks + w4 (two consecutive mask bytes per two-row
// iteration).
// Register usage, as read from the code below:
//   x0  destination pointer (read and written), x1 its stride in bytes
//   x2  second input (tmp), read contiguously
//   w3  clz(w3) - 24 indexes the jump table; also the inner loop width in
//       the 32 and wider path
//   w4  on entry, offset into obmc_masks; the row counter is then set to
//       w4 - (w4 >> 2)
//   x8  pointer to the odd rows
function blend_h_8bpc_neon, export=1
        adr             x6, L(blend_h_tbl)
        movrel          x5, X(obmc_masks)
        add             x5, x5, w4, uxtw
        sub             w4, w4, w4, lsr #2
        clz             w7, w3
        movi            v4.16b, #64
        add             x8, x0, x1
        lsl             x1, x1, #1
        sub             w7, w7, #24
        ldrh            w7, [x6, x7, lsl #1]
        sub             x6, x6, w7, uxtw
        br              x6
2:
        ld1             {v0.h}[0], [x5], #2
        ld1             {v1.s}[0], [x2], #4
        subs            w4, w4, #2
        ld1             {v2.h}[0], [x0]
        // Duplicate each of the two mask bytes: m0 m0 m1 m1.
        zip1            v0.8b, v0.8b, v0.8b
        sub             v3.8b, v4.8b, v0.8b
        ld1             {v2.h}[1], [x8]
        umull           v5.8h, v1.8b, v0.8b
        umlal           v5.8h, v2.8b, v3.8b
        rshrn           v5.8b, v5.8h, #6
        st1             {v5.h}[0], [x0], x1
        st1             {v5.h}[1], [x8], x1
        b.gt            2b
        ret
4:
        ld2r            {v0.8b, v1.8b}, [x5], #2
        ld1             {v2.8b}, [x2], #8
        subs            w4, w4, #2
        // Low four bytes carry the first row mask, high four the second.
        ext             v0.8b, v0.8b, v1.8b, #4
        ld1             {v3.s}[0], [x0]
        sub             v5.8b, v4.8b, v0.8b
        ld1             {v3.s}[1], [x8]
        umull           v6.8h, v2.8b, v0.8b
        umlal           v6.8h, v3.8b, v5.8b
        rshrn           v6.8b, v6.8h, #6
        st1             {v6.s}[0], [x0], x1
        st1             {v6.s}[1], [x8], x1
        b.gt            4b
        ret
8:
        ld2r            {v0.16b, v1.16b}, [x5], #2
        ld1             {v2.16b}, [x2], #16
        ld1             {v3.d}[0], [x0]
        // Low eight bytes carry the first row mask, high eight the second.
        ext             v0.16b, v0.16b, v1.16b, #8
        sub             v5.16b, v4.16b, v0.16b
        ld1             {v3.d}[1], [x8]
        subs            w4, w4, #2
        umull           v6.8h, v0.8b, v2.8b
        umlal           v6.8h, v3.8b, v5.8b
        umull2          v7.8h, v0.16b, v2.16b
        umlal2          v7.8h, v3.16b, v5.16b
        rshrn           v16.8b, v6.8h, #6
        rshrn2          v16.16b, v7.8h, #6
        st1             {v16.d}[0], [x0], x1
        st1             {v16.d}[1], [x8], x1
        b.gt            8b
        ret
16:
        ld2r            {v0.16b, v1.16b}, [x5], #2
        ld1             {v2.16b, v3.16b}, [x2], #32
        ld1             {v5.16b}, [x0]
        sub             v7.16b, v4.16b, v0.16b
        sub             v16.16b, v4.16b, v1.16b
        ld1             {v6.16b}, [x8]
        subs            w4, w4, #2
        umull           v17.8h, v0.8b, v2.8b
        umlal           v17.8h, v5.8b, v7.8b
        umull2          v18.8h, v0.16b, v2.16b
        umlal2          v18.8h, v5.16b, v7.16b
        umull           v19.8h, v1.8b, v3.8b
        umlal           v19.8h, v6.8b, v16.8b
        umull2          v20.8h, v1.16b, v3.16b
        umlal2          v20.8h, v6.16b, v16.16b
        rshrn           v21.8b, v17.8h, #6
        rshrn2          v21.16b, v18.8h, #6
        rshrn           v22.8b, v19.8h, #6
        rshrn2          v22.16b, v20.8h, #6
        st1             {v21.16b}, [x0], x1
        st1             {v22.16b}, [x8], x1
        b.gt            16b
        ret
1280:
640:
320:
        // Shared path for the three widest table entries. The inner loop
        // advances x0/x8 by w3 bytes, so remove that from the doubled stride.
        // x7 reads the tmp row that follows the one read through x2.
        sub             x1, x1, w3, uxtw
        add             x7, x2, w3, uxtw
321:
        ld2r            {v0.16b, v1.16b}, [x5], #2
        mov             w6, w3
        sub             v20.16b, v4.16b, v0.16b
        sub             v21.16b, v4.16b, v1.16b
32:
        ld1             {v16.16b, v17.16b}, [x2], #32
        ld1             {v2.16b, v3.16b}, [x0]
        // Flags from this subs are consumed by the b.gt 32b below.
        subs            w6, w6, #32
        umull           v23.8h, v0.8b, v16.8b
        umlal           v23.8h, v2.8b, v20.8b
        ld1             {v18.16b, v19.16b}, [x7], #32
        umull2          v27.8h, v0.16b, v16.16b
        umlal2          v27.8h, v2.16b, v20.16b
        ld1             {v6.16b, v7.16b}, [x8]
        umull           v24.8h, v0.8b, v17.8b
        umlal           v24.8h, v3.8b, v20.8b
        umull2          v28.8h, v0.16b, v17.16b
        umlal2          v28.8h, v3.16b, v20.16b
        umull           v25.8h, v1.8b, v18.8b
        umlal           v25.8h, v6.8b, v21.8b
        umull2          v5.8h, v1.16b, v18.16b
        umlal2          v5.8h, v6.16b, v21.16b
        rshrn           v29.8b, v23.8h, #6
        rshrn2          v29.16b, v27.8h, #6
        umull           v26.8h, v1.8b, v19.8b
        umlal           v26.8h, v7.8b, v21.8b
        umull2          v31.8h, v1.16b, v19.16b
        umlal2          v31.8h, v7.16b, v21.16b
        rshrn           v30.8b, v24.8h, #6
        rshrn2          v30.16b, v28.8h, #6
        rshrn           v23.8b, v25.8h, #6
        rshrn2          v23.16b, v5.8h, #6
        rshrn           v24.8b, v26.8h, #6
        st1             {v29.16b, v30.16b}, [x0], #32
        rshrn2          v24.16b, v31.8h, #6
        st1             {v23.16b, v24.16b}, [x8], #32
        b.gt            32b
        subs            w4, w4, #2
        add             x0, x0, x1
        add             x8, x8, x1
        add             x2, x2, w3, uxtw
        add             x7, x7, w3, uxtw
        b.gt            321b
        ret
        // Jump table, ordered by clz(w3) - 24: index 0 is the 128 wide entry.
L(blend_h_tbl):
        .hword L(blend_h_tbl) - 1280b
        .hword L(blend_h_tbl) - 640b
        .hword L(blend_h_tbl) - 320b
        .hword L(blend_h_tbl) - 16b
        .hword L(blend_h_tbl) - 8b
        .hword L(blend_h_tbl) - 4b
        .hword L(blend_h_tbl) - 2b
endfunc

// blend_v_8bpc_neon
// Same blend formula as blend_8bpc_neon, but with one mask value per column.
// The mask is loaded once from obmc_masks + w3 and reused for every row.
// Only part of each row is written back: 1 of 2, 3 of 4, 6 of 8, 12 of 16
// and 24 of 32 bytes, as can be read from the stores in each path.
// Register usage, as read from the code below:
//   x0  destination pointer (read and written), x1 its stride in bytes
//   x2  second input (tmp), read contiguously at the full row width
//   w3  on entry, offset into obmc_masks; clz(w3) - 26 indexes the jump table
//   w4  row counter; every path handles two rows per iteration
//   x8  pointer to the odd rows
function blend_v_8bpc_neon, export=1
        adr             x6, L(blend_v_tbl)
        movrel          x5, X(obmc_masks)
        add             x5, x5, w3, uxtw
        clz             w3, w3
        movi            v4.16b, #64
        add             x8, x0, x1
        lsl             x1, x1, #1
        sub             w3, w3, #26
        ldrh            w3, [x6, x3, lsl #1]
        sub             x6, x6, w3, uxtw
        br              x6
20:
        ld1r            {v0.8b}, [x5]
        sub             v1.8b, v4.8b, v0.8b
2:
        // Only the first pixel of each row is blended; byte 0 comes from the
        // first tmp row and byte 1 from the first pixel of the second.
        ld1             {v2.h}[0], [x2], #2
        ld1             {v3.b}[0], [x0]
        subs            w4, w4, #2
        ld1             {v2.b}[1], [x2]
        ld1             {v3.b}[1], [x8]
        umull           v5.8h, v2.8b, v0.8b
        umlal           v5.8h, v3.8b, v1.8b
        rshrn           v5.8b, v5.8h, #6
        add             x2, x2, #2
        st1             {v5.b}[0], [x0], x1
        st1             {v5.b}[1], [x8], x1
        b.gt            2b
        ret
40:
        ld1r            {v0.2s}, [x5]
        // Each row is stored as 2 bytes (post-increment 2) plus 1 byte, so
        // compensate for the extra 2 in the stride.
        sub             x1, x1, #2
        sub             v1.8b, v4.8b, v0.8b
4:
        ld1             {v2.8b}, [x2], #8
        ld1             {v3.s}[0], [x0]
        ld1             {v3.s}[1], [x8]
        subs            w4, w4, #2
        umull           v5.8h, v2.8b, v0.8b
        umlal           v5.8h, v3.8b, v1.8b
        rshrn           v5.8b, v5.8h, #6
        st1             {v5.h}[0], [x0], #2
        st1             {v5.h}[2], [x8], #2
        st1             {v5.b}[2], [x0], x1
        st1             {v5.b}[6], [x8], x1
        b.gt            4b
        ret
80:
        ld1r            {v0.2d}, [x5]
        // Each row is stored as 4 bytes (post-increment 4) plus 2 bytes.
        sub             x1, x1, #4
        sub             v1.16b, v4.16b, v0.16b
8:
        ld1             {v2.16b}, [x2], #16
        ld1             {v3.d}[0], [x0]
        ld1             {v3.d}[1], [x8]
        subs            w4, w4, #2
        umull           v5.8h, v0.8b, v2.8b
        umlal           v5.8h, v3.8b, v1.8b
        umull2          v6.8h, v0.16b, v2.16b
        umlal2          v6.8h, v3.16b, v1.16b
        rshrn           v7.8b, v5.8h, #6
        rshrn2          v7.16b, v6.8h, #6
        st1             {v7.s}[0], [x0], #4
        st1             {v7.s}[2], [x8], #4
        st1             {v7.h}[2], [x0], x1
        st1             {v7.h}[6], [x8], x1
        b.gt            8b
        ret
160:
        ld1             {v0.16b}, [x5]
        // Each row is stored as 8 bytes (post-increment 8) plus 4 bytes.
        sub             x1, x1, #8
        sub             v2.16b, v4.16b, v0.16b
16:
        ld1             {v5.16b, v6.16b}, [x2], #32
        ld1             {v7.16b}, [x0]
        subs            w4, w4, #2
        ld1             {v16.16b}, [x8]
        umull           v17.8h, v5.8b, v0.8b
        umlal           v17.8h, v7.8b, v2.8b
        umull2          v18.8h, v5.16b, v0.16b
        umlal2          v18.8h, v7.16b, v2.16b
        umull           v20.8h, v6.8b, v0.8b
        umlal           v20.8h, v16.8b, v2.8b
        umull2          v21.8h, v6.16b, v0.16b
        umlal2          v21.8h, v16.16b, v2.16b
        rshrn           v19.8b, v17.8h, #6
        rshrn2          v19.16b, v18.8h, #6
        rshrn           v22.8b, v20.8h, #6
        rshrn2          v22.16b, v21.8h, #6
        st1             {v19.8b}, [x0], #8
        st1             {v22.8b}, [x8], #8
        st1             {v19.s}[2], [x0], x1
        st1             {v22.s}[2], [x8], x1
        b.gt            16b
        ret
320:
        ld1             {v0.16b, v1.16b}, [x5]
        // Each row is stored as 16 bytes (post-increment 16) plus 8 bytes.
        sub             x1, x1, #16
        sub             v2.16b, v4.16b, v0.16b
        sub             v3.8b, v4.8b, v1.8b
32:
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
        ld1             {v5.16b, v6.16b}, [x0]
        subs            w4, w4, #2
        ld1             {v20.16b, v21.16b}, [x8]
        umull           v22.8h, v16.8b, v0.8b
        umlal           v22.8h, v5.8b, v2.8b
        umull2          v23.8h, v16.16b, v0.16b
        umlal2          v23.8h, v5.16b, v2.16b
        // Only the low 8 bytes of the second 16-byte half are blended.
        umull           v28.8h, v17.8b, v1.8b
        umlal           v28.8h, v6.8b, v3.8b
        umull           v30.8h, v18.8b, v0.8b
        umlal           v30.8h, v20.8b, v2.8b
        umull2          v31.8h, v18.16b, v0.16b
        umlal2          v31.8h, v20.16b, v2.16b
        umull           v25.8h, v19.8b, v1.8b
        umlal           v25.8h, v21.8b, v3.8b
        rshrn           v24.8b, v22.8h, #6
        rshrn2          v24.16b, v23.8h, #6
        rshrn           v28.8b, v28.8h, #6
        rshrn           v30.8b, v30.8h, #6
        rshrn2          v30.16b, v31.8h, #6
        rshrn           v27.8b, v25.8h, #6
        st1             {v24.16b}, [x0], #16
        st1             {v30.16b}, [x8], #16
        st1             {v28.8b}, [x0], x1
        st1             {v27.8b}, [x8], x1
        b.gt            32b
        ret
        // Jump table, ordered by clz(w3) - 26: index 0 is the 32 wide path.
L(blend_v_tbl):
        .hword L(blend_v_tbl) - 320b
        .hword L(blend_v_tbl) - 160b
        .hword L(blend_v_tbl) - 80b
        .hword L(blend_v_tbl) - 40b
        .hword L(blend_v_tbl) - 20b
endfunc


// This has got the same signature as the put_8tap functions,
// and assumes that x8 is set to (clz(w)-24).
// Plain row copy without filtering. Register usage, as read from the code
// below:
//   x0  destination pointer, x1 destination stride in bytes
//   x2  source pointer, x3 source stride in bytes
//   w5  row counter
//   x8  jump table index on entry; reused as a scratch or pointer register
// The 2, 4, 8 and 16 wide paths copy two rows per iteration, the wider
// paths one row per iteration.
function put_neon
        adr             x9, L(put_tbl)
        ldrh            w8, [x9, x8, lsl #1]
        sub             x9, x9, w8, uxtw
        br              x9

2:
        ld1             {v0.h}[0], [x2], x3
        ld1             {v1.h}[0], [x2], x3
        subs            w5, w5, #2
        st1             {v0.h}[0], [x0], x1
        st1             {v1.h}[0], [x0], x1
        b.gt            2b
        ret
4:
        ld1             {v0.s}[0], [x2], x3
        ld1             {v1.s}[0], [x2], x3
        subs            w5, w5, #2
        st1             {v0.s}[0], [x0], x1
        st1             {v1.s}[0], [x0], x1
        b.gt            4b
        ret
8:
        ld1             {v0.8b}, [x2], x3
        ld1             {v1.8b}, [x2], x3
        subs            w5, w5, #2
        st1             {v0.8b}, [x0], x1
        st1             {v1.8b}, [x0], x1
        b.gt            8b
        ret
160:
        // Separate pointers for even and odd rows, with doubled strides.
        add             x8, x0, x1
        lsl             x1, x1, #1
        add             x9, x2, x3
        lsl             x3, x3, #1
16:
        ld1             {v0.16b}, [x2], x3
        ld1             {v1.16b}, [x9], x3
        subs            w5, w5, #2
        st1             {v0.16b}, [x0], x1
        st1             {v1.16b}, [x8], x1
        b.gt            16b
        ret
32:
        // The 32 and 64 wide paths copy through general purpose registers.
        ldp             x6, x7, [x2]
        ldp             x8, x9, [x2, #16]
        stp             x6, x7, [x0]
        subs            w5, w5, #1
        stp             x8, x9, [x0, #16]
        add             x2, x2, x3
        add             x0, x0, x1
        b.gt            32b
        ret
64:
        ldp             x6, x7, [x2]
        ldp             x8, x9, [x2, #16]
        stp             x6, x7, [x0]
        ldp             x10, x11, [x2, #32]
        stp             x8, x9, [x0, #16]
        subs            w5, w5, #1
        ldp             x12, x13, [x2, #48]
        stp             x10, x11, [x0, #32]
        stp             x12, x13, [x0, #48]
        add             x2, x2, x3
        add             x0, x0, x1
        b.gt            64b
        ret
128:
        ldp             q0, q1, [x2]
        ldp             q2, q3, [x2, #32]
        stp             q0, q1, [x0]
        ldp             q4, q5, [x2, #64]
        stp             q2, q3, [x0, #32]
        ldp             q6, q7, [x2, #96]
        subs            w5, w5, #1
        stp             q4, q5, [x0, #64]
        stp             q6, q7, [x0, #96]
        add             x2, x2, x3
        add             x0, x0, x1
        b.gt            128b
        ret

        // Jump table, ordered by x8: index 0 is the 128 wide path.
L(put_tbl):
        .hword L(put_tbl) - 128b
        .hword L(put_tbl) - 64b
        .hword L(put_tbl) - 32b
        .hword L(put_tbl) - 160b
        .hword L(put_tbl) - 8b
        .hword L(put_tbl) - 4b
        .hword L(put_tbl) - 2b
endfunc


// This has got the same signature as the prep_8tap functions,
// and assumes that x8 is set to (clz(w)-24), and x7 to w*2.
// Widens source bytes to 16 bits with a left shift by 4 and stores them
// to the output. Register usage, as read from the code below:
//   x0  output pointer (16-bit elements)
//   x1  source pointer, x2 source stride in bytes
//   w3  used in the 32 wide path as the byte offset of the second output
//       pointer
//   w4  row counter
//   x7  output post-increment in the 32 wide path
//   x8  jump table index on entry; reused as a second output pointer
function prep_neon
        adr             x9, L(prep_tbl)
        ldrh            w8, [x9, x8, lsl #1]
        sub             x9, x9, w8, uxtw
        br              x9

4:
        ld1             {v0.s}[0], [x1], x2
        ld1             {v1.s}[0], [x1], x2
        subs            w4, w4, #2
        ushll           v0.8h, v0.8b, #4
        ushll           v1.8h, v1.8b, #4
        st1             {v0.4h, v1.4h}, [x0], #16
        b.gt            4b
        ret
8:
        ld1             {v0.8b}, [x1], x2
        ld1             {v1.8b}, [x1], x2
        subs            w4, w4, #2
        ushll           v0.8h, v0.8b, #4
        ushll           v1.8h, v1.8b, #4
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            8b
        ret
160:
        // Separate source pointers for even and odd rows, doubled stride.
        add             x9, x1, x2
        lsl             x2, x2, #1
16:
        ld1             {v0.16b}, [x1], x2
        ld1             {v1.16b}, [x9], x2
        subs            w4, w4, #2
        ushll           v4.8h, v0.8b, #4
        ushll2          v5.8h, v0.16b, #4
        ushll           v6.8h, v1.8b, #4
        ushll2          v7.8h, v1.16b, #4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        b.gt            16b
        ret
320:
        // x0 and x8 each write 32 bytes and then advance by x7, interleaving
        // their stores; two rows per iteration.
        add             x8, x0, w3, uxtw
32:
        ld1             {v0.16b, v1.16b}, [x1], x2
        subs            w4, w4, #2
        ushll           v4.8h, v0.8b, #4
        ushll2          v5.8h, v0.16b, #4
        ld1             {v2.16b, v3.16b}, [x1], x2
        ushll           v6.8h, v1.8b, #4
        ushll2          v7.8h, v1.16b, #4
        ushll           v16.8h, v2.8b, #4
        st1             {v4.8h, v5.8h}, [x0], x7
        ushll2          v17.8h, v2.16b, #4
        st1             {v6.8h, v7.8h}, [x8], x7
        ushll           v18.8h, v3.8b, #4
        st1             {v16.8h, v17.8h}, [x0], x7
        ushll2          v19.8h, v3.16b, #4
        st1             {v18.8h, v19.8h}, [x8], x7
        b.gt            32b
        ret
640:
        // x0 and x8 = x0 + 32 each write 32 bytes and then advance by 64.
        add             x8, x0, #32
        mov             x6, #64
64:
        ldp             q0, q1, [x1]
        subs            w4, w4, #1
        ushll           v4.8h, v0.8b, #4
        ushll2          v5.8h, v0.16b, #4
        ldp             q2, q3, [x1, #32]
        ushll           v6.8h, v1.8b, #4
        ushll2          v7.8h, v1.16b, #4
        add             x1, x1, x2
        ushll           v16.8h, v2.8b, #4
        st1             {v4.8h, v5.8h}, [x0], x6
        ushll2          v17.8h, v2.16b, #4
        ushll           v18.8h, v3.8b, #4
        st1             {v6.8h, v7.8h}, [x8], x6
        ushll2          v19.8h, v3.16b, #4
        st1             {v16.8h, v17.8h}, [x0], x6
        st1             {v18.8h, v19.8h}, [x8], x6
        b.gt            64b
        ret
1280:
        // x0 and x8 = x0 + 64 each write 64 bytes and then advance by 128.
        add             x8, x0, #64
        mov             x6, #128
128:
        ldp             q0, q1, [x1]
        ldp             q2, q3, [x1, #32]
        ushll           v16.8h, v0.8b, #4
        ushll2          v17.8h, v0.16b, #4
        ushll           v18.8h, v1.8b, #4
        ushll2          v19.8h, v1.16b, #4
        ushll           v20.8h, v2.8b, #4
        ushll2          v21.8h, v2.16b, #4
        ldp             q4, q5, [x1, #64]
        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x6
        ushll           v22.8h, v3.8b, #4
        ushll2          v23.8h, v3.16b, #4
        ushll           v24.8h, v4.8b, #4
        ushll2          v25.8h, v4.16b, #4
        ushll           v26.8h, v5.8b, #4
        ushll2          v27.8h, v5.16b, #4
        ldp             q6, q7, [x1, #96]
        st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x8], x6
        ushll           v28.8h, v6.8b, #4
        ushll2          v29.8h, v6.16b, #4
        ushll           v30.8h, v7.8b, #4
        ushll2          v31.8h, v7.16b, #4
        subs            w4, w4, #1
        add             x1, x1, x2
        st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x6
        st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x6
        b.gt            128b
        ret

        // Jump table, ordered by x8: index 0 is the 128 wide path.
L(prep_tbl):
        .hword L(prep_tbl) - 1280b
        .hword L(prep_tbl) - 640b
        .hword L(prep_tbl) - 320b
        .hword L(prep_tbl) - 160b
        .hword L(prep_tbl) - 8b
        .hword L(prep_tbl) - 4b
endfunc


// load_slice s0, s1, strd, wd, d0..d6
// Loads lane 0 (of element size wd) of up to seven registers, alternating
// between the pointers s0 and s1 and post-incrementing each by strd.
// d0 and d1 are always loaded; d2 and d3 are loaded together if d2 is given;
// d4, d5 and d6 are loaded individually if given.
.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        ld1             {\d0\wd}[0], [\s0], \strd
        ld1             {\d1\wd}[0], [\s1], \strd
.ifnb \d2
        ld1             {\d2\wd}[0], [\s0], \strd
        ld1             {\d3\wd}[0], [\s1], \strd
.endif
.ifnb \d4
        ld1             {\d4\wd}[0], [\s0], \strd
.endif
.ifnb \d5
        ld1             {\d5\wd}[0], [\s1], \strd
.endif
.ifnb \d6
        ld1             {\d6\wd}[0], [\s0], \strd
.endif
.endm
// load_reg: same argument handling as load_slice, but loads whole registers
// with arrangement wd instead of a single lane.
.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        ld1             {\d0\wd}, [\s0], \strd
        ld1             {\d1\wd}, [\s1], \strd
.ifnb \d2
        ld1             {\d2\wd}, [\s0], \strd
        ld1             {\d3\wd}, [\s1], \strd
.endif
.ifnb \d4
        ld1             {\d4\wd}, [\s0], \strd
.endif
.ifnb \d5
        ld1             {\d5\wd}, [\s1], \strd
.endif
.ifnb \d6
        ld1             {\d6\wd}, [\s0], \strd
.endif
.endm
// Fixed-width wrappers: load_h and load_s load one .h or .s lane per register,
// load_8b and load_16b load whole .8b or .16b registers.
.macro load_h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_slice      \s0, \s1, \strd, .h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_slice      \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_8b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_reg        \s0, \s1, \strd, .8b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_16b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_reg        \s0, \s1, \strd, .16b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
// interleave_1 wd, r0..r4
// trn1 of each register with its successor (r0 with r1, r1 with r2, and if
// r3 is given also r2 with r3 and r3 with r4), results written in place.
.macro interleave_1 wd, r0, r1, r2, r3, r4
        trn1            \r0\wd, \r0\wd, \r1\wd
        trn1            \r1\wd, \r1\wd, \r2\wd
.ifnb \r3
        trn1            \r2\wd, \r2\wd, \r3\wd
        trn1            \r3\wd, \r3\wd, \r4\wd
.endif
.endm
.macro interleave_1_h r0, r1, r2, r3, r4
        interleave_1    .4h, \r0, \r1, \r2, \r3, \r4
.endm
.macro interleave_1_s r0, r1, r2, r3, r4
        interleave_1    .2s, \r0, \r1, \r2, \r3, \r4
.endm
// interleave_2 wd, r0..r5
// trn1 of each register with the one two positions later (r0 with r2, r1
// with r3, r2 with r4, r3 with r5), results written in place.
.macro interleave_2 wd, r0, r1, r2, r3, r4, r5
        trn1            \r0\wd, \r0\wd, \r2\wd
        trn1            \r1\wd, \r1\wd, \r3\wd
        trn1            \r2\wd, \r2\wd, \r4\wd
        trn1            \r3\wd, \r3\wd, \r5\wd
.endm
.macro interleave_2_s r0, r1, r2, r3, r4, r5
        interleave_2    .2s, \r0, \r1, \r2, \r3, \r4, \r5
.endm
// uxtl_b r0..r6
// Zero-extends the low 8 bytes of each given register to eight 16-bit lanes
// in place. r0 and r1 are mandatory; r2 and r3 are handled as a pair.
.macro uxtl_b r0, r1, r2, r3, r4, r5, r6
        uxtl            \r0\().8h, \r0\().8b
        uxtl            \r1\().8h, \r1\().8b
.ifnb \r2
        uxtl            \r2\().8h, \r2\().8b
        uxtl            \r3\().8h, \r3\().8b
.endif
.ifnb \r4
        uxtl            \r4\().8h, \r4\().8b
.endif
.ifnb \r5
        uxtl            \r5\().8h, \r5\().8b
.endif
.ifnb \r6
        uxtl            \r6\().8h, \r6\().8b
.endif
.endm
// mul_mla_4 d, s0, s1, s2, s3, wd
// d = s0*c0 + s1*c1 + s2*c2 + s3*c3, where c0..c3 are the 16-bit lanes
// v0.h[0..3] and wd is the arrangement suffix applied to d and s0..s3.
.macro mul_mla_4 d, s0, s1, s2, s3, wd
        mul             \d\wd, \s0\wd, v0.h[0]
        mla             \d\wd, \s1\wd, v0.h[1]
        mla             \d\wd, \s2\wd, v0.h[2]
        mla             \d\wd, \s3\wd, v0.h[3]
.endm
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
// mul_mla_8_1: two 8-tap sums with the coefficients v0.h[0..7]; d0 uses
// s0..s7 and d1 uses s1..s8 (input window shifted by one register).
.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
        mul             \d0\().8h, \s0\().8h, v0.h[0]
        mla             \d0\().8h, \s1\().8h, v0.h[1]
        mla             \d0\().8h, \s2\().8h, v0.h[2]
        mla             \d0\().8h, \s3\().8h, v0.h[3]
        mla             \d0\().8h, \s4\().8h, v0.h[4]
        mla             \d0\().8h, \s5\().8h, v0.h[5]
        mla             \d0\().8h, \s6\().8h, v0.h[6]
        mla             \d0\().8h, \s7\().8h, v0.h[7]
        mul             \d1\().8h, \s1\().8h, v0.h[0]
        mla             \d1\().8h, \s2\().8h, v0.h[1]
        mla             \d1\().8h, \s3\().8h, v0.h[2]
        mla             \d1\().8h, \s4\().8h, v0.h[3]
        mla             \d1\().8h, \s5\().8h, v0.h[4]
        mla             \d1\().8h, \s6\().8h, v0.h[5]
        mla             \d1\().8h, \s7\().8h, v0.h[6]
        mla             \d1\().8h, \s8\().8h, v0.h[7]
.endm
// mul_mla_8_2: as mul_mla_8_1, but d1 uses s2..s9 (window shifted by two).
.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
        mul             \d0\().8h, \s0\().8h, v0.h[0]
        mla             \d0\().8h, \s1\().8h, v0.h[1]
        mla             \d0\().8h, \s2\().8h, v0.h[2]
        mla             \d0\().8h, \s3\().8h, v0.h[3]
        mla             \d0\().8h, \s4\().8h, v0.h[4]
        mla             \d0\().8h, \s5\().8h, v0.h[5]
        mla             \d0\().8h, \s6\().8h, v0.h[6]
        mla             \d0\().8h, \s7\().8h, v0.h[7]
        mul             \d1\().8h, \s2\().8h, v0.h[0]
        mla             \d1\().8h, \s3\().8h, v0.h[1]
        mla             \d1\().8h, \s4\().8h, v0.h[2]
        mla             \d1\().8h, \s5\().8h, v0.h[3]
        mla             \d1\().8h, \s6\().8h, v0.h[4]
        mla             \d1\().8h, \s7\().8h, v0.h[5]
        mla             \d1\().8h, \s8\().8h, v0.h[6]
        mla             \d1\().8h, \s9\().8h, v0.h[7]
.endm
// mul_mla_8_4: as mul_mla_8_1, but d1 uses s4..s11 (window shifted by four).
.macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11
        mul             \d0\().8h, \s0\().8h, v0.h[0]
        mla             \d0\().8h, \s1\().8h, v0.h[1]
        mla             \d0\().8h, \s2\().8h, v0.h[2]
        mla             \d0\().8h, \s3\().8h, v0.h[3]
        mla             \d0\().8h, \s4\().8h, v0.h[4]
        mla             \d0\().8h, \s5\().8h, v0.h[5]
        mla             \d0\().8h, \s6\().8h, v0.h[6]
        mla             \d0\().8h, \s7\().8h, v0.h[7]
        mul             \d1\().8h, \s4\().8h, v0.h[0]
        mla             \d1\().8h, \s5\().8h, v0.h[1]
        mla             \d1\().8h, \s6\().8h, v0.h[2]
        mla             \d1\().8h, \s7\().8h, v0.h[3]
        mla             \d1\().8h, \s8\().8h, v0.h[4]
        mla             \d1\().8h, \s9\().8h, v0.h[5]
        mla             \d1\().8h, \s10\().8h, v0.h[6]
        mla             \d1\().8h, \s11\().8h, v0.h[7]
.endm
// sqrshrun_b shift, r0..r3
// Narrows each given register in place from 16-bit lanes to bytes with a
// rounding, unsigned-saturating right shift. r2 and r3 are handled as a pair.
.macro sqrshrun_b shift, r0, r1, r2, r3
        sqrshrun        \r0\().8b, \r0\().8h, #\shift
.ifnb \r1
        sqrshrun        \r1\().8b, \r1\().8h, #\shift
.endif
.ifnb \r2
        sqrshrun        \r2\().8b, \r2\().8h, #\shift
        sqrshrun        \r3\().8b, \r3\().8h, #\shift
.endif
.endm
// srshr_h shift, r0..r3
// Signed rounding right shift of the 16-bit lanes of each given register,
// in place. r2 and r3 are handled as a pair.
.macro srshr_h shift, r0, r1, r2, r3
        srshr           \r0\().8h, \r0\().8h, #\shift
.ifnb \r1
        srshr           \r1\().8h, \r1\().8h, #\shift
.endif
.ifnb \r2
        srshr           \r2\().8h, \r2\().8h, #\shift
        srshr           \r3\().8h, \r3\().8h, #\shift
.endif
.endm
// st_h, st_s, st_d: store successive .h, .s or .d lanes alternately through
// x0 and x8, post-incrementing each pointer by strd. st_h stores lanes 0-1,
// or 0-3 if lanes > 2; st_s and st_d store lanes 0-1 of r0 and, if given,
// of r1.
.macro st_h strd, reg, lanes
        st1             {\reg\().h}[0], [x0], \strd
        st1             {\reg\().h}[1], [x8], \strd
.if \lanes > 2
        st1             {\reg\().h}[2], [x0], \strd
        st1             {\reg\().h}[3], [x8], \strd
.endif
.endm
.macro st_s strd, r0, r1
        st1             {\r0\().s}[0], [x0], \strd
        st1             {\r0\().s}[1], [x8], \strd
.ifnb \r1
        st1             {\r1\().s}[0], [x0], \strd
        st1             {\r1\().s}[1], [x8], \strd
.endif
.endm
.macro st_d strd, r0, r1
        st1             {\r0\().d}[0], [x0], \strd
        st1             {\r0\().d}[1], [x8], \strd
.ifnb \r1
        st1             {\r1\().d}[0], [x0], \strd
        st1             {\r1\().d}[1], [x8], \strd
.endif
.endm
// shift_store_4 type, strd, r0, r1
// For type put: narrow to bytes with a rounding shift by 6 and store 4-byte
// lanes. For any other type: rounding shift by 2 kept at 16 bits, stored as
// 8-byte lanes.
.macro shift_store_4 type, strd, r0, r1
.ifc \type, put
        sqrshrun_b      6, \r0, \r1
        st_s            \strd, \r0, \r1
.else
        srshr_h         2, \r0, \r1
        st_d            \strd, \r0, \r1
.endif
.endm
// st_reg strd, wd, r0..r7
// Stores whole registers (arrangement wd) alternately through x0 and x8,
// post-incrementing by strd: always r0/r1, r2/r3 if r2 is given and r4..r7
// if r4 is given.
.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
        st1             {\r0\wd}, [x0], \strd
        st1             {\r1\wd}, [x8], \strd
.ifnb \r2
        st1             {\r2\wd}, [x0], \strd
        st1             {\r3\wd}, [x8], \strd
.endif
.ifnb \r4
        st1             {\r4\wd}, [x0], \strd
        st1             {\r5\wd}, [x8], \strd
        st1             {\r6\wd}, [x0], \strd
        st1             {\r7\wd}, [x8], \strd
.endif
.endm
.macro st_8b strd, r0, r1, r2, r3, r4, r5, r6, r7
        st_reg          \strd, .8b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endm
.macro st_16b strd, r0, r1, r2, r3, r4, r5, r6, r7
        st_reg          \strd, .16b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endm
// shift_store_8 type, strd, r0..r3
// For type put: narrow to bytes with a rounding shift by 6 and store 8 bytes
// per register. For any other type: rounding shift by 2 kept at 16 bits,
// storing 16 bytes per register.
.macro shift_store_8 type, strd, r0, r1, r2, r3
.ifc \type, put
        sqrshrun_b      6, \r0, \r1, \r2, \r3
        st_8b           \strd, \r0, \r1, \r2, \r3
.else
        srshr_h         2, \r0, \r1, \r2, \r3
        st_16b          \strd, \r0, \r1, \r2, \r3
.endif
.endm
// shift_store_16 type, strd, r0..r3
// r0/r1 hold one 16 pixel row and r2/r3 another. For type put: narrow each
// pair into a single 16-byte register (r0 and r2) with a rounding shift by 6
// and store them through x0 and x8. For any other type: rounding shift by 2
// kept at 16 bits, storing each register pair (32 bytes) through x0 and x8.
.macro shift_store_16 type, strd, r0, r1, r2, r3
.ifc \type, put
        sqrshrun        \r0\().8b, \r0\().8h, #6
        sqrshrun2       \r0\().16b, \r1\().8h, #6
        sqrshrun        \r2\().8b, \r2\().8h, #6
        sqrshrun2       \r2\().16b, \r3\().8h, #6
        st_16b          \strd, \r0, \r2
.else
        srshr_h         2, \r0, \r1, \r2, \r3
        st1             {\r0\().8h, \r1\().8h}, [x0], \strd
        st1             {\r2\().8h, \r3\().8h}, [x8], \strd
.endif
.endm

// make_8tap_fn op, type, type_h, type_v
// Emits the exported entry point <op>_8tap_<type>_8bpc_neon, which loads the
// constants type_h and type_v into x8 and x9 and tail-branches to the shared
// implementation <op>_8tap_neon.
.macro make_8tap_fn op, type, type_h, type_v
function \op\()_8tap_\type\()_8bpc_neon, export=1
        mov             x8, \type_h
        mov             x9, \type_v
        b               \op\()_8tap_neon
endfunc
.endm

// No spaces in these expressions, due to gas-preprocessor.
#define REGULAR ((0*15<<7)|3*15)
#define SMOOTH  ((1*15<<7)|4*15)
#define SHARP   ((2*15<<7)|3*15)
// Each constant above packs two row offsets: bits 7 and up are used for
// the 8-tap variant and bits 0-6 for the 4-tap variant (see how the
// fields are extracted with ubfx/and in the 8tap function below).

// filter_fn: instantiates the 8tap (and further) functions for one
// operation type. The .ifc tests below distinguish "put" (8-bit output
// through dst with stride d_strd) from the other type, "prep" (16-bit
// output, row stride derived from w).
// NOTE(review): the macro invocation is outside this chunk; xmx/xmy are
// used as the 64-bit views of mx/my and ds2/sr2 as second-row pointers.
.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv
make_8tap_fn \type, regular,        REGULAR, REGULAR
make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
make_8tap_fn \type, regular_sharp,  REGULAR, SHARP
make_8tap_fn \type, smooth,         SMOOTH,  SMOOTH
make_8tap_fn \type, smooth_regular, SMOOTH,  REGULAR
make_8tap_fn \type, smooth_sharp,   SMOOTH,  SHARP
make_8tap_fn \type, sharp,          SHARP,   SHARP
make_8tap_fn \type, sharp_regular,  SHARP,   REGULAR
make_8tap_fn \type, sharp_smooth,   SHARP,   SMOOTH

// Shared 8tap body. On entry w8/w9 hold the packed horizontal/vertical
// filter type constants set up by the make_8tap_fn entry points.
function \type\()_8tap_neon
        // Replicate the fraction into three 7-bit fields, then add the
        // packed type offsets to the two lower fields.
        mov             w10, #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)
        mul             \mx, \mx, w10
        mul             \my, \my, w10
        add             \mx, \mx, w8  // mx, 8tap_h, 4tap_h
        add             \my, \my, w9  // my, 8tap_v, 4tap_v
.ifc \type, prep
        // prep: output row stride is 2*w bytes.
        uxtw            \d_strd, \w
        lsl             \d_strd, \d_strd, #1
.endif

        // w8 = clz(w) - 24: jump table index, 0 for w == 128 up to 6 for
        // w == 2. Bits 14 and up of mx/my still hold the raw fraction.
        clz             w8,  \w
        tst             \mx, #(0x7f << 14)
        sub             w8,  w8,  #24
        movrel          x10, X(mc_subpel_filters), -8
        b.ne            L(\type\()_8tap_h)
        tst             \my, #(0x7f << 14)
        b.ne            L(\type\()_8tap_v)
        // Neither fraction set: plain copy variant.
        b               \type\()_neon

// ---------------------------------------------------------------------
// Horizontal-only filtering
// ---------------------------------------------------------------------
L(\type\()_8tap_h):
        // Pick the 4-tap field (bits 0-6) for w <= 4, else the 8-tap
        // field (bits 7-13).
        cmp             \w,  #4
        ubfx            w9,  \mx, #7, #7
        and             \mx, \mx, #0x7f
        b.le            4f
        mov             \mx, w9
4:
        tst             \my, #(0x7f << 14)
        add             \xmx, x10, \mx, uxtw #3 // 8 bytes per filter
        b.ne            L(\type\()_8tap_hv)

        adr             x9,  L(\type\()_8tap_h_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        br              x9

20:     // 2xN h
.ifc \type, put
        add             \xmx, \xmx, #2          // middle 4 coefficients
        ld1             {v0.s}[0], [\xmx]
        sub             \src, \src, #1
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1
        sxtl            v0.8h,  v0.8b
2:
        ld1             {v4.8b},  [\src], \s_strd
        ld1             {v6.8b},  [\sr2], \s_strd
        uxtl            v4.8h,  v4.8b
        uxtl            v6.8h,  v6.8b
        ext             v5.16b, v4.16b, v4.16b, #2
        ext             v7.16b, v6.16b, v6.16b, #2
        subs            \h,  \h,  #2
        // Gather the taps of both rows into single 4h vectors.
        trn1            v3.2s,  v4.2s,  v6.2s
        trn2            v6.2s,  v4.2s,  v6.2s
        trn1            v4.2s,  v5.2s,  v7.2s
        trn2            v7.2s,  v5.2s,  v7.2s
        mul             v3.4h,  v3.4h,  v0.h[0]
        mla             v3.4h,  v4.4h,  v0.h[1]
        mla             v3.4h,  v6.4h,  v0.h[2]
        mla             v3.4h,  v7.4h,  v0.h[3]
        srshr           v3.4h,  v3.4h,  #2
        sqrshrun        v3.8b,  v3.8h,  #4
        st1             {v3.h}[0], [\dst], \d_strd
        st1             {v3.h}[1], [\ds2], \d_strd
        b.gt            2b
        ret
.endif

40:     // 4xN h
        add             \xmx, \xmx, #2          // middle 4 coefficients
        ld1             {v0.s}[0], [\xmx]
        sub             \src, \src, #1
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1
        sxtl            v0.8h,  v0.8b
4:
        ld1             {v16.8b}, [\src], \s_strd
        ld1             {v20.8b}, [\sr2], \s_strd
        uxtl            v16.8h,  v16.8b
        uxtl            v20.8h,  v20.8b
        ext             v17.16b, v16.16b, v16.16b, #2
        ext             v18.16b, v16.16b, v16.16b, #4
        ext             v19.16b, v16.16b, v16.16b, #6
        ext             v21.16b, v20.16b, v20.16b, #2
        ext             v22.16b, v20.16b, v20.16b, #4
        ext             v23.16b, v20.16b, v20.16b, #6
        subs            \h,  \h,  #2
        mul             v16.4h,  v16.4h,  v0.h[0]
        mla             v16.4h,  v17.4h,  v0.h[1]
        mla             v16.4h,  v18.4h,  v0.h[2]
        mla             v16.4h,  v19.4h,  v0.h[3]
        mul             v20.4h,  v20.4h,  v0.h[0]
        mla             v20.4h,  v21.4h,  v0.h[1]
        mla             v20.4h,  v22.4h,  v0.h[2]
        mla             v20.4h,  v23.4h,  v0.h[3]
        srshr           v16.4h,  v16.4h,  #2
        srshr           v20.4h,  v20.4h,  #2
.ifc \type, put
        sqrshrun        v16.8b,  v16.8h,  #4
        sqrshrun        v20.8b,  v20.8h,  #4
        st1             {v16.s}[0], [\dst], \d_strd
        st1             {v20.s}[0], [\ds2], \d_strd
.else
        st1             {v16.4h}, [\dst], \d_strd
        st1             {v20.4h}, [\ds2], \d_strd
.endif
        b.gt            4b
        ret

80:     // 8xN h
        ld1             {v0.8b}, [\xmx]
        sub             \src, \src, #3
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1
        sxtl            v0.8h,  v0.8b
8:
        ld1             {v16.8b, v17.8b}, [\src], \s_strd
        ld1             {v20.8b, v21.8b}, [\sr2], \s_strd
        uxtl            v16.8h,  v16.8b
        uxtl            v17.8h,  v17.8b
        uxtl            v20.8h,  v20.8b
        uxtl            v21.8h,  v21.8b

        mul             v18.8h,  v16.8h,  v0.h[0]
        mul             v22.8h,  v20.8h,  v0.h[0]
.irpc i, 1234567
        ext             v19.16b, v16.16b, v17.16b, #(2*\i)
        ext             v23.16b, v20.16b, v21.16b, #(2*\i)
        mla             v18.8h,  v19.8h,  v0.h[\i]
        mla             v22.8h,  v23.8h,  v0.h[\i]
.endr
        subs            \h,  \h,  #2
        srshr           v18.8h,  v18.8h,  #2
        srshr           v22.8h,  v22.8h,  #2
.ifc \type, put
        sqrshrun        v18.8b,  v18.8h,  #4
        sqrshrun        v22.8b,  v22.8h,  #4
        st1             {v18.8b}, [\dst], \d_strd
        st1             {v22.8b}, [\ds2], \d_strd
.else
        st1             {v18.8h}, [\dst], \d_strd
        st1             {v22.8h}, [\ds2], \d_strd
.endif
        b.gt            8b
        ret
160:
320:
640:
1280:   // 16xN, 32xN, ... h
        ld1             {v0.8b}, [\xmx]
        sub             \src, \src, #3
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        sxtl            v0.8h,  v0.8b

        // Each row pass advances src by w + 8 bytes (24 for the first 16
        // pixels, 16 for every further 16), so turn the doubled stride
        // into the remaining distance to the next row pair.
        sub             \s_strd, \s_strd, \w, uxtw
        sub             \s_strd, \s_strd, #8
.ifc \type, put
        lsl             \d_strd, \d_strd, #1
        sub             \d_strd, \d_strd, \w, uxtw
.endif
161:
        ld1             {v16.8b, v17.8b, v18.8b}, [\src], #24
        ld1             {v20.8b, v21.8b, v22.8b}, [\sr2], #24
        mov             \mx, \w                 // mx = columns left in this row
        uxtl            v16.8h,  v16.8b
        uxtl            v17.8h,  v17.8b
        uxtl            v18.8h,  v18.8b
        uxtl            v20.8h,  v20.8b
        uxtl            v21.8h,  v21.8b
        uxtl            v22.8h,  v22.8b

16:
        mul             v24.8h,  v16.8h,  v0.h[0]
        mul             v25.8h,  v17.8h,  v0.h[0]
        mul             v26.8h,  v20.8h,  v0.h[0]
        mul             v27.8h,  v21.8h,  v0.h[0]
.irpc i, 1234567
        ext             v28.16b, v16.16b, v17.16b, #(2*\i)
        ext             v29.16b, v17.16b, v18.16b, #(2*\i)
        ext             v30.16b, v20.16b, v21.16b, #(2*\i)
        ext             v31.16b, v21.16b, v22.16b, #(2*\i)
        mla             v24.8h,  v28.8h,  v0.h[\i]
        mla             v25.8h,  v29.8h,  v0.h[\i]
        mla             v26.8h,  v30.8h,  v0.h[\i]
        mla             v27.8h,  v31.8h,  v0.h[\i]
.endr
        srshr           v24.8h,  v24.8h,  #2
        srshr           v25.8h,  v25.8h,  #2
        srshr           v26.8h,  v26.8h,  #2
        srshr           v27.8h,  v27.8h,  #2
        subs            \mx, \mx, #16
.ifc \type, put
        sqrshrun        v24.8b,  v24.8h,  #4
        sqrshrun2       v24.16b, v25.8h,  #4
        sqrshrun        v26.8b,  v26.8h,  #4
        sqrshrun2       v26.16b, v27.8h,  #4
        st1             {v24.16b}, [\dst], #16
        st1             {v26.16b}, [\ds2], #16
.else
        st1             {v24.8h, v25.8h}, [\dst], #32
        st1             {v26.8h, v27.8h}, [\ds2], #32
.endif
        b.le            9f

        // Keep the last 8 widened pixels and fetch 16 more per row.
        mov             v16.16b, v18.16b
        mov             v20.16b, v22.16b
        ld1             {v17.8b, v18.8b}, [\src], #16
        ld1             {v21.8b, v22.8b}, [\sr2], #16
        uxtl            v17.8h,  v17.8b
        uxtl            v18.8h,  v18.8b
        uxtl            v21.8h,  v21.8b
        uxtl            v22.8h,  v22.8b
        b               16b

9:
        add             \dst, \dst, \d_strd
        add             \ds2, \ds2, \d_strd
        add             \src, \src, \s_strd
        add             \sr2, \sr2, \s_strd

        subs            \h,  \h,  #2
        b.gt            161b
        ret

// Offsets are subtracted from the table address; index = clz(w) - 24.
L(\type\()_8tap_h_tbl):
        .hword L(\type\()_8tap_h_tbl) - 1280b
        .hword L(\type\()_8tap_h_tbl) -  640b
        .hword L(\type\()_8tap_h_tbl) -  320b
        .hword L(\type\()_8tap_h_tbl) -  160b
        .hword L(\type\()_8tap_h_tbl) -   80b
        .hword L(\type\()_8tap_h_tbl) -   40b
        .hword L(\type\()_8tap_h_tbl) -   20b
        .hword 0


// ---------------------------------------------------------------------
// Vertical-only filtering
// ---------------------------------------------------------------------
L(\type\()_8tap_v):
        // 4-tap field for h <= 4, 8-tap field otherwise. None of the
        // instructions up to the br alters the flags, so the size
        // handlers below branch on the result of this cmp.
        cmp             \h,  #4
        ubfx            w9,  \my, #7, #7
        and             \my, \my, #0x7f
        b.le            4f
        mov             \my, w9
4:
        add             \xmy, x10, \my, uxtw #3

        adr             x9,  L(\type\()_8tap_v_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        br              x9

20:     // 2xN v
.ifc \type, put
        b.gt            28f

        cmp             \h,  #2
        add             \xmy, \xmy, #2          // middle 4 coefficients
        ld1             {v0.s}[0], [\xmy]
        sub             \src, \src, \s_strd
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h,  v0.8b

        // 2x2 v
        load_h          \src, \sr2, \s_strd, v1,  v2,  v3,  v4,  v5
        interleave_1_h  v1,  v2,  v3,  v4,  v5
        b.gt            24f
        uxtl_b          v1,  v2,  v3,  v4
        mul_mla_4       v6,  v1,  v2,  v3,  v4,  .4h
        sqrshrun_b      6,   v6
        st_h            \d_strd, v6, 2
        ret

24:     // 2x4 v
        load_h          \sr2, \src, \s_strd, v6,  v7
        interleave_1_h  v5,  v6,  v7
        interleave_2_s  v1,  v2,  v3,  v4,  v5,  v6
        uxtl_b          v1,  v2,  v3,  v4
        mul_mla_4       v6,  v1,  v2,  v3,  v4,  .8h
        sqrshrun_b      6,   v6
        st_h            \d_strd, v6, 4
        ret

28:     // 2x8, 2x16 v
        ld1             {v0.8b}, [\xmy]
        sub             \sr2, \src, \s_strd, lsl #1
        add             \ds2, \dst, \d_strd
        sub             \src, \sr2, \s_strd     // src = first row - 3 rows
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1
        sxtl            v0.8h,  v0.8b

        load_h          \src, \sr2, \s_strd, v1,  v2,  v3,  v4,  v5,  v6,  v7
        interleave_1_h  v1,  v2,  v3,  v4,  v5
        interleave_1_h  v5,  v6,  v7
        interleave_2_s  v1,  v2,  v3,  v4,  v5,  v6
        uxtl_b          v1,  v2,  v3,  v4
216:
        subs            \h,  \h,  #8
        load_h          \sr2, \src, \s_strd, v16, v17, v18, v19
        load_h          \sr2, \src, \s_strd, v20, v21, v22, v23
        interleave_1_h  v7,  v16, v17, v18, v19
        interleave_1_h  v19, v20, v21, v22, v23
        interleave_2_s  v5,  v6,  v7,  v16, v17, v18
        interleave_2_s  v17, v18, v19, v20, v21, v22
        uxtl_b          v5,  v6,  v7,  v16
        uxtl_b          v17, v18, v19, v20
        mul_mla_8_4     v30, v31, v1,  v2,  v3,  v4,  v5,  v6,  v7,  v16, v17, v18, v19, v20
        sqrshrun_b      6,   v30, v31
        st_h            \d_strd, v30, 4
        st_h            \d_strd, v31, 4
        b.le            0f
        mov             v1.16b,  v17.16b
        mov             v2.16b,  v18.16b
        mov             v3.16b,  v19.16b
        mov             v4.16b,  v20.16b
        mov             v5.16b,  v21.16b
        mov             v6.16b,  v22.16b
        mov             v7.16b,  v23.16b
        b               216b
0:
        ret
.endif

40:
        b.gt            480f

        // 4x2, 4x4 v
        cmp             \h,  #2
        add             \xmy, \xmy, #2
        ld1             {v0.s}[0], [\xmy]
        sub             \src, \src, \s_strd
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h,  v0.8b

        load_s          \src, \sr2, \s_strd, v1,  v2,  v3,  v4,  v5
        interleave_1_s  v1,  v2,  v3,  v4,  v5
        uxtl_b          v1,  v2,  v3,  v4
        mul_mla_4       v6,  v1,  v2,  v3,  v4,  .8h
        shift_store_4   \type, \d_strd, v6
        b.le            0f
        load_s          \sr2, \src, \s_strd, v6,  v7
        interleave_1_s  v5,  v6,  v7
        uxtl_b          v5,  v6
        mul_mla_4       v7,  v3,  v4,  v5,  v6,  .8h
        shift_store_4   \type, \d_strd, v7
0:
        ret

480:    // 4x8, 4x16 v
        ld1             {v0.8b}, [\xmy]
        sub             \sr2, \src, \s_strd, lsl #1
        add             \ds2, \dst, \d_strd
        sub             \src, \sr2, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h,  v0.8b

        load_s          \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
        interleave_1_s  v16, v17, v18
        interleave_1_s  v18, v19, v20, v21, v22
        uxtl_b          v16, v17
        uxtl_b          v18, v19, v20, v21

        // Unrolled three times so that the register window rotates
        // through v16-v27 without any moves.
48:
        subs            \h,  \h,  #4
        load_s          \sr2, \src, \s_strd, v23, v24, v25, v26
        interleave_1_s  v22, v23, v24, v25, v26
        uxtl_b          v22, v23, v24, v25
        mul_mla_8_2     v1,  v2,  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
        shift_store_4   \type, \d_strd, v1, v2
        b.le            0f
        subs            \h,  \h,  #4
        load_s          \sr2, \src, \s_strd, v27, v16, v17, v18
        interleave_1_s  v26, v27, v16, v17, v18
        uxtl_b          v26, v27, v16, v17
        mul_mla_8_2     v1,  v2,  v20, v21, v22, v23, v24, v25, v26, v27, v16, v17
        shift_store_4   \type, \d_strd, v1, v2
        b.le            0f
        subs            \h,  \h,  #4
        load_s          \sr2, \src, \s_strd, v19, v20, v21, v22
        interleave_1_s  v18, v19, v20, v21, v22
        uxtl_b          v18, v19, v20, v21
        mul_mla_8_2     v1,  v2,  v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
        shift_store_4   \type, \d_strd, v1, v2
        b.gt            48b
0:
        ret

80:
        b.gt            880f

        // 8x2, 8x4 v
        cmp             \h,  #2
        add             \xmy, \xmy, #2
        ld1             {v0.s}[0], [\xmy]
        sub             \src, \src, \s_strd
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h,  v0.8b

        load_8b         \src, \sr2, \s_strd, v1,  v2,  v3,  v4,  v5
        uxtl_b          v1,  v2,  v3,  v4,  v5
        mul_mla_4       v6,  v1,  v2,  v3,  v4,  .8h
        mul_mla_4       v7,  v2,  v3,  v4,  v5,  .8h
        shift_store_8   \type, \d_strd, v6, v7
        b.le            0f
        load_8b         \sr2, \src, \s_strd, v6,  v7
        uxtl_b          v6,  v7
        mul_mla_4       v1,  v3,  v4,  v5,  v6,  .8h
        mul_mla_4       v2,  v4,  v5,  v6,  v7,  .8h
        shift_store_8   \type, \d_strd, v1, v2
0:
        ret

880:    // 8x6, 8x8, 8x16, 8x32 v
1680:   // 16x8, 16x16, ...
320:    // 32x8, 32x16, ...
640:
1280:
        ld1             {v0.8b}, [\xmy]
        sub             \src, \src, \s_strd
        sub             \src, \src, \s_strd, lsl #1
        sxtl            v0.8h,  v0.8b
        mov             \my, \h                 // keep h for the next column strip
168:
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        load_8b         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
        uxtl_b          v16, v17, v18, v19, v20, v21, v22

88:
        subs            \h,  \h,  #2
        load_8b         \sr2, \src, \s_strd, v23, v24
        uxtl_b          v23, v24
        mul_mla_8_1     v1,  v2,  v16, v17, v18, v19, v20, v21, v22, v23, v24
        shift_store_8   \type, \d_strd, v1, v2
        b.le            9f
        subs            \h,  \h,  #2
        load_8b         \sr2, \src, \s_strd, v25, v26
        uxtl_b          v25, v26
        mul_mla_8_1     v3,  v4,  v18, v19, v20, v21, v22, v23, v24, v25, v26
        shift_store_8   \type, \d_strd, v3, v4
        b.le            9f
        subs            \h,  \h,  #2
        load_8b         \sr2, \src, \s_strd, v27, v16
        uxtl_b          v27, v16
        mul_mla_8_1     v1,  v2,  v20, v21, v22, v23, v24, v25, v26, v27, v16
        shift_store_8   \type, \d_strd, v1, v2
        b.le            9f
        subs            \h,  \h,  #2
        load_8b         \sr2, \src, \s_strd, v17, v18
        uxtl_b          v17, v18
        mul_mla_8_1     v3,  v4,  v22, v23, v24, v25, v26, v27, v16, v17, v18
        shift_store_8   \type, \d_strd, v3, v4
        b.le            9f
        subs            \h,  \h,  #4
        load_8b         \sr2, \src, \s_strd, v19, v20, v21, v22
        uxtl_b          v19, v20, v21, v22
        mul_mla_8_1     v1,  v2,  v24, v25, v26, v27, v16, v17, v18, v19, v20
        mul_mla_8_1     v3,  v4,  v26, v27, v16, v17, v18, v19, v20, v21, v22
        shift_store_8   \type, \d_strd, v1, v2, v3, v4
        b.gt            88b
9:
        // Next 8-column strip: undo the stride doubling, rewind both
        // pointers by the saved h rows (src by 8 more) and step right.
        subs            \w,  \w,  #8
        b.le            0f
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #3
        mov             \h,  \my
        add             \src, \src, #8
.ifc \type, put
        add             \dst, \dst, #8
.else
        add             \dst, \dst, #16
.endif
        b               168b
0:
        ret

160:
        b.gt            1680b

        // 16x2, 16x4 v
        add             \xmy, \xmy, #2
        ld1             {v0.s}[0], [\xmy]
        sub             \src, \src, \s_strd
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h,  v0.8b

        cmp             \h,  #2
        load_16b        \src, \sr2, \s_strd, v1,  v2,  v3,  v4,  v5
        uxtl            v16.8h, v1.8b
        uxtl            v17.8h, v2.8b
        uxtl            v18.8h, v3.8b
        uxtl            v19.8h, v4.8b
        uxtl            v20.8h, v5.8b
        uxtl2           v23.8h, v1.16b
        uxtl2           v24.8h, v2.16b
        uxtl2           v25.8h, v3.16b
        uxtl2           v26.8h, v4.16b
        uxtl2           v27.8h, v5.16b
        mul_mla_4       v1,  v16, v17, v18, v19, .8h
        mul_mla_4       v16, v17, v18, v19, v20, .8h
        mul_mla_4       v2,  v23, v24, v25, v26, .8h
        mul_mla_4       v17, v24, v25, v26, v27, .8h
        shift_store_16  \type, \d_strd, v1, v2, v16, v17
        b.le            0f
        load_16b        \sr2, \src, \s_strd, v6,  v7
        uxtl            v21.8h, v6.8b
        uxtl            v22.8h, v7.8b
        uxtl2           v28.8h, v6.16b
        uxtl2           v29.8h, v7.16b
        mul_mla_4       v1,  v18, v19, v20, v21, .8h
        mul_mla_4       v3,  v19, v20, v21, v22, .8h
        mul_mla_4       v2,  v25, v26, v27, v28, .8h
        mul_mla_4       v4,  v26, v27, v28, v29, .8h
        shift_store_16  \type, \d_strd, v1, v2, v3, v4
0:
        ret

L(\type\()_8tap_v_tbl):
        .hword L(\type\()_8tap_v_tbl) - 1280b
        .hword L(\type\()_8tap_v_tbl) -  640b
        .hword L(\type\()_8tap_v_tbl) -  320b
        .hword L(\type\()_8tap_v_tbl) -  160b
        .hword L(\type\()_8tap_v_tbl) -   80b
        .hword L(\type\()_8tap_v_tbl) -   40b
        .hword L(\type\()_8tap_v_tbl) -   20b
        .hword 0

// ---------------------------------------------------------------------
// Combined horizontal + vertical filtering.
// v0 = horizontal coefficients, v1 = vertical coefficients. The paths
// below call local helpers with bl, so the return address is parked in
// x15 and the function leaves through "br x15".
// ---------------------------------------------------------------------
L(\type\()_8tap_hv):
        // Flags from this cmp stay live across the table dispatch.
        cmp             \h,  #4
        ubfx            w9,  \my, #7, #7
        and             \my, \my, #0x7f
        b.le            4f
        mov             \my, w9
4:
        add             \xmy, x10, \my, uxtw #3

        adr             x9,  L(\type\()_8tap_hv_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        br              x9

20:
.ifc \type, put
        add             \xmx, \xmx, #2
        ld1             {v0.s}[0], [\xmx]
        b.gt            280f
        add             \xmy, \xmy, #2
        ld1             {v1.s}[0], [\xmy]

        // 2x2, 2x4 hv
        sub             \sr2, \src, #1
        sub             \src, \sr2, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h,  v0.8b
        sxtl            v1.8h,  v1.8b
        mov             x15, x30

        // First row on its own; rows after that come in pairs.
        ld1             {v28.8b}, [\src], \s_strd
        uxtl            v28.8h,  v28.8b
        ext             v29.16b, v28.16b, v28.16b, #2
        mul             v28.4h,  v28.4h,  v0.4h
        mul             v29.4h,  v29.4h,  v0.4h
        addp            v28.4h,  v28.4h,  v29.4h
        addp            v16.4h,  v28.4h,  v28.4h
        srshr           v16.4h,  v16.4h,  #2
        bl              L(\type\()_8tap_filter_2)

        trn1            v16.2s, v16.2s, v28.2s
        mov             v17.8b, v28.8b

2:
        bl              L(\type\()_8tap_filter_2)

        ext             v18.8b, v17.8b, v28.8b, #4
        mov             v19.8b, v28.8b
        smull           v2.4s,  v16.4h, v1.h[0]
        smlal           v2.4s,  v17.4h, v1.h[1]
        smlal           v2.4s,  v18.4h, v1.h[2]
        smlal           v2.4s,  v19.4h, v1.h[3]

        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
        sqxtun          v2.8b,  v2.8h
        subs            \h,  \h,  #2
        st1             {v2.h}[0], [\dst], \d_strd
        st1             {v2.h}[1], [\ds2], \d_strd
        b.le            0f
        mov             v16.8b, v18.8b
        mov             v17.8b, v19.8b
        b               2b

280:    // 2x8, 2x16, 2x32 hv
        ld1             {v1.8b}, [\xmy]
        sub             \src, \src, #1
        sub             \sr2, \src, \s_strd, lsl #1
        sub             \src, \sr2, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h,  v0.8b
        sxtl            v1.8h,  v1.8b
        mov             x15, x30

        ld1             {v28.8b}, [\src], \s_strd
        uxtl            v28.8h,  v28.8b
        ext             v29.16b, v28.16b, v28.16b, #2
        mul             v28.4h,  v28.4h,  v0.4h
        mul             v29.4h,  v29.4h,  v0.4h
        addp            v28.4h,  v28.4h,  v29.4h
        addp            v16.4h,  v28.4h,  v28.4h
        srshr           v16.4h,  v16.4h,  #2

        bl              L(\type\()_8tap_filter_2)
        trn1            v16.2s, v16.2s, v28.2s
        mov             v17.8b, v28.8b
        bl              L(\type\()_8tap_filter_2)
        ext             v18.8b, v17.8b, v28.8b, #4
        mov             v19.8b, v28.8b
        bl              L(\type\()_8tap_filter_2)
        ext             v20.8b, v19.8b, v28.8b, #4
        mov             v21.8b, v28.8b

28:
        bl              L(\type\()_8tap_filter_2)
        ext             v22.8b, v21.8b, v28.8b, #4
        mov             v23.8b, v28.8b
        smull           v2.4s,  v16.4h, v1.h[0]
        smlal           v2.4s,  v17.4h, v1.h[1]
        smlal           v2.4s,  v18.4h, v1.h[2]
        smlal           v2.4s,  v19.4h, v1.h[3]
        smlal           v2.4s,  v20.4h, v1.h[4]
        smlal           v2.4s,  v21.4h, v1.h[5]
        smlal           v2.4s,  v22.4h, v1.h[6]
        smlal           v2.4s,  v23.4h, v1.h[7]

        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
        sqxtun          v2.8b,  v2.8h
        subs            \h,  \h,  #2
        st1             {v2.h}[0], [\dst], \d_strd
        st1             {v2.h}[1], [\ds2], \d_strd
        b.le            0f
        mov             v16.8b, v18.8b
        mov             v17.8b, v19.8b
        mov             v18.8b, v20.8b
        mov             v19.8b, v21.8b
        mov             v20.8b, v22.8b
        mov             v21.8b, v23.8b
        b               28b

0:
        br              x15

// Helper: horizontally filter one row from sr2 and one from src with the
// 4 taps in v0.h[0..3], two output pixels each. Result in v28.4h:
// lanes 0-1 = sr2 row, lanes 2-3 = src row. Uses v27-v31 as scratch.
L(\type\()_8tap_filter_2):
        ld1             {v28.8b}, [\sr2], \s_strd
        ld1             {v30.8b}, [\src], \s_strd
        uxtl            v28.8h,  v28.8b
        uxtl            v30.8h,  v30.8b
        ext             v29.16b, v28.16b, v28.16b, #2
        ext             v31.16b, v30.16b, v30.16b, #2
        trn1            v27.2s,  v28.2s,  v30.2s
        trn2            v30.2s,  v28.2s,  v30.2s
        trn1            v28.2s,  v29.2s,  v31.2s
        trn2            v31.2s,  v29.2s,  v31.2s
        mul             v27.4h,  v27.4h,  v0.h[0]
        mla             v27.4h,  v28.4h,  v0.h[1]
        mla             v27.4h,  v30.4h,  v0.h[2]
        mla             v27.4h,  v31.4h,  v0.h[3]
        srshr           v28.4h,  v27.4h,  #2
        ret
.endif

40:
        add             \xmx, \xmx, #2
        ld1             {v0.s}[0], [\xmx]
        b.gt            480f
        add             \xmy, \xmy, #2
        ld1             {v1.s}[0], [\xmy]
        sub             \sr2, \src, #1
        sub             \src, \sr2, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h,  v0.8b
        sxtl            v1.8h,  v1.8b
        mov             x15, x30

        // 4x2, 4x4 hv
        ld1             {v26.8b}, [\src], \s_strd
        uxtl            v26.8h,  v26.8b
        ext             v28.16b, v26.16b, v26.16b, #2
        ext             v29.16b, v26.16b, v26.16b, #4
        ext             v30.16b, v26.16b, v26.16b, #6
        mul             v31.4h,  v26.4h,  v0.h[0]
        mla             v31.4h,  v28.4h,  v0.h[1]
        mla             v31.4h,  v29.4h,  v0.h[2]
        mla             v31.4h,  v30.4h,  v0.h[3]
        srshr           v16.4h,  v31.4h,  #2

        bl              L(\type\()_8tap_filter_4)
        mov             v17.8b, v28.8b
        mov             v18.8b, v29.8b

4:
        bl              L(\type\()_8tap_filter_4)
        // Interleaving the mul/mla chains actually hurts performance
        // significantly on Cortex A53, thus keeping mul/mla tightly
        // chained like this.
        smull           v2.4s,  v16.4h, v1.h[0]
        smlal           v2.4s,  v17.4h, v1.h[1]
        smlal           v2.4s,  v18.4h, v1.h[2]
        smlal           v2.4s,  v28.4h, v1.h[3]
        smull           v3.4s,  v17.4h, v1.h[0]
        smlal           v3.4s,  v18.4h, v1.h[1]
        smlal           v3.4s,  v28.4h, v1.h[2]
        smlal           v3.4s,  v29.4h, v1.h[3]
        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
        sqrshrn         v3.4h,  v3.4s,  #\shift_hv
        subs            \h,  \h,  #2
.ifc \type, put
        sqxtun          v2.8b,  v2.8h
        sqxtun          v3.8b,  v3.8h
        st1             {v2.s}[0], [\dst], \d_strd
        st1             {v3.s}[0], [\ds2], \d_strd
.else
        st1             {v2.4h}, [\dst], \d_strd
        st1             {v3.4h}, [\ds2], \d_strd
.endif
        b.le            0f
        mov             v16.8b, v18.8b
        mov             v17.8b, v28.8b
        mov             v18.8b, v29.8b
        b               4b

480:    // 4x8, 4x16, 4x32 hv
        ld1             {v1.8b}, [\xmy]
        sub             \src, \src, #1
        sub             \sr2, \src, \s_strd, lsl #1
        sub             \src, \sr2, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h,  v0.8b
        sxtl            v1.8h,  v1.8b
        mov             x15, x30

        ld1             {v26.8b}, [\src], \s_strd
        uxtl            v26.8h,  v26.8b
        ext             v28.16b, v26.16b, v26.16b, #2
        ext             v29.16b, v26.16b, v26.16b, #4
        ext             v30.16b, v26.16b, v26.16b, #6
        mul             v31.4h,  v26.4h,  v0.h[0]
        mla             v31.4h,  v28.4h,  v0.h[1]
        mla             v31.4h,  v29.4h,  v0.h[2]
        mla             v31.4h,  v30.4h,  v0.h[3]
        srshr           v16.4h,  v31.4h,  #2

        bl              L(\type\()_8tap_filter_4)
        mov             v17.8b, v28.8b
        mov             v18.8b, v29.8b
        bl              L(\type\()_8tap_filter_4)
        mov             v19.8b, v28.8b
        mov             v20.8b, v29.8b
        bl              L(\type\()_8tap_filter_4)
        mov             v21.8b, v28.8b
        mov             v22.8b, v29.8b

48:
        bl              L(\type\()_8tap_filter_4)
        smull           v2.4s,  v16.4h, v1.h[0]
        smlal           v2.4s,  v17.4h, v1.h[1]
        smlal           v2.4s,  v18.4h, v1.h[2]
        smlal           v2.4s,  v19.4h, v1.h[3]
        smlal           v2.4s,  v20.4h, v1.h[4]
        smlal           v2.4s,  v21.4h, v1.h[5]
        smlal           v2.4s,  v22.4h, v1.h[6]
        smlal           v2.4s,  v28.4h, v1.h[7]
        smull           v3.4s,  v17.4h, v1.h[0]
        smlal           v3.4s,  v18.4h, v1.h[1]
        smlal           v3.4s,  v19.4h, v1.h[2]
        smlal           v3.4s,  v20.4h, v1.h[3]
        smlal           v3.4s,  v21.4h, v1.h[4]
        smlal           v3.4s,  v22.4h, v1.h[5]
        smlal           v3.4s,  v28.4h, v1.h[6]
        smlal           v3.4s,  v29.4h, v1.h[7]
        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
        sqrshrn         v3.4h,  v3.4s,  #\shift_hv
        subs            \h,  \h,  #2
.ifc \type, put
        sqxtun          v2.8b,  v2.8h
        sqxtun          v3.8b,  v3.8h
        st1             {v2.s}[0], [\dst], \d_strd
        st1             {v3.s}[0], [\ds2], \d_strd
.else
        st1             {v2.4h}, [\dst], \d_strd
        st1             {v3.4h}, [\ds2], \d_strd
.endif
        b.le            0f
        mov             v16.8b, v18.8b
        mov             v17.8b, v19.8b
        mov             v18.8b, v20.8b
        mov             v19.8b, v21.8b
        mov             v20.8b, v22.8b
        mov             v21.8b, v28.8b
        mov             v22.8b, v29.8b
        b               48b
0:
        br              x15

// Helper: horizontally filter 4 pixels of one row from sr2 and one row
// from src with the taps in v0.h[0..3].
// Result: v28.4h = sr2 row, v29.4h = src row. Uses v26-v31 as scratch.
L(\type\()_8tap_filter_4):
        ld1             {v26.8b}, [\sr2], \s_strd
        ld1             {v27.8b}, [\src], \s_strd
        uxtl            v26.8h,  v26.8b
        uxtl            v27.8h,  v27.8b
        ext             v28.16b, v26.16b, v26.16b, #2
        ext             v29.16b, v26.16b, v26.16b, #4
        ext             v30.16b, v26.16b, v26.16b, #6
        mul             v31.4h,  v26.4h,  v0.h[0]
        mla             v31.4h,  v28.4h,  v0.h[1]
        mla             v31.4h,  v29.4h,  v0.h[2]
        mla             v31.4h,  v30.4h,  v0.h[3]
        ext             v28.16b, v27.16b, v27.16b, #2
        ext             v29.16b, v27.16b, v27.16b, #4
        ext             v30.16b, v27.16b, v27.16b, #6
        mul             v27.4h,  v27.4h,  v0.h[0]
        mla             v27.4h,  v28.4h,  v0.h[1]
        mla             v27.4h,  v29.4h,  v0.h[2]
        mla             v27.4h,  v30.4h,  v0.h[3]
        srshr           v28.4h,  v31.4h,  #2
        srshr           v29.4h,  v27.4h,  #2
        ret

80:
160:
320:
        b.gt            880f
        add             \xmy, \xmy, #2
        ld1             {v0.8b},   [\xmx]
        ld1             {v1.s}[0], [\xmy]
        sub             \src, \src, #3
        sub             \src, \src, \s_strd
        sxtl            v0.8h,  v0.8b
        sxtl            v1.8h,  v1.8b
        mov             x15, x30
        mov             \my, \h                 // keep h for the next column strip

164:    // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1

        ld1             {v28.8b, v29.8b}, [\src], \s_strd
        uxtl            v28.8h,  v28.8b
        uxtl            v29.8h,  v29.8b
        mul             v24.8h,  v28.8h,  v0.h[0]
.irpc i, 1234567
        ext             v26.16b, v28.16b, v29.16b, #(2*\i)
        mla             v24.8h,  v26.8h,  v0.h[\i]
.endr
        srshr           v16.8h,  v24.8h,  #2

        bl              L(\type\()_8tap_filter_8)
        mov             v17.16b, v24.16b
        mov             v18.16b, v25.16b

8:
        smull           v2.4s,  v16.4h, v1.h[0]
        smull2          v3.4s,  v16.8h, v1.h[0]
        bl              L(\type\()_8tap_filter_8)
        smull           v4.4s,  v17.4h, v1.h[0]
        smull2          v5.4s,  v17.8h, v1.h[0]
        smlal           v2.4s,  v17.4h, v1.h[1]
        smlal2          v3.4s,  v17.8h, v1.h[1]
        smlal           v4.4s,  v18.4h, v1.h[1]
        smlal2          v5.4s,  v18.8h, v1.h[1]
        smlal           v2.4s,  v18.4h, v1.h[2]
        smlal2          v3.4s,  v18.8h, v1.h[2]
        smlal           v4.4s,  v24.4h, v1.h[2]
        smlal2          v5.4s,  v24.8h, v1.h[2]
        smlal           v2.4s,  v24.4h, v1.h[3]
        smlal2          v3.4s,  v24.8h, v1.h[3]
        smlal           v4.4s,  v25.4h, v1.h[3]
        smlal2          v5.4s,  v25.8h, v1.h[3]
        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
        sqrshrn2        v2.8h,  v3.4s,  #\shift_hv
        sqrshrn         v4.4h,  v4.4s,  #\shift_hv
        sqrshrn2        v4.8h,  v5.4s,  #\shift_hv
        subs            \h,  \h,  #2
.ifc \type, put
        sqxtun          v2.8b,  v2.8h
        sqxtun          v4.8b,  v4.8h
        st1             {v2.8b}, [\dst], \d_strd
        st1             {v4.8b}, [\ds2], \d_strd
.else
        st1             {v2.8h}, [\dst], \d_strd
        st1             {v4.8h}, [\ds2], \d_strd
.endif
        b.le            9f
        mov             v16.16b, v18.16b
        mov             v17.16b, v24.16b
        mov             v18.16b, v25.16b
        b               8b
9:
        // Next 8-column strip (4-tap vertical: rewind h + 4 source rows).
        subs            \w,  \w,  #8
        b.le            0f
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #2
        mov             \h,  \my
        add             \src, \src, #8
.ifc \type, put
        add             \dst, \dst, #8
.else
        add             \dst, \dst, #16
.endif
        b               164b

880:    // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
640:
1280:
        ld1             {v0.8b}, [\xmx]
        ld1             {v1.8b}, [\xmy]
        sub             \src, \src, #3
        sub             \src, \src, \s_strd
        sub             \src, \src, \s_strd, lsl #1
        sxtl            v0.8h,  v0.8b
        sxtl            v1.8h,  v1.8b
        mov             x15, x30
        mov             \my, \h                 // keep h for the next column strip

168:
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1

        ld1             {v28.8b, v29.8b}, [\src], \s_strd
        uxtl            v28.8h,  v28.8b
        uxtl            v29.8h,  v29.8b
        mul             v24.8h,  v28.8h,  v0.h[0]
.irpc i, 1234567
        ext             v26.16b, v28.16b, v29.16b, #(2*\i)
        mla             v24.8h,  v26.8h,  v0.h[\i]
.endr
        srshr           v16.8h,  v24.8h,  #2

        bl              L(\type\()_8tap_filter_8)
        mov             v17.16b, v24.16b
        mov             v18.16b, v25.16b
        bl              L(\type\()_8tap_filter_8)
        mov             v19.16b, v24.16b
        mov             v20.16b, v25.16b
        bl              L(\type\()_8tap_filter_8)
        mov             v21.16b, v24.16b
        mov             v22.16b, v25.16b

88:
        smull           v2.4s,  v16.4h, v1.h[0]
        smull2          v3.4s,  v16.8h, v1.h[0]
        bl              L(\type\()_8tap_filter_8)
        smull           v4.4s,  v17.4h, v1.h[0]
        smull2          v5.4s,  v17.8h, v1.h[0]
        smlal           v2.4s,  v17.4h, v1.h[1]
        smlal2          v3.4s,  v17.8h, v1.h[1]
        smlal           v4.4s,  v18.4h, v1.h[1]
        smlal2          v5.4s,  v18.8h, v1.h[1]
        smlal           v2.4s,  v18.4h, v1.h[2]
        smlal2          v3.4s,  v18.8h, v1.h[2]
        smlal           v4.4s,  v19.4h, v1.h[2]
        smlal2          v5.4s,  v19.8h, v1.h[2]
        smlal           v2.4s,  v19.4h, v1.h[3]
        smlal2          v3.4s,  v19.8h, v1.h[3]
        smlal           v4.4s,  v20.4h, v1.h[3]
        smlal2          v5.4s,  v20.8h, v1.h[3]
        smlal           v2.4s,  v20.4h, v1.h[4]
        smlal2          v3.4s,  v20.8h, v1.h[4]
        smlal           v4.4s,  v21.4h, v1.h[4]
        smlal2          v5.4s,  v21.8h, v1.h[4]
        smlal           v2.4s,  v21.4h, v1.h[5]
        smlal2          v3.4s,  v21.8h, v1.h[5]
        smlal           v4.4s,  v22.4h, v1.h[5]
        smlal2          v5.4s,  v22.8h, v1.h[5]
        smlal           v2.4s,  v22.4h, v1.h[6]
        smlal2          v3.4s,  v22.8h, v1.h[6]
        smlal           v4.4s,  v24.4h, v1.h[6]
        smlal2          v5.4s,  v24.8h, v1.h[6]
        smlal           v2.4s,  v24.4h, v1.h[7]
        smlal2          v3.4s,  v24.8h, v1.h[7]
        smlal           v4.4s,  v25.4h, v1.h[7]
        smlal2          v5.4s,  v25.8h, v1.h[7]
        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
        sqrshrn2        v2.8h,  v3.4s,  #\shift_hv
        sqrshrn         v4.4h,  v4.4s,  #\shift_hv
        sqrshrn2        v4.8h,  v5.4s,  #\shift_hv
        subs            \h,  \h,  #2
.ifc \type, put
        sqxtun          v2.8b,  v2.8h
        sqxtun          v4.8b,  v4.8h
        st1             {v2.8b}, [\dst], \d_strd
        st1             {v4.8b}, [\ds2], \d_strd
.else
        st1             {v2.8h}, [\dst], \d_strd
        st1             {v4.8h}, [\ds2], \d_strd
.endif
        b.le            9f
        mov             v16.16b, v18.16b
        mov             v17.16b, v19.16b
        mov             v18.16b, v20.16b
        mov             v19.16b, v21.16b
        mov             v20.16b, v22.16b
        mov             v21.16b, v24.16b
        mov             v22.16b, v25.16b
        b               88b
9:
        // Next 8-column strip (8-tap vertical: rewind h + 8 source rows).
        subs            \w,  \w,  #8
        b.le            0f
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #3
        mov             \h,  \my
        add             \src, \src, #8
.ifc \type, put
        add             \dst, \dst, #8
.else
        add             \dst, \dst, #16
.endif
        b               168b
0:
        br              x15

// Helper: horizontally filter 8 pixels of one row from sr2 and one row
// from src with the 8 taps in v0.
// Result: v24.8h = sr2 row, v25.8h = src row. Uses v26-v31 as scratch.
L(\type\()_8tap_filter_8):
        ld1             {v28.8b, v29.8b}, [\sr2], \s_strd
        ld1             {v30.8b, v31.8b}, [\src], \s_strd
        uxtl            v28.8h,  v28.8b
        uxtl            v29.8h,  v29.8b
        uxtl            v30.8h,  v30.8b
        uxtl            v31.8h,  v31.8b
        mul             v24.8h,  v28.8h,  v0.h[0]
        mul             v25.8h,  v30.8h,  v0.h[0]
.irpc i, 1234567
        ext             v26.16b, v28.16b, v29.16b, #(2*\i)
        ext             v27.16b, v30.16b, v31.16b, #(2*\i)
        mla             v24.8h,  v26.8h,  v0.h[\i]
        mla             v25.8h,  v27.8h,  v0.h[\i]
.endr
        srshr           v24.8h,  v24.8h,  #2
        srshr           v25.8h,  v25.8h,  #2
        ret

L(\type\()_8tap_hv_tbl):
        .hword L(\type\()_8tap_hv_tbl) - 1280b
        .hword L(\type\()_8tap_hv_tbl) -  640b
        .hword L(\type\()_8tap_hv_tbl) -  320b
        .hword L(\type\()_8tap_hv_tbl) -  160b
        .hword L(\type\()_8tap_hv_tbl) -   80b
        .hword L(\type\()_8tap_hv_tbl) -   40b
        .hword L(\type\()_8tap_hv_tbl) -   20b
        .hword 0
endfunc


function \type\()_bilin_8bpc_neon,
export=1 2400 dup v1.16b, \mx 2401 dup v3.16b, \my 2402 mov w9, #16 2403 sub w8, w9, \mx 2404 sub w9, w9, \my 2405 dup v0.16b, w8 2406 dup v2.16b, w9 2407.ifc \type, prep 2408 uxtw \d_strd, \w 2409 lsl \d_strd, \d_strd, #1 2410.endif 2411 2412 clz w8, \w 2413 sub w8, w8, #24 2414 cbnz \mx, L(\type\()_bilin_h) 2415 cbnz \my, L(\type\()_bilin_v) 2416 b \type\()_neon 2417 2418L(\type\()_bilin_h): 2419 cbnz \my, L(\type\()_bilin_hv) 2420 2421 adr x9, L(\type\()_bilin_h_tbl) 2422 ldrh w8, [x9, x8, lsl #1] 2423 sub x9, x9, w8, uxtw 2424 br x9 2425 242620: // 2xN h 2427.ifc \type, put 2428 add \ds2, \dst, \d_strd 2429 add \sr2, \src, \s_strd 2430 lsl \d_strd, \d_strd, #1 2431 lsl \s_strd, \s_strd, #1 24322: 2433 ld1 {v4.s}[0], [\src], \s_strd 2434 ld1 {v6.s}[0], [\sr2], \s_strd 2435 ext v5.8b, v4.8b, v4.8b, #1 2436 ext v7.8b, v6.8b, v6.8b, #1 2437 trn1 v4.4h, v4.4h, v6.4h 2438 trn1 v5.4h, v5.4h, v7.4h 2439 subs \h, \h, #2 2440 umull v4.8h, v4.8b, v0.8b 2441 umlal v4.8h, v5.8b, v1.8b 2442 uqrshrn v4.8b, v4.8h, #4 2443 st1 {v4.h}[0], [\dst], \d_strd 2444 st1 {v4.h}[1], [\ds2], \d_strd 2445 b.gt 2b 2446 ret 2447.endif 2448 244940: // 4xN h 2450 add \ds2, \dst, \d_strd 2451 add \sr2, \src, \s_strd 2452 lsl \d_strd, \d_strd, #1 2453 lsl \s_strd, \s_strd, #1 24544: 2455 ld1 {v4.8b}, [\src], \s_strd 2456 ld1 {v6.8b}, [\sr2], \s_strd 2457 ext v5.8b, v4.8b, v4.8b, #1 2458 ext v7.8b, v6.8b, v6.8b, #1 2459 trn1 v4.2s, v4.2s, v6.2s 2460 trn1 v5.2s, v5.2s, v7.2s 2461 subs \h, \h, #2 2462 umull v4.8h, v4.8b, v0.8b 2463 umlal v4.8h, v5.8b, v1.8b 2464.ifc \type, put 2465 uqrshrn v4.8b, v4.8h, #4 2466 st1 {v4.s}[0], [\dst], \d_strd 2467 st1 {v4.s}[1], [\ds2], \d_strd 2468.else 2469 st1 {v4.d}[0], [\dst], \d_strd 2470 st1 {v4.d}[1], [\ds2], \d_strd 2471.endif 2472 b.gt 4b 2473 ret 2474 247580: // 8xN h 2476 add \ds2, \dst, \d_strd 2477 add \sr2, \src, \s_strd 2478 lsl \d_strd, \d_strd, #1 2479 lsl \s_strd, \s_strd, #1 24808: 2481 ld1 {v4.16b}, [\src], \s_strd 2482 ld1 {v6.16b}, [\sr2], 
\s_strd 2483 ext v5.16b, v4.16b, v4.16b, #1 2484 ext v7.16b, v6.16b, v6.16b, #1 2485 subs \h, \h, #2 2486 umull v4.8h, v4.8b, v0.8b 2487 umull v6.8h, v6.8b, v0.8b 2488 umlal v4.8h, v5.8b, v1.8b 2489 umlal v6.8h, v7.8b, v1.8b 2490.ifc \type, put 2491 uqrshrn v4.8b, v4.8h, #4 2492 uqrshrn v6.8b, v6.8h, #4 2493 st1 {v4.8b}, [\dst], \d_strd 2494 st1 {v6.8b}, [\ds2], \d_strd 2495.else 2496 st1 {v4.8h}, [\dst], \d_strd 2497 st1 {v6.8h}, [\ds2], \d_strd 2498.endif 2499 b.gt 8b 2500 ret 2501160: 2502320: 2503640: 25041280: // 16xN, 32xN, ... h 2505 add \ds2, \dst, \d_strd 2506 add \sr2, \src, \s_strd 2507 lsl \s_strd, \s_strd, #1 2508 2509 sub \s_strd, \s_strd, \w, uxtw 2510 sub \s_strd, \s_strd, #8 2511.ifc \type, put 2512 lsl \d_strd, \d_strd, #1 2513 sub \d_strd, \d_strd, \w, uxtw 2514.endif 2515161: 2516 ld1 {v16.d}[1], [\src], #8 2517 ld1 {v20.d}[1], [\sr2], #8 2518 mov \mx, \w 2519 252016: 2521 ld1 {v18.16b}, [\src], #16 2522 ld1 {v22.16b}, [\sr2], #16 2523 ext v17.16b, v16.16b, v18.16b, #8 2524 ext v19.16b, v16.16b, v18.16b, #9 2525 ext v21.16b, v20.16b, v22.16b, #8 2526 ext v23.16b, v20.16b, v22.16b, #9 2527 umull v16.8h, v17.8b, v0.8b 2528 umull2 v17.8h, v17.16b, v0.16b 2529 umull v20.8h, v21.8b, v0.8b 2530 umull2 v21.8h, v21.16b, v0.16b 2531 umlal v16.8h, v19.8b, v1.8b 2532 umlal2 v17.8h, v19.16b, v1.16b 2533 umlal v20.8h, v23.8b, v1.8b 2534 umlal2 v21.8h, v23.16b, v1.16b 2535 subs \mx, \mx, #16 2536.ifc \type, put 2537 uqrshrn v16.8b, v16.8h, #4 2538 uqrshrn2 v16.16b, v17.8h, #4 2539 uqrshrn v20.8b, v20.8h, #4 2540 uqrshrn2 v20.16b, v21.8h, #4 2541 st1 {v16.16b}, [\dst], #16 2542 st1 {v20.16b}, [\ds2], #16 2543.else 2544 st1 {v16.8h, v17.8h}, [\dst], #32 2545 st1 {v20.8h, v21.8h}, [\ds2], #32 2546.endif 2547 b.le 9f 2548 2549 mov v16.16b, v18.16b 2550 mov v20.16b, v22.16b 2551 b 16b 2552 25539: 2554 add \dst, \dst, \d_strd 2555 add \ds2, \ds2, \d_strd 2556 add \src, \src, \s_strd 2557 add \sr2, \sr2, \s_strd 2558 2559 subs \h, \h, #2 2560 b.gt 161b 2561 ret 2562 
2563L(\type\()_bilin_h_tbl): 2564 .hword L(\type\()_bilin_h_tbl) - 1280b 2565 .hword L(\type\()_bilin_h_tbl) - 640b 2566 .hword L(\type\()_bilin_h_tbl) - 320b 2567 .hword L(\type\()_bilin_h_tbl) - 160b 2568 .hword L(\type\()_bilin_h_tbl) - 80b 2569 .hword L(\type\()_bilin_h_tbl) - 40b 2570 .hword L(\type\()_bilin_h_tbl) - 20b 2571 .hword 0 2572 2573 2574L(\type\()_bilin_v): 2575 cmp \h, #4 2576 adr x9, L(\type\()_bilin_v_tbl) 2577 ldrh w8, [x9, x8, lsl #1] 2578 sub x9, x9, w8, uxtw 2579 br x9 2580 258120: // 2xN v 2582.ifc \type, put 2583 cmp \h, #2 2584 add \ds2, \dst, \d_strd 2585 add \sr2, \src, \s_strd 2586 lsl \s_strd, \s_strd, #1 2587 lsl \d_strd, \d_strd, #1 2588 2589 // 2x2 v 2590 ld1 {v16.h}[0], [\src], \s_strd 2591 b.gt 24f 2592 ld1 {v17.h}[0], [\sr2], \s_strd 2593 ld1 {v18.h}[0], [\src], \s_strd 2594 trn1 v16.4h, v16.4h, v17.4h 2595 trn1 v17.4h, v17.4h, v18.4h 2596 umull v4.8h, v16.8b, v2.8b 2597 umlal v4.8h, v17.8b, v3.8b 2598 uqrshrn v4.8b, v4.8h, #4 2599 st1 {v4.h}[0], [\dst] 2600 st1 {v4.h}[1], [\ds2] 2601 ret 260224: // 2x4, 2x8, ... 
v 2603 ld1 {v17.h}[0], [\sr2], \s_strd 2604 ld1 {v18.h}[0], [\src], \s_strd 2605 ld1 {v19.h}[0], [\sr2], \s_strd 2606 ld1 {v20.h}[0], [\src], \s_strd 2607 trn1 v16.4h, v16.4h, v17.4h 2608 trn1 v17.4h, v17.4h, v18.4h 2609 trn1 v18.4h, v18.4h, v19.4h 2610 trn1 v19.4h, v19.4h, v20.4h 2611 trn1 v16.2s, v16.2s, v18.2s 2612 trn1 v17.2s, v17.2s, v19.2s 2613 umull v4.8h, v16.8b, v2.8b 2614 umlal v4.8h, v17.8b, v3.8b 2615 subs \h, \h, #4 2616 uqrshrn v4.8b, v4.8h, #4 2617 st1 {v4.h}[0], [\dst], \d_strd 2618 st1 {v4.h}[1], [\ds2], \d_strd 2619 st1 {v4.h}[2], [\dst], \d_strd 2620 st1 {v4.h}[3], [\ds2], \d_strd 2621 b.le 0f 2622 mov v16.8b, v20.8b 2623 b 24b 26240: 2625 ret 2626.endif 2627 262840: // 4xN v 2629 add \ds2, \dst, \d_strd 2630 add \sr2, \src, \s_strd 2631 lsl \s_strd, \s_strd, #1 2632 lsl \d_strd, \d_strd, #1 2633 ld1 {v16.s}[0], [\src], \s_strd 26344: 2635 ld1 {v17.s}[0], [\sr2], \s_strd 2636 ld1 {v18.s}[0], [\src], \s_strd 2637 trn1 v16.2s, v16.2s, v17.2s 2638 trn1 v17.2s, v17.2s, v18.2s 2639 umull v4.8h, v16.8b, v2.8b 2640 umlal v4.8h, v17.8b, v3.8b 2641 subs \h, \h, #2 2642.ifc \type, put 2643 uqrshrn v4.8b, v4.8h, #4 2644 st1 {v4.s}[0], [\dst], \d_strd 2645 st1 {v4.s}[1], [\ds2], \d_strd 2646.else 2647 st1 {v4.d}[0], [\dst], \d_strd 2648 st1 {v4.d}[1], [\ds2], \d_strd 2649.endif 2650 b.le 0f 2651 mov v16.8b, v18.8b 2652 b 4b 26530: 2654 ret 2655 265680: // 8xN v 2657 add \ds2, \dst, \d_strd 2658 add \sr2, \src, \s_strd 2659 lsl \s_strd, \s_strd, #1 2660 lsl \d_strd, \d_strd, #1 2661 ld1 {v16.8b}, [\src], \s_strd 26628: 2663 ld1 {v17.8b}, [\sr2], \s_strd 2664 ld1 {v18.8b}, [\src], \s_strd 2665 umull v4.8h, v16.8b, v2.8b 2666 umull v5.8h, v17.8b, v2.8b 2667 umlal v4.8h, v17.8b, v3.8b 2668 umlal v5.8h, v18.8b, v3.8b 2669 subs \h, \h, #2 2670.ifc \type, put 2671 uqrshrn v4.8b, v4.8h, #4 2672 uqrshrn v5.8b, v5.8h, #4 2673 st1 {v4.8b}, [\dst], \d_strd 2674 st1 {v5.8b}, [\ds2], \d_strd 2675.else 2676 st1 {v4.8h}, [\dst], \d_strd 2677 st1 {v5.8h}, [\ds2], \d_strd 
2678.endif 2679 b.le 0f 2680 mov v16.8b, v18.8b 2681 b 8b 26820: 2683 ret 2684 2685160: // 16xN, 32xN, ... 2686320: 2687640: 26881280: 2689 mov \my, \h 26901: 2691 add \ds2, \dst, \d_strd 2692 add \sr2, \src, \s_strd 2693 lsl \s_strd, \s_strd, #1 2694 lsl \d_strd, \d_strd, #1 2695 2696 ld1 {v16.16b}, [\src], \s_strd 26972: 2698 ld1 {v17.16b}, [\sr2], \s_strd 2699 ld1 {v18.16b}, [\src], \s_strd 2700 umull v4.8h, v16.8b, v2.8b 2701 umull2 v5.8h, v16.16b, v2.16b 2702 umull v6.8h, v17.8b, v2.8b 2703 umull2 v7.8h, v17.16b, v2.16b 2704 umlal v4.8h, v17.8b, v3.8b 2705 umlal2 v5.8h, v17.16b, v3.16b 2706 umlal v6.8h, v18.8b, v3.8b 2707 umlal2 v7.8h, v18.16b, v3.16b 2708 subs \h, \h, #2 2709.ifc \type, put 2710 uqrshrn v4.8b, v4.8h, #4 2711 uqrshrn2 v4.16b, v5.8h, #4 2712 uqrshrn v6.8b, v6.8h, #4 2713 uqrshrn2 v6.16b, v7.8h, #4 2714 st1 {v4.16b}, [\dst], \d_strd 2715 st1 {v6.16b}, [\ds2], \d_strd 2716.else 2717 st1 {v4.8h, v5.8h}, [\dst], \d_strd 2718 st1 {v6.8h, v7.8h}, [\ds2], \d_strd 2719.endif 2720 b.le 9f 2721 mov v16.16b, v18.16b 2722 b 2b 27239: 2724 subs \w, \w, #16 2725 b.le 0f 2726 asr \s_strd, \s_strd, #1 2727 asr \d_strd, \d_strd, #1 2728 msub \src, \s_strd, \xmy, \src 2729 msub \dst, \d_strd, \xmy, \dst 2730 sub \src, \src, \s_strd, lsl #1 2731 mov \h, \my 2732 add \src, \src, #16 2733.ifc \type, put 2734 add \dst, \dst, #16 2735.else 2736 add \dst, \dst, #32 2737.endif 2738 b 1b 27390: 2740 ret 2741 2742L(\type\()_bilin_v_tbl): 2743 .hword L(\type\()_bilin_v_tbl) - 1280b 2744 .hword L(\type\()_bilin_v_tbl) - 640b 2745 .hword L(\type\()_bilin_v_tbl) - 320b 2746 .hword L(\type\()_bilin_v_tbl) - 160b 2747 .hword L(\type\()_bilin_v_tbl) - 80b 2748 .hword L(\type\()_bilin_v_tbl) - 40b 2749 .hword L(\type\()_bilin_v_tbl) - 20b 2750 .hword 0 2751 2752L(\type\()_bilin_hv): 2753 uxtl v2.8h, v2.8b 2754 uxtl v3.8h, v3.8b 2755 adr x9, L(\type\()_bilin_hv_tbl) 2756 ldrh w8, [x9, x8, lsl #1] 2757 sub x9, x9, w8, uxtw 2758 br x9 2759 276020: // 2xN hv 2761.ifc \type, put 2762 
add \sr2, \src, \s_strd 2763 add \ds2, \dst, \d_strd 2764 lsl \s_strd, \s_strd, #1 2765 lsl \d_strd, \d_strd, #1 2766 2767 ld1 {v28.s}[0], [\src], \s_strd 2768 ext v29.8b, v28.8b, v28.8b, #1 2769 umull v16.8h, v28.8b, v0.8b 2770 umlal v16.8h, v29.8b, v1.8b 2771 27722: 2773 ld1 {v28.s}[0], [\sr2], \s_strd 2774 ld1 {v30.s}[0], [\src], \s_strd 2775 ext v29.8b, v28.8b, v28.8b, #1 2776 ext v31.8b, v30.8b, v30.8b, #1 2777 trn1 v28.4h, v28.4h, v30.4h 2778 trn1 v29.4h, v29.4h, v31.4h 2779 umull v17.8h, v28.8b, v0.8b 2780 umlal v17.8h, v29.8b, v1.8b 2781 2782 trn1 v16.2s, v16.2s, v17.2s 2783 2784 mul v4.4h, v16.4h, v2.4h 2785 mla v4.4h, v17.4h, v3.4h 2786 uqrshrn v4.8b, v4.8h, #8 2787 subs \h, \h, #2 2788 st1 {v4.h}[0], [\dst], \d_strd 2789 st1 {v4.h}[1], [\ds2], \d_strd 2790 b.le 0f 2791 trn2 v16.2s, v17.2s, v17.2s 2792 b 2b 27930: 2794 ret 2795.endif 2796 279740: // 4xN hv 2798 add \sr2, \src, \s_strd 2799 add \ds2, \dst, \d_strd 2800 lsl \s_strd, \s_strd, #1 2801 lsl \d_strd, \d_strd, #1 2802 2803 ld1 {v28.8b}, [\src], \s_strd 2804 ext v29.8b, v28.8b, v28.8b, #1 2805 umull v16.8h, v28.8b, v0.8b 2806 umlal v16.8h, v29.8b, v1.8b 2807 28084: 2809 ld1 {v28.8b}, [\sr2], \s_strd 2810 ld1 {v30.8b}, [\src], \s_strd 2811 ext v29.8b, v28.8b, v28.8b, #1 2812 ext v31.8b, v30.8b, v30.8b, #1 2813 trn1 v28.2s, v28.2s, v30.2s 2814 trn1 v29.2s, v29.2s, v31.2s 2815 umull v17.8h, v28.8b, v0.8b 2816 umlal v17.8h, v29.8b, v1.8b 2817 2818 trn1 v16.2d, v16.2d, v17.2d 2819 2820 mul v4.8h, v16.8h, v2.8h 2821 mla v4.8h, v17.8h, v3.8h 2822 subs \h, \h, #2 2823.ifc \type, put 2824 uqrshrn v4.8b, v4.8h, #8 2825 st1 {v4.s}[0], [\dst], \d_strd 2826 st1 {v4.s}[1], [\ds2], \d_strd 2827.else 2828 urshr v4.8h, v4.8h, #4 2829 st1 {v4.d}[0], [\dst], \d_strd 2830 st1 {v4.d}[1], [\ds2], \d_strd 2831.endif 2832 b.le 0f 2833 trn2 v16.2d, v17.2d, v17.2d 2834 b 4b 28350: 2836 ret 2837 283880: // 8xN, 16xN, ... 
hv 2839160: 2840320: 2841640: 28421280: 2843 mov \my, \h 2844 28451: 2846 add \sr2, \src, \s_strd 2847 add \ds2, \dst, \d_strd 2848 lsl \s_strd, \s_strd, #1 2849 lsl \d_strd, \d_strd, #1 2850 2851 ld1 {v28.16b}, [\src], \s_strd 2852 ext v29.16b, v28.16b, v28.16b, #1 2853 umull v16.8h, v28.8b, v0.8b 2854 umlal v16.8h, v29.8b, v1.8b 2855 28562: 2857 ld1 {v28.16b}, [\sr2], \s_strd 2858 ld1 {v30.16b}, [\src], \s_strd 2859 ext v29.16b, v28.16b, v28.16b, #1 2860 ext v31.16b, v30.16b, v30.16b, #1 2861 umull v17.8h, v28.8b, v0.8b 2862 umlal v17.8h, v29.8b, v1.8b 2863 umull v18.8h, v30.8b, v0.8b 2864 umlal v18.8h, v31.8b, v1.8b 2865 2866 mul v4.8h, v16.8h, v2.8h 2867 mla v4.8h, v17.8h, v3.8h 2868 mul v5.8h, v17.8h, v2.8h 2869 mla v5.8h, v18.8h, v3.8h 2870 subs \h, \h, #2 2871.ifc \type, put 2872 uqrshrn v4.8b, v4.8h, #8 2873 uqrshrn v5.8b, v5.8h, #8 2874 st1 {v4.8b}, [\dst], \d_strd 2875 st1 {v5.8b}, [\ds2], \d_strd 2876.else 2877 urshr v4.8h, v4.8h, #4 2878 urshr v5.8h, v5.8h, #4 2879 st1 {v4.8h}, [\dst], \d_strd 2880 st1 {v5.8h}, [\ds2], \d_strd 2881.endif 2882 b.le 9f 2883 mov v16.16b, v18.16b 2884 b 2b 28859: 2886 subs \w, \w, #8 2887 b.le 0f 2888 asr \s_strd, \s_strd, #1 2889 asr \d_strd, \d_strd, #1 2890 msub \src, \s_strd, \xmy, \src 2891 msub \dst, \d_strd, \xmy, \dst 2892 sub \src, \src, \s_strd, lsl #1 2893 mov \h, \my 2894 add \src, \src, #8 2895.ifc \type, put 2896 add \dst, \dst, #8 2897.else 2898 add \dst, \dst, #16 2899.endif 2900 b 1b 29010: 2902 ret 2903 2904L(\type\()_bilin_hv_tbl): 2905 .hword L(\type\()_bilin_hv_tbl) - 1280b 2906 .hword L(\type\()_bilin_hv_tbl) - 640b 2907 .hword L(\type\()_bilin_hv_tbl) - 320b 2908 .hword L(\type\()_bilin_hv_tbl) - 160b 2909 .hword L(\type\()_bilin_hv_tbl) - 80b 2910 .hword L(\type\()_bilin_hv_tbl) - 40b 2911 .hword L(\type\()_bilin_hv_tbl) - 20b 2912 .hword 0 2913endfunc 2914.endm 2915 2916filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10 2917filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, 
x9, 6

// load_filter_row dst, src, inc
// Loads one table row into \dst and steps the filter position.
//   \dst  destination register (the callers pass d0-d7, i.e. 8 bytes)
//   \src  32-bit position register; row index = \src >> 10 (arithmetic)
//   \inc  32-bit value added to \src after the load
// The row is read from x11 + (sign-extended index << 3). Clobbers w13.
.macro load_filter_row dst, src, inc
        asr             w13, \src, #10
        ldr             \dst, [x11, w13, sxtw #3]
        add             \src, \src, \inc
.endm

// warp_filter_horz_neon: filter one source row horizontally, producing 8
// output values, each with its own 8-tap filter row.
// Not exported; reached with bl from warp_affine_8x8*_8bpc_neon.
// In:   x2  = source row pointer; 16 bytes are read, then x2 += x3
//       w5  = filter position for output 0 (512 is added before indexing)
//       w7  = position step between neighbouring outputs
//       w8  = value added to w5 before returning
//       x11 = table base used by load_filter_row
// Out:  v16.8h = the eight sums, narrowed with a rounding shift by 3
// Clobbers: v0-v7, v17-v23, w12, w13.
// The loads, widenings and multiplies are interleaved on purpose; keep the
// instruction order when editing.
function warp_filter_horz_neon
        add             w12, w5, #512

        ld1             {v16.8b, v17.8b}, [x2], x3

        // Fetch the 8 filter rows (d0-d7, one per output pixel) while
        // widening the pixels (unsigned) and the filters (signed) to 16 bits.
        load_filter_row d0, w12, w7
        uxtl            v16.8h, v16.8b
        load_filter_row d1, w12, w7
        uxtl            v17.8h, v17.8b
        load_filter_row d2, w12, w7
        sxtl            v0.8h, v0.8b
        load_filter_row d3, w12, w7
        sxtl            v1.8h, v1.8b
        load_filter_row d4, w12, w7
        sxtl            v2.8h, v2.8b
        load_filter_row d5, w12, w7
        sxtl            v3.8h, v3.8b
        load_filter_row d6, w12, w7
        sxtl            v4.8h, v4.8b
        load_filter_row d7, w12, w7
        sxtl            v5.8h, v5.8b
        // For output i: take the 8 pixels starting at offset i (ext #2*i),
        // multiply them elementwise by filter row i, then reduce the 8
        // products with saddlp/addp.
        ext             v18.16b, v16.16b, v17.16b, #2*1
        mul             v23.8h, v16.8h, v0.8h
        sxtl            v6.8h, v6.8b
        ext             v19.16b, v16.16b, v17.16b, #2*2
        mul             v18.8h, v18.8h, v1.8h
        sxtl            v7.8h, v7.8b
        ext             v20.16b, v16.16b, v17.16b, #2*3
        mul             v19.8h, v19.8h, v2.8h
        ext             v21.16b, v16.16b, v17.16b, #2*4
        saddlp          v23.4s, v23.8h
        mul             v20.8h, v20.8h, v3.8h
        ext             v22.16b, v16.16b, v17.16b, #2*5
        saddlp          v18.4s, v18.8h
        mul             v21.8h, v21.8h, v4.8h
        saddlp          v19.4s, v19.8h
        mul             v22.8h, v22.8h, v5.8h
        saddlp          v20.4s, v20.8h
        saddlp          v21.4s, v21.8h
        saddlp          v22.4s, v22.8h
        addp            v18.4s, v23.4s, v18.4s      // partial sums of outputs 0, 1
        ext             v23.16b, v16.16b, v17.16b, #2*6
        addp            v19.4s, v19.4s, v20.4s      // partial sums of outputs 2, 3
        mul             v23.8h, v23.8h, v6.8h
        ext             v20.16b, v16.16b, v17.16b, #2*7
        mul             v20.8h, v20.8h, v7.8h
        saddlp          v23.4s, v23.8h
        addp            v21.4s, v21.4s, v22.4s      // partial sums of outputs 4, 5
        saddlp          v20.4s, v20.8h
        addp            v20.4s, v23.4s, v20.4s      // partial sums of outputs 6, 7
        addp            v18.4s, v18.4s, v19.4s      // v18 = sums of outputs 0-3
        addp            v20.4s, v21.4s, v20.4s      // v20 = sums of outputs 4-7

        add             w5, w5, w8                  // position for the next row

        rshrn           v16.4h, v18.4s, #3
        rshrn2          v16.8h, v20.4s, #3

        ret
endfunc

// void dav1d_warp_affine_8x8_8bpc_neon(
//         pixel *dst, const ptrdiff_t dst_stride,
//         const pixel *src, const ptrdiff_t src_stride,
//         const int16_t *const
//         abcd, int mx, int my)
//
// .macro warp t, shift
//   \t      suffix appended to "warp_affine_8x8" in the function name. When
//           blank, results are saturated to 8-bit pixels; when non-blank,
//           16-bit results are stored and x1 is doubled first
//           (presumably the stride is then counted in 16-bit elements -
//           TODO confirm against the C prototype of the t variant).
//   \shift  rounding right shift applied to the 32-bit vertical sums.
//
// Register use (arguments per the prototype and AAPCS64):
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//   x4 = abcd pointer on entry, w5 = mx, w6 = my
//   x7, x8, x9, x4 = abcd[0..3] after unpacking, w10 = row counter,
//   x11 = filter table base, w14 = vertical filter position,
//   x15 = saved return address (bl overwrites x30),
//   v24-v31 = sliding window of 8 horizontally filtered rows.
.macro warp t, shift
function warp_affine_8x8\t\()_8bpc_neon, export=1
        // Load all four int16 coefficients at once and sign-extend each field.
        ldr             x4, [x4]
        sbfx            x7, x4, #0, #16             // abcd[0]: horizontal step per pixel
        sbfx            x8, x4, #16, #16            // abcd[1]: horizontal offset per row
        sbfx            x9, x4, #32, #16            // abcd[2]: vertical step per pixel
        sbfx            x4, x4, #48, #16            // abcd[3]: vertical offset per row
        mov             w10, #8
        // Start 3 rows above and 3 pixels left of src (8-tap filter support).
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3
        sub             x2, x2, #3
        // movrel is defined outside this file; presumably x11 = address of
        // mc_warp_filter plus 64*8 bytes.
        movrel          x11, X(mc_warp_filter), 64*8
        mov             x15, x30
.ifnb \t
        lsl             x1, x1, #1
.endif

        // Prime the window with the first 7 horizontally filtered rows.
        bl              warp_filter_horz_neon
        mov             v24.16b, v16.16b
        bl              warp_filter_horz_neon
        mov             v25.16b, v16.16b
        bl              warp_filter_horz_neon
        mov             v26.16b, v16.16b
        bl              warp_filter_horz_neon
        mov             v27.16b, v16.16b
        bl              warp_filter_horz_neon
        mov             v28.16b, v16.16b
        bl              warp_filter_horz_neon
        mov             v29.16b, v16.16b
        bl              warp_filter_horz_neon
        mov             v30.16b, v16.16b

1:
        // One output row per iteration: filter the 8th input row, then apply
        // a per-column vertical filter over v24-v31.
        add             w14, w6, #512
        bl              warp_filter_horz_neon
        mov             v31.16b, v16.16b

        // d0-d7 = vertical filter rows for columns 0-7.
        load_filter_row d0, w14, w9
        load_filter_row d1, w14, w9
        load_filter_row d2, w14, w9
        load_filter_row d3, w14, w9
        load_filter_row d4, w14, w9
        load_filter_row d5, w14, w9
        load_filter_row d6, w14, w9
        load_filter_row d7, w14, w9
        // transpose_8x8b is defined outside this file; presumably it leaves
        // tap k of all 8 columns in vk, using v16/v17 as scratch.
        transpose_8x8b  v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
        sxtl            v0.8h, v0.8b
        sxtl            v1.8h, v1.8b
        sxtl            v2.8h, v2.8b
        sxtl            v3.8h, v3.8b
        sxtl            v4.8h, v4.8b
        sxtl            v5.8h, v5.8b
        sxtl            v6.8h, v6.8b
        sxtl            v7.8h, v7.8b

        // This ordering of smull/smlal/smull2/smlal2 is highly
        // beneficial for Cortex A53 here.
        smull           v16.4s, v24.4h, v0.4h
        smlal           v16.4s, v25.4h, v1.4h
        smlal           v16.4s, v26.4h, v2.4h
        smlal           v16.4s, v27.4h, v3.4h
        smlal           v16.4s, v28.4h, v4.4h
        smlal           v16.4s, v29.4h, v5.4h
        smlal           v16.4s, v30.4h, v6.4h
        smlal           v16.4s, v31.4h, v7.4h
        smull2          v17.4s, v24.8h, v0.8h
        smlal2          v17.4s, v25.8h, v1.8h
        smlal2          v17.4s, v26.8h, v2.8h
        smlal2          v17.4s, v27.8h, v3.8h
        smlal2          v17.4s, v28.8h, v4.8h
        smlal2          v17.4s, v29.8h, v5.8h
        smlal2          v17.4s, v30.8h, v6.8h
        smlal2          v17.4s, v31.8h, v7.8h

        // Narrow the sums while sliding the row window up by one row; the
        // two are interleaved.
        mov             v24.16b, v25.16b
        mov             v25.16b, v26.16b
        sqrshrn         v16.4h, v16.4s, #\shift
        mov             v26.16b, v27.16b
        sqrshrn2        v16.8h, v17.4s, #\shift
        mov             v27.16b, v28.16b
        mov             v28.16b, v29.16b
.ifb \t
        sqxtun          v16.8b, v16.8h              // pixel output: saturate to u8
.endif
        mov             v29.16b, v30.16b
        mov             v30.16b, v31.16b
        subs            w10, w10, #1                // flags are consumed by b.gt below
.ifnb \t
        st1             {v16.8h}, [x0], x1
.else
        st1             {v16.8b}, [x0], x1
.endif

        add             w6, w6, w4                  // vertical position for the next row
        b.gt            1b

        br              x15                         // return via the saved link register
endfunc
.endm

// Pixel-output variant (shift 11) and 16-bit-output "t" variant (shift 7).
warp  , 11
warp t, 7
