/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Janne Grunau
 * Copyright © 2020, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

#define PREP_BIAS 8192

.macro avg d0, d1, t0, t1, t2, t3
        ld1             {\t0\().8h,\t1\().8h}, [x2], #32
        ld1             {\t2\().8h,\t3\().8h}, [x3], #32
        sqadd           \t0\().8h, \t0\().8h, \t2\().8h
        sqadd           \t1\().8h, \t1\().8h, \t3\().8h
        smax            \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
        smax            \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
        sqsub           \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
        sqsub           \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
        sshl            \d0\().8h, \t0\().8h, v29.8h // -(intermediate_bits+1)
        sshl            \d1\().8h, \t1\().8h, v29.8h // -(intermediate_bits+1)
.endm

.macro w_avg d0, d1, t0, t1, t2, t3
        ld1             {\t0\().8h,\t1\().8h}, [x2], #32
        ld1             {\t2\().8h,\t3\().8h}, [x3], #32
        // This difference requires a 17 bit range, and all bits are
        // significant for the following multiplication.
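        // In C terms the weighted average below is, roughly:
        //   dst = tmp2 + (((tmp1 - tmp2) * w) >> 4)
        // i.e. (tmp1*w + tmp2*(16-w)) >> 4, with the weight in v27
        // (pre-negated so that a plain mul/sshr/saddw chain suffices),
        // followed by a rounding shift by intermediate_bits, compensation
        // for the PREP_BIAS in the inputs, and a clamp to [0, bitdepth_max].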
        ssubl           \d0\().4s, \t2\().4h, \t0\().4h
        ssubl2          \t0\().4s, \t2\().8h, \t0\().8h
        ssubl           \d1\().4s, \t3\().4h, \t1\().4h
        ssubl2          \t1\().4s, \t3\().8h, \t1\().8h
        mul             \d0\().4s, \d0\().4s, v27.4s
        mul             \t0\().4s, \t0\().4s, v27.4s
        mul             \d1\().4s, \d1\().4s, v27.4s
        mul             \t1\().4s, \t1\().4s, v27.4s
        sshr            \d0\().4s, \d0\().4s, #4
        sshr            \t0\().4s, \t0\().4s, #4
        sshr            \d1\().4s, \d1\().4s, #4
        sshr            \t1\().4s, \t1\().4s, #4
        saddw           \d0\().4s, \d0\().4s, \t2\().4h
        saddw2          \t0\().4s, \t0\().4s, \t2\().8h
        saddw           \d1\().4s, \d1\().4s, \t3\().4h
        saddw2          \t1\().4s, \t1\().4s, \t3\().8h
        xtn             \d0\().4h, \d0\().4s
        xtn2            \d0\().8h, \t0\().4s
        xtn             \d1\().4h, \d1\().4s
        xtn2            \d1\().8h, \t1\().4s
        srshl           \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits
        srshl           \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits
        add             \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits
        add             \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits
        smin            \d0\().8h, \d0\().8h, v31.8h // bitdepth_max
        smin            \d1\().8h, \d1\().8h, v31.8h // bitdepth_max
        smax            \d0\().8h, \d0\().8h, v30.8h // 0
        smax            \d1\().8h, \d1\().8h, v30.8h // 0
.endm

.macro mask d0, d1, t0, t1, t2, t3
        ld1             {v27.16b}, [x6], #16
        ld1             {\t0\().8h,\t1\().8h}, [x2], #32
        neg             v27.16b, v27.16b
        ld1             {\t2\().8h,\t3\().8h}, [x3], #32
        sxtl            v26.8h, v27.8b
        sxtl2           v27.8h, v27.16b
        sxtl            v24.4s, v26.4h
        sxtl2           v25.4s, v26.8h
        sxtl            v26.4s, v27.4h
        sxtl2           v27.4s, v27.8h
        ssubl           \d0\().4s, \t2\().4h, \t0\().4h
        ssubl2          \t0\().4s, \t2\().8h, \t0\().8h
        ssubl           \d1\().4s, \t3\().4h, \t1\().4h
        ssubl2          \t1\().4s, \t3\().8h, \t1\().8h
        mul             \d0\().4s, \d0\().4s, v24.4s
        mul             \t0\().4s, \t0\().4s, v25.4s
        mul             \d1\().4s, \d1\().4s, v26.4s
        mul             \t1\().4s, \t1\().4s, v27.4s
        sshr            \d0\().4s, \d0\().4s, #6
        sshr            \t0\().4s, \t0\().4s, #6
        sshr            \d1\().4s, \d1\().4s, #6
        sshr            \t1\().4s, \t1\().4s, #6
        saddw           \d0\().4s, \d0\().4s, \t2\().4h
        saddw2          \t0\().4s, \t0\().4s, \t2\().8h
        saddw           \d1\().4s, \d1\().4s, \t3\().4h
        saddw2          \t1\().4s, \t1\().4s, \t3\().8h
        xtn             \d0\().4h, \d0\().4s
        xtn2            \d0\().8h, \t0\().4s
        xtn             \d1\().4h, \d1\().4s
        xtn2            \d1\().8h, \t1\().4s
        srshl           \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits
        srshl           \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits
        add             \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits
        add             \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits
        smin            \d0\().8h, \d0\().8h, v31.8h // bitdepth_max
        smin            \d1\().8h, \d1\().8h, v31.8h // bitdepth_max
        smax            \d0\().8h, \d0\().8h, v30.8h // 0
        smax            \d1\().8h, \d1\().8h, v30.8h // 0
.endm

.macro bidir_fn type, bdmax
function \type\()_16bpc_neon, export=1
        clz             w4,  w4
.ifnc \type, avg
        dup             v31.8h, \bdmax // bitdepth_max
        movi            v30.8h, #0
.endif
        clz             w7,  \bdmax
        sub             w7,  w7,  #18 // intermediate_bits = clz(bitdepth_max) - 18
.ifc \type, avg
        mov             w9,  #1
        mov             w8,  #-2*PREP_BIAS
        lsl             w9,  w9,  w7  // 1 << intermediate_bits
        add             w7,  w7,  #1
        sub             w8,  w8,  w9  // -2*PREP_BIAS - 1 << intermediate_bits
        neg             w7,  w7       // -(intermediate_bits+1)
        dup             v28.8h, w8    // -2*PREP_BIAS - 1 << intermediate_bits
        dup             v29.8h, w7    // -(intermediate_bits+1)
.else
        mov             w8,  #PREP_BIAS
        lsr             w8,  w8,  w7  // PREP_BIAS >> intermediate_bits
        neg             w7,  w7       // -intermediate_bits
        dup             v28.8h, w8    // PREP_BIAS >> intermediate_bits
        dup             v29.8h, w7    // -intermediate_bits
.endif
.ifc \type, w_avg
        dup             v27.4s, w6
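        // w6 is the blend weight (scaled by 16, given the >> 4 in the
        // macro); it is negated below so that w_avg can blend with a
        // single subtraction, multiply and widening add.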
        neg             v27.4s, v27.4s
.endif
        adr             x7,  L(\type\()_tbl)
        sub             w4,  w4,  #24
        \type           v4,  v5,  v0,  v1,  v2,  v3
        ldrh            w4,  [x7, x4, lsl #1]
        sub             x7,  x7,  w4,  uxtw
        br              x7
40:
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
4:
        subs            w5,  w5,  #4
        st1             {v4.d}[0], [x0], x1
        st1             {v4.d}[1], [x7], x1
        st1             {v5.d}[0], [x0], x1
        st1             {v5.d}[1], [x7], x1
        b.le            0f
        \type           v4,  v5,  v0,  v1,  v2,  v3
        b               4b
80:
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
8:
        st1             {v4.8h}, [x0], x1
        subs            w5,  w5,  #2
        st1             {v5.8h}, [x7], x1
        b.le            0f
        \type           v4,  v5,  v0,  v1,  v2,  v3
        b               8b
16:
        \type           v6,  v7,  v0,  v1,  v2,  v3
        st1             {v4.8h, v5.8h}, [x0], x1
        subs            w5,  w5,  #2
        st1             {v6.8h, v7.8h}, [x0], x1
        b.le            0f
        \type           v4,  v5,  v0,  v1,  v2,  v3
        b               16b
32:
        \type           v6,  v7,  v0,  v1,  v2,  v3
        subs            w5,  w5,  #1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        b.le            0f
        \type           v4,  v5,  v0,  v1,  v2,  v3
        b               32b
640:
        add             x7,  x0,  #64
64:
        \type           v6,  v7,  v0,  v1,  v2,  v3
        \type           v16, v17, v0,  v1,  v2,  v3
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        \type           v18, v19, v0,  v1,  v2,  v3
        subs            w5,  w5,  #1
        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
        b.le            0f
        \type           v4,  v5,  v0,  v1,  v2,  v3
        b               64b
1280:
        add             x7,  x0,  #64
        mov             x8,  #128
        sub             x1,  x1,  #128
128:
        \type           v6,  v7,  v0,  v1,  v2,  v3
        \type           v16, v17, v0,  v1,  v2,  v3
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x8
        \type           v18, v19, v0,  v1,  v2,  v3
        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x8
        \type           v4,  v5,  v0,  v1,  v2,  v3
        \type           v6,  v7,  v0,  v1,  v2,  v3
        \type           v16, v17, v0,  v1,  v2,  v3
        subs            w5,  w5,  #1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        \type           v18, v19, v0,  v1,  v2,  v3
        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
        b.le            0f
        \type           v4,  v5,  v0,  v1,  v2,  v3
        b               128b
0:
        ret
L(\type\()_tbl):
        .hword L(\type\()_tbl) - 1280b
        .hword L(\type\()_tbl) -  640b
        .hword L(\type\()_tbl) -   32b
        .hword L(\type\()_tbl) -   16b
        .hword L(\type\()_tbl) -   80b
        .hword L(\type\()_tbl) -   40b
endfunc
.endm

bidir_fn avg, w6
bidir_fn w_avg, w7
bidir_fn mask, w7


.macro w_mask_fn type
function w_mask_\type\()_16bpc_neon, export=1
        ldr             w8,  [sp]
        clz             w9,  w4
        adr             x10, L(w_mask_\type\()_tbl)
        dup             v31.8h, w8    // bitdepth_max
        sub             w9,  w9,  #24
        clz             w8,  w8       // clz(bitdepth_max)
        ldrh            w9,  [x10, x9, lsl #1]
        sub             x10, x10, w9, uxtw
        sub             w8,  w8,  #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
        mov             w9,  #PREP_BIAS*64
        neg             w8,  w8       // -sh
        mov             w11, #27615   // (64 + 1 - 38)<<mask_sh - 1 - mask_rnd
        dup             v30.4s, w9    // PREP_BIAS*64
        dup             v29.4s, w8    // -sh
        dup             v0.8h,  w11
.if \type == 444
        movi            v1.16b, #64
.elseif \type == 422
        dup             v2.8b,  w7
        movi            v3.8b,  #129
        sub             v3.8b,  v3.8b,  v2.8b
.elseif \type == 420
        dup             v2.8h,  w7
        movi            v3.8h,  #1, lsl #8
        sub             v3.8h,  v3.8h,  v2.8h
.endif
        add             x12, x0,  x1
        lsl             x1,  x1,  #1
        br              x10
4:
        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
        ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
        subs            w5,  w5,  #4
        sabd            v20.8h, v4.8h,  v6.8h  // abs(tmp1 - tmp2)
        sabd            v21.8h, v5.8h,  v7.8h
        ssubl           v16.4s, v6.4h,  v4.4h  // tmp2 - tmp1 (requires 17 bit)
        ssubl2          v17.4s, v6.8h,  v4.8h
        ssubl           v18.4s, v7.4h,  v5.4h
        ssubl2          v19.4s, v7.8h,  v5.8h
        uqsub           v20.8h, v0.8h,  v20.8h // 27615 - abs()
        uqsub           v21.8h, v0.8h,  v21.8h
        sshll2          v7.4s,  v5.8h,  #6     // tmp1 << 6
        sshll           v6.4s,  v5.4h,  #6
        sshll2          v5.4s,  v4.8h,  #6
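        // (tmp1 << 6 is produced in reverse order, v7 down to v4, so that
        // each source register is fully read before being overwritten.)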
        sshll           v4.4s,  v4.4h,  #6
        ushr            v20.8h, v20.8h, #10    // 64-m = (27615 - abs()) >> mask_sh
        ushr            v21.8h, v21.8h, #10
        add             v4.4s,  v4.4s,  v30.4s // += PREP_BIAS*64
        add             v5.4s,  v5.4s,  v30.4s
        add             v6.4s,  v6.4s,  v30.4s
        add             v7.4s,  v7.4s,  v30.4s
        uxtl            v22.4s, v20.4h
        uxtl2           v23.4s, v20.8h
        uxtl            v24.4s, v21.4h
        uxtl2           v25.4s, v21.8h
        mla             v4.4s,  v16.4s, v22.4s // (tmp2-tmp1)*(64-m)
        mla             v5.4s,  v17.4s, v23.4s
        mla             v6.4s,  v18.4s, v24.4s
        mla             v7.4s,  v19.4s, v25.4s
        srshl           v4.4s,  v4.4s,  v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl           v5.4s,  v5.4s,  v29.4s
        srshl           v6.4s,  v6.4s,  v29.4s
        srshl           v7.4s,  v7.4s,  v29.4s
        sqxtun          v4.4h,  v4.4s          // iclip_pixel
        sqxtun2         v4.8h,  v5.4s
        sqxtun          v5.4h,  v6.4s
        sqxtun2         v5.8h,  v7.4s
        umin            v4.8h,  v4.8h,  v31.8h // iclip_pixel
        umin            v5.8h,  v5.8h,  v31.8h
.if \type == 444
        xtn             v20.8b,  v20.8h        // 64 - m
        xtn2            v20.16b, v21.8h
        sub             v20.16b, v1.16b, v20.16b // m
        st1             {v20.16b}, [x6], #16
.elseif \type == 422
        addp            v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition)
        xtn             v20.8b, v20.8h
        uhsub           v20.8b, v3.8b,  v20.8b // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
        st1             {v20.8b}, [x6], #8
.elseif \type == 420
        trn1            v24.2d, v20.2d, v21.2d
        trn2            v25.2d, v20.2d, v21.2d
        add             v24.8h, v24.8h, v25.8h // (64 - my1) + (64 - my2) (row wise addition)
        addp            v20.8h, v24.8h, v24.8h // (128 - m) + (128 - n) (column wise addition)
        sub             v20.4h, v3.4h,  v20.4h // (256 - sign) - ((128 - m) + (128 - n))
        rshrn           v20.8b, v20.8h, #2     // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
        st1             {v20.s}[0], [x6], #4
.endif
        st1             {v4.d}[0], [x0],  x1
        st1             {v4.d}[1], [x12], x1
        st1             {v5.d}[0], [x0],  x1
        st1             {v5.d}[1], [x12], x1
        b.gt            4b
        ret
8:
        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1
        ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2
        subs            w5,  w5,  #2
        sabd            v20.8h, v4.8h,  v6.8h  // abs(tmp1 - tmp2)
        sabd            v21.8h, v5.8h,  v7.8h
        ssubl           v16.4s, v6.4h,  v4.4h  // tmp2 - tmp1 (requires 17 bit)
        ssubl2          v17.4s, v6.8h,  v4.8h
        ssubl           v18.4s, v7.4h,  v5.4h
        ssubl2          v19.4s, v7.8h,  v5.8h
        uqsub           v20.8h, v0.8h,  v20.8h // 27615 - abs()
        uqsub           v21.8h, v0.8h,  v21.8h
        sshll2          v7.4s,  v5.8h,  #6     // tmp1 << 6
        sshll           v6.4s,  v5.4h,  #6
        sshll2          v5.4s,  v4.8h,  #6
        sshll           v4.4s,  v4.4h,  #6
        ushr            v20.8h, v20.8h, #10    // 64-m = (27615 - abs()) >> mask_sh
        ushr            v21.8h, v21.8h, #10
        add             v4.4s,  v4.4s,  v30.4s // += PREP_BIAS*64
        add             v5.4s,  v5.4s,  v30.4s
        add             v6.4s,  v6.4s,  v30.4s
        add             v7.4s,  v7.4s,  v30.4s
        uxtl            v22.4s, v20.4h
        uxtl2           v23.4s, v20.8h
        uxtl            v24.4s, v21.4h
        uxtl2           v25.4s, v21.8h
        mla             v4.4s,  v16.4s, v22.4s // (tmp2-tmp1)*(64-m)
        mla             v5.4s,  v17.4s, v23.4s
        mla             v6.4s,  v18.4s, v24.4s
        mla             v7.4s,  v19.4s, v25.4s
        srshl           v4.4s,  v4.4s,  v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl           v5.4s,  v5.4s,  v29.4s
        srshl           v6.4s,  v6.4s,  v29.4s
        srshl           v7.4s,  v7.4s,  v29.4s
        sqxtun          v4.4h,  v4.4s          // iclip_pixel
        sqxtun2         v4.8h,  v5.4s
        sqxtun          v5.4h,  v6.4s
        sqxtun2         v5.8h,  v7.4s
        umin            v4.8h,  v4.8h,  v31.8h // iclip_pixel
        umin            v5.8h,  v5.8h,  v31.8h
.if \type == 444
        xtn             v20.8b,  v20.8h        // 64 - m
        xtn2            v20.16b, v21.8h
        sub             v20.16b, v1.16b, v20.16b // m
        st1             {v20.16b}, [x6], #16
.elseif \type == 422
        addp            v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition)
        xtn             v20.8b, v20.8h
        uhsub           v20.8b, v3.8b,  v20.8b // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
        st1             {v20.8b}, [x6], #8
.elseif \type == 420
        add             v20.8h, v20.8h, v21.8h // (64 - my1) + (64 - my2) (row wise addition)
        addp            v20.8h, v20.8h, v20.8h // (128 - m) + (128 - n) (column wise addition)
        sub             v20.4h, v3.4h,  v20.4h // (256 - sign) - ((128 - m) + (128 - n))
        rshrn           v20.8b, v20.8h, #2     // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
        st1             {v20.s}[0], [x6], #4
.endif
        st1             {v4.8h}, [x0],  x1
        st1             {v5.8h}, [x12], x1
        b.gt            8b
        ret
1280:
640:
320:
160:
        mov             w11, w4
        sub             x1,  x1,  w4,  uxtw #1
.if \type == 444
        add             x10, x6,  w4,  uxtw
.elseif \type == 422
        add             x10, x6,  x11, lsr #1
.endif
        add             x9,  x3,  w4,  uxtw #1
        add             x7,  x2,  w4,  uxtw #1
161:
        mov             w8,  w4
16:
        ld1             {v4.8h,  v5.8h},  [x2], #32 // tmp1
        ld1             {v16.8h, v17.8h}, [x3], #32 // tmp2
        ld1             {v6.8h,  v7.8h},  [x7], #32
        ld1             {v18.8h, v19.8h}, [x9], #32
        subs            w8,  w8,  #16
        sabd            v20.8h, v4.8h,  v16.8h // abs(tmp1 - tmp2)
        sabd            v21.8h, v5.8h,  v17.8h
        ssubl           v22.4s, v16.4h, v4.4h  // tmp2 - tmp1 (requires 17 bit)
        ssubl2          v23.4s, v16.8h, v4.8h
        ssubl           v24.4s, v17.4h, v5.4h
        ssubl2          v25.4s, v17.8h, v5.8h
        uqsub           v20.8h, v0.8h,  v20.8h // 27615 - abs()
        uqsub           v21.8h, v0.8h,  v21.8h
        sshll2          v27.4s, v5.8h,  #6     // tmp1 << 6
        sshll           v26.4s, v5.4h,  #6
        sshll2          v5.4s,  v4.8h,  #6
        sshll           v4.4s,  v4.4h,  #6
        ushr            v20.8h, v20.8h, #10    // 64-m = (27615 - abs()) >> mask_sh
        ushr            v21.8h, v21.8h, #10
        add             v4.4s,  v4.4s,  v30.4s // += PREP_BIAS*64
        add             v5.4s,  v5.4s,  v30.4s
        add             v26.4s, v26.4s, v30.4s
        add             v27.4s, v27.4s, v30.4s
        uxtl            v16.4s, v20.4h
        uxtl2           v17.4s, v20.8h
        uxtl            v28.4s, v21.4h
        mla             v4.4s,  v22.4s, v16.4s // (tmp2-tmp1)*(64-m)
        uxtl2           v16.4s, v21.8h
        mla             v5.4s,  v23.4s, v17.4s
        mla             v26.4s, v24.4s, v28.4s
        mla             v27.4s, v25.4s, v16.4s
        srshl           v4.4s,  v4.4s,  v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl           v5.4s,  v5.4s,  v29.4s
        srshl           v26.4s, v26.4s, v29.4s
        srshl           v27.4s, v27.4s, v29.4s
        sqxtun          v4.4h,  v4.4s          // iclip_pixel
        sqxtun2         v4.8h,  v5.4s
        sqxtun          v5.4h,  v26.4s
        sqxtun2         v5.8h,  v27.4s

        // Start of other half
        sabd            v22.8h, v6.8h,  v18.8h // abs(tmp1 - tmp2)
        sabd            v23.8h, v7.8h,  v19.8h

        umin            v4.8h,  v4.8h,  v31.8h // iclip_pixel
        umin            v5.8h,  v5.8h,  v31.8h

        ssubl           v16.4s, v18.4h, v6.4h  // tmp2 - tmp1 (requires 17 bit)
        ssubl2          v17.4s, v18.8h, v6.8h
        ssubl           v18.4s, v19.4h, v7.4h
        ssubl2          v19.4s, v19.8h, v7.8h
        uqsub           v22.8h, v0.8h,  v22.8h // 27615 - abs()
        uqsub           v23.8h, v0.8h,  v23.8h
        sshll           v24.4s, v6.4h,  #6     // tmp1 << 6
        sshll2          v25.4s, v6.8h,  #6
        sshll           v26.4s, v7.4h,  #6
        sshll2          v27.4s, v7.8h,  #6
        ushr            v22.8h, v22.8h, #10    // 64-m = (27615 - abs()) >> mask_sh
        ushr            v23.8h, v23.8h, #10
        add             v24.4s, v24.4s, v30.4s // += PREP_BIAS*64
        add             v25.4s, v25.4s, v30.4s
        add             v26.4s, v26.4s, v30.4s
        add             v27.4s, v27.4s, v30.4s
        uxtl            v6.4s,  v22.4h
        uxtl2           v7.4s,  v22.8h
        uxtl            v28.4s, v23.4h
        mla             v24.4s, v16.4s, v6.4s  // (tmp2-tmp1)*(64-m)
        uxtl2           v6.4s,  v23.8h
        mla             v25.4s, v17.4s, v7.4s
        mla             v26.4s, v18.4s, v28.4s
        mla             v27.4s, v19.4s, v6.4s
        srshl           v24.4s, v24.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl           v25.4s, v25.4s, v29.4s
        srshl           v26.4s, v26.4s, v29.4s
        srshl           v27.4s, v27.4s, v29.4s
        sqxtun          v6.4h,  v24.4s         // iclip_pixel
        sqxtun2         v6.8h,  v25.4s
        sqxtun          v7.4h,  v26.4s
        sqxtun2         v7.8h,  v27.4s
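        // sqxtun saturates the low side to 0 while narrowing, so the
        // single umin against bitdepth_max below completes iclip_pixel().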
        umin            v6.8h,  v6.8h,  v31.8h // iclip_pixel
        umin            v7.8h,  v7.8h,  v31.8h
.if \type == 444
        xtn             v20.8b,  v20.8h        // 64 - m
        xtn2            v20.16b, v21.8h
        xtn             v21.8b,  v22.8h
        xtn2            v21.16b, v23.8h
        sub             v20.16b, v1.16b, v20.16b // m
        sub             v21.16b, v1.16b, v21.16b
        st1             {v20.16b}, [x6],  #16
        st1             {v21.16b}, [x10], #16
.elseif \type == 422
        addp            v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition)
        addp            v21.8h, v22.8h, v23.8h
        xtn             v20.8b, v20.8h
        xtn             v21.8b, v21.8h
        uhsub           v20.8b, v3.8b,  v20.8b // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
        uhsub           v21.8b, v3.8b,  v21.8b
        st1             {v20.8b}, [x6],  #8
        st1             {v21.8b}, [x10], #8
.elseif \type == 420
        add             v20.8h, v20.8h, v22.8h // (64 - my1) + (64 - my2) (row wise addition)
        add             v21.8h, v21.8h, v23.8h
        addp            v20.8h, v20.8h, v21.8h // (128 - m) + (128 - n) (column wise addition)
        sub             v20.8h, v3.8h,  v20.8h // (256 - sign) - ((128 - m) + (128 - n))
        rshrn           v20.8b, v20.8h, #2     // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
        st1             {v20.8b}, [x6], #8
.endif
        st1             {v4.8h, v5.8h}, [x0],  #32
        st1             {v6.8h, v7.8h}, [x12], #32
        b.gt            16b
        subs            w5,  w5,  #2
        add             x2,  x2,  w4,  uxtw #1
        add             x3,  x3,  w4,  uxtw #1
        add             x7,  x7,  w4,  uxtw #1
        add             x9,  x9,  w4,  uxtw #1
.if \type == 444
        add             x6,  x6,  w4,  uxtw
        add             x10, x10, w4,  uxtw
.elseif \type == 422
        add             x6,  x6,  x11, lsr #1
        add             x10, x10, x11, lsr #1
.endif
        add             x0,  x0,  x1
        add             x12, x12, x1
        b.gt            161b
        ret
L(w_mask_\type\()_tbl):
        .hword L(w_mask_\type\()_tbl) - 1280b
        .hword L(w_mask_\type\()_tbl) -  640b
        .hword L(w_mask_\type\()_tbl) -  320b
        .hword L(w_mask_\type\()_tbl) -  160b
        .hword L(w_mask_\type\()_tbl) -    8b
        .hword L(w_mask_\type\()_tbl) -    4b
endfunc
.endm

w_mask_fn 444
w_mask_fn 422
w_mask_fn 420


function blend_16bpc_neon, export=1
        adr             x6,  L(blend_tbl)
        clz             w3,  w3
        sub             w3,  w3,  #26
        ldrh            w3,  [x6, x3, lsl #1]
        sub             x6,  x6,  w3,  uxtw
        add             x8,  x0,  x1
        br              x6
40:
        lsl             x1,  x1,  #1
4:
        ld1             {v2.8b},   [x5], #8
        ld1             {v1.8h},   [x2], #16
        ld1             {v0.d}[0], [x0]
        neg             v2.8b,  v2.8b          // -m
        subs            w4,  w4,  #2
        ld1             {v0.d}[1], [x8]
        sxtl            v2.8h,  v2.8b
        shl             v2.8h,  v2.8h,  #9     // -m << 9
        sub             v1.8h,  v0.8h,  v1.8h  // a - b
        sqrdmulh        v1.8h,  v1.8h,  v2.8h  // ((a-b)*-m + 32) >> 6
        add             v0.8h,  v0.8h,  v1.8h
        st1             {v0.d}[0], [x0], x1
        st1             {v0.d}[1], [x8], x1
        b.gt            4b
        ret
80:
        lsl             x1,  x1,  #1
8:
        ld1             {v4.16b}, [x5], #16
        ld1             {v2.8h, v3.8h}, [x2], #32
        neg             v5.16b, v4.16b         // -m
        ld1             {v0.8h}, [x0]
        ld1             {v1.8h}, [x8]
        sxtl            v4.8h,  v5.8b
        sxtl2           v5.8h,  v5.16b
        shl             v4.8h,  v4.8h,  #9     // -m << 9
        shl             v5.8h,  v5.8h,  #9
        sub             v2.8h,  v0.8h,  v2.8h  // a - b
        sub             v3.8h,  v1.8h,  v3.8h
        subs            w4,  w4,  #2
        sqrdmulh        v2.8h,  v2.8h,  v4.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v3.8h,  v3.8h,  v5.8h
        add             v0.8h,  v0.8h,  v2.8h
        add             v1.8h,  v1.8h,  v3.8h
        st1             {v0.8h}, [x0], x1
        st1             {v1.8h}, [x8], x1
        b.gt            8b
        ret
160:
        lsl             x1,  x1,  #1
16:
        ld1             {v16.16b, v17.16b}, [x5], #32
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        subs            w4,  w4,  #2
        neg             v18.16b, v16.16b       // -m
        neg             v19.16b, v17.16b
        ld1             {v0.8h, v1.8h}, [x0]
        sxtl            v16.8h, v18.8b
        sxtl2           v17.8h, v18.16b
        sxtl            v18.8h, v19.8b
        sxtl2           v19.8h, v19.16b
        ld1             {v2.8h, v3.8h}, [x8]
        shl             v16.8h, v16.8h, #9     // -m << 9
        shl             v17.8h, v17.8h, #9
        shl             v18.8h, v18.8h, #9
        shl             v19.8h, v19.8h, #9
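        // The blend uses the sqrdmulh identity:
        //   sqrdmulh(x, -m << 9) = (2*x*(-m << 9) + (1 << 15)) >> 16
        //                        = (x*(-m) + 32) >> 6
        // so adding the result to a gives a + (((a-b)*(-m) + 32) >> 6),
        // i.e. roughly (a*(64-m) + b*m + 32) >> 6, without needing a
        // widening multiply.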
        sub             v4.8h,  v0.8h,  v4.8h  // a - b
        sub             v5.8h,  v1.8h,  v5.8h
        sub             v6.8h,  v2.8h,  v6.8h
        sub             v7.8h,  v3.8h,  v7.8h
        sqrdmulh        v4.8h,  v4.8h,  v16.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh        v5.8h,  v5.8h,  v17.8h
        sqrdmulh        v6.8h,  v6.8h,  v18.8h
        sqrdmulh        v7.8h,  v7.8h,  v19.8h
        add             v0.8h,  v0.8h,  v4.8h
        add             v1.8h,  v1.8h,  v5.8h
        add             v2.8h,  v2.8h,  v6.8h
        add             v3.8h,  v3.8h,  v7.8h
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v2.8h, v3.8h}, [x8], x1
        b.gt            16b
        ret
32:
        ld1             {v16.16b, v17.16b}, [x5], #32
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        subs            w4,  w4,  #1
        neg             v18.16b, v16.16b       // -m
        neg             v19.16b, v17.16b
        sxtl            v16.8h, v18.8b
        sxtl2           v17.8h, v18.16b
        sxtl            v18.8h, v19.8b
        sxtl2           v19.8h, v19.16b
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
        shl             v16.8h, v16.8h, #9     // -m << 9
        shl             v17.8h, v17.8h, #9
        shl             v18.8h, v18.8h, #9
        shl             v19.8h, v19.8h, #9
        sub             v4.8h,  v0.8h,  v4.8h  // a - b
        sub             v5.8h,  v1.8h,  v5.8h
        sub             v6.8h,  v2.8h,  v6.8h
        sub             v7.8h,  v3.8h,  v7.8h
        sqrdmulh        v4.8h,  v4.8h,  v16.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh        v5.8h,  v5.8h,  v17.8h
        sqrdmulh        v6.8h,  v6.8h,  v18.8h
        sqrdmulh        v7.8h,  v7.8h,  v19.8h
        add             v0.8h,  v0.8h,  v4.8h
        add             v1.8h,  v1.8h,  v5.8h
        add             v2.8h,  v2.8h,  v6.8h
        add             v3.8h,  v3.8h,  v7.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        b.gt            32b
        ret
L(blend_tbl):
        .hword L(blend_tbl) -  32b
        .hword L(blend_tbl) - 160b
        .hword L(blend_tbl) -  80b
        .hword L(blend_tbl) -  40b
endfunc

function blend_h_16bpc_neon, export=1
        adr             x6,  L(blend_h_tbl)
        movrel          x5,  X(obmc_masks)
        add             x5,  x5,  w4,  uxtw
        sub             w4,  w4,  w4,  lsr #2
        clz             w7,  w3
        add             x8,  x0,  x1
        lsl             x1,  x1,  #1
        sub             w7,  w7,  #24
        ldrh            w7,  [x6, x7, lsl #1]
        sub             x6,  x6,  w7,  uxtw
        br              x6
2:
        ld2r            {v2.8b, v3.8b}, [x5], #2
        ld1             {v1.4h},   [x2], #8
        ext             v2.8b,  v2.8b,  v3.8b,  #6
        subs            w4,  w4,  #2
        neg             v2.8b,  v2.8b          // -m
        ld1             {v0.s}[0], [x0]
        ld1             {v0.s}[1], [x8]
        sxtl            v2.8h,  v2.8b
        shl             v2.4h,  v2.4h,  #9     // -m << 9
        sub             v1.4h,  v0.4h,  v1.4h  // a - b
        sqrdmulh        v1.4h,  v1.4h,  v2.4h  // ((a-b)*-m + 32) >> 6
        add             v0.4h,  v0.4h,  v1.4h
        st1             {v0.s}[0], [x0], x1
        st1             {v0.s}[1], [x8], x1
        b.gt            2b
        ret
4:
        ld2r            {v2.8b, v3.8b}, [x5], #2
        ld1             {v1.8h},   [x2], #16
        ext             v2.8b,  v2.8b,  v3.8b,  #4
        subs            w4,  w4,  #2
        neg             v2.8b,  v2.8b          // -m
        ld1             {v0.d}[0], [x0]
        ld1             {v0.d}[1], [x8]
        sxtl            v2.8h,  v2.8b
        shl             v2.8h,  v2.8h,  #9     // -m << 9
        sub             v1.8h,  v0.8h,  v1.8h  // a - b
        sqrdmulh        v1.8h,  v1.8h,  v2.8h  // ((a-b)*-m + 32) >> 6
        add             v0.8h,  v0.8h,  v1.8h
        st1             {v0.d}[0], [x0], x1
        st1             {v0.d}[1], [x8], x1
        b.gt            4b
        ret
8:
        ld2r            {v4.8b, v5.8b}, [x5], #2
        ld1             {v2.8h, v3.8h}, [x2], #32
        neg             v4.8b,  v4.8b          // -m
        neg             v5.8b,  v5.8b
        ld1             {v0.8h}, [x0]
        subs            w4,  w4,  #2
        sxtl            v4.8h,  v4.8b
        sxtl            v5.8h,  v5.8b
        ld1             {v1.8h}, [x8]
        shl             v4.8h,  v4.8h,  #9     // -m << 9
        shl             v5.8h,  v5.8h,  #9
        sub             v2.8h,  v0.8h,  v2.8h  // a - b
        sub             v3.8h,  v1.8h,  v3.8h
        sqrdmulh        v2.8h,  v2.8h,  v4.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v3.8h,  v3.8h,  v5.8h
        add             v0.8h,  v0.8h,  v2.8h
        add             v1.8h,  v1.8h,  v3.8h
        st1             {v0.8h}, [x0], x1
        st1             {v1.8h}, [x8], x1
        b.gt            8b
        ret
16:
        ld2r            {v16.8b, v17.8b}, [x5], #2
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        neg             v16.8b, v16.8b         // -m
        neg             v17.8b, v17.8b
        ld1             {v0.8h, v1.8h}, [x0]
        ld1             {v2.8h, v3.8h}, [x8]
        subs            w4,  w4,  #2
        sxtl            v16.8h, v16.8b
        sxtl            v17.8h, v17.8b
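        // blend_h masks are constant per row: the ld2r above replicated
        // the row 0 mask byte into v16 and the row 1 mask byte into v17.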
        shl             v16.8h, v16.8h, #9     // -m << 9
        shl             v17.8h, v17.8h, #9
        sub             v4.8h,  v0.8h,  v4.8h  // a - b
        sub             v5.8h,  v1.8h,  v5.8h
        sub             v6.8h,  v2.8h,  v6.8h
        sub             v7.8h,  v3.8h,  v7.8h
        sqrdmulh        v4.8h,  v4.8h,  v16.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh        v5.8h,  v5.8h,  v16.8h
        sqrdmulh        v6.8h,  v6.8h,  v17.8h
        sqrdmulh        v7.8h,  v7.8h,  v17.8h
        add             v0.8h,  v0.8h,  v4.8h
        add             v1.8h,  v1.8h,  v5.8h
        add             v2.8h,  v2.8h,  v6.8h
        add             v3.8h,  v3.8h,  v7.8h
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v2.8h, v3.8h}, [x8], x1
        b.gt            16b
        ret
1280:
640:
320:
        sub             x1,  x1,  w3,  uxtw #1
        add             x7,  x2,  w3,  uxtw #1
321:
        ld2r            {v24.8b, v25.8b}, [x5], #2
        mov             w6,  w3
        neg             v24.8b, v24.8b         // -m
        neg             v25.8b, v25.8b
        sxtl            v24.8h, v24.8b
        sxtl            v25.8h, v25.8b
        shl             v24.8h, v24.8h, #9     // -m << 9
        shl             v25.8h, v25.8h, #9
32:
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
        subs            w6,  w6,  #32
        sub             v16.8h, v0.8h,  v16.8h // a - b
        sub             v17.8h, v1.8h,  v17.8h
        sub             v18.8h, v2.8h,  v18.8h
        sub             v19.8h, v3.8h,  v19.8h
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x8]
        sqrdmulh        v16.8h, v16.8h, v24.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh        v17.8h, v17.8h, v24.8h
        sqrdmulh        v18.8h, v18.8h, v24.8h
        sqrdmulh        v19.8h, v19.8h, v24.8h
        sub             v20.8h, v4.8h,  v20.8h // a - b
        sub             v21.8h, v5.8h,  v21.8h
        sub             v22.8h, v6.8h,  v22.8h
        sub             v23.8h, v7.8h,  v23.8h
        add             v0.8h,  v0.8h,  v16.8h
        add             v1.8h,  v1.8h,  v17.8h
        add             v2.8h,  v2.8h,  v18.8h
        add             v3.8h,  v3.8h,  v19.8h
        sqrdmulh        v20.8h, v20.8h, v25.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh        v21.8h, v21.8h, v25.8h
        sqrdmulh        v22.8h, v22.8h, v25.8h
        sqrdmulh        v23.8h, v23.8h, v25.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v4.8h,  v4.8h,  v20.8h
        add             v5.8h,  v5.8h,  v21.8h
        add             v6.8h,  v6.8h,  v22.8h
        add             v7.8h,  v7.8h,  v23.8h
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x8], #64
        b.gt            32b
        subs            w4,  w4,  #2
        add             x0,  x0,  x1
        add             x8,  x8,  x1
        add             x2,  x2,  w3,  uxtw #1
        add             x7,  x7,  w3,  uxtw #1
        b.gt            321b
        ret
L(blend_h_tbl):
        .hword L(blend_h_tbl) - 1280b
        .hword L(blend_h_tbl) -  640b
        .hword L(blend_h_tbl) -  320b
        .hword L(blend_h_tbl) -   16b
        .hword L(blend_h_tbl) -    8b
        .hword L(blend_h_tbl) -    4b
        .hword L(blend_h_tbl) -    2b
endfunc

function blend_v_16bpc_neon, export=1
        adr             x6,  L(blend_v_tbl)
        movrel          x5,  X(obmc_masks)
        add             x5,  x5,  w3,  uxtw
        clz             w3,  w3
        add             x8,  x0,  x1
        lsl             x1,  x1,  #1
        sub             w3,  w3,  #26
        ldrh            w3,  [x6, x3, lsl #1]
        sub             x6,  x6,  w3,  uxtw
        br              x6
20:
        ld1r            {v2.8b}, [x5]
        neg             v2.8b,  v2.8b          // -m
        sxtl            v2.8h,  v2.8b
        shl             v2.4h,  v2.4h,  #9     // -m << 9
2:
        ld1             {v1.s}[0], [x2], #4
        ld1             {v0.h}[0], [x0]
        subs            w4,  w4,  #2
        ld1             {v1.h}[1], [x2]
        ld1             {v0.h}[1], [x8]
        add             x2,  x2,  #4
        sub             v1.4h,  v0.4h,  v1.4h  // a - b
        sqrdmulh        v1.4h,  v1.4h,  v2.4h  // ((a-b)*-m + 32) >> 6
        add             v0.4h,  v0.4h,  v1.4h
        st1             {v0.h}[0], [x0], x1
        st1             {v0.h}[1], [x8], x1
        b.gt            2b
        ret
40:
        ld1r            {v2.2s}, [x5]
        sub             x1,  x1,  #4
        neg             v2.8b,  v2.8b          // -m
        sxtl            v2.8h,  v2.8b
        shl             v2.8h,  v2.8h,  #9     // -m << 9
4:
        ld1             {v1.8h},   [x2], #16
        ld1             {v0.d}[0], [x0]
        ld1             {v0.d}[1], [x8]
        subs            w4,  w4,  #2
        sub             v1.8h,  v0.8h,  v1.8h  // a - b
        sqrdmulh        v1.8h,  v1.8h,  v2.8h  // ((a-b)*-m + 32) >> 6
        add             v0.8h,  v0.8h,  v1.8h
        st1             {v0.s}[0], [x0], #4
        st1             {v0.s}[2], [x8], #4
        st1             {v0.h}[2], [x0], x1
        st1             {v0.h}[6], [x8], x1
        b.gt            4b
        ret
80:
        ld1             {v4.8b}, [x5]
        sub             x1,  x1,  #8
        neg             v4.8b,  v4.8b          // -m
        sxtl            v4.8h,  v4.8b
        shl             v4.8h,  v4.8h,  #9     // -m << 9
8:
        ld1             {v2.8h, v3.8h}, [x2], #32
        ld1             {v0.8h}, [x0]
        ld1             {v1.8h}, [x8]
        subs            w4,  w4,  #2
        sub             v2.8h,  v0.8h,  v2.8h  // a - b
        sub             v3.8h,  v1.8h,  v3.8h
        sqrdmulh        v2.8h,  v2.8h,  v4.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v3.8h,  v3.8h,  v4.8h
        add             v0.8h,  v0.8h,  v2.8h
        add             v1.8h,  v1.8h,  v3.8h
        st1             {v0.d}[0], [x0], #8
        st1             {v1.d}[0], [x8], #8
        st1             {v0.s}[2], [x0], x1
        st1             {v1.s}[2], [x8], x1
        b.gt            8b
        ret
160:
        ld1             {v16.8b, v17.8b}, [x5]
        sub             x1,  x1,  #16
        neg             v16.8b, v16.8b         // -m
        neg             v17.8b, v17.8b
        sxtl            v16.8h, v16.8b
        sxtl            v17.8h, v17.8b
        shl             v16.8h, v16.8h, #9     // -m << 9
        shl             v17.4h, v17.4h, #9
16:
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        ld1             {v0.8h, v1.8h}, [x0]
        subs            w4,  w4,  #2
        ld1             {v2.8h, v3.8h}, [x8]
        sub             v4.8h,  v0.8h,  v4.8h  // a - b
        sub             v5.4h,  v1.4h,  v5.4h
        sub             v6.8h,  v2.8h,  v6.8h
        sub             v7.4h,  v3.4h,  v7.4h
        sqrdmulh        v4.8h,  v4.8h,  v16.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh        v5.4h,  v5.4h,  v17.4h
        sqrdmulh        v6.8h,  v6.8h,  v16.8h
        sqrdmulh        v7.4h,  v7.4h,  v17.4h
        add             v0.8h,  v0.8h,  v4.8h
        add             v1.4h,  v1.4h,  v5.4h
        add             v2.8h,  v2.8h,  v6.8h
        add             v3.4h,  v3.4h,  v7.4h
        st1             {v0.8h}, [x0], #16
        st1             {v2.8h}, [x8], #16
        st1             {v1.4h}, [x0], x1
        st1             {v3.4h}, [x8], x1
        b.gt            16b
        ret
320:
        ld1             {v24.16b, v25.16b}, [x5]
        neg             v26.16b, v24.16b       // -m
        neg             v27.8b,  v25.8b
        sxtl            v24.8h, v26.8b
        sxtl2           v25.8h, v26.16b
        sxtl            v26.8h, v27.8b
        shl             v24.8h, v24.8h, #9     // -m << 9
        shl             v25.8h, v25.8h, #9
        shl             v26.8h, v26.8h, #9
32:
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
        ld1             {v0.8h, v1.8h, v2.8h}, [x0]
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64
        ld1             {v4.8h, v5.8h, v6.8h}, [x8]
        subs            w4,  w4,  #2
        sub             v16.8h, v0.8h,  v16.8h // a - b
        sub             v17.8h, v1.8h,  v17.8h
        sub             v18.8h, v2.8h,  v18.8h
        sub             v20.8h, v4.8h,  v20.8h
        sub             v21.8h, v5.8h,  v21.8h
        sub             v22.8h, v6.8h,  v22.8h
        sqrdmulh        v16.8h, v16.8h, v24.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh        v17.8h, v17.8h, v25.8h
        sqrdmulh        v18.8h, v18.8h, v26.8h
        sqrdmulh        v20.8h, v20.8h, v24.8h
        sqrdmulh        v21.8h, v21.8h, v25.8h
        sqrdmulh        v22.8h, v22.8h, v26.8h
        add             v0.8h,  v0.8h,  v16.8h
        add             v1.8h,  v1.8h,  v17.8h
        add             v2.8h,  v2.8h,  v18.8h
        add             v4.8h,  v4.8h,  v20.8h
        add             v5.8h,  v5.8h,  v21.8h
        add             v6.8h,  v6.8h,  v22.8h
        st1             {v0.8h, v1.8h, v2.8h}, [x0], x1
        st1             {v4.8h, v5.8h, v6.8h}, [x8], x1
        b.gt            32b
        ret
L(blend_v_tbl):
        .hword L(blend_v_tbl) - 320b
        .hword L(blend_v_tbl) - 160b
        .hword L(blend_v_tbl) -  80b
        .hword L(blend_v_tbl) -  40b
        .hword L(blend_v_tbl) -  20b
endfunc


// This has got the same signature as the put_8tap functions,
// and assumes that x9 is set to (clz(w)-24).
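// The width is dispatched via the relative jump table at L(put_tbl):
// each .hword holds the distance from the table label back to the
// corresponding loop, and the loaded offset is subtracted from the
// table address before the br.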
function put_neon
        adr             x10, L(put_tbl)
        ldrh            w9,  [x10, x9, lsl #1]
        sub             x10, x10, w9, uxtw
        br              x10

2:
        ld1             {v0.s}[0], [x2], x3
        ld1             {v1.s}[0], [x2], x3
        subs            w5,  w5,  #2
        st1             {v0.s}[0], [x0], x1
        st1             {v1.s}[0], [x0], x1
        b.gt            2b
        ret
4:
        ld1             {v0.8b}, [x2], x3
        ld1             {v1.8b}, [x2], x3
        subs            w5,  w5,  #2
        st1             {v0.8b}, [x0], x1
        st1             {v1.8b}, [x0], x1
        b.gt            4b
        ret
80:
        add             x8,  x0,  x1
        lsl             x1,  x1,  #1
        add             x9,  x2,  x3
        lsl             x3,  x3,  #1
8:
        ld1             {v0.16b}, [x2], x3
        ld1             {v1.16b}, [x9], x3
        subs            w5,  w5,  #2
        st1             {v0.16b}, [x0], x1
        st1             {v1.16b}, [x8], x1
        b.gt            8b
        ret
16:
        ldp             x6,  x7,  [x2]
        ldp             x8,  x9,  [x2, #16]
        stp             x6,  x7,  [x0]
        subs            w5,  w5,  #1
        stp             x8,  x9,  [x0, #16]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            16b
        ret
32:
        ldp             x6,  x7,  [x2]
        ldp             x8,  x9,  [x2, #16]
        stp             x6,  x7,  [x0]
        ldp             x10, x11, [x2, #32]
        stp             x8,  x9,  [x0, #16]
        subs            w5,  w5,  #1
        ldp             x12, x13, [x2, #48]
        stp             x10, x11, [x0, #32]
        stp             x12, x13, [x0, #48]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            32b
        ret
64:
        ldp             q0,  q1,  [x2]
        ldp             q2,  q3,  [x2, #32]
        stp             q0,  q1,  [x0]
        ldp             q4,  q5,  [x2, #64]
        stp             q2,  q3,  [x0, #32]
        ldp             q6,  q7,  [x2, #96]
        subs            w5,  w5,  #1
        stp             q4,  q5,  [x0, #64]
        stp             q6,  q7,  [x0, #96]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            64b
        ret
128:
        ldp             q0,  q1,  [x2]
        ldp             q2,  q3,  [x2, #32]
        stp             q0,  q1,  [x0]
        ldp             q4,  q5,  [x2, #64]
        stp             q2,  q3,  [x0, #32]
        ldp             q6,  q7,  [x2, #96]
        subs            w5,  w5,  #1
        stp             q4,  q5,  [x0, #64]
        ldp             q16, q17, [x2, #128]
        stp             q6,  q7,  [x0, #96]
        ldp             q18, q19, [x2, #160]
        stp             q16, q17, [x0, #128]
        ldp             q20, q21, [x2, #192]
        stp             q18, q19, [x0, #160]
        ldp             q22, q23, [x2, #224]
        stp             q20, q21, [x0, #192]
        stp             q22, q23, [x0, #224]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            128b
        ret

L(put_tbl):
        .hword L(put_tbl) - 128b
        .hword L(put_tbl) -  64b
        .hword L(put_tbl) -  32b
        .hword L(put_tbl) -  16b
        .hword L(put_tbl) -  80b
        .hword L(put_tbl) -   4b
        .hword L(put_tbl) -   2b
endfunc


// This has got the same signature as the prep_8tap functions,
// and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and
// x8 to w*2.
function prep_neon
        adr             x10, L(prep_tbl)
        ldrh            w9,  [x10, x9, lsl #1]
        dup             v31.8h, w7   // intermediate_bits
        movi            v30.8h, #(PREP_BIAS >> 8), lsl #8
        sub             x10, x10, w9, uxtw
        br              x10

40:
        add             x9,  x1,  x2
        lsl             x2,  x2,  #1
4:
        ld1             {v0.d}[0], [x1], x2
        ld1             {v0.d}[1], [x9], x2
        subs            w4,  w4,  #2
        sshl            v0.8h,  v0.8h,  v31.8h
        sub             v0.8h,  v0.8h,  v30.8h
        st1             {v0.8h}, [x0], #16
        b.gt            4b
        ret
80:
        add             x9,  x1,  x2
        lsl             x2,  x2,  #1
8:
        ld1             {v0.8h}, [x1], x2
        ld1             {v1.8h}, [x9], x2
        subs            w4,  w4,  #2
        sshl            v0.8h,  v0.8h,  v31.8h
        sshl            v1.8h,  v1.8h,  v31.8h
        sub             v0.8h,  v0.8h,  v30.8h
        sub             v1.8h,  v1.8h,  v30.8h
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            8b
        ret
16:
        ldp             q0,  q1,  [x1]
        add             x1,  x1,  x2
        sshl            v0.8h,  v0.8h,  v31.8h
        ldp             q2,  q3,  [x1]
        add             x1,  x1,  x2
        subs            w4,  w4,  #2
        sshl            v1.8h,  v1.8h,  v31.8h
        sshl            v2.8h,  v2.8h,  v31.8h
        sshl            v3.8h,  v3.8h,  v31.8h
        sub             v0.8h,  v0.8h,  v30.8h
        sub             v1.8h,  v1.8h,  v30.8h
        sub             v2.8h,  v2.8h,  v30.8h
        sub             v3.8h,  v3.8h,  v30.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            16b
        ret
32:
        ldp             q0,  q1,  [x1]
        sshl            v0.8h,  v0.8h,  v31.8h
        ldp             q2,  q3,  [x1, #32]
        add             x1,  x1,  x2
        sshl            v1.8h,  v1.8h,  v31.8h
        sshl            v2.8h,  v2.8h,  v31.8h
        sshl            v3.8h,  v3.8h,  v31.8h
        subs            w4,  w4,  #1
        sub             v0.8h,  v0.8h,  v30.8h
        sub             v1.8h,  v1.8h,  v30.8h
        sub             v2.8h,  v2.8h,  v30.8h
        sub             v3.8h,  v3.8h,  v30.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            32b
        ret
64:
        ldp             q0,  q1,  [x1]
        subs            w4,  w4,  #1
        sshl            v0.8h,  v0.8h,  v31.8h
        ldp             q2,  q3,  [x1, #32]
        sshl            v1.8h,  v1.8h,  v31.8h
        ldp             q4,  q5,  [x1, #64]
        sshl            v2.8h,  v2.8h,  v31.8h
        sshl            v3.8h,  v3.8h,  v31.8h
        ldp             q6,  q7,  [x1, #96]
        add             x1,  x1,  x2
        sshl            v4.8h,  v4.8h,  v31.8h
        sshl            v5.8h,  v5.8h,  v31.8h
        sshl            v6.8h,  v6.8h,  v31.8h
        sshl            v7.8h,  v7.8h,  v31.8h
        sub             v0.8h,  v0.8h,  v30.8h
        sub             v1.8h,  v1.8h,  v30.8h
        sub             v2.8h,  v2.8h,  v30.8h
        sub             v3.8h,  v3.8h,  v30.8h
        stp             q0,  q1,  [x0]
        sub             v4.8h,  v4.8h,  v30.8h
        sub             v5.8h,  v5.8h,  v30.8h
        stp             q2,  q3,  [x0, #32]
        sub             v6.8h,  v6.8h,  v30.8h
        sub             v7.8h,  v7.8h,  v30.8h
        stp             q4,  q5,  [x0, #64]
        stp             q6,  q7,  [x0, #96]
        add             x0,  x0,  x8
        b.gt            64b
        ret
128:
        ldp             q0,  q1,  [x1]
        subs            w4,  w4,  #1
        sshl            v0.8h,  v0.8h,  v31.8h
        ldp             q2,  q3,  [x1, #32]
        sshl            v1.8h,  v1.8h,  v31.8h
        ldp             q4,  q5,  [x1, #64]
        sshl            v2.8h,  v2.8h,  v31.8h
        sshl            v3.8h,  v3.8h,  v31.8h
        ldp             q6,  q7,  [x1, #96]
        sshl            v4.8h,  v4.8h,  v31.8h
        sshl            v5.8h,  v5.8h,  v31.8h
        ldp             q16, q17, [x1, #128]
        sshl            v6.8h,  v6.8h,  v31.8h
        sshl            v7.8h,  v7.8h,  v31.8h
        ldp             q18, q19, [x1, #160]
        sshl            v16.8h, v16.8h, v31.8h
        sshl            v17.8h, v17.8h, v31.8h
        ldp             q20, q21, [x1, #192]
        sshl            v18.8h, v18.8h, v31.8h
        sshl            v19.8h, v19.8h, v31.8h
        ldp             q22, q23, [x1, #224]
        add             x1,  x1,  x2
        sshl            v20.8h, v20.8h, v31.8h
        sshl            v21.8h, v21.8h, v31.8h
        sshl            v22.8h, v22.8h, v31.8h
        sshl            v23.8h, v23.8h, v31.8h
        sub             v0.8h,  v0.8h,  v30.8h
        sub             v1.8h,  v1.8h,  v30.8h
        sub             v2.8h,  v2.8h,  v30.8h
        sub             v3.8h,  v3.8h,  v30.8h
        stp             q0,  q1,  [x0]
        sub             v4.8h,  v4.8h,  v30.8h
        sub             v5.8h,  v5.8h,  v30.8h
        stp             q2,  q3,  [x0, #32]
        sub             v6.8h,  v6.8h,  v30.8h
        sub             v7.8h,  v7.8h,  v30.8h
        stp             q4,  q5,  [x0, #64]
        sub             v16.8h, v16.8h, v30.8h
        sub             v17.8h, v17.8h, v30.8h
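        // (The stores are interleaved with the PREP_BIAS subtractions
        // here, which is presumably meant to help dual issue at these
        // large widths.)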
        stp             q6,  q7,  [x0, #96]
        sub             v18.8h, v18.8h, v30.8h
        sub             v19.8h, v19.8h, v30.8h
        stp             q16, q17, [x0, #128]
        sub             v20.8h, v20.8h, v30.8h
        sub             v21.8h, v21.8h, v30.8h
        stp             q18, q19, [x0, #160]
        sub             v22.8h, v22.8h, v30.8h
        sub             v23.8h, v23.8h, v30.8h
        stp             q20, q21, [x0, #192]
        stp             q22, q23, [x0, #224]
        add             x0,  x0,  x8
        b.gt            128b
        ret

L(prep_tbl):
        .hword L(prep_tbl) - 128b
        .hword L(prep_tbl) -  64b
        .hword L(prep_tbl) -  32b
        .hword L(prep_tbl) -  16b
        .hword L(prep_tbl) -  80b
        .hword L(prep_tbl) -  40b
endfunc


.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        ld1             {\d0\wd}[0], [\s0], \strd
        ld1             {\d1\wd}[0], [\s1], \strd
.ifnb \d2
        ld1             {\d2\wd}[0], [\s0], \strd
        ld1             {\d3\wd}[0], [\s1], \strd
.endif
.ifnb \d4
        ld1             {\d4\wd}[0], [\s0], \strd
.endif
.ifnb \d5
        ld1             {\d5\wd}[0], [\s1], \strd
.endif
.ifnb \d6
        ld1             {\d6\wd}[0], [\s0], \strd
.endif
.endm
.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        ld1             {\d0\wd}, [\s0], \strd
        ld1             {\d1\wd}, [\s1], \strd
.ifnb \d2
        ld1             {\d2\wd}, [\s0], \strd
        ld1             {\d3\wd}, [\s1], \strd
.endif
.ifnb \d4
        ld1             {\d4\wd}, [\s0], \strd
.endif
.ifnb \d5
        ld1             {\d5\wd}, [\s1], \strd
.endif
.ifnb \d6
        ld1             {\d6\wd}, [\s0], \strd
.endif
.endm
.macro load_regpair s0, s1, strd, wd, d0, d1, d2, d3, d4, d5
        ld1             {\d0\wd, \d1\wd}, [\s0], \strd
.ifnb \d2
        ld1             {\d2\wd, \d3\wd}, [\s1], \strd
.endif
.ifnb \d4
        ld1             {\d4\wd, \d5\wd}, [\s0], \strd
.endif
.endm
.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_slice      \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_4h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_reg        \s0, \s1, \strd, .4h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_8h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_reg        \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_16h s0, s1, strd, d0, d1, d2, d3, d4, d5
        load_regpair    \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5
.endm
.macro interleave_1 wd, r0, r1, r2, r3, r4
        trn1            \r0\wd, \r0\wd, \r1\wd
        trn1            \r1\wd, \r1\wd, \r2\wd
.ifnb \r3
        trn1            \r2\wd, \r2\wd, \r3\wd
        trn1            \r3\wd, \r3\wd, \r4\wd
.endif
.endm
.macro interleave_1_s r0, r1, r2, r3, r4
        interleave_1    .2s, \r0, \r1, \r2, \r3, \r4
.endm
.macro umin_h c, wd, r0, r1, r2, r3
        umin            \r0\wd, \r0\wd, \c\wd
.ifnb \r1
        umin            \r1\wd, \r1\wd, \c\wd
.endif
.ifnb \r2
        umin            \r2\wd, \r2\wd, \c\wd
        umin            \r3\wd, \r3\wd, \c\wd
.endif
.endm
.macro sub_h c, wd, r0, r1, r2, r3
        sub             \r0\wd, \r0\wd, \c\wd
.ifnb \r1
        sub             \r1\wd, \r1\wd, \c\wd
.endif
.ifnb \r2
        sub             \r2\wd, \r2\wd, \c\wd
        sub             \r3\wd, \r3\wd, \c\wd
.endif
.endm
.macro smull_smlal_4 d, s0, s1, s2, s3
        smull           \d\().4s, \s0\().4h, v0.h[0]
        smlal           \d\().4s, \s1\().4h, v0.h[1]
        smlal           \d\().4s, \s2\().4h, v0.h[2]
        smlal           \d\().4s, \s3\().4h, v0.h[3]
.endm
.macro smull2_smlal2_4 d, s0, s1, s2, s3
        smull2          \d\().4s, \s0\().8h, v0.h[0]
        smlal2          \d\().4s, \s1\().8h, v0.h[1]
        smlal2          \d\().4s, \s2\().8h, v0.h[2]
        smlal2          \d\().4s, \s3\().8h, v0.h[3]
.endm
.macro smull_smlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7
        smull           \d\().4s, \s0\().4h, v0.h[0]
        smlal           \d\().4s, \s1\().4h, v0.h[1]
        smlal           \d\().4s, \s2\().4h, v0.h[2]
        smlal           \d\().4s, \s3\().4h, v0.h[3]
        smlal           \d\().4s, \s4\().4h, v0.h[4]
        smlal           \d\().4s, \s5\().4h, v0.h[5]
        smlal           \d\().4s, \s6\().4h, v0.h[6]
        smlal           \d\().4s, \s7\().4h, v0.h[7]
.endm
.macro smull2_smlal2_8 d, s0, s1, s2, s3, s4, s5, s6, s7
        smull2          \d\().4s, \s0\().8h, v0.h[0]
        smlal2          \d\().4s, \s1\().8h, v0.h[1]
        smlal2          \d\().4s, \s2\().8h, v0.h[2]
        smlal2          \d\().4s, \s3\().8h, v0.h[3]
        smlal2          \d\().4s, \s4\().8h, v0.h[4]
        smlal2          \d\().4s, \s5\().8h, v0.h[5]
        smlal2          \d\().4s, \s6\().8h, v0.h[6]
        smlal2          \d\().4s, \s7\().8h, v0.h[7]
.endm
.macro sqrshrun_h shift, r0, r1, r2, r3
        sqrshrun        \r0\().4h, \r0\().4s, #\shift
.ifnb \r1
        sqrshrun2       \r0\().8h, \r1\().4s, #\shift
.endif
.ifnb \r2
        sqrshrun        \r2\().4h, \r2\().4s, #\shift
        sqrshrun2       \r2\().8h, \r3\().4s, #\shift
.endif
.endm
.macro xtn_h r0, r1, r2, r3
        xtn             \r0\().4h, \r0\().4s
        xtn2            \r0\().8h, \r1\().4s
.ifnb \r2
        xtn             \r2\().4h, \r2\().4s
        xtn2            \r2\().8h, \r3\().4s
.endif
.endm
.macro srshl_s shift, r0, r1, r2, r3
        srshl           \r0\().4s, \r0\().4s, \shift\().4s
        srshl           \r1\().4s, \r1\().4s, \shift\().4s
.ifnb \r2
        srshl           \r2\().4s, \r2\().4s, \shift\().4s
        srshl           \r3\().4s, \r3\().4s, \shift\().4s
.endif
.endm
.macro st_s strd, reg, lanes
        st1             {\reg\().s}[0], [x0], \strd
        st1             {\reg\().s}[1], [x9], \strd
.if \lanes > 2
        st1             {\reg\().s}[2], [x0], \strd
        st1             {\reg\().s}[3], [x9], \strd
.endif
.endm
.macro st_d strd, r0, r1
        st1             {\r0\().d}[0], [x0], \strd
        st1             {\r0\().d}[1], [x9], \strd
.ifnb \r1
        st1             {\r1\().d}[0], [x0], \strd
        st1             {\r1\().d}[1], [x9], \strd
.endif
.endm
.macro shift_store_4 type, strd, r0, r1, r2, r3
.ifc \type, put
        sqrshrun_h      6,   \r0, \r1, \r2, \r3
        umin_h          v31, .8h, \r0, \r2
.else
        srshl_s         v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
        xtn_h           \r0, \r1, \r2, \r3
        sub_h           v29, .8h, \r0, \r2      // PREP_BIAS
.endif
        st_d            \strd, \r0, \r2
.endm
.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
        st1             {\r0\wd}, [x0], \strd
        st1             {\r1\wd}, [x9], \strd
.ifnb \r2
        st1             {\r2\wd}, [x0], \strd
        st1             {\r3\wd}, [x9], \strd
.endif
.ifnb \r4
        st1             {\r4\wd}, [x0], \strd
        st1             {\r5\wd}, [x9], \strd
        st1             {\r6\wd}, [x0], \strd
        st1             {\r7\wd}, [x9], \strd
.endif
.endm
.macro st_8h strd, r0, r1, r2, r3, r4, r5, r6, r7
        st_reg          \strd, .8h, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endm
.macro shift_store_8 type, strd, r0, r1, r2, r3
.ifc \type, put
        sqrshrun_h      6,   \r0, \r1, \r2, \r3
        umin_h          v31, .8h, \r0, \r2
.else
        srshl_s         v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
        xtn_h           \r0, \r1, \r2, \r3
        sub_h           v29, .8h, \r0, \r2      // PREP_BIAS
.endif
        st_8h           \strd, \r0, \r2
.endm
.macro shift_store_16 type, strd, dst, r0, r1, r2, r3
.ifc \type, put
        sqrshrun_h      6,   \r0, \r1, \r2, \r3
        umin            \r0\().8h, \r0\().8h, v31.8h
        umin            \r1\().8h, \r2\().8h, v31.8h
.else
        srshl_s         v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
        xtn_h           \r0, \r1, \r2, \r3
        sub             \r0\().8h, \r0\().8h, v29.8h
        sub             \r1\().8h, \r2\().8h, v29.8h
.endif
        st1             {\r0\().8h, \r1\().8h}, [\dst], \strd
.endm

.macro make_8tap_fn op, type, type_h, type_v
function \op\()_8tap_\type\()_16bpc_neon, export=1
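        // The entry points only record the packed horizontal/vertical
        // filter types; they are folded into mx/my in \op\()_8tap_neon.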
        mov             w9,  \type_h
        mov             w10, \type_v
        b               \op\()_8tap_neon
endfunc
.endm

// No spaces in these expressions, due to gas-preprocessor.
#define REGULAR ((0*15<<7)|3*15)
#define SMOOTH  ((1*15<<7)|4*15)
#define SHARP   ((2*15<<7)|3*15)

.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
make_8tap_fn \type, regular,        REGULAR, REGULAR
make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
make_8tap_fn \type, regular_sharp,  REGULAR, SHARP
make_8tap_fn \type, smooth,         SMOOTH,  SMOOTH
make_8tap_fn \type, smooth_regular, SMOOTH,  REGULAR
make_8tap_fn \type, smooth_sharp,   SMOOTH,  SHARP
make_8tap_fn \type, sharp,          SHARP,   SHARP
make_8tap_fn \type, sharp_regular,  SHARP,   REGULAR
make_8tap_fn \type, sharp_smooth,   SHARP,   SMOOTH

function \type\()_8tap_neon
.ifc \bdmax, w8
        ldr             w8,  [sp]
.endif
        mov             w11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
        mul             \mx, \mx, w11
        mul             \my, \my, w11
        add             \mx, \mx, w9  // mx, 8tap_h, 4tap_h
        add             \my, \my, w10 // my, 8tap_v, 4tap_v
.ifc \type, prep
        uxtw            \d_strd, \w
        lsl             \d_strd, \d_strd, #1
.endif

        dup             v31.8h, \bdmax // bitdepth_max
        clz             \bdmax, \bdmax
        clz             w9,  \w
        sub             \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18
        mov             w12, #6
        tst             \mx, #(0x7f << 14)
        sub             w9,  w9,  #24
        add             w13, w12, \bdmax // 6 + intermediate_bits
        sub             w12, w12, \bdmax // 6 - intermediate_bits
        movrel          x11, X(mc_subpel_filters), -8
        b.ne            L(\type\()_8tap_h)
        tst             \my, #(0x7f << 14)
        b.ne            L(\type\()_8tap_v)
        b               \type\()_neon

L(\type\()_8tap_h):
        cmp             \w,  #4
        ubfx            w10, \mx, #7, #7
        and             \mx, \mx, #0x7f
        b.le            4f
        mov             \mx, w10
4:
        tst             \my, #(0x7f << 14)
        add             \xmx, x11, \mx, uxtw #3
        b.ne            L(\type\()_8tap_hv)

        adr             x10, L(\type\()_8tap_h_tbl)
        dup             v30.4s, w12    // 6 - intermediate_bits
        ldrh            w9,  [x10, x9, lsl #1]
        neg             v30.4s, v30.4s // -(6-intermediate_bits)
.ifc \type, put
        dup             v29.8h, \bdmax // intermediate_bits
.else
        movi            v28.8h, #(PREP_BIAS >> 8), lsl #8
.endif
        sub             x10, x10, w9, uxtw
.ifc \type, put
        neg             v29.8h, v29.8h // -intermediate_bits
.endif
        br              x10

20:     // 2xN h
.ifc \type, put
        add             \xmx, \xmx, #2
        ld1             {v0.s}[0], [\xmx]
        sub             \src, \src, #2
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1
        sxtl            v0.8h,  v0.8b
2:
        ld1             {v4.8h}, [\src], \s_strd
        ld1             {v6.8h}, [\sr2], \s_strd
        ext             v5.16b, v4.16b, v4.16b, #2
        ext             v7.16b, v6.16b, v6.16b, #2
        subs            \h,  \h,  #2
        trn1            v3.2s,  v4.2s,  v6.2s
        trn2            v6.2s,  v4.2s,  v6.2s
        trn1            v4.2s,  v5.2s,  v7.2s
        trn2            v7.2s,  v5.2s,  v7.2s
        smull           v3.4s,  v3.4h,  v0.h[0]
        smlal           v3.4s,  v4.4h,  v0.h[1]
        smlal           v3.4s,  v6.4h,  v0.h[2]
        smlal           v3.4s,  v7.4h,  v0.h[3]
        srshl           v3.4s,  v3.4s,  v30.4s // -(6-intermediate_bits)
        sqxtun          v3.4h,  v3.4s
        srshl           v3.4h,  v3.4h,  v29.4h // -intermediate_bits
        umin            v3.4h,  v3.4h,  v31.4h
        st1             {v3.s}[0], [\dst], \d_strd
        st1             {v3.s}[1], [\ds2], \d_strd
        b.gt            2b
        ret
.endif

40:     // 4xN h
        add             \xmx, \xmx, #2
        ld1             {v0.s}[0], [\xmx]
        sub             \src, \src, #2
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1
        sxtl            v0.8h,  v0.8b
4:
        ld1             {v16.8h}, [\src], \s_strd
        ld1             {v20.8h}, [\sr2], \s_strd
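        // The 4 tap horizontal filter is applied with ext: each ext below
        // yields the source row shifted by one more pixel (2 bytes),
        // providing the x+1..x+3 taps for the smlal chain.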
        ext             v17.16b, v16.16b, v16.16b, #2
        ext             v18.16b, v16.16b, v16.16b, #4
        ext             v19.16b, v16.16b, v16.16b, #6
        ext             v21.16b, v20.16b, v20.16b, #2
        ext             v22.16b, v20.16b, v20.16b, #4
        ext             v23.16b, v20.16b, v20.16b, #6
        subs            \h,  \h,  #2
        smull           v16.4s, v16.4h, v0.h[0]
        smlal           v16.4s, v17.4h, v0.h[1]
        smlal           v16.4s, v18.4h, v0.h[2]
        smlal           v16.4s, v19.4h, v0.h[3]
        smull           v20.4s, v20.4h, v0.h[0]
        smlal           v20.4s, v21.4h, v0.h[1]
        smlal           v20.4s, v22.4h, v0.h[2]
        smlal           v20.4s, v23.4h, v0.h[3]
        srshl           v16.4s, v16.4s, v30.4s // -(6-intermediate_bits)
        srshl           v20.4s, v20.4s, v30.4s // -(6-intermediate_bits)
.ifc \type, put
        sqxtun          v16.4h, v16.4s
        sqxtun2         v16.8h, v20.4s
        srshl           v16.8h, v16.8h, v29.8h // -intermediate_bits
        umin            v16.8h, v16.8h, v31.8h
.else
        xtn             v16.4h, v16.4s
        xtn2            v16.8h, v20.4s
        sub             v16.8h, v16.8h, v28.8h // PREP_BIAS
.endif
        st1             {v16.d}[0], [\dst], \d_strd
        st1             {v16.d}[1], [\ds2], \d_strd
        b.gt            4b
        ret

80:
160:
320:
640:
1280:   // 8xN, 16xN, 32xN, ... h
        ld1             {v0.8b}, [\xmx]
        sub             \src, \src, #6
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        sxtl            v0.8h,  v0.8b

        sub             \s_strd, \s_strd, \w, uxtw #1
        sub             \s_strd, \s_strd, #16
.ifc \type, put
        lsl             \d_strd, \d_strd, #1
        sub             \d_strd, \d_strd, \w, uxtw #1
.endif
81:
        ld1             {v16.8h, v17.8h}, [\src], #32
        ld1             {v20.8h, v21.8h}, [\sr2], #32
        mov             \mx, \w

8:
        smull           v18.4s, v16.4h, v0.h[0]
        smull2          v19.4s, v16.8h, v0.h[0]
        smull           v22.4s, v20.4h, v0.h[0]
        smull2          v23.4s, v20.8h, v0.h[0]
.irpc i, 1234567
        ext             v24.16b, v16.16b, v17.16b, #(2*\i)
        ext             v25.16b, v20.16b, v21.16b, #(2*\i)
        smlal           v18.4s, v24.4h, v0.h[\i]
        smlal2          v19.4s, v24.8h, v0.h[\i]
        smlal           v22.4s, v25.4h, v0.h[\i]
        smlal2          v23.4s, v25.8h, v0.h[\i]
.endr
        subs            \mx, \mx, #8
        srshl           v18.4s, v18.4s, v30.4s // -(6-intermediate_bits)
        srshl           v19.4s, v19.4s, v30.4s // -(6-intermediate_bits)
        srshl           v22.4s, v22.4s, v30.4s // -(6-intermediate_bits)
        srshl           v23.4s, v23.4s, v30.4s // -(6-intermediate_bits)
.ifc \type, put
        sqxtun          v18.4h, v18.4s
        sqxtun2         v18.8h, v19.4s
        sqxtun          v22.4h, v22.4s
        sqxtun2         v22.8h, v23.4s
        srshl           v18.8h, v18.8h, v29.8h // -intermediate_bits
        srshl           v22.8h, v22.8h, v29.8h // -intermediate_bits
        umin            v18.8h, v18.8h, v31.8h
        umin            v22.8h, v22.8h, v31.8h
.else
        xtn             v18.4h, v18.4s
        xtn2            v18.8h, v19.4s
        xtn             v22.4h, v22.4s
        xtn2            v22.8h, v23.4s
        sub             v18.8h, v18.8h, v28.8h // PREP_BIAS
        sub             v22.8h, v22.8h, v28.8h // PREP_BIAS
.endif
        st1             {v18.8h}, [\dst], #16
        st1             {v22.8h}, [\ds2], #16
        b.le            9f

        mov             v16.16b, v17.16b
        mov             v20.16b, v21.16b
        ld1             {v17.8h}, [\src], #16
        ld1             {v21.8h}, [\sr2], #16
        b               8b

9:
        add             \dst, \dst, \d_strd
        add             \ds2, \ds2, \d_strd
        add             \src, \src, \s_strd
        add             \sr2, \sr2, \s_strd

        subs            \h,  \h,  #2
        b.gt            81b
        ret

L(\type\()_8tap_h_tbl):
        .hword L(\type\()_8tap_h_tbl) - 1280b
        .hword L(\type\()_8tap_h_tbl) -  640b
        .hword L(\type\()_8tap_h_tbl) -  320b
        .hword L(\type\()_8tap_h_tbl) -  160b
        .hword L(\type\()_8tap_h_tbl) -   80b
        .hword L(\type\()_8tap_h_tbl) -   40b
        .hword L(\type\()_8tap_h_tbl) -   20b
        .hword 0


L(\type\()_8tap_v):
        cmp             \h,  #4
        ubfx            w10, \my, #7, #7
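        // After the 0x4081 multiply, bits 7-13 of \my index the filter set
        // used for h > 4 and bits 0-6 the one used for small heights (see
        // the "my, 8tap_v, 4tap_v" comment above); keep one of the two.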
        and             \my, \my, #0x7f
        b.le            4f
        mov             \my, w10
4:
        add             \xmy, x11, \my, uxtw #3

.ifc \type, prep
        dup             v30.4s, w12 // 6 - intermediate_bits
        movi            v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
        adr             x10, L(\type\()_8tap_v_tbl)
        ldrh            w9,  [x10, x9, lsl #1]
.ifc \type, prep
        neg             v30.4s, v30.4s // -(6-intermediate_bits)
.endif
        sub             x10, x10, w9, uxtw
        br              x10

20:     // 2xN v
.ifc \type, put
        b.gt            28f

        cmp             \h,  #2
        add             \xmy, \xmy, #2
        ld1             {v0.s}[0], [\xmy]
        sub             \src, \src, \s_strd
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h,  v0.8b

        // 2x2 v
        load_s          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
        interleave_1_s  v1,  v2,  v3,  v4,  v5
        b.gt            24f
        smull_smlal_4   v6,  v1,  v2,  v3,  v4
        sqrshrun_h      6,   v6
        umin_h          v31, .8h, v6
        st_s            \d_strd, v6, 2
        ret

24:     // 2x4 v
        load_s          \sr2, \src, \s_strd, v6, v7
        interleave_1_s  v5,  v6,  v7
        smull_smlal_4   v16, v1,  v2,  v3,  v4
        smull_smlal_4   v17, v3,  v4,  v5,  v6
        sqrshrun_h      6,   v16, v17
        umin_h          v31, .8h, v16
        st_s            \d_strd, v16, 4
        ret

28:     // 2x8, 2x16 v
        ld1             {v0.8b}, [\xmy]
        sub             \sr2, \src, \s_strd, lsl #1
        add             \ds2, \dst, \d_strd
        sub             \src, \sr2, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1
        sxtl            v0.8h,  v0.8b

        load_s          \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7
        interleave_1_s  v1,  v2,  v3,  v4,  v5
        interleave_1_s  v5,  v6,  v7
216:
        subs            \h,  \h,  #8
        load_s          \sr2, \src, \s_strd, v16, v17, v18, v19
        load_s          \sr2, \src, \s_strd, v20, v21, v22, v23
        interleave_1_s  v7,  v16, v17, v18, v19
        interleave_1_s  v19, v20, v21, v22, v23
        smull_smlal_8   v24, v1,  v2,  v3,  v4,  v5,  v6,  v7,  v16
        smull_smlal_8   v25, v3,  v4,  v5,  v6,  v7,  v16, v17, v18
        smull_smlal_8   v26, v5,  v6,  v7,  v16, v17, v18, v19, v20
        smull_smlal_8   v27, v7,  v16, v17, v18, v19, v20, v21, v22
        sqrshrun_h      6,   v24, v25, v26, v27
        umin_h          v31, .8h, v24, v26
        st_s            \d_strd, v24, 4
        st_s            \d_strd, v26, 4
        b.le            0f
        mov             v1.16b,  v17.16b
        mov             v2.16b,  v18.16b
        mov             v3.16b,  v19.16b
        mov             v4.16b,  v20.16b
        mov             v5.16b,  v21.16b
        mov             v6.16b,  v22.16b
        mov             v7.16b,  v23.16b
        b               216b
0:
        ret
.endif

40:
        b.gt            480f

        // 4x2, 4x4 v
        cmp             \h,  #2
        add             \xmy, \xmy, #2
        ld1             {v0.s}[0], [\xmy]
        sub             \src, \src, \s_strd
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h,  v0.8b

        load_4h         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
        smull_smlal_4   v6,  v1,  v2,  v3,  v4
        smull_smlal_4   v7,  v2,  v3,  v4,  v5
        shift_store_4   \type, \d_strd, v6, v7
        b.le            0f
        load_4h         \sr2, \src, \s_strd, v6, v7
        smull_smlal_4   v1,  v3,  v4,  v5,  v6
        smull_smlal_4   v2,  v4,  v5,  v6,  v7
        shift_store_4   \type, \d_strd, v1, v2
0:
        ret

480:    // 4x8, 4x16 v
        ld1             {v0.8b}, [\xmy]
        sub             \sr2, \src, \s_strd, lsl #1
        add             \ds2, \dst, \d_strd
        sub             \src, \sr2, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h,  v0.8b

        load_4h         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22

48:
        subs            \h,  \h,  #4
        load_4h         \sr2, \src, \s_strd, v23, v24, v25, v26
        smull_smlal_8   v1,  v16, v17, v18, v19, v20, v21, v22, v23
        smull_smlal_8   v2,  v17, v18, v19, v20, v21, v22, v23, v24
        smull_smlal_8   v3,  v18, v19, v20, v21, v22, v23, v24, v25
        smull_smlal_8   v4,  v19, v20, v21, v22, v23, v24, v25, v26
        shift_store_4   \type, \d_strd, v1, v2, v3, v4
        b.le            0f
        mov             v16.8b, v20.8b
        mov             v17.8b, v21.8b
        mov             v18.8b, v22.8b
        mov             v19.8b, v23.8b
        mov             v20.8b, v24.8b
        mov             v21.8b, v25.8b
        mov             v22.8b, v26.8b
        b               48b
0:
        ret

80:
        b.gt            880f

        // 8x2, 8x4 v
        cmp             \h,  #2
        add             \xmy, \xmy, #2
        ld1             {v0.s}[0], [\xmy]
        sub             \src, \src, \s_strd
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h,  v0.8b

        load_8h         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
        smull_smlal_4   v16, v1,  v2,  v3,  v4
        smull2_smlal2_4 v17, v1,  v2,  v3,  v4
        smull_smlal_4   v18, v2,  v3,  v4,  v5
        smull2_smlal2_4 v19, v2,  v3,  v4,  v5
        shift_store_8   \type, \d_strd, v16, v17, v18, v19
        b.le            0f
        load_8h         \sr2, \src, \s_strd, v6, v7
        smull_smlal_4   v16, v3,  v4,  v5,  v6
        smull2_smlal2_4 v17, v3,  v4,  v5,  v6
        smull_smlal_4   v18, v4,  v5,  v6,  v7
        smull2_smlal2_4 v19, v4,  v5,  v6,  v7
        shift_store_8   \type, \d_strd, v16, v17, v18, v19
0:
        ret

880:    // 8x6, 8x8, 8x16, 8x32 v
1680:   // 16x8, 16x16, ...
320:    // 32x8, 32x16, ...
640:
1280:
        ld1             {v0.8b}, [\xmy]
        sub             \src, \src, \s_strd
        sub             \src, \src, \s_strd, lsl #1
        sxtl            v0.8h,  v0.8b
        mov             \my, \h
168:
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        load_8h         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22

88:
        subs            \h,  \h,  #2
        load_8h         \sr2, \src, \s_strd, v23, v24
        smull_smlal_8   v1,  v16, v17, v18, v19, v20, v21, v22, v23
        smull2_smlal2_8 v2,  v16, v17, v18, v19, v20, v21, v22, v23
        smull_smlal_8   v3,  v17, v18, v19, v20, v21, v22, v23, v24
        smull2_smlal2_8 v4,  v17, v18, v19, v20, v21, v22, v23, v24
        shift_store_8   \type, \d_strd, v1, v2, v3, v4
        b.le            9f
        subs            \h,  \h,  #2
        load_8h         \sr2, \src, \s_strd, v25, v26
        smull_smlal_8   v1,  v18, v19, v20, v21, v22, v23, v24, v25
        smull2_smlal2_8 v2,  v18, v19, v20, v21, v22, v23, v24, v25
        smull_smlal_8   v3,  v19, v20, v21, v22, v23, v24, v25, v26
        smull2_smlal2_8 v4,  v19, v20, v21, v22, v23, v24, v25, v26
        shift_store_8   \type, \d_strd, v1, v2, v3, v4
        b.le            9f
        mov             v16.16b, v20.16b
        mov             v17.16b, v21.16b
        mov             v18.16b, v22.16b
        mov             v19.16b, v23.16b
        mov             v20.16b, v24.16b
        mov             v21.16b, v25.16b
        mov             v22.16b, v26.16b
        b               88b
9:
        subs            \w,  \w,  #8
        b.le            0f
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #3
        mov             \h,  \my
        add             \src, \src, #16
        add             \dst, \dst, #16
        b               168b
0:
        ret

160:
        b.gt            1680b

        // 16x2, 16x4 v
        add             \xmy, \xmy, #2
        ld1             {v0.s}[0], [\xmy]
        sub             \src, \src, \s_strd
        sxtl            v0.8h,  v0.8b

        load_16h        \src, \src, \s_strd, v16, v17, v18, v19, v20, v21
16:
        load_16h        \src, \src, \s_strd, v22, v23
        subs            \h,  \h,  #1
        smull_smlal_4   v1,  v16, v18, v20, v22
        smull2_smlal2_4 v2,  v16, v18, v20, v22
        smull_smlal_4   v3,  v17, v19, v21, v23
        smull2_smlal2_4 v4,  v17, v19, v21, v23
        shift_store_16  \type, \d_strd, x0, v1, v2, v3, v4
        b.le            0f
        mov             v16.16b, v18.16b
        mov             v17.16b, v19.16b
9:
        subs            \w, \w, #8
        b.le            0f
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #3
        mov             \h, \my
        add             \src, \src, #16
        add             \dst, \dst, #16
        b               168b
0:
        ret

160:
        b.gt            1680b

        // 16x2, 16x4 v
        add             \xmy, \xmy, #2
        ld1             {v0.s}[0], [\xmy]
        sub             \src, \src, \s_strd
        sxtl            v0.8h, v0.8b

        load_16h        \src, \src, \s_strd, v16, v17, v18, v19, v20, v21
16:
        load_16h        \src, \src, \s_strd, v22, v23
        subs            \h, \h, #1
        smull_smlal_4   v1, v16, v18, v20, v22
        smull2_smlal2_4 v2, v16, v18, v20, v22
        smull_smlal_4   v3, v17, v19, v21, v23
        smull2_smlal2_4 v4, v17, v19, v21, v23
        shift_store_16  \type, \d_strd, x0, v1, v2, v3, v4
        b.le            0f
        mov             v16.16b, v18.16b
        mov             v17.16b, v19.16b
        mov             v18.16b, v20.16b
        mov             v19.16b, v21.16b
        mov             v20.16b, v22.16b
        mov             v21.16b, v23.16b
        b               16b
0:
        ret

L(\type\()_8tap_v_tbl):
        .hword L(\type\()_8tap_v_tbl) - 1280b
        .hword L(\type\()_8tap_v_tbl) - 640b
        .hword L(\type\()_8tap_v_tbl) - 320b
        .hword L(\type\()_8tap_v_tbl) - 160b
        .hword L(\type\()_8tap_v_tbl) - 80b
        .hword L(\type\()_8tap_v_tbl) - 40b
        .hword L(\type\()_8tap_v_tbl) - 20b
        .hword 0
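        // These .hword tables store, as 16-bit values, the distance of
        // each width-specific entry point below the table label; the
        // dispatch sequences (adr; ldrh; sub; br) load table[idx] and
        // branch to table_address - offset. The index is derived from
        // clz(\w) during setup, so the widest blocks (128) map to the
        // first entry and w=2 to the last.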
L(\type\()_8tap_hv):
        cmp             \h, #4
        ubfx            w10, \my, #7, #7
        and             \my, \my, #0x7f
        b.le            4f
        mov             \my, w10
4:
        add             \xmy, x11, \my, uxtw #3

        adr             x10, L(\type\()_8tap_hv_tbl)
        dup             v30.4s, w12           // 6 - intermediate_bits
        ldrh            w9, [x10, x9, lsl #1]
        neg             v30.4s, v30.4s        // -(6-intermediate_bits)
.ifc \type, put
        dup             v29.4s, w13           // 6 + intermediate_bits
.else
        movi            v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
        sub             x10, x10, w9, uxtw
.ifc \type, put
        neg             v29.4s, v29.4s        // -(6+intermediate_bits)
.endif
        br              x10

20:
.ifc \type, put
        add             \xmx, \xmx, #2
        ld1             {v0.s}[0], [\xmx]
        b.gt            280f
        add             \xmy, \xmy, #2
        ld1             {v1.s}[0], [\xmy]

        // 2x2, 2x4 hv
        sub             \sr2, \src, #2
        sub             \src, \sr2, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b
        sxtl            v1.8h, v1.8b
        mov             x15, x30
        sxtl            v1.4s, v1.4h

        ld1             {v27.8h}, [\src], \s_strd
        ext             v28.16b, v27.16b, v27.16b, #2
        smull           v27.4s, v27.4h, v0.4h
        smull           v28.4s, v28.4h, v0.4h
        addp            v27.4s, v27.4s, v28.4s
        addp            v16.4s, v27.4s, v27.4s
        srshl           v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
        bl              L(\type\()_8tap_filter_2)

        trn1            v16.2d, v16.2d, v24.2d
        mov             v17.16b, v24.16b

2:
        bl              L(\type\()_8tap_filter_2)

        ext             v18.16b, v17.16b, v24.16b, #8
        mov             v19.16b, v24.16b
        mul             v2.4s, v16.4s, v1.s[0]
        mla             v2.4s, v17.4s, v1.s[1]
        mla             v2.4s, v18.4s, v1.s[2]
        mla             v2.4s, v19.4s, v1.s[3]

        srshl           v2.4s, v2.4s, v29.4s  // -(6+intermediate_bits)
        sqxtun          v2.4h, v2.4s
        umin            v2.4h, v2.4h, v31.4h
        subs            \h, \h, #2
        st1             {v2.s}[0], [\dst], \d_strd
        st1             {v2.s}[1], [\ds2], \d_strd
        b.le            0f
        mov             v16.16b, v18.16b
        mov             v17.16b, v19.16b
        b               2b

280: // 2x8, 2x16, 2x32 hv
        ld1             {v1.8b}, [\xmy]
        sub             \src, \src, #2
        sub             \sr2, \src, \s_strd, lsl #1
        sub             \src, \sr2, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b
        sxtl            v1.8h, v1.8b
        mov             x15, x30
        sxtl2           v2.4s, v1.8h
        sxtl            v1.4s, v1.4h

        ld1             {v27.8h}, [\src], \s_strd
        ext             v28.16b, v27.16b, v27.16b, #2
        smull           v27.4s, v27.4h, v0.4h
        smull           v28.4s, v28.4h, v0.4h
        addp            v27.4s, v27.4s, v28.4s
        addp            v16.4s, v27.4s, v27.4s
        srshl           v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)

        bl              L(\type\()_8tap_filter_2)
        trn1            v16.2d, v16.2d, v24.2d
        mov             v17.16b, v24.16b
        bl              L(\type\()_8tap_filter_2)
        ext             v18.16b, v17.16b, v24.16b, #8
        mov             v19.16b, v24.16b
        bl              L(\type\()_8tap_filter_2)
        ext             v20.16b, v19.16b, v24.16b, #8
        mov             v21.16b, v24.16b

28:
        bl              L(\type\()_8tap_filter_2)
        ext             v22.16b, v21.16b, v24.16b, #8
        mov             v23.16b, v24.16b
        mul             v3.4s, v16.4s, v1.s[0]
        mla             v3.4s, v17.4s, v1.s[1]
        mla             v3.4s, v18.4s, v1.s[2]
        mla             v3.4s, v19.4s, v1.s[3]
        mla             v3.4s, v20.4s, v2.s[0]
        mla             v3.4s, v21.4s, v2.s[1]
        mla             v3.4s, v22.4s, v2.s[2]
        mla             v3.4s, v23.4s, v2.s[3]

        srshl           v3.4s, v3.4s, v29.4s  // -(6+intermediate_bits)
        sqxtun          v3.4h, v3.4s
        umin            v3.4h, v3.4h, v31.4h
        subs            \h, \h, #2
        st1             {v3.s}[0], [\dst], \d_strd
        st1             {v3.s}[1], [\ds2], \d_strd
        b.le            0f
        mov             v16.16b, v18.16b
        mov             v17.16b, v19.16b
        mov             v18.16b, v20.16b
        mov             v19.16b, v21.16b
        mov             v20.16b, v22.16b
        mov             v21.16b, v23.16b
        b               28b

0:
        br              x15
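        // L(\type\()_8tap_filter_2) below produces one new pair of rows
        // for the 2xN hv loops: it loads a row from each of \src/\sr2,
        // interleaves the two rows with trn1/trn2 so that one 4-tap
        // smull/smlal chain filters both horizontally at once, and
        // returns the shifted result in v24.4s.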
L(\type\()_8tap_filter_2):
        ld1             {v25.8h}, [\sr2], \s_strd
        ld1             {v27.8h}, [\src], \s_strd
        ext             v26.16b, v25.16b, v25.16b, #2
        ext             v28.16b, v27.16b, v27.16b, #2
        trn1            v24.2s, v25.2s, v27.2s
        trn2            v27.2s, v25.2s, v27.2s
        trn1            v25.2s, v26.2s, v28.2s
        trn2            v28.2s, v26.2s, v28.2s
        smull           v24.4s, v24.4h, v0.h[0]
        smlal           v24.4s, v25.4h, v0.h[1]
        smlal           v24.4s, v27.4h, v0.h[2]
        smlal           v24.4s, v28.4h, v0.h[3]
        srshl           v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
        ret
.endif

40:
        add             \xmx, \xmx, #2
        ld1             {v0.s}[0], [\xmx]
        b.gt            480f
        add             \xmy, \xmy, #2
        ld1             {v1.s}[0], [\xmy]
        sub             \sr2, \src, #2
        sub             \src, \sr2, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b
        sxtl            v1.8h, v1.8b
        mov             x15, x30

        // 4x2, 4x4 hv
        ld1             {v25.8h}, [\src], \s_strd
        ext             v26.16b, v25.16b, v25.16b, #2
        ext             v27.16b, v25.16b, v25.16b, #4
        ext             v28.16b, v25.16b, v25.16b, #6
        smull           v25.4s, v25.4h, v0.h[0]
        smlal           v25.4s, v26.4h, v0.h[1]
        smlal           v25.4s, v27.4h, v0.h[2]
        smlal           v25.4s, v28.4h, v0.h[3]
        srshl           v16.4s, v25.4s, v30.4s // -(6-intermediate_bits)
        // The intermediates from the horizontal pass fit in 16 bit without
        // any bias; we could just as well keep them as .4s, but narrowing
        // them to .4h gives a significant speedup on out of order cores
        // (at the cost of a smaller slowdown on in-order cores such as A53).
        xtn             v16.4h, v16.4s

        bl              L(\type\()_8tap_filter_4)
        mov             v17.8b, v24.8b
        mov             v18.8b, v25.8b
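        // The 4: loop below is, in rough C terms ('fv' are the vertical
        // coefficients and 'mid' the 16-bit horizontal intermediates;
        // both names are illustrative), a separable 4-tap filter:
        //
        //   for (int y = 0; y < h; y++)
        //       for (int x = 0; x < 4; x++) {
        //           int sum = 0;
        //           for (int k = 0; k < 4; k++)
        //               sum += fv[k] * mid[y + k][x];
        //           // put:  round-shift by 6+intermediate_bits,
        //           //       then clamp to [0, bitdepth_max]
        //           // prep: round-shift by 6, subtract PREP_BIAS
        //       }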
4:
        bl              L(\type\()_8tap_filter_4)
        smull           v2.4s, v16.4h, v1.h[0]
        smlal           v2.4s, v17.4h, v1.h[1]
        smlal           v2.4s, v18.4h, v1.h[2]
        smlal           v2.4s, v24.4h, v1.h[3]
        smull           v3.4s, v17.4h, v1.h[0]
        smlal           v3.4s, v18.4h, v1.h[1]
        smlal           v3.4s, v24.4h, v1.h[2]
        smlal           v3.4s, v25.4h, v1.h[3]
.ifc \type, put
        srshl           v2.4s, v2.4s, v29.4s  // -(6+intermediate_bits)
        srshl           v3.4s, v3.4s, v29.4s  // -(6+intermediate_bits)
        sqxtun          v2.4h, v2.4s
        sqxtun2         v2.8h, v3.4s
        umin            v2.8h, v2.8h, v31.8h
.else
        rshrn           v2.4h, v2.4s, #6
        rshrn2          v2.8h, v3.4s, #6
        sub             v2.8h, v2.8h, v29.8h  // PREP_BIAS
.endif
        subs            \h, \h, #2

        st1             {v2.d}[0], [\dst], \d_strd
        st1             {v2.d}[1], [\ds2], \d_strd
        b.le            0f
        mov             v16.8b, v18.8b
        mov             v17.8b, v24.8b
        mov             v18.8b, v25.8b
        b               4b

480: // 4x8, 4x16, 4x32 hv
        ld1             {v1.8b}, [\xmy]
        sub             \src, \src, #2
        sub             \sr2, \src, \s_strd, lsl #1
        sub             \src, \sr2, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b
        sxtl            v1.8h, v1.8b
        mov             x15, x30

        ld1             {v25.8h}, [\src], \s_strd
        ext             v26.16b, v25.16b, v25.16b, #2
        ext             v27.16b, v25.16b, v25.16b, #4
        ext             v28.16b, v25.16b, v25.16b, #6
        smull           v25.4s, v25.4h, v0.h[0]
        smlal           v25.4s, v26.4h, v0.h[1]
        smlal           v25.4s, v27.4h, v0.h[2]
        smlal           v25.4s, v28.4h, v0.h[3]
        srshl           v16.4s, v25.4s, v30.4s // -(6-intermediate_bits)
        // The intermediates from the horizontal pass fit in 16 bit without
        // any bias; we could just as well keep them as .4s, but narrowing
        // them to .4h gives a significant speedup on out of order cores
        // (at the cost of a smaller slowdown on in-order cores such as A53).
        xtn             v16.4h, v16.4s

        bl              L(\type\()_8tap_filter_4)
        mov             v17.8b, v24.8b
        mov             v18.8b, v25.8b
        bl              L(\type\()_8tap_filter_4)
        mov             v19.8b, v24.8b
        mov             v20.8b, v25.8b
        bl              L(\type\()_8tap_filter_4)
        mov             v21.8b, v24.8b
        mov             v22.8b, v25.8b

48:
        bl              L(\type\()_8tap_filter_4)
        smull           v3.4s, v16.4h, v1.h[0]
        smlal           v3.4s, v17.4h, v1.h[1]
        smlal           v3.4s, v18.4h, v1.h[2]
        smlal           v3.4s, v19.4h, v1.h[3]
        smlal           v3.4s, v20.4h, v1.h[4]
        smlal           v3.4s, v21.4h, v1.h[5]
        smlal           v3.4s, v22.4h, v1.h[6]
        smlal           v3.4s, v24.4h, v1.h[7]
        smull           v4.4s, v17.4h, v1.h[0]
        smlal           v4.4s, v18.4h, v1.h[1]
        smlal           v4.4s, v19.4h, v1.h[2]
        smlal           v4.4s, v20.4h, v1.h[3]
        smlal           v4.4s, v21.4h, v1.h[4]
        smlal           v4.4s, v22.4h, v1.h[5]
        smlal           v4.4s, v24.4h, v1.h[6]
        smlal           v4.4s, v25.4h, v1.h[7]
.ifc \type, put
        srshl           v3.4s, v3.4s, v29.4s  // -(6+intermediate_bits)
        srshl           v4.4s, v4.4s, v29.4s  // -(6+intermediate_bits)
        sqxtun          v3.4h, v3.4s
        sqxtun2         v3.8h, v4.4s
        umin            v3.8h, v3.8h, v31.8h
.else
        rshrn           v3.4h, v3.4s, #6
        rshrn2          v3.8h, v4.4s, #6
        sub             v3.8h, v3.8h, v29.8h  // PREP_BIAS
.endif
        subs            \h, \h, #2
        st1             {v3.d}[0], [\dst], \d_strd
        st1             {v3.d}[1], [\ds2], \d_strd
        b.le            0f
        mov             v16.8b, v18.8b
        mov             v17.8b, v19.8b
        mov             v18.8b, v20.8b
        mov             v19.8b, v21.8b
        mov             v20.8b, v22.8b
        mov             v21.8b, v24.8b
        mov             v22.8b, v25.8b
        b               48b
0:
        br              x15
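        // L(\type\()_8tap_filter_4) below runs the 4-tap horizontal
        // filter over two new input rows (one from \src, one from \sr2)
        // and returns them, shifted and narrowed to 16 bit, in v24.4h
        // and v25.4h, clobbering v26-v28.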
L(\type\()_8tap_filter_4):
        ld1             {v24.8h}, [\sr2], \s_strd
        ld1             {v25.8h}, [\src], \s_strd
        ext             v26.16b, v24.16b, v24.16b, #2
        ext             v27.16b, v24.16b, v24.16b, #4
        ext             v28.16b, v24.16b, v24.16b, #6
        smull           v24.4s, v24.4h, v0.h[0]
        smlal           v24.4s, v26.4h, v0.h[1]
        smlal           v24.4s, v27.4h, v0.h[2]
        smlal           v24.4s, v28.4h, v0.h[3]
        ext             v26.16b, v25.16b, v25.16b, #2
        ext             v27.16b, v25.16b, v25.16b, #4
        ext             v28.16b, v25.16b, v25.16b, #6
        smull           v25.4s, v25.4h, v0.h[0]
        smlal           v25.4s, v26.4h, v0.h[1]
        smlal           v25.4s, v27.4h, v0.h[2]
        smlal           v25.4s, v28.4h, v0.h[3]
        srshl           v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
        srshl           v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
        xtn             v24.4h, v24.4s
        xtn             v25.4h, v25.4s
        ret

80:
160:
320:
        b.gt            880f
        add             \xmy, \xmy, #2
        ld1             {v0.8b}, [\xmx]
        ld1             {v1.s}[0], [\xmy]
        sub             \src, \src, #6
        sub             \src, \src, \s_strd
        sxtl            v0.8h, v0.8b
        sxtl            v1.8h, v1.8b
        mov             x15, x30
        mov             \my, \h

164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1

        ld1             {v27.8h, v28.8h}, [\src], \s_strd
        smull           v24.4s, v27.4h, v0.h[0]
        smull2          v25.4s, v27.8h, v0.h[0]
.irpc i, 1234567
        ext             v26.16b, v27.16b, v28.16b, #(2*\i)
        smlal           v24.4s, v26.4h, v0.h[\i]
        smlal2          v25.4s, v26.8h, v0.h[\i]
.endr
        srshl           v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
        srshl           v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
        // The intermediates from the horizontal pass fit in 16 bit without
        // any bias; we could just as well keep them as .4s, but narrowing
        // them to .4h gives a significant speedup on out of order cores
        // (at the cost of a smaller slowdown on in-order cores such as A53),
        // and conserves register space (no need to clobber v8-v15).
        xtn             v16.4h, v24.4s
        xtn2            v16.8h, v25.4s

        bl              L(\type\()_8tap_filter_8)
        mov             v17.16b, v23.16b
        mov             v18.16b, v24.16b
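        // Register use in the 8: loop below: v0/v1 hold the horizontal/
        // vertical coefficients, v16-v18 the three most recent
        // horizontal outputs, and each call to L(\type\()_8tap_filter_8)
        // delivers two fresh rows in v23/v24, so the 4-tap vertical
        // filter yields two output rows per iteration.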
8:
        smull           v2.4s, v16.4h, v1.h[0]
        smull2          v3.4s, v16.8h, v1.h[0]
        bl              L(\type\()_8tap_filter_8)
        smull           v4.4s, v17.4h, v1.h[0]
        smull2          v5.4s, v17.8h, v1.h[0]
        smlal           v2.4s, v17.4h, v1.h[1]
        smlal2          v3.4s, v17.8h, v1.h[1]
        smlal           v4.4s, v18.4h, v1.h[1]
        smlal2          v5.4s, v18.8h, v1.h[1]
        smlal           v2.4s, v18.4h, v1.h[2]
        smlal2          v3.4s, v18.8h, v1.h[2]
        smlal           v4.4s, v23.4h, v1.h[2]
        smlal2          v5.4s, v23.8h, v1.h[2]
        smlal           v2.4s, v23.4h, v1.h[3]
        smlal2          v3.4s, v23.8h, v1.h[3]
        smlal           v4.4s, v24.4h, v1.h[3]
        smlal2          v5.4s, v24.8h, v1.h[3]
.ifc \type, put
        srshl           v2.4s, v2.4s, v29.4s  // -(6+intermediate_bits)
        srshl           v3.4s, v3.4s, v29.4s  // -(6+intermediate_bits)
        srshl           v4.4s, v4.4s, v29.4s  // -(6+intermediate_bits)
        srshl           v5.4s, v5.4s, v29.4s  // -(6+intermediate_bits)
        sqxtun          v2.4h, v2.4s
        sqxtun2         v2.8h, v3.4s
        sqxtun          v3.4h, v4.4s
        sqxtun2         v3.8h, v5.4s
        umin            v2.8h, v2.8h, v31.8h
        umin            v3.8h, v3.8h, v31.8h
.else
        rshrn           v2.4h, v2.4s, #6
        rshrn2          v2.8h, v3.4s, #6
        rshrn           v3.4h, v4.4s, #6
        rshrn2          v3.8h, v5.4s, #6
        sub             v2.8h, v2.8h, v29.8h  // PREP_BIAS
        sub             v3.8h, v3.8h, v29.8h  // PREP_BIAS
.endif
        subs            \h, \h, #2
        st1             {v2.8h}, [\dst], \d_strd
        st1             {v3.8h}, [\ds2], \d_strd
        b.le            9f
        mov             v16.16b, v18.16b
        mov             v17.16b, v23.16b
        mov             v18.16b, v24.16b
        b               8b
9:
        subs            \w, \w, #8
        b.le            0f
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #2
        mov             \h, \my
        add             \src, \src, #16
        add             \dst, \dst, #16
        b               164b

880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
640:
1280:
        ld1             {v0.8b}, [\xmx]
        ld1             {v1.8b}, [\xmy]
        sub             \src, \src, #6
        sub             \src, \src, \s_strd
        sub             \src, \src, \s_strd, lsl #1
        sxtl            v0.8h, v0.8b
        sxtl            v1.8h, v1.8b
        mov             x15, x30
        mov             \my, \h

168:
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1

        ld1             {v27.8h, v28.8h}, [\src], \s_strd
        smull           v24.4s, v27.4h, v0.h[0]
        smull2          v25.4s, v27.8h, v0.h[0]
.irpc i, 1234567
        ext             v26.16b, v27.16b, v28.16b, #(2*\i)
        smlal           v24.4s, v26.4h, v0.h[\i]
        smlal2          v25.4s, v26.8h, v0.h[\i]
.endr
        srshl           v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
        srshl           v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
        // The intermediates from the horizontal pass fit in 16 bit without
        // any bias; we could just as well keep them as .4s, but narrowing
        // them to .4h gives a significant speedup on out of order cores
        // (at the cost of a smaller slowdown on in-order cores such as A53),
        // and conserves register space (no need to clobber v8-v15).
        xtn             v16.4h, v24.4s
        xtn2            v16.8h, v25.4s

        bl              L(\type\()_8tap_filter_8)
        mov             v17.16b, v23.16b
        mov             v18.16b, v24.16b
        bl              L(\type\()_8tap_filter_8)
        mov             v19.16b, v23.16b
        mov             v20.16b, v24.16b
        bl              L(\type\()_8tap_filter_8)
        mov             v21.16b, v23.16b
        mov             v22.16b, v24.16b

88:
        smull           v2.4s, v16.4h, v1.h[0]
        smull2          v3.4s, v16.8h, v1.h[0]
        bl              L(\type\()_8tap_filter_8)
        smull           v4.4s, v17.4h, v1.h[0]
        smull2          v5.4s, v17.8h, v1.h[0]
        smlal           v2.4s, v17.4h, v1.h[1]
        smlal2          v3.4s, v17.8h, v1.h[1]
        smlal           v4.4s, v18.4h, v1.h[1]
        smlal2          v5.4s, v18.8h, v1.h[1]
        smlal           v2.4s, v18.4h, v1.h[2]
        smlal2          v3.4s, v18.8h, v1.h[2]
        smlal           v4.4s, v19.4h, v1.h[2]
        smlal2          v5.4s, v19.8h, v1.h[2]
        smlal           v2.4s, v19.4h, v1.h[3]
        smlal2          v3.4s, v19.8h, v1.h[3]
        smlal           v4.4s, v20.4h, v1.h[3]
        smlal2          v5.4s, v20.8h, v1.h[3]
        smlal           v2.4s, v20.4h, v1.h[4]
        smlal2          v3.4s, v20.8h, v1.h[4]
        smlal           v4.4s, v21.4h, v1.h[4]
        smlal2          v5.4s, v21.8h, v1.h[4]
        smlal           v2.4s, v21.4h, v1.h[5]
        smlal2          v3.4s, v21.8h, v1.h[5]
        smlal           v4.4s, v22.4h, v1.h[5]
        smlal2          v5.4s, v22.8h, v1.h[5]
        smlal           v2.4s, v22.4h, v1.h[6]
        smlal2          v3.4s, v22.8h, v1.h[6]
        smlal           v4.4s, v23.4h, v1.h[6]
        smlal2          v5.4s, v23.8h, v1.h[6]
        smlal           v2.4s, v23.4h, v1.h[7]
        smlal2          v3.4s, v23.8h, v1.h[7]
        smlal           v4.4s, v24.4h, v1.h[7]
        smlal2          v5.4s, v24.8h, v1.h[7]
.ifc \type, put
        srshl           v2.4s, v2.4s, v29.4s  // -(6+intermediate_bits)
        srshl           v3.4s, v3.4s, v29.4s  // -(6+intermediate_bits)
        srshl           v4.4s, v4.4s, v29.4s  // -(6+intermediate_bits)
        srshl           v5.4s, v5.4s, v29.4s  // -(6+intermediate_bits)
        sqxtun          v2.4h, v2.4s
        sqxtun2         v2.8h, v3.4s
        sqxtun          v3.4h, v4.4s
        sqxtun2         v3.8h, v5.4s
        umin            v2.8h, v2.8h, v31.8h
        umin            v3.8h, v3.8h, v31.8h
.else
        rshrn           v2.4h, v2.4s, #6
        rshrn2          v2.8h, v3.4s, #6
        rshrn           v3.4h, v4.4s, #6
        rshrn2          v3.8h, v5.4s, #6
        sub             v2.8h, v2.8h, v29.8h  // PREP_BIAS
        sub             v3.8h, v3.8h, v29.8h  // PREP_BIAS
.endif
        subs            \h, \h, #2
        st1             {v2.8h}, [\dst], \d_strd
        st1             {v3.8h}, [\ds2], \d_strd
        b.le            9f
        mov             v16.16b, v18.16b
        mov             v17.16b, v19.16b
        mov             v18.16b, v20.16b
        mov             v19.16b, v21.16b
        mov             v20.16b, v22.16b
        mov             v21.16b, v23.16b
        mov             v22.16b, v24.16b
        b               88b
9:
        subs            \w, \w, #8
        b.le            0f
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #3
        mov             \h, \my
        add             \src, \src, #16
        add             \dst, \dst, #16
        b               168b
0:
        br              x15
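        // L(\type\()_8tap_filter_8) below horizontally filters two new
        // 8-pixel rows with the full 8-tap kernel; the .irpc block
        // unrolls taps 1-7 as ext+smlal pairs. The rows are returned,
        // narrowed to 16 bit, in v23.8h and v24.8h (v4-v7 and v25-v28
        // are clobbered).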
L(\type\()_8tap_filter_8):
        ld1             {v4.8h, v5.8h}, [\sr2], \s_strd
        ld1             {v6.8h, v7.8h}, [\src], \s_strd
        smull           v25.4s, v4.4h, v0.h[0]
        smull2          v26.4s, v4.8h, v0.h[0]
        smull           v27.4s, v6.4h, v0.h[0]
        smull2          v28.4s, v6.8h, v0.h[0]
.irpc i, 1234567
        ext             v23.16b, v4.16b, v5.16b, #(2*\i)
        ext             v24.16b, v6.16b, v7.16b, #(2*\i)
        smlal           v25.4s, v23.4h, v0.h[\i]
        smlal2          v26.4s, v23.8h, v0.h[\i]
        smlal           v27.4s, v24.4h, v0.h[\i]
        smlal2          v28.4s, v24.8h, v0.h[\i]
.endr
        srshl           v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
        srshl           v26.4s, v26.4s, v30.4s // -(6-intermediate_bits)
        srshl           v27.4s, v27.4s, v30.4s // -(6-intermediate_bits)
        srshl           v28.4s, v28.4s, v30.4s // -(6-intermediate_bits)
        xtn             v23.4h, v25.4s
        xtn2            v23.8h, v26.4s
        xtn             v24.4h, v27.4s
        xtn2            v24.8h, v28.4s
        ret

L(\type\()_8tap_hv_tbl):
        .hword L(\type\()_8tap_hv_tbl) - 1280b
        .hword L(\type\()_8tap_hv_tbl) - 640b
        .hword L(\type\()_8tap_hv_tbl) - 320b
        .hword L(\type\()_8tap_hv_tbl) - 160b
        .hword L(\type\()_8tap_hv_tbl) - 80b
        .hword L(\type\()_8tap_hv_tbl) - 40b
        .hword L(\type\()_8tap_hv_tbl) - 20b
        .hword 0
endfunc


function \type\()_bilin_16bpc_neon, export=1
.ifc \bdmax, w8
        ldr             w8, [sp]
.endif
        dup             v1.8h, \mx
        dup             v3.8h, \my
        mov             w10, #16
        sub             w9, w10, \mx
        sub             w10, w10, \my
        dup             v0.8h, w9
        dup             v2.8h, w10
.ifc \type, prep
        uxtw            \d_strd, \w
        lsl             \d_strd, \d_strd, #1
.endif

        clz             \bdmax, \bdmax        // bitdepth_max
        clz             w9, \w
        sub             \bdmax, \bdmax, #18   // intermediate_bits = clz(bitdepth_max) - 18
        mov             w11, #4
        sub             w9, w9, #24
        sub             w11, w11, \bdmax      // 4 - intermediate_bits
        add             w12, \bdmax, #4       // 4 + intermediate_bits
        cbnz            \mx, L(\type\()_bilin_h)
        cbnz            \my, L(\type\()_bilin_v)
        b               \type\()_neon

L(\type\()_bilin_h):
        cbnz            \my, L(\type\()_bilin_hv)

        adr             x10, L(\type\()_bilin_h_tbl)
        dup             v31.8h, w11           // 4 - intermediate_bits
        ldrh            w9, [x10, x9, lsl #1]
        neg             v31.8h, v31.8h        // -(4-intermediate_bits)
.ifc \type, put
        dup             v30.8h, \bdmax        // intermediate_bits
.else
        movi            v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
        sub             x10, x10, w9, uxtw
.ifc \type, put
        neg             v30.8h, v30.8h        // -intermediate_bits
.endif
        br              x10
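        // Per output pixel, bilinear horizontal filtering computes
        // roughly ('px' is an illustrative name):
        //
        //   int t = (16 - mx) * px[x] + mx * px[x + 1];
        //   // put:  two rounding right shifts, by 4-intermediate_bits
        //   //       and then by intermediate_bits (4 in total)
        //   // prep: rounding right shift by 4-intermediate_bits,
        //   //       then subtract PREP_BIAS
        //
        // with the 16-mx and mx weights broadcast into v0 and v1.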
20: // 2xN h
.ifc \type, put
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1
2:
        ld1             {v4.4h}, [\src], \s_strd
        ld1             {v6.4h}, [\sr2], \s_strd
        ext             v5.8b, v4.8b, v4.8b, #2
        ext             v7.8b, v6.8b, v6.8b, #2
        trn1            v4.2s, v4.2s, v6.2s
        trn1            v5.2s, v5.2s, v7.2s
        subs            \h, \h, #2
        mul             v4.4h, v4.4h, v0.4h
        mla             v4.4h, v5.4h, v1.4h
        urshl           v4.4h, v4.4h, v31.4h
        urshl           v4.4h, v4.4h, v30.4h
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
        b.gt            2b
        ret
.endif

40: // 4xN h
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1
4:
        ld1             {v4.8h}, [\src], \s_strd
        ld1             {v6.8h}, [\sr2], \s_strd
        ext             v5.16b, v4.16b, v4.16b, #2
        ext             v7.16b, v6.16b, v6.16b, #2
        trn1            v4.2d, v4.2d, v6.2d
        trn1            v5.2d, v5.2d, v7.2d
        subs            \h, \h, #2
        mul             v4.8h, v4.8h, v0.8h
        mla             v4.8h, v5.8h, v1.8h
        urshl           v4.8h, v4.8h, v31.8h
.ifc \type, put
        urshl           v4.8h, v4.8h, v30.8h
.else
        sub             v4.8h, v4.8h, v29.8h
.endif
        st1             {v4.d}[0], [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
        b.gt            4b
        ret

80: // 8xN h
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1
8:
        ldr             h5, [\src, #16]
        ldr             h7, [\sr2, #16]
        ld1             {v4.8h}, [\src], \s_strd
        ld1             {v6.8h}, [\sr2], \s_strd
        ext             v5.16b, v4.16b, v5.16b, #2
        ext             v7.16b, v6.16b, v7.16b, #2
        subs            \h, \h, #2
        mul             v4.8h, v4.8h, v0.8h
        mla             v4.8h, v5.8h, v1.8h
        mul             v6.8h, v6.8h, v0.8h
        mla             v6.8h, v7.8h, v1.8h
        urshl           v4.8h, v4.8h, v31.8h
        urshl           v6.8h, v6.8h, v31.8h
.ifc \type, put
        urshl           v4.8h, v4.8h, v30.8h
        urshl           v6.8h, v6.8h, v30.8h
.else
        sub             v4.8h, v4.8h, v29.8h
        sub             v6.8h, v6.8h, v29.8h
.endif
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v6.8h}, [\ds2], \d_strd
        b.gt            8b
        ret
160:
320:
640:
1280: // 16xN, 32xN, ... h
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1

        sub             \s_strd, \s_strd, \w, uxtw #1
        sub             \s_strd, \s_strd, #16
.ifc \type, put
        lsl             \d_strd, \d_strd, #1
        sub             \d_strd, \d_strd, \w, uxtw #1
.endif
161:
        ld1             {v16.8h}, [\src], #16
        ld1             {v21.8h}, [\sr2], #16
        mov             \mx, \w

16:
        ld1             {v17.8h, v18.8h}, [\src], #32
        ld1             {v22.8h, v23.8h}, [\sr2], #32
        ext             v19.16b, v16.16b, v17.16b, #2
        ext             v20.16b, v17.16b, v18.16b, #2
        ext             v24.16b, v21.16b, v22.16b, #2
        ext             v25.16b, v22.16b, v23.16b, #2
        mul             v16.8h, v16.8h, v0.8h
        mla             v16.8h, v19.8h, v1.8h
        mul             v17.8h, v17.8h, v0.8h
        mla             v17.8h, v20.8h, v1.8h
        mul             v21.8h, v21.8h, v0.8h
        mla             v21.8h, v24.8h, v1.8h
        mul             v22.8h, v22.8h, v0.8h
        mla             v22.8h, v25.8h, v1.8h
        urshl           v16.8h, v16.8h, v31.8h
        urshl           v17.8h, v17.8h, v31.8h
        urshl           v21.8h, v21.8h, v31.8h
        urshl           v22.8h, v22.8h, v31.8h
        subs            \mx, \mx, #16
.ifc \type, put
        urshl           v16.8h, v16.8h, v30.8h
        urshl           v17.8h, v17.8h, v30.8h
        urshl           v21.8h, v21.8h, v30.8h
        urshl           v22.8h, v22.8h, v30.8h
.else
        sub             v16.8h, v16.8h, v29.8h
        sub             v17.8h, v17.8h, v29.8h
        sub             v21.8h, v21.8h, v29.8h
        sub             v22.8h, v22.8h, v29.8h
.endif
        st1             {v16.8h, v17.8h}, [\dst], #32
        st1             {v21.8h, v22.8h}, [\ds2], #32
        b.le            9f

        mov             v16.16b, v18.16b
        mov             v21.16b, v23.16b
        b               16b

9:
        add             \dst, \dst, \d_strd
        add             \ds2, \ds2, \d_strd
        add             \src, \src, \s_strd
        add             \sr2, \sr2, \s_strd

        subs            \h, \h, #2
        b.gt            161b
        ret

L(\type\()_bilin_h_tbl):
        .hword L(\type\()_bilin_h_tbl) - 1280b
        .hword L(\type\()_bilin_h_tbl) - 640b
        .hword L(\type\()_bilin_h_tbl) - 320b
        .hword L(\type\()_bilin_h_tbl) - 160b
        .hword L(\type\()_bilin_h_tbl) - 80b
        .hword L(\type\()_bilin_h_tbl) - 40b
        .hword L(\type\()_bilin_h_tbl) - 20b
        .hword 0

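        // The vertical-only path mirrors the horizontal one with the
        // 16-my/my weights in v2/v3; roughly, per pixel:
        //
        //   int t = (16 - my) * px[y][x] + my * px[y + 1][x];
        //   // put:  (t + 8) >> 4 (urshr #4)
        //   // prep: rounding right shift by 4-intermediate_bits,
        //   //       then subtract PREP_BIAS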
L(\type\()_bilin_v):
        cmp             \h, #4
        adr             x10, L(\type\()_bilin_v_tbl)
.ifc \type, prep
        dup             v31.8h, w11           // 4 - intermediate_bits
.endif
        ldrh            w9, [x10, x9, lsl #1]
.ifc \type, prep
        movi            v29.8h, #(PREP_BIAS >> 8), lsl #8
        neg             v31.8h, v31.8h        // -(4-intermediate_bits)
.endif
        sub             x10, x10, w9, uxtw
        br              x10

20: // 2xN v
.ifc \type, put
        cmp             \h, #2
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        // 2x2 v
        ld1             {v16.s}[0], [\src], \s_strd
        b.gt            24f
        ld1             {v17.s}[0], [\sr2], \s_strd
        ld1             {v18.s}[0], [\src], \s_strd
        trn1            v16.2s, v16.2s, v17.2s
        trn1            v17.2s, v17.2s, v18.2s
        mul             v4.4h, v16.4h, v2.4h
        mla             v4.4h, v17.4h, v3.4h
        urshr           v4.8h, v4.8h, #4
        st1             {v4.s}[0], [\dst]
        st1             {v4.s}[1], [\ds2]
        ret
24: // 2x4, 2x8, ... v
        ld1             {v17.s}[0], [\sr2], \s_strd
        ld1             {v18.s}[0], [\src], \s_strd
        ld1             {v19.s}[0], [\sr2], \s_strd
        ld1             {v20.s}[0], [\src], \s_strd
        trn1            v16.2s, v16.2s, v17.2s
        trn1            v17.2s, v17.2s, v18.2s
        trn1            v18.2s, v18.2s, v19.2s
        trn1            v19.2s, v19.2s, v20.2s
        trn1            v16.2d, v16.2d, v18.2d
        trn1            v17.2d, v17.2d, v19.2d
        mul             v4.8h, v16.8h, v2.8h
        mla             v4.8h, v17.8h, v3.8h
        subs            \h, \h, #4
        urshr           v4.8h, v4.8h, #4
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
        st1             {v4.s}[2], [\dst], \d_strd
        st1             {v4.s}[3], [\ds2], \d_strd
        b.le            0f
        mov             v16.8b, v20.8b
        b               24b
0:
        ret
.endif

40: // 4xN v
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        ld1             {v16.4h}, [\src], \s_strd
4:
        ld1             {v17.4h}, [\sr2], \s_strd
        ld1             {v18.4h}, [\src], \s_strd
        trn1            v16.2d, v16.2d, v17.2d
        trn1            v17.2d, v17.2d, v18.2d
        mul             v4.8h, v16.8h, v2.8h
        mla             v4.8h, v17.8h, v3.8h
        subs            \h, \h, #2
.ifc \type, put
        urshr           v4.8h, v4.8h, #4
.else
        urshl           v4.8h, v4.8h, v31.8h
        sub             v4.8h, v4.8h, v29.8h
.endif
        st1             {v4.d}[0], [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
        b.le            0f
        mov             v16.8b, v18.8b
        b               4b
0:
        ret

80: // 8xN v
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        ld1             {v16.8h}, [\src], \s_strd
8:
        ld1             {v17.8h}, [\sr2], \s_strd
        ld1             {v18.8h}, [\src], \s_strd
        mul             v4.8h, v16.8h, v2.8h
        mla             v4.8h, v17.8h, v3.8h
        mul             v5.8h, v17.8h, v2.8h
        mla             v5.8h, v18.8h, v3.8h
        subs            \h, \h, #2
.ifc \type, put
        urshr           v4.8h, v4.8h, #4
        urshr           v5.8h, v5.8h, #4
.else
        urshl           v4.8h, v4.8h, v31.8h
        urshl           v5.8h, v5.8h, v31.8h
        sub             v4.8h, v4.8h, v29.8h
        sub             v5.8h, v5.8h, v29.8h
.endif
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v5.8h}, [\ds2], \d_strd
        b.le            0f
        mov             v16.16b, v18.16b
        b               8b
0:
        ret

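        // 16xN and wider vertical blocks are processed as 16-pixel
        // column strips, two rows per iteration; after each strip, the
        // 9: block rewinds src/dst by h rows and steps 32 bytes to the
        // right.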
160: // 16xN, 32xN, ...
320:
640:
1280:
        mov             \my, \h
1:
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ld1             {v16.8h, v17.8h}, [\src], \s_strd
2:
        ld1             {v18.8h, v19.8h}, [\sr2], \s_strd
        ld1             {v20.8h, v21.8h}, [\src], \s_strd
        mul             v4.8h, v16.8h, v2.8h
        mla             v4.8h, v18.8h, v3.8h
        mul             v5.8h, v17.8h, v2.8h
        mla             v5.8h, v19.8h, v3.8h
        mul             v6.8h, v18.8h, v2.8h
        mla             v6.8h, v20.8h, v3.8h
        mul             v7.8h, v19.8h, v2.8h
        mla             v7.8h, v21.8h, v3.8h
        subs            \h, \h, #2
.ifc \type, put
        urshr           v4.8h, v4.8h, #4
        urshr           v5.8h, v5.8h, #4
        urshr           v6.8h, v6.8h, #4
        urshr           v7.8h, v7.8h, #4
.else
        urshl           v4.8h, v4.8h, v31.8h
        urshl           v5.8h, v5.8h, v31.8h
        urshl           v6.8h, v6.8h, v31.8h
        urshl           v7.8h, v7.8h, v31.8h
        sub             v4.8h, v4.8h, v29.8h
        sub             v5.8h, v5.8h, v29.8h
        sub             v6.8h, v6.8h, v29.8h
        sub             v7.8h, v7.8h, v29.8h
.endif
        st1             {v4.8h, v5.8h}, [\dst], \d_strd
        st1             {v6.8h, v7.8h}, [\ds2], \d_strd
        b.le            9f
        mov             v16.16b, v20.16b
        mov             v17.16b, v21.16b
        b               2b
9:
        subs            \w, \w, #16
        b.le            0f
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #1
        mov             \h, \my
        add             \src, \src, #32
        add             \dst, \dst, #32
        b               1b
0:
        ret

L(\type\()_bilin_v_tbl):
        .hword L(\type\()_bilin_v_tbl) - 1280b
        .hword L(\type\()_bilin_v_tbl) - 640b
        .hword L(\type\()_bilin_v_tbl) - 320b
        .hword L(\type\()_bilin_v_tbl) - 160b
        .hword L(\type\()_bilin_v_tbl) - 80b
        .hword L(\type\()_bilin_v_tbl) - 40b
        .hword L(\type\()_bilin_v_tbl) - 20b
        .hword 0

L(\type\()_bilin_hv):
        adr             x10, L(\type\()_bilin_hv_tbl)
        dup             v31.8h, w11           // 4 - intermediate_bits
        ldrh            w9, [x10, x9, lsl #1]
        neg             v31.8h, v31.8h        // -(4-intermediate_bits)
.ifc \type, put
        dup             v30.4s, w12           // 4 + intermediate_bits
.else
        movi            v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
        sub             x10, x10, w9, uxtw
.ifc \type, put
        neg             v30.4s, v30.4s        // -(4+intermediate_bits)
.endif
        br              x10

20: // 2xN hv
.ifc \type, put
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ld1             {v20.4h}, [\src], \s_strd
        ext             v21.8b, v20.8b, v20.8b, #2
        mul             v16.4h, v20.4h, v0.4h
        mla             v16.4h, v21.4h, v1.4h
        urshl           v16.4h, v16.4h, v31.4h

2:
        ld1             {v22.4h}, [\sr2], \s_strd
        ld1             {v24.4h}, [\src], \s_strd
        ext             v23.8b, v22.8b, v22.8b, #2
        ext             v25.8b, v24.8b, v24.8b, #2
        trn1            v22.2s, v22.2s, v24.2s
        trn1            v23.2s, v23.2s, v25.2s
        mul             v17.4h, v22.4h, v0.4h
        mla             v17.4h, v23.4h, v1.4h
        urshl           v17.4h, v17.4h, v31.4h

        trn1            v16.2s, v16.2s, v17.2s

        umull           v4.4s, v16.4h, v2.4h
        umlal           v4.4s, v17.4h, v3.4h
        urshl           v4.4s, v4.4s, v30.4s
        xtn             v4.4h, v4.4s
        subs            \h, \h, #2
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
        b.le            0f
        trn2            v16.2s, v17.2s, v17.2s
        b               2b
0:
        ret
.endif

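        // The hv paths filter horizontally first (keeping the previous
        // row's horizontal output in v16 across iterations so each input
        // row is only filtered once) and then blend vertically; roughly,
        // per pixel ('mid' is an illustrative name):
        //
        //   mid[y][x] = hfilter(y, x) >> (4 - intermediate_bits); // rounding
        //   int t = (16 - my) * mid[y][x] + my * mid[y + 1][x];
        //   // put:  rounding right shift by 4+intermediate_bits
        //   // prep: (t + 8) >> 4, then subtract PREP_BIAS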
40: // 4xN hv
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ld1             {v20.8h}, [\src], \s_strd
        ext             v21.16b, v20.16b, v20.16b, #2
        mul             v16.4h, v20.4h, v0.4h
        mla             v16.4h, v21.4h, v1.4h
        urshl           v16.4h, v16.4h, v31.4h

4:
        ld1             {v22.8h}, [\sr2], \s_strd
        ld1             {v24.8h}, [\src], \s_strd
        ext             v23.16b, v22.16b, v22.16b, #2
        ext             v25.16b, v24.16b, v24.16b, #2
        trn1            v22.2d, v22.2d, v24.2d
        trn1            v23.2d, v23.2d, v25.2d
        mul             v17.8h, v22.8h, v0.8h
        mla             v17.8h, v23.8h, v1.8h
        urshl           v17.8h, v17.8h, v31.8h

        trn1            v16.2d, v16.2d, v17.2d

        umull           v4.4s, v16.4h, v2.4h
        umlal           v4.4s, v17.4h, v3.4h
        umull2          v5.4s, v16.8h, v2.8h
        umlal2          v5.4s, v17.8h, v3.8h
.ifc \type, put
        urshl           v4.4s, v4.4s, v30.4s
        urshl           v5.4s, v5.4s, v30.4s
        xtn             v4.4h, v4.4s
        xtn2            v4.8h, v5.4s
.else
        rshrn           v4.4h, v4.4s, #4
        rshrn2          v4.8h, v5.4s, #4
        sub             v4.8h, v4.8h, v29.8h
.endif
        subs            \h, \h, #2
        st1             {v4.d}[0], [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
        b.le            0f
        trn2            v16.2d, v17.2d, v17.2d
        b               4b
0:
        ret

80: // 8xN, 16xN, ... hv
160:
320:
640:
1280:
        mov             \my, \h

1:
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ldr             h21, [\src, #16]
        ld1             {v20.8h}, [\src], \s_strd
        ext             v21.16b, v20.16b, v21.16b, #2
        mul             v16.8h, v20.8h, v0.8h
        mla             v16.8h, v21.8h, v1.8h
        urshl           v16.8h, v16.8h, v31.8h

2:
        ldr             h23, [\sr2, #16]
        ld1             {v22.8h}, [\sr2], \s_strd
        ldr             h25, [\src, #16]
        ld1             {v24.8h}, [\src], \s_strd
        ext             v23.16b, v22.16b, v23.16b, #2
        ext             v25.16b, v24.16b, v25.16b, #2
        mul             v17.8h, v22.8h, v0.8h
        mla             v17.8h, v23.8h, v1.8h
        mul             v18.8h, v24.8h, v0.8h
        mla             v18.8h, v25.8h, v1.8h
        urshl           v17.8h, v17.8h, v31.8h
        urshl           v18.8h, v18.8h, v31.8h

        umull           v4.4s, v16.4h, v2.4h
        umlal           v4.4s, v17.4h, v3.4h
        umull2          v5.4s, v16.8h, v2.8h
        umlal2          v5.4s, v17.8h, v3.8h
        umull           v6.4s, v17.4h, v2.4h
        umlal           v6.4s, v18.4h, v3.4h
        umull2          v7.4s, v17.8h, v2.8h
        umlal2          v7.4s, v18.8h, v3.8h
.ifc \type, put
        urshl           v4.4s, v4.4s, v30.4s
        urshl           v5.4s, v5.4s, v30.4s
        urshl           v6.4s, v6.4s, v30.4s
        urshl           v7.4s, v7.4s, v30.4s
        xtn             v4.4h, v4.4s
        xtn2            v4.8h, v5.4s
        xtn             v5.4h, v6.4s
        xtn2            v5.8h, v7.4s
.else
        rshrn           v4.4h, v4.4s, #4
        rshrn2          v4.8h, v5.4s, #4
        rshrn           v5.4h, v6.4s, #4
        rshrn2          v5.8h, v7.4s, #4
        sub             v4.8h, v4.8h, v29.8h
        sub             v5.8h, v5.8h, v29.8h
.endif
        subs            \h, \h, #2
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v5.8h}, [\ds2], \d_strd
        b.le            9f
        mov             v16.16b, v18.16b
        b               2b
9:
        subs            \w, \w, #8
        b.le            0f
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #1
        mov             \h, \my
        add             \src, \src, #16
        add             \dst, \dst, #16
        b               1b
0:
        ret

L(\type\()_bilin_hv_tbl):
        .hword L(\type\()_bilin_hv_tbl) - 1280b
        .hword L(\type\()_bilin_hv_tbl) - 640b
        .hword L(\type\()_bilin_hv_tbl) - 320b
        .hword L(\type\()_bilin_hv_tbl) - 160b
        .hword L(\type\()_bilin_hv_tbl) - 80b
        .hword L(\type\()_bilin_hv_tbl) - 40b
        .hword L(\type\()_bilin_hv_tbl) - 20b
        .hword 0
endfunc
.endm

filter_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10

.macro load_filter_row dst, src, inc
        asr             w13, \src, #10
        ldr             \dst, [x11, w13, sxtw #3]
        add             \src, \src, \inc
.endm

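// load_filter_row selects one 8-tap filter per output pixel: \src holds
// a fixed-point position whose top bits (asr #10) index mc_warp_filter
// at 8 bytes per filter (x11 is pre-biased by 64 filters via the
// movrel offset below, so signed indices land inside the table), and
// \src then advances by the per-pixel delta \inc.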
function warp_filter_horz_neon
        add             w12, w5, #512

        ld1             {v16.8h, v17.8h}, [x2], x3

        load_filter_row d0, w12, w7
        load_filter_row d1, w12, w7
        load_filter_row d2, w12, w7
        sxtl            v0.8h, v0.8b
        load_filter_row d3, w12, w7
        sxtl            v1.8h, v1.8b
        load_filter_row d4, w12, w7
        sxtl            v2.8h, v2.8b
        load_filter_row d5, w12, w7
        sxtl            v3.8h, v3.8b
        load_filter_row d6, w12, w7
        sxtl            v4.8h, v4.8b
        load_filter_row d7, w12, w7
        sxtl            v5.8h, v5.8b
        ext             v18.16b, v16.16b, v17.16b, #2*1
        smull           v8.4s, v16.4h, v0.4h
        smull2          v9.4s, v16.8h, v0.8h
        sxtl            v6.8h, v6.8b
        ext             v19.16b, v16.16b, v17.16b, #2*2
        smull           v10.4s, v18.4h, v1.4h
        smull2          v11.4s, v18.8h, v1.8h
        sxtl            v7.8h, v7.8b
        ext             v20.16b, v16.16b, v17.16b, #2*3
        smull           v0.4s, v19.4h, v2.4h
        smull2          v1.4s, v19.8h, v2.8h
        ext             v21.16b, v16.16b, v17.16b, #2*4
        addp            v8.4s, v8.4s, v9.4s
        smull           v2.4s, v20.4h, v3.4h
        smull2          v3.4s, v20.8h, v3.8h
        ext             v22.16b, v16.16b, v17.16b, #2*5
        addp            v9.4s, v10.4s, v11.4s
        smull           v10.4s, v21.4h, v4.4h
        smull2          v11.4s, v21.8h, v4.8h
        ext             v23.16b, v16.16b, v17.16b, #2*6
        addp            v0.4s, v0.4s, v1.4s
        smull           v18.4s, v22.4h, v5.4h
        smull2          v19.4s, v22.8h, v5.8h
        ext             v16.16b, v16.16b, v17.16b, #2*7
        addp            v1.4s, v2.4s, v3.4s
        addp            v2.4s, v10.4s, v11.4s
        smull           v20.4s, v23.4h, v6.4h
        smull2          v21.4s, v23.8h, v6.8h
        addp            v3.4s, v18.4s, v19.4s
        smull           v22.4s, v16.4h, v7.4h
        smull2          v23.4s, v16.8h, v7.8h
        addp            v4.4s, v20.4s, v21.4s
        addp            v5.4s, v22.4s, v23.4s

        addp            v8.4s, v8.4s, v9.4s
        addp            v0.4s, v0.4s, v1.4s
        addp            v2.4s, v2.4s, v3.4s
        addp            v4.4s, v4.4s, v5.4s

        addp            v16.4s, v8.4s, v0.4s
        addp            v17.4s, v2.4s, v4.4s

        add             w5, w5, w8

        srshl           v16.4s, v16.4s, v14.4s // -(7 - intermediate_bits)
        srshl           v17.4s, v17.4s, v14.4s // -(7 - intermediate_bits)

        ret
endfunc

// void dav1d_warp_affine_8x8_16bpc_neon(
//         pixel *dst, const ptrdiff_t dst_stride,
//         const pixel *src, const ptrdiff_t src_stride,
//         const int16_t *const abcd, int mx, int my,
//         const int bitdepth_max)
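//
// The 8x8 warp consumes a window of 15 input rows: the first 7 rows are
// filtered horizontally up front (below), and the 1: loop then filters
// one additional row per iteration, applies per-pixel 8-tap vertical
// filters across v24-v31 (reselected each row from the my accumulator
// in w6), and shifts the row window down by one.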
.macro warp t
function warp_affine_8x8\t\()_16bpc_neon, export=1
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

.ifb \t
        dup             v15.8h, w7            // bitdepth_max
.else
        movi            v15.8h, #(PREP_BIAS >> 8), lsl #8
.endif
        clz             w7, w7
        // intermediate_bits = clz(bitdepth_max) - 18
.ifb \t
        sub             w8, w7, #11           // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7
.endif
        sub             w7, w7, #25           // -(7 - intermediate_bits)
.ifb \t
        neg             w8, w8                // -(7 + intermediate_bits)
.endif
        dup             v14.4s, w7            // -(7 - intermediate_bits)
.ifb \t
        dup             v13.4s, w8            // -(7 + intermediate_bits)
.endif

        ldr             x4, [x4]
        sbfx            x7, x4, #0, #16
        sbfx            x8, x4, #16, #16
        sbfx            x9, x4, #32, #16
        sbfx            x4, x4, #48, #16
        mov             w10, #8
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3
        sub             x2, x2, #6
        movrel          x11, X(mc_warp_filter), 64*8
        mov             x15, x30
.ifnb \t
        lsl             x1, x1, #1
.endif

        bl              warp_filter_horz_neon
        xtn             v24.4h, v16.4s
        xtn2            v24.8h, v17.4s
        bl              warp_filter_horz_neon
        xtn             v25.4h, v16.4s
        xtn2            v25.8h, v17.4s
        bl              warp_filter_horz_neon
        xtn             v26.4h, v16.4s
        xtn2            v26.8h, v17.4s
        bl              warp_filter_horz_neon
        xtn             v27.4h, v16.4s
        xtn2            v27.8h, v17.4s
        bl              warp_filter_horz_neon
        xtn             v28.4h, v16.4s
        xtn2            v28.8h, v17.4s
        bl              warp_filter_horz_neon
        xtn             v29.4h, v16.4s
        xtn2            v29.8h, v17.4s
        bl              warp_filter_horz_neon
        xtn             v30.4h, v16.4s
        xtn2            v30.8h, v17.4s

1:
        add             w14, w6, #512
        bl              warp_filter_horz_neon
        xtn             v31.4h, v16.4s
        xtn2            v31.8h, v17.4s

        load_filter_row d0, w14, w9
        load_filter_row d1, w14, w9
        load_filter_row d2, w14, w9
        load_filter_row d3, w14, w9
        load_filter_row d4, w14, w9
        load_filter_row d5, w14, w9
        load_filter_row d6, w14, w9
        load_filter_row d7, w14, w9
        transpose_8x8b  v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
        sxtl            v0.8h, v0.8b
        sxtl            v1.8h, v1.8b
        sxtl            v2.8h, v2.8b
        sxtl            v3.8h, v3.8b
        sxtl            v4.8h, v4.8b
        sxtl            v5.8h, v5.8b
        sxtl            v6.8h, v6.8b
        sxtl            v7.8h, v7.8b

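        // After transpose_8x8b, lane x of v0..v7 holds tap 0..7 of the
        // filter selected for output column x, so the smull/smlal chain
        // below accumulates each column's 8-tap dot product in 32 bit.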
        // This ordering of smull/smlal/smull2/smlal2 is highly
        // beneficial for Cortex A53 here.
        smull           v16.4s, v24.4h, v0.4h
        smlal           v16.4s, v25.4h, v1.4h
        smlal           v16.4s, v26.4h, v2.4h
        smlal           v16.4s, v27.4h, v3.4h
        smlal           v16.4s, v28.4h, v4.4h
        smlal           v16.4s, v29.4h, v5.4h
        smlal           v16.4s, v30.4h, v6.4h
        smlal           v16.4s, v31.4h, v7.4h
        smull2          v17.4s, v24.8h, v0.8h
        smlal2          v17.4s, v25.8h, v1.8h
        smlal2          v17.4s, v26.8h, v2.8h
        smlal2          v17.4s, v27.8h, v3.8h
        smlal2          v17.4s, v28.8h, v4.8h
        smlal2          v17.4s, v29.8h, v5.8h
        smlal2          v17.4s, v30.8h, v6.8h
        smlal2          v17.4s, v31.8h, v7.8h

        mov             v24.16b, v25.16b
        mov             v25.16b, v26.16b
.ifb \t
        srshl           v16.4s, v16.4s, v13.4s // -(7 + intermediate_bits)
        srshl           v17.4s, v17.4s, v13.4s // -(7 + intermediate_bits)
.else
        rshrn           v16.4h, v16.4s, #7
        rshrn2          v16.8h, v17.4s, #7
.endif
        mov             v26.16b, v27.16b
.ifb \t
        sqxtun          v16.4h, v16.4s
        sqxtun2         v16.8h, v17.4s
.else
        sub             v16.8h, v16.8h, v15.8h // PREP_BIAS
.endif
        mov             v27.16b, v28.16b
        mov             v28.16b, v29.16b
.ifb \t
        umin            v16.8h, v16.8h, v15.8h // bitdepth_max
.endif
        mov             v29.16b, v30.16b
        mov             v30.16b, v31.16b
        subs            w10, w10, #1
        st1             {v16.8h}, [x0], x1

        add             w6, w6, w4
        b.gt            1b

        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40

        br              x15
endfunc
.endm

warp
warp t