/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Janne Grunau
 * Copyright © 2020, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

#define PREP_BIAS 8192

.macro avg d0, d1, t0, t1, t2, t3
        ld1             {\t0\().8h, \t1\().8h}, [x2], 32
        ld1             {\t2\().8h, \t3\().8h}, [x3], 32
        sqadd           \t0\().8h, \t0\().8h, \t2\().8h
        sqadd           \t1\().8h, \t1\().8h, \t3\().8h
        smax            \t0\().8h, \t0\().8h, v28.8h  // -2*PREP_BIAS - 1 << intermediate_bits
        smax            \t1\().8h, \t1\().8h, v28.8h  // -2*PREP_BIAS - 1 << intermediate_bits
        sqsub           \t0\().8h, \t0\().8h, v28.8h  // -2*PREP_BIAS - 1 << intermediate_bits
        sqsub           \t1\().8h, \t1\().8h, v28.8h  // -2*PREP_BIAS - 1 << intermediate_bits
        sshl            \d0\().8h, \t0\().8h, v29.8h  // -(intermediate_bits+1)
        sshl            \d1\().8h, \t1\().8h, v29.8h  // -(intermediate_bits+1)
.endm

.macro w_avg d0, d1, t0, t1, t2, t3
        ld1             {\t0\().8h, \t1\().8h}, [x2], 32
        ld1             {\t2\().8h, \t3\().8h}, [x3], 32
        // This difference requires a 17 bit range, and all bits are
        // significant for the following multiplication.
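        // With v27 holding the negated weight (set up in bidir_fn below),
        // the sequence works out to t2 + (((t1 - t2)*weight) >> 4), i.e.
        // (t1*weight + t2*(16 - weight)) >> 4, a weighted average of the
        // two intermediates in 1/16 units.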
        ssubl           \d0\().4s, \t2\().4h, \t0\().4h
        ssubl2          \t0\().4s, \t2\().8h, \t0\().8h
        ssubl           \d1\().4s, \t3\().4h, \t1\().4h
        ssubl2          \t1\().4s, \t3\().8h, \t1\().8h
        mul             \d0\().4s, \d0\().4s, v27.4s
        mul             \t0\().4s, \t0\().4s, v27.4s
        mul             \d1\().4s, \d1\().4s, v27.4s
        mul             \t1\().4s, \t1\().4s, v27.4s
        sshr            \d0\().4s, \d0\().4s, #4
        sshr            \t0\().4s, \t0\().4s, #4
        sshr            \d1\().4s, \d1\().4s, #4
        sshr            \t1\().4s, \t1\().4s, #4
        saddw           \d0\().4s, \d0\().4s, \t2\().4h
        saddw2          \t0\().4s, \t0\().4s, \t2\().8h
        saddw           \d1\().4s, \d1\().4s, \t3\().4h
        saddw2          \t1\().4s, \t1\().4s, \t3\().8h
        xtn             \d0\().4h, \d0\().4s
        xtn2            \d0\().8h, \t0\().4s
        xtn             \d1\().4h, \d1\().4s
        xtn2            \d1\().8h, \t1\().4s
        srshl           \d0\().8h, \d0\().8h, v29.8h  // -intermediate_bits
        srshl           \d1\().8h, \d1\().8h, v29.8h  // -intermediate_bits
        add             \d0\().8h, \d0\().8h, v28.8h  // PREP_BIAS >> intermediate_bits
        add             \d1\().8h, \d1\().8h, v28.8h  // PREP_BIAS >> intermediate_bits
        smin            \d0\().8h, \d0\().8h, v31.8h  // bitdepth_max
        smin            \d1\().8h, \d1\().8h, v31.8h  // bitdepth_max
        smax            \d0\().8h, \d0\().8h, v30.8h  // 0
        smax            \d1\().8h, \d1\().8h, v30.8h  // 0
.endm

.macro mask d0, d1, t0, t1, t2, t3
        ld1             {v27.16b}, [x6], 16
        ld1             {\t0\().8h, \t1\().8h}, [x2], 32
        neg             v27.16b, v27.16b
        ld1             {\t2\().8h, \t3\().8h}, [x3], 32
        sxtl            v26.8h, v27.8b
        sxtl2           v27.8h, v27.16b
        sxtl            v24.4s, v26.4h
        sxtl2           v25.4s, v26.8h
        sxtl            v26.4s, v27.4h
        sxtl2           v27.4s, v27.8h
        ssubl           \d0\().4s, \t2\().4h, \t0\().4h
        ssubl2          \t0\().4s, \t2\().8h, \t0\().8h
        ssubl           \d1\().4s, \t3\().4h, \t1\().4h
        ssubl2          \t1\().4s, \t3\().8h, \t1\().8h
        mul             \d0\().4s, \d0\().4s, v24.4s
        mul             \t0\().4s, \t0\().4s, v25.4s
        mul             \d1\().4s, \d1\().4s, v26.4s
        mul             \t1\().4s, \t1\().4s, v27.4s
        sshr            \d0\().4s, \d0\().4s, #6
        sshr            \t0\().4s, \t0\().4s, #6
        sshr            \d1\().4s, \d1\().4s, #6
        sshr            \t1\().4s, \t1\().4s, #6
        saddw           \d0\().4s, \d0\().4s, \t2\().4h
        saddw2          \t0\().4s, \t0\().4s, \t2\().8h
        saddw           \d1\().4s, \d1\().4s, \t3\().4h
        saddw2          \t1\().4s, \t1\().4s, \t3\().8h
        xtn             \d0\().4h, \d0\().4s
        xtn2            \d0\().8h, \t0\().4s
        xtn             \d1\().4h, \d1\().4s
        xtn2            \d1\().8h, \t1\().4s
        srshl           \d0\().8h, \d0\().8h, v29.8h  // -intermediate_bits
        srshl           \d1\().8h, \d1\().8h, v29.8h  // -intermediate_bits
        add             \d0\().8h, \d0\().8h, v28.8h  // PREP_BIAS >> intermediate_bits
        add             \d1\().8h, \d1\().8h, v28.8h  // PREP_BIAS >> intermediate_bits
        smin            \d0\().8h, \d0\().8h, v31.8h  // bitdepth_max
        smin            \d1\().8h, \d1\().8h, v31.8h  // bitdepth_max
        smax            \d0\().8h, \d0\().8h, v30.8h  // 0
        smax            \d1\().8h, \d1\().8h, v30.8h  // 0
.endm

.macro bidir_fn type, bdmax
function \type\()_16bpc_neon, export=1
        clz             w4, w4
.ifnc \type, avg
        dup             v31.8h, \bdmax  // bitdepth_max
        movi            v30.8h, #0
.endif
        clz             w7, \bdmax
        sub             w7, w7, #18  // intermediate_bits = clz(bitdepth_max) - 18
.ifc \type, avg
        mov             w9, #1
        mov             w8, #-2*PREP_BIAS
        lsl             w9, w9, w7  // 1 << intermediate_bits
        add             w7, w7, #1
        sub             w8, w8, w9  // -2*PREP_BIAS - 1 << intermediate_bits
        neg             w7, w7      // -(intermediate_bits+1)
        dup             v28.8h, w8  // -2*PREP_BIAS - 1 << intermediate_bits
        dup             v29.8h, w7  // -(intermediate_bits+1)
.else
        mov             w8, #PREP_BIAS
        lsr             w8, w8, w7  // PREP_BIAS >> intermediate_bits
        neg             w7, w7      // -intermediate_bits
        dup             v28.8h, w8  // PREP_BIAS >> intermediate_bits
        dup             v29.8h, w7  // -intermediate_bits
.endif
.ifc \type, w_avg
        dup             v27.4s, w6
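        // The weight is negated up front so that the w_avg macro's multiply
        // of (tmp2 - tmp1) by v27 directly yields (tmp1 - tmp2)*weight.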
        neg             v27.4s, v27.4s
.endif
        adr             x7, L(\type\()_tbl)
        sub             w4, w4, #24
        \type           v4, v5, v0, v1, v2, v3
        ldrh            w4, [x7, x4, lsl #1]
        sub             x7, x7, w4, uxtw
        br              x7
40:
        add             x7, x0, x1
        lsl             x1, x1, #1
4:
        subs            w5, w5, #4
        st1             {v4.d}[0], [x0], x1
        st1             {v4.d}[1], [x7], x1
        st1             {v5.d}[0], [x0], x1
        st1             {v5.d}[1], [x7], x1
        b.le            0f
        \type           v4, v5, v0, v1, v2, v3
        b               4b
80:
        add             x7, x0, x1
        lsl             x1, x1, #1
8:
        st1             {v4.8h}, [x0], x1
        subs            w5, w5, #2
        st1             {v5.8h}, [x7], x1
        b.le            0f
        \type           v4, v5, v0, v1, v2, v3
        b               8b
16:
        \type           v6, v7, v0, v1, v2, v3
        st1             {v4.8h, v5.8h}, [x0], x1
        subs            w5, w5, #2
        st1             {v6.8h, v7.8h}, [x0], x1
        b.le            0f
        \type           v4, v5, v0, v1, v2, v3
        b               16b
32:
        \type           v6, v7, v0, v1, v2, v3
        subs            w5, w5, #1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        b.le            0f
        \type           v4, v5, v0, v1, v2, v3
        b               32b
640:
        add             x7, x0, #64
64:
        \type           v6, v7, v0, v1, v2, v3
        \type           v16, v17, v0, v1, v2, v3
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        \type           v18, v19, v0, v1, v2, v3
        subs            w5, w5, #1
        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], x1
        b.le            0f
        \type           v4, v5, v0, v1, v2, v3
        b               64b
1280:
        add             x7, x0, #64
        mov             x8, #128
        sub             x1, x1, #128
128:
        \type           v6, v7, v0, v1, v2, v3
        \type           v16, v17, v0, v1, v2, v3
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x8
        \type           v18, v19, v0, v1, v2, v3
        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], x8
        \type           v4, v5, v0, v1, v2, v3
        \type           v6, v7, v0, v1, v2, v3
        \type           v16, v17, v0, v1, v2, v3
        subs            w5, w5, #1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        \type           v18, v19, v0, v1, v2, v3
        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], x1
        b.le            0f
        \type           v4, v5, v0, v1, v2, v3
        b               128b
0:
        ret
L(\type\()_tbl):
        .hword L(\type\()_tbl) - 1280b
        .hword L(\type\()_tbl) -  640b
        .hword L(\type\()_tbl) -   32b
        .hword L(\type\()_tbl) -   16b
        .hword L(\type\()_tbl) -   80b
        .hword L(\type\()_tbl) -   40b
endfunc
.endm

bidir_fn avg, w6
bidir_fn w_avg, w7
bidir_fn mask, w7


.macro w_mask_fn type
function w_mask_\type\()_16bpc_neon, export=1
        ldr             w8, [sp]
        clz             w9, w4
        adr             x10, L(w_mask_\type\()_tbl)
        dup             v31.8h, w8  // bitdepth_max
        sub             w9, w9, #24
        clz             w8, w8      // clz(bitdepth_max)
        ldrh            w9, [x10, x9, lsl #1]
        sub             x10, x10, w9, uxtw
        sub             w8, w8, #12  // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
        mov             w9, #PREP_BIAS*64
        neg             w8, w8       // -sh
        mov             w11, #27615  // (64 + 1 - 38)<<mask_sh - 1 - mask_rnd
        dup             v30.4s, w9   // PREP_BIAS*64
        dup             v29.4s, w8   // -sh
        dup             v0.8h, w11
.if \type == 444
        movi            v1.16b, #64
.elseif \type == 422
        dup             v2.8b, w7
        movi            v3.8b, #129
        sub             v3.8b, v3.8b, v2.8b
.elseif \type == 420
        dup             v2.8h, w7
        movi            v3.8h, #1, lsl #8
        sub             v3.8h, v3.8h, v2.8h
.endif
        add             x12, x0, x1
        lsl             x1, x1, #1
        br              x10
4:
        ld1             {v4.8h, v5.8h}, [x2], #32  // tmp1 (four rows at once)
        ld1             {v6.8h, v7.8h}, [x3], #32  // tmp2 (four rows at once)
        subs            w5, w5, #4
        sabd            v20.8h, v4.8h, v6.8h  // abs(tmp1 - tmp2)
        sabd            v21.8h, v5.8h, v7.8h
        ssubl           v16.4s, v6.4h, v4.4h  // tmp2 - tmp1 (requires 17 bit)
        ssubl2          v17.4s, v6.8h, v4.8h
        ssubl           v18.4s, v7.4h, v5.4h
        ssubl2          v19.4s, v7.8h, v5.8h
        uqsub           v20.8h, v0.8h, v20.8h  // 27615 - abs()
        uqsub           v21.8h, v0.8h, v21.8h
        sshll2          v7.4s, v5.8h, #6  // tmp1 << 6
        sshll           v6.4s, v5.4h, #6
        sshll2          v5.4s, v4.8h, #6
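        // (the tmp1 << 6 widening goes high half first, so v5 and v4 are
        //  fully read before being overwritten in place)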
        sshll           v4.4s, v4.4h, #6
        ushr            v20.8h, v20.8h, #10  // 64-m = (27615 - abs()) >> mask_sh
        ushr            v21.8h, v21.8h, #10
        add             v4.4s, v4.4s, v30.4s  // += PREP_BIAS*64
        add             v5.4s, v5.4s, v30.4s
        add             v6.4s, v6.4s, v30.4s
        add             v7.4s, v7.4s, v30.4s
        uxtl            v22.4s, v20.4h
        uxtl2           v23.4s, v20.8h
        uxtl            v24.4s, v21.4h
        uxtl2           v25.4s, v21.8h
        mla             v4.4s, v16.4s, v22.4s  // (tmp2-tmp1)*(64-m)
        mla             v5.4s, v17.4s, v23.4s
        mla             v6.4s, v18.4s, v24.4s
        mla             v7.4s, v19.4s, v25.4s
        srshl           v4.4s, v4.4s, v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl           v5.4s, v5.4s, v29.4s
        srshl           v6.4s, v6.4s, v29.4s
        srshl           v7.4s, v7.4s, v29.4s
        sqxtun          v4.4h, v4.4s  // iclip_pixel
        sqxtun2         v4.8h, v5.4s
        sqxtun          v5.4h, v6.4s
        sqxtun2         v5.8h, v7.4s
        umin            v4.8h, v4.8h, v31.8h  // iclip_pixel
        umin            v5.8h, v5.8h, v31.8h
.if \type == 444
        xtn             v20.8b, v20.8h  // 64 - m
        xtn2            v20.16b, v21.8h
        sub             v20.16b, v1.16b, v20.16b  // m
        st1             {v20.16b}, [x6], #16
.elseif \type == 422
        addp            v20.8h, v20.8h, v21.8h  // (64 - m) + (64 - n) (column wise addition)
        xtn             v20.8b, v20.8h
        uhsub           v20.8b, v3.8b, v20.8b  // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
        st1             {v20.8b}, [x6], #8
.elseif \type == 420
        trn1            v24.2d, v20.2d, v21.2d
        trn2            v25.2d, v20.2d, v21.2d
        add             v24.8h, v24.8h, v25.8h  // (64 - my1) + (64 - my2) (row wise addition)
        addp            v20.8h, v24.8h, v24.8h  // (128 - m) + (128 - n) (column wise addition)
        sub             v20.4h, v3.4h, v20.4h   // (256 - sign) - ((128 - m) + (128 - n))
        rshrn           v20.8b, v20.8h, #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
        st1             {v20.s}[0], [x6], #4
.endif
        st1             {v4.d}[0], [x0], x1
        st1             {v4.d}[1], [x12], x1
        st1             {v5.d}[0], [x0], x1
        st1             {v5.d}[1], [x12], x1
        b.gt            4b
        ret
8:
        ld1             {v4.8h, v5.8h}, [x2], #32  // tmp1
        ld1             {v6.8h, v7.8h}, [x3], #32  // tmp2
        subs            w5, w5, #2
        sabd            v20.8h, v4.8h, v6.8h  // abs(tmp1 - tmp2)
        sabd            v21.8h, v5.8h, v7.8h
        ssubl           v16.4s, v6.4h, v4.4h  // tmp2 - tmp1 (requires 17 bit)
        ssubl2          v17.4s, v6.8h, v4.8h
        ssubl           v18.4s, v7.4h, v5.4h
        ssubl2          v19.4s, v7.8h, v5.8h
        uqsub           v20.8h, v0.8h, v20.8h  // 27615 - abs()
        uqsub           v21.8h, v0.8h, v21.8h
        sshll2          v7.4s, v5.8h, #6  // tmp1 << 6
        sshll           v6.4s, v5.4h, #6
        sshll2          v5.4s, v4.8h, #6
        sshll           v4.4s, v4.4h, #6
        ushr            v20.8h, v20.8h, #10  // 64-m = (27615 - abs()) >> mask_sh
        ushr            v21.8h, v21.8h, #10
        add             v4.4s, v4.4s, v30.4s  // += PREP_BIAS*64
        add             v5.4s, v5.4s, v30.4s
        add             v6.4s, v6.4s, v30.4s
        add             v7.4s, v7.4s, v30.4s
        uxtl            v22.4s, v20.4h
        uxtl2           v23.4s, v20.8h
        uxtl            v24.4s, v21.4h
        uxtl2           v25.4s, v21.8h
        mla             v4.4s, v16.4s, v22.4s  // (tmp2-tmp1)*(64-m)
        mla             v5.4s, v17.4s, v23.4s
        mla             v6.4s, v18.4s, v24.4s
        mla             v7.4s, v19.4s, v25.4s
        srshl           v4.4s, v4.4s, v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl           v5.4s, v5.4s, v29.4s
        srshl           v6.4s, v6.4s, v29.4s
        srshl           v7.4s, v7.4s, v29.4s
        sqxtun          v4.4h, v4.4s  // iclip_pixel
        sqxtun2         v4.8h, v5.4s
        sqxtun          v5.4h, v6.4s
        sqxtun2         v5.8h, v7.4s
        umin            v4.8h, v4.8h, v31.8h  // iclip_pixel
        umin            v5.8h, v5.8h, v31.8h
.if \type == 444
        xtn             v20.8b, v20.8h  // 64 - m
        xtn2            v20.16b, v21.8h
        sub             v20.16b, v1.16b, v20.16b  // m
        st1             {v20.16b}, [x6], #16
.elseif \type == 422
        addp            v20.8h, v20.8h, v21.8h  // (64 - m) + (64 - n) (column wise addition)
        xtn             v20.8b, v20.8h
        uhsub           v20.8b, v3.8b, v20.8b  // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
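        // (the identity ((129 - sign) - ((64 - m) + (64 - n))) >> 1
        //  == (m + n + 1 - sign) >> 1 makes this the rounded average of
        //  the two horizontal mask values)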
        st1             {v20.8b}, [x6], #8
.elseif \type == 420
        add             v20.8h, v20.8h, v21.8h  // (64 - my1) + (64 - my2) (row wise addition)
        addp            v20.8h, v20.8h, v20.8h  // (128 - m) + (128 - n) (column wise addition)
        sub             v20.4h, v3.4h, v20.4h   // (256 - sign) - ((128 - m) + (128 - n))
        rshrn           v20.8b, v20.8h, #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
        st1             {v20.s}[0], [x6], #4
.endif
        st1             {v4.8h}, [x0], x1
        st1             {v5.8h}, [x12], x1
        b.gt            8b
        ret
1280:
640:
320:
160:
        mov             w11, w4
        sub             x1, x1, w4, uxtw #1
.if \type == 444
        add             x10, x6, w4, uxtw
.elseif \type == 422
        add             x10, x6, x11, lsr #1
.endif
        add             x9, x3, w4, uxtw #1
        add             x7, x2, w4, uxtw #1
161:
        mov             w8, w4
16:
        ld1             {v4.8h, v5.8h}, [x2], #32    // tmp1
        ld1             {v16.8h, v17.8h}, [x3], #32  // tmp2
        ld1             {v6.8h, v7.8h}, [x7], #32
        ld1             {v18.8h, v19.8h}, [x9], #32
        subs            w8, w8, #16
        sabd            v20.8h, v4.8h, v16.8h  // abs(tmp1 - tmp2)
        sabd            v21.8h, v5.8h, v17.8h
        ssubl           v22.4s, v16.4h, v4.4h  // tmp2 - tmp1 (requires 17 bit)
        ssubl2          v23.4s, v16.8h, v4.8h
        ssubl           v24.4s, v17.4h, v5.4h
        ssubl2          v25.4s, v17.8h, v5.8h
        uqsub           v20.8h, v0.8h, v20.8h  // 27615 - abs()
        uqsub           v21.8h, v0.8h, v21.8h
        sshll2          v27.4s, v5.8h, #6  // tmp1 << 6
        sshll           v26.4s, v5.4h, #6
        sshll2          v5.4s, v4.8h, #6
        sshll           v4.4s, v4.4h, #6
        ushr            v20.8h, v20.8h, #10  // 64-m = (27615 - abs()) >> mask_sh
        ushr            v21.8h, v21.8h, #10
        add             v4.4s, v4.4s, v30.4s  // += PREP_BIAS*64
        add             v5.4s, v5.4s, v30.4s
        add             v26.4s, v26.4s, v30.4s
        add             v27.4s, v27.4s, v30.4s
        uxtl            v16.4s, v20.4h
        uxtl2           v17.4s, v20.8h
        uxtl            v28.4s, v21.4h
        mla             v4.4s, v22.4s, v16.4s  // (tmp2-tmp1)*(64-m)
        uxtl2           v16.4s, v21.8h
        mla             v5.4s, v23.4s, v17.4s
        mla             v26.4s, v24.4s, v28.4s
        mla             v27.4s, v25.4s, v16.4s
        srshl           v4.4s, v4.4s, v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl           v5.4s, v5.4s, v29.4s
        srshl           v26.4s, v26.4s, v29.4s
        srshl           v27.4s, v27.4s, v29.4s
        sqxtun          v4.4h, v4.4s   // iclip_pixel
        sqxtun2         v4.8h, v5.4s
        sqxtun          v5.4h, v26.4s
        sqxtun2         v5.8h, v27.4s

        // Start of other half
        sabd            v22.8h, v6.8h, v18.8h  // abs(tmp1 - tmp2)
        sabd            v23.8h, v7.8h, v19.8h

        umin            v4.8h, v4.8h, v31.8h  // iclip_pixel
        umin            v5.8h, v5.8h, v31.8h

        ssubl           v16.4s, v18.4h, v6.4h  // tmp2 - tmp1 (requires 17 bit)
        ssubl2          v17.4s, v18.8h, v6.8h
        ssubl           v18.4s, v19.4h, v7.4h
        ssubl2          v19.4s, v19.8h, v7.8h
        uqsub           v22.8h, v0.8h, v22.8h  // 27615 - abs()
        uqsub           v23.8h, v0.8h, v23.8h
        sshll           v24.4s, v6.4h, #6  // tmp1 << 6
        sshll2          v25.4s, v6.8h, #6
        sshll           v26.4s, v7.4h, #6
        sshll2          v27.4s, v7.8h, #6
        ushr            v22.8h, v22.8h, #10  // 64-m = (27615 - abs()) >> mask_sh
        ushr            v23.8h, v23.8h, #10
        add             v24.4s, v24.4s, v30.4s  // += PREP_BIAS*64
        add             v25.4s, v25.4s, v30.4s
        add             v26.4s, v26.4s, v30.4s
        add             v27.4s, v27.4s, v30.4s
        uxtl            v6.4s, v22.4h
        uxtl2           v7.4s, v22.8h
        uxtl            v28.4s, v23.4h
        mla             v24.4s, v16.4s, v6.4s  // (tmp2-tmp1)*(64-m)
        uxtl2           v6.4s, v23.8h
        mla             v25.4s, v17.4s, v7.4s
        mla             v26.4s, v18.4s, v28.4s
        mla             v27.4s, v19.4s, v6.4s
        srshl           v24.4s, v24.4s, v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl           v25.4s, v25.4s, v29.4s
        srshl           v26.4s, v26.4s, v29.4s
        srshl           v27.4s, v27.4s, v29.4s
        sqxtun          v6.4h, v24.4s  // iclip_pixel
        sqxtun2         v6.8h, v25.4s
        sqxtun          v7.4h, v26.4s
        sqxtun2         v7.8h, v27.4s
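        // sqxtun has already saturated negative values to 0; the umin
        // against bitdepth_max below completes the iclip_pixel clamp.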
        umin            v6.8h, v6.8h, v31.8h  // iclip_pixel
        umin            v7.8h, v7.8h, v31.8h
.if \type == 444
        xtn             v20.8b, v20.8h  // 64 - m
        xtn2            v20.16b, v21.8h
        xtn             v21.8b, v22.8h
        xtn2            v21.16b, v23.8h
        sub             v20.16b, v1.16b, v20.16b  // m
        sub             v21.16b, v1.16b, v21.16b
        st1             {v20.16b}, [x6], #16
        st1             {v21.16b}, [x10], #16
.elseif \type == 422
        addp            v20.8h, v20.8h, v21.8h  // (64 - m) + (64 - n) (column wise addition)
        addp            v21.8h, v22.8h, v23.8h
        xtn             v20.8b, v20.8h
        xtn             v21.8b, v21.8h
        uhsub           v20.8b, v3.8b, v20.8b  // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
        uhsub           v21.8b, v3.8b, v21.8b
        st1             {v20.8b}, [x6], #8
        st1             {v21.8b}, [x10], #8
.elseif \type == 420
        add             v20.8h, v20.8h, v22.8h  // (64 - my1) + (64 - my2) (row wise addition)
        add             v21.8h, v21.8h, v23.8h
        addp            v20.8h, v20.8h, v21.8h  // (128 - m) + (128 - n) (column wise addition)
        sub             v20.8h, v3.8h, v20.8h   // (256 - sign) - ((128 - m) + (128 - n))
        rshrn           v20.8b, v20.8h, #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
        st1             {v20.8b}, [x6], #8
.endif
        st1             {v4.8h, v5.8h}, [x0], #32
        st1             {v6.8h, v7.8h}, [x12], #32
        b.gt            16b
        subs            w5, w5, #2
        add             x2, x2, w4, uxtw #1
        add             x3, x3, w4, uxtw #1
        add             x7, x7, w4, uxtw #1
        add             x9, x9, w4, uxtw #1
.if \type == 444
        add             x6, x6, w4, uxtw
        add             x10, x10, w4, uxtw
.elseif \type == 422
        add             x6, x6, x11, lsr #1
        add             x10, x10, x11, lsr #1
.endif
        add             x0, x0, x1
        add             x12, x12, x1
        b.gt            161b
        ret
L(w_mask_\type\()_tbl):
        .hword L(w_mask_\type\()_tbl) - 1280b
        .hword L(w_mask_\type\()_tbl) -  640b
        .hword L(w_mask_\type\()_tbl) -  320b
        .hword L(w_mask_\type\()_tbl) -  160b
        .hword L(w_mask_\type\()_tbl) -    8b
        .hword L(w_mask_\type\()_tbl) -    4b
endfunc
.endm

w_mask_fn 444
w_mask_fn 422
w_mask_fn 420


function blend_16bpc_neon, export=1
        adr             x6, L(blend_tbl)
        clz             w3, w3
        sub             w3, w3, #26
        ldrh            w3, [x6, x3, lsl #1]
        sub             x6, x6, w3, uxtw
        add             x8, x0, x1
        br              x6
40:
        lsl             x1, x1, #1
4:
        ld1             {v2.8b}, [x5], #8
        ld1             {v1.8h}, [x2], #16
        ld1             {v0.d}[0], [x0]
        neg             v2.8b, v2.8b  // -m
        subs            w4, w4, #2
        ld1             {v0.d}[1], [x8]
        sxtl            v2.8h, v2.8b
        shl             v2.8h, v2.8h, #9  // -m << 9
        sub             v1.8h, v0.8h, v1.8h  // a - b
        sqrdmulh        v1.8h, v1.8h, v2.8h  // ((a-b)*-m + 32) >> 6
        add             v0.8h, v0.8h, v1.8h
        st1             {v0.d}[0], [x0], x1
        st1             {v0.d}[1], [x8], x1
        b.gt            4b
        ret
80:
        lsl             x1, x1, #1
8:
        ld1             {v4.16b}, [x5], #16
        ld1             {v2.8h, v3.8h}, [x2], #32
        neg             v5.16b, v4.16b  // -m
        ld1             {v0.8h}, [x0]
        ld1             {v1.8h}, [x8]
        sxtl            v4.8h, v5.8b
        sxtl2           v5.8h, v5.16b
        shl             v4.8h, v4.8h, #9  // -m << 9
        shl             v5.8h, v5.8h, #9
        sub             v2.8h, v0.8h, v2.8h  // a - b
        sub             v3.8h, v1.8h, v3.8h
        subs            w4, w4, #2
        sqrdmulh        v2.8h, v2.8h, v4.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v3.8h, v3.8h, v5.8h
        add             v0.8h, v0.8h, v2.8h
        add             v1.8h, v1.8h, v3.8h
        st1             {v0.8h}, [x0], x1
        st1             {v1.8h}, [x8], x1
        b.gt            8b
        ret
160:
        lsl             x1, x1, #1
16:
        ld1             {v16.16b, v17.16b}, [x5], #32
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        subs            w4, w4, #2
        neg             v18.16b, v16.16b  // -m
        neg             v19.16b, v17.16b
        ld1             {v0.8h, v1.8h}, [x0]
        sxtl            v16.8h, v18.8b
        sxtl2           v17.8h, v18.16b
        sxtl            v18.8h, v19.8b
        sxtl2           v19.8h, v19.16b
        ld1             {v2.8h, v3.8h}, [x8]
        shl             v16.8h, v16.8h, #9  // -m << 9
        shl             v17.8h, v17.8h, #9
        shl             v18.8h, v18.8h, #9
        shl             v19.8h, v19.8h, #9
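        // sqrdmulh(x, y) evaluates (2*x*y + (1 << 15)) >> 16 with
        // saturation; with y = -m << 9 that equals ((a-b)*-m + 32) >> 6,
        // which is why the widened masks are pre-shifted left by 9.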
        sub             v4.8h, v0.8h, v4.8h  // a - b
        sub             v5.8h, v1.8h, v5.8h
        sub             v6.8h, v2.8h, v6.8h
        sub             v7.8h, v3.8h, v7.8h
        sqrdmulh        v4.8h, v4.8h, v16.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v5.8h, v5.8h, v17.8h
        sqrdmulh        v6.8h, v6.8h, v18.8h
        sqrdmulh        v7.8h, v7.8h, v19.8h
        add             v0.8h, v0.8h, v4.8h
        add             v1.8h, v1.8h, v5.8h
        add             v2.8h, v2.8h, v6.8h
        add             v3.8h, v3.8h, v7.8h
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v2.8h, v3.8h}, [x8], x1
        b.gt            16b
        ret
32:
        ld1             {v16.16b, v17.16b}, [x5], #32
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        subs            w4, w4, #1
        neg             v18.16b, v16.16b  // -m
        neg             v19.16b, v17.16b
        sxtl            v16.8h, v18.8b
        sxtl2           v17.8h, v18.16b
        sxtl            v18.8h, v19.8b
        sxtl2           v19.8h, v19.16b
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
        shl             v16.8h, v16.8h, #9  // -m << 9
        shl             v17.8h, v17.8h, #9
        shl             v18.8h, v18.8h, #9
        shl             v19.8h, v19.8h, #9
        sub             v4.8h, v0.8h, v4.8h  // a - b
        sub             v5.8h, v1.8h, v5.8h
        sub             v6.8h, v2.8h, v6.8h
        sub             v7.8h, v3.8h, v7.8h
        sqrdmulh        v4.8h, v4.8h, v16.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v5.8h, v5.8h, v17.8h
        sqrdmulh        v6.8h, v6.8h, v18.8h
        sqrdmulh        v7.8h, v7.8h, v19.8h
        add             v0.8h, v0.8h, v4.8h
        add             v1.8h, v1.8h, v5.8h
        add             v2.8h, v2.8h, v6.8h
        add             v3.8h, v3.8h, v7.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        b.gt            32b
        ret
L(blend_tbl):
        .hword L(blend_tbl) -  32b
        .hword L(blend_tbl) - 160b
        .hword L(blend_tbl) -  80b
        .hword L(blend_tbl) -  40b
endfunc

function blend_h_16bpc_neon, export=1
        adr             x6, L(blend_h_tbl)
        movrel          x5, X(obmc_masks)
        add             x5, x5, w4, uxtw
        sub             w4, w4, w4, lsr #2
        clz             w7, w3
        add             x8, x0, x1
        lsl             x1, x1, #1
        sub             w7, w7, #24
        ldrh            w7, [x6, x7, lsl #1]
        sub             x6, x6, w7, uxtw
        br              x6
2:
        ld2r            {v2.8b, v3.8b}, [x5], #2
        ld1             {v1.4h}, [x2], #8
        ext             v2.8b, v2.8b, v3.8b, #6
        subs            w4, w4, #2
        neg             v2.8b, v2.8b  // -m
        ld1             {v0.s}[0], [x0]
        ld1             {v0.s}[1], [x8]
        sxtl            v2.8h, v2.8b
        shl             v2.4h, v2.4h, #9  // -m << 9
        sub             v1.4h, v0.4h, v1.4h  // a - b
        sqrdmulh        v1.4h, v1.4h, v2.4h  // ((a-b)*-m + 32) >> 6
        add             v0.4h, v0.4h, v1.4h
        st1             {v0.s}[0], [x0], x1
        st1             {v0.s}[1], [x8], x1
        b.gt            2b
        ret
4:
        ld2r            {v2.8b, v3.8b}, [x5], #2
        ld1             {v1.8h}, [x2], #16
        ext             v2.8b, v2.8b, v3.8b, #4
        subs            w4, w4, #2
        neg             v2.8b, v2.8b  // -m
        ld1             {v0.d}[0], [x0]
        ld1             {v0.d}[1], [x8]
        sxtl            v2.8h, v2.8b
        shl             v2.8h, v2.8h, #9  // -m << 9
        sub             v1.8h, v0.8h, v1.8h  // a - b
        sqrdmulh        v1.8h, v1.8h, v2.8h  // ((a-b)*-m + 32) >> 6
        add             v0.8h, v0.8h, v1.8h
        st1             {v0.d}[0], [x0], x1
        st1             {v0.d}[1], [x8], x1
        b.gt            4b
        ret
8:
        ld2r            {v4.8b, v5.8b}, [x5], #2
        ld1             {v2.8h, v3.8h}, [x2], #32
        neg             v4.8b, v4.8b  // -m
        neg             v5.8b, v5.8b
        ld1             {v0.8h}, [x0]
        subs            w4, w4, #2
        sxtl            v4.8h, v4.8b
        sxtl            v5.8h, v5.8b
        ld1             {v1.8h}, [x8]
        shl             v4.8h, v4.8h, #9  // -m << 9
        shl             v5.8h, v5.8h, #9
        sub             v2.8h, v0.8h, v2.8h  // a - b
        sub             v3.8h, v1.8h, v3.8h
        sqrdmulh        v2.8h, v2.8h, v4.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v3.8h, v3.8h, v5.8h
        add             v0.8h, v0.8h, v2.8h
        add             v1.8h, v1.8h, v3.8h
        st1             {v0.8h}, [x0], x1
        st1             {v1.8h}, [x8], x1
        b.gt            8b
        ret
16:
        ld2r            {v16.8b, v17.8b}, [x5], #2
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        neg             v16.8b, v16.8b  // -m
        neg             v17.8b, v17.8b
        ld1             {v0.8h, v1.8h}, [x0]
        ld1             {v2.8h, v3.8h}, [x8]
        subs            w4, w4, #2
        sxtl            v16.8h, v16.8b
        sxtl            v17.8h, v17.8b
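        // ld2r above replicated one obmc_masks byte across each vector
        // (one mask value per output row); it is widened and pre-shifted
        // here for the sqrdmulh below.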
        shl             v16.8h, v16.8h, #9  // -m << 9
        shl             v17.8h, v17.8h, #9
        sub             v4.8h, v0.8h, v4.8h  // a - b
        sub             v5.8h, v1.8h, v5.8h
        sub             v6.8h, v2.8h, v6.8h
        sub             v7.8h, v3.8h, v7.8h
        sqrdmulh        v4.8h, v4.8h, v16.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v5.8h, v5.8h, v16.8h
        sqrdmulh        v6.8h, v6.8h, v17.8h
        sqrdmulh        v7.8h, v7.8h, v17.8h
        add             v0.8h, v0.8h, v4.8h
        add             v1.8h, v1.8h, v5.8h
        add             v2.8h, v2.8h, v6.8h
        add             v3.8h, v3.8h, v7.8h
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v2.8h, v3.8h}, [x8], x1
        b.gt            16b
        ret
1280:
640:
320:
        sub             x1, x1, w3, uxtw #1
        add             x7, x2, w3, uxtw #1
321:
        ld2r            {v24.8b, v25.8b}, [x5], #2
        mov             w6, w3
        neg             v24.8b, v24.8b  // -m
        neg             v25.8b, v25.8b
        sxtl            v24.8h, v24.8b
        sxtl            v25.8h, v25.8b
        shl             v24.8h, v24.8h, #9  // -m << 9
        shl             v25.8h, v25.8h, #9
32:
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
        subs            w6, w6, #32
        sub             v16.8h, v0.8h, v16.8h  // a - b
        sub             v17.8h, v1.8h, v17.8h
        sub             v18.8h, v2.8h, v18.8h
        sub             v19.8h, v3.8h, v19.8h
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x8]
        sqrdmulh        v16.8h, v16.8h, v24.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v17.8h, v17.8h, v24.8h
        sqrdmulh        v18.8h, v18.8h, v24.8h
        sqrdmulh        v19.8h, v19.8h, v24.8h
        sub             v20.8h, v4.8h, v20.8h  // a - b
        sub             v21.8h, v5.8h, v21.8h
        sub             v22.8h, v6.8h, v22.8h
        sub             v23.8h, v7.8h, v23.8h
        add             v0.8h, v0.8h, v16.8h
        add             v1.8h, v1.8h, v17.8h
        add             v2.8h, v2.8h, v18.8h
        add             v3.8h, v3.8h, v19.8h
        sqrdmulh        v20.8h, v20.8h, v25.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v21.8h, v21.8h, v25.8h
        sqrdmulh        v22.8h, v22.8h, v25.8h
        sqrdmulh        v23.8h, v23.8h, v25.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v4.8h, v4.8h, v20.8h
        add             v5.8h, v5.8h, v21.8h
        add             v6.8h, v6.8h, v22.8h
        add             v7.8h, v7.8h, v23.8h
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x8], #64
        b.gt            32b
        subs            w4, w4, #2
        add             x0, x0, x1
        add             x8, x8, x1
        add             x2, x2, w3, uxtw #1
        add             x7, x7, w3, uxtw #1
        b.gt            321b
        ret
L(blend_h_tbl):
        .hword L(blend_h_tbl) - 1280b
        .hword L(blend_h_tbl) -  640b
        .hword L(blend_h_tbl) -  320b
        .hword L(blend_h_tbl) -   16b
        .hword L(blend_h_tbl) -    8b
        .hword L(blend_h_tbl) -    4b
        .hword L(blend_h_tbl) -    2b
endfunc

function blend_v_16bpc_neon, export=1
        adr             x6, L(blend_v_tbl)
        movrel          x5, X(obmc_masks)
        add             x5, x5, w3, uxtw
        clz             w3, w3
        add             x8, x0, x1
        lsl             x1, x1, #1
        sub             w3, w3, #26
        ldrh            w3, [x6, x3, lsl #1]
        sub             x6, x6, w3, uxtw
        br              x6
20:
        ld1r            {v2.8b}, [x5]
        neg             v2.8b, v2.8b  // -m
        sxtl            v2.8h, v2.8b
        shl             v2.4h, v2.4h, #9  // -m << 9
2:
        ld1             {v1.s}[0], [x2], #4
        ld1             {v0.h}[0], [x0]
        subs            w4, w4, #2
        ld1             {v1.h}[1], [x2]
        ld1             {v0.h}[1], [x8]
        add             x2, x2, #4
        sub             v1.4h, v0.4h, v1.4h  // a - b
        sqrdmulh        v1.4h, v1.4h, v2.4h  // ((a-b)*-m + 32) >> 6
        add             v0.4h, v0.4h, v1.4h
        st1             {v0.h}[0], [x0], x1
        st1             {v0.h}[1], [x8], x1
        b.gt            2b
        ret
40:
        ld1r            {v2.2s}, [x5]
        sub             x1, x1, #4
        neg             v2.8b, v2.8b  // -m
        sxtl            v2.8h, v2.8b
        shl             v2.8h, v2.8h, #9  // -m << 9
4:
        ld1             {v1.8h}, [x2], #16
        ld1             {v0.d}[0], [x0]
        ld1             {v0.d}[1], [x8]
        subs            w4, w4, #2
        sub             v1.8h, v0.8h, v1.8h  // a - b
        sqrdmulh        v1.8h, v1.8h, v2.8h  // ((a-b)*-m + 32) >> 6
        add             v0.8h, v0.8h, v1.8h
        st1             {v0.s}[0], [x0], #4
        st1             {v0.s}[2], [x8], #4
        st1             {v0.h}[2], [x0], x1
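        // (blend_v only blends the leftmost 3 of each row's 4 pixels,
        //  hence the split .s-lane + .h-lane stores)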
        st1             {v0.h}[6], [x8], x1
        b.gt            4b
        ret
80:
        ld1             {v4.8b}, [x5]
        sub             x1, x1, #8
        neg             v4.8b, v4.8b  // -m
        sxtl            v4.8h, v4.8b
        shl             v4.8h, v4.8h, #9  // -m << 9
8:
        ld1             {v2.8h, v3.8h}, [x2], #32
        ld1             {v0.8h}, [x0]
        ld1             {v1.8h}, [x8]
        subs            w4, w4, #2
        sub             v2.8h, v0.8h, v2.8h  // a - b
        sub             v3.8h, v1.8h, v3.8h
        sqrdmulh        v2.8h, v2.8h, v4.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v3.8h, v3.8h, v4.8h
        add             v0.8h, v0.8h, v2.8h
        add             v1.8h, v1.8h, v3.8h
        st1             {v0.d}[0], [x0], #8
        st1             {v1.d}[0], [x8], #8
        st1             {v0.s}[2], [x0], x1
        st1             {v1.s}[2], [x8], x1
        b.gt            8b
        ret
160:
        ld1             {v16.16b}, [x5]
        sub             x1, x1, #16
        neg             v17.16b, v16.16b  // -m
        sxtl            v16.8h, v17.8b
        sxtl2           v17.8h, v17.16b
        shl             v16.8h, v16.8h, #9  // -m << 9
        shl             v17.4h, v17.4h, #9
16:
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        ld1             {v0.8h, v1.8h}, [x0]
        subs            w4, w4, #2
        ld1             {v2.8h, v3.8h}, [x8]
        sub             v4.8h, v0.8h, v4.8h  // a - b
        sub             v5.4h, v1.4h, v5.4h
        sub             v6.8h, v2.8h, v6.8h
        sub             v7.4h, v3.4h, v7.4h
        sqrdmulh        v4.8h, v4.8h, v16.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v5.4h, v5.4h, v17.4h
        sqrdmulh        v6.8h, v6.8h, v16.8h
        sqrdmulh        v7.4h, v7.4h, v17.4h
        add             v0.8h, v0.8h, v4.8h
        add             v1.4h, v1.4h, v5.4h
        add             v2.8h, v2.8h, v6.8h
        add             v3.4h, v3.4h, v7.4h
        st1             {v0.8h}, [x0], #16
        st1             {v2.8h}, [x8], #16
        st1             {v1.4h}, [x0], x1
        st1             {v3.4h}, [x8], x1
        b.gt            16b
        ret
320:
        ld1             {v24.16b, v25.16b}, [x5]
        neg             v26.16b, v24.16b  // -m
        neg             v27.8b, v25.8b
        sxtl            v24.8h, v26.8b
        sxtl2           v25.8h, v26.16b
        sxtl            v26.8h, v27.8b
        shl             v24.8h, v24.8h, #9  // -m << 9
        shl             v25.8h, v25.8h, #9
        shl             v26.8h, v26.8h, #9
32:
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
        ld1             {v0.8h, v1.8h, v2.8h}, [x0]
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64
        ld1             {v4.8h, v5.8h, v6.8h}, [x8]
        subs            w4, w4, #2
        sub             v16.8h, v0.8h, v16.8h  // a - b
        sub             v17.8h, v1.8h, v17.8h
        sub             v18.8h, v2.8h, v18.8h
        sub             v20.8h, v4.8h, v20.8h
        sub             v21.8h, v5.8h, v21.8h
        sub             v22.8h, v6.8h, v22.8h
        sqrdmulh        v16.8h, v16.8h, v24.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v17.8h, v17.8h, v25.8h
        sqrdmulh        v18.8h, v18.8h, v26.8h
        sqrdmulh        v20.8h, v20.8h, v24.8h
        sqrdmulh        v21.8h, v21.8h, v25.8h
        sqrdmulh        v22.8h, v22.8h, v26.8h
        add             v0.8h, v0.8h, v16.8h
        add             v1.8h, v1.8h, v17.8h
        add             v2.8h, v2.8h, v18.8h
        add             v4.8h, v4.8h, v20.8h
        add             v5.8h, v5.8h, v21.8h
        add             v6.8h, v6.8h, v22.8h
        st1             {v0.8h, v1.8h, v2.8h}, [x0], x1
        st1             {v4.8h, v5.8h, v6.8h}, [x8], x1
        b.gt            32b
        ret
L(blend_v_tbl):
        .hword L(blend_v_tbl) - 320b
        .hword L(blend_v_tbl) - 160b
        .hword L(blend_v_tbl) -  80b
        .hword L(blend_v_tbl) -  40b
        .hword L(blend_v_tbl) -  20b
endfunc


// This has the same signature as the put_8tap functions,
// and assumes that x9 is set to (clz(w)-24).
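// The .hword tables used below store the distance from the table label back
// to each width-specific handler; the branch target is formed by subtracting
// the loaded entry from the table address, with clz(w)-24 mapping
// w = 128, 64, 32, 16, 8, 4, 2 to table indices 0..6.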
function put_neon
        adr             x10, L(put_tbl)
        ldrh            w9, [x10, x9, lsl #1]
        sub             x10, x10, w9, uxtw
        br              x10

2:
        ld1             {v0.s}[0], [x2], x3
        ld1             {v1.s}[0], [x2], x3
        subs            w5, w5, #2
        st1             {v0.s}[0], [x0], x1
        st1             {v1.s}[0], [x0], x1
        b.gt            2b
        ret
4:
        ld1             {v0.4h}, [x2], x3
        ld1             {v1.4h}, [x2], x3
        subs            w5, w5, #2
        st1             {v0.4h}, [x0], x1
        st1             {v1.4h}, [x0], x1
        b.gt            4b
        ret
80:
        add             x8, x0, x1
        lsl             x1, x1, #1
        add             x9, x2, x3
        lsl             x3, x3, #1
8:
        ld1             {v0.8h}, [x2], x3
        ld1             {v1.8h}, [x9], x3
        subs            w5, w5, #2
        st1             {v0.8h}, [x0], x1
        st1             {v1.8h}, [x8], x1
        b.gt            8b
        ret
16:
        ldp             x6, x7, [x2]
        ldp             x8, x9, [x2, #16]
        stp             x6, x7, [x0]
        subs            w5, w5, #1
        stp             x8, x9, [x0, #16]
        add             x2, x2, x3
        add             x0, x0, x1
        b.gt            16b
        ret
32:
        ldp             x6, x7, [x2]
        ldp             x8, x9, [x2, #16]
        stp             x6, x7, [x0]
        ldp             x10, x11, [x2, #32]
        stp             x8, x9, [x0, #16]
        subs            w5, w5, #1
        ldp             x12, x13, [x2, #48]
        stp             x10, x11, [x0, #32]
        stp             x12, x13, [x0, #48]
        add             x2, x2, x3
        add             x0, x0, x1
        b.gt            32b
        ret
64:
        ldp             q0, q1, [x2]
        ldp             q2, q3, [x2, #32]
        stp             q0, q1, [x0]
        ldp             q4, q5, [x2, #64]
        stp             q2, q3, [x0, #32]
        ldp             q6, q7, [x2, #96]
        subs            w5, w5, #1
        stp             q4, q5, [x0, #64]
        stp             q6, q7, [x0, #96]
        add             x2, x2, x3
        add             x0, x0, x1
        b.gt            64b
        ret
128:
        ldp             q0, q1, [x2]
        ldp             q2, q3, [x2, #32]
        stp             q0, q1, [x0]
        ldp             q4, q5, [x2, #64]
        stp             q2, q3, [x0, #32]
        ldp             q6, q7, [x2, #96]
        subs            w5, w5, #1
        stp             q4, q5, [x0, #64]
        ldp             q16, q17, [x2, #128]
        stp             q6, q7, [x0, #96]
        ldp             q18, q19, [x2, #160]
        stp             q16, q17, [x0, #128]
        ldp             q20, q21, [x2, #192]
        stp             q18, q19, [x0, #160]
        ldp             q22, q23, [x2, #224]
        stp             q20, q21, [x0, #192]
        stp             q22, q23, [x0, #224]
        add             x2, x2, x3
        add             x0, x0, x1
        b.gt            128b
        ret

L(put_tbl):
        .hword L(put_tbl) - 128b
        .hword L(put_tbl) -  64b
        .hword L(put_tbl) -  32b
        .hword L(put_tbl) -  16b
        .hword L(put_tbl) -  80b
        .hword L(put_tbl) -   4b
        .hword L(put_tbl) -   2b
endfunc


// This has the same signature as the prep_8tap functions,
// and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and
// x8 to w*2.
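// PREP_BIAS (8192) doesn't fit a plain 8-bit movi immediate, so v30 is
// materialized below as (PREP_BIAS >> 8) shifted left by 8 within the movi.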
function prep_neon
        adr             x10, L(prep_tbl)
        ldrh            w9, [x10, x9, lsl #1]
        dup             v31.8h, w7  // intermediate_bits
        movi            v30.8h, #(PREP_BIAS >> 8), lsl #8
        sub             x10, x10, w9, uxtw
        br              x10

40:
        add             x9, x1, x2
        lsl             x2, x2, #1
4:
        ld1             {v0.d}[0], [x1], x2
        ld1             {v0.d}[1], [x9], x2
        subs            w4, w4, #2
        sshl            v0.8h, v0.8h, v31.8h
        sub             v0.8h, v0.8h, v30.8h
        st1             {v0.8h}, [x0], #16
        b.gt            4b
        ret
80:
        add             x9, x1, x2
        lsl             x2, x2, #1
8:
        ld1             {v0.8h}, [x1], x2
        ld1             {v1.8h}, [x9], x2
        subs            w4, w4, #2
        sshl            v0.8h, v0.8h, v31.8h
        sshl            v1.8h, v1.8h, v31.8h
        sub             v0.8h, v0.8h, v30.8h
        sub             v1.8h, v1.8h, v30.8h
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            8b
        ret
16:
        ldp             q0, q1, [x1]
        add             x1, x1, x2
        sshl            v0.8h, v0.8h, v31.8h
        ldp             q2, q3, [x1]
        add             x1, x1, x2
        subs            w4, w4, #2
        sshl            v1.8h, v1.8h, v31.8h
        sshl            v2.8h, v2.8h, v31.8h
        sshl            v3.8h, v3.8h, v31.8h
        sub             v0.8h, v0.8h, v30.8h
        sub             v1.8h, v1.8h, v30.8h
        sub             v2.8h, v2.8h, v30.8h
        sub             v3.8h, v3.8h, v30.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            16b
        ret
32:
        ldp             q0, q1, [x1]
        sshl            v0.8h, v0.8h, v31.8h
        ldp             q2, q3, [x1, #32]
        add             x1, x1, x2
        sshl            v1.8h, v1.8h, v31.8h
        sshl            v2.8h, v2.8h, v31.8h
        sshl            v3.8h, v3.8h, v31.8h
        subs            w4, w4, #1
        sub             v0.8h, v0.8h, v30.8h
        sub             v1.8h, v1.8h, v30.8h
        sub             v2.8h, v2.8h, v30.8h
        sub             v3.8h, v3.8h, v30.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            32b
        ret
64:
        ldp             q0, q1, [x1]
        subs            w4, w4, #1
        sshl            v0.8h, v0.8h, v31.8h
        ldp             q2, q3, [x1, #32]
        sshl            v1.8h, v1.8h, v31.8h
        ldp             q4, q5, [x1, #64]
        sshl            v2.8h, v2.8h, v31.8h
        sshl            v3.8h, v3.8h, v31.8h
        ldp             q6, q7, [x1, #96]
        add             x1, x1, x2
        sshl            v4.8h, v4.8h, v31.8h
        sshl            v5.8h, v5.8h, v31.8h
        sshl            v6.8h, v6.8h, v31.8h
        sshl            v7.8h, v7.8h, v31.8h
        sub             v0.8h, v0.8h, v30.8h
        sub             v1.8h, v1.8h, v30.8h
        sub             v2.8h, v2.8h, v30.8h
        sub             v3.8h, v3.8h, v30.8h
        stp             q0, q1, [x0]
        sub             v4.8h, v4.8h, v30.8h
        sub             v5.8h, v5.8h, v30.8h
        stp             q2, q3, [x0, #32]
        sub             v6.8h, v6.8h, v30.8h
        sub             v7.8h, v7.8h, v30.8h
        stp             q4, q5, [x0, #64]
        stp             q6, q7, [x0, #96]
        add             x0, x0, x8
        b.gt            64b
        ret
128:
        ldp             q0, q1, [x1]
        subs            w4, w4, #1
        sshl            v0.8h, v0.8h, v31.8h
        ldp             q2, q3, [x1, #32]
        sshl            v1.8h, v1.8h, v31.8h
        ldp             q4, q5, [x1, #64]
        sshl            v2.8h, v2.8h, v31.8h
        sshl            v3.8h, v3.8h, v31.8h
        ldp             q6, q7, [x1, #96]
        sshl            v4.8h, v4.8h, v31.8h
        sshl            v5.8h, v5.8h, v31.8h
        ldp             q16, q17, [x1, #128]
        sshl            v6.8h, v6.8h, v31.8h
        sshl            v7.8h, v7.8h, v31.8h
        ldp             q18, q19, [x1, #160]
        sshl            v16.8h, v16.8h, v31.8h
        sshl            v17.8h, v17.8h, v31.8h
        ldp             q20, q21, [x1, #192]
        sshl            v18.8h, v18.8h, v31.8h
        sshl            v19.8h, v19.8h, v31.8h
        ldp             q22, q23, [x1, #224]
        add             x1, x1, x2
        sshl            v20.8h, v20.8h, v31.8h
        sshl            v21.8h, v21.8h, v31.8h
        sshl            v22.8h, v22.8h, v31.8h
        sshl            v23.8h, v23.8h, v31.8h
        sub             v0.8h, v0.8h, v30.8h
        sub             v1.8h, v1.8h, v30.8h
        sub             v2.8h, v2.8h, v30.8h
        sub             v3.8h, v3.8h, v30.8h
        stp             q0, q1, [x0]
        sub             v4.8h, v4.8h, v30.8h
        sub             v5.8h, v5.8h, v30.8h
        stp             q2, q3, [x0, #32]
        sub             v6.8h, v6.8h, v30.8h
        sub             v7.8h, v7.8h, v30.8h
        stp             q4, q5, [x0, #64]
        sub             v16.8h, v16.8h, v30.8h
        sub             v17.8h, v17.8h, v30.8h
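        // (the remaining bias subtractions are interleaved with the stores,
        //  presumably to give load/store and ALU ops room to dual-issue)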
        stp             q6, q7, [x0, #96]
        sub             v18.8h, v18.8h, v30.8h
        sub             v19.8h, v19.8h, v30.8h
        stp             q16, q17, [x0, #128]
        sub             v20.8h, v20.8h, v30.8h
        sub             v21.8h, v21.8h, v30.8h
        stp             q18, q19, [x0, #160]
        sub             v22.8h, v22.8h, v30.8h
        sub             v23.8h, v23.8h, v30.8h
        stp             q20, q21, [x0, #192]
        stp             q22, q23, [x0, #224]
        add             x0, x0, x8
        b.gt            128b
        ret

L(prep_tbl):
        .hword L(prep_tbl) - 128b
        .hword L(prep_tbl) -  64b
        .hword L(prep_tbl) -  32b
        .hword L(prep_tbl) -  16b
        .hword L(prep_tbl) -  80b
        .hword L(prep_tbl) -  40b
endfunc


.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        ld1             {\d0\wd}[0], [\s0], \strd
        ld1             {\d1\wd}[0], [\s1], \strd
.ifnb \d2
        ld1             {\d2\wd}[0], [\s0], \strd
        ld1             {\d3\wd}[0], [\s1], \strd
.endif
.ifnb \d4
        ld1             {\d4\wd}[0], [\s0], \strd
.endif
.ifnb \d5
        ld1             {\d5\wd}[0], [\s1], \strd
.endif
.ifnb \d6
        ld1             {\d6\wd}[0], [\s0], \strd
.endif
.endm
.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        ld1             {\d0\wd}, [\s0], \strd
        ld1             {\d1\wd}, [\s1], \strd
.ifnb \d2
        ld1             {\d2\wd}, [\s0], \strd
        ld1             {\d3\wd}, [\s1], \strd
.endif
.ifnb \d4
        ld1             {\d4\wd}, [\s0], \strd
.endif
.ifnb \d5
        ld1             {\d5\wd}, [\s1], \strd
.endif
.ifnb \d6
        ld1             {\d6\wd}, [\s0], \strd
.endif
.endm
.macro load_regpair s0, s1, strd, wd, d0, d1, d2, d3, d4, d5
        ld1             {\d0\wd, \d1\wd}, [\s0], \strd
.ifnb \d2
        ld1             {\d2\wd, \d3\wd}, [\s1], \strd
.endif
.ifnb \d4
        ld1             {\d4\wd, \d5\wd}, [\s0], \strd
.endif
.endm
.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_slice      \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_4h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_reg        \s0, \s1, \strd, .4h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_8h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_reg        \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_16h s0, s1, strd, d0, d1, d2, d3, d4, d5
        load_regpair    \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5
.endm
.macro interleave_1 wd, r0, r1, r2, r3, r4
        trn1            \r0\wd, \r0\wd, \r1\wd
        trn1            \r1\wd, \r1\wd, \r2\wd
.ifnb \r3
        trn1            \r2\wd, \r2\wd, \r3\wd
        trn1            \r3\wd, \r3\wd, \r4\wd
.endif
.endm
.macro interleave_1_s r0, r1, r2, r3, r4
        interleave_1    .2s, \r0, \r1, \r2, \r3, \r4
.endm
.macro umin_h c, wd, r0, r1, r2, r3
        umin            \r0\wd, \r0\wd, \c\wd
.ifnb \r1
        umin            \r1\wd, \r1\wd, \c\wd
.endif
.ifnb \r2
        umin            \r2\wd, \r2\wd, \c\wd
        umin            \r3\wd, \r3\wd, \c\wd
.endif
.endm
.macro sub_h c, wd, r0, r1, r2, r3
        sub             \r0\wd, \r0\wd, \c\wd
.ifnb \r1
        sub             \r1\wd, \r1\wd, \c\wd
.endif
.ifnb \r2
        sub             \r2\wd, \r2\wd, \c\wd
        sub             \r3\wd, \r3\wd, \c\wd
.endif
.endm
.macro smull_smlal_4 d, s0, s1, s2, s3
        smull           \d\().4s, \s0\().4h, v0.h[0]
        smlal           \d\().4s, \s1\().4h, v0.h[1]
        smlal           \d\().4s, \s2\().4h, v0.h[2]
        smlal           \d\().4s, \s3\().4h, v0.h[3]
.endm
.macro smull2_smlal2_4 d, s0, s1, s2, s3
        smull2          \d\().4s, \s0\().8h, v0.h[0]
        smlal2          \d\().4s, \s1\().8h, v0.h[1]
        smlal2          \d\().4s, \s2\().8h, v0.h[2]
        smlal2          \d\().4s, \s3\().8h, v0.h[3]
.endm
.macro smull_smlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7
        smull           \d\().4s, \s0\().4h, v0.h[0]
        smlal           \d\().4s, \s1\().4h, v0.h[1]
        smlal           \d\().4s, \s2\().4h, v0.h[2]
        smlal           \d\().4s, \s3\().4h, v0.h[3]
        smlal           \d\().4s, \s4\().4h, v0.h[4]
        smlal           \d\().4s, \s5\().4h, v0.h[5]
        smlal           \d\().4s, \s6\().4h, v0.h[6]
        smlal           \d\().4s, \s7\().4h, v0.h[7]
.endm
.macro smull2_smlal2_8 d, s0, s1, s2, s3, s4, s5, s6, s7
        smull2          \d\().4s, \s0\().8h, v0.h[0]
        smlal2          \d\().4s, \s1\().8h, v0.h[1]
        smlal2          \d\().4s, \s2\().8h, v0.h[2]
        smlal2          \d\().4s, \s3\().8h, v0.h[3]
        smlal2          \d\().4s, \s4\().8h, v0.h[4]
        smlal2          \d\().4s, \s5\().8h, v0.h[5]
        smlal2          \d\().4s, \s6\().8h, v0.h[6]
        smlal2          \d\().4s, \s7\().8h, v0.h[7]
.endm
.macro sqrshrun_h shift, r0, r1, r2, r3
        sqrshrun        \r0\().4h, \r0\().4s, #\shift
.ifnb \r1
        sqrshrun2       \r0\().8h, \r1\().4s, #\shift
.endif
.ifnb \r2
        sqrshrun        \r2\().4h, \r2\().4s, #\shift
        sqrshrun2       \r2\().8h, \r3\().4s, #\shift
.endif
.endm
.macro xtn_h r0, r1, r2, r3
        xtn             \r0\().4h, \r0\().4s
        xtn2            \r0\().8h, \r1\().4s
.ifnb \r2
        xtn             \r2\().4h, \r2\().4s
        xtn2            \r2\().8h, \r3\().4s
.endif
.endm
.macro srshl_s shift, r0, r1, r2, r3
        srshl           \r0\().4s, \r0\().4s, \shift\().4s
        srshl           \r1\().4s, \r1\().4s, \shift\().4s
.ifnb \r2
        srshl           \r2\().4s, \r2\().4s, \shift\().4s
        srshl           \r3\().4s, \r3\().4s, \shift\().4s
.endif
.endm
.macro st_s strd, reg, lanes
        st1             {\reg\().s}[0], [x0], \strd
        st1             {\reg\().s}[1], [x9], \strd
.if \lanes > 2
        st1             {\reg\().s}[2], [x0], \strd
        st1             {\reg\().s}[3], [x9], \strd
.endif
.endm
.macro st_d strd, r0, r1
        st1             {\r0\().d}[0], [x0], \strd
        st1             {\r0\().d}[1], [x9], \strd
.ifnb \r1
        st1             {\r1\().d}[0], [x0], \strd
        st1             {\r1\().d}[1], [x9], \strd
.endif
.endm
.macro shift_store_4 type, strd, r0, r1, r2, r3
.ifc \type, put
        sqrshrun_h      6, \r0, \r1, \r2, \r3
        umin_h          v31, .8h, \r0, \r2
.else
        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
        xtn_h           \r0, \r1, \r2, \r3
        sub_h           v29, .8h, \r0, \r2       // PREP_BIAS
.endif
        st_d            \strd, \r0, \r2
.endm
.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
        st1             {\r0\wd}, [x0], \strd
        st1             {\r1\wd}, [x9], \strd
.ifnb \r2
        st1             {\r2\wd}, [x0], \strd
        st1             {\r3\wd}, [x9], \strd
.endif
.ifnb \r4
        st1             {\r4\wd}, [x0], \strd
        st1             {\r5\wd}, [x9], \strd
        st1             {\r6\wd}, [x0], \strd
        st1             {\r7\wd}, [x9], \strd
.endif
.endm
.macro st_8h strd, r0, r1, r2, r3, r4, r5, r6, r7
        st_reg          \strd, .8h, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endm
.macro shift_store_8 type, strd, r0, r1, r2, r3
.ifc \type, put
        sqrshrun_h      6, \r0, \r1, \r2, \r3
        umin_h          v31, .8h, \r0, \r2
.else
        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
        xtn_h           \r0, \r1, \r2, \r3
        sub_h           v29, .8h, \r0, \r2       // PREP_BIAS
.endif
        st_8h           \strd, \r0, \r2
.endm
.macro shift_store_16 type, strd, dst, r0, r1, r2, r3
.ifc \type, put
        sqrshrun_h      6, \r0, \r1, \r2, \r3
        umin            \r0\().8h, \r0\().8h, v31.8h
        umin            \r1\().8h, \r2\().8h, v31.8h
.else
        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
        xtn_h           \r0, \r1, \r2, \r3
        sub             \r0\().8h, \r0\().8h, v29.8h
        sub             \r1\().8h, \r2\().8h, v29.8h
.endif
        st1             {\r0\().8h, \r1\().8h}, [\dst], \strd
.endm

.macro make_8tap_fn op, type, type_h, type_v
function \op\()_8tap_\type\()_16bpc_neon, export=1
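        // Each entry point only records the horizontal/vertical filter type
        // codes in w9/w10 and tail-calls the shared \op\()_8tap_neon body.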
        mov             w9, \type_h
        mov             w10, \type_v
        b               \op\()_8tap_neon
endfunc
.endm

// No spaces in these expressions, due to gas-preprocessor.
#define REGULAR ((0*15<<7)|3*15)
#define SMOOTH  ((1*15<<7)|4*15)
#define SHARP   ((2*15<<7)|3*15)

.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
make_8tap_fn \type, regular,        REGULAR, REGULAR
make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
make_8tap_fn \type, regular_sharp,  REGULAR, SHARP
make_8tap_fn \type, smooth,         SMOOTH,  SMOOTH
make_8tap_fn \type, smooth_regular, SMOOTH,  REGULAR
make_8tap_fn \type, smooth_sharp,   SMOOTH,  SHARP
make_8tap_fn \type, sharp,          SHARP,   SHARP
make_8tap_fn \type, sharp_regular,  SHARP,   REGULAR
make_8tap_fn \type, sharp_smooth,   SHARP,   SMOOTH

function \type\()_8tap_neon
.ifc \bdmax, w8
        ldr             w8, [sp]
.endif
        mov             w11, #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)
        mul             \mx, \mx, w11
        mul             \my, \my, w11
        add             \mx, \mx, w9   // mx, 8tap_h, 4tap_h
        add             \my, \my, w10  // my, 8tap_v, 4tap_v
.ifc \type, prep
        uxtw            \d_strd, \w
        lsl             \d_strd, \d_strd, #1
.endif

        dup             v31.8h, \bdmax  // bitdepth_max
        clz             \bdmax, \bdmax
        clz             w9, \w
        sub             \bdmax, \bdmax, #18  // intermediate_bits = clz(bitdepth_max) - 18
        mov             w12, #6
        tst             \mx, #(0x7f << 14)
        sub             w9, w9, #24
        add             w13, w12, \bdmax  // 6 + intermediate_bits
        sub             w12, w12, \bdmax  // 6 - intermediate_bits
        movrel          x11, X(mc_subpel_filters), -8
        b.ne            L(\type\()_8tap_h)
        tst             \my, #(0x7f << 14)
        b.ne            L(\type\()_8tap_v)
        b               \type\()_neon

L(\type\()_8tap_h):
        cmp             \w, #4
        ubfx            w10, \mx, #7, #7
        and             \mx, \mx, #0x7f
        b.le            4f
        mov             \mx, w10
4:
        tst             \my, #(0x7f << 14)
        add             \xmx, x11, \mx, uxtw #3
        b.ne            L(\type\()_8tap_hv)

        adr             x10, L(\type\()_8tap_h_tbl)
        dup             v30.4s, w12  // 6 - intermediate_bits
        ldrh            w9, [x10, x9, lsl #1]
        neg             v30.4s, v30.4s  // -(6-intermediate_bits)
.ifc \type, put
        dup             v29.8h, \bdmax  // intermediate_bits
.else
        movi            v28.8h, #(PREP_BIAS >> 8), lsl #8
.endif
        sub             x10, x10, w9, uxtw
.ifc \type, put
        neg             v29.8h, v29.8h  // -intermediate_bits
.endif
        br              x10

20:     // 2xN h
.ifc \type, put
        add             \xmx, \xmx, #2
        ld1             {v0.s}[0], [\xmx]
        sub             \src, \src, #2
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1
        sxtl            v0.8h, v0.8b
2:
        ld1             {v4.8h}, [\src], \s_strd
        ld1             {v6.8h}, [\sr2], \s_strd
        ext             v5.16b, v4.16b, v4.16b, #2
        ext             v7.16b, v6.16b, v6.16b, #2
        subs            \h, \h, #2
        trn1            v3.2s, v4.2s, v6.2s
        trn2            v6.2s, v4.2s, v6.2s
        trn1            v4.2s, v5.2s, v7.2s
        trn2            v7.2s, v5.2s, v7.2s
        smull           v3.4s, v3.4h, v0.h[0]
        smlal           v3.4s, v4.4h, v0.h[1]
        smlal           v3.4s, v6.4h, v0.h[2]
        smlal           v3.4s, v7.4h, v0.h[3]
        srshl           v3.4s, v3.4s, v30.4s  // -(6-intermediate_bits)
        sqxtun          v3.4h, v3.4s
        srshl           v3.4h, v3.4h, v29.4h  // -intermediate_bits
        umin            v3.4h, v3.4h, v31.4h
        st1             {v3.s}[0], [\dst], \d_strd
        st1             {v3.s}[1], [\ds2], \d_strd
        b.gt            2b
        ret
.endif

40:     // 4xN h
        add             \xmx, \xmx, #2
        ld1             {v0.s}[0], [\xmx]
        sub             \src, \src, #2
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1
        sxtl            v0.8h, v0.8b
4:
        ld1             {v16.8h}, [\src], \s_strd
        ld1             {v20.8h}, [\sr2], \s_strd
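        // One 8-pixel load per row covers all four taps of the four output
        // pixels; the ext instructions below produce the same row shifted
        // by 1, 2 and 3 pixels.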
        ext             v17.16b, v16.16b, v16.16b, #2
        ext             v18.16b, v16.16b, v16.16b, #4
        ext             v19.16b, v16.16b, v16.16b, #6
        ext             v21.16b, v20.16b, v20.16b, #2
        ext             v22.16b, v20.16b, v20.16b, #4
        ext             v23.16b, v20.16b, v20.16b, #6
        subs            \h, \h, #2
        smull           v16.4s, v16.4h, v0.h[0]
        smlal           v16.4s, v17.4h, v0.h[1]
        smlal           v16.4s, v18.4h, v0.h[2]
        smlal           v16.4s, v19.4h, v0.h[3]
        smull           v20.4s, v20.4h, v0.h[0]
        smlal           v20.4s, v21.4h, v0.h[1]
        smlal           v20.4s, v22.4h, v0.h[2]
        smlal           v20.4s, v23.4h, v0.h[3]
        srshl           v16.4s, v16.4s, v30.4s  // -(6-intermediate_bits)
        srshl           v20.4s, v20.4s, v30.4s  // -(6-intermediate_bits)
.ifc \type, put
        sqxtun          v16.4h, v16.4s
        sqxtun2         v16.8h, v20.4s
        srshl           v16.8h, v16.8h, v29.8h  // -intermediate_bits
        umin            v16.8h, v16.8h, v31.8h
.else
        xtn             v16.4h, v16.4s
        xtn2            v16.8h, v20.4s
        sub             v16.8h, v16.8h, v28.8h  // PREP_BIAS
.endif
        st1             {v16.d}[0], [\dst], \d_strd
        st1             {v16.d}[1], [\ds2], \d_strd
        b.gt            4b
        ret

80:
160:
320:
640:
1280:   // 8xN, 16xN, 32xN, ... h
        ld1             {v0.8b}, [\xmx]
        sub             \src, \src, #6
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        sxtl            v0.8h, v0.8b

        sub             \s_strd, \s_strd, \w, uxtw #1
        sub             \s_strd, \s_strd, #16
.ifc \type, put
        lsl             \d_strd, \d_strd, #1
        sub             \d_strd, \d_strd, \w, uxtw #1
.endif
81:
        ld1             {v16.8h, v17.8h}, [\src], #32
        ld1             {v20.8h, v21.8h}, [\sr2], #32
        mov             \mx, \w

8:
        smull           v18.4s, v16.4h, v0.h[0]
        smull2          v19.4s, v16.8h, v0.h[0]
        smull           v22.4s, v20.4h, v0.h[0]
        smull2          v23.4s, v20.8h, v0.h[0]
.irpc i, 1234567
        ext             v24.16b, v16.16b, v17.16b, #(2*\i)
        ext             v25.16b, v20.16b, v21.16b, #(2*\i)
        smlal           v18.4s, v24.4h, v0.h[\i]
        smlal2          v19.4s, v24.8h, v0.h[\i]
        smlal           v22.4s, v25.4h, v0.h[\i]
        smlal2          v23.4s, v25.8h, v0.h[\i]
.endr
        subs            \mx, \mx, #8
        srshl           v18.4s, v18.4s, v30.4s  // -(6-intermediate_bits)
        srshl           v19.4s, v19.4s, v30.4s  // -(6-intermediate_bits)
        srshl           v22.4s, v22.4s, v30.4s  // -(6-intermediate_bits)
        srshl           v23.4s, v23.4s, v30.4s  // -(6-intermediate_bits)
.ifc \type, put
        sqxtun          v18.4h, v18.4s
        sqxtun2         v18.8h, v19.4s
        sqxtun          v22.4h, v22.4s
        sqxtun2         v22.8h, v23.4s
        srshl           v18.8h, v18.8h, v29.8h  // -intermediate_bits
        srshl           v22.8h, v22.8h, v29.8h  // -intermediate_bits
        umin            v18.8h, v18.8h, v31.8h
        umin            v22.8h, v22.8h, v31.8h
.else
        xtn             v18.4h, v18.4s
        xtn2            v18.8h, v19.4s
        xtn             v22.4h, v22.4s
        xtn2            v22.8h, v23.4s
        sub             v18.8h, v18.8h, v28.8h  // PREP_BIAS
        sub             v22.8h, v22.8h, v28.8h  // PREP_BIAS
.endif
        st1             {v18.8h}, [\dst], #16
        st1             {v22.8h}, [\ds2], #16
        b.le            9f

        mov             v16.16b, v17.16b
        mov             v20.16b, v21.16b
        ld1             {v17.8h}, [\src], #16
        ld1             {v21.8h}, [\sr2], #16
        b               8b

9:
        add             \dst, \dst, \d_strd
        add             \ds2, \ds2, \d_strd
        add             \src, \src, \s_strd
        add             \sr2, \sr2, \s_strd

        subs            \h, \h, #2
        b.gt            81b
        ret

L(\type\()_8tap_h_tbl):
        .hword L(\type\()_8tap_h_tbl) - 1280b
        .hword L(\type\()_8tap_h_tbl) -  640b
        .hword L(\type\()_8tap_h_tbl) -  320b
        .hword L(\type\()_8tap_h_tbl) -  160b
        .hword L(\type\()_8tap_h_tbl) -   80b
        .hword L(\type\()_8tap_h_tbl) -   40b
        .hword L(\type\()_8tap_h_tbl) -   20b
        .hword 0


L(\type\()_8tap_v):
        cmp             \h, #4
        ubfx            w10, \my, #7, #7
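        // \my packs two candidate mc_subpel_filters offsets (bits 0-6 and
        // 7-13, cf. the "my, 8tap_v, 4tap_v" setup above); heights of 4 or
        // less keep the low (4-tap capable) field, larger heights take the
        // high one.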
        and             \my, \my, #0x7f
        b.le            4f
        mov             \my, w10
4:
        add             \xmy, x11, \my, uxtw #3

.ifc \type, prep
        dup             v30.4s, w12  // 6 - intermediate_bits
        movi            v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
        adr             x10, L(\type\()_8tap_v_tbl)
        ldrh            w9, [x10, x9, lsl #1]
.ifc \type, prep
        neg             v30.4s, v30.4s  // -(6-intermediate_bits)
.endif
        sub             x10, x10, w9, uxtw
        br              x10

20:     // 2xN v
.ifc \type, put
        b.gt            28f

        cmp             \h, #2
        add             \xmy, \xmy, #2
        ld1             {v0.s}[0], [\xmy]
        sub             \src, \src, \s_strd
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b

        // 2x2 v
        load_s          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
        interleave_1_s  v1, v2, v3, v4, v5
        b.gt            24f
        smull_smlal_4   v6, v1, v2, v3, v4
        sqrshrun_h      6, v6
        umin_h          v31, .8h, v6
        st_s            \d_strd, v6, 2
        ret

24:     // 2x4 v
        load_s          \sr2, \src, \s_strd, v6, v7
        interleave_1_s  v5, v6, v7
        smull_smlal_4   v16, v1, v2, v3, v4
        smull_smlal_4   v17, v3, v4, v5, v6
        sqrshrun_h      6, v16, v17
        umin_h          v31, .8h, v16
        st_s            \d_strd, v16, 4
        ret

28:     // 2x8, 2x16 v
        ld1             {v0.8b}, [\xmy]
        sub             \sr2, \src, \s_strd, lsl #1
        add             \ds2, \dst, \d_strd
        sub             \src, \sr2, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1
        sxtl            v0.8h, v0.8b

        load_s          \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7
        interleave_1_s  v1, v2, v3, v4, v5
        interleave_1_s  v5, v6, v7
216:
        subs            \h, \h, #8
        load_s          \sr2, \src, \s_strd, v16, v17, v18, v19
        load_s          \sr2, \src, \s_strd, v20, v21, v22, v23
        interleave_1_s  v7, v16, v17, v18, v19
        interleave_1_s  v19, v20, v21, v22, v23
        smull_smlal_8   v24, v1, v2, v3, v4, v5, v6, v7, v16
        smull_smlal_8   v25, v3, v4, v5, v6, v7, v16, v17, v18
        smull_smlal_8   v26, v5, v6, v7, v16, v17, v18, v19, v20
        smull_smlal_8   v27, v7, v16, v17, v18, v19, v20, v21, v22
        sqrshrun_h      6, v24, v25, v26, v27
        umin_h          v31, .8h, v24, v26
        st_s            \d_strd, v24, 4
        st_s            \d_strd, v26, 4
        b.le            0f
        mov             v1.16b, v17.16b
        mov             v2.16b, v18.16b
        mov             v3.16b, v19.16b
        mov             v4.16b, v20.16b
        mov             v5.16b, v21.16b
        mov             v6.16b, v22.16b
        mov             v7.16b, v23.16b
        b               216b
0:
        ret
.endif

40:
        b.gt            480f

        // 4x2, 4x4 v
        cmp             \h, #2
        add             \xmy, \xmy, #2
        ld1             {v0.s}[0], [\xmy]
        sub             \src, \src, \s_strd
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b

        load_4h         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
        smull_smlal_4   v6, v1, v2, v3, v4
        smull_smlal_4   v7, v2, v3, v4, v5
        shift_store_4   \type, \d_strd, v6, v7
        b.le            0f
        load_4h         \sr2, \src, \s_strd, v6, v7
        smull_smlal_4   v1, v3, v4, v5, v6
        smull_smlal_4   v2, v4, v5, v6, v7
        shift_store_4   \type, \d_strd, v1, v2
0:
        ret

480:    // 4x8, 4x16 v
        ld1             {v0.8b}, [\xmy]
        sub             \sr2, \src, \s_strd, lsl #1
        add             \ds2, \dst, \d_strd
        sub             \src, \sr2, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b

        load_4h         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22

48:
        subs            \h, \h, #4
        load_4h         \sr2, \src, \s_strd, v23, v24, v25, v26
        smull_smlal_8   v1, v16, v17, v18, v19, v20, v21, v22, v23
        smull_smlal_8   v2, v17, v18, v19, v20, v21, v22, v23, v24
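        // Each smull_smlal_8 slides the 8-row tap window down by one row,
        // so the four outputs share seven of their eight input rows.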
        smull_smlal_8   v3, v18, v19, v20, v21, v22, v23, v24, v25
        smull_smlal_8   v4, v19, v20, v21, v22, v23, v24, v25, v26
        shift_store_4   \type, \d_strd, v1, v2, v3, v4
        b.le            0f
        mov             v16.8b, v20.8b
        mov             v17.8b, v21.8b
        mov             v18.8b, v22.8b
        mov             v19.8b, v23.8b
        mov             v20.8b, v24.8b
        mov             v21.8b, v25.8b
        mov             v22.8b, v26.8b
        b               48b
0:
        ret

80:
        b.gt            880f

        // 8x2, 8x4 v
        cmp             \h, #2
        add             \xmy, \xmy, #2
        ld1             {v0.s}[0], [\xmy]
        sub             \src, \src, \s_strd
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b

        load_8h         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
        smull_smlal_4   v16, v1, v2, v3, v4
        smull2_smlal2_4 v17, v1, v2, v3, v4
        smull_smlal_4   v18, v2, v3, v4, v5
        smull2_smlal2_4 v19, v2, v3, v4, v5
        shift_store_8   \type, \d_strd, v16, v17, v18, v19
        b.le            0f
        load_8h         \sr2, \src, \s_strd, v6, v7
        smull_smlal_4   v16, v3, v4, v5, v6
        smull2_smlal2_4 v17, v3, v4, v5, v6
        smull_smlal_4   v18, v4, v5, v6, v7
        smull2_smlal2_4 v19, v4, v5, v6, v7
        shift_store_8   \type, \d_strd, v16, v17, v18, v19
0:
        ret

880:    // 8x6, 8x8, 8x16, 8x32 v
1680:   // 16x8, 16x16, ...
320:    // 32x8, 32x16, ...
640:
1280:
        ld1             {v0.8b}, [\xmy]
        sub             \src, \src, \s_strd
        sub             \src, \src, \s_strd, lsl #1
        sxtl            v0.8h, v0.8b
        mov             \my, \h
168:
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        load_8h         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22

88:
        subs            \h, \h, #2
        load_8h         \sr2, \src, \s_strd, v23, v24
        smull_smlal_8   v1, v16, v17, v18, v19, v20, v21, v22, v23
        smull2_smlal2_8 v2, v16, v17, v18, v19, v20, v21, v22, v23
        smull_smlal_8   v3, v17, v18, v19, v20, v21, v22, v23, v24
        smull2_smlal2_8 v4, v17, v18, v19, v20, v21, v22, v23, v24
        shift_store_8   \type, \d_strd, v1, v2, v3, v4
        b.le            9f
        subs            \h, \h, #2
        load_8h         \sr2, \src, \s_strd, v25, v26
        smull_smlal_8   v1, v18, v19, v20, v21, v22, v23, v24, v25
        smull2_smlal2_8 v2, v18, v19, v20, v21, v22, v23, v24, v25
        smull_smlal_8   v3, v19, v20, v21, v22, v23, v24, v25, v26
        smull2_smlal2_8 v4, v19, v20, v21, v22, v23, v24, v25, v26
        shift_store_8   \type, \d_strd, v1, v2, v3, v4
        b.le            9f
        mov             v16.16b, v20.16b
        mov             v17.16b, v21.16b
        mov             v18.16b, v22.16b
        mov             v19.16b, v23.16b
        mov             v20.16b, v24.16b
        mov             v21.16b, v25.16b
        mov             v22.16b, v26.16b
        b               88b
9:
        subs            \w, \w, #8
        b.le            0f
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #3
        mov             \h, \my
        add             \src, \src, #16
        add             \dst, \dst, #16
        b               168b
0:
        ret

160:
        b.gt            1680b

        // 16x2, 16x4 v
        add             \xmy, \xmy, #2
        ld1             {v0.s}[0], [\xmy]
        sub             \src, \src, \s_strd
        sxtl            v0.8h, v0.8b

        load_16h        \src, \src, \s_strd, v16, v17, v18, v19, v20, v21
16:
        load_16h        \src, \src, \s_strd, v22, v23
        subs            \h, \h, #1
        smull_smlal_4   v1, v16, v18, v20, v22
        smull2_smlal2_4 v2, v16, v18, v20, v22
        smull_smlal_4   v3, v17, v19, v21, v23
        smull2_smlal2_4 v4, v17, v19, v21, v23
        shift_store_16  \type, \d_strd, x0, v1, v2, v3, v4
        b.le            0f
        mov             v16.16b, v18.16b
        mov             v17.16b, v19.16b
        mov             v18.16b, v20.16b
        mov             v19.16b, v21.16b
        mov             v20.16b, v22.16b
        mov             v21.16b, v23.16b
        b               16b
0:
        ret

L(\type\()_8tap_v_tbl):
        .hword L(\type\()_8tap_v_tbl) - 1280b
        .hword L(\type\()_8tap_v_tbl) -  640b
        .hword L(\type\()_8tap_v_tbl) -  320b
        .hword L(\type\()_8tap_v_tbl) -  160b
        .hword L(\type\()_8tap_v_tbl) -   80b
        .hword L(\type\()_8tap_v_tbl) -   40b
        .hword L(\type\()_8tap_v_tbl) -   20b
        .hword 0

L(\type\()_8tap_hv):
        cmp             \h, #4
        ubfx            w10, \my, #7, #7
        and             \my, \my, #0x7f
        b.le            4f
        mov             \my, w10
4:
        add             \xmy, x11, \my, uxtw #3

        adr             x10, L(\type\()_8tap_hv_tbl)
        dup             v30.4s, w12            // 6 - intermediate_bits
        ldrh            w9, [x10, x9, lsl #1]
        neg             v30.4s, v30.4s         // -(6-intermediate_bits)
.ifc \type, put
        dup             v29.4s, w13            // 6 + intermediate_bits
.else
        movi            v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
        sub             x10, x10, w9, uxtw
.ifc \type, put
        neg             v29.4s, v29.4s         // -(6+intermediate_bits)
.endif
        br              x10

20:
.ifc \type, put
        add             \xmx, \xmx, #2
        ld1             {v0.s}[0], [\xmx]
        b.gt            280f
        add             \xmy, \xmy, #2
        ld1             {v1.s}[0], [\xmy]

        // 2x2, 2x4 hv
        sub             \sr2, \src, #2
        sub             \src, \sr2, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b
        sxtl            v1.8h, v1.8b
        mov             x15, x30

        ld1             {v27.8h}, [\src], \s_strd
        ext             v28.16b, v27.16b, v27.16b, #2
        smull           v27.4s, v27.4h, v0.4h
        smull           v28.4s, v28.4h, v0.4h
        addp            v27.4s, v27.4s, v28.4s
        addp            v16.4s, v27.4s, v27.4s
        srshl           v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
        bl              L(\type\()_8tap_filter_2)
        // The intermediates from the horizontal pass fit in 16 bit without
        // any bias; we could just as well keep them as .4s, but narrowing
        // them to .4h gives a significant speedup on out of order cores
        // (at the cost of a smaller slowdown on in-order cores such as A53).
        xtn             v16.4h, v16.4s

        trn1            v16.2s, v16.2s, v24.2s
        mov             v17.8b, v24.8b

2:
        bl              L(\type\()_8tap_filter_2)

        ext             v18.8b, v17.8b, v24.8b, #4
        smull           v2.4s, v16.4h, v1.h[0]
        smlal           v2.4s, v17.4h, v1.h[1]
        smlal           v2.4s, v18.4h, v1.h[2]
        smlal           v2.4s, v24.4h, v1.h[3]

        srshl           v2.4s, v2.4s, v29.4s   // -(6+intermediate_bits)
        sqxtun          v2.4h, v2.4s
        umin            v2.4h, v2.4h, v31.4h
        subs            \h, \h, #2
        st1             {v2.s}[0], [\dst], \d_strd
        st1             {v2.s}[1], [\ds2], \d_strd
        b.le            0f
        mov             v16.8b, v18.8b
        mov             v17.8b, v24.8b
        b               2b

280: // 2x8, 2x16, 2x32 hv
        ld1             {v1.8b}, [\xmy]
        sub             \src, \src, #2
        sub             \sr2, \src, \s_strd, lsl #1
        sub             \src, \sr2, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b
        sxtl            v1.8h, v1.8b
        mov             x15, x30

        ld1             {v27.8h}, [\src], \s_strd
        ext             v28.16b, v27.16b, v27.16b, #2
        smull           v27.4s, v27.4h, v0.4h
        smull           v28.4s, v28.4h, v0.4h
        addp            v27.4s, v27.4s, v28.4s
        addp            v16.4s, v27.4s, v27.4s
        srshl           v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
        // The intermediates from the horizontal pass fit in 16 bit without
        // any bias; we could just as well keep them as .4s, but narrowing
        // them to .4h gives a significant speedup on out of order cores
        // (at the cost of a smaller slowdown on in-order cores such as A53).
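        // (As a rough bound: the filters are 7 bit, so the horizontal
        // sums stay within about bitdepth+8 bits, and the
        // >> (6 - intermediate_bits) above, i.e. >> (bitdepth - 8),
        // brings them back into 16 bit range.)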

        bl              L(\type\()_8tap_filter_2)
        xtn             v16.4h, v16.4s
        trn1            v16.2s, v16.2s, v24.2s
        mov             v17.8b, v24.8b
        bl              L(\type\()_8tap_filter_2)
        ext             v18.8b, v17.8b, v24.8b, #4
        mov             v19.8b, v24.8b
        bl              L(\type\()_8tap_filter_2)
        ext             v20.8b, v19.8b, v24.8b, #4
        mov             v21.8b, v24.8b

28:
        bl              L(\type\()_8tap_filter_2)
        ext             v22.8b, v21.8b, v24.8b, #4
        smull           v3.4s, v16.4h, v1.h[0]
        smlal           v3.4s, v17.4h, v1.h[1]
        smlal           v3.4s, v18.4h, v1.h[2]
        smlal           v3.4s, v19.4h, v1.h[3]
        smlal           v3.4s, v20.4h, v1.h[4]
        smlal           v3.4s, v21.4h, v1.h[5]
        smlal           v3.4s, v22.4h, v1.h[6]
        smlal           v3.4s, v24.4h, v1.h[7]

        srshl           v3.4s, v3.4s, v29.4s   // -(6+intermediate_bits)
        sqxtun          v3.4h, v3.4s
        umin            v3.4h, v3.4h, v31.4h
        subs            \h, \h, #2
        st1             {v3.s}[0], [\dst], \d_strd
        st1             {v3.s}[1], [\ds2], \d_strd
        b.le            0f
        mov             v16.8b, v18.8b
        mov             v17.8b, v19.8b
        mov             v18.8b, v20.8b
        mov             v19.8b, v21.8b
        mov             v20.8b, v22.8b
        mov             v21.8b, v24.8b
        b               28b

0:
        br              x15

L(\type\()_8tap_filter_2):
        ld1             {v25.8h}, [\sr2], \s_strd
        ld1             {v27.8h}, [\src], \s_strd
        ext             v26.16b, v25.16b, v25.16b, #2
        ext             v28.16b, v27.16b, v27.16b, #2
        trn1            v24.2s, v25.2s, v27.2s
        trn2            v27.2s, v25.2s, v27.2s
        trn1            v25.2s, v26.2s, v28.2s
        trn2            v28.2s, v26.2s, v28.2s
        smull           v24.4s, v24.4h, v0.h[0]
        smlal           v24.4s, v25.4h, v0.h[1]
        smlal           v24.4s, v27.4h, v0.h[2]
        smlal           v24.4s, v28.4h, v0.h[3]
        srshl           v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
        xtn             v24.4h, v24.4s
        ret
.endif

40:
        add             \xmx, \xmx, #2
        ld1             {v0.s}[0], [\xmx]
        b.gt            480f
        add             \xmy, \xmy, #2
        ld1             {v1.s}[0], [\xmy]
        sub             \sr2, \src, #2
        sub             \src, \sr2, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b
        sxtl            v1.8h, v1.8b
        mov             x15, x30

        // 4x2, 4x4 hv
        ld1             {v25.8h}, [\src], \s_strd
        ext             v26.16b, v25.16b, v25.16b, #2
        ext             v27.16b, v25.16b, v25.16b, #4
        ext             v28.16b, v25.16b, v25.16b, #6
        smull           v25.4s, v25.4h, v0.h[0]
        smlal           v25.4s, v26.4h, v0.h[1]
        smlal           v25.4s, v27.4h, v0.h[2]
        smlal           v25.4s, v28.4h, v0.h[3]
        srshl           v16.4s, v25.4s, v30.4s // -(6-intermediate_bits)
        // The intermediates from the horizontal pass fit in 16 bit without
        // any bias; we could just as well keep them as .4s, but narrowing
        // them to .4h gives a significant speedup on out of order cores
        // (at the cost of a smaller slowdown on in-order cores such as A53).
        xtn             v16.4h, v16.4s

        bl              L(\type\()_8tap_filter_4)
        mov             v17.8b, v24.8b
        mov             v18.8b, v25.8b

4:
        bl              L(\type\()_8tap_filter_4)
        smull           v2.4s, v16.4h, v1.h[0]
        smlal           v2.4s, v17.4h, v1.h[1]
        smlal           v2.4s, v18.4h, v1.h[2]
        smlal           v2.4s, v24.4h, v1.h[3]
        smull           v3.4s, v17.4h, v1.h[0]
        smlal           v3.4s, v18.4h, v1.h[1]
        smlal           v3.4s, v24.4h, v1.h[2]
        smlal           v3.4s, v25.4h, v1.h[3]
.ifc \type, put
        srshl           v2.4s, v2.4s, v29.4s   // -(6+intermediate_bits)
        srshl           v3.4s, v3.4s, v29.4s   // -(6+intermediate_bits)
        sqxtun          v2.4h, v2.4s
        sqxtun2         v2.8h, v3.4s
        umin            v2.8h, v2.8h, v31.8h
.else
        rshrn           v2.4h, v2.4s, #6
        rshrn2          v2.8h, v3.4s, #6
        sub             v2.8h, v2.8h, v29.8h   // PREP_BIAS
.endif
        subs            \h, \h, #2

        st1             {v2.d}[0], [\dst], \d_strd
        st1             {v2.d}[1], [\ds2], \d_strd
        b.le            0f
        mov             v16.8b, v18.8b
        mov             v17.8b, v24.8b
        mov             v18.8b, v25.8b
        b               4b

480: // 4x8, 4x16, 4x32 hv
        ld1             {v1.8b}, [\xmy]
        sub             \src, \src, #2
        sub             \sr2, \src, \s_strd, lsl #1
        sub             \src, \sr2, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b
        sxtl            v1.8h, v1.8b
        mov             x15, x30

        ld1             {v25.8h}, [\src], \s_strd
        ext             v26.16b, v25.16b, v25.16b, #2
        ext             v27.16b, v25.16b, v25.16b, #4
        ext             v28.16b, v25.16b, v25.16b, #6
        smull           v25.4s, v25.4h, v0.h[0]
        smlal           v25.4s, v26.4h, v0.h[1]
        smlal           v25.4s, v27.4h, v0.h[2]
        smlal           v25.4s, v28.4h, v0.h[3]
        srshl           v16.4s, v25.4s, v30.4s // -(6-intermediate_bits)
        // The intermediates from the horizontal pass fit in 16 bit without
        // any bias; we could just as well keep them as .4s, but narrowing
        // them to .4h gives a significant speedup on out of order cores
        // (at the cost of a smaller slowdown on in-order cores such as A53).
        xtn             v16.4h, v16.4s

        bl              L(\type\()_8tap_filter_4)
        mov             v17.8b, v24.8b
        mov             v18.8b, v25.8b
        bl              L(\type\()_8tap_filter_4)
        mov             v19.8b, v24.8b
        mov             v20.8b, v25.8b
        bl              L(\type\()_8tap_filter_4)
        mov             v21.8b, v24.8b
        mov             v22.8b, v25.8b

48:
        bl              L(\type\()_8tap_filter_4)
        smull           v3.4s, v16.4h, v1.h[0]
        smlal           v3.4s, v17.4h, v1.h[1]
        smlal           v3.4s, v18.4h, v1.h[2]
        smlal           v3.4s, v19.4h, v1.h[3]
        smlal           v3.4s, v20.4h, v1.h[4]
        smlal           v3.4s, v21.4h, v1.h[5]
        smlal           v3.4s, v22.4h, v1.h[6]
        smlal           v3.4s, v24.4h, v1.h[7]
        smull           v4.4s, v17.4h, v1.h[0]
        smlal           v4.4s, v18.4h, v1.h[1]
        smlal           v4.4s, v19.4h, v1.h[2]
        smlal           v4.4s, v20.4h, v1.h[3]
        smlal           v4.4s, v21.4h, v1.h[4]
        smlal           v4.4s, v22.4h, v1.h[5]
        smlal           v4.4s, v24.4h, v1.h[6]
        smlal           v4.4s, v25.4h, v1.h[7]
.ifc \type, put
        srshl           v3.4s, v3.4s, v29.4s   // -(6+intermediate_bits)
        srshl           v4.4s, v4.4s, v29.4s   // -(6+intermediate_bits)
        sqxtun          v3.4h, v3.4s
        sqxtun2         v3.8h, v4.4s
        umin            v3.8h, v3.8h, v31.8h
.else
        rshrn           v3.4h, v3.4s, #6
        rshrn2          v3.8h, v4.4s, #6
        sub             v3.8h, v3.8h, v29.8h   // PREP_BIAS
.endif
        subs            \h, \h, #2
        st1             {v3.d}[0], [\dst], \d_strd
        st1             {v3.d}[1], [\ds2], \d_strd
        b.le            0f
        mov             v16.8b, v18.8b
        mov             v17.8b, v19.8b
        mov             v18.8b, v20.8b
        mov             v19.8b, v21.8b
        mov             v20.8b, v22.8b
        mov             v21.8b, v24.8b
        mov             v22.8b, v25.8b
        b               48b
0:
        br              x15

L(\type\()_8tap_filter_4):
        ld1             {v24.8h}, [\sr2], \s_strd
        ld1             {v25.8h}, [\src], \s_strd
        ext             v26.16b, v24.16b, v24.16b, #2
        ext             v27.16b, v24.16b, v24.16b, #4
        ext             v28.16b, v24.16b, v24.16b, #6
        smull           v24.4s, v24.4h, v0.h[0]
        smlal           v24.4s, v26.4h, v0.h[1]
        smlal           v24.4s, v27.4h, v0.h[2]
        smlal           v24.4s, v28.4h, v0.h[3]
        ext             v26.16b, v25.16b, v25.16b, #2
        ext             v27.16b, v25.16b, v25.16b, #4
        ext             v28.16b, v25.16b, v25.16b, #6
        smull           v25.4s, v25.4h, v0.h[0]
        smlal           v25.4s, v26.4h, v0.h[1]
        smlal           v25.4s, v27.4h, v0.h[2]
        smlal           v25.4s, v28.4h, v0.h[3]
        srshl           v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
        srshl           v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
        xtn             v24.4h, v24.4s
        xtn             v25.4h, v25.4s
        ret

80:
160:
320:
        b.gt            880f
        add             \xmy, \xmy, #2
        ld1             {v0.8b}, [\xmx]
        ld1             {v1.s}[0], [\xmy]
        sub             \src, \src, #6
        sub             \src, \src, \s_strd
        sxtl            v0.8h, v0.8b
        sxtl            v1.8h, v1.8b
        mov             x15, x30
        mov             \my, \h

164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1

        ld1             {v27.8h, v28.8h}, [\src], \s_strd
        smull           v24.4s, v27.4h, v0.h[0]
        smull2          v25.4s, v27.8h, v0.h[0]
.irpc i, 1234567
        ext             v26.16b, v27.16b, v28.16b, #(2*\i)
        smlal           v24.4s, v26.4h, v0.h[\i]
        smlal2          v25.4s, v26.8h, v0.h[\i]
.endr
        srshl           v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
        srshl           v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
        // The intermediates from the horizontal pass fit in 16 bit without
        // any bias; we could just as well keep them as .4s, but narrowing
        // them to .4h gives a significant speedup on out of order cores
        // (at the cost of a smaller slowdown on in-order cores such as A53),
        // and conserves register space (no need to clobber v8-v15).
        xtn             v16.4h, v24.4s
        xtn2            v16.8h, v25.4s

        bl              L(\type\()_8tap_filter_8)
        mov             v17.16b, v23.16b
        mov             v18.16b, v24.16b

8:
        smull           v2.4s, v16.4h, v1.h[0]
        smull2          v3.4s, v16.8h, v1.h[0]
        bl              L(\type\()_8tap_filter_8)
        smull           v4.4s, v17.4h, v1.h[0]
        smull2          v5.4s, v17.8h, v1.h[0]
        smlal           v2.4s, v17.4h, v1.h[1]
        smlal2          v3.4s, v17.8h, v1.h[1]
        smlal           v4.4s, v18.4h, v1.h[1]
        smlal2          v5.4s, v18.8h, v1.h[1]
        smlal           v2.4s, v18.4h, v1.h[2]
        smlal2          v3.4s, v18.8h, v1.h[2]
        smlal           v4.4s, v23.4h, v1.h[2]
        smlal2          v5.4s, v23.8h, v1.h[2]
        smlal           v2.4s, v23.4h, v1.h[3]
        smlal2          v3.4s, v23.8h, v1.h[3]
        smlal           v4.4s, v24.4h, v1.h[3]
        smlal2          v5.4s, v24.8h, v1.h[3]
.ifc \type, put
        srshl           v2.4s, v2.4s, v29.4s   // -(6+intermediate_bits)
        srshl           v3.4s, v3.4s, v29.4s   // -(6+intermediate_bits)
        srshl           v4.4s, v4.4s, v29.4s   // -(6+intermediate_bits)
        srshl           v5.4s, v5.4s, v29.4s   // -(6+intermediate_bits)
        sqxtun          v2.4h, v2.4s
        sqxtun2         v2.8h, v3.4s
        sqxtun          v3.4h, v4.4s
        sqxtun2         v3.8h, v5.4s
        umin            v2.8h, v2.8h, v31.8h
        umin            v3.8h, v3.8h, v31.8h
.else
        rshrn           v2.4h, v2.4s, #6
        rshrn2          v2.8h, v3.4s, #6
        rshrn           v3.4h, v4.4s, #6
        rshrn2          v3.8h, v5.4s, #6
        sub             v2.8h, v2.8h, v29.8h   // PREP_BIAS
        sub             v3.8h, v3.8h, v29.8h   // PREP_BIAS
.endif
        subs            \h, \h, #2
        st1             {v2.8h}, [\dst], \d_strd
        st1             {v3.8h}, [\ds2], \d_strd
        b.le            9f
        mov             v16.16b, v18.16b
        mov             v17.16b, v23.16b
        mov             v18.16b, v24.16b
        b               8b
9:
        subs            \w, \w, #8
        b.le            0f
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #2
        mov             \h, \my
        add             \src, \src, #16
        add             \dst, \dst, #16
        b               164b

880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
640:
1280:
        ld1             {v0.8b}, [\xmx]
        ld1             {v1.8b}, [\xmy]
        sub             \src, \src, #6
        sub             \src, \src, \s_strd
        sub             \src, \src, \s_strd, lsl #1
        sxtl            v0.8h, v0.8b
        sxtl            v1.8h, v1.8b
        mov             x15, x30
        mov             \my, \h

168:
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1

        ld1             {v27.8h, v28.8h}, [\src], \s_strd
        smull           v24.4s, v27.4h, v0.h[0]
        smull2          v25.4s, v27.8h, v0.h[0]
.irpc i, 1234567
        ext             v26.16b, v27.16b, v28.16b, #(2*\i)
        smlal           v24.4s, v26.4h, v0.h[\i]
        smlal2          v25.4s, v26.8h, v0.h[\i]
.endr
        srshl           v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
        srshl           v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
        // The intermediates from the horizontal pass fit in 16 bit without
        // any bias; we could just as well keep them as .4s, but narrowing
        // them to .4h gives a significant speedup on out of order cores
        // (at the cost of a smaller slowdown on in-order cores such as A53),
        // and conserves register space (no need to clobber v8-v15).
        xtn             v16.4h, v24.4s
        xtn2            v16.8h, v25.4s

        bl              L(\type\()_8tap_filter_8)
        mov             v17.16b, v23.16b
        mov             v18.16b, v24.16b
        bl              L(\type\()_8tap_filter_8)
        mov             v19.16b, v23.16b
        mov             v20.16b, v24.16b
        bl              L(\type\()_8tap_filter_8)
        mov             v21.16b, v23.16b
        mov             v22.16b, v24.16b

88:
        smull           v2.4s, v16.4h, v1.h[0]
        smull2          v3.4s, v16.8h, v1.h[0]
        bl              L(\type\()_8tap_filter_8)
        smull           v4.4s, v17.4h, v1.h[0]
        smull2          v5.4s, v17.8h, v1.h[0]
        smlal           v2.4s, v17.4h, v1.h[1]
        smlal2          v3.4s, v17.8h, v1.h[1]
        smlal           v4.4s, v18.4h, v1.h[1]
        smlal2          v5.4s, v18.8h, v1.h[1]
        smlal           v2.4s, v18.4h, v1.h[2]
        smlal2          v3.4s, v18.8h, v1.h[2]
        smlal           v4.4s, v19.4h, v1.h[2]
        smlal2          v5.4s, v19.8h, v1.h[2]
        smlal           v2.4s, v19.4h, v1.h[3]
        smlal2          v3.4s, v19.8h, v1.h[3]
        smlal           v4.4s, v20.4h, v1.h[3]
        smlal2          v5.4s, v20.8h, v1.h[3]
        smlal           v2.4s, v20.4h, v1.h[4]
        smlal2          v3.4s, v20.8h, v1.h[4]
        smlal           v4.4s, v21.4h, v1.h[4]
        smlal2          v5.4s, v21.8h, v1.h[4]
        smlal           v2.4s, v21.4h, v1.h[5]
        smlal2          v3.4s, v21.8h, v1.h[5]
        smlal           v4.4s, v22.4h, v1.h[5]
        smlal2          v5.4s, v22.8h, v1.h[5]
        smlal           v2.4s, v22.4h, v1.h[6]
        smlal2          v3.4s, v22.8h, v1.h[6]
        smlal           v4.4s, v23.4h, v1.h[6]
        smlal2          v5.4s, v23.8h, v1.h[6]
        smlal           v2.4s, v23.4h, v1.h[7]
        smlal2          v3.4s, v23.8h, v1.h[7]
        smlal           v4.4s, v24.4h, v1.h[7]
        smlal2          v5.4s, v24.8h, v1.h[7]
.ifc \type, put
        srshl           v2.4s, v2.4s, v29.4s   // -(6+intermediate_bits)
        srshl           v3.4s, v3.4s, v29.4s   // -(6+intermediate_bits)
        srshl           v4.4s, v4.4s, v29.4s   // -(6+intermediate_bits)
        srshl           v5.4s, v5.4s, v29.4s   // -(6+intermediate_bits)
        sqxtun          v2.4h, v2.4s
        sqxtun2         v2.8h, v3.4s
        sqxtun          v3.4h, v4.4s
        sqxtun2         v3.8h, v5.4s
        umin            v2.8h, v2.8h, v31.8h
        umin            v3.8h, v3.8h, v31.8h
.else
        rshrn           v2.4h, v2.4s, #6
        rshrn2          v2.8h, v3.4s, #6
        rshrn           v3.4h, v4.4s, #6
        rshrn2          v3.8h, v5.4s, #6
        sub             v2.8h, v2.8h, v29.8h   // PREP_BIAS
        sub             v3.8h, v3.8h, v29.8h   // PREP_BIAS
.endif
        subs            \h, \h, #2
        st1             {v2.8h}, [\dst], \d_strd
        st1             {v3.8h}, [\ds2], \d_strd
        b.le            9f
        mov             v16.16b, v18.16b
        mov             v17.16b, v19.16b
        mov             v18.16b, v20.16b
        mov             v19.16b, v21.16b
        mov             v20.16b, v22.16b
        mov             v21.16b, v23.16b
        mov             v22.16b, v24.16b
        b               88b
9:
        subs            \w, \w, #8
        b.le            0f
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #3
        mov             \h, \my
        add             \src, \src, #16
        add             \dst, \dst, #16
        b               168b
0:
        br              x15

L(\type\()_8tap_filter_8):
        ld1             {v4.8h, v5.8h}, [\sr2], \s_strd
        ld1             {v6.8h, v7.8h}, [\src], \s_strd
        smull           v25.4s, v4.4h, v0.h[0]
        smull2          v26.4s, v4.8h, v0.h[0]
        smull           v27.4s, v6.4h, v0.h[0]
        smull2          v28.4s, v6.8h, v0.h[0]
.irpc i, 1234567
        ext             v23.16b, v4.16b, v5.16b, #(2*\i)
        ext             v24.16b, v6.16b, v7.16b, #(2*\i)
        smlal           v25.4s, v23.4h, v0.h[\i]
        smlal2          v26.4s, v23.8h, v0.h[\i]
        smlal           v27.4s, v24.4h, v0.h[\i]
        smlal2          v28.4s, v24.8h, v0.h[\i]
.endr
        srshl           v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
        srshl           v26.4s, v26.4s, v30.4s // -(6-intermediate_bits)
        srshl           v27.4s, v27.4s, v30.4s // -(6-intermediate_bits)
        srshl           v28.4s, v28.4s, v30.4s // -(6-intermediate_bits)
        xtn             v23.4h, v25.4s
        xtn2            v23.8h, v26.4s
        xtn             v24.4h, v27.4s
        xtn2            v24.8h, v28.4s
        ret

L(\type\()_8tap_hv_tbl):
        .hword L(\type\()_8tap_hv_tbl) - 1280b
        .hword L(\type\()_8tap_hv_tbl) -  640b
        .hword L(\type\()_8tap_hv_tbl) -  320b
        .hword L(\type\()_8tap_hv_tbl) -  160b
        .hword L(\type\()_8tap_hv_tbl) -   80b
        .hword L(\type\()_8tap_hv_tbl) -   40b
        .hword L(\type\()_8tap_hv_tbl) -   20b
        .hword 0
endfunc

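// Bilinear MC. Informally, the filtering below computes, per pass,
//   h: t[x] = (16 - mx) * src[x] + mx * src[x + 1]
//   v: d[x] = (16 - my) * t0[x]  + my * t1[x]   (t0/t1 = adjacent rows)
// where put rounds back to pixel range (a total shift of 8 when both
// passes run, 4 for a single pass) and prep keeps intermediate_bits of
// extra precision and subtracts PREP_BIAS.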
function \type\()_bilin_16bpc_neon, export=1
.ifc \bdmax, w8
        ldr             w8, [sp]
.endif
        dup             v1.8h, \mx
        dup             v3.8h, \my
        mov             w10, #16
        sub             w9, w10, \mx
        sub             w10, w10, \my
        dup             v0.8h, w9
        dup             v2.8h, w10
.ifc \type, prep
        uxtw            \d_strd, \w
        lsl             \d_strd, \d_strd, #1
.endif

        clz             \bdmax, \bdmax         // bitdepth_max
        clz             w9, \w
        sub             \bdmax, \bdmax, #18    // intermediate_bits = clz(bitdepth_max) - 18
        mov             w11, #4
        sub             w9, w9, #24
        sub             w11, w11, \bdmax       // 4 - intermediate_bits
        add             w12, \bdmax, #4        // 4 + intermediate_bits
        cbnz            \mx, L(\type\()_bilin_h)
        cbnz            \my, L(\type\()_bilin_v)
        b               \type\()_neon

L(\type\()_bilin_h):
        cbnz            \my, L(\type\()_bilin_hv)

        adr             x10, L(\type\()_bilin_h_tbl)
        dup             v31.8h, w11            // 4 - intermediate_bits
        ldrh            w9, [x10, x9, lsl #1]
        neg             v31.8h, v31.8h         // -(4-intermediate_bits)
.ifc \type, put
        dup             v30.8h, \bdmax         // intermediate_bits
.else
        movi            v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
        sub             x10, x10, w9, uxtw
.ifc \type, put
        neg             v30.8h, v30.8h         // -intermediate_bits
.endif
        br              x10

20: // 2xN h
.ifc \type, put
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1
2:
        ld1             {v4.4h}, [\src], \s_strd
        ld1             {v6.4h}, [\sr2], \s_strd
        ext             v5.8b, v4.8b, v4.8b, #2
        ext             v7.8b, v6.8b, v6.8b, #2
        trn1            v4.2s, v4.2s, v6.2s
        trn1            v5.2s, v5.2s, v7.2s
        subs            \h, \h, #2
        mul             v4.4h, v4.4h, v0.4h
        mla             v4.4h, v5.4h, v1.4h
        urshl           v4.4h, v4.4h, v31.4h
        urshl           v4.4h, v4.4h, v30.4h
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
        b.gt            2b
        ret
.endif

40: // 4xN h
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1
4:
        ld1             {v4.8h}, [\src], \s_strd
        ld1             {v6.8h}, [\sr2], \s_strd
        ext             v5.16b, v4.16b, v4.16b, #2
        ext             v7.16b, v6.16b, v6.16b, #2
        trn1            v4.2d, v4.2d, v6.2d
        trn1            v5.2d, v5.2d, v7.2d
        subs            \h, \h, #2
        mul             v4.8h, v4.8h, v0.8h
        mla             v4.8h, v5.8h, v1.8h
        urshl           v4.8h, v4.8h, v31.8h
.ifc \type, put
        urshl           v4.8h, v4.8h, v30.8h
.else
        sub             v4.8h, v4.8h, v29.8h
.endif
        st1             {v4.d}[0], [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
        b.gt            4b
        ret

80: // 8xN h
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1
8:
        ldr             h5, [\src, #16]
        ldr             h7, [\sr2, #16]
        ld1             {v4.8h}, [\src], \s_strd
        ld1             {v6.8h}, [\sr2], \s_strd
        ext             v5.16b, v4.16b, v5.16b, #2
        ext             v7.16b, v6.16b, v7.16b, #2
        subs            \h, \h, #2
        mul             v4.8h, v4.8h, v0.8h
        mla             v4.8h, v5.8h, v1.8h
        mul             v6.8h, v6.8h, v0.8h
        mla             v6.8h, v7.8h, v1.8h
        urshl           v4.8h, v4.8h, v31.8h
        urshl           v6.8h, v6.8h, v31.8h
.ifc \type, put
        urshl           v4.8h, v4.8h, v30.8h
        urshl           v6.8h, v6.8h, v30.8h
.else
        sub             v4.8h, v4.8h, v29.8h
        sub             v6.8h, v6.8h, v29.8h
.endif
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v6.8h}, [\ds2], \d_strd
        b.gt            8b
        ret
160:
320:
640:
1280: // 16xN, 32xN, ... h
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1

        sub             \s_strd, \s_strd, \w, uxtw #1
        sub             \s_strd, \s_strd, #16
.ifc \type, put
        lsl             \d_strd, \d_strd, #1
        sub             \d_strd, \d_strd, \w, uxtw #1
.endif
161:
        ld1             {v16.8h}, [\src], #16
        ld1             {v21.8h}, [\sr2], #16
        mov             \mx, \w

16:
        ld1             {v17.8h, v18.8h}, [\src], #32
        ld1             {v22.8h, v23.8h}, [\sr2], #32
        ext             v19.16b, v16.16b, v17.16b, #2
        ext             v20.16b, v17.16b, v18.16b, #2
        ext             v24.16b, v21.16b, v22.16b, #2
        ext             v25.16b, v22.16b, v23.16b, #2
        mul             v16.8h, v16.8h, v0.8h
        mla             v16.8h, v19.8h, v1.8h
        mul             v17.8h, v17.8h, v0.8h
        mla             v17.8h, v20.8h, v1.8h
        mul             v21.8h, v21.8h, v0.8h
        mla             v21.8h, v24.8h, v1.8h
        mul             v22.8h, v22.8h, v0.8h
        mla             v22.8h, v25.8h, v1.8h
        urshl           v16.8h, v16.8h, v31.8h
        urshl           v17.8h, v17.8h, v31.8h
        urshl           v21.8h, v21.8h, v31.8h
        urshl           v22.8h, v22.8h, v31.8h
        subs            \mx, \mx, #16
.ifc \type, put
        urshl           v16.8h, v16.8h, v30.8h
        urshl           v17.8h, v17.8h, v30.8h
        urshl           v21.8h, v21.8h, v30.8h
        urshl           v22.8h, v22.8h, v30.8h
.else
        sub             v16.8h, v16.8h, v29.8h
        sub             v17.8h, v17.8h, v29.8h
        sub             v21.8h, v21.8h, v29.8h
        sub             v22.8h, v22.8h, v29.8h
.endif
        st1             {v16.8h, v17.8h}, [\dst], #32
        st1             {v21.8h, v22.8h}, [\ds2], #32
        b.le            9f

        mov             v16.16b, v18.16b
        mov             v21.16b, v23.16b
        b               16b

9:
        add             \dst, \dst, \d_strd
        add             \ds2, \ds2, \d_strd
        add             \src, \src, \s_strd
        add             \sr2, \sr2, \s_strd

        subs            \h, \h, #2
        b.gt            161b
        ret

L(\type\()_bilin_h_tbl):
        .hword L(\type\()_bilin_h_tbl) - 1280b
        .hword L(\type\()_bilin_h_tbl) -  640b
        .hword L(\type\()_bilin_h_tbl) -  320b
        .hword L(\type\()_bilin_h_tbl) -  160b
        .hword L(\type\()_bilin_h_tbl) -   80b
        .hword L(\type\()_bilin_h_tbl) -   40b
        .hword L(\type\()_bilin_h_tbl) -   20b
        .hword 0


L(\type\()_bilin_v):
        cmp             \h, #4
        adr             x10, L(\type\()_bilin_v_tbl)
.ifc \type, prep
        dup             v31.8h, w11            // 4 - intermediate_bits
.endif
        ldrh            w9, [x10, x9, lsl #1]
.ifc \type, prep
        movi            v29.8h, #(PREP_BIAS >> 8), lsl #8
        neg             v31.8h, v31.8h         // -(4-intermediate_bits)
.endif
        sub             x10, x10, w9, uxtw
        br              x10

20: // 2xN v
.ifc \type, put
        cmp             \h, #2
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        // 2x2 v
        ld1             {v16.s}[0], [\src], \s_strd
        b.gt            24f
        ld1             {v17.s}[0], [\sr2], \s_strd
        ld1             {v18.s}[0], [\src], \s_strd
        trn1            v16.2s, v16.2s, v17.2s
        trn1            v17.2s, v17.2s, v18.2s
        mul             v4.4h, v16.4h, v2.4h
        mla             v4.4h, v17.4h, v3.4h
        urshr           v4.8h, v4.8h, #4
        st1             {v4.s}[0], [\dst]
        st1             {v4.s}[1], [\ds2]
        ret
24: // 2x4, 2x8, ... v
        ld1             {v17.s}[0], [\sr2], \s_strd
        ld1             {v18.s}[0], [\src], \s_strd
        ld1             {v19.s}[0], [\sr2], \s_strd
        ld1             {v20.s}[0], [\src], \s_strd
        trn1            v16.2s, v16.2s, v17.2s
        trn1            v17.2s, v17.2s, v18.2s
        trn1            v18.2s, v18.2s, v19.2s
        trn1            v19.2s, v19.2s, v20.2s
        trn1            v16.2d, v16.2d, v18.2d
        trn1            v17.2d, v17.2d, v19.2d
        mul             v4.8h, v16.8h, v2.8h
        mla             v4.8h, v17.8h, v3.8h
        subs            \h, \h, #4
        urshr           v4.8h, v4.8h, #4
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
        st1             {v4.s}[2], [\dst], \d_strd
        st1             {v4.s}[3], [\ds2], \d_strd
        b.le            0f
        mov             v16.8b, v20.8b
        b               24b
0:
        ret
.endif

40: // 4xN v
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        ld1             {v16.4h}, [\src], \s_strd
4:
        ld1             {v17.4h}, [\sr2], \s_strd
        ld1             {v18.4h}, [\src], \s_strd
        trn1            v16.2d, v16.2d, v17.2d
        trn1            v17.2d, v17.2d, v18.2d
        mul             v4.8h, v16.8h, v2.8h
        mla             v4.8h, v17.8h, v3.8h
        subs            \h, \h, #2
.ifc \type, put
        urshr           v4.8h, v4.8h, #4
.else
        urshl           v4.8h, v4.8h, v31.8h
        sub             v4.8h, v4.8h, v29.8h
.endif
        st1             {v4.d}[0], [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
        b.le            0f
        mov             v16.8b, v18.8b
        b               4b
0:
        ret

80: // 8xN v
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        ld1             {v16.8h}, [\src], \s_strd
8:
        ld1             {v17.8h}, [\sr2], \s_strd
        ld1             {v18.8h}, [\src], \s_strd
        mul             v4.8h, v16.8h, v2.8h
        mla             v4.8h, v17.8h, v3.8h
        mul             v5.8h, v17.8h, v2.8h
        mla             v5.8h, v18.8h, v3.8h
        subs            \h, \h, #2
.ifc \type, put
        urshr           v4.8h, v4.8h, #4
        urshr           v5.8h, v5.8h, #4
.else
        urshl           v4.8h, v4.8h, v31.8h
        urshl           v5.8h, v5.8h, v31.8h
        sub             v4.8h, v4.8h, v29.8h
        sub             v5.8h, v5.8h, v29.8h
.endif
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v5.8h}, [\ds2], \d_strd
        b.le            0f
        mov             v16.16b, v18.16b
        b               8b
0:
        ret

160: // 16xN, 32xN, ...
320:
640:
1280:
        mov             \my, \h
1:
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ld1             {v16.8h, v17.8h}, [\src], \s_strd
2:
        ld1             {v18.8h, v19.8h}, [\sr2], \s_strd
        ld1             {v20.8h, v21.8h}, [\src], \s_strd
        mul             v4.8h, v16.8h, v2.8h
        mla             v4.8h, v18.8h, v3.8h
        mul             v5.8h, v17.8h, v2.8h
        mla             v5.8h, v19.8h, v3.8h
        mul             v6.8h, v18.8h, v2.8h
        mla             v6.8h, v20.8h, v3.8h
        mul             v7.8h, v19.8h, v2.8h
        mla             v7.8h, v21.8h, v3.8h
        subs            \h, \h, #2
.ifc \type, put
        urshr           v4.8h, v4.8h, #4
        urshr           v5.8h, v5.8h, #4
        urshr           v6.8h, v6.8h, #4
        urshr           v7.8h, v7.8h, #4
.else
        urshl           v4.8h, v4.8h, v31.8h
        urshl           v5.8h, v5.8h, v31.8h
        urshl           v6.8h, v6.8h, v31.8h
        urshl           v7.8h, v7.8h, v31.8h
        sub             v4.8h, v4.8h, v29.8h
        sub             v5.8h, v5.8h, v29.8h
        sub             v6.8h, v6.8h, v29.8h
        sub             v7.8h, v7.8h, v29.8h
.endif
        st1             {v4.8h, v5.8h}, [\dst], \d_strd
        st1             {v6.8h, v7.8h}, [\ds2], \d_strd
        b.le            9f
        mov             v16.16b, v20.16b
        mov             v17.16b, v21.16b
        b               2b
9:
        subs            \w, \w, #16
        b.le            0f
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #1
        mov             \h, \my
        add             \src, \src, #32
        add             \dst, \dst, #32
        b               1b
0:
        ret

L(\type\()_bilin_v_tbl):
        .hword L(\type\()_bilin_v_tbl) - 1280b
        .hword L(\type\()_bilin_v_tbl) -  640b
        .hword L(\type\()_bilin_v_tbl) -  320b
        .hword L(\type\()_bilin_v_tbl) -  160b
        .hword L(\type\()_bilin_v_tbl) -   80b
        .hword L(\type\()_bilin_v_tbl) -   40b
        .hword L(\type\()_bilin_v_tbl) -   20b
        .hword 0

L(\type\()_bilin_hv):
        adr             x10, L(\type\()_bilin_hv_tbl)
        dup             v31.8h, w11            // 4 - intermediate_bits
        ldrh            w9, [x10, x9, lsl #1]
        neg             v31.8h, v31.8h         // -(4-intermediate_bits)
.ifc \type, put
        dup             v30.4s, w12            // 4 + intermediate_bits
.else
        movi            v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
        sub             x10, x10, w9, uxtw
.ifc \type, put
        neg             v30.4s, v30.4s         // -(4+intermediate_bits)
.endif
        br              x10

20: // 2xN hv
.ifc \type, put
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ld1             {v20.4h}, [\src], \s_strd
        ext             v21.8b, v20.8b, v20.8b, #2
        mul             v16.4h, v20.4h, v0.4h
        mla             v16.4h, v21.4h, v1.4h
        urshl           v16.4h, v16.4h, v31.4h

2:
        ld1             {v22.4h}, [\sr2], \s_strd
        ld1             {v24.4h}, [\src], \s_strd
        ext             v23.8b, v22.8b, v22.8b, #2
        ext             v25.8b, v24.8b, v24.8b, #2
        trn1            v22.2s, v22.2s, v24.2s
        trn1            v23.2s, v23.2s, v25.2s
        mul             v17.4h, v22.4h, v0.4h
        mla             v17.4h, v23.4h, v1.4h
        urshl           v17.4h, v17.4h, v31.4h

        trn1            v16.2s, v16.2s, v17.2s

        umull           v4.4s, v16.4h, v2.4h
        umlal           v4.4s, v17.4h, v3.4h
        urshl           v4.4s, v4.4s, v30.4s
        xtn             v4.4h, v4.4s
        subs            \h, \h, #2
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
        b.le            0f
        trn2            v16.2s, v17.2s, v17.2s
        b               2b
0:
        ret
.endif

40: // 4xN hv
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ld1             {v20.8h}, [\src], \s_strd
        ext             v21.16b, v20.16b, v20.16b, #2
        mul             v16.4h, v20.4h, v0.4h
        mla             v16.4h, v21.4h, v1.4h
        urshl           v16.4h, v16.4h, v31.4h

4:
        ld1             {v22.8h}, [\sr2], \s_strd
        ld1             {v24.8h}, [\src], \s_strd
        ext             v23.16b, v22.16b, v22.16b, #2
        ext             v25.16b, v24.16b, v24.16b, #2
        trn1            v22.2d, v22.2d, v24.2d
        trn1            v23.2d, v23.2d, v25.2d
        mul             v17.8h, v22.8h, v0.8h
        mla             v17.8h, v23.8h, v1.8h
        urshl           v17.8h, v17.8h, v31.8h

        trn1            v16.2d, v16.2d, v17.2d

        umull           v4.4s, v16.4h, v2.4h
        umlal           v4.4s, v17.4h, v3.4h
        umull2          v5.4s, v16.8h, v2.8h
        umlal2          v5.4s, v17.8h, v3.8h
.ifc \type, put
        urshl           v4.4s, v4.4s, v30.4s
        urshl           v5.4s, v5.4s, v30.4s
        xtn             v4.4h, v4.4s
        xtn2            v4.8h, v5.4s
.else
        rshrn           v4.4h, v4.4s, #4
        rshrn2          v4.8h, v5.4s, #4
        sub             v4.8h, v4.8h, v29.8h
.endif
        subs            \h, \h, #2
        st1             {v4.d}[0], [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
        b.le            0f
        trn2            v16.2d, v17.2d, v17.2d
        b               4b
0:
        ret

80: // 8xN, 16xN, ... hv
160:
320:
640:
1280:
        mov             \my, \h

1:
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ldr             h21, [\src, #16]
        ld1             {v20.8h}, [\src], \s_strd
        ext             v21.16b, v20.16b, v21.16b, #2
        mul             v16.8h, v20.8h, v0.8h
        mla             v16.8h, v21.8h, v1.8h
        urshl           v16.8h, v16.8h, v31.8h

2:
        ldr             h23, [\sr2, #16]
        ld1             {v22.8h}, [\sr2], \s_strd
        ldr             h25, [\src, #16]
        ld1             {v24.8h}, [\src], \s_strd
        ext             v23.16b, v22.16b, v23.16b, #2
        ext             v25.16b, v24.16b, v25.16b, #2
        mul             v17.8h, v22.8h, v0.8h
        mla             v17.8h, v23.8h, v1.8h
        mul             v18.8h, v24.8h, v0.8h
        mla             v18.8h, v25.8h, v1.8h
        urshl           v17.8h, v17.8h, v31.8h
        urshl           v18.8h, v18.8h, v31.8h

        umull           v4.4s, v16.4h, v2.4h
        umlal           v4.4s, v17.4h, v3.4h
        umull2          v5.4s, v16.8h, v2.8h
        umlal2          v5.4s, v17.8h, v3.8h
        umull           v6.4s, v17.4h, v2.4h
        umlal           v6.4s, v18.4h, v3.4h
        umull2          v7.4s, v17.8h, v2.8h
        umlal2          v7.4s, v18.8h, v3.8h
.ifc \type, put
        urshl           v4.4s, v4.4s, v30.4s
        urshl           v5.4s, v5.4s, v30.4s
        urshl           v6.4s, v6.4s, v30.4s
        urshl           v7.4s, v7.4s, v30.4s
        xtn             v4.4h, v4.4s
        xtn2            v4.8h, v5.4s
        xtn             v5.4h, v6.4s
        xtn2            v5.8h, v7.4s
.else
        rshrn           v4.4h, v4.4s, #4
        rshrn2          v4.8h, v5.4s, #4
        rshrn           v5.4h, v6.4s, #4
        rshrn2          v5.8h, v7.4s, #4
        sub             v4.8h, v4.8h, v29.8h
        sub             v5.8h, v5.8h, v29.8h
.endif
        subs            \h, \h, #2
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v5.8h}, [\ds2], \d_strd
        b.le            9f
        mov             v16.16b, v18.16b
        b               2b
9:
        subs            \w, \w, #8
        b.le            0f
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #1
        mov             \h, \my
        add             \src, \src, #16
        add             \dst, \dst, #16
        b               1b
0:
        ret

L(\type\()_bilin_hv_tbl):
        .hword L(\type\()_bilin_hv_tbl) - 1280b
        .hword L(\type\()_bilin_hv_tbl) -  640b
        .hword L(\type\()_bilin_hv_tbl) -  320b
        .hword L(\type\()_bilin_hv_tbl) -  160b
        .hword L(\type\()_bilin_hv_tbl) -   80b
        .hword L(\type\()_bilin_hv_tbl) -   40b
        .hword L(\type\()_bilin_hv_tbl) -   20b
        .hword 0
endfunc
.endm

filter_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10

.macro load_filter_row dst, src, inc
        asr             w13, \src, #10
        add             \src, \src, \inc
        ldr             \dst, [x11, w13, sxtw #3]
.endm

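// Filters one row of 8 horizontal warp outputs: x2 holds the source
// pointer (advanced by x3) and w5 the current filter position, from
// which load_filter_row fetches the 8-tap filter at index \src >> 10
// (relative to the middle of mc_warp_filter) and steps by \inc. The two
// result halves are returned in v16/v17 as .4s, scaled down by
// 7 - intermediate_bits. (Informal summary of the function below.)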
function warp_filter_horz_neon
        add             w12, w5, #512

        ld1             {v16.8h, v17.8h}, [x2], x3

        load_filter_row d0, w12, w7
        load_filter_row d1, w12, w7
        load_filter_row d2, w12, w7
        sxtl            v0.8h, v0.8b
        load_filter_row d3, w12, w7
        sxtl            v1.8h, v1.8b
        load_filter_row d4, w12, w7
        sxtl            v2.8h, v2.8b
        load_filter_row d5, w12, w7
        sxtl            v3.8h, v3.8b
        load_filter_row d6, w12, w7
        sxtl            v4.8h, v4.8b
        load_filter_row d7, w12, w7
        sxtl            v5.8h, v5.8b
        ext             v18.16b, v16.16b, v17.16b, #2*1
        smull           v8.4s, v16.4h, v0.4h
        smull2          v9.4s, v16.8h, v0.8h
        sxtl            v6.8h, v6.8b
        ext             v19.16b, v16.16b, v17.16b, #2*2
        smull           v10.4s, v18.4h, v1.4h
        smull2          v11.4s, v18.8h, v1.8h
        sxtl            v7.8h, v7.8b
        ext             v20.16b, v16.16b, v17.16b, #2*3
        smull           v0.4s, v19.4h, v2.4h
        smull2          v1.4s, v19.8h, v2.8h
        ext             v21.16b, v16.16b, v17.16b, #2*4
        addp            v8.4s, v8.4s, v9.4s
        smull           v2.4s, v20.4h, v3.4h
        smull2          v3.4s, v20.8h, v3.8h
        ext             v22.16b, v16.16b, v17.16b, #2*5
        addp            v9.4s, v10.4s, v11.4s
        smull           v10.4s, v21.4h, v4.4h
        smull2          v11.4s, v21.8h, v4.8h
        ext             v23.16b, v16.16b, v17.16b, #2*6
        addp            v0.4s, v0.4s, v1.4s
        smull           v18.4s, v22.4h, v5.4h
        smull2          v19.4s, v22.8h, v5.8h
        ext             v16.16b, v16.16b, v17.16b, #2*7
        addp            v1.4s, v2.4s, v3.4s
        addp            v2.4s, v10.4s, v11.4s
        smull           v20.4s, v23.4h, v6.4h
        smull2          v21.4s, v23.8h, v6.8h
        addp            v3.4s, v18.4s, v19.4s
        smull           v22.4s, v16.4h, v7.4h
        smull2          v23.4s, v16.8h, v7.8h
        addp            v4.4s, v20.4s, v21.4s
        addp            v5.4s, v22.4s, v23.4s

        addp            v8.4s, v8.4s, v9.4s
        addp            v0.4s, v0.4s, v1.4s
        addp            v2.4s, v2.4s, v3.4s
        addp            v4.4s, v4.4s, v5.4s

        addp            v16.4s, v8.4s, v0.4s
        addp            v17.4s, v2.4s, v4.4s

        add             w5, w5, w8

        srshl           v16.4s, v16.4s, v14.4s // -(7 - intermediate_bits)
        srshl           v17.4s, v17.4s, v14.4s // -(7 - intermediate_bits)

        ret
endfunc

// void dav1d_warp_affine_8x8_16bpc_neon(
//         pixel *dst, const ptrdiff_t dst_stride,
//         const pixel *src, const ptrdiff_t src_stride,
//         const int16_t *const abcd, int mx, int my,
//         const int bitdepth_max)
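// The vertical 8-tap filter needs 8 + 7 horizontally filtered rows per
// 8x8 block: the seven initial warp_filter_horz_neon calls below prime
// v24-v30, and each loop iteration then produces one fresh row in v31
// while the mov chain rotates the sliding window.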
.macro warp t
function warp_affine_8x8\t\()_16bpc_neon, export=1
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

.ifb \t
        dup             v15.8h, w7             // bitdepth_max
.else
        movi            v15.8h, #(PREP_BIAS >> 8), lsl #8
.endif
        clz             w7, w7
        // intermediate_bits = clz(bitdepth_max) - 18
.ifb \t
        sub             w8, w7, #11            // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7
.endif
        sub             w7, w7, #25            // -(7 - intermediate_bits)
.ifb \t
        neg             w8, w8                 // -(7 + intermediate_bits)
.endif
        dup             v14.4s, w7             // -(7 - intermediate_bits)
.ifb \t
        dup             v13.4s, w8             // -(7 + intermediate_bits)
.endif

        ldr             x4, [x4]
        sbfx            x7, x4, #0, #16
        sbfx            x8, x4, #16, #16
        sbfx            x9, x4, #32, #16
        sbfx            x4, x4, #48, #16
        mov             w10, #8
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3
        sub             x2, x2, #6
        movrel          x11, X(mc_warp_filter), 64*8
        mov             x15, x30
.ifnb \t
        lsl             x1, x1, #1
.endif

        bl              warp_filter_horz_neon
        xtn             v24.4h, v16.4s
        xtn2            v24.8h, v17.4s
        bl              warp_filter_horz_neon
        xtn             v25.4h, v16.4s
        xtn2            v25.8h, v17.4s
        bl              warp_filter_horz_neon
        xtn             v26.4h, v16.4s
        xtn2            v26.8h, v17.4s
        bl              warp_filter_horz_neon
        xtn             v27.4h, v16.4s
        xtn2            v27.8h, v17.4s
        bl              warp_filter_horz_neon
        xtn             v28.4h, v16.4s
        xtn2            v28.8h, v17.4s
        bl              warp_filter_horz_neon
        xtn             v29.4h, v16.4s
        xtn2            v29.8h, v17.4s
        bl              warp_filter_horz_neon
        xtn             v30.4h, v16.4s
        xtn2            v30.8h, v17.4s

1:
        add             w14, w6, #512
        bl              warp_filter_horz_neon
        xtn             v31.4h, v16.4s
        xtn2            v31.8h, v17.4s

        load_filter_row d0, w14, w9
        load_filter_row d1, w14, w9
        load_filter_row d2, w14, w9
        load_filter_row d3, w14, w9
        load_filter_row d4, w14, w9
        load_filter_row d5, w14, w9
        load_filter_row d6, w14, w9
        load_filter_row d7, w14, w9
        transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl

        // This ordering of smull/smlal/smull2/smlal2 is highly
        // beneficial for Cortex A53 here.
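        // (Presumably keeping each accumulator's chain contiguous,
        // rather than interleaving v16 and v17, suits the A53's
        // multiply-accumulate forwarding; an informal guess.)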
        smull           v16.4s, v24.4h, v0.4h
        smlal           v16.4s, v25.4h, v1.4h
        smlal           v16.4s, v26.4h, v2.4h
        smlal           v16.4s, v27.4h, v3.4h
        smlal           v16.4s, v28.4h, v4.4h
        smlal           v16.4s, v29.4h, v5.4h
        smlal           v16.4s, v30.4h, v6.4h
        smlal           v16.4s, v31.4h, v7.4h
        smull2          v17.4s, v24.8h, v0.8h
        smlal2          v17.4s, v25.8h, v1.8h
        smlal2          v17.4s, v26.8h, v2.8h
        smlal2          v17.4s, v27.8h, v3.8h
        smlal2          v17.4s, v28.8h, v4.8h
        smlal2          v17.4s, v29.8h, v5.8h
        smlal2          v17.4s, v30.8h, v6.8h
        smlal2          v17.4s, v31.8h, v7.8h

        mov             v24.16b, v25.16b
        mov             v25.16b, v26.16b
.ifb \t
        srshl           v16.4s, v16.4s, v13.4s // -(7 + intermediate_bits)
        srshl           v17.4s, v17.4s, v13.4s // -(7 + intermediate_bits)
.else
        rshrn           v16.4h, v16.4s, #7
        rshrn2          v16.8h, v17.4s, #7
.endif
        mov             v26.16b, v27.16b
.ifb \t
        sqxtun          v16.4h, v16.4s
        sqxtun2         v16.8h, v17.4s
.else
        sub             v16.8h, v16.8h, v15.8h // PREP_BIAS
.endif
        mov             v27.16b, v28.16b
        mov             v28.16b, v29.16b
.ifb \t
        umin            v16.8h, v16.8h, v15.8h // bitdepth_max
.endif
        mov             v29.16b, v30.16b
        mov             v30.16b, v31.16b
        subs            w10, w10, #1
        st1             {v16.8h}, [x0], x1

        add             w6, w6, w4
        b.gt            1b

        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40

        br              x15
endfunc
.endm

warp
warp t

// void dav1d_emu_edge_16bpc_neon(
//         const intptr_t bw, const intptr_t bh,
//         const intptr_t iw, const intptr_t ih,
//         const intptr_t x, const intptr_t y,
//         pixel *dst, const ptrdiff_t dst_stride,
//         const pixel *ref, const ptrdiff_t ref_stride)
function emu_edge_16bpc_neon, export=1
        ldp             x8, x9, [sp]

        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
        // ref += iclip(x, 0, iw - 1)
        sub             x12, x3, #1            // ih - 1
        cmp             x5, x3
        sub             x13, x2, #1            // iw - 1
        csel            x12, x12, x5, ge       // min(y, ih - 1)
        cmp             x4, x2
        bic             x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
        csel            x13, x13, x4, ge       // min(x, iw - 1)
        bic             x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
        madd            x8, x12, x9, x8        // ref += iclip() * stride
        add             x8, x8, x13, lsl #1    // ref += iclip()

        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
        // top_ext = iclip(-y, 0, bh - 1)
        add             x10, x5, x1            // y + bh
        neg             x5, x5                 // -y
        sub             x10, x10, x3           // y + bh - ih
        sub             x12, x1, #1            // bh - 1
        cmp             x10, x1
        bic             x5, x5, x5, asr #63    // max(-y, 0)
        csel            x10, x10, x12, lt      // min(y + bh - ih, bh-1)
        cmp             x5, x1
        bic             x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
        csel            x5, x5, x12, lt        // min(max(-y, 0), bh-1)

        // right_ext = iclip(x + bw - iw, 0, bw - 1)
        // left_ext = iclip(-x, 0, bw - 1)
        add             x11, x4, x0            // x + bw
        neg             x4, x4                 // -x
        sub             x11, x11, x2           // x + bw - iw
        sub             x13, x0, #1            // bw - 1
        cmp             x11, x0
        bic             x4, x4, x4, asr #63    // max(-x, 0)
        csel            x11, x11, x13, lt      // min(x + bw - iw, bw-1)
        cmp             x4, x0
        bic             x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
        csel            x4, x4, x13, lt        // min(max(-x, 0), bw - 1)

        // center_h = bh - top_ext - bottom_ext
        // dst += top_ext * PXSTRIDE(dst_stride)
        // center_w = bw - left_ext - right_ext
        sub             x1, x1, x5             // bh - top_ext
        madd            x6, x5, x7, x6
        sub             x2, x0, x4             // bw - left_ext
        sub             x1, x1, x10            // center_h = bh - top_ext - bottom_ext
        sub             x2, x2, x11            // center_w = bw - left_ext - right_ext

        mov             x14, x6                // backup of dst

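// A rough C equivalent of one v_loop row below (pixel_set/pixel_copy
// are illustrative names, not real helpers; pixels are 2 bytes wide):
//   if (need_left)  pixel_set(dst, ref[0], left_ext);
//   pixel_copy(dst + left_ext, ref, center_w);
//   if (need_right) pixel_set(dst + left_ext + center_w,
//                             ref[center_w - 1], right_ext);
//   dst += PXSTRIDE(dst_stride); ref += PXSTRIDE(ref_stride);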
.macro v_loop need_left, need_right
0:
.if \need_left
        ld1r            {v0.8h}, [x8]
        mov             x12, x6                // out = dst
        mov             x3, x4
        mov             v1.16b, v0.16b
1:
        subs            x3, x3, #16
        st1             {v0.8h, v1.8h}, [x12], #32
        b.gt            1b
.endif
        mov             x13, x8
        add             x12, x6, x4, lsl #1    // out = dst + left_ext
        mov             x3, x2
1:
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64
        subs            x3, x3, #32
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x12], #64
        b.gt            1b
.if \need_right
        add             x3, x8, x2, lsl #1     // in + center_w
        sub             x3, x3, #2             // in + center_w - 1
        add             x12, x6, x4, lsl #1    // dst + left_ext
        ld1r            {v0.8h}, [x3]
        add             x12, x12, x2, lsl #1   // out = dst + left_ext + center_w
        mov             x3, x11
        mov             v1.16b, v0.16b
1:
        subs            x3, x3, #16
        st1             {v0.8h, v1.8h}, [x12], #32
        b.gt            1b
.endif

        subs            x1, x1, #1             // center_h--
        add             x6, x6, x7
        add             x8, x8, x9
        b.gt            0b
.endm

        cbz             x4, 2f
        // need_left
        cbz             x11, 3f
        // need_left + need_right
        v_loop          1, 1
        b               5f

2:
        // !need_left
        cbz             x11, 4f
        // !need_left + need_right
        v_loop          0, 1
        b               5f

3:
        // need_left + !need_right
        v_loop          1, 0
        b               5f

4:
        // !need_left + !need_right
        v_loop          0, 0

5:

        cbz             x10, 3f
        // need_bottom
        sub             x8, x6, x7             // ref = dst - stride
        mov             x4, x0
1:
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], #64
        mov             x3, x10
2:
        subs            x3, x3, #1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
        b.gt            2b
        msub            x6, x7, x10, x6        // dst -= bottom_ext * stride
        subs            x4, x4, #32            // bw -= 32
        add             x6, x6, #64            // dst += 32
        b.gt            1b

3:
        cbz             x5, 3f
        // need_top
        msub            x6, x7, x5, x14        // dst = stored_dst - top_ext * stride
1:
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x14], #64
        mov             x3, x5
2:
        subs            x3, x3, #1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
        b.gt            2b
        msub            x6, x7, x5, x6         // dst -= top_ext * stride
        subs            x0, x0, #32            // bw -= 32
        add             x6, x6, #64            // dst += 32
        b.gt            1b

3:
        ret
endfunc