/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Janne Grunau
 * Copyright © 2020, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

#define PREP_BIAS 8192

.macro avg d0, d1, t0, t1, t2, t3
        ld1             {\t0\().8h,\t1\().8h}, [x2], 32
        ld1             {\t2\().8h,\t3\().8h}, [x3], 32
        sqadd           \t0\().8h, \t0\().8h, \t2\().8h
        sqadd           \t1\().8h, \t1\().8h, \t3\().8h
        smax            \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
        smax            \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
        sqsub           \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
        sqsub           \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
        sshl            \d0\().8h, \t0\().8h, v29.8h // -(intermediate_bits+1)
        sshl            \d1\().8h, \t1\().8h, v29.8h // -(intermediate_bits+1)
.endm

.macro w_avg d0, d1, t0, t1, t2, t3
        ld1             {\t0\().8h,\t1\().8h}, [x2], 32
        ld1             {\t2\().8h,\t3\().8h}, [x3], 32
        // This difference requires a 17 bit range, and all bits are
        // significant for the following multiplication.
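        // In C-like pseudocode (an illustrative sketch of this macro, not
        // dav1d's reference code), each lane becomes:
        //   dif = tmp2 - tmp1;
        //   out = tmp2 + ((dif * -weight) >> 4); // == (tmp1*weight + tmp2*(16-weight)) >> 4
        // with v27 holding the negated weight (set up in bidir_fn below),
        // followed by the shared rounding, bias and clamping steps.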
        ssubl           \d0\().4s, \t2\().4h, \t0\().4h
        ssubl2          \t0\().4s, \t2\().8h, \t0\().8h
        ssubl           \d1\().4s, \t3\().4h, \t1\().4h
        ssubl2          \t1\().4s, \t3\().8h, \t1\().8h
        mul             \d0\().4s, \d0\().4s, v27.4s
        mul             \t0\().4s, \t0\().4s, v27.4s
        mul             \d1\().4s, \d1\().4s, v27.4s
        mul             \t1\().4s, \t1\().4s, v27.4s
        sshr            \d0\().4s, \d0\().4s, #4
        sshr            \t0\().4s, \t0\().4s, #4
        sshr            \d1\().4s, \d1\().4s, #4
        sshr            \t1\().4s, \t1\().4s, #4
        saddw           \d0\().4s, \d0\().4s, \t2\().4h
        saddw2          \t0\().4s, \t0\().4s, \t2\().8h
        saddw           \d1\().4s, \d1\().4s, \t3\().4h
        saddw2          \t1\().4s, \t1\().4s, \t3\().8h
        xtn             \d0\().4h, \d0\().4s
        xtn2            \d0\().8h, \t0\().4s
        xtn             \d1\().4h, \d1\().4s
        xtn2            \d1\().8h, \t1\().4s
        srshl           \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits
        srshl           \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits
        add             \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits
        add             \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits
        smin            \d0\().8h, \d0\().8h, v31.8h // bitdepth_max
        smin            \d1\().8h, \d1\().8h, v31.8h // bitdepth_max
        smax            \d0\().8h, \d0\().8h, v30.8h // 0
        smax            \d1\().8h, \d1\().8h, v30.8h // 0
.endm

.macro mask d0, d1, t0, t1, t2, t3
        ld1             {v27.16b}, [x6], 16
        ld1             {\t0\().8h,\t1\().8h}, [x2], 32
        neg             v27.16b, v27.16b
        ld1             {\t2\().8h,\t3\().8h}, [x3], 32
        sxtl            v26.8h, v27.8b
        sxtl2           v27.8h, v27.16b
        sxtl            v24.4s, v26.4h
        sxtl2           v25.4s, v26.8h
        sxtl            v26.4s, v27.4h
        sxtl2           v27.4s, v27.8h
        ssubl           \d0\().4s, \t2\().4h, \t0\().4h
        ssubl2          \t0\().4s, \t2\().8h, \t0\().8h
        ssubl           \d1\().4s, \t3\().4h, \t1\().4h
        ssubl2          \t1\().4s, \t3\().8h, \t1\().8h
        mul             \d0\().4s, \d0\().4s, v24.4s
        mul             \t0\().4s, \t0\().4s, v25.4s
        mul             \d1\().4s, \d1\().4s, v26.4s
        mul             \t1\().4s, \t1\().4s, v27.4s
        sshr            \d0\().4s, \d0\().4s, #6
        sshr            \t0\().4s, \t0\().4s, #6
        sshr            \d1\().4s, \d1\().4s, #6
        sshr            \t1\().4s, \t1\().4s, #6
        saddw           \d0\().4s, \d0\().4s, \t2\().4h
        saddw2          \t0\().4s, \t0\().4s, \t2\().8h
        saddw           \d1\().4s, \d1\().4s, \t3\().4h
        saddw2          \t1\().4s, \t1\().4s, \t3\().8h
        xtn             \d0\().4h, \d0\().4s
        xtn2            \d0\().8h, \t0\().4s
        xtn             \d1\().4h, \d1\().4s
        xtn2            \d1\().8h, \t1\().4s
        srshl           \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits
        srshl           \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits
        add             \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits
        add             \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits
        smin            \d0\().8h, \d0\().8h, v31.8h // bitdepth_max
        smin            \d1\().8h, \d1\().8h, v31.8h // bitdepth_max
        smax            \d0\().8h, \d0\().8h, v30.8h // 0
        smax            \d1\().8h, \d1\().8h, v30.8h // 0
.endm

.macro bidir_fn type, bdmax
function \type\()_16bpc_neon, export=1
        clz             w4, w4
.ifnc \type, avg
        dup             v31.8h, \bdmax // bitdepth_max
        movi            v30.8h, #0
.endif
        clz             w7, \bdmax
        sub             w7, w7, #18 // intermediate_bits = clz(bitdepth_max) - 18
.ifc \type, avg
        mov             w9, #1
        mov             w8, #-2*PREP_BIAS
        lsl             w9, w9, w7 // 1 << intermediate_bits
        add             w7, w7, #1
        sub             w8, w8, w9 // -2*PREP_BIAS - 1 << intermediate_bits
        neg             w7, w7 // -(intermediate_bits+1)
        dup             v28.8h, w8 // -2*PREP_BIAS - 1 << intermediate_bits
        dup             v29.8h, w7 // -(intermediate_bits+1)
.else
        mov             w8, #PREP_BIAS
        lsr             w8, w8, w7 // PREP_BIAS >> intermediate_bits
        neg             w7, w7 // -intermediate_bits
        dup             v28.8h, w8 // PREP_BIAS >> intermediate_bits
        dup             v29.8h, w7 // -intermediate_bits
.endif
.ifc \type, w_avg
        dup             v27.4s, w6
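        // The weight is negated here so that the w_avg macro above can use
        // the plain (tmp2 - tmp1)*(-weight) product; adding tmp2 back then
        // yields (tmp1*weight + tmp2*(16-weight)) >> 4 without needing a
        // second 16-weight constant.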
        neg             v27.4s, v27.4s
.endif
        adr             x7, L(\type\()_tbl)
        sub             w4, w4, #24
        \type           v4, v5, v0, v1, v2, v3
        ldrh            w4, [x7, x4, lsl #1]
        sub             x7, x7, w4, uxtw
        br              x7
40:
        add             x7, x0, x1
        lsl             x1, x1, #1
4:
        subs            w5, w5, #4
        st1             {v4.d}[0], [x0], x1
        st1             {v4.d}[1], [x7], x1
        st1             {v5.d}[0], [x0], x1
        st1             {v5.d}[1], [x7], x1
        b.le            0f
        \type           v4, v5, v0, v1, v2, v3
        b               4b
80:
        add             x7, x0, x1
        lsl             x1, x1, #1
8:
        st1             {v4.8h}, [x0], x1
        subs            w5, w5, #2
        st1             {v5.8h}, [x7], x1
        b.le            0f
        \type           v4, v5, v0, v1, v2, v3
        b               8b
16:
        \type           v6, v7, v0, v1, v2, v3
        st1             {v4.8h, v5.8h}, [x0], x1
        subs            w5, w5, #2
        st1             {v6.8h, v7.8h}, [x0], x1
        b.le            0f
        \type           v4, v5, v0, v1, v2, v3
        b               16b
32:
        \type           v6, v7, v0, v1, v2, v3
        subs            w5, w5, #1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        b.le            0f
        \type           v4, v5, v0, v1, v2, v3
        b               32b
640:
        add             x7, x0, #64
64:
        \type           v6, v7, v0, v1, v2, v3
        \type           v16, v17, v0, v1, v2, v3
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        \type           v18, v19, v0, v1, v2, v3
        subs            w5, w5, #1
        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], x1
        b.le            0f
        \type           v4, v5, v0, v1, v2, v3
        b               64b
1280:
        add             x7, x0, #64
        mov             x8, #128
        sub             x1, x1, #128
128:
        \type           v6, v7, v0, v1, v2, v3
        \type           v16, v17, v0, v1, v2, v3
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x8
        \type           v18, v19, v0, v1, v2, v3
        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], x8
        \type           v4, v5, v0, v1, v2, v3
        \type           v6, v7, v0, v1, v2, v3
        \type           v16, v17, v0, v1, v2, v3
        subs            w5, w5, #1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        \type           v18, v19, v0, v1, v2, v3
        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], x1
        b.le            0f
        \type           v4, v5, v0, v1, v2, v3
        b               128b
0:
        ret
L(\type\()_tbl):
        .hword L(\type\()_tbl) - 1280b
        .hword L(\type\()_tbl) -  640b
        .hword L(\type\()_tbl) -   32b
        .hword L(\type\()_tbl) -   16b
        .hword L(\type\()_tbl) -   80b
        .hword L(\type\()_tbl) -   40b
endfunc
.endm

bidir_fn avg, w6
bidir_fn w_avg, w7
bidir_fn mask, w7


.macro w_mask_fn type
function w_mask_\type\()_16bpc_neon, export=1
        ldr             w8, [sp]
        clz             w9, w4
        adr             x10, L(w_mask_\type\()_tbl)
        dup             v31.8h, w8 // bitdepth_max
        sub             w9, w9, #24
        clz             w8, w8 // clz(bitdepth_max)
        ldrh            w9, [x10, x9, lsl #1]
        sub             x10, x10, w9, uxtw
        sub             w8, w8, #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
        mov             w9, #PREP_BIAS*64
        neg             w8, w8 // -sh
        mov             w11, #27615 // (64 + 1 - 38)<<mask_sh - 1 - mask_rnd
        dup             v30.4s, w9 // PREP_BIAS*64
        dup             v29.4s, w8 // -sh
        dup             v0.8h, w11
.if \type == 444
        movi            v1.16b, #64
.elseif \type == 422
        dup             v2.8b, w7
        movi            v3.8b, #129
        sub             v3.8b, v3.8b, v2.8b
.elseif \type == 420
        dup             v2.8h, w7
        movi            v3.8h, #1, lsl #8
        sub             v3.8h, v3.8h, v2.8h
.endif
        add             x12, x0, x1
        lsl             x1, x1, #1
        br              x10
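        // Per pixel, the loops below compute roughly (an illustrative C
        // sketch; variable names are ours):
        //   c64_m = sat_sub_u16(27615, abs(tmp1 - tmp2)) >> 10; // 64 - m, 0..26
        //   px    = (tmp1 << 6) + (tmp2 - tmp1)*c64_m;          // == tmp1*m + tmp2*(64-m)
        //   dst   = iclip_pixel((px + PREP_BIAS*64 + rnd) >> sh);
        // The blend mask m (or, for 422/420, a downsampled sum of masks)
        // is written to the mask buffer at x6.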
4:
        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
        ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
        subs            w5, w5, #4
        sabd            v20.8h, v4.8h, v6.8h // abs(tmp1 - tmp2)
        sabd            v21.8h, v5.8h, v7.8h
        ssubl           v16.4s, v6.4h, v4.4h // tmp2 - tmp1 (requires 17 bit)
        ssubl2          v17.4s, v6.8h, v4.8h
        ssubl           v18.4s, v7.4h, v5.4h
        ssubl2          v19.4s, v7.8h, v5.8h
        uqsub           v20.8h, v0.8h, v20.8h // 27615 - abs()
        uqsub           v21.8h, v0.8h, v21.8h
        sshll2          v7.4s, v5.8h, #6 // tmp1 << 6
        sshll           v6.4s, v5.4h, #6
        sshll2          v5.4s, v4.8h, #6
        sshll           v4.4s, v4.4h, #6
        ushr            v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
        ushr            v21.8h, v21.8h, #10
        add             v4.4s, v4.4s, v30.4s // += PREP_BIAS*64
        add             v5.4s, v5.4s, v30.4s
        add             v6.4s, v6.4s, v30.4s
        add             v7.4s, v7.4s, v30.4s
        uxtl            v22.4s, v20.4h
        uxtl2           v23.4s, v20.8h
        uxtl            v24.4s, v21.4h
        uxtl2           v25.4s, v21.8h
        mla             v4.4s, v16.4s, v22.4s // (tmp2-tmp1)*(64-m)
        mla             v5.4s, v17.4s, v23.4s
        mla             v6.4s, v18.4s, v24.4s
        mla             v7.4s, v19.4s, v25.4s
        srshl           v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl           v5.4s, v5.4s, v29.4s
        srshl           v6.4s, v6.4s, v29.4s
        srshl           v7.4s, v7.4s, v29.4s
        sqxtun          v4.4h, v4.4s // iclip_pixel
        sqxtun2         v4.8h, v5.4s
        sqxtun          v5.4h, v6.4s
        sqxtun2         v5.8h, v7.4s
        umin            v4.8h, v4.8h, v31.8h // iclip_pixel
        umin            v5.8h, v5.8h, v31.8h
.if \type == 444
        xtn             v20.8b, v20.8h // 64 - m
        xtn2            v20.16b, v21.8h
        sub             v20.16b, v1.16b, v20.16b // m
        st1             {v20.16b}, [x6], #16
.elseif \type == 422
        addp            v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition)
        xtn             v20.8b, v20.8h
        uhsub           v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
        st1             {v20.8b}, [x6], #8
.elseif \type == 420
        trn1            v24.2d, v20.2d, v21.2d
        trn2            v25.2d, v20.2d, v21.2d
        add             v24.8h, v24.8h, v25.8h // (64 - my1) + (64 - my2) (row wise addition)
        addp            v20.8h, v24.8h, v24.8h // (128 - m) + (128 - n) (column wise addition)
        sub             v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n))
        rshrn           v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
        st1             {v20.s}[0], [x6], #4
.endif
        st1             {v4.d}[0], [x0], x1
        st1             {v4.d}[1], [x12], x1
        st1             {v5.d}[0], [x0], x1
        st1             {v5.d}[1], [x12], x1
        b.gt            4b
        ret
8:
        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1
        ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2
        subs            w5, w5, #2
        sabd            v20.8h, v4.8h, v6.8h // abs(tmp1 - tmp2)
        sabd            v21.8h, v5.8h, v7.8h
        ssubl           v16.4s, v6.4h, v4.4h // tmp2 - tmp1 (requires 17 bit)
        ssubl2          v17.4s, v6.8h, v4.8h
        ssubl           v18.4s, v7.4h, v5.4h
        ssubl2          v19.4s, v7.8h, v5.8h
        uqsub           v20.8h, v0.8h, v20.8h // 27615 - abs()
        uqsub           v21.8h, v0.8h, v21.8h
        sshll2          v7.4s, v5.8h, #6 // tmp1 << 6
        sshll           v6.4s, v5.4h, #6
        sshll2          v5.4s, v4.8h, #6
        sshll           v4.4s, v4.4h, #6
        ushr            v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
        ushr            v21.8h, v21.8h, #10
        add             v4.4s, v4.4s, v30.4s // += PREP_BIAS*64
        add             v5.4s, v5.4s, v30.4s
        add             v6.4s, v6.4s, v30.4s
        add             v7.4s, v7.4s, v30.4s
        uxtl            v22.4s, v20.4h
        uxtl2           v23.4s, v20.8h
        uxtl            v24.4s, v21.4h
        uxtl2           v25.4s, v21.8h
        mla             v4.4s, v16.4s, v22.4s // (tmp2-tmp1)*(64-m)
        mla             v5.4s, v17.4s, v23.4s
        mla             v6.4s, v18.4s, v24.4s
        mla             v7.4s, v19.4s, v25.4s
        srshl           v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl           v5.4s, v5.4s, v29.4s
        srshl           v6.4s, v6.4s, v29.4s
        srshl           v7.4s, v7.4s, v29.4s
        sqxtun          v4.4h, v4.4s // iclip_pixel
        sqxtun2         v4.8h, v5.4s
        sqxtun          v5.4h, v6.4s
        sqxtun2         v5.8h, v7.4s
        umin            v4.8h, v4.8h, v31.8h // iclip_pixel
        umin            v5.8h, v5.8h, v31.8h
.if \type == 444
        xtn             v20.8b, v20.8h // 64 - m
        xtn2            v20.16b, v21.8h
        sub             v20.16b, v1.16b, v20.16b // m
        st1             {v20.16b}, [x6], #16
.elseif \type == 422
        addp            v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition)
        xtn             v20.8b, v20.8h
        uhsub           v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
        st1             {v20.8b}, [x6], #8
.elseif \type == 420
        add             v20.8h, v20.8h, v21.8h // (64 - my1) + (64 - my2) (row wise addition)
        addp            v20.8h, v20.8h, v20.8h // (128 - m) + (128 - n) (column wise addition)
        sub             v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n))
        rshrn           v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
        st1             {v20.s}[0], [x6], #4
.endif
        st1             {v4.8h}, [x0], x1
        st1             {v5.8h}, [x12], x1
        b.gt            8b
        ret
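        // Widths 16..128 are handled in 16-pixel chunks, two rows at a
        // time: x2/x3 walk the first row of tmp1/tmp2 and x7/x9 the second,
        // with x10 as the second row's mask pointer for 444/422.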
1280:
640:
320:
160:
        mov             w11, w4
        sub             x1, x1, w4, uxtw #1
.if \type == 444
        add             x10, x6, w4, uxtw
.elseif \type == 422
        add             x10, x6, x11, lsr #1
.endif
        add             x9, x3, w4, uxtw #1
        add             x7, x2, w4, uxtw #1
161:
        mov             w8, w4
16:
        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1
        ld1             {v16.8h, v17.8h}, [x3], #32 // tmp2
        ld1             {v6.8h, v7.8h}, [x7], #32
        ld1             {v18.8h, v19.8h}, [x9], #32
        subs            w8, w8, #16
        sabd            v20.8h, v4.8h, v16.8h // abs(tmp1 - tmp2)
        sabd            v21.8h, v5.8h, v17.8h
        ssubl           v22.4s, v16.4h, v4.4h // tmp2 - tmp1 (requires 17 bit)
        ssubl2          v23.4s, v16.8h, v4.8h
        ssubl           v24.4s, v17.4h, v5.4h
        ssubl2          v25.4s, v17.8h, v5.8h
        uqsub           v20.8h, v0.8h, v20.8h // 27615 - abs()
        uqsub           v21.8h, v0.8h, v21.8h
        sshll2          v27.4s, v5.8h, #6 // tmp1 << 6
        sshll           v26.4s, v5.4h, #6
        sshll2          v5.4s, v4.8h, #6
        sshll           v4.4s, v4.4h, #6
        ushr            v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
        ushr            v21.8h, v21.8h, #10
        add             v4.4s, v4.4s, v30.4s // += PREP_BIAS*64
        add             v5.4s, v5.4s, v30.4s
        add             v26.4s, v26.4s, v30.4s
        add             v27.4s, v27.4s, v30.4s
        uxtl            v16.4s, v20.4h
        uxtl2           v17.4s, v20.8h
        uxtl            v28.4s, v21.4h
        mla             v4.4s, v22.4s, v16.4s // (tmp2-tmp1)*(64-m)
        uxtl2           v16.4s, v21.8h
        mla             v5.4s, v23.4s, v17.4s
        mla             v26.4s, v24.4s, v28.4s
        mla             v27.4s, v25.4s, v16.4s
        srshl           v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl           v5.4s, v5.4s, v29.4s
        srshl           v26.4s, v26.4s, v29.4s
        srshl           v27.4s, v27.4s, v29.4s
        sqxtun          v4.4h, v4.4s // iclip_pixel
        sqxtun2         v4.8h, v5.4s
        sqxtun          v5.4h, v26.4s
        sqxtun2         v5.8h, v27.4s

        // Start of other half
        sabd            v22.8h, v6.8h, v18.8h // abs(tmp1 - tmp2)
        sabd            v23.8h, v7.8h, v19.8h

        umin            v4.8h, v4.8h, v31.8h // iclip_pixel
        umin            v5.8h, v5.8h, v31.8h

        ssubl           v16.4s, v18.4h, v6.4h // tmp2 - tmp1 (requires 17 bit)
        ssubl2          v17.4s, v18.8h, v6.8h
        ssubl           v18.4s, v19.4h, v7.4h
        ssubl2          v19.4s, v19.8h, v7.8h
        uqsub           v22.8h, v0.8h, v22.8h // 27615 - abs()
        uqsub           v23.8h, v0.8h, v23.8h
        sshll           v24.4s, v6.4h, #6 // tmp1 << 6
        sshll2          v25.4s, v6.8h, #6
        sshll           v26.4s, v7.4h, #6
        sshll2          v27.4s, v7.8h, #6
        ushr            v22.8h, v22.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
        ushr            v23.8h, v23.8h, #10
        add             v24.4s, v24.4s, v30.4s // += PREP_BIAS*64
        add             v25.4s, v25.4s, v30.4s
        add             v26.4s, v26.4s, v30.4s
        add             v27.4s, v27.4s, v30.4s
        uxtl            v6.4s, v22.4h
        uxtl2           v7.4s, v22.8h
        uxtl            v28.4s, v23.4h
        mla             v24.4s, v16.4s, v6.4s // (tmp2-tmp1)*(64-m)
        uxtl2           v6.4s, v23.8h
        mla             v25.4s, v17.4s, v7.4s
        mla             v26.4s, v18.4s, v28.4s
        mla             v27.4s, v19.4s, v6.4s
        srshl           v24.4s, v24.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl           v25.4s, v25.4s, v29.4s
        srshl           v26.4s, v26.4s, v29.4s
        srshl           v27.4s, v27.4s, v29.4s
        sqxtun          v6.4h, v24.4s // iclip_pixel
        sqxtun2         v6.8h, v25.4s
        sqxtun          v7.4h, v26.4s
        sqxtun2         v7.8h, v27.4s
        umin            v6.8h, v6.8h, v31.8h // iclip_pixel
        umin            v7.8h, v7.8h, v31.8h
.if \type == 444
        xtn             v20.8b, v20.8h // 64 - m
        xtn2            v20.16b, v21.8h
        xtn             v21.8b, v22.8h
        xtn2            v21.16b, v23.8h
        sub             v20.16b, v1.16b, v20.16b // m
        sub             v21.16b, v1.16b, v21.16b
        st1             {v20.16b}, [x6], #16
        st1             {v21.16b}, [x10], #16
.elseif \type == 422
        addp            v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition)
        addp            v21.8h, v22.8h, v23.8h
        xtn             v20.8b, v20.8h
        xtn             v21.8b, v21.8h
        uhsub           v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
        uhsub           v21.8b, v3.8b, v21.8b
        st1             {v20.8b}, [x6], #8
        st1             {v21.8b}, [x10], #8
.elseif \type == 420
        add             v20.8h, v20.8h, v22.8h // (64 - my1) + (64 - my2) (row wise addition)
        add             v21.8h, v21.8h, v23.8h
        addp            v20.8h, v20.8h, v21.8h // (128 - m) + (128 - n) (column wise addition)
        sub             v20.8h, v3.8h, v20.8h // (256 - sign) - ((128 - m) + (128 - n))
        rshrn           v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
        st1             {v20.8b}, [x6], #8
.endif
        st1             {v4.8h, v5.8h}, [x0], #32
        st1             {v6.8h, v7.8h}, [x12], #32
        b.gt            16b
        subs            w5, w5, #2
        add             x2, x2, w4, uxtw #1
        add             x3, x3, w4, uxtw #1
        add             x7, x7, w4, uxtw #1
        add             x9, x9, w4, uxtw #1
.if \type == 444
        add             x6, x6, w4, uxtw
        add             x10, x10, w4, uxtw
.elseif \type == 422
        add             x6, x6, x11, lsr #1
        add             x10, x10, x11, lsr #1
.endif
        add             x0, x0, x1
        add             x12, x12, x1
        b.gt            161b
        ret
L(w_mask_\type\()_tbl):
        .hword L(w_mask_\type\()_tbl) - 1280b
        .hword L(w_mask_\type\()_tbl) -  640b
        .hword L(w_mask_\type\()_tbl) -  320b
        .hword L(w_mask_\type\()_tbl) -  160b
        .hword L(w_mask_\type\()_tbl) -    8b
        .hword L(w_mask_\type\()_tbl) -    4b
endfunc
.endm

w_mask_fn 444
w_mask_fn 422
w_mask_fn 420

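// blend computes dst = (a*(64 - m) + b*m + 32) >> 6. Instead of widening
// multiplies it uses sqrdmulh on the negated mask scaled up by 9 bits:
//   sqrdmulh(a - b, -m << 9) == (2*(a-b)*(-m << 9) + (1 << 15)) >> 16
//                            == ((a-b)*-m + 32) >> 6
// so adding a back gives the blended pixel. blend_h and blend_v below
// use the same trick.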
function blend_16bpc_neon, export=1
        adr             x6, L(blend_tbl)
        clz             w3, w3
        sub             w3, w3, #26
        ldrh            w3, [x6, x3, lsl #1]
        sub             x6, x6, w3, uxtw
        add             x8, x0, x1
        br              x6
40:
        lsl             x1, x1, #1
4:
        ld1             {v2.8b}, [x5], #8
        ld1             {v1.8h}, [x2], #16
        ld1             {v0.d}[0], [x0]
        neg             v2.8b, v2.8b // -m
        subs            w4, w4, #2
        ld1             {v0.d}[1], [x8]
        sxtl            v2.8h, v2.8b
        shl             v2.8h, v2.8h, #9 // -m << 9
        sub             v1.8h, v0.8h, v1.8h // a - b
        sqrdmulh        v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6
        add             v0.8h, v0.8h, v1.8h
        st1             {v0.d}[0], [x0], x1
        st1             {v0.d}[1], [x8], x1
        b.gt            4b
        ret
80:
        lsl             x1, x1, #1
8:
        ld1             {v4.16b}, [x5], #16
        ld1             {v2.8h, v3.8h}, [x2], #32
        neg             v5.16b, v4.16b // -m
        ld1             {v0.8h}, [x0]
        ld1             {v1.8h}, [x8]
        sxtl            v4.8h, v5.8b
        sxtl2           v5.8h, v5.16b
        shl             v4.8h, v4.8h, #9 // -m << 9
        shl             v5.8h, v5.8h, #9
        sub             v2.8h, v0.8h, v2.8h // a - b
        sub             v3.8h, v1.8h, v3.8h
        subs            w4, w4, #2
        sqrdmulh        v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh        v3.8h, v3.8h, v5.8h
        add             v0.8h, v0.8h, v2.8h
        add             v1.8h, v1.8h, v3.8h
        st1             {v0.8h}, [x0], x1
        st1             {v1.8h}, [x8], x1
        b.gt            8b
        ret
160:
        lsl             x1, x1, #1
16:
        ld1             {v16.16b, v17.16b}, [x5], #32
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        subs            w4, w4, #2
        neg             v18.16b, v16.16b // -m
        neg             v19.16b, v17.16b
        ld1             {v0.8h, v1.8h}, [x0]
        sxtl            v16.8h, v18.8b
        sxtl2           v17.8h, v18.16b
        sxtl            v18.8h, v19.8b
        sxtl2           v19.8h, v19.16b
        ld1             {v2.8h, v3.8h}, [x8]
        shl             v16.8h, v16.8h, #9 // -m << 9
        shl             v17.8h, v17.8h, #9
        shl             v18.8h, v18.8h, #9
        shl             v19.8h, v19.8h, #9
        sub             v4.8h, v0.8h, v4.8h // a - b
        sub             v5.8h, v1.8h, v5.8h
        sub             v6.8h, v2.8h, v6.8h
        sub             v7.8h, v3.8h, v7.8h
        sqrdmulh        v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh        v5.8h, v5.8h, v17.8h
        sqrdmulh        v6.8h, v6.8h, v18.8h
        sqrdmulh        v7.8h, v7.8h, v19.8h
        add             v0.8h, v0.8h, v4.8h
        add             v1.8h, v1.8h, v5.8h
        add             v2.8h, v2.8h, v6.8h
        add             v3.8h, v3.8h, v7.8h
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v2.8h, v3.8h}, [x8], x1
        b.gt            16b
        ret
32:
        ld1             {v16.16b, v17.16b}, [x5], #32
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        subs            w4, w4, #1
        neg             v18.16b, v16.16b // -m
        neg             v19.16b, v17.16b
        sxtl            v16.8h, v18.8b
        sxtl2           v17.8h, v18.16b
        sxtl            v18.8h, v19.8b
        sxtl2           v19.8h, v19.16b
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
        shl             v16.8h, v16.8h, #9 // -m << 9
        shl             v17.8h, v17.8h, #9
        shl             v18.8h, v18.8h, #9
        shl             v19.8h, v19.8h, #9
        sub             v4.8h, v0.8h, v4.8h // a - b
        sub             v5.8h, v1.8h, v5.8h
        sub             v6.8h, v2.8h, v6.8h
        sub             v7.8h, v3.8h, v7.8h
        sqrdmulh        v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh        v5.8h, v5.8h, v17.8h
        sqrdmulh        v6.8h, v6.8h, v18.8h
        sqrdmulh        v7.8h, v7.8h, v19.8h
        add             v0.8h, v0.8h, v4.8h
        add             v1.8h, v1.8h, v5.8h
        add             v2.8h, v2.8h, v6.8h
        add             v3.8h, v3.8h, v7.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        b.gt            32b
        ret
L(blend_tbl):
        .hword L(blend_tbl) -  32b
        .hword L(blend_tbl) - 160b
        .hword L(blend_tbl) -  80b
        .hword L(blend_tbl) -  40b
endfunc

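// blend_h blends the top edge of the block with obmc_masks: the mask
// pointer is offset by the block height (each height has its own mask
// slice), and only h - h/4 rows are blended (w4 is reduced up front).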
function blend_h_16bpc_neon, export=1
        adr             x6, L(blend_h_tbl)
        movrel          x5, X(obmc_masks)
        add             x5, x5, w4, uxtw
        sub             w4, w4, w4, lsr #2
        clz             w7, w3
        add             x8, x0, x1
        lsl             x1, x1, #1
        sub             w7, w7, #24
        ldrh            w7, [x6, x7, lsl #1]
        sub             x6, x6, w7, uxtw
        br              x6
2:
        ld2r            {v2.8b, v3.8b}, [x5], #2
        ld1             {v1.4h}, [x2], #8
        ext             v2.8b, v2.8b, v3.8b, #6
        subs            w4, w4, #2
        neg             v2.8b, v2.8b // -m
        ld1             {v0.s}[0], [x0]
        ld1             {v0.s}[1], [x8]
        sxtl            v2.8h, v2.8b
        shl             v2.4h, v2.4h, #9 // -m << 9
        sub             v1.4h, v0.4h, v1.4h // a - b
        sqrdmulh        v1.4h, v1.4h, v2.4h // ((a-b)*-m + 32) >> 6
        add             v0.4h, v0.4h, v1.4h
        st1             {v0.s}[0], [x0], x1
        st1             {v0.s}[1], [x8], x1
        b.gt            2b
        ret
4:
        ld2r            {v2.8b, v3.8b}, [x5], #2
        ld1             {v1.8h}, [x2], #16
        ext             v2.8b, v2.8b, v3.8b, #4
        subs            w4, w4, #2
        neg             v2.8b, v2.8b // -m
        ld1             {v0.d}[0], [x0]
        ld1             {v0.d}[1], [x8]
        sxtl            v2.8h, v2.8b
        shl             v2.8h, v2.8h, #9 // -m << 9
        sub             v1.8h, v0.8h, v1.8h // a - b
        sqrdmulh        v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6
        add             v0.8h, v0.8h, v1.8h
        st1             {v0.d}[0], [x0], x1
        st1             {v0.d}[1], [x8], x1
        b.gt            4b
        ret
8:
        ld2r            {v4.8b, v5.8b}, [x5], #2
        ld1             {v2.8h, v3.8h}, [x2], #32
        neg             v4.8b, v4.8b // -m
        neg             v5.8b, v5.8b
        ld1             {v0.8h}, [x0]
        subs            w4, w4, #2
        sxtl            v4.8h, v4.8b
        sxtl            v5.8h, v5.8b
        ld1             {v1.8h}, [x8]
        shl             v4.8h, v4.8h, #9 // -m << 9
        shl             v5.8h, v5.8h, #9
        sub             v2.8h, v0.8h, v2.8h // a - b
        sub             v3.8h, v1.8h, v3.8h
        sqrdmulh        v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh        v3.8h, v3.8h, v5.8h
        add             v0.8h, v0.8h, v2.8h
        add             v1.8h, v1.8h, v3.8h
        st1             {v0.8h}, [x0], x1
        st1             {v1.8h}, [x8], x1
        b.gt            8b
        ret
16:
        ld2r            {v16.8b, v17.8b}, [x5], #2
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        neg             v16.8b, v16.8b // -m
        neg             v17.8b, v17.8b
        ld1             {v0.8h, v1.8h}, [x0]
        ld1             {v2.8h, v3.8h}, [x8]
        subs            w4, w4, #2
        sxtl            v16.8h, v16.8b
        sxtl            v17.8h, v17.8b
        shl             v16.8h, v16.8h, #9 // -m << 9
        shl             v17.8h, v17.8h, #9
        sub             v4.8h, v0.8h, v4.8h // a - b
        sub             v5.8h, v1.8h, v5.8h
        sub             v6.8h, v2.8h, v6.8h
        sub             v7.8h, v3.8h, v7.8h
        sqrdmulh        v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh        v5.8h, v5.8h, v16.8h
        sqrdmulh        v6.8h, v6.8h, v17.8h
        sqrdmulh        v7.8h, v7.8h, v17.8h
        add             v0.8h, v0.8h, v4.8h
        add             v1.8h, v1.8h, v5.8h
        add             v2.8h, v2.8h, v6.8h
        add             v3.8h, v3.8h, v7.8h
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v2.8h, v3.8h}, [x8], x1
        b.gt            16b
        ret
1280:
640:
320:
        sub             x1, x1, w3, uxtw #1
        add             x7, x2, w3, uxtw #1
321:
        ld2r            {v24.8b, v25.8b}, [x5], #2
        mov             w6, w3
        neg             v24.8b, v24.8b // -m
        neg             v25.8b, v25.8b
        sxtl            v24.8h, v24.8b
        sxtl            v25.8h, v25.8b
        shl             v24.8h, v24.8h, #9 // -m << 9
        shl             v25.8h, v25.8h, #9
32:
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
        subs            w6, w6, #32
        sub             v16.8h, v0.8h, v16.8h // a - b
        sub             v17.8h, v1.8h, v17.8h
        sub             v18.8h, v2.8h, v18.8h
        sub             v19.8h, v3.8h, v19.8h
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x8]
        sqrdmulh        v16.8h, v16.8h, v24.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh        v17.8h, v17.8h, v24.8h
        sqrdmulh        v18.8h, v18.8h, v24.8h
        sqrdmulh        v19.8h, v19.8h, v24.8h
        sub             v20.8h, v4.8h, v20.8h // a - b
        sub             v21.8h, v5.8h, v21.8h
        sub             v22.8h, v6.8h, v22.8h
        sub             v23.8h, v7.8h, v23.8h
        add             v0.8h, v0.8h, v16.8h
        add             v1.8h, v1.8h, v17.8h
        add             v2.8h, v2.8h, v18.8h
        add             v3.8h, v3.8h, v19.8h
        sqrdmulh        v20.8h, v20.8h, v25.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh        v21.8h, v21.8h, v25.8h
        sqrdmulh        v22.8h, v22.8h, v25.8h
        sqrdmulh        v23.8h, v23.8h, v25.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v4.8h, v4.8h, v20.8h
        add             v5.8h, v5.8h, v21.8h
        add             v6.8h, v6.8h, v22.8h
        add             v7.8h, v7.8h, v23.8h
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x8], #64
        b.gt            32b
        subs            w4, w4, #2
        add             x0, x0, x1
        add             x8, x8, x1
        add             x2, x2, w3, uxtw #1
        add             x7, x7, w3, uxtw #1
        b.gt            321b
        ret
L(blend_h_tbl):
        .hword L(blend_h_tbl) - 1280b
        .hword L(blend_h_tbl) -  640b
        .hword L(blend_h_tbl) -  320b
        .hword L(blend_h_tbl) -   16b
        .hword L(blend_h_tbl) -    8b
        .hword L(blend_h_tbl) -    4b
        .hword L(blend_h_tbl) -    2b
endfunc

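// blend_v blends the left edge of the block; only the leftmost 3*w/4
// columns are written back (hence the partial stores in each size branch),
// and the per-column mask is constant over the whole height.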
function blend_v_16bpc_neon, export=1
        adr             x6, L(blend_v_tbl)
        movrel          x5, X(obmc_masks)
        add             x5, x5, w3, uxtw
        clz             w3, w3
        add             x8, x0, x1
        lsl             x1, x1, #1
        sub             w3, w3, #26
        ldrh            w3, [x6, x3, lsl #1]
        sub             x6, x6, w3, uxtw
        br              x6
20:
        ld1r            {v2.8b}, [x5]
        neg             v2.8b, v2.8b // -m
        sxtl            v2.8h, v2.8b
        shl             v2.4h, v2.4h, #9 // -m << 9
2:
        ld1             {v1.s}[0], [x2], #4
        ld1             {v0.h}[0], [x0]
        subs            w4, w4, #2
        ld1             {v1.h}[1], [x2]
        ld1             {v0.h}[1], [x8]
        add             x2, x2, #4
        sub             v1.4h, v0.4h, v1.4h // a - b
        sqrdmulh        v1.4h, v1.4h, v2.4h // ((a-b)*-m + 32) >> 6
        add             v0.4h, v0.4h, v1.4h
        st1             {v0.h}[0], [x0], x1
        st1             {v0.h}[1], [x8], x1
        b.gt            2b
        ret
40:
        ld1r            {v2.2s}, [x5]
        sub             x1, x1, #4
        neg             v2.8b, v2.8b // -m
        sxtl            v2.8h, v2.8b
        shl             v2.8h, v2.8h, #9 // -m << 9
4:
        ld1             {v1.8h}, [x2], #16
        ld1             {v0.d}[0], [x0]
        ld1             {v0.d}[1], [x8]
        subs            w4, w4, #2
        sub             v1.8h, v0.8h, v1.8h // a - b
        sqrdmulh        v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6
        add             v0.8h, v0.8h, v1.8h
        st1             {v0.s}[0], [x0], #4
        st1             {v0.s}[2], [x8], #4
        st1             {v0.h}[2], [x0], x1
        st1             {v0.h}[6], [x8], x1
        b.gt            4b
        ret
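        // 8-wide: only the left 6 pixels are blended, stored as one 64-bit
        // lane plus one 32-bit lane per row.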
80:
        ld1             {v4.8b}, [x5]
        sub             x1, x1, #8
        neg             v4.8b, v4.8b // -m
        sxtl            v4.8h, v4.8b
        shl             v4.8h, v4.8h, #9 // -m << 9
8:
        ld1             {v2.8h, v3.8h}, [x2], #32
        ld1             {v0.8h}, [x0]
        ld1             {v1.8h}, [x8]
        subs            w4, w4, #2
        sub             v2.8h, v0.8h, v2.8h // a - b
        sub             v3.8h, v1.8h, v3.8h
        sqrdmulh        v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh        v3.8h, v3.8h, v4.8h
        add             v0.8h, v0.8h, v2.8h
        add             v1.8h, v1.8h, v3.8h
        st1             {v0.d}[0], [x0], #8
        st1             {v1.d}[0], [x8], #8
        st1             {v0.s}[2], [x0], x1
        st1             {v1.s}[2], [x8], x1
        b.gt            8b
        ret
160:
        ld1             {v16.8b, v17.8b}, [x5]
        sub             x1, x1, #16
        neg             v16.8b, v16.8b // -m
        neg             v17.8b, v17.8b
        sxtl            v16.8h, v16.8b
        sxtl            v17.8h, v17.8b
        shl             v16.8h, v16.8h, #9 // -m << 9
        shl             v17.4h, v17.4h, #9
16:
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        ld1             {v0.8h, v1.8h}, [x0]
        subs            w4, w4, #2
        ld1             {v2.8h, v3.8h}, [x8]
        sub             v4.8h, v0.8h, v4.8h // a - b
        sub             v5.4h, v1.4h, v5.4h
        sub             v6.8h, v2.8h, v6.8h
        sub             v7.4h, v3.4h, v7.4h
        sqrdmulh        v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh        v5.4h, v5.4h, v17.4h
        sqrdmulh        v6.8h, v6.8h, v16.8h
        sqrdmulh        v7.4h, v7.4h, v17.4h
        add             v0.8h, v0.8h, v4.8h
        add             v1.4h, v1.4h, v5.4h
        add             v2.8h, v2.8h, v6.8h
        add             v3.4h, v3.4h, v7.4h
        st1             {v0.8h}, [x0], #16
        st1             {v2.8h}, [x8], #16
        st1             {v1.4h}, [x0], x1
        st1             {v3.4h}, [x8], x1
        b.gt            16b
        ret
320:
        ld1             {v24.16b, v25.16b}, [x5]
        neg             v26.16b, v24.16b // -m
        neg             v27.8b, v25.8b
        sxtl            v24.8h, v26.8b
        sxtl2           v25.8h, v26.16b
        sxtl            v26.8h, v27.8b
        shl             v24.8h, v24.8h, #9 // -m << 9
        shl             v25.8h, v25.8h, #9
        shl             v26.8h, v26.8h, #9
32:
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
        ld1             {v0.8h, v1.8h, v2.8h}, [x0]
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64
        ld1             {v4.8h, v5.8h, v6.8h}, [x8]
        subs            w4, w4, #2
        sub             v16.8h, v0.8h, v16.8h // a - b
        sub             v17.8h, v1.8h, v17.8h
        sub             v18.8h, v2.8h, v18.8h
        sub             v20.8h, v4.8h, v20.8h
        sub             v21.8h, v5.8h, v21.8h
        sub             v22.8h, v6.8h, v22.8h
        sqrdmulh        v16.8h, v16.8h, v24.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh        v17.8h, v17.8h, v25.8h
        sqrdmulh        v18.8h, v18.8h, v26.8h
        sqrdmulh        v20.8h, v20.8h, v24.8h
        sqrdmulh        v21.8h, v21.8h, v25.8h
        sqrdmulh        v22.8h, v22.8h, v26.8h
        add             v0.8h, v0.8h, v16.8h
        add             v1.8h, v1.8h, v17.8h
        add             v2.8h, v2.8h, v18.8h
        add             v4.8h, v4.8h, v20.8h
        add             v5.8h, v5.8h, v21.8h
        add             v6.8h, v6.8h, v22.8h
        st1             {v0.8h, v1.8h, v2.8h}, [x0], x1
        st1             {v4.8h, v5.8h, v6.8h}, [x8], x1
        b.gt            32b
        ret
L(blend_v_tbl):
        .hword L(blend_v_tbl) - 320b
        .hword L(blend_v_tbl) - 160b
        .hword L(blend_v_tbl) -  80b
        .hword L(blend_v_tbl) -  40b
        .hword L(blend_v_tbl) -  20b
endfunc


// This has got the same signature as the put_8tap functions,
// and assumes that x9 is set to (clz(w)-24).
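// The jump tables below hold 16-bit offsets that are subtracted from the
// table's own address, so the branch target for a width w is found at
// index clz(w)-24 (entries ordered from the largest width down) and
// reached with a single br.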
function put_neon
        adr             x10, L(put_tbl)
        ldrh            w9, [x10, x9, lsl #1]
        sub             x10, x10, w9, uxtw
        br              x10

2:
        ld1             {v0.s}[0], [x2], x3
        ld1             {v1.s}[0], [x2], x3
        subs            w5, w5, #2
        st1             {v0.s}[0], [x0], x1
        st1             {v1.s}[0], [x0], x1
        b.gt            2b
        ret
4:
        ld1             {v0.4h}, [x2], x3
        ld1             {v1.4h}, [x2], x3
        subs            w5, w5, #2
        st1             {v0.4h}, [x0], x1
        st1             {v1.4h}, [x0], x1
        b.gt            4b
        ret
80:
        add             x8, x0, x1
        lsl             x1, x1, #1
        add             x9, x2, x3
        lsl             x3, x3, #1
8:
        ld1             {v0.8h}, [x2], x3
        ld1             {v1.8h}, [x9], x3
        subs            w5, w5, #2
        st1             {v0.8h}, [x0], x1
        st1             {v1.8h}, [x8], x1
        b.gt            8b
        ret
16:
        ldp             x6, x7, [x2]
        ldp             x8, x9, [x2, #16]
        stp             x6, x7, [x0]
        subs            w5, w5, #1
        stp             x8, x9, [x0, #16]
        add             x2, x2, x3
        add             x0, x0, x1
        b.gt            16b
        ret
32:
        ldp             x6, x7, [x2]
        ldp             x8, x9, [x2, #16]
        stp             x6, x7, [x0]
        ldp             x10, x11, [x2, #32]
        stp             x8, x9, [x0, #16]
        subs            w5, w5, #1
        ldp             x12, x13, [x2, #48]
        stp             x10, x11, [x0, #32]
        stp             x12, x13, [x0, #48]
        add             x2, x2, x3
        add             x0, x0, x1
        b.gt            32b
        ret
64:
        ldp             q0, q1, [x2]
        ldp             q2, q3, [x2, #32]
        stp             q0, q1, [x0]
        ldp             q4, q5, [x2, #64]
        stp             q2, q3, [x0, #32]
        ldp             q6, q7, [x2, #96]
        subs            w5, w5, #1
        stp             q4, q5, [x0, #64]
        stp             q6, q7, [x0, #96]
        add             x2, x2, x3
        add             x0, x0, x1
        b.gt            64b
        ret
128:
        ldp             q0, q1, [x2]
        ldp             q2, q3, [x2, #32]
        stp             q0, q1, [x0]
        ldp             q4, q5, [x2, #64]
        stp             q2, q3, [x0, #32]
        ldp             q6, q7, [x2, #96]
        subs            w5, w5, #1
        stp             q4, q5, [x0, #64]
        ldp             q16, q17, [x2, #128]
        stp             q6, q7, [x0, #96]
        ldp             q18, q19, [x2, #160]
        stp             q16, q17, [x0, #128]
        ldp             q20, q21, [x2, #192]
        stp             q18, q19, [x0, #160]
        ldp             q22, q23, [x2, #224]
        stp             q20, q21, [x0, #192]
        stp             q22, q23, [x0, #224]
        add             x2, x2, x3
        add             x0, x0, x1
        b.gt            128b
        ret

L(put_tbl):
        .hword L(put_tbl) - 128b
        .hword L(put_tbl) -  64b
        .hword L(put_tbl) -  32b
        .hword L(put_tbl) -  16b
        .hword L(put_tbl) -  80b
        .hword L(put_tbl) -   4b
        .hword L(put_tbl) -   2b
endfunc


// This has got the same signature as the prep_8tap functions,
// and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and
// x8 to w*2.
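// prep stores intermediates as (pixel << intermediate_bits) - PREP_BIAS so
// that they fit in signed 16 bit; v30 below holds PREP_BIAS, materialized
// with a shifted movi (8192 == 32 << 8).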
function prep_neon
        adr             x10, L(prep_tbl)
        ldrh            w9, [x10, x9, lsl #1]
        dup             v31.8h, w7 // intermediate_bits
        movi            v30.8h, #(PREP_BIAS >> 8), lsl #8
        sub             x10, x10, w9, uxtw
        br              x10

40:
        add             x9, x1, x2
        lsl             x2, x2, #1
4:
        ld1             {v0.d}[0], [x1], x2
        ld1             {v0.d}[1], [x9], x2
        subs            w4, w4, #2
        sshl            v0.8h, v0.8h, v31.8h
        sub             v0.8h, v0.8h, v30.8h
        st1             {v0.8h}, [x0], #16
        b.gt            4b
        ret
80:
        add             x9, x1, x2
        lsl             x2, x2, #1
8:
        ld1             {v0.8h}, [x1], x2
        ld1             {v1.8h}, [x9], x2
        subs            w4, w4, #2
        sshl            v0.8h, v0.8h, v31.8h
        sshl            v1.8h, v1.8h, v31.8h
        sub             v0.8h, v0.8h, v30.8h
        sub             v1.8h, v1.8h, v30.8h
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            8b
        ret
16:
        ldp             q0, q1, [x1]
        add             x1, x1, x2
        sshl            v0.8h, v0.8h, v31.8h
        ldp             q2, q3, [x1]
        add             x1, x1, x2
        subs            w4, w4, #2
        sshl            v1.8h, v1.8h, v31.8h
        sshl            v2.8h, v2.8h, v31.8h
        sshl            v3.8h, v3.8h, v31.8h
        sub             v0.8h, v0.8h, v30.8h
        sub             v1.8h, v1.8h, v30.8h
        sub             v2.8h, v2.8h, v30.8h
        sub             v3.8h, v3.8h, v30.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            16b
        ret
32:
        ldp             q0, q1, [x1]
        sshl            v0.8h, v0.8h, v31.8h
        ldp             q2, q3, [x1, #32]
        add             x1, x1, x2
        sshl            v1.8h, v1.8h, v31.8h
        sshl            v2.8h, v2.8h, v31.8h
        sshl            v3.8h, v3.8h, v31.8h
        subs            w4, w4, #1
        sub             v0.8h, v0.8h, v30.8h
        sub             v1.8h, v1.8h, v30.8h
        sub             v2.8h, v2.8h, v30.8h
        sub             v3.8h, v3.8h, v30.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            32b
        ret
64:
        ldp             q0, q1, [x1]
        subs            w4, w4, #1
        sshl            v0.8h, v0.8h, v31.8h
        ldp             q2, q3, [x1, #32]
        sshl            v1.8h, v1.8h, v31.8h
        ldp             q4, q5, [x1, #64]
        sshl            v2.8h, v2.8h, v31.8h
        sshl            v3.8h, v3.8h, v31.8h
        ldp             q6, q7, [x1, #96]
        add             x1, x1, x2
        sshl            v4.8h, v4.8h, v31.8h
        sshl            v5.8h, v5.8h, v31.8h
        sshl            v6.8h, v6.8h, v31.8h
        sshl            v7.8h, v7.8h, v31.8h
        sub             v0.8h, v0.8h, v30.8h
        sub             v1.8h, v1.8h, v30.8h
        sub             v2.8h, v2.8h, v30.8h
        sub             v3.8h, v3.8h, v30.8h
        stp             q0, q1, [x0]
        sub             v4.8h, v4.8h, v30.8h
        sub             v5.8h, v5.8h, v30.8h
        stp             q2, q3, [x0, #32]
        sub             v6.8h, v6.8h, v30.8h
        sub             v7.8h, v7.8h, v30.8h
        stp             q4, q5, [x0, #64]
        stp             q6, q7, [x0, #96]
        add             x0, x0, x8
        b.gt            64b
        ret
128:
        ldp             q0, q1, [x1]
        subs            w4, w4, #1
        sshl            v0.8h, v0.8h, v31.8h
        ldp             q2, q3, [x1, #32]
        sshl            v1.8h, v1.8h, v31.8h
        ldp             q4, q5, [x1, #64]
        sshl            v2.8h, v2.8h, v31.8h
        sshl            v3.8h, v3.8h, v31.8h
        ldp             q6, q7, [x1, #96]
        sshl            v4.8h, v4.8h, v31.8h
        sshl            v5.8h, v5.8h, v31.8h
        ldp             q16, q17, [x1, #128]
        sshl            v6.8h, v6.8h, v31.8h
        sshl            v7.8h, v7.8h, v31.8h
        ldp             q18, q19, [x1, #160]
        sshl            v16.8h, v16.8h, v31.8h
        sshl            v17.8h, v17.8h, v31.8h
        ldp             q20, q21, [x1, #192]
        sshl            v18.8h, v18.8h, v31.8h
        sshl            v19.8h, v19.8h, v31.8h
        ldp             q22, q23, [x1, #224]
        add             x1, x1, x2
        sshl            v20.8h, v20.8h, v31.8h
        sshl            v21.8h, v21.8h, v31.8h
        sshl            v22.8h, v22.8h, v31.8h
        sshl            v23.8h, v23.8h, v31.8h
        sub             v0.8h, v0.8h, v30.8h
        sub             v1.8h, v1.8h, v30.8h
        sub             v2.8h, v2.8h, v30.8h
        sub             v3.8h, v3.8h, v30.8h
        stp             q0, q1, [x0]
        sub             v4.8h, v4.8h, v30.8h
        sub             v5.8h, v5.8h, v30.8h
        stp             q2, q3, [x0, #32]
        sub             v6.8h, v6.8h, v30.8h
        sub             v7.8h, v7.8h, v30.8h
        stp             q4, q5, [x0, #64]
        sub             v16.8h, v16.8h, v30.8h
        sub             v17.8h, v17.8h, v30.8h
        stp             q6, q7, [x0, #96]
        sub             v18.8h, v18.8h, v30.8h
        sub             v19.8h, v19.8h, v30.8h
        stp             q16, q17, [x0, #128]
        sub             v20.8h, v20.8h, v30.8h
        sub             v21.8h, v21.8h, v30.8h
        stp             q18, q19, [x0, #160]
        sub             v22.8h, v22.8h, v30.8h
        sub             v23.8h, v23.8h, v30.8h
        stp             q20, q21, [x0, #192]
        stp             q22, q23, [x0, #224]
        add             x0, x0, x8
        b.gt            128b
        ret

L(prep_tbl):
        .hword L(prep_tbl) - 128b
        .hword L(prep_tbl) -  64b
        .hword L(prep_tbl) -  32b
        .hword L(prep_tbl) -  16b
        .hword L(prep_tbl) -  80b
        .hword L(prep_tbl) -  40b
endfunc

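// Helper macros shared by the 8tap filter paths below: the load_* macros
// fetch rows (optionally from two alternating pointers), interleave_1
// pairs up 2-pixel rows for the 2xN cases, smull_smlal_{4,8} and their
// *2 variants form the 4- and 8-tap dot products against the filter in
// v0, and shift_store_* apply the final rounding plus clamping (put) or
// PREP_BIAS subtraction (prep) before storing.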
.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        ld1             {\d0\wd}[0], [\s0], \strd
        ld1             {\d1\wd}[0], [\s1], \strd
.ifnb \d2
        ld1             {\d2\wd}[0], [\s0], \strd
        ld1             {\d3\wd}[0], [\s1], \strd
.endif
.ifnb \d4
        ld1             {\d4\wd}[0], [\s0], \strd
.endif
.ifnb \d5
        ld1             {\d5\wd}[0], [\s1], \strd
.endif
.ifnb \d6
        ld1             {\d6\wd}[0], [\s0], \strd
.endif
.endm
.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        ld1             {\d0\wd}, [\s0], \strd
        ld1             {\d1\wd}, [\s1], \strd
.ifnb \d2
        ld1             {\d2\wd}, [\s0], \strd
        ld1             {\d3\wd}, [\s1], \strd
.endif
.ifnb \d4
        ld1             {\d4\wd}, [\s0], \strd
.endif
.ifnb \d5
        ld1             {\d5\wd}, [\s1], \strd
.endif
.ifnb \d6
        ld1             {\d6\wd}, [\s0], \strd
.endif
.endm
.macro load_regpair s0, s1, strd, wd, d0, d1, d2, d3, d4, d5
        ld1             {\d0\wd, \d1\wd}, [\s0], \strd
.ifnb \d2
        ld1             {\d2\wd, \d3\wd}, [\s1], \strd
.endif
.ifnb \d4
        ld1             {\d4\wd, \d5\wd}, [\s0], \strd
.endif
.endm
.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_slice      \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_4h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_reg        \s0, \s1, \strd, .4h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_8h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_reg        \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_16h s0, s1, strd, d0, d1, d2, d3, d4, d5
        load_regpair    \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5
.endm
.macro interleave_1 wd, r0, r1, r2, r3, r4
        trn1            \r0\wd, \r0\wd, \r1\wd
        trn1            \r1\wd, \r1\wd, \r2\wd
.ifnb \r3
        trn1            \r2\wd, \r2\wd, \r3\wd
        trn1            \r3\wd, \r3\wd, \r4\wd
.endif
.endm
.macro interleave_1_s r0, r1, r2, r3, r4
        interleave_1    .2s, \r0, \r1, \r2, \r3, \r4
.endm
.macro umin_h c, wd, r0, r1, r2, r3
        umin            \r0\wd, \r0\wd, \c\wd
.ifnb \r1
        umin            \r1\wd, \r1\wd, \c\wd
.endif
.ifnb \r2
        umin            \r2\wd, \r2\wd, \c\wd
        umin            \r3\wd, \r3\wd, \c\wd
.endif
.endm
.macro sub_h c, wd, r0, r1, r2, r3
        sub             \r0\wd, \r0\wd, \c\wd
.ifnb \r1
        sub             \r1\wd, \r1\wd, \c\wd
.endif
.ifnb \r2
        sub             \r2\wd, \r2\wd, \c\wd
        sub             \r3\wd, \r3\wd, \c\wd
.endif
.endm
.macro smull_smlal_4 d, s0, s1, s2, s3
        smull           \d\().4s, \s0\().4h, v0.h[0]
        smlal           \d\().4s, \s1\().4h, v0.h[1]
        smlal           \d\().4s, \s2\().4h, v0.h[2]
        smlal           \d\().4s, \s3\().4h, v0.h[3]
.endm
.macro smull2_smlal2_4 d, s0, s1, s2, s3
        smull2          \d\().4s, \s0\().8h, v0.h[0]
        smlal2          \d\().4s, \s1\().8h, v0.h[1]
        smlal2          \d\().4s, \s2\().8h, v0.h[2]
        smlal2          \d\().4s, \s3\().8h, v0.h[3]
.endm
.macro smull_smlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7
        smull           \d\().4s, \s0\().4h, v0.h[0]
        smlal           \d\().4s, \s1\().4h, v0.h[1]
        smlal           \d\().4s, \s2\().4h, v0.h[2]
        smlal           \d\().4s, \s3\().4h, v0.h[3]
        smlal           \d\().4s, \s4\().4h, v0.h[4]
        smlal           \d\().4s, \s5\().4h, v0.h[5]
        smlal           \d\().4s, \s6\().4h, v0.h[6]
        smlal           \d\().4s, \s7\().4h, v0.h[7]
.endm
.macro smull2_smlal2_8 d, s0, s1, s2, s3, s4, s5, s6, s7
        smull2          \d\().4s, \s0\().8h, v0.h[0]
        smlal2          \d\().4s, \s1\().8h, v0.h[1]
        smlal2          \d\().4s, \s2\().8h, v0.h[2]
        smlal2          \d\().4s, \s3\().8h, v0.h[3]
        smlal2          \d\().4s, \s4\().8h, v0.h[4]
        smlal2          \d\().4s, \s5\().8h, v0.h[5]
        smlal2          \d\().4s, \s6\().8h, v0.h[6]
        smlal2          \d\().4s, \s7\().8h, v0.h[7]
.endm
.macro sqrshrun_h shift, r0, r1, r2, r3
        sqrshrun        \r0\().4h, \r0\().4s, #\shift
.ifnb \r1
        sqrshrun2       \r0\().8h, \r1\().4s, #\shift
.endif
.ifnb \r2
        sqrshrun        \r2\().4h, \r2\().4s, #\shift
        sqrshrun2       \r2\().8h, \r3\().4s, #\shift
.endif
.endm
.macro xtn_h r0, r1, r2, r3
        xtn             \r0\().4h, \r0\().4s
        xtn2            \r0\().8h, \r1\().4s
.ifnb \r2
        xtn             \r2\().4h, \r2\().4s
        xtn2            \r2\().8h, \r3\().4s
.endif
.endm
.macro srshl_s shift, r0, r1, r2, r3
        srshl           \r0\().4s, \r0\().4s, \shift\().4s
        srshl           \r1\().4s, \r1\().4s, \shift\().4s
.ifnb \r2
        srshl           \r2\().4s, \r2\().4s, \shift\().4s
        srshl           \r3\().4s, \r3\().4s, \shift\().4s
.endif
.endm
.macro st_s strd, reg, lanes
        st1             {\reg\().s}[0], [x0], \strd
        st1             {\reg\().s}[1], [x9], \strd
.if \lanes > 2
        st1             {\reg\().s}[2], [x0], \strd
        st1             {\reg\().s}[3], [x9], \strd
.endif
.endm
.macro st_d strd, r0, r1
        st1             {\r0\().d}[0], [x0], \strd
        st1             {\r0\().d}[1], [x9], \strd
.ifnb \r1
        st1             {\r1\().d}[0], [x0], \strd
        st1             {\r1\().d}[1], [x9], \strd
.endif
.endm
.macro shift_store_4 type, strd, r0, r1, r2, r3
.ifc \type, put
        sqrshrun_h      6, \r0, \r1, \r2, \r3
        umin_h          v31, .8h, \r0, \r2
.else
        srshl_s         v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
        xtn_h           \r0, \r1, \r2, \r3
        sub_h           v29, .8h, \r0, \r2 // PREP_BIAS
.endif
        st_d            \strd, \r0, \r2
.endm
.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
        st1             {\r0\wd}, [x0], \strd
        st1             {\r1\wd}, [x9], \strd
.ifnb \r2
        st1             {\r2\wd}, [x0], \strd
        st1             {\r3\wd}, [x9], \strd
.endif
.ifnb \r4
        st1             {\r4\wd}, [x0], \strd
        st1             {\r5\wd}, [x9], \strd
        st1             {\r6\wd}, [x0], \strd
        st1             {\r7\wd}, [x9], \strd
.endif
.endm
.macro st_8h strd, r0, r1, r2, r3, r4, r5, r6, r7
        st_reg          \strd, .8h, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endm
.macro shift_store_8 type, strd, r0, r1, r2, r3
.ifc \type, put
        sqrshrun_h      6, \r0, \r1, \r2, \r3
        umin_h          v31, .8h, \r0, \r2
.else
        srshl_s         v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
        xtn_h           \r0, \r1, \r2, \r3
        sub_h           v29, .8h, \r0, \r2 // PREP_BIAS
.endif
        st_8h           \strd, \r0, \r2
.endm
.macro shift_store_16 type, strd, dst, r0, r1, r2, r3
.ifc \type, put
        sqrshrun_h      6, \r0, \r1, \r2, \r3
        umin            \r0\().8h, \r0\().8h, v31.8h
        umin            \r1\().8h, \r2\().8h, v31.8h
.else
        srshl_s         v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
        xtn_h           \r0, \r1, \r2, \r3
        sub             \r0\().8h, \r0\().8h, v29.8h
        sub             \r1\().8h, \r2\().8h, v29.8h
.endif
        st1             {\r0\().8h, \r1\().8h}, [\dst], \strd
.endm

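// The REGULAR/SMOOTH/SHARP constants pack two filter-table offsets: the
// 4-tap variant's offset in bits 0-6 and the 8-tap variant's in bits 7-13.
// Multiplying mx/my by 0x4081 broadcasts the subpel position into the same
// three 7-bit fields (bits 0, 7 and 14), so a single add yields the 4-tap
// index, the 8-tap index, and, in bits 14+, the raw position used for the
// "is there any subpel offset" test.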
.macro make_8tap_fn op, type, type_h, type_v
function \op\()_8tap_\type\()_16bpc_neon, export=1
        mov             w9, \type_h
        mov             w10, \type_v
        b               \op\()_8tap_neon
endfunc
.endm

// No spaces in these expressions, due to gas-preprocessor.
#define REGULAR ((0*15<<7)|3*15)
#define SMOOTH  ((1*15<<7)|4*15)
#define SHARP   ((2*15<<7)|3*15)

.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
make_8tap_fn \type, regular,        REGULAR, REGULAR
make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
make_8tap_fn \type, regular_sharp,  REGULAR, SHARP
make_8tap_fn \type, smooth,         SMOOTH,  SMOOTH
make_8tap_fn \type, smooth_regular, SMOOTH,  REGULAR
make_8tap_fn \type, smooth_sharp,   SMOOTH,  SHARP
make_8tap_fn \type, sharp,          SHARP,   SHARP
make_8tap_fn \type, sharp_regular,  SHARP,   REGULAR
make_8tap_fn \type, sharp_smooth,   SHARP,   SMOOTH

function \type\()_8tap_neon
.ifc \bdmax, w8
        ldr             w8, [sp]
.endif
        mov             w11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
        mul             \mx, \mx, w11
        mul             \my, \my, w11
        add             \mx, \mx, w9 // mx, 8tap_h, 4tap_h
        add             \my, \my, w10 // my, 8tap_v, 4tap_v
.ifc \type, prep
        uxtw            \d_strd, \w
        lsl             \d_strd, \d_strd, #1
.endif

        dup             v31.8h, \bdmax // bitdepth_max
        clz             \bdmax, \bdmax
        clz             w9, \w
        sub             \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18
        mov             w12, #6
        tst             \mx, #(0x7f << 14)
        sub             w9, w9, #24
        add             w13, w12, \bdmax // 6 + intermediate_bits
        sub             w12, w12, \bdmax // 6 - intermediate_bits
        movrel          x11, X(mc_subpel_filters), -8
        b.ne            L(\type\()_8tap_h)
        tst             \my, #(0x7f << 14)
        b.ne            L(\type\()_8tap_v)
        b               \type\()_neon

L(\type\()_8tap_h):
        cmp             \w, #4
        ubfx            w10, \mx, #7, #7
        and             \mx, \mx, #0x7f
        b.le            4f
        mov             \mx, w10
4:
        tst             \my, #(0x7f << 14)
        add             \xmx, x11, \mx, uxtw #3
        b.ne            L(\type\()_8tap_hv)

        adr             x10, L(\type\()_8tap_h_tbl)
        dup             v30.4s, w12 // 6 - intermediate_bits
        ldrh            w9, [x10, x9, lsl #1]
        neg             v30.4s, v30.4s // -(6-intermediate_bits)
.ifc \type, put
        dup             v29.8h, \bdmax // intermediate_bits
.else
        movi            v28.8h, #(PREP_BIAS >> 8), lsl #8
.endif
        sub             x10, x10, w9, uxtw
.ifc \type, put
        neg             v29.8h, v29.8h // -intermediate_bits
.endif
        br              x10

20: // 2xN h
.ifc \type, put
        add             \xmx, \xmx, #2
        ld1             {v0.s}[0], [\xmx]
        sub             \src, \src, #2
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1
        sxtl            v0.8h, v0.8b
2:
        ld1             {v4.8h}, [\src], \s_strd
        ld1             {v6.8h}, [\sr2], \s_strd
        ext             v5.16b, v4.16b, v4.16b, #2
        ext             v7.16b, v6.16b, v6.16b, #2
        subs            \h, \h, #2
        trn1            v3.2s, v4.2s, v6.2s
        trn2            v6.2s, v4.2s, v6.2s
        trn1            v4.2s, v5.2s, v7.2s
        trn2            v7.2s, v5.2s, v7.2s
        smull           v3.4s, v3.4h, v0.h[0]
        smlal           v3.4s, v4.4h, v0.h[1]
        smlal           v3.4s, v6.4h, v0.h[2]
        smlal           v3.4s, v7.4h, v0.h[3]
        srshl           v3.4s, v3.4s, v30.4s // -(6-intermediate_bits)
        sqxtun          v3.4h, v3.4s
        srshl           v3.4h, v3.4h, v29.4h // -intermediate_bits
        umin            v3.4h, v3.4h, v31.4h
        st1             {v3.s}[0], [\dst], \d_strd
        st1             {v3.s}[1], [\ds2], \d_strd
        b.gt            2b
        ret
.endif

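        // The wider horizontal filters below get the shifted tap inputs
        // with ext instead of extra loads: e.g. for 4xN, ext by 2, 4 and 6
        // bytes produces the x+1..x+3 windows from one 8-pixel row load.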
40: // 4xN h
        add             \xmx, \xmx, #2
        ld1             {v0.s}[0], [\xmx]
        sub             \src, \src, #2
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1
        sxtl            v0.8h, v0.8b
4:
        ld1             {v16.8h}, [\src], \s_strd
        ld1             {v20.8h}, [\sr2], \s_strd
        ext             v17.16b, v16.16b, v16.16b, #2
        ext             v18.16b, v16.16b, v16.16b, #4
        ext             v19.16b, v16.16b, v16.16b, #6
        ext             v21.16b, v20.16b, v20.16b, #2
        ext             v22.16b, v20.16b, v20.16b, #4
        ext             v23.16b, v20.16b, v20.16b, #6
        subs            \h, \h, #2
        smull           v16.4s, v16.4h, v0.h[0]
        smlal           v16.4s, v17.4h, v0.h[1]
        smlal           v16.4s, v18.4h, v0.h[2]
        smlal           v16.4s, v19.4h, v0.h[3]
        smull           v20.4s, v20.4h, v0.h[0]
        smlal           v20.4s, v21.4h, v0.h[1]
        smlal           v20.4s, v22.4h, v0.h[2]
        smlal           v20.4s, v23.4h, v0.h[3]
        srshl           v16.4s, v16.4s, v30.4s // -(6-intermediate_bits)
        srshl           v20.4s, v20.4s, v30.4s // -(6-intermediate_bits)
.ifc \type, put
        sqxtun          v16.4h, v16.4s
        sqxtun2         v16.8h, v20.4s
        srshl           v16.8h, v16.8h, v29.8h // -intermediate_bits
        umin            v16.8h, v16.8h, v31.8h
.else
        xtn             v16.4h, v16.4s
        xtn2            v16.8h, v20.4s
        sub             v16.8h, v16.8h, v28.8h // PREP_BIAS
.endif
        st1             {v16.d}[0], [\dst], \d_strd
        st1             {v16.d}[1], [\ds2], \d_strd
        b.gt            4b
        ret

80:
160:
320:
640:
1280: // 8xN, 16xN, 32xN, ... h
        ld1             {v0.8b}, [\xmx]
        sub             \src, \src, #6
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        sxtl            v0.8h, v0.8b

        sub             \s_strd, \s_strd, \w, uxtw #1
        sub             \s_strd, \s_strd, #16
.ifc \type, put
        lsl             \d_strd, \d_strd, #1
        sub             \d_strd, \d_strd, \w, uxtw #1
.endif
81:
        ld1             {v16.8h, v17.8h}, [\src], #32
        ld1             {v20.8h, v21.8h}, [\sr2], #32
        mov             \mx, \w

8:
        smull           v18.4s, v16.4h, v0.h[0]
        smull2          v19.4s, v16.8h, v0.h[0]
        smull           v22.4s, v20.4h, v0.h[0]
        smull2          v23.4s, v20.8h, v0.h[0]
.irpc i, 1234567
        ext             v24.16b, v16.16b, v17.16b, #(2*\i)
        ext             v25.16b, v20.16b, v21.16b, #(2*\i)
        smlal           v18.4s, v24.4h, v0.h[\i]
        smlal2          v19.4s, v24.8h, v0.h[\i]
        smlal           v22.4s, v25.4h, v0.h[\i]
        smlal2          v23.4s, v25.8h, v0.h[\i]
.endr
        subs            \mx, \mx, #8
        srshl           v18.4s, v18.4s, v30.4s // -(6-intermediate_bits)
        srshl           v19.4s, v19.4s, v30.4s // -(6-intermediate_bits)
        srshl           v22.4s, v22.4s, v30.4s // -(6-intermediate_bits)
        srshl           v23.4s, v23.4s, v30.4s // -(6-intermediate_bits)
.ifc \type, put
        sqxtun          v18.4h, v18.4s
        sqxtun2         v18.8h, v19.4s
        sqxtun          v22.4h, v22.4s
        sqxtun2         v22.8h, v23.4s
        srshl           v18.8h, v18.8h, v29.8h // -intermediate_bits
        srshl           v22.8h, v22.8h, v29.8h // -intermediate_bits
        umin            v18.8h, v18.8h, v31.8h
        umin            v22.8h, v22.8h, v31.8h
.else
        xtn             v18.4h, v18.4s
        xtn2            v18.8h, v19.4s
        xtn             v22.4h, v22.4s
        xtn2            v22.8h, v23.4s
        sub             v18.8h, v18.8h, v28.8h // PREP_BIAS
        sub             v22.8h, v22.8h, v28.8h // PREP_BIAS
.endif
        st1             {v18.8h}, [\dst], #16
        st1             {v22.8h}, [\ds2], #16
        b.le            9f

        mov             v16.16b, v17.16b
        mov             v20.16b, v21.16b
        ld1             {v17.8h}, [\src], #16
        ld1             {v21.8h}, [\sr2], #16
        b               8b

9:
        add             \dst, \dst, \d_strd
        add             \ds2, \ds2, \d_strd
        add             \src, \src, \s_strd
        add             \sr2, \sr2, \s_strd

        subs            \h, \h, #2
        b.gt            81b
        ret

L(\type\()_8tap_h_tbl):
        .hword L(\type\()_8tap_h_tbl) - 1280b
        .hword L(\type\()_8tap_h_tbl) -  640b
        .hword L(\type\()_8tap_h_tbl) -  320b
        .hword L(\type\()_8tap_h_tbl) -  160b
        .hword L(\type\()_8tap_h_tbl) -   80b
        .hword L(\type\()_8tap_h_tbl) -   40b
        .hword L(\type\()_8tap_h_tbl) -   20b
        .hword 0

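// Vertical filters: each size keeps a sliding window of source rows in
// registers; every loop iteration loads only the new rows, and the window
// is shifted down (the mov chains below) by the number of rows consumed.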
L(\type\()_8tap_v):
        cmp             \h, #4
        ubfx            w10, \my, #7, #7
        and             \my, \my, #0x7f
        b.le            4f
        mov             \my, w10
4:
        add             \xmy, x11, \my, uxtw #3

.ifc \type, prep
        dup             v30.4s, w12 // 6 - intermediate_bits
        movi            v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
        adr             x10, L(\type\()_8tap_v_tbl)
        ldrh            w9, [x10, x9, lsl #1]
.ifc \type, prep
        neg             v30.4s, v30.4s // -(6-intermediate_bits)
.endif
        sub             x10, x10, w9, uxtw
        br              x10

20: // 2xN v
.ifc \type, put
        b.gt            28f

        cmp             \h, #2
        add             \xmy, \xmy, #2
        ld1             {v0.s}[0], [\xmy]
        sub             \src, \src, \s_strd
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b

        // 2x2 v
        load_s          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
        interleave_1_s  v1, v2, v3, v4, v5
        b.gt            24f
        smull_smlal_4   v6, v1, v2, v3, v4
        sqrshrun_h      6, v6
        umin_h          v31, .8h, v6
        st_s            \d_strd, v6, 2
        ret

24: // 2x4 v
        load_s          \sr2, \src, \s_strd, v6, v7
        interleave_1_s  v5, v6, v7
        smull_smlal_4   v16, v1, v2, v3, v4
        smull_smlal_4   v17, v3, v4, v5, v6
        sqrshrun_h      6, v16, v17
        umin_h          v31, .8h, v16
        st_s            \d_strd, v16, 4
        ret

28: // 2x8, 2x16 v
        ld1             {v0.8b}, [\xmy]
        sub             \sr2, \src, \s_strd, lsl #1
        add             \ds2, \dst, \d_strd
        sub             \src, \sr2, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1
        sxtl            v0.8h, v0.8b

        load_s          \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7
        interleave_1_s  v1, v2, v3, v4, v5
        interleave_1_s  v5, v6, v7
216:
        subs            \h, \h, #8
        load_s          \sr2, \src, \s_strd, v16, v17, v18, v19
        load_s          \sr2, \src, \s_strd, v20, v21, v22, v23
        interleave_1_s  v7, v16, v17, v18, v19
        interleave_1_s  v19, v20, v21, v22, v23
        smull_smlal_8   v24, v1, v2, v3, v4, v5, v6, v7, v16
        smull_smlal_8   v25, v3, v4, v5, v6, v7, v16, v17, v18
        smull_smlal_8   v26, v5, v6, v7, v16, v17, v18, v19, v20
        smull_smlal_8   v27, v7, v16, v17, v18, v19, v20, v21, v22
        sqrshrun_h      6, v24, v25, v26, v27
        umin_h          v31, .8h, v24, v26
        st_s            \d_strd, v24, 4
        st_s            \d_strd, v26, 4
        b.le            0f
        mov             v1.16b, v17.16b
        mov             v2.16b, v18.16b
        mov             v3.16b, v19.16b
        mov             v4.16b, v20.16b
        mov             v5.16b, v21.16b
        mov             v6.16b, v22.16b
        mov             v7.16b, v23.16b
        b               216b
0:
        ret
.endif

40:
        b.gt            480f

        // 4x2, 4x4 v
        cmp             \h, #2
        add             \xmy, \xmy, #2
        ld1             {v0.s}[0], [\xmy]
        sub             \src, \src, \s_strd
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b

        load_4h         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
        smull_smlal_4   v6, v1, v2, v3, v4
        smull_smlal_4   v7, v2, v3, v4, v5
        shift_store_4   \type, \d_strd, v6, v7
        b.le            0f
        load_4h         \sr2, \src, \s_strd, v6, v7
        smull_smlal_4   v1, v3, v4, v5, v6
        smull_smlal_4   v2, v4, v5, v6, v7
        shift_store_4   \type, \d_strd, v1, v2
0:
        ret

480: // 4x8, 4x16 v
        ld1             {v0.8b}, [\xmy]
        sub             \sr2, \src, \s_strd, lsl #1
        add             \ds2, \dst, \d_strd
        sub             \src, \sr2, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b

        load_4h         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22

48:
        subs            \h, \h, #4
        load_4h         \sr2, \src, \s_strd, v23, v24, v25, v26
        smull_smlal_8   v1, v16, v17, v18, v19, v20, v21, v22, v23
        smull_smlal_8   v2, v17, v18, v19, v20, v21, v22, v23, v24
        smull_smlal_8   v3, v18, v19, v20, v21, v22, v23, v24, v25
        smull_smlal_8   v4, v19, v20, v21, v22, v23, v24, v25, v26
        shift_store_4   \type, \d_strd, v1, v2, v3, v4
        b.le            0f
        mov             v16.8b, v20.8b
        mov             v17.8b, v21.8b
        mov             v18.8b, v22.8b
        mov             v19.8b, v23.8b
        mov             v20.8b, v24.8b
        mov             v21.8b, v25.8b
        mov             v22.8b, v26.8b
        b               48b
0:
        ret

80:
        b.gt            880f

        // 8x2, 8x4 v
        cmp             \h, #2
        add             \xmy, \xmy, #2
        ld1             {v0.s}[0], [\xmy]
        sub             \src, \src, \s_strd
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b

        load_8h         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
        smull_smlal_4   v16, v1, v2, v3, v4
        smull2_smlal2_4 v17, v1, v2, v3, v4
        smull_smlal_4   v18, v2, v3, v4, v5
        smull2_smlal2_4 v19, v2, v3, v4, v5
        shift_store_8   \type, \d_strd, v16, v17, v18, v19
        b.le            0f
        load_8h         \sr2, \src, \s_strd, v6, v7
        smull_smlal_4   v16, v3, v4, v5, v6
        smull2_smlal2_4 v17, v3, v4, v5, v6
        smull_smlal_4   v18, v4, v5, v6, v7
        smull2_smlal2_4 v19, v4, v5, v6, v7
        shift_store_8   \type, \d_strd, v16, v17, v18, v19
0:
        ret

880: // 8x6, 8x8, 8x16, 8x32 v
1680: // 16x8, 16x16, ...
320: // 32x8, 32x16, ...
640:
1280:
        ld1             {v0.8b}, [\xmy]
        sub             \src, \src, \s_strd
        sub             \src, \src, \s_strd, lsl #1
        sxtl            v0.8h, v0.8b
        mov             \my, \h
168:
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        load_8h         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22

88:
        subs            \h, \h, #2
        load_8h         \sr2, \src, \s_strd, v23, v24
        smull_smlal_8   v1, v16, v17, v18, v19, v20, v21, v22, v23
        smull2_smlal2_8 v2, v16, v17, v18, v19, v20, v21, v22, v23
        smull_smlal_8   v3, v17, v18, v19, v20, v21, v22, v23, v24
        smull2_smlal2_8 v4, v17, v18, v19, v20, v21, v22, v23, v24
        shift_store_8   \type, \d_strd, v1, v2, v3, v4
        b.le            9f
        subs            \h, \h, #2
        load_8h         \sr2, \src, \s_strd, v25, v26
        smull_smlal_8   v1, v18, v19, v20, v21, v22, v23, v24, v25
        smull2_smlal2_8 v2, v18, v19, v20, v21, v22, v23, v24, v25
        smull_smlal_8   v3, v19, v20, v21, v22, v23, v24, v25, v26
        smull2_smlal2_8 v4, v19, v20, v21, v22, v23, v24, v25, v26
        shift_store_8   \type, \d_strd, v1, v2, v3, v4
        b.le            9f
        mov             v16.16b, v20.16b
        mov             v17.16b, v21.16b
        mov             v18.16b, v22.16b
        mov             v19.16b, v23.16b
        mov             v20.16b, v24.16b
        mov             v21.16b, v25.16b
        mov             v22.16b, v26.16b
        b               88b
9:
        subs            \w, \w, #8
        b.le            0f
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #3
        mov             \h, \my
        add             \src, \src, #16
        add             \dst, \dst, #16
        b               168b
0:
        ret

160:
        b.gt            1680b

        // 16x2, 16x4 v
        add             \xmy, \xmy, #2
        ld1             {v0.s}[0], [\xmy]
        sub             \src, \src, \s_strd
        sxtl            v0.8h, v0.8b

        load_16h        \src, \src, \s_strd, v16, v17, v18, v19, v20, v21
16:
        load_16h        \src, \src, \s_strd, v22, v23
        subs            \h, \h, #1
        smull_smlal_4   v1, v16, v18, v20, v22
        smull2_smlal2_4 v2, v16, v18, v20, v22
        smull_smlal_4   v3, v17, v19, v21, v23
        smull2_smlal2_4 v4, v17, v19, v21, v23
        shift_store_16  \type, \d_strd, x0, v1, v2, v3, v4
        b.le            0f
        mov             v16.16b, v18.16b
        mov             v17.16b, v19.16b
L(\type\()_8tap_v_tbl):
        .hword L(\type\()_8tap_v_tbl) - 1280b
        .hword L(\type\()_8tap_v_tbl) - 640b
        .hword L(\type\()_8tap_v_tbl) - 320b
        .hword L(\type\()_8tap_v_tbl) - 160b
        .hword L(\type\()_8tap_v_tbl) - 80b
        .hword L(\type\()_8tap_v_tbl) - 40b
        .hword L(\type\()_8tap_v_tbl) - 20b
        .hword 0

L(\type\()_8tap_hv):
        cmp             \h, #4
        ubfx            w10, \my, #7, #7
        and             \my, \my, #0x7f
        b.le            4f
        mov             \my, w10
4:
        add             \xmy, x11, \my, uxtw #3

        adr             x10, L(\type\()_8tap_hv_tbl)
        dup             v30.4s, w12      // 6 - intermediate_bits
        ldrh            w9, [x10, x9, lsl #1]
        neg             v30.4s, v30.4s   // -(6-intermediate_bits)
.ifc \type, put
        dup             v29.4s, w13      // 6 + intermediate_bits
.else
        movi            v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
        sub             x10, x10, w9, uxtw
.ifc \type, put
        neg             v29.4s, v29.4s   // -(6+intermediate_bits)
.endif
        br              x10

20:
.ifc \type, put
        add             \xmx, \xmx, #2
        ld1             {v0.s}[0], [\xmx]
        b.gt            280f
        add             \xmy, \xmy, #2
        ld1             {v1.s}[0], [\xmy]

        // 2x2, 2x4 hv
        sub             \sr2, \src, #2
        sub             \src, \sr2, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b
        sxtl            v1.8h, v1.8b
        mov             x15, x30

        ld1             {v27.8h}, [\src], \s_strd
        ext             v28.16b, v27.16b, v27.16b, #2
        smull           v27.4s, v27.4h, v0.4h
        smull           v28.4s, v28.4h, v0.4h
        addp            v27.4s, v27.4s, v28.4s
        addp            v16.4s, v27.4s, v27.4s
        srshl           v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
        bl              L(\type\()_8tap_filter_2)
        // The intermediates from the horizontal pass fit in 16 bit without
        // any bias; we could just as well keep them as .4s, but narrowing
        // them to .4h gives a significant speedup on out of order cores
        // (at the cost of a smaller slowdown on in-order cores such as A53).
        xtn             v16.4h, v16.4s

        trn1            v16.2s, v16.2s, v24.2s
        mov             v17.8b, v24.8b

2:
        bl              L(\type\()_8tap_filter_2)

        ext             v18.8b, v17.8b, v24.8b, #4
        smull           v2.4s, v16.4h, v1.h[0]
        smlal           v2.4s, v17.4h, v1.h[1]
        smlal           v2.4s, v18.4h, v1.h[2]
        smlal           v2.4s, v24.4h, v1.h[3]

        srshl           v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
        sqxtun          v2.4h, v2.4s
        umin            v2.4h, v2.4h, v31.4h
        subs            \h, \h, #2
        st1             {v2.s}[0], [\dst], \d_strd
        st1             {v2.s}[1], [\ds2], \d_strd
        b.le            0f
        mov             v16.8b, v18.8b
        mov             v17.8b, v24.8b
        b               2b

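        // 2x8 and taller use the full 8-tap vertical filter, so six
        // registers of row history (v16-v21) are carried across loop
        // iterations, with L(\type\()_8tap_filter_2) delivering two new
        // rows in v24 on each call.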
280:    // 2x8, 2x16, 2x32 hv
        ld1             {v1.8b}, [\xmy]
        sub             \src, \src, #2
        sub             \sr2, \src, \s_strd, lsl #1
        sub             \src, \sr2, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b
        sxtl            v1.8h, v1.8b
        mov             x15, x30

        ld1             {v27.8h}, [\src], \s_strd
        ext             v28.16b, v27.16b, v27.16b, #2
        smull           v27.4s, v27.4h, v0.4h
        smull           v28.4s, v28.4h, v0.4h
        addp            v27.4s, v27.4s, v28.4s
        addp            v16.4s, v27.4s, v27.4s
        srshl           v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
        // The intermediates from the horizontal pass fit in 16 bit without
        // any bias; we could just as well keep them as .4s, but narrowing
        // them to .4h gives a significant speedup on out of order cores
        // (at the cost of a smaller slowdown on in-order cores such as A53).

        bl              L(\type\()_8tap_filter_2)
        xtn             v16.4h, v16.4s
        trn1            v16.2s, v16.2s, v24.2s
        mov             v17.8b, v24.8b
        bl              L(\type\()_8tap_filter_2)
        ext             v18.8b, v17.8b, v24.8b, #4
        mov             v19.8b, v24.8b
        bl              L(\type\()_8tap_filter_2)
        ext             v20.8b, v19.8b, v24.8b, #4
        mov             v21.8b, v24.8b

28:
        bl              L(\type\()_8tap_filter_2)
        ext             v22.8b, v21.8b, v24.8b, #4
        smull           v3.4s, v16.4h, v1.h[0]
        smlal           v3.4s, v17.4h, v1.h[1]
        smlal           v3.4s, v18.4h, v1.h[2]
        smlal           v3.4s, v19.4h, v1.h[3]
        smlal           v3.4s, v20.4h, v1.h[4]
        smlal           v3.4s, v21.4h, v1.h[5]
        smlal           v3.4s, v22.4h, v1.h[6]
        smlal           v3.4s, v24.4h, v1.h[7]

        srshl           v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
        sqxtun          v3.4h, v3.4s
        umin            v3.4h, v3.4h, v31.4h
        subs            \h, \h, #2
        st1             {v3.s}[0], [\dst], \d_strd
        st1             {v3.s}[1], [\ds2], \d_strd
        b.le            0f
        mov             v16.8b, v18.8b
        mov             v17.8b, v19.8b
        mov             v18.8b, v20.8b
        mov             v19.8b, v21.8b
        mov             v20.8b, v22.8b
        mov             v21.8b, v24.8b
        b               28b

0:
        br              x15

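        // Horizontally filter two new 2-pixel rows with the 4-tap filter
        // in v0, returning both rows rounded, narrowed and packed together
        // in v24.4h.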
L(\type\()_8tap_filter_2):
        ld1             {v25.8h}, [\sr2], \s_strd
        ld1             {v27.8h}, [\src], \s_strd
        ext             v26.16b, v25.16b, v25.16b, #2
        ext             v28.16b, v27.16b, v27.16b, #2
        trn1            v24.2s, v25.2s, v27.2s
        trn2            v27.2s, v25.2s, v27.2s
        trn1            v25.2s, v26.2s, v28.2s
        trn2            v28.2s, v26.2s, v28.2s
        smull           v24.4s, v24.4h, v0.h[0]
        smlal           v24.4s, v25.4h, v0.h[1]
        smlal           v24.4s, v27.4h, v0.h[2]
        smlal           v24.4s, v28.4h, v0.h[3]
        srshl           v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
        xtn             v24.4h, v24.4s
        ret
.endif

40:
        add             \xmx, \xmx, #2
        ld1             {v0.s}[0], [\xmx]
        b.gt            480f
        add             \xmy, \xmy, #2
        ld1             {v1.s}[0], [\xmy]
        sub             \sr2, \src, #2
        sub             \src, \sr2, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b
        sxtl            v1.8h, v1.8b
        mov             x15, x30

        // 4x2, 4x4 hv
        ld1             {v25.8h}, [\src], \s_strd
        ext             v26.16b, v25.16b, v25.16b, #2
        ext             v27.16b, v25.16b, v25.16b, #4
        ext             v28.16b, v25.16b, v25.16b, #6
        smull           v25.4s, v25.4h, v0.h[0]
        smlal           v25.4s, v26.4h, v0.h[1]
        smlal           v25.4s, v27.4h, v0.h[2]
        smlal           v25.4s, v28.4h, v0.h[3]
        srshl           v16.4s, v25.4s, v30.4s // -(6-intermediate_bits)
        // The intermediates from the horizontal pass fit in 16 bit without
        // any bias; we could just as well keep them as .4s, but narrowing
        // them to .4h gives a significant speedup on out of order cores
        // (at the cost of a smaller slowdown on in-order cores such as A53).
        xtn             v16.4h, v16.4s

        bl              L(\type\()_8tap_filter_4)
        mov             v17.8b, v24.8b
        mov             v18.8b, v25.8b

4:
        bl              L(\type\()_8tap_filter_4)
        smull           v2.4s, v16.4h, v1.h[0]
        smlal           v2.4s, v17.4h, v1.h[1]
        smlal           v2.4s, v18.4h, v1.h[2]
        smlal           v2.4s, v24.4h, v1.h[3]
        smull           v3.4s, v17.4h, v1.h[0]
        smlal           v3.4s, v18.4h, v1.h[1]
        smlal           v3.4s, v24.4h, v1.h[2]
        smlal           v3.4s, v25.4h, v1.h[3]
.ifc \type, put
        srshl           v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
        srshl           v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
        sqxtun          v2.4h, v2.4s
        sqxtun2         v2.8h, v3.4s
        umin            v2.8h, v2.8h, v31.8h
.else
        rshrn           v2.4h, v2.4s, #6
        rshrn2          v2.8h, v3.4s, #6
        sub             v2.8h, v2.8h, v29.8h // PREP_BIAS
.endif
        subs            \h, \h, #2

        st1             {v2.d}[0], [\dst], \d_strd
        st1             {v2.d}[1], [\ds2], \d_strd
        b.le            0f
        mov             v16.8b, v18.8b
        mov             v17.8b, v24.8b
        mov             v18.8b, v25.8b
        b               4b

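        // 4x8 and taller use the 8-tap vertical filter: the first row is
        // filtered inline into v16, and three calls to
        // L(\type\()_8tap_filter_4) fill v17-v22 with the remaining six
        // rows of history before the main loop starts.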
480:    // 4x8, 4x16, 4x32 hv
        ld1             {v1.8b}, [\xmy]
        sub             \src, \src, #2
        sub             \sr2, \src, \s_strd, lsl #1
        sub             \src, \sr2, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b
        sxtl            v1.8h, v1.8b
        mov             x15, x30

        ld1             {v25.8h}, [\src], \s_strd
        ext             v26.16b, v25.16b, v25.16b, #2
        ext             v27.16b, v25.16b, v25.16b, #4
        ext             v28.16b, v25.16b, v25.16b, #6
        smull           v25.4s, v25.4h, v0.h[0]
        smlal           v25.4s, v26.4h, v0.h[1]
        smlal           v25.4s, v27.4h, v0.h[2]
        smlal           v25.4s, v28.4h, v0.h[3]
        srshl           v16.4s, v25.4s, v30.4s // -(6-intermediate_bits)
        // The intermediates from the horizontal pass fit in 16 bit without
        // any bias; we could just as well keep them as .4s, but narrowing
        // them to .4h gives a significant speedup on out of order cores
        // (at the cost of a smaller slowdown on in-order cores such as A53).
        xtn             v16.4h, v16.4s

        bl              L(\type\()_8tap_filter_4)
        mov             v17.8b, v24.8b
        mov             v18.8b, v25.8b
        bl              L(\type\()_8tap_filter_4)
        mov             v19.8b, v24.8b
        mov             v20.8b, v25.8b
        bl              L(\type\()_8tap_filter_4)
        mov             v21.8b, v24.8b
        mov             v22.8b, v25.8b

48:
        bl              L(\type\()_8tap_filter_4)
        smull           v3.4s, v16.4h, v1.h[0]
        smlal           v3.4s, v17.4h, v1.h[1]
        smlal           v3.4s, v18.4h, v1.h[2]
        smlal           v3.4s, v19.4h, v1.h[3]
        smlal           v3.4s, v20.4h, v1.h[4]
        smlal           v3.4s, v21.4h, v1.h[5]
        smlal           v3.4s, v22.4h, v1.h[6]
        smlal           v3.4s, v24.4h, v1.h[7]
        smull           v4.4s, v17.4h, v1.h[0]
        smlal           v4.4s, v18.4h, v1.h[1]
        smlal           v4.4s, v19.4h, v1.h[2]
        smlal           v4.4s, v20.4h, v1.h[3]
        smlal           v4.4s, v21.4h, v1.h[4]
        smlal           v4.4s, v22.4h, v1.h[5]
        smlal           v4.4s, v24.4h, v1.h[6]
        smlal           v4.4s, v25.4h, v1.h[7]
.ifc \type, put
        srshl           v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
        srshl           v4.4s, v4.4s, v29.4s // -(6+intermediate_bits)
        sqxtun          v3.4h, v3.4s
        sqxtun2         v3.8h, v4.4s
        umin            v3.8h, v3.8h, v31.8h
.else
        rshrn           v3.4h, v3.4s, #6
        rshrn2          v3.8h, v4.4s, #6
        sub             v3.8h, v3.8h, v29.8h // PREP_BIAS
.endif
        subs            \h, \h, #2
        st1             {v3.d}[0], [\dst], \d_strd
        st1             {v3.d}[1], [\ds2], \d_strd
        b.le            0f
        mov             v16.8b, v18.8b
        mov             v17.8b, v19.8b
        mov             v18.8b, v20.8b
        mov             v19.8b, v21.8b
        mov             v20.8b, v22.8b
        mov             v21.8b, v24.8b
        mov             v22.8b, v25.8b
        b               48b
0:
        br              x15

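        // Horizontally filter two new 4-pixel rows with the 4-tap filter
        // in v0, returning them rounded and narrowed to 16 bit in v24.4h
        // and v25.4h.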
L(\type\()_8tap_filter_4):
        ld1             {v24.8h}, [\sr2], \s_strd
        ld1             {v25.8h}, [\src], \s_strd
        ext             v26.16b, v24.16b, v24.16b, #2
        ext             v27.16b, v24.16b, v24.16b, #4
        ext             v28.16b, v24.16b, v24.16b, #6
        smull           v24.4s, v24.4h, v0.h[0]
        smlal           v24.4s, v26.4h, v0.h[1]
        smlal           v24.4s, v27.4h, v0.h[2]
        smlal           v24.4s, v28.4h, v0.h[3]
        ext             v26.16b, v25.16b, v25.16b, #2
        ext             v27.16b, v25.16b, v25.16b, #4
        ext             v28.16b, v25.16b, v25.16b, #6
        smull           v25.4s, v25.4h, v0.h[0]
        smlal           v25.4s, v26.4h, v0.h[1]
        smlal           v25.4s, v27.4h, v0.h[2]
        smlal           v25.4s, v28.4h, v0.h[3]
        srshl           v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
        srshl           v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
        xtn             v24.4h, v24.4s
        xtn             v25.4h, v25.4s
        ret

80:
160:
320:
        b.gt            880f
        add             \xmy, \xmy, #2
        ld1             {v0.8b}, [\xmx]
        ld1             {v1.s}[0], [\xmy]
        sub             \src, \src, #6
        sub             \src, \src, \s_strd
        sxtl            v0.8h, v0.8b
        sxtl            v1.8h, v1.8b
        mov             x15, x30
        mov             \my, \h

164:    // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1

        ld1             {v27.8h, v28.8h}, [\src], \s_strd
        smull           v24.4s, v27.4h, v0.h[0]
        smull2          v25.4s, v27.8h, v0.h[0]
.irpc i, 1234567
        ext             v26.16b, v27.16b, v28.16b, #(2*\i)
        smlal           v24.4s, v26.4h, v0.h[\i]
        smlal2          v25.4s, v26.8h, v0.h[\i]
.endr
        srshl           v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
        srshl           v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
        // The intermediates from the horizontal pass fit in 16 bit without
        // any bias; we could just as well keep them as .4s, but narrowing
        // them to .4h gives a significant speedup on out of order cores
        // (at the cost of a smaller slowdown on in-order cores such as A53),
        // and conserves register space (no need to clobber v8-v15).
        xtn             v16.4h, v24.4s
        xtn2            v16.8h, v25.4s

        bl              L(\type\()_8tap_filter_8)
        mov             v17.16b, v23.16b
        mov             v18.16b, v24.16b

8:
        smull           v2.4s, v16.4h, v1.h[0]
        smull2          v3.4s, v16.8h, v1.h[0]
        bl              L(\type\()_8tap_filter_8)
        smull           v4.4s, v17.4h, v1.h[0]
        smull2          v5.4s, v17.8h, v1.h[0]
        smlal           v2.4s, v17.4h, v1.h[1]
        smlal2          v3.4s, v17.8h, v1.h[1]
        smlal           v4.4s, v18.4h, v1.h[1]
        smlal2          v5.4s, v18.8h, v1.h[1]
        smlal           v2.4s, v18.4h, v1.h[2]
        smlal2          v3.4s, v18.8h, v1.h[2]
        smlal           v4.4s, v23.4h, v1.h[2]
        smlal2          v5.4s, v23.8h, v1.h[2]
        smlal           v2.4s, v23.4h, v1.h[3]
        smlal2          v3.4s, v23.8h, v1.h[3]
        smlal           v4.4s, v24.4h, v1.h[3]
        smlal2          v5.4s, v24.8h, v1.h[3]
.ifc \type, put
        srshl           v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
        srshl           v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
        srshl           v4.4s, v4.4s, v29.4s // -(6+intermediate_bits)
        srshl           v5.4s, v5.4s, v29.4s // -(6+intermediate_bits)
        sqxtun          v2.4h, v2.4s
        sqxtun2         v2.8h, v3.4s
        sqxtun          v3.4h, v4.4s
        sqxtun2         v3.8h, v5.4s
        umin            v2.8h, v2.8h, v31.8h
        umin            v3.8h, v3.8h, v31.8h
.else
        rshrn           v2.4h, v2.4s, #6
        rshrn2          v2.8h, v3.4s, #6
        rshrn           v3.4h, v4.4s, #6
        rshrn2          v3.8h, v5.4s, #6
        sub             v2.8h, v2.8h, v29.8h // PREP_BIAS
        sub             v3.8h, v3.8h, v29.8h // PREP_BIAS
.endif
        subs            \h, \h, #2
        st1             {v2.8h}, [\dst], \d_strd
        st1             {v3.8h}, [\ds2], \d_strd
        b.le            9f
        mov             v16.16b, v18.16b
        mov             v17.16b, v23.16b
        mov             v18.16b, v24.16b
        b               8b
9:
        subs            \w, \w, #8
        b.le            0f
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #2
        mov             \h, \my
        add             \src, \src, #16
        add             \dst, \dst, #16
        b               164b

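        // Wider blocks are filtered one 8-pixel column at a time; the 9:
        // epilogue rewinds src/dst to the top of the block with msub and
        // advances them 16 bytes (8 pixels) to the next column.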
880:    // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
640:
1280:
        ld1             {v0.8b}, [\xmx]
        ld1             {v1.8b}, [\xmy]
        sub             \src, \src, #6
        sub             \src, \src, \s_strd
        sub             \src, \src, \s_strd, lsl #1
        sxtl            v0.8h, v0.8b
        sxtl            v1.8h, v1.8b
        mov             x15, x30
        mov             \my, \h

168:
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1

        ld1             {v27.8h, v28.8h}, [\src], \s_strd
        smull           v24.4s, v27.4h, v0.h[0]
        smull2          v25.4s, v27.8h, v0.h[0]
.irpc i, 1234567
        ext             v26.16b, v27.16b, v28.16b, #(2*\i)
        smlal           v24.4s, v26.4h, v0.h[\i]
        smlal2          v25.4s, v26.8h, v0.h[\i]
.endr
        srshl           v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
        srshl           v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
        // The intermediates from the horizontal pass fit in 16 bit without
        // any bias; we could just as well keep them as .4s, but narrowing
        // them to .4h gives a significant speedup on out of order cores
        // (at the cost of a smaller slowdown on in-order cores such as A53),
        // and conserves register space (no need to clobber v8-v15).
        xtn             v16.4h, v24.4s
        xtn2            v16.8h, v25.4s

        bl              L(\type\()_8tap_filter_8)
        mov             v17.16b, v23.16b
        mov             v18.16b, v24.16b
        bl              L(\type\()_8tap_filter_8)
        mov             v19.16b, v23.16b
        mov             v20.16b, v24.16b
        bl              L(\type\()_8tap_filter_8)
        mov             v21.16b, v23.16b
        mov             v22.16b, v24.16b

88:
        smull           v2.4s, v16.4h, v1.h[0]
        smull2          v3.4s, v16.8h, v1.h[0]
        bl              L(\type\()_8tap_filter_8)
        smull           v4.4s, v17.4h, v1.h[0]
        smull2          v5.4s, v17.8h, v1.h[0]
        smlal           v2.4s, v17.4h, v1.h[1]
        smlal2          v3.4s, v17.8h, v1.h[1]
        smlal           v4.4s, v18.4h, v1.h[1]
        smlal2          v5.4s, v18.8h, v1.h[1]
        smlal           v2.4s, v18.4h, v1.h[2]
        smlal2          v3.4s, v18.8h, v1.h[2]
        smlal           v4.4s, v19.4h, v1.h[2]
        smlal2          v5.4s, v19.8h, v1.h[2]
        smlal           v2.4s, v19.4h, v1.h[3]
        smlal2          v3.4s, v19.8h, v1.h[3]
        smlal           v4.4s, v20.4h, v1.h[3]
        smlal2          v5.4s, v20.8h, v1.h[3]
        smlal           v2.4s, v20.4h, v1.h[4]
        smlal2          v3.4s, v20.8h, v1.h[4]
        smlal           v4.4s, v21.4h, v1.h[4]
        smlal2          v5.4s, v21.8h, v1.h[4]
        smlal           v2.4s, v21.4h, v1.h[5]
        smlal2          v3.4s, v21.8h, v1.h[5]
        smlal           v4.4s, v22.4h, v1.h[5]
        smlal2          v5.4s, v22.8h, v1.h[5]
        smlal           v2.4s, v22.4h, v1.h[6]
        smlal2          v3.4s, v22.8h, v1.h[6]
        smlal           v4.4s, v23.4h, v1.h[6]
        smlal2          v5.4s, v23.8h, v1.h[6]
        smlal           v2.4s, v23.4h, v1.h[7]
        smlal2          v3.4s, v23.8h, v1.h[7]
        smlal           v4.4s, v24.4h, v1.h[7]
        smlal2          v5.4s, v24.8h, v1.h[7]
.ifc \type, put
        srshl           v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
        srshl           v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
        srshl           v4.4s, v4.4s, v29.4s // -(6+intermediate_bits)
        srshl           v5.4s, v5.4s, v29.4s // -(6+intermediate_bits)
        sqxtun          v2.4h, v2.4s
        sqxtun2         v2.8h, v3.4s
        sqxtun          v3.4h, v4.4s
        sqxtun2         v3.8h, v5.4s
        umin            v2.8h, v2.8h, v31.8h
        umin            v3.8h, v3.8h, v31.8h
.else
        rshrn           v2.4h, v2.4s, #6
        rshrn2          v2.8h, v3.4s, #6
        rshrn           v3.4h, v4.4s, #6
        rshrn2          v3.8h, v5.4s, #6
        sub             v2.8h, v2.8h, v29.8h // PREP_BIAS
        sub             v3.8h, v3.8h, v29.8h // PREP_BIAS
.endif
        subs            \h, \h, #2
        st1             {v2.8h}, [\dst], \d_strd
        st1             {v3.8h}, [\ds2], \d_strd
        b.le            9f
        mov             v16.16b, v18.16b
        mov             v17.16b, v19.16b
        mov             v18.16b, v20.16b
        mov             v19.16b, v21.16b
        mov             v20.16b, v22.16b
        mov             v21.16b, v23.16b
        mov             v22.16b, v24.16b
        b               88b
9:
        subs            \w, \w, #8
        b.le            0f
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #3
        mov             \h, \my
        add             \src, \src, #16
        add             \dst, \dst, #16
        b               168b
0:
        br              x15

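        // Horizontally filter two new 8-pixel rows with the 8-tap filter
        // in v0 (the .irpc loop walks the seven shifted source windows),
        // returning them rounded and narrowed in v23.8h and v24.8h.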
L(\type\()_8tap_filter_8):
        ld1             {v4.8h, v5.8h}, [\sr2], \s_strd
        ld1             {v6.8h, v7.8h}, [\src], \s_strd
        smull           v25.4s, v4.4h, v0.h[0]
        smull2          v26.4s, v4.8h, v0.h[0]
        smull           v27.4s, v6.4h, v0.h[0]
        smull2          v28.4s, v6.8h, v0.h[0]
.irpc i, 1234567
        ext             v23.16b, v4.16b, v5.16b, #(2*\i)
        ext             v24.16b, v6.16b, v7.16b, #(2*\i)
        smlal           v25.4s, v23.4h, v0.h[\i]
        smlal2          v26.4s, v23.8h, v0.h[\i]
        smlal           v27.4s, v24.4h, v0.h[\i]
        smlal2          v28.4s, v24.8h, v0.h[\i]
.endr
        srshl           v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
        srshl           v26.4s, v26.4s, v30.4s // -(6-intermediate_bits)
        srshl           v27.4s, v27.4s, v30.4s // -(6-intermediate_bits)
        srshl           v28.4s, v28.4s, v30.4s // -(6-intermediate_bits)
        xtn             v23.4h, v25.4s
        xtn2            v23.8h, v26.4s
        xtn             v24.4h, v27.4s
        xtn2            v24.8h, v28.4s
        ret

L(\type\()_8tap_hv_tbl):
        .hword L(\type\()_8tap_hv_tbl) - 1280b
        .hword L(\type\()_8tap_hv_tbl) - 640b
        .hword L(\type\()_8tap_hv_tbl) - 320b
        .hword L(\type\()_8tap_hv_tbl) - 160b
        .hword L(\type\()_8tap_hv_tbl) - 80b
        .hword L(\type\()_8tap_hv_tbl) - 40b
        .hword L(\type\()_8tap_hv_tbl) - 20b
        .hword 0
endfunc


function \type\()_bilin_16bpc_neon, export=1
.ifc \bdmax, w8
        ldr             w8, [sp]
.endif
        dup             v1.8h, \mx
        dup             v3.8h, \my
        mov             w10, #16
        sub             w9, w10, \mx
        sub             w10, w10, \my
        dup             v0.8h, w9
        dup             v2.8h, w10
.ifc \type, prep
        uxtw            \d_strd, \w
        lsl             \d_strd, \d_strd, #1
.endif

        clz             \bdmax, \bdmax   // bitdepth_max
        clz             w9, \w
        sub             \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18
        mov             w11, #4
        sub             w9, w9, #24
        sub             w11, w11, \bdmax // 4 - intermediate_bits
        add             w12, \bdmax, #4  // 4 + intermediate_bits
        cbnz            \mx, L(\type\()_bilin_h)
        cbnz            \my, L(\type\()_bilin_v)
        b               \type\()_neon

L(\type\()_bilin_h):
        cbnz            \my, L(\type\()_bilin_hv)

        adr             x10, L(\type\()_bilin_h_tbl)
        dup             v31.8h, w11      // 4 - intermediate_bits
        ldrh            w9, [x10, x9, lsl #1]
        neg             v31.8h, v31.8h   // -(4-intermediate_bits)
.ifc \type, put
        dup             v30.8h, \bdmax   // intermediate_bits
.else
        movi            v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
        sub             x10, x10, w9, uxtw
.ifc \type, put
        neg             v30.8h, v30.8h   // -intermediate_bits
.endif
        br              x10

20:     // 2xN h
.ifc \type, put
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1
2:
        ld1             {v4.4h}, [\src], \s_strd
        ld1             {v6.4h}, [\sr2], \s_strd
        ext             v5.8b, v4.8b, v4.8b, #2
        ext             v7.8b, v6.8b, v6.8b, #2
        trn1            v4.2s, v4.2s, v6.2s
        trn1            v5.2s, v5.2s, v7.2s
        subs            \h, \h, #2
        mul             v4.4h, v4.4h, v0.4h
        mla             v4.4h, v5.4h, v1.4h
        urshl           v4.4h, v4.4h, v31.4h
        urshl           v4.4h, v4.4h, v30.4h
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
        b.gt            2b
        ret
.endif

40:     // 4xN h
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1
4:
        ld1             {v4.8h}, [\src], \s_strd
        ld1             {v6.8h}, [\sr2], \s_strd
        ext             v5.16b, v4.16b, v4.16b, #2
        ext             v7.16b, v6.16b, v6.16b, #2
        trn1            v4.2d, v4.2d, v6.2d
        trn1            v5.2d, v5.2d, v7.2d
        subs            \h, \h, #2
        mul             v4.8h, v4.8h, v0.8h
        mla             v4.8h, v5.8h, v1.8h
        urshl           v4.8h, v4.8h, v31.8h
.ifc \type, put
        urshl           v4.8h, v4.8h, v30.8h
.else
        sub             v4.8h, v4.8h, v29.8h
.endif
        st1             {v4.d}[0], [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
        b.gt            4b
        ret

80:     // 8xN h
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1
8:
        ldr             h5, [\src, #16]
        ldr             h7, [\sr2, #16]
        ld1             {v4.8h}, [\src], \s_strd
        ld1             {v6.8h}, [\sr2], \s_strd
        ext             v5.16b, v4.16b, v5.16b, #2
        ext             v7.16b, v6.16b, v7.16b, #2
        subs            \h, \h, #2
        mul             v4.8h, v4.8h, v0.8h
        mla             v4.8h, v5.8h, v1.8h
        mul             v6.8h, v6.8h, v0.8h
        mla             v6.8h, v7.8h, v1.8h
        urshl           v4.8h, v4.8h, v31.8h
        urshl           v6.8h, v6.8h, v31.8h
.ifc \type, put
        urshl           v4.8h, v4.8h, v30.8h
        urshl           v6.8h, v6.8h, v30.8h
.else
        sub             v4.8h, v4.8h, v29.8h
        sub             v6.8h, v6.8h, v29.8h
.endif
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v6.8h}, [\ds2], \d_strd
        b.gt            8b
        ret
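
        // 16 pixels and wider: the strides are reduced by the row width so
        // the loads can run with plain post-increment; the first vector of
        // each row is loaded separately, and each iteration combines it
        // with the next 32 bytes to extract the one-pixel-shifted taps
        // with ext.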
160:
320:
640:
1280:   // 16xN, 32xN, ... h
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1

        sub             \s_strd, \s_strd, \w, uxtw #1
        sub             \s_strd, \s_strd, #16
.ifc \type, put
        lsl             \d_strd, \d_strd, #1
        sub             \d_strd, \d_strd, \w, uxtw #1
.endif
161:
        ld1             {v16.8h}, [\src], #16
        ld1             {v21.8h}, [\sr2], #16
        mov             \mx, \w

16:
        ld1             {v17.8h, v18.8h}, [\src], #32
        ld1             {v22.8h, v23.8h}, [\sr2], #32
        ext             v19.16b, v16.16b, v17.16b, #2
        ext             v20.16b, v17.16b, v18.16b, #2
        ext             v24.16b, v21.16b, v22.16b, #2
        ext             v25.16b, v22.16b, v23.16b, #2
        mul             v16.8h, v16.8h, v0.8h
        mla             v16.8h, v19.8h, v1.8h
        mul             v17.8h, v17.8h, v0.8h
        mla             v17.8h, v20.8h, v1.8h
        mul             v21.8h, v21.8h, v0.8h
        mla             v21.8h, v24.8h, v1.8h
        mul             v22.8h, v22.8h, v0.8h
        mla             v22.8h, v25.8h, v1.8h
        urshl           v16.8h, v16.8h, v31.8h
        urshl           v17.8h, v17.8h, v31.8h
        urshl           v21.8h, v21.8h, v31.8h
        urshl           v22.8h, v22.8h, v31.8h
        subs            \mx, \mx, #16
.ifc \type, put
        urshl           v16.8h, v16.8h, v30.8h
        urshl           v17.8h, v17.8h, v30.8h
        urshl           v21.8h, v21.8h, v30.8h
        urshl           v22.8h, v22.8h, v30.8h
.else
        sub             v16.8h, v16.8h, v29.8h
        sub             v17.8h, v17.8h, v29.8h
        sub             v21.8h, v21.8h, v29.8h
        sub             v22.8h, v22.8h, v29.8h
.endif
        st1             {v16.8h, v17.8h}, [\dst], #32
        st1             {v21.8h, v22.8h}, [\ds2], #32
        b.le            9f

        mov             v16.16b, v18.16b
        mov             v21.16b, v23.16b
        b               16b

9:
        add             \dst, \dst, \d_strd
        add             \ds2, \ds2, \d_strd
        add             \src, \src, \s_strd
        add             \sr2, \sr2, \s_strd

        subs            \h, \h, #2
        b.gt            161b
        ret

L(\type\()_bilin_h_tbl):
        .hword L(\type\()_bilin_h_tbl) - 1280b
        .hword L(\type\()_bilin_h_tbl) - 640b
        .hword L(\type\()_bilin_h_tbl) - 320b
        .hword L(\type\()_bilin_h_tbl) - 160b
        .hword L(\type\()_bilin_h_tbl) - 80b
        .hword L(\type\()_bilin_h_tbl) - 40b
        .hword L(\type\()_bilin_h_tbl) - 20b
        .hword 0

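        // Vertical-only bilinear filtering: each output row is the
        // weighted average (16-my)*row[n] + my*row[n+1], rounded by 4 bits
        // for put, or shifted to the prep range and biased for prep.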
L(\type\()_bilin_v):
        cmp             \h, #4
        adr             x10, L(\type\()_bilin_v_tbl)
.ifc \type, prep
        dup             v31.8h, w11      // 4 - intermediate_bits
.endif
        ldrh            w9, [x10, x9, lsl #1]
.ifc \type, prep
        movi            v29.8h, #(PREP_BIAS >> 8), lsl #8
        neg             v31.8h, v31.8h   // -(4-intermediate_bits)
.endif
        sub             x10, x10, w9, uxtw
        br              x10

20:     // 2xN v
.ifc \type, put
        cmp             \h, #2
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        // 2x2 v
        ld1             {v16.s}[0], [\src], \s_strd
        b.gt            24f
        ld1             {v17.s}[0], [\sr2], \s_strd
        ld1             {v18.s}[0], [\src], \s_strd
        trn1            v16.2s, v16.2s, v17.2s
        trn1            v17.2s, v17.2s, v18.2s
        mul             v4.4h, v16.4h, v2.4h
        mla             v4.4h, v17.4h, v3.4h
        urshr           v4.8h, v4.8h, #4
        st1             {v4.s}[0], [\dst]
        st1             {v4.s}[1], [\ds2]
        ret
24:     // 2x4, 2x8, ... v
        ld1             {v17.s}[0], [\sr2], \s_strd
        ld1             {v18.s}[0], [\src], \s_strd
        ld1             {v19.s}[0], [\sr2], \s_strd
        ld1             {v20.s}[0], [\src], \s_strd
        trn1            v16.2s, v16.2s, v17.2s
        trn1            v17.2s, v17.2s, v18.2s
        trn1            v18.2s, v18.2s, v19.2s
        trn1            v19.2s, v19.2s, v20.2s
        trn1            v16.2d, v16.2d, v18.2d
        trn1            v17.2d, v17.2d, v19.2d
        mul             v4.8h, v16.8h, v2.8h
        mla             v4.8h, v17.8h, v3.8h
        subs            \h, \h, #4
        urshr           v4.8h, v4.8h, #4
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
        st1             {v4.s}[2], [\dst], \d_strd
        st1             {v4.s}[3], [\ds2], \d_strd
        b.le            0f
        mov             v16.8b, v20.8b
        b               24b
0:
        ret
.endif

40:     // 4xN v
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        ld1             {v16.4h}, [\src], \s_strd
4:
        ld1             {v17.4h}, [\sr2], \s_strd
        ld1             {v18.4h}, [\src], \s_strd
        trn1            v16.2d, v16.2d, v17.2d
        trn1            v17.2d, v17.2d, v18.2d
        mul             v4.8h, v16.8h, v2.8h
        mla             v4.8h, v17.8h, v3.8h
        subs            \h, \h, #2
.ifc \type, put
        urshr           v4.8h, v4.8h, #4
.else
        urshl           v4.8h, v4.8h, v31.8h
        sub             v4.8h, v4.8h, v29.8h
.endif
        st1             {v4.d}[0], [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
        b.le            0f
        mov             v16.8b, v18.8b
        b               4b
0:
        ret

80:     // 8xN v
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        ld1             {v16.8h}, [\src], \s_strd
8:
        ld1             {v17.8h}, [\sr2], \s_strd
        ld1             {v18.8h}, [\src], \s_strd
        mul             v4.8h, v16.8h, v2.8h
        mla             v4.8h, v17.8h, v3.8h
        mul             v5.8h, v17.8h, v2.8h
        mla             v5.8h, v18.8h, v3.8h
        subs            \h, \h, #2
.ifc \type, put
        urshr           v4.8h, v4.8h, #4
        urshr           v5.8h, v5.8h, #4
.else
        urshl           v4.8h, v4.8h, v31.8h
        urshl           v5.8h, v5.8h, v31.8h
        sub             v4.8h, v4.8h, v29.8h
        sub             v5.8h, v5.8h, v29.8h
.endif
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v5.8h}, [\ds2], \d_strd
        b.le            0f
        mov             v16.16b, v18.16b
        b               8b
0:
        ret

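        // 16 pixels and wider: two rows of two 8h vectors per iteration;
        // when one 16-pixel column is done, the 9: epilogue below rewinds
        // src/dst with msub and advances them 32 bytes (16 pixels) to the
        // next column.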
160:    // 16xN, 32xN, ...
320:
640:
1280:
        mov             \my, \h
1:
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ld1             {v16.8h, v17.8h}, [\src], \s_strd
2:
        ld1             {v18.8h, v19.8h}, [\sr2], \s_strd
        ld1             {v20.8h, v21.8h}, [\src], \s_strd
        mul             v4.8h, v16.8h, v2.8h
        mla             v4.8h, v18.8h, v3.8h
        mul             v5.8h, v17.8h, v2.8h
        mla             v5.8h, v19.8h, v3.8h
        mul             v6.8h, v18.8h, v2.8h
        mla             v6.8h, v20.8h, v3.8h
        mul             v7.8h, v19.8h, v2.8h
        mla             v7.8h, v21.8h, v3.8h
        subs            \h, \h, #2
.ifc \type, put
        urshr           v4.8h, v4.8h, #4
        urshr           v5.8h, v5.8h, #4
        urshr           v6.8h, v6.8h, #4
        urshr           v7.8h, v7.8h, #4
.else
        urshl           v4.8h, v4.8h, v31.8h
        urshl           v5.8h, v5.8h, v31.8h
        urshl           v6.8h, v6.8h, v31.8h
        urshl           v7.8h, v7.8h, v31.8h
        sub             v4.8h, v4.8h, v29.8h
        sub             v5.8h, v5.8h, v29.8h
        sub             v6.8h, v6.8h, v29.8h
        sub             v7.8h, v7.8h, v29.8h
.endif
        st1             {v4.8h, v5.8h}, [\dst], \d_strd
        st1             {v6.8h, v7.8h}, [\ds2], \d_strd
        b.le            9f
        mov             v16.16b, v20.16b
        mov             v17.16b, v21.16b
        b               2b
9:
        subs            \w, \w, #16
        b.le            0f
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #1
        mov             \h, \my
        add             \src, \src, #32
        add             \dst, \dst, #32
        b               1b
0:
        ret

L(\type\()_bilin_v_tbl):
        .hword L(\type\()_bilin_v_tbl) - 1280b
        .hword L(\type\()_bilin_v_tbl) - 640b
        .hword L(\type\()_bilin_v_tbl) - 320b
        .hword L(\type\()_bilin_v_tbl) - 160b
        .hword L(\type\()_bilin_v_tbl) - 80b
        .hword L(\type\()_bilin_v_tbl) - 40b
        .hword L(\type\()_bilin_v_tbl) - 20b
        .hword 0

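        // Combined hv bilinear filtering: the horizontally filtered bottom
        // row of one iteration is carried over as the top row of the next,
        // so only the new rows have to be filtered horizontally inside the
        // loop.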
L(\type\()_bilin_hv):
        adr             x10, L(\type\()_bilin_hv_tbl)
        dup             v31.8h, w11      // 4 - intermediate_bits
        ldrh            w9, [x10, x9, lsl #1]
        neg             v31.8h, v31.8h   // -(4-intermediate_bits)
.ifc \type, put
        dup             v30.4s, w12      // 4 + intermediate_bits
.else
        movi            v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
        sub             x10, x10, w9, uxtw
.ifc \type, put
        neg             v30.4s, v30.4s   // -(4+intermediate_bits)
.endif
        br              x10

20:     // 2xN hv
.ifc \type, put
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ld1             {v20.4h}, [\src], \s_strd
        ext             v21.8b, v20.8b, v20.8b, #2
        mul             v16.4h, v20.4h, v0.4h
        mla             v16.4h, v21.4h, v1.4h
        urshl           v16.4h, v16.4h, v31.4h

2:
        ld1             {v22.4h}, [\sr2], \s_strd
        ld1             {v24.4h}, [\src], \s_strd
        ext             v23.8b, v22.8b, v22.8b, #2
        ext             v25.8b, v24.8b, v24.8b, #2
        trn1            v22.2s, v22.2s, v24.2s
        trn1            v23.2s, v23.2s, v25.2s
        mul             v17.4h, v22.4h, v0.4h
        mla             v17.4h, v23.4h, v1.4h
        urshl           v17.4h, v17.4h, v31.4h

        trn1            v16.2s, v16.2s, v17.2s

        umull           v4.4s, v16.4h, v2.4h
        umlal           v4.4s, v17.4h, v3.4h
        urshl           v4.4s, v4.4s, v30.4s
        xtn             v4.4h, v4.4s
        subs            \h, \h, #2
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
        b.le            0f
        trn2            v16.2s, v17.2s, v17.2s
        b               2b
0:
        ret
.endif

40:     // 4xN hv
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ld1             {v20.8h}, [\src], \s_strd
        ext             v21.16b, v20.16b, v20.16b, #2
        mul             v16.4h, v20.4h, v0.4h
        mla             v16.4h, v21.4h, v1.4h
        urshl           v16.4h, v16.4h, v31.4h

4:
        ld1             {v22.8h}, [\sr2], \s_strd
        ld1             {v24.8h}, [\src], \s_strd
        ext             v23.16b, v22.16b, v22.16b, #2
        ext             v25.16b, v24.16b, v24.16b, #2
        trn1            v22.2d, v22.2d, v24.2d
        trn1            v23.2d, v23.2d, v25.2d
        mul             v17.8h, v22.8h, v0.8h
        mla             v17.8h, v23.8h, v1.8h
        urshl           v17.8h, v17.8h, v31.8h

        trn1            v16.2d, v16.2d, v17.2d

        umull           v4.4s, v16.4h, v2.4h
        umlal           v4.4s, v17.4h, v3.4h
        umull2          v5.4s, v16.8h, v2.8h
        umlal2          v5.4s, v17.8h, v3.8h
.ifc \type, put
        urshl           v4.4s, v4.4s, v30.4s
        urshl           v5.4s, v5.4s, v30.4s
        xtn             v4.4h, v4.4s
        xtn2            v4.8h, v5.4s
.else
        rshrn           v4.4h, v4.4s, #4
        rshrn2          v4.8h, v5.4s, #4
        sub             v4.8h, v4.8h, v29.8h
.endif
        subs            \h, \h, #2
        st1             {v4.d}[0], [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
        b.le            0f
        trn2            v16.2d, v17.2d, v17.2d
        b               4b
0:
        ret

80:     // 8xN, 16xN, ... hv
160:
320:
640:
1280:
        mov             \my, \h

1:
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ldr             h21, [\src, #16]
        ld1             {v20.8h}, [\src], \s_strd
        ext             v21.16b, v20.16b, v21.16b, #2
        mul             v16.8h, v20.8h, v0.8h
        mla             v16.8h, v21.8h, v1.8h
        urshl           v16.8h, v16.8h, v31.8h

2:
        ldr             h23, [\sr2, #16]
        ld1             {v22.8h}, [\sr2], \s_strd
        ldr             h25, [\src, #16]
        ld1             {v24.8h}, [\src], \s_strd
        ext             v23.16b, v22.16b, v23.16b, #2
        ext             v25.16b, v24.16b, v25.16b, #2
        mul             v17.8h, v22.8h, v0.8h
        mla             v17.8h, v23.8h, v1.8h
        mul             v18.8h, v24.8h, v0.8h
        mla             v18.8h, v25.8h, v1.8h
        urshl           v17.8h, v17.8h, v31.8h
        urshl           v18.8h, v18.8h, v31.8h

        umull           v4.4s, v16.4h, v2.4h
        umlal           v4.4s, v17.4h, v3.4h
        umull2          v5.4s, v16.8h, v2.8h
        umlal2          v5.4s, v17.8h, v3.8h
        umull           v6.4s, v17.4h, v2.4h
        umlal           v6.4s, v18.4h, v3.4h
        umull2          v7.4s, v17.8h, v2.8h
        umlal2          v7.4s, v18.8h, v3.8h
.ifc \type, put
        urshl           v4.4s, v4.4s, v30.4s
        urshl           v5.4s, v5.4s, v30.4s
        urshl           v6.4s, v6.4s, v30.4s
        urshl           v7.4s, v7.4s, v30.4s
        xtn             v4.4h, v4.4s
        xtn2            v4.8h, v5.4s
        xtn             v5.4h, v6.4s
        xtn2            v5.8h, v7.4s
.else
        rshrn           v4.4h, v4.4s, #4
        rshrn2          v4.8h, v5.4s, #4
        rshrn           v5.4h, v6.4s, #4
        rshrn2          v5.8h, v7.4s, #4
        sub             v4.8h, v4.8h, v29.8h
        sub             v5.8h, v5.8h, v29.8h
.endif
        subs            \h, \h, #2
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v5.8h}, [\ds2], \d_strd
        b.le            9f
        mov             v16.16b, v18.16b
        b               2b
9:
        subs            \w, \w, #8
        b.le            0f
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #1
        mov             \h, \my
        add             \src, \src, #16
        add             \dst, \dst, #16
        b               1b
0:
        ret

L(\type\()_bilin_hv_tbl):
        .hword L(\type\()_bilin_hv_tbl) - 1280b
        .hword L(\type\()_bilin_hv_tbl) - 640b
        .hword L(\type\()_bilin_hv_tbl) - 320b
        .hword L(\type\()_bilin_hv_tbl) - 160b
        .hword L(\type\()_bilin_hv_tbl) - 80b
        .hword L(\type\()_bilin_hv_tbl) - 40b
        .hword L(\type\()_bilin_hv_tbl) - 20b
        .hword 0
endfunc
.endm

filter_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10

.macro load_filter_row dst, src, inc
        asr             w13, \src, #10
        ldr             \dst, [x11, w13, sxtw #3]
        add             \src, \src, \inc
.endm

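// Horizontally filter one row of 8 output pixels for warp: every output
// pixel gets its own 8-tap filter, selected by load_filter_row from the
// accumulated horizontal offset (offset >> 10 indexes mc_warp_filter), and
// the eight dot products are reduced with pairwise addp into v16/v17.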
function warp_filter_horz_neon
        add             w12, w5, #512

        ld1             {v16.8h, v17.8h}, [x2], x3

        load_filter_row d0, w12, w7
        load_filter_row d1, w12, w7
        load_filter_row d2, w12, w7
        sxtl            v0.8h, v0.8b
        load_filter_row d3, w12, w7
        sxtl            v1.8h, v1.8b
        load_filter_row d4, w12, w7
        sxtl            v2.8h, v2.8b
        load_filter_row d5, w12, w7
        sxtl            v3.8h, v3.8b
        load_filter_row d6, w12, w7
        sxtl            v4.8h, v4.8b
        load_filter_row d7, w12, w7
        sxtl            v5.8h, v5.8b
        ext             v18.16b, v16.16b, v17.16b, #2*1
        smull           v8.4s, v16.4h, v0.4h
        smull2          v9.4s, v16.8h, v0.8h
        sxtl            v6.8h, v6.8b
        ext             v19.16b, v16.16b, v17.16b, #2*2
        smull           v10.4s, v18.4h, v1.4h
        smull2          v11.4s, v18.8h, v1.8h
        sxtl            v7.8h, v7.8b
        ext             v20.16b, v16.16b, v17.16b, #2*3
        smull           v0.4s, v19.4h, v2.4h
        smull2          v1.4s, v19.8h, v2.8h
        ext             v21.16b, v16.16b, v17.16b, #2*4
        addp            v8.4s, v8.4s, v9.4s
        smull           v2.4s, v20.4h, v3.4h
        smull2          v3.4s, v20.8h, v3.8h
        ext             v22.16b, v16.16b, v17.16b, #2*5
        addp            v9.4s, v10.4s, v11.4s
        smull           v10.4s, v21.4h, v4.4h
        smull2          v11.4s, v21.8h, v4.8h
        ext             v23.16b, v16.16b, v17.16b, #2*6
        addp            v0.4s, v0.4s, v1.4s
        smull           v18.4s, v22.4h, v5.4h
        smull2          v19.4s, v22.8h, v5.8h
        ext             v16.16b, v16.16b, v17.16b, #2*7
        addp            v1.4s, v2.4s, v3.4s
        addp            v2.4s, v10.4s, v11.4s
        smull           v20.4s, v23.4h, v6.4h
        smull2          v21.4s, v23.8h, v6.8h
        addp            v3.4s, v18.4s, v19.4s
        smull           v22.4s, v16.4h, v7.4h
        smull2          v23.4s, v16.8h, v7.8h
        addp            v4.4s, v20.4s, v21.4s
        addp            v5.4s, v22.4s, v23.4s

        addp            v8.4s, v8.4s, v9.4s
        addp            v0.4s, v0.4s, v1.4s
        addp            v2.4s, v2.4s, v3.4s
        addp            v4.4s, v4.4s, v5.4s

        addp            v16.4s, v8.4s, v0.4s
        addp            v17.4s, v2.4s, v4.4s

        add             w5, w5, w8

        srshl           v16.4s, v16.4s, v14.4s // -(7 - intermediate_bits)
        srshl           v17.4s, v17.4s, v14.4s // -(7 - intermediate_bits)

        ret
endfunc

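// The 8x8 warp output needs 15 input rows (8 rows plus 8 filter taps minus
// one); the first seven are filtered horizontally into v24-v30 up front,
// and the main loop then produces one output row per new input row.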
// void dav1d_warp_affine_8x8_16bpc_neon(
//         pixel *dst, const ptrdiff_t dst_stride,
//         const pixel *src, const ptrdiff_t src_stride,
//         const int16_t *const abcd, int mx, int my,
//         const int bitdepth_max)
.macro warp t
function warp_affine_8x8\t\()_16bpc_neon, export=1
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

.ifb \t
        dup             v15.8h, w7       // bitdepth_max
.else
        movi            v15.8h, #(PREP_BIAS >> 8), lsl #8
.endif
        clz             w7, w7
        // intermediate_bits = clz(bitdepth_max) - 18
.ifb \t
        sub             w8, w7, #11      // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7
.endif
        sub             w7, w7, #25      // -(7 - intermediate_bits)
.ifb \t
        neg             w8, w8           // -(7 + intermediate_bits)
.endif
        dup             v14.4s, w7       // -(7 - intermediate_bits)
.ifb \t
        dup             v13.4s, w8       // -(7 + intermediate_bits)
.endif

        ldr             x4, [x4]
        sbfx            x7, x4, #0, #16
        sbfx            x8, x4, #16, #16
        sbfx            x9, x4, #32, #16
        sbfx            x4, x4, #48, #16
        mov             w10, #8
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3
        sub             x2, x2, #6
        movrel          x11, X(mc_warp_filter), 64*8
        mov             x15, x30
.ifnb \t
        lsl             x1, x1, #1
.endif

        bl              warp_filter_horz_neon
        xtn             v24.4h, v16.4s
        xtn2            v24.8h, v17.4s
        bl              warp_filter_horz_neon
        xtn             v25.4h, v16.4s
        xtn2            v25.8h, v17.4s
        bl              warp_filter_horz_neon
        xtn             v26.4h, v16.4s
        xtn2            v26.8h, v17.4s
        bl              warp_filter_horz_neon
        xtn             v27.4h, v16.4s
        xtn2            v27.8h, v17.4s
        bl              warp_filter_horz_neon
        xtn             v28.4h, v16.4s
        xtn2            v28.8h, v17.4s
        bl              warp_filter_horz_neon
        xtn             v29.4h, v16.4s
        xtn2            v29.8h, v17.4s
        bl              warp_filter_horz_neon
        xtn             v30.4h, v16.4s
        xtn2            v30.8h, v17.4s

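        // Main loop: filter one new row horizontally into v31, fetch the
        // eight per-row vertical filters and transpose them so that each
        // of v0-v7 holds one tap for all eight columns, then filter
        // v24-v31 vertically into a single output row.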
1:
        add             w14, w6, #512
        bl              warp_filter_horz_neon
        xtn             v31.4h, v16.4s
        xtn2            v31.8h, v17.4s

        load_filter_row d0, w14, w9
        load_filter_row d1, w14, w9
        load_filter_row d2, w14, w9
        load_filter_row d3, w14, w9
        load_filter_row d4, w14, w9
        load_filter_row d5, w14, w9
        load_filter_row d6, w14, w9
        load_filter_row d7, w14, w9
        transpose_8x8b  v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
        sxtl            v0.8h, v0.8b
        sxtl            v1.8h, v1.8b
        sxtl            v2.8h, v2.8b
        sxtl            v3.8h, v3.8b
        sxtl            v4.8h, v4.8b
        sxtl            v5.8h, v5.8b
        sxtl            v6.8h, v6.8b
        sxtl            v7.8h, v7.8b

        // This ordering of smull/smlal/smull2/smlal2 is highly
        // beneficial for Cortex A53 here.
        smull           v16.4s, v24.4h, v0.4h
        smlal           v16.4s, v25.4h, v1.4h
        smlal           v16.4s, v26.4h, v2.4h
        smlal           v16.4s, v27.4h, v3.4h
        smlal           v16.4s, v28.4h, v4.4h
        smlal           v16.4s, v29.4h, v5.4h
        smlal           v16.4s, v30.4h, v6.4h
        smlal           v16.4s, v31.4h, v7.4h
        smull2          v17.4s, v24.8h, v0.8h
        smlal2          v17.4s, v25.8h, v1.8h
        smlal2          v17.4s, v26.8h, v2.8h
        smlal2          v17.4s, v27.8h, v3.8h
        smlal2          v17.4s, v28.8h, v4.8h
        smlal2          v17.4s, v29.8h, v5.8h
        smlal2          v17.4s, v30.8h, v6.8h
        smlal2          v17.4s, v31.8h, v7.8h

        mov             v24.16b, v25.16b
        mov             v25.16b, v26.16b
.ifb \t
        srshl           v16.4s, v16.4s, v13.4s // -(7 + intermediate_bits)
        srshl           v17.4s, v17.4s, v13.4s // -(7 + intermediate_bits)
.else
        rshrn           v16.4h, v16.4s, #7
        rshrn2          v16.8h, v17.4s, #7
.endif
        mov             v26.16b, v27.16b
.ifb \t
        sqxtun          v16.4h, v16.4s
        sqxtun2         v16.8h, v17.4s
.else
        sub             v16.8h, v16.8h, v15.8h // PREP_BIAS
.endif
        mov             v27.16b, v28.16b
        mov             v28.16b, v29.16b
.ifb \t
        umin            v16.8h, v16.8h, v15.8h // bitdepth_max
.endif
        mov             v29.16b, v30.16b
        mov             v30.16b, v31.16b
        subs            w10, w10, #1
        st1             {v16.8h}, [x0], x1

        add             w6, w6, w4
        b.gt            1b

        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40

        br              x15
endfunc
.endm

warp
warp t

// void dav1d_emu_edge_16bpc_neon(
//         const intptr_t bw, const intptr_t bh,
//         const intptr_t iw, const intptr_t ih,
//         const intptr_t x, const intptr_t y,
//         pixel *dst, const ptrdiff_t dst_stride,
//         const pixel *ref, const ptrdiff_t ref_stride)
function emu_edge_16bpc_neon, export=1
        ldp             x8, x9, [sp]

        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
        // ref += iclip(x, 0, iw - 1)
        sub             x12, x3, #1            // ih - 1
        cmp             x5, x3
        sub             x13, x2, #1            // iw - 1
        csel            x12, x12, x5, ge       // min(y, ih - 1)
        cmp             x4, x2
        bic             x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
        csel            x13, x13, x4, ge       // min(x, iw - 1)
        bic             x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
        madd            x8, x12, x9, x8        // ref += iclip() * stride
        add             x8, x8, x13, lsl #1    // ref += iclip()

        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
        // top_ext = iclip(-y, 0, bh - 1)
        add             x10, x5, x1            // y + bh
        neg             x5, x5                 // -y
        sub             x10, x10, x3           // y + bh - ih
        sub             x12, x1, #1            // bh - 1
        cmp             x10, x1
        bic             x5, x5, x5, asr #63    // max(-y, 0)
        csel            x10, x10, x12, lt      // min(y + bh - ih, bh-1)
        cmp             x5, x1
        bic             x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
        csel            x5, x5, x12, lt        // min(max(-y, 0), bh-1)

        // right_ext = iclip(x + bw - iw, 0, bw - 1)
        // left_ext = iclip(-x, 0, bw - 1)
        add             x11, x4, x0            // x + bw
        neg             x4, x4                 // -x
        sub             x11, x11, x2           // x + bw - iw
        sub             x13, x0, #1            // bw - 1
        cmp             x11, x0
        bic             x4, x4, x4, asr #63    // max(-x, 0)
        csel            x11, x11, x13, lt      // min(x + bw - iw, bw-1)
        cmp             x4, x0
        bic             x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
        csel            x4, x4, x13, lt        // min(max(-x, 0), bw - 1)

        // center_h = bh - top_ext - bottom_ext
        // dst += top_ext * PXSTRIDE(dst_stride)
        // center_w = bw - left_ext - right_ext
        sub             x1, x1, x5             // bh - top_ext
        madd            x6, x5, x7, x6
        sub             x2, x0, x4             // bw - left_ext
        sub             x1, x1, x10            // center_h = bh - top_ext - bottom_ext
        sub             x2, x2, x11            // center_w = bw - left_ext - right_ext

        mov             x14, x6                // backup of dst

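// One pass over the center rows: optionally splat the leftmost/rightmost
// source pixel across the left/right extension with ld1r, and copy the
// clipped center in 64-byte chunks; instantiated below once per
// need_left/need_right combination.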
.macro v_loop need_left, need_right
0:
.if \need_left
        ld1r            {v0.8h}, [x8]
        mov             x12, x6                // out = dst
        mov             x3, x4
        mov             v1.16b, v0.16b
1:
        subs            x3, x3, #16
        st1             {v0.8h, v1.8h}, [x12], #32
        b.gt            1b
.endif
        mov             x13, x8
        add             x12, x6, x4, lsl #1    // out = dst + left_ext
        mov             x3, x2
1:
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64
        subs            x3, x3, #32
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x12], #64
        b.gt            1b
.if \need_right
        add             x3, x8, x2, lsl #1     // in + center_w
        sub             x3, x3, #2             // in + center_w - 1
        add             x12, x6, x4, lsl #1    // dst + left_ext
        ld1r            {v0.8h}, [x3]
        add             x12, x12, x2, lsl #1   // out = dst + left_ext + center_w
        mov             x3, x11
        mov             v1.16b, v0.16b
1:
        subs            x3, x3, #16
        st1             {v0.8h, v1.8h}, [x12], #32
        b.gt            1b
.endif

        subs            x1, x1, #1             // center_h--
        add             x6, x6, x7
        add             x8, x8, x9
        b.gt            0b
.endm

        cbz             x4, 2f
        // need_left
        cbz             x11, 3f
        // need_left + need_right
        v_loop          1, 1
        b               5f

2:
        // !need_left
        cbz             x11, 4f
        // !need_left + need_right
        v_loop          0, 1
        b               5f

3:
        // need_left + !need_right
        v_loop          1, 0
        b               5f

4:
        // !need_left + !need_right
        v_loop          0, 0

5:

        cbz             x10, 3f
        // need_bottom
        sub             x8, x6, x7             // ref = dst - stride
        mov             x4, x0
1:
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], #64
        mov             x3, x10
2:
        subs            x3, x3, #1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
        b.gt            2b
        msub            x6, x7, x10, x6        // dst -= bottom_ext * stride
        subs            x4, x4, #32            // bw -= 32
        add             x6, x6, #64            // dst += 32
        b.gt            1b

3:
        cbz             x5, 3f
        // need_top
        msub            x6, x7, x5, x14        // dst = stored_dst - top_ext * stride
1:
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x14], #64
        mov             x3, x5
2:
        subs            x3, x3, #1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
        b.gt            2b
        msub            x6, x7, x5, x6         // dst -= top_ext * stride
        subs            x0, x0, #32            // bw -= 32
        add             x6, x6, #64            // dst += 32
        b.gt            1b

3:
        ret
endfunc