/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2020, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/arm/asm.S"
#include "util.S"

// The exported functions in this file all have the following signature:
// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob,
//                int bitdepth_max);

// Most of the functions use the following register layout:
// x0-x3   external parameters
// x4      function pointer to first transform
// x5      function pointer to second transform
// x6      output parameter for helper function
// x7      input parameter for helper function
// x8      input stride for helper function
// x9-x12  scratch variables for helper functions
// x13     pointer to list of eob thresholds
// x14     return pointer for helper function
// x15     return pointer for main function

// The SIMD registers most often use the following layout:
// v0-v1   multiplication coefficients
// v2-v7   scratch registers
// v8-v15  unused
// v16-v31 inputs/outputs of transforms
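// A rough C model of what each exported function does (illustrative only:
// txfm1/txfm2 are placeholders and the per-size intermediate rounding and
// scaling steps are omitted; this is not the exact dav1d C code):
//
//   void itxfm_add_c(pixel *dst, ptrdiff_t stride, coef *coeff, int eob,
//                    int bitdepth_max) {
//       int32_t tmp[W * H];
//       txfm1(tmp, coeff);    // first pass, done 4 columns at a time (.4s)
//       memset(coeff, 0, sizeof(*coeff) * W * H);  // coeffs are consumed
//       transpose(tmp);
//       txfm2(tmp);           // second pass, done 8 columns at a time (.8h)
//       for (int y = 0; y < H; y++)
//           for (int x = 0; x < W; x++) {
//               const int v = dst[y * stride + x] + ((tmp[y * W + x] + 8) >> 4);
//               dst[y * stride + x] = iclip(v, 0, bitdepth_max);
//           }
//   }
//
// Note that the clamping below hardcodes bitdepth_max = 0x3ff (10 bpc).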
const idct_coeffs, align=4
        // idct4
        .int 2896, 2896*8*(1<<16), 1567, 3784
        // idct8
        .int 799, 4017, 3406, 2276
        // idct16
        .int 401, 4076, 3166, 2598
        .int 1931, 3612, 3920, 1189
        // idct32
        .int 201, 4091, 3035, 2751
        .int 1751, 3703, 3857, 1380
        .int 995, 3973, 3513, 2106
        .int 2440, 3290, 4052, 601
endconst

const idct64_coeffs, align=4
        .int 101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16)
        .int 1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16)
        .int 4076, 401, 4017, 799

        .int 4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16)
        .int 3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16)
        .int -3166, -2598, -799, -4017

        .int 501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16)
        .int 2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16)
        .int 3612, 1931, 2276, 3406

        .int 4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16)
        .int 3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16)
        .int -3920, -1189, -3406, -2276
endconst

const iadst4_coeffs, align=4
        .int 1321, 3803, 2482, 3344
endconst

const iadst8_coeffs, align=4
        .int 4076, 401, 3612, 1931
        .int 2598, 3166, 1189, 3920
        // idct_coeffs
        .int 2896, 0, 1567, 3784
endconst

const iadst16_coeffs, align=4
        .int 4091, 201, 3973, 995
        .int 3703, 1751, 3290, 2440
        .int 2751, 3035, 2106, 3513
        .int 1380, 3857, 601, 4052
endconst

.macro mul_mla d, s0, s1, c0, c1
        mul \d\().4s, \s0\().4s, \c0
        mla \d\().4s, \s1\().4s, \c1
.endm

.macro mul_mls d, s0, s1, c0, c1
        mul \d\().4s, \s0\().4s, \c0
        mls \d\().4s, \s1\().4s, \c1
.endm
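// The constants above are Q12 fixed-point trig values (e.g. 2896 ~=
// 0.7071 * 4096). Entries pre-scaled by 8*(1<<16) are intended for
// sqrdmulh, which computes (2*a*b + (1<<31)) >> 32; with b = c*8*(1<<16)
// that amounts to a rounding multiply by c/4096 with no separate shift.
// (Explanatory note; inferred from the sqrdmulh/srshr uses below.)
//
// mul_mla/mul_mls form the 32-bit butterfly d = s0*c0 +/- s1*c1; the
// callers round the Q12 product back down with srshr #12.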
.macro scale_input sz, c, r0, r1, r2, r3, r4, r5, r6, r7
        sqrdmulh \r0\sz, \r0\sz, \c
        sqrdmulh \r1\sz, \r1\sz, \c
        sqrdmulh \r2\sz, \r2\sz, \c
        sqrdmulh \r3\sz, \r3\sz, \c
.ifnb \r4
        sqrdmulh \r4\sz, \r4\sz, \c
        sqrdmulh \r5\sz, \r5\sz, \c
        sqrdmulh \r6\sz, \r6\sz, \c
        sqrdmulh \r7\sz, \r7\sz, \c
.endif
.endm

.macro load_add_store load, shift, addsrc, adddst, max, min, store, dst, src, shiftbits=4
.ifnb \load
        ld1 {\load}, [\src], x1
.endif
.ifnb \shift
        srshr \shift, \shift, #\shiftbits
.endif
.ifnb \addsrc
        sqadd \adddst, \adddst, \addsrc
.endif
.ifnb \max
        smax \max, \max, v6.8h
.endif
.ifnb \min
        smin \min, \min, v7.8h
.endif
.ifnb \store
        st1 {\store}, [\dst], x1
.endif
.endm
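// The load_add_store_WxH helpers below software-pipeline the epilogue: each
// invocation of load_add_store advances a different row through one stage of
// load -> round/shift -> saturating add -> clamp to [0, 0x3ff] -> store, so
// loads and stores overlap the arithmetic. The empty leading/trailing calls
// fill and drain the pipeline.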
.macro load_add_store_8x16 dst, src
        mov \src, \dst
        movi v6.8h, #0
        mvni v7.8h, #0xfc, lsl #8  // 0x3ff
        load_add_store v2.8h, v16.8h, , , , , , \dst, \src
        load_add_store v3.8h, v17.8h, , , , , , \dst, \src
        load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src
        load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src
        load_add_store v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src
        load_add_store v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src
        load_add_store v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src
        load_add_store v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src
        load_add_store v2.8h, v24.8h, v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src
        load_add_store v3.8h, v25.8h, v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src
        load_add_store v4.8h, v26.8h, v2.8h, v24.8h, v23.8h, v22.8h, v21.8h, \dst, \src
        load_add_store v5.8h, v27.8h, v3.8h, v25.8h, v24.8h, v23.8h, v22.8h, \dst, \src
        load_add_store v2.8h, v28.8h, v4.8h, v26.8h, v25.8h, v24.8h, v23.8h, \dst, \src
        load_add_store v3.8h, v29.8h, v5.8h, v27.8h, v26.8h, v25.8h, v24.8h, \dst, \src
        load_add_store v4.8h, v30.8h, v2.8h, v28.8h, v27.8h, v26.8h, v25.8h, \dst, \src
        load_add_store v5.8h, v31.8h, v3.8h, v29.8h, v28.8h, v27.8h, v26.8h, \dst, \src
        load_add_store , , v4.8h, v30.8h, v29.8h, v28.8h, v27.8h, \dst, \src
        load_add_store , , v5.8h, v31.8h, v30.8h, v29.8h, v28.8h, \dst, \src
        load_add_store , , , , v31.8h, v30.8h, v29.8h, \dst, \src
        load_add_store , , , , , v31.8h, v30.8h, \dst, \src
        load_add_store , , , , , , v31.8h, \dst, \src
.endm
.macro load_add_store_8x8 dst, src, shiftbits=4
        mov \src, \dst
        movi v6.8h, #0
        mvni v7.8h, #0xfc, lsl #8  // 0x3ff
        load_add_store v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits
        load_add_store v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits
        load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits
        load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits
        load_add_store v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src, \shiftbits
        load_add_store v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
        load_add_store v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
        load_add_store v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src, \shiftbits
        load_add_store , , v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src, \shiftbits
        load_add_store , , v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src, \shiftbits
        load_add_store , , , , v23.8h, v22.8h, v21.8h, \dst, \src, \shiftbits
        load_add_store , , , , , v23.8h, v22.8h, \dst, \src, \shiftbits
        load_add_store , , , , , , v23.8h, \dst, \src, \shiftbits
.endm
.macro load_add_store_8x4 dst, src, shiftbits=4
        mov \src, \dst
        movi v6.8h, #0
        mvni v7.8h, #0xfc, lsl #8  // 0x3ff
        load_add_store v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits
        load_add_store v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits
        load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits
        load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits
        load_add_store , , v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src, \shiftbits
        load_add_store , , v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
        load_add_store , , , , v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
        load_add_store , , , , , v19.8h, v18.8h, \dst, \src, \shiftbits
        load_add_store , , , , , , v19.8h, \dst, \src, \shiftbits
.endm
.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, max, min, store, dst, src
.ifnb \load
        ld1 {\load}[0], [\src], x1
.endif
.ifnb \inssrc
        ins \insdst\().d[1], \inssrc\().d[0]
.endif
.ifnb \shift
        srshr \shift, \shift, #4
.endif
.ifnb \load
        ld1 {\load}[1], [\src], x1
.endif
.ifnb \addsrc
        sqadd \adddst, \adddst, \addsrc
.endif
.ifnb \store
        st1 {\store}[0], [\dst], x1
.endif
.ifnb \max
        smax \max, \max, v6.8h
.endif
.ifnb \min
        smin \min, \min, v7.8h
.endif
.ifnb \store
        st1 {\store}[1], [\dst], x1
.endif
.endm
.macro load_add_store_4x16 dst, src
        mov \src, \dst
        movi v6.8h, #0
        mvni v7.8h, #0xfc, lsl #8  // 0x3ff
        load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src
        load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src
        load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src
        load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src
        load_add_store4 v0.d, v25, v24, v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src
        load_add_store4 v1.d, v27, v26, v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , \dst, \src
        load_add_store4 v2.d, v29, v28, v24.8h, v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src
        load_add_store4 v3.d, v31, v30, v26.8h, v0.8h, v24.8h, v22.8h, v20.8h, v18.d, \dst, \src
        load_add_store4 , , , v28.8h, v1.8h, v26.8h, v24.8h, v22.8h, v20.d, \dst, \src
        load_add_store4 , , , v30.8h, v2.8h, v28.8h, v26.8h, v24.8h, v22.d, \dst, \src
        load_add_store4 , , , , v3.8h, v30.8h, v28.8h, v26.8h, v24.d, \dst, \src
        load_add_store4 , , , , , , v30.8h, v28.8h, v26.d, \dst, \src
        load_add_store4 , , , , , , , v30.8h, v28.d, \dst, \src
        load_add_store4 , , , , , , , , v30.d, \dst, \src
.endm
.macro load_add_store_4x8 dst, src
        mov \src, \dst
        movi v6.8h, #0
        mvni v7.8h, #0xfc, lsl #8  // 0x3ff
        load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src
        load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src
        load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src
        load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src
        load_add_store4 , , , v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src
        load_add_store4 , , , v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , \dst, \src
        load_add_store4 , , , , v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src
        load_add_store4 , , , , , , v22.8h, v20.8h, v18.d, \dst, \src
        load_add_store4 , , , , , , , v22.8h, v20.d, \dst, \src
        load_add_store4 , , , , , , , , v22.d, \dst, \src
.endm
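// DC-only fast path: if eob (w3) is 0, only the DC coefficient is set, and
// both transform passes collapse to scaling that single value (by 2896/4096
// per pass, once more for 2:1 rectangular sizes) followed by the usual
// rounding, add and clamp, handled by the width-specialized loops below.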
.macro idct_dc w, h, shift
        cbnz w3, 1f
        movz w16, #2896*8, lsl #16
        ld1r {v16.4s}, [x2]
        dup v0.2s, w16
        sqrdmulh v20.4s, v16.4s, v0.s[0]
        str wzr, [x2]
.if (\w == 2*\h) || (2*\w == \h)
        sqrdmulh v20.4s, v20.4s, v0.s[0]
.endif
.if \shift > 0
        sqrshrn v16.4h, v20.4s, #\shift
        sqrshrn2 v16.8h, v20.4s, #\shift
.else
        sqxtn v16.4h, v20.4s
        sqxtn2 v16.8h, v20.4s
.endif
        sqrdmulh v16.8h, v16.8h, v0.h[1]
        srshr v16.8h, v16.8h, #4
        mov w4, #\h
        b idct_dc_w\w\()_neon
1:
.endm

function idct_dc_w4_neon
        movi v30.8h, #0
        mvni v31.8h, #0xfc, lsl #8  // 0x3ff
1:
        ld1 {v0.d}[0], [x0], x1
        ld1 {v0.d}[1], [x0], x1
        ld1 {v1.d}[0], [x0], x1
        subs w4, w4, #4
        ld1 {v1.d}[1], [x0], x1
        sqadd v0.8h, v0.8h, v16.8h
        sub x0, x0, x1, lsl #2
        sqadd v1.8h, v1.8h, v16.8h
        smax v0.8h, v0.8h, v30.8h
        smax v1.8h, v1.8h, v30.8h
        smin v0.8h, v0.8h, v31.8h
        st1 {v0.d}[0], [x0], x1
        smin v1.8h, v1.8h, v31.8h
        st1 {v0.d}[1], [x0], x1
        st1 {v1.d}[0], [x0], x1
        st1 {v1.d}[1], [x0], x1
        b.gt 1b
        ret
endfunc

function idct_dc_w8_neon
        movi v30.8h, #0
        mvni v31.8h, #0xfc, lsl #8  // 0x3ff
1:
        ld1 {v0.8h}, [x0], x1
        subs w4, w4, #4
        ld1 {v1.8h}, [x0], x1
        sqadd v0.8h, v0.8h, v16.8h
        ld1 {v2.8h}, [x0], x1
        sqadd v1.8h, v1.8h, v16.8h
        ld1 {v3.8h}, [x0], x1
        sqadd v2.8h, v2.8h, v16.8h
        sqadd v3.8h, v3.8h, v16.8h
        sub x0, x0, x1, lsl #2
        smax v0.8h, v0.8h, v30.8h
        smax v1.8h, v1.8h, v30.8h
        smax v2.8h, v2.8h, v30.8h
        smax v3.8h, v3.8h, v30.8h
        smin v0.8h, v0.8h, v31.8h
        smin v1.8h, v1.8h, v31.8h
        st1 {v0.8h}, [x0], x1
        smin v2.8h, v2.8h, v31.8h
        st1 {v1.8h}, [x0], x1
        smin v3.8h, v3.8h, v31.8h
        st1 {v2.8h}, [x0], x1
        st1 {v3.8h}, [x0], x1
        b.gt 1b
        ret
endfunc

function idct_dc_w16_neon
        movi v30.8h, #0
        mvni v31.8h, #0xfc, lsl #8  // 0x3ff
1:
        ld1 {v0.8h, v1.8h}, [x0], x1
        subs w4, w4, #2
        ld1 {v2.8h, v3.8h}, [x0], x1
        sqadd v0.8h, v0.8h, v16.8h
        sqadd v1.8h, v1.8h, v16.8h
        sub x0, x0, x1, lsl #1
        sqadd v2.8h, v2.8h, v16.8h
        sqadd v3.8h, v3.8h, v16.8h
        smax v0.8h, v0.8h, v30.8h
        smax v1.8h, v1.8h, v30.8h
        smax v2.8h, v2.8h, v30.8h
        smax v3.8h, v3.8h, v30.8h
        smin v0.8h, v0.8h, v31.8h
        smin v1.8h, v1.8h, v31.8h
        smin v2.8h, v2.8h, v31.8h
        st1 {v0.8h, v1.8h}, [x0], x1
        smin v3.8h, v3.8h, v31.8h
        st1 {v2.8h, v3.8h}, [x0], x1
        b.gt 1b
        ret
endfunc

function idct_dc_w32_neon
        movi v30.8h, #0
        mvni v31.8h, #0xfc, lsl #8  // 0x3ff
1:
        ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
        subs w4, w4, #1
        sqadd v0.8h, v0.8h, v16.8h
        sqadd v1.8h, v1.8h, v16.8h
        sqadd v2.8h, v2.8h, v16.8h
        sqadd v3.8h, v3.8h, v16.8h
        smax v0.8h, v0.8h, v30.8h
        smax v1.8h, v1.8h, v30.8h
        smax v2.8h, v2.8h, v30.8h
        smax v3.8h, v3.8h, v30.8h
        smin v0.8h, v0.8h, v31.8h
        smin v1.8h, v1.8h, v31.8h
        smin v2.8h, v2.8h, v31.8h
        smin v3.8h, v3.8h, v31.8h
        st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        b.gt 1b
        ret
endfunc

function idct_dc_w64_neon
        movi v30.8h, #0
        mvni v31.8h, #0xfc, lsl #8  // 0x3ff
        sub x1, x1, #64
1:
        ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        subs w4, w4, #1
        sqadd v0.8h, v0.8h, v16.8h
        ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0]
        sqadd v1.8h, v1.8h, v16.8h
        sub x0, x0, #64
        sqadd v2.8h, v2.8h, v16.8h
        sqadd v3.8h, v3.8h, v16.8h
        sqadd v4.8h, v4.8h, v16.8h
        sqadd v5.8h, v5.8h, v16.8h
        sqadd v6.8h, v6.8h, v16.8h
        sqadd v7.8h, v7.8h, v16.8h
        smax v0.8h, v0.8h, v30.8h
        smax v1.8h, v1.8h, v30.8h
        smax v2.8h, v2.8h, v30.8h
        smax v3.8h, v3.8h, v30.8h
        smax v4.8h, v4.8h, v30.8h
        smax v5.8h, v5.8h, v30.8h
        smax v6.8h, v6.8h, v30.8h
        smax v7.8h, v7.8h, v30.8h
        smin v0.8h, v0.8h, v31.8h
        smin v1.8h, v1.8h, v31.8h
        smin v2.8h, v2.8h, v31.8h
        smin v3.8h, v3.8h, v31.8h
        smin v4.8h, v4.8h, v31.8h
        smin v5.8h, v5.8h, v31.8h
        st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        smin v6.8h, v6.8h, v31.8h
        smin v7.8h, v7.8h, v31.8h
        st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        b.gt 1b
        ret
endfunc
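// iwht4 is the integer Walsh-Hadamard butterfly used by the 4x4 WHT_WHT
// (lossless) transform; it is exact, so no rounding constants are needed.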
.macro iwht4
        add v16.4s, v16.4s, v17.4s
        sub v21.4s, v18.4s, v19.4s
        sub v20.4s, v16.4s, v21.4s
        sshr v20.4s, v20.4s, #1
        sub v18.4s, v20.4s, v17.4s
        sub v17.4s, v20.4s, v19.4s
        add v19.4s, v21.4s, v18.4s
        sub v16.4s, v16.4s, v17.4s
.endm

.macro idct_4 r0, r1, r2, r3
        mul_mla v6, \r1, \r3, v0.s[3], v0.s[2]
        mul_mls v4, \r1, \r3, v0.s[2], v0.s[3]
        mul_mla v2, \r0, \r2, v0.s[0], v0.s[0]
        mul_mls v3, \r0, \r2, v0.s[0], v0.s[0]
        srshr v6.4s, v6.4s, #12
        srshr v7.4s, v4.4s, #12
        srshr v2.4s, v2.4s, #12
        srshr v3.4s, v3.4s, #12
        sqadd \r0\().4s, v2.4s, v6.4s
        sqsub \r3\().4s, v2.4s, v6.4s
        sqadd \r1\().4s, v3.4s, v7.4s
        sqsub \r2\().4s, v3.4s, v7.4s
.endm

function inv_dct_4s_x4_neon
        movrel x16, idct_coeffs
        ld1 {v0.4s}, [x16]
        idct_4 v16, v17, v18, v19
        ret
endfunc

.macro iadst_4x4 o0, o1, o2, o3
        movrel x16, iadst4_coeffs
        ld1 {v0.4s}, [x16]

        sub v3.4s, v16.4s, v18.4s
        mul v4.4s, v16.4s, v0.s[0]
        mla v4.4s, v18.4s, v0.s[1]
        mla v4.4s, v19.4s, v0.s[2]
        mul v7.4s, v17.4s, v0.s[3]
        add v3.4s, v3.4s, v19.4s
        mul v5.4s, v16.4s, v0.s[2]
        mls v5.4s, v18.4s, v0.s[0]
        mls v5.4s, v19.4s, v0.s[1]

        add \o3\().4s, v4.4s, v5.4s
        mul \o2\().4s, v3.4s, v0.s[3]
        add \o0\().4s, v4.4s, v7.4s
        add \o1\().4s, v5.4s, v7.4s
        sub \o3\().4s, \o3\().4s, v7.4s

        srshr \o0\().4s, \o0\().4s, #12
        srshr \o2\().4s, \o2\().4s, #12
        srshr \o1\().4s, \o1\().4s, #12
        srshr \o3\().4s, \o3\().4s, #12
.endm

function inv_adst_4s_x4_neon
        iadst_4x4 v16, v17, v18, v19
        ret
endfunc

function inv_flipadst_4s_x4_neon
        iadst_4x4 v19, v18, v17, v16
        ret
endfunc

function inv_identity_4s_x4_neon
        movz w16, #(5793-4096)*8, lsl #16
        dup v0.2s, w16
        sqrdmulh v4.4s, v16.4s, v0.s[0]
        sqrdmulh v5.4s, v17.4s, v0.s[0]
        sqrdmulh v6.4s, v18.4s, v0.s[0]
        sqrdmulh v7.4s, v19.4s, v0.s[0]
        sqadd v16.4s, v16.4s, v4.4s
        sqadd v17.4s, v17.4s, v5.4s
        sqadd v18.4s, v18.4s, v6.4s
        sqadd v19.4s, v19.4s, v7.4s
        ret
endfunc
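// The identity transforms scale by 5793/4096 (~sqrt(2)). sqrdmulh can only
// multiply by factors below 1, so the constant is split: above,
// x + sqrdmulh(x, (5793-4096)*8 << 16) == x * 5793/4096; the x16 variant
// further below computes 2*x + sqrdmulh(x, 2*(5793-4096)...) for its
// 2*5793/4096 scale.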
function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1
        mov x15, x30
        movi v30.4s, #0
        movi v31.4s, #0
        ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x2]
        st1 {v30.4s, v31.4s}, [x2], #32

        sshr v16.4s, v16.4s, #2
        sshr v17.4s, v17.4s, #2
        sshr v18.4s, v18.4s, #2
        sshr v19.4s, v19.4s, #2

        iwht4

        st1 {v30.4s, v31.4s}, [x2], #32
        transpose_4x4s v16, v17, v18, v19, v20, v21, v22, v23

        iwht4

        ld1 {v0.d}[0], [x0], x1
        sqxtn v16.4h, v16.4s
        ld1 {v0.d}[1], [x0], x1
        sqxtn2 v16.8h, v17.4s
        ld1 {v1.d}[0], [x0], x1
        sqxtn v18.4h, v18.4s
        ld1 {v1.d}[1], [x0], x1
        sqxtn2 v18.8h, v19.4s

        b L(itx_4x4_end)
endfunc

function inv_txfm_add_4x4_neon
        movi v30.4s, #0
        movi v31.4s, #0
        ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x2]
        st1 {v30.4s, v31.4s}, [x2], #32

        blr x4

        st1 {v30.4s, v31.4s}, [x2], #32
        sqxtn v16.4h, v16.4s
        sqxtn v17.4h, v17.4s
        sqxtn v18.4h, v18.4s
        sqxtn v19.4h, v19.4s
        transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23

        blr x5

        ld1 {v0.d}[0], [x0], x1
        ld1 {v0.d}[1], [x0], x1
        ins v16.d[1], v17.d[0]
        ins v18.d[1], v19.d[0]
        ld1 {v1.d}[0], [x0], x1
        ld1 {v1.d}[1], [x0], x1
        srshr v16.8h, v16.8h, #4
        srshr v18.8h, v18.8h, #4

L(itx_4x4_end):
        mvni v31.8h, #0xfc, lsl #8  // 0x3ff
        sub x0, x0, x1, lsl #2
        sqadd v16.8h, v16.8h, v0.8h
        sqadd v18.8h, v18.8h, v1.8h
        smax v16.8h, v16.8h, v30.8h
        smax v18.8h, v18.8h, v30.8h
        smin v16.8h, v16.8h, v31.8h
        st1 {v16.d}[0], [x0], x1
        smin v18.8h, v18.8h, v31.8h
        st1 {v16.d}[1], [x0], x1
        st1 {v18.d}[0], [x0], x1
        st1 {v18.d}[1], [x0], x1

        br x15
endfunc

.macro def_fn_4x4 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1
        mov x15, x30

.ifc \txfm1\()_\txfm2, dct_dct
        cbnz w3, 1f
        movz w16, #2896*8, lsl #16
        ld1r {v16.4s}, [x2]
        dup v4.2s, w16
        str wzr, [x2]
        sqrdmulh v16.4s, v16.4s, v4.s[0]
        ld1 {v0.d}[0], [x0], x1
        sqxtn v20.4h, v16.4s
        sqxtn2 v20.8h, v16.4s
        ld1 {v0.d}[1], [x0], x1
        sqrdmulh v20.8h, v20.8h, v4.h[1]
        ld1 {v1.d}[0], [x0], x1
        srshr v16.8h, v20.8h, #4
        ld1 {v1.d}[1], [x0], x1
        srshr v18.8h, v20.8h, #4
        movi v30.8h, #0
        b L(itx_4x4_end)
1:
.endif
        adr x4, inv_\txfm1\()_4s_x4_neon
        movrel x5, X(inv_\txfm2\()_4h_x4_neon)
        b inv_txfm_add_4x4_neon
endfunc
.endm

def_fn_4x4 dct, dct
def_fn_4x4 identity, identity
def_fn_4x4 dct, adst
def_fn_4x4 dct, flipadst
def_fn_4x4 dct, identity
def_fn_4x4 adst, dct
def_fn_4x4 adst, adst
def_fn_4x4 adst, flipadst
def_fn_4x4 flipadst, dct
def_fn_4x4 flipadst, adst
def_fn_4x4 flipadst, flipadst
def_fn_4x4 identity, dct

def_fn_4x4 adst, identity
def_fn_4x4 flipadst, identity
def_fn_4x4 identity, adst
def_fn_4x4 identity, flipadst
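// idct_8 decomposes the 8-point DCT: the even inputs (r0, r2, r4, r6) go
// through idct_4 unchanged, while the odd inputs get their own rotation and
// butterfly stages before the final cross additions.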
.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7
        idct_4 \r0, \r2, \r4, \r6

        mul_mls v2, \r1, \r7, v1.s[0], v1.s[1]  // -> t4a
        mul_mla v4, \r1, \r7, v1.s[1], v1.s[0]  // -> t7a
        mul_mls v6, \r5, \r3, v1.s[2], v1.s[3]  // -> t5a
        mul_mla v7, \r5, \r3, v1.s[3], v1.s[2]  // -> t6a
        srshr \r1\().4s, v2.4s, #12  // t4a
        srshr \r7\().4s, v4.4s, #12  // t7a
        srshr \r3\().4s, v6.4s, #12  // t5a
        srshr \r5\().4s, v7.4s, #12  // t6a

        sqadd v2.4s, \r1\().4s, \r3\().4s  // t4
        sqsub \r1\().4s, \r1\().4s, \r3\().4s  // t5a
        sqadd v3.4s, \r7\().4s, \r5\().4s  // t7
        sqsub \r3\().4s, \r7\().4s, \r5\().4s  // t6a

        mul_mls v4, \r3, \r1, v0.s[0], v0.s[0]  // -> t5
        mul_mla v6, \r3, \r1, v0.s[0], v0.s[0]  // -> t6
        srshr v4.4s, v4.4s, #12  // t5
        srshr v5.4s, v6.4s, #12  // t6

        sqsub \r7\().4s, \r0\().4s, v3.4s  // out7
        sqadd \r0\().4s, \r0\().4s, v3.4s  // out0
        sqadd \r1\().4s, \r2\().4s, v5.4s  // out1
        sqsub v6.4s, \r2\().4s, v5.4s  // out6
        sqadd \r2\().4s, \r4\().4s, v4.4s  // out2
        sqsub \r5\().4s, \r4\().4s, v4.4s  // out5
        sqadd \r3\().4s, \r6\().4s, v2.4s  // out3
        sqsub \r4\().4s, \r6\().4s, v2.4s  // out4
        mov \r6\().16b, v6.16b  // out6
.endm

function inv_dct_4s_x8_neon
        movrel x16, idct_coeffs
        ld1 {v0.4s, v1.4s}, [x16]
        idct_8 v16, v17, v18, v19, v20, v21, v22, v23
        ret
endfunc

.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7
        movrel x16, iadst8_coeffs
        ld1 {v0.4s, v1.4s}, [x16], #32

        mul_mla v2, v23, v16, v0.s[0], v0.s[1]
        mul_mls v4, v23, v16, v0.s[1], v0.s[0]
        mul_mla v6, v21, v18, v0.s[2], v0.s[3]
        srshr v16.4s, v2.4s, #12  // t0a
        srshr v23.4s, v4.4s, #12  // t1a
        mul_mls v2, v21, v18, v0.s[3], v0.s[2]
        mul_mla v4, v19, v20, v1.s[0], v1.s[1]
        srshr v18.4s, v6.4s, #12  // t2a
        srshr v21.4s, v2.4s, #12  // t3a
        mul_mls v6, v19, v20, v1.s[1], v1.s[0]
        mul_mla v2, v17, v22, v1.s[2], v1.s[3]
        srshr v20.4s, v4.4s, #12  // t4a
        srshr v19.4s, v6.4s, #12  // t5a
        mul_mls v4, v17, v22, v1.s[3], v1.s[2]
        srshr v22.4s, v2.4s, #12  // t6a
        srshr v17.4s, v4.4s, #12  // t7a

        ld1 {v0.4s}, [x16]

        sqadd v2.4s, v16.4s, v20.4s  // t0
        sqsub v3.4s, v16.4s, v20.4s  // t4
        sqadd v4.4s, v23.4s, v19.4s  // t1
        sqsub v5.4s, v23.4s, v19.4s  // t5
        sqadd v6.4s, v18.4s, v22.4s  // t2
        sqsub v7.4s, v18.4s, v22.4s  // t6
        sqadd v18.4s, v21.4s, v17.4s  // t3
        sqsub v19.4s, v21.4s, v17.4s  // t7

        mul_mla v16, v3, v5, v0.s[3], v0.s[2]
        mul_mls v20, v3, v5, v0.s[2], v0.s[3]
        mul_mls v22, v19, v7, v0.s[3], v0.s[2]

        srshr v3.4s, v16.4s, #12  // t4a
        srshr v5.4s, v20.4s, #12  // t5a

        mul_mla v16, v19, v7, v0.s[2], v0.s[3]

        srshr v7.4s, v22.4s, #12  // t6a
        srshr v19.4s, v16.4s, #12  // t7a

        sqadd \o0\().4s, v2.4s, v6.4s  // out0
        sqsub v2.4s, v2.4s, v6.4s  // t2
        sqadd \o7\().4s, v4.4s, v18.4s  // out7
        sqsub v4.4s, v4.4s, v18.4s  // t3
        sqneg \o7\().4s, \o7\().4s  // out7

        sqadd \o1\().4s, v3.4s, v7.4s  // out1
        sqsub v3.4s, v3.4s, v7.4s  // t6
        sqadd \o6\().4s, v5.4s, v19.4s  // out6
        sqsub v5.4s, v5.4s, v19.4s  // t7
        sqneg \o1\().4s, \o1\().4s  // out1

        mul_mla v18, v2, v4, v0.s[0], v0.s[0]  // -> out3 (v19 or v20)
        mul_mls v6, v2, v4, v0.s[0], v0.s[0]  // -> out4 (v20 or v19)
        mul_mls v20, v3, v5, v0.s[0], v0.s[0]  // -> out5 (v21 or v18)
        srshr v2.4s, v18.4s, #12  // out3
        mul_mla v18, v3, v5, v0.s[0], v0.s[0]  // -> out2 (v18 or v21)
        srshr v3.4s, v20.4s, #12  // out5
        srshr \o2\().4s, v18.4s, #12  // out2 (v18 or v21)
        srshr \o4\().4s, v6.4s, #12  // out4 (v20 or v19)

        sqneg \o3\().4s, v2.4s  // out3
        sqneg \o5\().4s, v3.4s  // out5
.endm

function inv_adst_4s_x8_neon
        iadst_8 v16, v17, v18, v19, v20, v21, v22, v23
        ret
endfunc

function inv_flipadst_4s_x8_neon
        iadst_8 v23, v22, v21, v20, v19, v18, v17, v16
        ret
endfunc

function inv_identity_4s_x8_neon
        sqshl v16.4s, v16.4s, #1
        sqshl v17.4s, v17.4s, #1
        sqshl v18.4s, v18.4s, #1
        sqshl v19.4s, v19.4s, #1
        sqshl v20.4s, v20.4s, #1
        sqshl v21.4s, v21.4s, #1
        sqshl v22.4s, v22.4s, #1
        sqshl v23.4s, v23.4s, #1
        ret
endfunc
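// w13 holds an eob threshold: when eob (w3) is below it, the second half of
// the coefficients (at x2 + 16) is known to be all zero, so its first-pass
// transform is skipped and zero registers are substituted instead.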
function inv_txfm_add_8x8_neon
        movi v31.4s, #0

        cmp w3, w13
        mov x11, #32
        b.lt 1f

        add x6, x2, #16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        ld1 {\i}, [x6]
        st1 {v31.4s}, [x6], x11
.endr

        blr x4

        sqrshrn v24.4h, v16.4s, #1
        sqrshrn v25.4h, v17.4s, #1
        sqrshrn v26.4h, v18.4s, #1
        sqrshrn v27.4h, v19.4s, #1
        sqrshrn2 v24.8h, v20.4s, #1
        sqrshrn2 v25.8h, v21.4s, #1
        sqrshrn2 v26.8h, v22.4s, #1
        sqrshrn2 v27.8h, v23.4s, #1

        transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5

        b 2f

1:
.irp i, v24.8h, v25.8h, v26.8h, v27.8h
        movi \i, #0
.endr

2:

.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        ld1 {\i}, [x2]
        st1 {v31.4s}, [x2], x11
.endr

        blr x4

        sqrshrn v16.4h, v16.4s, #1
        sqrshrn v17.4h, v17.4s, #1
        sqrshrn v18.4h, v18.4s, #1
        sqrshrn v19.4h, v19.4s, #1
        sqrshrn2 v16.8h, v20.4s, #1
        sqrshrn2 v17.8h, v21.4s, #1
        sqrshrn2 v18.8h, v22.4s, #1
        sqrshrn2 v19.8h, v23.4s, #1

        transpose_4x8h v16, v17, v18, v19, v20, v21, v22, v23

        mov v20.16b, v24.16b
        mov v21.16b, v25.16b
        mov v22.16b, v26.16b
        mov v23.16b, v27.16b

        blr x5

        load_add_store_8x8 x0, x7
        br x15
endfunc

.macro def_fn_8x8 txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1
        mov x15, x30

.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc 8, 8, 1
.endif
        movrel x5, X(inv_\txfm2\()_8h_x8_neon)
        mov w13, #\eob_half
        adr x4, inv_\txfm1\()_4s_x8_neon
        b inv_txfm_add_8x8_neon
endfunc
.endm

def_fn_8x8 dct, dct, 10
def_fn_8x8 identity, identity, 10
def_fn_8x8 dct, adst, 10
def_fn_8x8 dct, flipadst, 10
def_fn_8x8 dct, identity, 4
def_fn_8x8 adst, dct, 10
def_fn_8x8 adst, adst, 10
def_fn_8x8 adst, flipadst, 10
def_fn_8x8 flipadst, dct, 10
def_fn_8x8 flipadst, adst, 10
def_fn_8x8 flipadst, flipadst, 10
def_fn_8x8 identity, dct, 4
def_fn_8x8 adst, identity, 4
def_fn_8x8 flipadst, identity, 4
def_fn_8x8 identity, adst, 4
def_fn_8x8 identity, flipadst, 4
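// Rectangular transforms with a 2:1 aspect ratio pre-scale all coefficients
// by 2896/4096 (1/sqrt(2)), as required by the AV1 spec; that is the
// scale_input pass with v0.s[0] = 2896*8*(1<<16) in the helpers below.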
function inv_txfm_add_8x4_neon
        movi v28.4s, #0
        movi v29.4s, #0
        movi v30.4s, #0
        movi v31.4s, #0
        ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x2]
        st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x2], #64
        movz w16, #2896*8, lsl #16
        dup v0.2s, w16
        ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x2]
        st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x2]

        scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23

        blr x4

        sqxtn v16.4h, v16.4s
        sqxtn v17.4h, v17.4s
        sqxtn v18.4h, v18.4s
        sqxtn v19.4h, v19.4s
        sqxtn v20.4h, v20.4s
        sqxtn v21.4h, v21.4s
        sqxtn v22.4h, v22.4s
        sqxtn v23.4h, v23.4s

        transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7
        transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7
        ins v16.d[1], v20.d[0]
        ins v17.d[1], v21.d[0]
        ins v18.d[1], v22.d[0]
        ins v19.d[1], v23.d[0]

        blr x5

        load_add_store_8x4 x0, x7
        br x15
endfunc

function inv_txfm_add_4x8_neon
        movz w16, #2896*8, lsl #16
        movi v31.4s, #0
        dup v30.2s, w16

        cmp w3, w13
        mov x11, #32
        b.lt 1f

        add x6, x2, #16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s
        ld1 {\i}, [x6]
        st1 {v31.4s}, [x6], x11
.endr
        scale_input .4s, v30.s[0], v16, v17, v18, v19
        blr x4
        sqxtn v20.4h, v16.4s
        sqxtn v21.4h, v17.4s
        sqxtn v22.4h, v18.4s
        sqxtn v23.4h, v19.4s
        transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7

        b 2f

1:
.irp i, v20, v21, v22, v23
        movi \i\().4h, #0
.endr

2:

.irp i, v16.4s, v17.4s, v18.4s, v19.4s
        ld1 {\i}, [x2]
        st1 {v31.4s}, [x2], x11
.endr
        scale_input .4s, v30.s[0], v16, v17, v18, v19
        blr x4
        sqxtn v16.4h, v16.4s
        sqxtn v17.4h, v17.4s
        sqxtn v18.4h, v18.4s
        sqxtn v19.4h, v19.4s
        transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7

        blr x5

        load_add_store_4x8 x0, x7
        br x15
endfunc

.macro def_fn_48 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
        mov x15, x30

.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc \w, \h, 0
.endif
        adr x4, inv_\txfm1\()_4s_x\w\()_neon
.if \w == 4
        mov w13, #\eob_half
.endif
        movrel x5, X(inv_\txfm2\()_\w\()h_x\h\()_neon)
        b inv_txfm_add_\w\()x\h\()_neon
endfunc
.endm

.macro def_fns_48 w, h
def_fn_48 \w, \h, dct, dct, 13
def_fn_48 \w, \h, identity, identity, 13
def_fn_48 \w, \h, dct, adst, 13
def_fn_48 \w, \h, dct, flipadst, 13
def_fn_48 \w, \h, dct, identity, 4
def_fn_48 \w, \h, adst, dct, 13
def_fn_48 \w, \h, adst, adst, 13
def_fn_48 \w, \h, adst, flipadst, 13
def_fn_48 \w, \h, flipadst, dct, 13
def_fn_48 \w, \h, flipadst, adst, 13
def_fn_48 \w, \h, flipadst, flipadst, 13
def_fn_48 \w, \h, identity, dct, 16
def_fn_48 \w, \h, adst, identity, 4
def_fn_48 \w, \h, flipadst, identity, 4
def_fn_48 \w, \h, identity, adst, 16
def_fn_48 \w, \h, identity, flipadst, 16
.endm

def_fns_48 4, 8
def_fns_48 8, 4
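// The 16-point DCT reuses idct_8 on the even-indexed inputs (v16, v18, ...)
// and computes the odd-indexed half (v17, v19, ...) with further butterfly
// stages, reloading the idct16 rows of idct_coeffs as it goes.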
function inv_dct_4s_x16_neon
        movrel x16, idct_coeffs
        ld1 {v0.4s, v1.4s}, [x16], #32

        idct_8 v16, v18, v20, v22, v24, v26, v28, v30

        ld1 {v0.4s, v1.4s}, [x16]
        sub x16, x16, #32

        mul_mls v2, v17, v31, v0.s[0], v0.s[1]  // -> t8a
        mul_mla v4, v17, v31, v0.s[1], v0.s[0]  // -> t15a
        mul_mls v6, v25, v23, v0.s[2], v0.s[3]  // -> t9a
        srshr v17.4s, v2.4s, #12  // t8a
        srshr v31.4s, v4.4s, #12  // t15a
        mul_mla v2, v25, v23, v0.s[3], v0.s[2]  // -> t14a
        mul_mls v4, v21, v27, v1.s[0], v1.s[1]  // -> t10a
        srshr v23.4s, v6.4s, #12  // t9a
        srshr v25.4s, v2.4s, #12  // t14a
        mul_mla v6, v21, v27, v1.s[1], v1.s[0]  // -> t13a
        mul_mls v2, v29, v19, v1.s[2], v1.s[3]  // -> t11a
        srshr v21.4s, v4.4s, #12  // t10a
        srshr v27.4s, v6.4s, #12  // t13a
        mul_mla v4, v29, v19, v1.s[3], v1.s[2]  // -> t12a
        srshr v19.4s, v2.4s, #12  // t11a
        srshr v29.4s, v4.4s, #12  // t12a

        ld1 {v0.4s}, [x16]

        sqsub v2.4s, v17.4s, v23.4s  // t9
        sqadd v17.4s, v17.4s, v23.4s  // t8
        sqsub v3.4s, v31.4s, v25.4s  // t14
        sqadd v31.4s, v31.4s, v25.4s  // t15
        sqsub v23.4s, v19.4s, v21.4s  // t10
        sqadd v19.4s, v19.4s, v21.4s  // t11
        sqadd v25.4s, v29.4s, v27.4s  // t12
        sqsub v29.4s, v29.4s, v27.4s  // t13

        mul_mls v4, v3, v2, v0.s[2], v0.s[3]  // -> t9a
        mul_mla v6, v3, v2, v0.s[3], v0.s[2]  // -> t14a
        srshr v21.4s, v4.4s, #12  // t9a
        srshr v27.4s, v6.4s, #12  // t14a

        mul_mls v4, v29, v23, v0.s[2], v0.s[3]  // -> t13a
        mul_mla v6, v29, v23, v0.s[3], v0.s[2]  // -> t10a
        srshr v29.4s, v4.4s, #12  // t13a
        neg v6.4s, v6.4s
        srshr v23.4s, v6.4s, #12  // t10a

        sqsub v2.4s, v17.4s, v19.4s  // t11a
        sqadd v17.4s, v17.4s, v19.4s  // t8a
        sqsub v3.4s, v31.4s, v25.4s  // t12a
        sqadd v31.4s, v31.4s, v25.4s  // t15a
        sqadd v19.4s, v21.4s, v23.4s  // t9
        sqsub v21.4s, v21.4s, v23.4s  // t10
        sqsub v25.4s, v27.4s, v29.4s  // t13
        sqadd v27.4s, v27.4s, v29.4s  // t14

        mul_mls v4, v3, v2, v0.s[0], v0.s[0]  // -> t11
        mul_mla v6, v3, v2, v0.s[0], v0.s[0]  // -> t12
        mul_mls v2, v25, v21, v0.s[0], v0.s[0]  // -> t10a

        srshr v4.4s, v4.4s, #12  // t11
        srshr v5.4s, v6.4s, #12  // t12
        mul_mla v6, v25, v21, v0.s[0], v0.s[0]  // -> t13a
        srshr v2.4s, v2.4s, #12  // t10a
        srshr v3.4s, v6.4s, #12  // t13a

        sqadd v6.4s, v16.4s, v31.4s  // out0
        sqsub v31.4s, v16.4s, v31.4s  // out15
        mov v16.16b, v6.16b
        sqadd v23.4s, v30.4s, v17.4s  // out7
        sqsub v7.4s, v30.4s, v17.4s  // out8
        sqadd v17.4s, v18.4s, v27.4s  // out1
        sqsub v30.4s, v18.4s, v27.4s  // out14
        sqadd v18.4s, v20.4s, v3.4s  // out2
        sqsub v29.4s, v20.4s, v3.4s  // out13
        sqadd v3.4s, v28.4s, v19.4s  // out6
        sqsub v25.4s, v28.4s, v19.4s  // out9
        sqadd v19.4s, v22.4s, v5.4s  // out3
        sqsub v28.4s, v22.4s, v5.4s  // out12
        sqadd v20.4s, v24.4s, v4.4s  // out4
        sqsub v27.4s, v24.4s, v4.4s  // out11
        sqadd v21.4s, v26.4s, v2.4s  // out5
        sqsub v26.4s, v26.4s, v2.4s  // out10
        mov v24.16b, v7.16b
        mov v22.16b, v3.16b

        ret
endfunc
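// Note the sqneg uses at the end of iadst_16 below: the odd-indexed outputs
// of the AV1 inverse ADST are defined with negated sign, and flipadst simply
// passes the output registers in reverse order.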
.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15
        movrel x16, iadst16_coeffs
        ld1 {v0.4s, v1.4s}, [x16], #32

        mul_mla v2, v31, v16, v0.s[0], v0.s[1]  // -> t0
        mul_mls v4, v31, v16, v0.s[1], v0.s[0]  // -> t1
        mul_mla v6, v29, v18, v0.s[2], v0.s[3]  // -> t2
        srshr v16.4s, v2.4s, #12  // t0
        srshr v31.4s, v4.4s, #12  // t1
        mul_mls v2, v29, v18, v0.s[3], v0.s[2]  // -> t3
        mul_mla v4, v27, v20, v1.s[0], v1.s[1]  // -> t4
        srshr v18.4s, v6.4s, #12  // t2
        srshr v29.4s, v2.4s, #12  // t3
        mul_mls v6, v27, v20, v1.s[1], v1.s[0]  // -> t5
        mul_mla v2, v25, v22, v1.s[2], v1.s[3]  // -> t6
        srshr v20.4s, v4.4s, #12  // t4
        srshr v27.4s, v6.4s, #12  // t5
        mul_mls v4, v25, v22, v1.s[3], v1.s[2]  // -> t7
        ld1 {v0.4s, v1.4s}, [x16]
        movrel x16, idct_coeffs
        mul_mla v6, v23, v24, v0.s[0], v0.s[1]  // -> t8
        srshr v22.4s, v2.4s, #12  // t6
        srshr v25.4s, v4.4s, #12  // t7
        mul_mls v2, v23, v24, v0.s[1], v0.s[0]  // -> t9
        mul_mla v4, v21, v26, v0.s[2], v0.s[3]  // -> t10
        srshr v23.4s, v6.4s, #12  // t8
        srshr v24.4s, v2.4s, #12  // t9
        mul_mls v6, v21, v26, v0.s[3], v0.s[2]  // -> t11
        mul_mla v2, v19, v28, v1.s[0], v1.s[1]  // -> t12
        srshr v21.4s, v4.4s, #12  // t10
        srshr v26.4s, v6.4s, #12  // t11
        mul_mls v4, v19, v28, v1.s[1], v1.s[0]  // -> t13
        mul_mla v6, v17, v30, v1.s[2], v1.s[3]  // -> t14
        srshr v19.4s, v2.4s, #12  // t12
        srshr v28.4s, v4.4s, #12  // t13
        mul_mls v2, v17, v30, v1.s[3], v1.s[2]  // -> t15
        srshr v17.4s, v6.4s, #12  // t14
        srshr v30.4s, v2.4s, #12  // t15

        ld1 {v0.4s, v1.4s}, [x16]

        sqsub v2.4s, v16.4s, v23.4s  // t8a
        sqadd v16.4s, v16.4s, v23.4s  // t0a
        sqsub v3.4s, v31.4s, v24.4s  // t9a
        sqadd v31.4s, v31.4s, v24.4s  // t1a
        sqadd v23.4s, v18.4s, v21.4s  // t2a
        sqsub v18.4s, v18.4s, v21.4s  // t10a
        sqadd v24.4s, v29.4s, v26.4s  // t3a
        sqsub v29.4s, v29.4s, v26.4s  // t11a
        sqadd v21.4s, v20.4s, v19.4s  // t4a
        sqsub v20.4s, v20.4s, v19.4s  // t12a
        sqadd v26.4s, v27.4s, v28.4s  // t5a
        sqsub v27.4s, v27.4s, v28.4s  // t13a
        sqadd v19.4s, v22.4s, v17.4s  // t6a
        sqsub v22.4s, v22.4s, v17.4s  // t14a
        sqadd v28.4s, v25.4s, v30.4s  // t7a
        sqsub v25.4s, v25.4s, v30.4s  // t15a

        mul_mla v4, v2, v3, v1.s[1], v1.s[0]  // -> t8
        mul_mls v6, v2, v3, v1.s[0], v1.s[1]  // -> t9
        mul_mla v2, v18, v29, v1.s[3], v1.s[2]  // -> t10
        srshr v17.4s, v4.4s, #12  // t8
        srshr v30.4s, v6.4s, #12  // t9
        mul_mls v4, v18, v29, v1.s[2], v1.s[3]  // -> t11
        mul_mls v6, v27, v20, v1.s[1], v1.s[0]  // -> t12
        srshr v18.4s, v2.4s, #12  // t10
        srshr v29.4s, v4.4s, #12  // t11
        mul_mla v2, v27, v20, v1.s[0], v1.s[1]  // -> t13
        mul_mls v4, v25, v22, v1.s[3], v1.s[2]  // -> t14
        srshr v27.4s, v6.4s, #12  // t12
        srshr v20.4s, v2.4s, #12  // t13
        mul_mla v6, v25, v22, v1.s[2], v1.s[3]  // -> t15
        srshr v25.4s, v4.4s, #12  // t14
        srshr v22.4s, v6.4s, #12  // t15

        sqsub v2.4s, v16.4s, v21.4s  // t4
        sqadd v16.4s, v16.4s, v21.4s  // t0
        sqsub v3.4s, v31.4s, v26.4s  // t5
        sqadd v31.4s, v31.4s, v26.4s  // t1
        sqadd v21.4s, v23.4s, v19.4s  // t2
        sqsub v23.4s, v23.4s, v19.4s  // t6
        sqadd v26.4s, v24.4s, v28.4s  // t3
        sqsub v24.4s, v24.4s, v28.4s  // t7
        sqadd v19.4s, v17.4s, v27.4s  // t8a
        sqsub v17.4s, v17.4s, v27.4s  // t12a
        sqadd v28.4s, v30.4s, v20.4s  // t9a
        sqsub v30.4s, v30.4s, v20.4s  // t13a
        sqadd v27.4s, v18.4s, v25.4s  // t10a
        sqsub v18.4s, v18.4s, v25.4s  // t14a
        sqadd v20.4s, v29.4s, v22.4s  // t11a
        sqsub v29.4s, v29.4s, v22.4s  // t15a

        mul_mla v4, v2, v3, v0.s[3], v0.s[2]  // -> t4a
        mul_mls v6, v2, v3, v0.s[2], v0.s[3]  // -> t5a
        mul_mls v2, v24, v23, v0.s[3], v0.s[2]  // -> t6a
        srshr v22.4s, v4.4s, #12  // t4a
        srshr v25.4s, v6.4s, #12  // t5a
        mul_mla v4, v24, v23, v0.s[2], v0.s[3]  // -> t7a
        mul_mla v6, v17, v30, v0.s[3], v0.s[2]  // -> t12
        srshr v24.4s, v2.4s, #12  // t6a
        srshr v23.4s, v4.4s, #12  // t7a
        mul_mls v2, v17, v30, v0.s[2], v0.s[3]  // -> t13
        mul_mls v4, v29, v18, v0.s[3], v0.s[2]  // -> t14
        srshr v17.4s, v6.4s, #12  // t12
        mul_mla v6, v29, v18, v0.s[2], v0.s[3]  // -> t15
        srshr v29.4s, v2.4s, #12  // t13
        srshr v30.4s, v4.4s, #12  // t14
        srshr v18.4s, v6.4s, #12  // t15

        sqsub v2.4s, v16.4s, v21.4s  // t2a
.ifc \o0, v16
        sqadd \o0\().4s, v16.4s, v21.4s  // out0
        sqsub v21.4s, v31.4s, v26.4s  // t3a
        sqadd \o15\().4s, v31.4s, v26.4s  // out15
.else
        sqadd v4.4s, v16.4s, v21.4s  // out0
        sqsub v21.4s, v31.4s, v26.4s  // t3a
        sqadd \o15\().4s, v31.4s, v26.4s  // out15
        mov \o0\().16b, v4.16b
.endif
        sqneg \o15\().4s, \o15\().4s  // out15

        sqsub v3.4s, v29.4s, v18.4s  // t15a
        sqadd \o13\().4s, v29.4s, v18.4s  // out13
        sqadd \o2\().4s, v17.4s, v30.4s  // out2
        sqsub v26.4s, v17.4s, v30.4s  // t14a
        sqneg \o13\().4s, \o13\().4s  // out13

        sqadd \o1\().4s, v19.4s, v27.4s  // out1
        sqsub v27.4s, v19.4s, v27.4s  // t10
        sqadd \o14\().4s, v28.4s, v20.4s  // out14
        sqsub v20.4s, v28.4s, v20.4s  // t11
        sqneg \o1\().4s, \o1\().4s  // out1

        sqadd \o3\().4s, v22.4s, v24.4s  // out3
        sqsub v22.4s, v22.4s, v24.4s  // t6
        sqadd \o12\().4s, v25.4s, v23.4s  // out12
        sqsub v23.4s, v25.4s, v23.4s  // t7
        sqneg \o3\().4s, \o3\().4s  // out3

        mul_mls v24, v2, v21, v0.s[0], v0.s[0]  // -> out8 (v24 or v23)
        mul_mla v4, v2, v21, v0.s[0], v0.s[0]  // -> out7 (v23 or v24)
        mul_mla v6, v26, v3, v0.s[0], v0.s[0]  // -> out5 (v21 or v26)

        srshr v24.4s, v24.4s, #12  // out8
        srshr v4.4s, v4.4s, #12  // out7
        srshr v5.4s, v6.4s, #12  // out5
        mul_mls v6, v26, v3, v0.s[0], v0.s[0]  // -> out10 (v26 or v21)
        mul_mla v2, v22, v23, v0.s[0], v0.s[0]  // -> out4 (v20 or v27)
        srshr v26.4s, v6.4s, #12  // out10

        mul_mls v6, v22, v23, v0.s[0], v0.s[0]  // -> out11 (v27 or v20)
        mul_mla v22, v27, v20, v0.s[0], v0.s[0]  // -> out6 (v22 or v25)
        mul_mls v21, v27, v20, v0.s[0], v0.s[0]  // -> out9 (v25 or v22)

        srshr \o4\().4s, v2.4s, #12  // out4
        srshr v6.4s, v6.4s, #12  // out11
        srshr v7.4s, v21.4s, #12  // out9
        srshr \o6\().4s, v22.4s, #12  // out6

.ifc \o8, v23
        mov \o8\().16b, v24.16b
        mov \o10\().16b, v26.16b
.endif

        sqneg \o7\().4s, v4.4s  // out7
        sqneg \o5\().4s, v5.4s  // out5
        sqneg \o11\().4s, v6.4s  // out11
        sqneg \o9\().4s, v7.4s  // out9
.endm

function inv_adst_4s_x16_neon
        iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
        ret
endfunc

function inv_flipadst_4s_x16_neon
        iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16
        ret
endfunc

function inv_identity_4s_x16_neon
        movz w16, #2*(5793-4096)*8, lsl #16
        dup v0.2s, w16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        sqrdmulh v2.4s, v\i\().4s, v0.s[0]
        sqadd v\i\().4s, v\i\().4s, v\i\().4s
        sqadd v\i\().4s, v\i\().4s, v2.4s
.endr
        ret
endfunc

.macro identity_4x16_shift1 c
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        sqrdmulh v3.4s, \i, \c
        srshr v3.4s, v3.4s, #1
        sqadd \i, \i, v3.4s
.endr
.endm

.macro identity_4x16 c
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        sqrdmulh v3.4s, \i, \c
        sqadd \i, \i, \i
        sqadd \i, \i, v3.4s
.endr
.endm
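// Helper contract (see the register list at the top of the file): x7 points
// at the 32-bit input coefficients with row stride x8, the narrowed 16-bit
// results are stored contiguously at x6, and x14 carries the return address
// because the blr x4/x5 calls inside clobber x30.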
.macro def_horz_16 scale=0, shift=2, suffix
function inv_txfm_horz\suffix\()_16x4_neon
        mov x14, x30
        movi v7.4s, #0
.if \scale
        movz w16, #2896*8, lsl #16
        dup v0.2s, w16
.endif
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        ld1 {\i}, [x7]
        st1 {v7.4s}, [x7], x8
.endr
.if \scale
        scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
.endif
        blr x4
        sqrshrn v16.4h, v16.4s, #\shift
        sqrshrn v17.4h, v17.4s, #\shift
        sqrshrn v18.4h, v18.4s, #\shift
        sqrshrn v19.4h, v19.4s, #\shift
        sqrshrn2 v16.8h, v20.4s, #\shift
        sqrshrn2 v17.8h, v21.4s, #\shift
        sqrshrn2 v18.8h, v22.4s, #\shift
        sqrshrn2 v19.8h, v23.4s, #\shift
        sqrshrn v20.4h, v24.4s, #\shift
        sqrshrn v21.4h, v25.4s, #\shift
        sqrshrn v22.4h, v26.4s, #\shift
        sqrshrn v23.4h, v27.4s, #\shift
        sqrshrn2 v20.8h, v28.4s, #\shift
        sqrshrn2 v21.8h, v29.4s, #\shift
        sqrshrn2 v22.8h, v30.4s, #\shift
        sqrshrn2 v23.8h, v31.4s, #\shift
        transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
        transpose_4x8h v20, v21, v22, v23, v4, v5, v6, v7

.irp i, v16.8h, v20.8h, v17.8h, v21.8h, v18.8h, v22.8h, v19.8h, v23.8h
        st1 {\i}, [x6], #16
.endr

        br x14
endfunc
.endm

def_horz_16 scale=0, shift=2
def_horz_16 scale=1, shift=1, suffix=_scale

function inv_txfm_add_vert_8x16_neon
        mov x14, x30
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1 {v\i\().8h}, [x7], x8
.endr
        blr x5
        load_add_store_8x16 x6, x7
        br x14
endfunc

function inv_txfm_add_16x16_neon
        mov x15, x30
        sub sp, sp, #512
        ldrh w12, [x13], #2
.irp i, 0, 4, 8, 12
        add x6, sp, #(\i*16*2)
.if \i > 0
        mov w8, #(16 - \i)
        cmp w3, w12
        b.lt 1f
.if \i < 12
        ldrh w12, [x13], #2
.endif
.endif
        add x7, x2, #(\i*4)
        mov x8, #16*4
        bl inv_txfm_horz_16x4_neon
.endr
        b 3f
1:
        movi v4.8h, #0
        movi v5.8h, #0
        movi v6.8h, #0
        movi v7.8h, #0
2:
        subs w8, w8, #4
.rept 2
        st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt 2b
3:
.irp i, 0, 8
        add x6, x0, #(\i*2)
        add x7, sp, #(\i*2)
        mov x8, #32
        bl inv_txfm_add_vert_8x16_neon
.endr

        add sp, sp, #512
        br x15
endfunc
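// Each eob_* table gives, per successive 4-line strip of first-pass output,
// the smallest eob for which that strip can still contain nonzero
// coefficients; the frontends walk the table (via x13) to skip transforming
// strips that are known to be zero. Identity passes compact the coefficients
// differently, hence the separate _identity tables.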
const eob_16x16
        .short 10, 36, 78, 256
endconst

const eob_16x16_identity
        .short 4, 8, 12, 256
endconst

.macro def_fn_16x16 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc 16, 16, 2
.endif
        adr x4, inv_\txfm1\()_4s_x16_neon
        movrel x5, X(inv_\txfm2\()_8h_x16_neon)
.ifc \txfm1, identity
.ifc \txfm2, identity
        movrel x13, eob_16x16
.else
        movrel x13, eob_16x16_identity
.endif
.else
.ifc \txfm2, identity
        movrel x13, eob_16x16_identity
.else
        movrel x13, eob_16x16
.endif
.endif
        b inv_txfm_add_16x16_neon
endfunc
.endm

def_fn_16x16 dct, dct
def_fn_16x16 identity, identity
def_fn_16x16 dct, adst
def_fn_16x16 dct, flipadst
def_fn_16x16 dct, identity
def_fn_16x16 adst, dct
def_fn_16x16 adst, adst
def_fn_16x16 adst, flipadst
def_fn_16x16 flipadst, dct
def_fn_16x16 flipadst, adst
def_fn_16x16 flipadst, flipadst
def_fn_16x16 identity, dct

function inv_txfm_add_16x4_neon
        mov x15, x30
        movi v4.4s, #0

.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        ld1 {\i}, [x2]
        st1 {v4.4s}, [x2], #16
.endr

        blr x4

        sqrshrn v16.4h, v16.4s, #1
        sqrshrn v17.4h, v17.4s, #1
        sqrshrn v18.4h, v18.4s, #1
        sqrshrn v19.4h, v19.4s, #1
        sqrshrn2 v16.8h, v20.4s, #1
        sqrshrn2 v17.8h, v21.4s, #1
        sqrshrn2 v18.8h, v22.4s, #1
        sqrshrn2 v19.8h, v23.4s, #1
        transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
        blr x5
        mov x6, x0
        load_add_store_8x4 x6, x7

        sqrshrn v16.4h, v24.4s, #1
        sqrshrn v17.4h, v25.4s, #1
        sqrshrn v18.4h, v26.4s, #1
        sqrshrn v19.4h, v27.4s, #1
        sqrshrn2 v16.8h, v28.4s, #1
        sqrshrn2 v17.8h, v29.4s, #1
        sqrshrn2 v18.8h, v30.4s, #1
        sqrshrn2 v19.8h, v31.4s, #1
        transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
        blr x5
        add x6, x0, #16
        load_add_store_8x4 x6, x7

        br x15
endfunc

function inv_txfm_add_4x16_neon
        ldrh w12, [x13, #4]
        mov x15, x30

        mov x11, #64

        cmp w3, w12
        ldrh w12, [x13, #2]
        b.lt 1f

        add x6, x2, #48
        movi v2.4s, #0
.irp i, v16.4s, v17.4s, v18.4s, v19.4s
        ld1 {\i}, [x6]
        st1 {v2.4s}, [x6], x11
.endr
        blr x4
        rshrn v28.4h, v16.4s, #1
        rshrn v29.4h, v17.4s, #1
        rshrn v30.4h, v18.4s, #1
        rshrn v31.4h, v19.4s, #1
        transpose_4x4h v28, v29, v30, v31, v4, v5, v6, v7

        b 2f
1:
.irp i, v28.4h, v29.4h, v30.4h, v31.4h
        movi \i, #0
.endr
2:
        cmp w3, w12
        ldrh w12, [x13, #0]
        b.lt 1f

        add x6, x2, #32
        movi v2.4s, #0
.irp i, v16.4s, v17.4s, v18.4s, v19.4s
        ld1 {\i}, [x6]
        st1 {v2.4s}, [x6], x11
.endr
        blr x4
        rshrn v24.4h, v16.4s, #1
        rshrn v25.4h, v17.4s, #1
        rshrn v26.4h, v18.4s, #1
        rshrn v27.4h, v19.4s, #1
        transpose_4x4h v24, v25, v26, v27, v4, v5, v6, v7

        b 2f
1:
.irp i, v24.4h, v25.4h, v26.4h, v27.4h
        movi \i, #0
.endr
2:
        cmp w3, w12
        b.lt 1f

        add x6, x2, #16
        movi v2.4s, #0
.irp i, v16.4s, v17.4s, v18.4s, v19.4s
        ld1 {\i}, [x6]
        st1 {v2.4s}, [x6], x11
.endr
        blr x4
        rshrn v20.4h, v16.4s, #1
        rshrn v21.4h, v17.4s, #1
        rshrn v22.4h, v18.4s, #1
        rshrn v23.4h, v19.4s, #1
        transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7

        b 2f
1:
.irp i, v20.4h, v21.4h, v22.4h, v23.4h
        movi \i, #0
.endr
2:

        movi v2.4s, #0
.irp i, v16.4s, v17.4s, v18.4s, v19.4s
        ld1 {\i}, [x2]
        st1 {v2.4s}, [x2], x11
.endr
        blr x4
        rshrn v16.4h, v16.4s, #1
        rshrn v17.4h, v17.4s, #1
        rshrn v18.4h, v18.4s, #1
        rshrn v19.4h, v19.4s, #1
        transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7

        blr x5

        load_add_store_4x16 x0, x6

        br x15
endfunc

const eob_4x16
        .short 13, 29, 45, 64
endconst

const eob_4x16_identity1
        .short 16, 32, 48, 64
endconst

const eob_4x16_identity2
        .short 4, 8, 12, 64
endconst

.macro def_fn_416 w, h, txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc \w, \h, 1
.endif
.if \w == 4
        adr x4, inv_\txfm1\()_4s_x\w\()_neon
        movrel x5, X(inv_\txfm2\()_4h_x\h\()_neon)
.ifc \txfm1, identity
.ifc \txfm2, identity
        movrel x13, eob_4x16
.else
        movrel x13, eob_4x16_identity1
.endif
.else
.ifc \txfm2, identity
        movrel x13, eob_4x16_identity2
.else
        movrel x13, eob_4x16
.endif
.endif
.else
        adr x4, inv_\txfm1\()_4s_x\w\()_neon
        movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon)
.endif
        b inv_txfm_add_\w\()x\h\()_neon
endfunc
.endm

.macro def_fns_416 w, h
def_fn_416 \w, \h, dct, dct
def_fn_416 \w, \h, identity, identity
def_fn_416 \w, \h, dct, adst
def_fn_416 \w, \h, dct, flipadst
def_fn_416 \w, \h, dct, identity
def_fn_416 \w, \h, adst, dct
def_fn_416 \w, \h, adst, adst
def_fn_416 \w, \h, adst, flipadst
def_fn_416 \w, \h, flipadst, dct
def_fn_416 \w, \h, flipadst, adst
def_fn_416 \w, \h, flipadst, flipadst
def_fn_416 \w, \h, identity, dct
def_fn_416 \w, \h, adst, identity
def_fn_416 \w, \h, flipadst, identity
def_fn_416 \w, \h, identity, adst
def_fn_416 \w, \h, identity, flipadst
.endm

def_fns_416 4, 16
def_fns_416 16, 4
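// inv_txfm_add_16x8 keeps a full tile of intermediates live at once and so
// spills into v8-v15; AAPCS64 makes the low 64 bits of those registers
// callee-saved, hence the stp/ldp of d8-d15 around the body.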
function inv_txfm_add_16x8_neon
        mov x15, x30
        stp d8, d9, [sp, #-0x40]!
        stp d10, d11, [sp, #0x10]
        stp d12, d13, [sp, #0x20]
        stp d14, d15, [sp, #0x30]

        cmp w3, w13
        mov x11, #32
        b.lt 1f

        movi v4.4s, #0
        movz w16, #2896*8, lsl #16
        dup v0.2s, w16

        add x6, x2, #16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        ld1 {\i}, [x6]
        st1 {v4.4s}, [x6], x11
.endr

        scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
        blr x4

        sqrshrn v8.4h, v16.4s, #1
        sqrshrn v9.4h, v17.4s, #1
        sqrshrn v10.4h, v18.4s, #1
        sqrshrn v11.4h, v19.4s, #1
        sqrshrn2 v8.8h, v20.4s, #1
        sqrshrn2 v9.8h, v21.4s, #1
        sqrshrn2 v10.8h, v22.4s, #1
        sqrshrn2 v11.8h, v23.4s, #1
        sqrshrn v12.4h, v24.4s, #1
        sqrshrn v13.4h, v25.4s, #1
        sqrshrn v14.4h, v26.4s, #1
        sqrshrn v15.4h, v27.4s, #1
        sqrshrn2 v12.8h, v28.4s, #1
        sqrshrn2 v13.8h, v29.4s, #1
        sqrshrn2 v14.8h, v30.4s, #1
        sqrshrn2 v15.8h, v31.4s, #1

        transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5
        transpose_4x8h v12, v13, v14, v15, v2, v3, v4, v5

        b 2f
1:
.irp i, v8.8h, v9.8h, v10.8h, v11.8h, v12.8h, v13.8h, v14.8h, v15.8h
        movi \i, #0
.endr
2:
        movz w16, #2896*8, lsl #16
        dup v0.2s, w16

        movi v4.4s, #0
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        ld1 {\i}, [x2]
        st1 {v4.4s}, [x2], x11
.endr

        scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
        blr x4

        sqrshrn v16.4h, v16.4s, #1
        sqrshrn v17.4h, v17.4s, #1
        sqrshrn v18.4h, v18.4s, #1
        sqrshrn v19.4h, v19.4s, #1
        sqrshrn2 v16.8h, v20.4s, #1
        sqrshrn2 v17.8h, v21.4s, #1
        sqrshrn2 v18.8h, v22.4s, #1
        sqrshrn2 v19.8h, v23.4s, #1

        mov v20.16b, v8.16b
        mov v21.16b, v9.16b
        mov v22.16b, v10.16b
        mov v23.16b, v11.16b

        transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5

        sqrshrn v8.4h, v24.4s, #1
        sqrshrn v9.4h, v25.4s, #1
        sqrshrn v10.4h, v26.4s, #1
        sqrshrn v11.4h, v27.4s, #1
        sqrshrn2 v8.8h, v28.4s, #1
        sqrshrn2 v9.8h, v29.4s, #1
        sqrshrn2 v10.8h, v30.4s, #1
        sqrshrn2 v11.8h, v31.4s, #1

        transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5

        blr x5

        mov x6, x0
        load_add_store_8x8 x6, x7

        mov v16.16b, v8.16b
        mov v17.16b, v9.16b
        mov v18.16b, v10.16b
        mov v19.16b, v11.16b
        mov v20.16b, v12.16b
        mov v21.16b, v13.16b
        mov v22.16b, v14.16b
        mov v23.16b, v15.16b

        blr x5

        add x0, x0, #16
        load_add_store_8x8 x0, x7

        ldp d14, d15, [sp, #0x30]
        ldp d12, d13, [sp, #0x20]
        ldp d10, d11, [sp, #0x10]
        ldp d8, d9, [sp], 0x40
        br x15
endfunc
function inv_txfm_add_8x16_neon
        mov x15, x30
        stp d8, d9, [sp, #-0x20]!
        stp d10, d11, [sp, #0x10]
        ldrh w12, [x13, #4]

        mov x11, #64

        cmp w3, w12
        ldrh w12, [x13, #2]
        b.lt 1f

        add x6, x2, #48
        movi v4.4s, #0
        movz w16, #2896*8, lsl #16
        dup v0.2s, w16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        ld1 {\i}, [x6]
        st1 {v4.4s}, [x6], x11
.endr
        scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        blr x4

        sqrshrn v28.4h, v16.4s, #1
        sqrshrn v29.4h, v17.4s, #1
        sqrshrn v30.4h, v18.4s, #1
        sqrshrn v31.4h, v19.4s, #1
        sqrshrn2 v28.8h, v20.4s, #1
        sqrshrn2 v29.8h, v21.4s, #1
        sqrshrn2 v30.8h, v22.4s, #1
        sqrshrn2 v31.8h, v23.4s, #1
        transpose_4x8h v28, v29, v30, v31, v2, v3, v4, v5

        b 2f

1:
.irp i, v28.8h, v29.8h, v30.8h, v31.8h
        movi \i, #0
.endr

2:
        cmp w3, w12
        ldrh w12, [x13, #0]
        b.lt 1f

        add x6, x2, #32
        movi v4.4s, #0
        movz w16, #2896*8, lsl #16
        dup v0.2s, w16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        ld1 {\i}, [x6]
        st1 {v4.4s}, [x6], x11
.endr
        scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        blr x4

        sqrshrn v24.4h, v16.4s, #1
        sqrshrn v25.4h, v17.4s, #1
        sqrshrn v26.4h, v18.4s, #1
        sqrshrn v27.4h, v19.4s, #1
        sqrshrn2 v24.8h, v20.4s, #1
        sqrshrn2 v25.8h, v21.4s, #1
        sqrshrn2 v26.8h, v22.4s, #1
        sqrshrn2 v27.8h, v23.4s, #1
        transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5

        b 2f

1:
.irp i, v24.8h, v25.8h, v26.8h, v27.8h
        movi \i, #0
.endr

2:
        cmp w3, w12
        b.lt 1f

        add x6, x2, #16
        movi v4.4s, #0
        movz w16, #2896*8, lsl #16
        dup v0.2s, w16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        ld1 {\i}, [x6]
        st1 {v4.4s}, [x6], x11
.endr
        scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        blr x4

        sqrshrn v8.4h, v16.4s, #1
        sqrshrn v9.4h, v17.4s, #1
        sqrshrn v10.4h, v18.4s, #1
        sqrshrn v11.4h, v19.4s, #1
        sqrshrn2 v8.8h, v20.4s, #1
        sqrshrn2 v9.8h, v21.4s, #1
        sqrshrn2 v10.8h, v22.4s, #1
        sqrshrn2 v11.8h, v23.4s, #1
        transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5

        b 2f

1:
.irp i, v8.8h, v9.8h, v10.8h, v11.8h
        movi \i, #0
.endr

2:
        movi v4.4s, #0
        movz w16, #2896*8, lsl #16
        dup v0.2s, w16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        ld1 {\i}, [x2]
        st1 {v4.4s}, [x2], x11
.endr
        scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        blr x4

        sqrshrn v16.4h, v16.4s, #1
        sqrshrn v17.4h, v17.4s, #1
        sqrshrn v18.4h, v18.4s, #1
        sqrshrn v19.4h, v19.4s, #1
        sqrshrn2 v16.8h, v20.4s, #1
        sqrshrn2 v17.8h, v21.4s, #1
        sqrshrn2 v18.8h, v22.4s, #1
        sqrshrn2 v19.8h, v23.4s, #1
        transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5

        mov v20.16b, v8.16b
        mov v21.16b, v9.16b
        mov v22.16b, v10.16b
        mov v23.16b, v11.16b

        blr x5

        load_add_store_8x16 x0, x6

        ldp d10, d11, [sp, #0x10]
        ldp d8, d9, [sp], 0x20

        br x15
endfunc

const eob_8x16
        .short 10, 43, 75, 128
endconst

const eob_8x16_identity1
        .short 4, 64, 96, 128
endconst

const eob_8x16_identity2
        .short 4, 8, 12, 128
endconst
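// For \h == 8 only the first threshold is relevant, so def_fn_816 loads it
// into w13 directly; for \h == 16 the add function reads the whole list via
// x13.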
.macro def_fn_816 w, h, txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc \w, \h, 1
.endif
        adr x4, inv_\txfm1\()_4s_x\w\()_neon
        movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon)
.ifc \txfm1, identity
.ifc \txfm2, identity
        movrel x13, eob_8x16
.else
        movrel x13, eob_8x16_identity1
.endif
.else
.ifc \txfm2, identity
        movrel x13, eob_8x16_identity2
.else
        movrel x13, eob_8x16
.endif
.endif
.if \h == 8
        ldrh w13, [x13]
.endif
        b inv_txfm_add_\w\()x\h\()_neon
endfunc
.endm

.macro def_fns_816 w, h
def_fn_816 \w, \h, dct, dct
def_fn_816 \w, \h, identity, identity
def_fn_816 \w, \h, dct, adst
def_fn_816 \w, \h, dct, flipadst
def_fn_816 \w, \h, dct, identity
def_fn_816 \w, \h, adst, dct
def_fn_816 \w, \h, adst, adst
def_fn_816 \w, \h, adst, flipadst
def_fn_816 \w, \h, flipadst, dct
def_fn_816 \w, \h, flipadst, adst
def_fn_816 \w, \h, flipadst, flipadst
def_fn_816 \w, \h, identity, dct
def_fn_816 \w, \h, adst, identity
def_fn_816 \w, \h, flipadst, identity
def_fn_816 \w, \h, identity, adst
def_fn_816 \w, \h, identity, flipadst
.endm

def_fns_816 8, 16
def_fns_816 16, 8
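// inv_dct32_odd_4s_x16_neon computes the odd half of a 32-point DCT from
// the 16 odd-indexed inputs in v16-v31, in place; the horizontal wrapper
// below combines it with a regular 16-point DCT of the even inputs.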
function inv_dct32_odd_4s_x16_neon
        movrel          x16, idct_coeffs, 4*16
        ld1             {v0.4s, v1.4s}, [x16], #32

        mul_mls         v2,  v16, v31, v0.s[0], v0.s[1] // -> t16a
        mul_mla         v4,  v16, v31, v0.s[1], v0.s[0] // -> t31a
        mul_mls         v6,  v24, v23, v0.s[2], v0.s[3] // -> t17a
        srshr           v16.4s, v2.4s, #12              // t16a
        srshr           v31.4s, v4.4s, #12              // t31a
        mul_mla         v2,  v24, v23, v0.s[3], v0.s[2] // -> t30a
        mul_mls         v4,  v20, v27, v1.s[0], v1.s[1] // -> t18a
        srshr           v24.4s, v6.4s, #12              // t17a
        srshr           v23.4s, v2.4s, #12              // t30a
        mul_mla         v6,  v20, v27, v1.s[1], v1.s[0] // -> t29a
        mul_mls         v2,  v28, v19, v1.s[2], v1.s[3] // -> t19a
        srshr           v20.4s, v4.4s, #12              // t18a
        srshr           v27.4s, v6.4s, #12              // t29a
        mul_mla         v4,  v28, v19, v1.s[3], v1.s[2] // -> t28a
        ld1             {v0.4s, v1.4s}, [x16]
        sub             x16, x16, #4*24
        mul_mls         v6,  v18, v29, v0.s[0], v0.s[1] // -> t20a
        srshr           v28.4s, v2.4s, #12              // t19a
        srshr           v19.4s, v4.4s, #12              // t28a
        mul_mla         v2,  v18, v29, v0.s[1], v0.s[0] // -> t27a
        mul_mls         v4,  v26, v21, v0.s[2], v0.s[3] // -> t21a
        srshr           v18.4s, v6.4s, #12              // t20a
        srshr           v29.4s, v2.4s, #12              // t27a
        mul_mla         v6,  v26, v21, v0.s[3], v0.s[2] // -> t26a
        mul_mls         v2,  v22, v25, v1.s[0], v1.s[1] // -> t22a
        srshr           v26.4s, v4.4s, #12              // t21a
        srshr           v21.4s, v6.4s, #12              // t26a
        mul_mla         v4,  v22, v25, v1.s[1], v1.s[0] // -> t25a
        mul_mls         v6,  v30, v17, v1.s[2], v1.s[3] // -> t23a
        srshr           v22.4s, v2.4s, #12              // t22a
        srshr           v25.4s, v4.4s, #12              // t25a
        mul_mla         v2,  v30, v17, v1.s[3], v1.s[2] // -> t24a
        srshr           v30.4s, v6.4s, #12              // t23a
        srshr           v17.4s, v2.4s, #12              // t24a

        ld1             {v0.4s, v1.4s}, [x16]

        sqsub           v2.4s,  v16.4s, v24.4s // t17
        sqadd           v16.4s, v16.4s, v24.4s // t16
        sqsub           v3.4s,  v31.4s, v23.4s // t30
        sqadd           v31.4s, v31.4s, v23.4s // t31
        sqsub           v24.4s, v28.4s, v20.4s // t18
        sqadd           v28.4s, v28.4s, v20.4s // t19
        sqadd           v23.4s, v18.4s, v26.4s // t20
        sqsub           v18.4s, v18.4s, v26.4s // t21
        sqsub           v20.4s, v30.4s, v22.4s // t22
        sqadd           v30.4s, v30.4s, v22.4s // t23
        sqadd           v26.4s, v17.4s, v25.4s // t24
        sqsub           v17.4s, v17.4s, v25.4s // t25
        sqsub           v22.4s, v29.4s, v21.4s // t26
        sqadd           v29.4s, v29.4s, v21.4s // t27
        sqadd           v25.4s, v19.4s, v27.4s // t28
        sqsub           v19.4s, v19.4s, v27.4s // t29

        mul_mls         v4,  v3,  v2,  v1.s[0], v1.s[1] // -> t17a
        mul_mla         v6,  v3,  v2,  v1.s[1], v1.s[0] // -> t30a
        mul_mla         v2,  v19, v24, v1.s[1], v1.s[0] // -> t18a
        srshr           v21.4s, v4.4s, #12              // t17a
        srshr           v27.4s, v6.4s, #12              // t30a
        neg             v2.4s,  v2.4s                   // -> t18a
        mul_mls         v4,  v19, v24, v1.s[0], v1.s[1] // -> t29a
        mul_mls         v6,  v22, v18, v1.s[2], v1.s[3] // -> t21a
        srshr           v19.4s, v2.4s, #12              // t18a
        srshr           v24.4s, v4.4s, #12              // t29a
        mul_mla         v2,  v22, v18, v1.s[3], v1.s[2] // -> t26a
        mul_mla         v4,  v17, v20, v1.s[3], v1.s[2] // -> t22a
        srshr           v22.4s, v6.4s, #12              // t21a
        srshr           v18.4s, v2.4s, #12              // t26a
        neg             v4.4s,  v4.4s                   // -> t22a
        mul_mls         v6,  v17, v20, v1.s[2], v1.s[3] // -> t25a
        srshr           v17.4s, v4.4s, #12              // t22a
        srshr           v20.4s, v6.4s, #12              // t25a

        sqsub           v2.4s,  v27.4s, v24.4s // t29
        sqadd           v27.4s, v27.4s, v24.4s // t30
        sqsub           v3.4s,  v21.4s, v19.4s // t18
        sqadd           v21.4s, v21.4s, v19.4s // t17
        sqsub           v24.4s, v16.4s, v28.4s // t19a
        sqadd           v16.4s, v16.4s, v28.4s // t16a
        sqsub           v19.4s, v30.4s, v23.4s // t20a
        sqadd           v30.4s, v30.4s, v23.4s // t23a
        sqsub           v28.4s, v17.4s, v22.4s // t21
        sqadd           v17.4s, v17.4s, v22.4s // t22
        sqadd           v23.4s, v26.4s, v29.4s // t24a
        sqsub           v26.4s, v26.4s, v29.4s // t27a
        sqadd           v22.4s, v20.4s, v18.4s // t25
        sqsub           v20.4s, v20.4s, v18.4s // t26
        sqsub           v29.4s, v31.4s, v25.4s // t28a
        sqadd           v31.4s, v31.4s, v25.4s // t31a

        mul_mls         v4,  v2,  v3,  v0.s[2], v0.s[3] // -> t18a
        mul_mla         v6,  v2,  v3,  v0.s[3], v0.s[2] // -> t29a
        mul_mls         v2,  v29, v24, v0.s[2], v0.s[3] // -> t19
        srshr           v18.4s, v4.4s, #12              // t18a
        srshr           v25.4s, v6.4s, #12              // t29a
        mul_mla         v4,  v29, v24, v0.s[3], v0.s[2] // -> t28
        mul_mla         v6,  v26, v19, v0.s[3], v0.s[2] // -> t20
        srshr           v29.4s, v2.4s, #12              // t19
        srshr           v24.4s, v4.4s, #12              // t28
        neg             v6.4s,  v6.4s                   // -> t20
        mul_mls         v2,  v26, v19, v0.s[2], v0.s[3] // -> t27
        mul_mla         v4,  v20, v28, v0.s[3], v0.s[2] // -> t21a
        srshr           v26.4s, v6.4s, #12              // t20
        srshr           v19.4s, v2.4s, #12              // t27
        neg             v4.4s,  v4.4s                   // -> t21a
        mul_mls         v6,  v20, v28, v0.s[2], v0.s[3] // -> t26a
        srshr           v20.4s, v4.4s, #12              // t21a
        srshr           v28.4s, v6.4s, #12              // t26a

        sqsub           v2.4s,  v16.4s, v30.4s // t23
        sqadd           v16.4s, v16.4s, v30.4s // t16 = out16
        sqsub           v3.4s,  v31.4s, v23.4s // t24
        sqadd           v31.4s, v31.4s, v23.4s // t31 = out31
        sqsub           v23.4s, v21.4s, v17.4s // t22a
        sqadd           v17.4s, v21.4s, v17.4s // t17a = out17
        sqadd           v30.4s, v27.4s, v22.4s // t30a = out30
        sqsub           v21.4s, v27.4s, v22.4s // t25a
        sqsub           v27.4s, v18.4s, v20.4s // t21
        sqadd           v18.4s, v18.4s, v20.4s // t18 = out18
        sqadd           v4.4s,  v29.4s, v26.4s // t19a = out19
        sqsub           v26.4s, v29.4s, v26.4s // t20a
        sqadd           v29.4s, v25.4s, v28.4s // t29 = out29
        sqsub           v25.4s, v25.4s, v28.4s // t26
        sqadd           v28.4s, v24.4s, v19.4s // t28a = out28
        sqsub           v24.4s, v24.4s, v19.4s // t27a
        mov             v19.16b, v4.16b        // out19

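        // All remaining rotations use the same coefficient, cos(pi/4)
        // (2896/4096 in Q12), i.e. they compute (a +/- b) * 2896 >> 12.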
        mul_mls         v4,  v24, v26, v0.s[0], v0.s[0] // -> t20
        mul_mla         v6,  v24, v26, v0.s[0], v0.s[0] // -> t27
        srshr           v20.4s, v4.4s, #12              // t20
        srshr           v22.4s, v6.4s, #12              // t27

        mul_mla         v4,  v25, v27, v0.s[0], v0.s[0] // -> t26a
        mul_mls         v6,  v25, v27, v0.s[0], v0.s[0] // -> t21a
        mov             v27.16b, v22.16b                // t27
        srshr           v26.4s, v4.4s, #12              // t26a

        mul_mls         v24, v21, v23, v0.s[0], v0.s[0] // -> t22
        mul_mla         v4,  v21, v23, v0.s[0], v0.s[0] // -> t25
        srshr           v21.4s, v6.4s, #12              // t21a
        srshr           v22.4s, v24.4s, #12             // t22
        srshr           v25.4s, v4.4s, #12              // t25

        mul_mls         v4,  v3,  v2,  v0.s[0], v0.s[0] // -> t23a
        mul_mla         v6,  v3,  v2,  v0.s[0], v0.s[0] // -> t24a
        srshr           v23.4s, v4.4s, #12              // t23a
        srshr           v24.4s, v6.4s, #12              // t24a

        ret
endfunc

.macro def_horz_32 scale=0, shift=2, suffix
function inv_txfm_horz\suffix\()_dct_32x4_neon
        mov             x14, x30
        movi            v7.4s, #0
        lsl             x8,  x8,  #1
.if \scale
        movz            w16, #2896*8, lsl #16
        dup             v0.2s, w16
.endif

.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        ld1             {\i}, [x7]
        st1             {v7.4s}, [x7], x8
.endr
        sub             x7,  x7,  x8, lsl #4
        add             x7,  x7,  x8, lsr #1
.if \scale
        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
.endif
        bl              inv_dct_4s_x16_neon
        transpose_4x4s  v16, v17, v18, v19, v2, v3, v4, v5
        transpose_4x4s  v20, v21, v22, v23, v2, v3, v4, v5
        transpose_4x4s  v24, v25, v26, v27, v2, v3, v4, v5
        transpose_4x4s  v28, v29, v30, v31, v2, v3, v4, v5

.macro store1 r0, r1, r2, r3
        st1             {\r0}, [x6], #16
        st1             {\r1}, [x6], #16
        st1             {\r2}, [x6], #16
        st1             {\r3}, [x6], #16
.endm
        store1          v16.4s, v20.4s, v24.4s, v28.4s
        store1          v17.4s, v21.4s, v25.4s, v29.4s
        store1          v18.4s, v22.4s, v26.4s, v30.4s
        store1          v19.4s, v23.4s, v27.4s, v31.4s
.purgem store1
        sub             x6,  x6,  #64*4

        movi            v7.4s, #0
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        ld1             {\i}, [x7]
        st1             {v7.4s}, [x7], x8
.endr
.if \scale
        // This relies on the fact that the idct also leaves the right coeff in v0.s[1]
        scale_input     .4s, v0.s[1], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input     .4s, v0.s[1], v24, v25, v26, v27, v28, v29, v30, v31
.endif
        bl              inv_dct32_odd_4s_x16_neon
        transpose_4x4s  v31, v30, v29, v28, v2, v3, v4, v5
        transpose_4x4s  v27, v26, v25, v24, v2, v3, v4, v5
        transpose_4x4s  v23, v22, v21, v20, v2, v3, v4, v5
        transpose_4x4s  v19, v18, v17, v16, v2, v3, v4, v5
.macro store2 r0, r1, r2, r3, shift
        ld1             {v0.4s, v1.4s, v2.4s, v3.4s}, [x6]
        sqsub           v4.4s, v0.4s, \r0
        sqadd           v0.4s, v0.4s, \r0
        sqsub           v5.4s, v1.4s, \r1
        sqadd           v1.4s, v1.4s, \r1
        sqsub           v6.4s, v2.4s, \r2
        sqadd           v2.4s, v2.4s, \r2
        sqsub           v7.4s, v3.4s, \r3
        sqadd           v3.4s, v3.4s, \r3
        sqrshrn         v0.4h, v0.4s, #\shift
        sqrshrn2        v0.8h, v1.4s, #\shift
        sqrshrn         v1.4h, v2.4s, #\shift
        sqrshrn2        v1.8h, v3.4s, #\shift
        sqrshrn         v2.4h, v7.4s, #\shift
        sqrshrn2        v2.8h, v6.4s, #\shift
        sqrshrn         v3.4h, v5.4s, #\shift
        sqrshrn2        v3.8h, v4.4s, #\shift
        st1             {v0.8h, v1.8h}, [x6], #32
        rev64           v2.8h, v2.8h
        rev64           v3.8h, v3.8h
        st1             {v2.8h, v3.8h}, [x6], #32
.endm

        store2          v31.4s, v27.4s, v23.4s, v19.4s, \shift
        store2          v30.4s, v26.4s, v22.4s, v18.4s, \shift
        store2          v29.4s, v25.4s, v21.4s, v17.4s, \shift
        store2          v28.4s, v24.4s, v20.4s, v16.4s, \shift
.purgem store2
        br              x14
endfunc
.endm

def_horz_32 scale=0, shift=2
def_horz_32 scale=1, shift=1, suffix=_scale

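// Second pass for 32-line-high transforms: loads 8-coefficient-wide
// columns from the intermediate buffer at x7 (stride x8), runs the
// even and odd 16-point halves, then combines the two halves and adds
// the result to the destination at x6, clamping to the 0..0x3ff pixel
// range.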
function inv_txfm_add_vert_dct_8x32_neon
        mov             x14, x30
        lsl             x8,  x8,  #1

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().8h}, [x7], x8
.endr
        sub             x7,  x7,  x8, lsl #4

        bl              X(inv_dct_8h_x16_neon)

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        st1             {v\i\().8h}, [x7], x8
.endr
        sub             x7,  x7,  x8, lsl #4
        add             x7,  x7,  x8, lsr #1

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().8h}, [x7], x8
.endr
        sub             x7,  x7,  x8, lsl #4
        sub             x7,  x7,  x8, lsr #1
        bl              X(inv_dct32_odd_8h_x16_neon)

        neg             x9,  x8
        mov             x10, x6
        movi            v0.8h, #0
        mvni            v1.8h, #0xfc, lsl #8 // 0x3ff
.macro combine r0, r1, r2, r3, op, stride
        ld1             {v5.8h}, [x7],  \stride
        ld1             {v2.8h}, [x10], x1
        ld1             {v6.8h}, [x7],  \stride
        ld1             {v3.8h}, [x10], x1
        \op             v5.8h, v5.8h, \r0
        ld1             {v7.8h}, [x7],  \stride
        ld1             {v4.8h}, [x10], x1
        srshr           v5.8h, v5.8h, #4
        \op             v6.8h, v6.8h, \r1
        sqadd           v5.8h, v5.8h, v2.8h
        srshr           v6.8h, v6.8h, #4
        \op             v7.8h, v7.8h, \r2
        smax            v2.8h, v5.8h, v0.8h
        ld1             {v5.8h}, [x7],  \stride
        sqadd           v6.8h, v6.8h, v3.8h
        smin            v2.8h, v2.8h, v1.8h
        srshr           v7.8h, v7.8h, #4
        \op             v5.8h, v5.8h, \r3
        st1             {v2.8h}, [x6],  x1
        ld1             {v2.8h}, [x10], x1
        smax            v3.8h, v6.8h, v0.8h
        sqadd           v7.8h, v7.8h, v4.8h
        smin            v3.8h, v3.8h, v1.8h
        srshr           v5.8h, v5.8h, #4
        st1             {v3.8h}, [x6],  x1
        smax            v4.8h, v7.8h, v0.8h
        sqadd           v5.8h, v5.8h, v2.8h
        smin            v4.8h, v4.8h, v1.8h
        st1             {v4.8h}, [x6],  x1
        smax            v2.8h, v5.8h, v0.8h
        smin            v2.8h, v2.8h, v1.8h
        st1             {v2.8h}, [x6],  x1
.endm
        combine         v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8
        combine         v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8
        combine         v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8
        combine         v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8
        sub             x7,  x7,  x8
        combine         v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9
        combine         v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9
        combine         v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9
        combine         v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9
.purgem combine

        br              x14
endfunc

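// Lists of the start of each eob range, for each transform size; the
// eob of the current block (in w3) is compared against these
// thresholds to decide how many strips of the first pass actually
// need to be computed before the rest can be zero-filled.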
const eob_32x32
        .short 10, 36, 78, 136, 210, 300, 406, 1024
endconst

const eob_16x32
        .short 10, 36, 78, 151, 215, 279, 343, 512
endconst

const eob_16x32_shortside
        .short 10, 36, 78, 512
endconst

const eob_8x32
        .short 10, 43, 75, 107, 139, 171, 203, 256
endconst

function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1
        movi            v0.8h, #0
        movi            v1.8h, #0
        movrel          x13, eob_32x32, 2

        mov             x8,  #4*32
1:
        mov             w9,  #0
        movrel          x12, eob_32x32, 2
2:
        add             w9,  w9,  #8
        ld1             {v16.4s, v17.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v18.4s, v19.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v20.4s, v21.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v22.4s, v23.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v24.4s, v25.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v26.4s, v27.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v28.4s, v29.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v30.4s, v31.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        sqxtn           v16.4h, v16.4s
        sqxtn2          v16.8h, v17.4s
        sqxtn           v17.4h, v18.4s
        sqxtn2          v17.8h, v19.4s
        sqxtn           v18.4h, v20.4s
        sqxtn2          v18.8h, v21.4s
        sqxtn           v19.4h, v22.4s
        sqxtn2          v19.8h, v23.4s
        sqxtn           v20.4h, v24.4s
        sqxtn2          v20.8h, v25.4s
        sqxtn           v21.4h, v26.4s
        sqxtn2          v21.8h, v27.4s
        sqxtn           v22.4h, v28.4s
        sqxtn2          v22.8h, v29.4s
        sqxtn           v23.4h, v30.4s
        sqxtn2          v23.8h, v31.4s
        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5

        load_add_store_8x8 x0, x7, shiftbits=2
        ldrh            w11, [x12], #4
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #2*8
        cmp             w3,  w11
        b.ge            2b

        ldrh            w11, [x13], #4
        cmp             w3,  w11
        b.lt            9f

        sub             x0,  x0,  w9, uxtw #1
        add             x0,  x0,  x1, lsl #3
        msub            x2,  x8,  x9,  x2
        add             x2,  x2,  #4*8
        b               1b
9:
        ret
endfunc

.macro shift_16_regs op, shift
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        \op             \i,  \i,  #\shift
.endr
.endm

.macro def_identity_1632 w, h, wshort, hshort
function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
        movz            w16, #2896*8, lsl #16
        movz            w17, #2*(5793-4096)*8, lsl #16
        movi            v0.4s, #0
        movi            v1.4s, #0
        movrel          x13, eob_16x32\hshort, 2

        mov             x8,  #4*\h
1:
        mov             w9,  #0
        movrel          x12, eob_16x32\wshort, 2
2:
        add             w9,  w9,  #8
        ld1             {v16.4s, v17.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        dup             v2.2s, w16
        ld1             {v18.4s, v19.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        mov             v2.s[1], w17
        ld1             {v20.4s, v21.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v22.4s, v23.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v24.4s, v25.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v26.4s, v27.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v28.4s, v29.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v30.4s, v31.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        scale_input     .4s, v2.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input     .4s, v2.s[0], v24, v25, v26, v27, v28, v29, v30, v31

.if \w == 16
        // 16x32
        identity_4x16_shift1 v2.s[1]
.else
        // 32x16
        shift_16_regs   sqshl, 1
        identity_4x16   v2.s[1]
.endif
        sqxtn           v16.4h, v16.4s
        sqxtn2          v16.8h, v17.4s
        sqxtn           v17.4h, v18.4s
        sqxtn2          v17.8h, v19.4s
        sqxtn           v18.4h, v20.4s
        sqxtn2          v18.8h, v21.4s
        sqxtn           v19.4h, v22.4s
        sqxtn2          v19.8h, v23.4s
        sqxtn           v20.4h, v24.4s
        sqxtn2          v20.8h, v25.4s
        sqxtn           v21.4h, v26.4s
        sqxtn2          v21.8h, v27.4s
        sqxtn           v22.4h, v28.4s
        sqxtn2          v22.8h, v29.4s
        sqxtn           v23.4h, v30.4s
        sqxtn2          v23.8h, v31.4s

        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5

.if \w == 16
        load_add_store_8x8 x0, x7, shiftbits=2
.else
        load_add_store_8x8 x0, x7, shiftbits=4
.endif
        ldrh            w11, [x12], #4
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #16
        cmp             w3,  w11
        b.ge            2b

        ldrh            w11, [x13], #4
        cmp             w3,  w11
        b.lt            9f

        sub             x0,  x0,  w9, uxtw #1
        add             x0,  x0,  x1, lsl #3
        msub            x2,  x8,  x9,  x2
        add             x2,  x2,  #4*8
        b               1b
9:
        ret
endfunc
.endm

def_identity_1632 16, 32, _shortside,
def_identity_1632 32, 16, , _shortside

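// For 8x32 and 32x8, both identity scalings are exact powers of two
// (identity8 doubles the input, identity32 quadruples it), so unlike
// the 16x32/32x16 case above no multiplications are needed; the
// scaling folds into the final narrowing shifts.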
.macro def_identity_832 w, h
function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
        movi            v0.4s, #0
        movi            v1.4s, #0
        // Working on 8x8 blocks, read every other entry from eob_8x32
        movrel          x13, eob_8x32, 2

        mov             w8,  #4*\h
1:
        // Working on 8x8 blocks, read every other entry from eob_8x32
        ldrh            w12, [x13], #4
        ld1             {v16.4s, v17.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v18.4s, v19.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v20.4s, v21.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v22.4s, v23.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v24.4s, v25.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v26.4s, v27.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v28.4s, v29.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v30.4s, v31.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8

.if \w == 8
        sqrshrn         v16.4h, v16.4s, #1
        sqrshrn2        v16.8h, v17.4s, #1
        sqrshrn         v17.4h, v18.4s, #1
        sqrshrn2        v17.8h, v19.4s, #1
        sqrshrn         v18.4h, v20.4s, #1
        sqrshrn2        v18.8h, v21.4s, #1
        sqrshrn         v19.4h, v22.4s, #1
        sqrshrn2        v19.8h, v23.4s, #1
        sqrshrn         v20.4h, v24.4s, #1
        sqrshrn2        v20.8h, v25.4s, #1
        sqrshrn         v21.4h, v26.4s, #1
        sqrshrn2        v21.8h, v27.4s, #1
        sqrshrn         v22.4h, v28.4s, #1
        sqrshrn2        v22.8h, v29.4s, #1
        sqrshrn         v23.4h, v30.4s, #1
        sqrshrn2        v23.8h, v31.4s, #1
.else
        sqxtn           v16.4h, v16.4s
        sqxtn2          v16.8h, v17.4s
        sqxtn           v17.4h, v18.4s
        sqxtn2          v17.8h, v19.4s
        sqxtn           v18.4h, v20.4s
        sqxtn2          v18.8h, v21.4s
        sqxtn           v19.4h, v22.4s
        sqxtn2          v19.8h, v23.4s
        sqxtn           v20.4h, v24.4s
        sqxtn2          v20.8h, v25.4s
        sqxtn           v21.4h, v26.4s
        sqxtn2          v21.8h, v27.4s
        sqxtn           v22.4h, v28.4s
        sqxtn2          v22.8h, v29.4s
        sqxtn           v23.4h, v30.4s
        sqxtn2          v23.8h, v31.4s
.endif

        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5

        cmp             w3,  w12
.if \w == 8
        load_add_store_8x8 x0, x7, shiftbits=2
.else
        load_add_store_8x8 x0, x7, shiftbits=3
.endif

        b.lt            9f
.if \w == 8
        sub             x2,  x2,  x8, lsl #3
        add             x2,  x2,  #4*8
.else
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #2*8
.endif
        b               1b

9:
        ret
endfunc
.endm

def_identity_832 8, 32
def_identity_832 32, 8

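// The dct_dct functions for 32x32 and larger work in two passes over
// a stack buffer: the first pass transforms horizontal 4-line strips
// into the buffer (skipping strips that the eob thresholds show to be
// entirely zero, and zero-filling their output instead), the second
// pass transforms 8-pixel-wide vertical columns and adds them to the
// destination.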
function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1
        idct_dc         32,  32,  2

        mov             x15, x30
        sub             sp,  sp,  #2048
        movrel          x13, eob_32x32
        ldrh            w12, [x13], #2

.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x6,  sp,  #(\i*32*2)
.if \i > 0
        mov             w8,  #(32 - \i)
        cmp             w3,  w12
        b.lt            1f
.if \i < 28
        ldrh            w12, [x13], #2
.endif
.endif
        add             x7,  x2,  #(\i*4)
        mov             x8,  #32*4
        bl              inv_txfm_horz_dct_32x4_neon
.endr
        b               3f

1:
        movi            v4.8h, #0
        movi            v5.8h, #0
        movi            v6.8h, #0
        movi            v7.8h, #0
2:
        subs            w8,  w8,  #4
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8, 16, 24
        add             x6,  x0,  #(\i*2)
        add             x7,  sp,  #(\i*2)
        mov             x8,  #32*2
        bl              inv_txfm_add_vert_dct_8x32_neon
.endr

        add             sp,  sp,  #2048
        br              x15
endfunc

function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1
        idct_dc         16,  32,  1

        mov             x15, x30
        sub             sp,  sp,  #1024
        movrel          x13, eob_16x32
        ldrh            w12, [x13], #2
        adr             x4,  inv_dct_4s_x16_neon

.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x6,  sp,  #(\i*16*2)
        add             x7,  x2,  #(\i*4)
.if \i > 0
        mov             w8,  #(32 - \i)
        cmp             w3,  w12
        b.lt            1f
.if \i < 28
        ldrh            w12, [x13], #2
.endif
.endif
        mov             x8,  #4*32
        bl              inv_txfm_horz_scale_16x4_neon
.endr
        b               3f

1:
        movi            v4.8h, #0
        movi            v5.8h, #0
        movi            v6.8h, #0
        movi            v7.8h, #0
2:
        subs            w8,  w8,  #4
.rept 2
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8
        add             x6,  x0,  #(\i*2)
        add             x7,  sp,  #(\i*2)
        mov             x8,  #16*2
        bl              inv_txfm_add_vert_dct_8x32_neon
.endr

        add             sp,  sp,  #1024
        br              x15
endfunc

function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1
        idct_dc         32,  16,  1

        mov             x15, x30
        sub             sp,  sp,  #1024

        movrel          x13, eob_16x32
        movrel          x5,  X(inv_dct_8h_x16_neon)
        ldrh            w12, [x13], #2

.irp i, 0, 4, 8, 12
        add             x6,  sp,  #(\i*32*2)
        add             x7,  x2,  #(\i*4)
.if \i > 0
        mov             w8,  #(16 - \i)
        cmp             w3,  w12
        b.lt            1f
        ldrh            w12, [x13], #2
.endif
        mov             x8,  #4*16
        bl              inv_txfm_horz_scale_dct_32x4_neon
.endr
        b               3f

1:
        movi            v4.8h, #0
        movi            v5.8h, #0
        movi            v6.8h, #0
        movi            v7.8h, #0
2:
        subs            w8,  w8,  #4
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8, 16, 24
        add             x6,  x0,  #(\i*2)
        add             x7,  sp,  #(\i*2)
        mov             x8,  #32*2
        bl              inv_txfm_add_vert_8x16_neon
.endr

        add             sp,  sp,  #1024
        br              x15
endfunc

function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1
        idct_dc         8,   32,  2

        mov             x15, x30
        sub             sp,  sp,  #512

        movrel          x13, eob_8x32

        movi            v28.4s, #0
        mov             x8,  #4*32
        mov             w9,  #32
        mov             x6,  sp
        mov             x7,  x2
1:
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        ld1             {v\i\().4s}, [x7]
        st1             {v28.4s}, [x7], x8
.endr
        ldrh            w12, [x13], #2
        sub             w9,  w9,  #4
        sub             x7,  x7,  x8, lsl #3
        add             x7,  x7,  #4*4

        bl              inv_dct_4s_x8_neon

        sqrshrn         v16.4h, v16.4s, #2
        sqrshrn         v17.4h, v17.4s, #2
        sqrshrn         v18.4h, v18.4s, #2
        sqrshrn         v19.4h, v19.4s, #2
        sqrshrn2        v16.8h, v20.4s, #2
        sqrshrn2        v17.8h, v21.4s, #2
        sqrshrn2        v18.8h, v22.4s, #2
        sqrshrn2        v19.8h, v23.4s, #2

        transpose_4x8h  v16, v17, v18, v19, v2, v3, v4, v5

        cmp             w3,  w12
        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64

        b.ge            1b
        cbz             w9,  3f

        movi            v29.8h, #0
        movi            v30.8h, #0
        movi            v31.8h, #0
2:
        subs            w9,  w9,  #4
        st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x6], #64
        b.gt            2b

3:
        mov             x6,  x0
        mov             x7,  sp
        mov             x8,  #8*2
        bl              inv_txfm_add_vert_dct_8x32_neon

        add             sp,  sp,  #512
        br              x15
endfunc

function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1
        idct_dc         32,  8,   2

        mov             x15, x30
        sub             sp,  sp,  #512

.irp i, 0, 4
        add             x6,  sp,  #(\i*32*2)
        add             x7,  x2,  #(\i*4)
.if \i > 0
        cmp             w3,  #10
        b.lt            1f
.endif
        mov             x8,  #8*4
        bl              inv_txfm_horz_dct_32x4_neon
.endr
        b               2f

1:
        movi            v4.8h, #0
        movi            v5.8h, #0
        movi            v6.8h, #0
        movi            v7.8h, #0
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr

2:
        mov             x8,  #2*32
        mov             w9,  #0
1:
        add             x6,  x0,  x9, lsl #1
        add             x7,  sp,  x9, lsl #1 // #(\i*2)

.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        ld1             {v\i\().8h}, [x7], x8
.endr
        add             w9,  w9,  #8

        bl              X(inv_dct_8h_x8_neon)

        cmp             w9,  #32

        load_add_store_8x8 x6, x7

        b.lt            1b

        add             sp,  sp,  #512
        br              x15
endfunc

function inv_dct64_step1_neon
        // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
        // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
        // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
        // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a

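        // Each call handles one of the four input groups above: x17
        // points at the matching part of idct64_coeffs, the four input
        // rows arrive in v16-v19, and the eight resulting intermediate
        // rows are stored through x6.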
        ld1             {v0.4s, v1.4s}, [x17], #32

        sqrdmulh        v23.4s, v16.4s, v0.s[1] // t63a
        sqrdmulh        v16.4s, v16.4s, v0.s[0] // t32a
        sqrdmulh        v22.4s, v17.4s, v0.s[2] // t62a
        sqrdmulh        v17.4s, v17.4s, v0.s[3] // t33a
        sqrdmulh        v21.4s, v18.4s, v1.s[1] // t61a
        sqrdmulh        v18.4s, v18.4s, v1.s[0] // t34a
        sqrdmulh        v20.4s, v19.4s, v1.s[2] // t60a
        sqrdmulh        v19.4s, v19.4s, v1.s[3] // t35a

        ld1             {v0.4s}, [x17], #16

        sqadd           v24.4s, v16.4s, v17.4s // t32
        sqsub           v25.4s, v16.4s, v17.4s // t33
        sqsub           v26.4s, v19.4s, v18.4s // t34
        sqadd           v27.4s, v19.4s, v18.4s // t35
        sqadd           v28.4s, v20.4s, v21.4s // t60
        sqsub           v29.4s, v20.4s, v21.4s // t61
        sqsub           v30.4s, v23.4s, v22.4s // t62
        sqadd           v31.4s, v23.4s, v22.4s // t63

        mul_mla         v2,  v29, v26, v0.s[0], v0.s[1] // -> t34a
        mul_mls         v4,  v29, v26, v0.s[1], v0.s[0] // -> t61a
        neg             v2.4s,  v2.4s                   // t34a
        mul_mls         v6,  v30, v25, v0.s[1], v0.s[0] // -> t33a
        srshr           v26.4s, v2.4s, #12              // t34a
        mul_mla         v2,  v30, v25, v0.s[0], v0.s[1] // -> t62a
        srshr           v29.4s, v4.4s, #12              // t61a
        srshr           v25.4s, v6.4s, #12              // t33a
        srshr           v30.4s, v2.4s, #12              // t62a

        sqadd           v16.4s, v24.4s, v27.4s // t32a
        sqsub           v19.4s, v24.4s, v27.4s // t35a
        sqadd           v17.4s, v25.4s, v26.4s // t33
        sqsub           v18.4s, v25.4s, v26.4s // t34
        sqsub           v20.4s, v31.4s, v28.4s // t60a
        sqadd           v23.4s, v31.4s, v28.4s // t63a
        sqsub           v21.4s, v30.4s, v29.4s // t61
        sqadd           v22.4s, v30.4s, v29.4s // t62

        mul_mla         v2,  v21, v18, v0.s[2], v0.s[3] // -> t61a
        mul_mls         v4,  v21, v18, v0.s[3], v0.s[2] // -> t34a
        mul_mla         v6,  v20, v19, v0.s[2], v0.s[3] // -> t60
        srshr           v21.4s, v2.4s, #12              // t61a
        srshr           v18.4s, v4.4s, #12              // t34a
        mul_mls         v2,  v20, v19, v0.s[3], v0.s[2] // -> t35
        srshr           v20.4s, v6.4s, #12              // t60
        srshr           v19.4s, v2.4s, #12              // t35

        st1             {v16.4s, v17.4s, v18.4s, v19.4s}, [x6], #64
        st1             {v20.4s, v21.4s, v22.4s, v23.4s}, [x6], #64

        ret
endfunc

function inv_dct64_step2_neon
        movrel          x16, idct_coeffs
        ld1             {v0.4s}, [x16]
1:
        // t32a/33/34a/35/60/61a/62/63a
        // t56a/57/58a/59/36/37a/38/39a
        // t40a/41/42a/43/52/53a/54/55a
        // t48a/49/50a/51/44/45a/46/47a
        ldr             q16, [x6, #4*4*0]  // t32a
        ldr             q17, [x9, #4*4*8]  // t39a
        ldr             q18, [x9, #4*4*0]  // t63a
        ldr             q19, [x6, #4*4*8]  // t56a
        ldr             q20, [x6, #4*4*16] // t40a
        ldr             q21, [x9, #4*4*24] // t47a
        ldr             q22, [x9, #4*4*16] // t55a
        ldr             q23, [x6, #4*4*24] // t48a

        sqadd           v24.4s, v16.4s, v17.4s // t32
        sqsub           v25.4s, v16.4s, v17.4s // t39
        sqadd           v26.4s, v18.4s, v19.4s // t63
        sqsub           v27.4s, v18.4s, v19.4s // t56
        sqsub           v28.4s, v21.4s, v20.4s // t40
        sqadd           v29.4s, v21.4s, v20.4s // t47
        sqadd           v30.4s, v23.4s, v22.4s // t48
        sqsub           v31.4s, v23.4s, v22.4s // t55

        mul_mla         v2,  v27, v25, v0.s[3], v0.s[2] // -> t56a
        mul_mls         v4,  v27, v25, v0.s[2], v0.s[3] // -> t39a
        mul_mla         v6,  v31, v28, v0.s[3], v0.s[2] // -> t40a
        srshr           v25.4s, v2.4s, #12              // t56a
        srshr           v27.4s, v4.4s, #12              // t39a
        neg             v6.4s,  v6.4s                   // t40a
        mul_mls         v2,  v31, v28, v0.s[2], v0.s[3] // -> t55a
        srshr           v31.4s, v6.4s, #12              // t40a
        srshr           v28.4s, v2.4s, #12              // t55a

        sqadd           v16.4s, v24.4s, v29.4s // t32a
        sqsub           v19.4s, v24.4s, v29.4s // t47a
        sqadd           v17.4s, v27.4s, v31.4s // t39
        sqsub           v18.4s, v27.4s, v31.4s // t40
        sqsub           v20.4s, v26.4s, v30.4s // t48a
        sqadd           v23.4s, v26.4s, v30.4s // t63a
        sqsub           v21.4s, v25.4s, v28.4s // t55
        sqadd           v22.4s, v25.4s, v28.4s // t56

        mul_mls         v2,  v21, v18, v0.s[0], v0.s[0] // -> t40a
        mul_mla         v4,  v21, v18, v0.s[0], v0.s[0] // -> t55a
        mul_mls         v6,  v20, v19, v0.s[0], v0.s[0] // -> t47
        srshr           v18.4s, v2.4s, #12              // t40a
        srshr           v21.4s, v4.4s, #12              // t55a
        mul_mla         v2,  v20, v19, v0.s[0], v0.s[0] // -> t48
        srshr           v19.4s, v6.4s, #12              // t47
        srshr           v20.4s, v2.4s, #12              // t48

        str             q16, [x6, #4*4*0]  // t32a
        str             q17, [x9, #4*4*0]  // t39
        str             q18, [x6, #4*4*8]  // t40a
        str             q19, [x9, #4*4*8]  // t47
        str             q20, [x6, #4*4*16] // t48
        str             q21, [x9, #4*4*16] // t55a
        str             q22, [x6, #4*4*24] // t56
        str             q23, [x9, #4*4*24] // t63a

        add             x6,  x6,  #4*4
        sub             x9,  x9,  #4*4
        cmp             x6,  x9
        b.lt            1b
        ret
endfunc

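// Helper macros for the dct64 functions below; the conditional *_if
// variants expand to nothing unless the condition is set, so that a
// single macro body can generate the plain, coefficient-clearing and
// scaling variants of the transform.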
.macro load8 src, strd, zero, clear
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
.if \clear
        ld1             {\i}, [\src]
        st1             {\zero}, [\src], \strd
.else
        ld1             {\i}, [\src], \strd
.endif
.endr
.endm

.macro store16 dst
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        st1             {\i}, [\dst], #16
.endr
.endm

.macro clear_upper8
.irp i, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        movi            \i,  #0
.endr
.endm

.macro movi_if reg, val, cond
.if \cond
        movi            \reg, \val
.endif
.endm

.macro movz16dup_if reg, gpr, val, cond
.if \cond
        movz            \gpr, \val, lsl #16
        dup             \reg, \gpr
.endif
.endm

.macro st1_if regs, dst, cond
.if \cond
        st1             \regs, \dst
.endif
.endm

.macro str_if reg, dst, cond
.if \cond
        str             \reg, \dst
.endif
.endm

.macro stroff_if reg, dst, dstoff, cond
.if \cond
        str             \reg, \dst, \dstoff
.endif
.endm

.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7
.if \cond
        scale_input     .4s, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endif
.endm

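// The 64-point DCT is assembled from smaller transforms: a 16-point
// DCT over every 4th input row, the odd half of a 32-point DCT over
// the remaining even rows, and four inv_dct64_step1 passes over
// groups of the odd input rows, which inv_dct64_step2 then merges.
// Input rows past the first 32 are assumed to be zero.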
.macro def_dct64_func suffix, clear=0, scale=0
function inv_txfm_dct\suffix\()_4s_x64_neon
        mov             x14, x30
        mov             x6,  sp
        lsl             x8,  x8,  #2

        movz16dup_if    v0.2s, w16, #2896*8, \scale
        movi_if         v7.4s, #0, \clear
        load8           x7,  x8,  v7.4s, \clear
        clear_upper8
        sub             x7,  x7,  x8, lsl #3
        add             x7,  x7,  x8, lsr #1
        scale_if        \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23

        bl              inv_dct_4s_x16_neon

        store16         x6

        movz16dup_if    v0.2s, w16, #2896*8, \scale
        movi_if         v7.8h, #0, \clear
        load8           x7,  x8,  v7.4s, \clear
        clear_upper8
        sub             x7,  x7,  x8, lsl #3
        lsr             x8,  x8,  #1
        sub             x7,  x7,  x8, lsr #1
        scale_if        \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23

        bl              inv_dct32_odd_4s_x16_neon

        add             x10, x6,  #16*15
        sub             x6,  x6,  #16*16

        mov             x9,  #-16

.macro store_addsub r0, r1, r2, r3
        ld1             {v2.4s}, [x6], #16
        ld1             {v3.4s}, [x6], #16
        sqadd           v6.4s, v2.4s, \r0
        sqsub           \r0,   v2.4s, \r0
        ld1             {v4.4s}, [x6], #16
        sqadd           v7.4s, v3.4s, \r1
        sqsub           \r1,   v3.4s, \r1
        ld1             {v5.4s}, [x6], #16
        sqadd           v2.4s, v4.4s, \r2
        sub             x6,  x6,  #16*4
        sqsub           \r2,   v4.4s, \r2
        st1             {v6.4s}, [x6],  #16
        st1             {\r0},   [x10], x9
        sqadd           v3.4s, v5.4s, \r3
        sqsub           \r3,   v5.4s, \r3
        st1             {v7.4s}, [x6],  #16
        st1             {\r1},   [x10], x9
        st1             {v2.4s}, [x6],  #16
        st1             {\r2},   [x10], x9
        st1             {v3.4s}, [x6],  #16
        st1             {\r3},   [x10], x9
.endm
        store_addsub    v31.4s, v30.4s, v29.4s, v28.4s
        store_addsub    v27.4s, v26.4s, v25.4s, v24.4s
        store_addsub    v23.4s, v22.4s, v21.4s, v20.4s
        store_addsub    v19.4s, v18.4s, v17.4s, v16.4s
.purgem store_addsub

        add             x6,  x6,  #4*4*16

        movrel          x17, idct64_coeffs
        movz16dup_if    v0.2s, w16, #2896*8, \scale
        movi_if         v7.4s, #0, \clear
        add             x9,  x7,  x8, lsl #4 // offset 16
        add             x10, x7,  x8, lsl #3 // offset 8
        sub             x9,  x9,  x8         // offset 15
        sub             x11, x10, x8         // offset 7
        ld1             {v16.4s}, [x7]  // in1  (offset 0)
        ld1             {v17.4s}, [x9]  // in31 (offset 15)
        ld1             {v18.4s}, [x10] // in17 (offset 8)
        ld1             {v19.4s}, [x11] // in15 (offset 7)
        st1_if          {v7.4s}, [x7],  \clear
        st1_if          {v7.4s}, [x9],  \clear
        st1_if          {v7.4s}, [x10], \clear
        st1_if          {v7.4s}, [x11], \clear
        scale_if        \scale, v0.s[0], v16, v17, v18, v19
        bl              inv_dct64_step1_neon
        movz16dup_if    v0.2s, w16, #2896*8, \scale
        movi_if         v7.4s, #0, \clear
        add             x7,  x7,  x8, lsl #2 // offset 4
        sub             x9,  x9,  x8, lsl #2 // offset 11
        sub             x10, x7,  x8         // offset 3
        add             x11, x9,  x8         // offset 12
        ld1             {v16.4s}, [x10] // in7  (offset 3)
        ld1             {v17.4s}, [x11] // in25 (offset 12)
        ld1             {v18.4s}, [x9]  // in23 (offset 11)
        ld1             {v19.4s}, [x7]  // in9  (offset 4)
        st1_if          {v7.4s}, [x7],  \clear
        st1_if          {v7.4s}, [x9],  \clear
        st1_if          {v7.4s}, [x10], \clear
        st1_if          {v7.4s}, [x11], \clear
        scale_if        \scale, v0.s[0], v16, v17, v18, v19
        bl              inv_dct64_step1_neon
        movz16dup_if    v0.2s, w16, #2896*8, \scale
        movi_if         v7.4s, #0, \clear
        sub             x10, x10, x8, lsl #1 // offset 1
        sub             x9,  x9,  x8, lsl #1 // offset 9
        add             x7,  x7,  x8         // offset 5
        add             x11, x11, x8         // offset 13
        ldr             q16, [x10, x8]  // in5  (offset 2)
        ldr             q17, [x11]      // in27 (offset 13)
        ldr             q18, [x9, x8]   // in21 (offset 10)
        ldr             q19, [x7]       // in11 (offset 5)
        stroff_if       q7,  [x10, x8], \clear
        str_if          q7,  [x11],     \clear
        stroff_if       q7,  [x9, x8],  \clear
        str_if          q7,  [x7],      \clear
        scale_if        \scale, v0.s[0], v16, v17, v18, v19
        bl              inv_dct64_step1_neon
        movz16dup_if    v0.2s, w16, #2896*8, \scale
        movi_if         v7.4s, #0, \clear
        ldr             q16, [x10]      // in3  (offset 1)
        ldr             q17, [x11, x8]  // in29 (offset 14)
        ldr             q18, [x9]       // in19 (offset 9)
        ldr             q19, [x7, x8]   // in13 (offset 6)
        str_if          q7,  [x10],     \clear
        stroff_if       q7,  [x11, x8], \clear
        str_if          q7,  [x9],      \clear
        stroff_if       q7,  [x7, x8],  \clear
        scale_if        \scale, v0.s[0], v16, v17, v18, v19
        bl              inv_dct64_step1_neon

        sub             x6,  x6,  #4*4*32
        add             x9,  x6,  #4*4*7

        bl              inv_dct64_step2_neon

        br              x14
endfunc
.endm

def_dct64_func _clear, clear=1
def_dct64_func _clear_scale, clear=1, scale=1

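// First pass helper for 64-point transforms: reads 4-row strips of a
// transformed 64x4 slice from the scratch buffer at sp, transposes
// them, applies the rounding shift passed in w12 (as a negative shift
// amount, via srshl), narrows to 16 bit and stores the two mirrored
// output halves through x6 and x9.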
function inv_txfm_horz_dct_64x4_neon
        mov             x14, x30

        mov             x7,  sp
        add             x8,  sp,  #4*4*(64 - 4)
        add             x9,  x6,  #2*56
        mov             x10, #2*64
        mov             x11, #-4*4*4

        dup             v7.4s, w12
1:
        ld1             {v16.4s, v17.4s, v18.4s, v19.4s}, [x7], #64
        ld1             {v28.4s, v29.4s, v30.4s, v31.4s}, [x8], x11
        ld1             {v20.4s, v21.4s, v22.4s, v23.4s}, [x7], #64
        ld1             {v24.4s, v25.4s, v26.4s, v27.4s}, [x8], x11
        transpose_4x4s  v16, v17, v18, v19, v2, v3, v4, v5
        transpose_4x4s  v20, v21, v22, v23, v2, v3, v4, v5
        transpose_4x4s  v31, v30, v29, v28, v2, v3, v4, v5
        transpose_4x4s  v27, v26, v25, v24, v2, v3, v4, v5

.macro store_addsub src0, src1, src2, src3
        sqsub           v1.4s, \src0, \src1
        sqadd           v0.4s, \src0, \src1
        sqsub           v3.4s, \src2, \src3
        srshl           v1.4s, v1.4s, v7.4s
        sqadd           v2.4s, \src2, \src3
        srshl           v3.4s, v3.4s, v7.4s
        srshl           v0.4s, v0.4s, v7.4s
        srshl           v2.4s, v2.4s, v7.4s
        sqxtn           v3.4h, v3.4s
        sqxtn2          v3.8h, v1.4s
        sqxtn           v0.4h, v0.4s
        sqxtn2          v0.8h, v2.4s
        rev64           v3.8h, v3.8h
        st1             {v0.8h}, [x6], x10
        st1             {v3.8h}, [x9], x10
.endm
        store_addsub    v16.4s, v31.4s, v20.4s, v27.4s
        store_addsub    v17.4s, v30.4s, v21.4s, v26.4s
        store_addsub    v18.4s, v29.4s, v22.4s, v25.4s
        store_addsub    v19.4s, v28.4s, v23.4s, v24.4s
.purgem store_addsub
        sub             x6,  x6,  x10, lsl #2
        sub             x9,  x9,  x10, lsl #2
        add             x6,  x6,  #16
        sub             x9,  x9,  #16

        cmp             x7,  x8
        b.lt            1b
        br              x14
endfunc

function inv_txfm_add_vert_dct_8x64_neon
        mov             x14, x30
        lsl             x8,  x8,  #1

        mov             x7,  sp
        add             x8,  sp,  #2*8*(64 - 4)
        add             x9,  x6,  x1, lsl #6
        sub             x9,  x9,  x1
        neg             x10, x1
        mov             x11, #-2*8*4

1:
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64
        ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
        ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11

        movi            v6.8h, #0
        mvni            v7.8h, #0xfc, lsl #8 // 0x3ff
.macro add_dest_addsub src0, src1, src2, src3
        ld1             {v0.8h}, [x6], x1
        ld1             {v1.8h}, [x9], x10
        sqadd           v4.8h, \src0, \src1
        ld1             {v2.8h}, [x6]
        sqsub           \src0, \src0, \src1
        ld1             {v3.8h}, [x9]
        sqadd           v5.8h, \src2, \src3
        sqsub           \src2, \src2, \src3
        sub             x6,  x6,  x1
        sub             x9,  x9,  x10
        srshr           v4.8h, v4.8h, #4
        srshr           v5.8h, v5.8h, #4
        srshr           \src0, \src0, #4
        sqadd           v0.8h, v0.8h, v4.8h
        srshr           \src2, \src2, #4
        sqadd           v1.8h, v1.8h, \src0
        sqadd           v2.8h, v2.8h, v5.8h
        smax            v0.8h, v0.8h, v6.8h
        sqadd           v3.8h, v3.8h, \src2
        smax            v1.8h, v1.8h, v6.8h
        smin            v0.8h, v0.8h, v7.8h
        smax            v2.8h, v2.8h, v6.8h
        smin            v1.8h, v1.8h, v7.8h
        st1             {v0.8h}, [x6], x1
        smax            v3.8h, v3.8h, v6.8h
        smin            v2.8h, v2.8h, v7.8h
        st1             {v1.8h}, [x9], x10
        smin            v3.8h, v3.8h, v7.8h
        st1             {v2.8h}, [x6], x1
        st1             {v3.8h}, [x9], x10
.endm
        add_dest_addsub v16.8h, v31.8h, v17.8h, v30.8h
        add_dest_addsub v18.8h, v29.8h, v19.8h, v28.8h
        add_dest_addsub v20.8h, v27.8h, v21.8h, v26.8h
        add_dest_addsub v22.8h, v25.8h, v23.8h, v24.8h
.purgem add_dest_addsub
        cmp             x7,  x8
        b.lt            1b

        br              x14
endfunc

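// Subtract \space from sp in a way that works on all targets: on
// Windows the stack guard page mechanism requires touching each newly
// committed page while moving the stack pointer down, and in general
// a single sub only takes a 12-bit immediate (optionally shifted left
// by 12), which limits how much can be subtracted per instruction.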
.macro sub_sp space
#ifdef _WIN32
.if \space > 8192
        // Here, we'd need to touch two (or more) pages while decrementing
        // the stack pointer.
        .error          "sub_sp_align doesn't support values over 8K at the moment"
.elseif \space > 4096
        sub             x16, sp,  #4096
        ldr             xzr, [x16]
        sub             sp,  x16, #(\space - 4096)
.else
        sub             sp,  sp,  #\space
.endif
#else
.if \space >= 4096
        sub             sp,  sp,  #(\space)/4096*4096
.endif
.if (\space % 4096) != 0
        sub             sp,  sp,  #(\space)%4096
.endif
#endif
.endm

function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1
        idct_dc         64,  64,  2

        mov             x15, x30

        sub_sp          64*32*2+64*4*4
        add             x5,  sp,  #64*4*4

        movrel          x13, eob_32x32

.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x6,  x5,  #(\i*64*2)
.if \i > 0
        mov             w8,  #(32 - \i)
        cmp             w3,  w12
        b.lt            1f
.endif
        add             x7,  x2,  #(\i*4)
        mov             x8,  #32*4
        mov             x12, #-2 // shift
        bl              inv_txfm_dct_clear_4s_x64_neon
        add             x6,  x5,  #(\i*64*2)
        bl              inv_txfm_horz_dct_64x4_neon
.if \i < 28
        ldrh            w12, [x13], #2
.endif
.endr
        b               3f

1:
        movi            v4.8h, #0
        movi            v5.8h, #0
        movi            v6.8h, #0
        movi            v7.8h, #0
2:
        subs            w8,  w8,  #2
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8, 16, 24, 32, 40, 48, 56
        add             x7,  x5,  #(\i*2)
        mov             x8,  #64*2
        bl              X(inv_txfm_dct_8h_x64_neon)
        add             x6,  x0,  #(\i*2)
        bl              inv_txfm_add_vert_dct_8x64_neon
.endr

        add             sp,  x5,  #64*32*2
        br              x15
endfunc

function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1
        idct_dc         64,  32,  1

        mov             x15, x30

        sub_sp          64*32*2+64*4*4
        add             x5,  sp,  #64*4*4

        movrel          x13, eob_32x32

.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x6,  x5,  #(\i*64*2)
.if \i > 0
        mov             w8,  #(32 - \i)
        cmp             w3,  w12
        b.lt            1f
.endif
        add             x7,  x2,  #(\i*4)
        mov             x8,  #32*4
        mov             x12, #-1 // shift
        bl              inv_txfm_dct_clear_scale_4s_x64_neon
        add             x6,  x5,  #(\i*64*2)
        bl              inv_txfm_horz_dct_64x4_neon
.if \i < 28
        ldrh            w12, [x13], #2
.endif
.endr
        b               3f

1:
        movi            v4.8h, #0
        movi            v5.8h, #0
        movi            v6.8h, #0
        movi            v7.8h, #0
2:
        subs            w8,  w8,  #2
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8, 16, 24, 32, 40, 48, 56
        add             x6,  x0,  #(\i*2)
        add             x7,  x5,  #(\i*2)
        mov             x8,  #64*2
        bl              inv_txfm_add_vert_dct_8x32_neon
.endr

        add             sp,  x5,  #64*32*2
        br              x15
endfunc

function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1
        idct_dc         32,  64,  1

        mov             x15, x30

        sub_sp          32*32*2+64*8*2
        add             x5,  sp,  #64*8*2

        movrel          x13, eob_32x32
        ldrh            w12, [x13], #2

.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x6,  x5,  #(\i*32*2)
.if \i > 0
        mov             w8,  #(32 - \i)
        cmp             w3,  w12
        b.lt            1f
        ldrh            w12, [x13], #2
.endif
        add             x7,  x2,  #(\i*4)
        mov             x8,  #32*4
        bl              inv_txfm_horz_scale_dct_32x4_neon
.endr
        b               3f

1:
        movi            v4.8h, #0
        movi            v5.8h, #0
        movi            v6.8h, #0
        movi            v7.8h, #0
2:
        subs            w8,  w8,  #4
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8, 16, 24
        add             x7,  x5,  #(\i*2)
        mov             x8,  #32*2
        bl              X(inv_txfm_dct_8h_x64_neon)
        add             x6,  x0,  #(\i*2)
        bl              inv_txfm_add_vert_dct_8x64_neon
.endr

        add             sp,  x5,  #32*32*2
        br              x15
endfunc

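// The 64x16 and 16x64 functions follow the same two-pass scheme as
// above, pairing the 64-point transform with the 16-point one in the
// other dimension.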
function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1
        idct_dc         64,  16,  2

        mov             x15, x30

        sub_sp          64*16*2+64*4*4
        add             x4,  sp,  #64*4*4

        movrel          x13, eob_16x32

.irp i, 0, 4, 8, 12
        add             x6,  x4,  #(\i*64*2)
.if \i > 0
        mov             w8,  #(16 - \i)
        cmp             w3,  w12
        b.lt            1f
.endif
        add             x7,  x2,  #(\i*4)
        mov             x8,  #16*4
        mov             x12, #-2 // shift
        bl              inv_txfm_dct_clear_4s_x64_neon
        add             x6,  x4,  #(\i*64*2)
        bl              inv_txfm_horz_dct_64x4_neon
.if \i < 12
        ldrh            w12, [x13], #2
.endif
.endr
        b               3f

1:
        movi            v4.8h, #0
        movi            v5.8h, #0
        movi            v6.8h, #0
        movi            v7.8h, #0
2:
        subs            w8,  w8,  #2
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
        movrel          x5,  X(inv_dct_8h_x16_neon)
.irp i, 0, 8, 16, 24, 32, 40, 48, 56
        add             x6,  x0,  #(\i*2)
        add             x7,  x4,  #(\i*2)
        mov             x8,  #64*2
        bl              inv_txfm_add_vert_8x16_neon
.endr

        add             sp,  x4,  #64*16*2
        br              x15
endfunc

function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1
        idct_dc         16,  64,  2

        mov             x15, x30

        sub_sp          16*32*2+64*8*2
        add             x5,  sp,  #64*8*2

        movrel          x13, eob_16x32
        ldrh            w12, [x13], #2

        adr             x4,  inv_dct_4s_x16_neon
.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x6,  x5,  #(\i*16*2)
.if \i > 0
        mov             w8,  #(32 - \i)
        cmp             w3,  w12
        b.lt            1f
        ldrh            w12, [x13], #2
.endif
        add             x7,  x2,  #(\i*4)
        mov             x8,  #32*4
        bl              inv_txfm_horz_16x4_neon
.endr
        b               3f

1:
        movi            v4.8h, #0
        movi            v5.8h, #0
        movi            v6.8h, #0
        movi            v7.8h, #0
2:
        subs            w8,  w8,  #4
.rept 2
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8
        add             x7,  x5,  #(\i*2)
        mov             x8,  #16*2
        bl              X(inv_txfm_dct_8h_x64_neon)
        add             x6,  x0,  #(\i*2)
        bl              inv_txfm_add_vert_dct_8x64_neon
.endr

        add             sp,  x5,  #16*32*2
        br              x15
endfunc