/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

// AArch64 NEON H.264 inverse transforms.
// NOTE(review): `function`/`endfunc`/`movrel` and the transpose macros come
// from the project headers below — this file does not assemble without them.

#include "libavutil/aarch64/asm.S"
#include "neon.S"

// 4x4 IDCT + add to destination.
// In:  x0 = dst, x1 = block (int16_t[16], zeroed on return), w2 = stride
// The .L_ label is an indirect-call target for the add16/add16intra/add8
// dispatch loops below (reached via movrel + blr).
function ff_h264_idct_add_neon, export=1
.L_ff_h264_idct_add_neon:
        ld1             {v0.4H, v1.4H, v2.4H, v3.4H},  [x1]
        sxtw            x2,  w2                 // stride arrives as 32-bit
        movi            v30.8H, #0

        // first (row) pass; zero the coefficient block as it is consumed
        add             v4.4H,  v0.4H,  v2.4H
        sshr            v16.4H, v1.4H,  #1
        st1             {v30.8H},    [x1], #16
        sshr            v17.4H, v3.4H,  #1
        st1             {v30.8H},    [x1], #16
        sub             v5.4H,  v0.4H,  v2.4H
        sub             v6.4H,  v16.4H, v3.4H
        add             v7.4H,  v1.4H,  v17.4H
        add             v0.4H,  v4.4H,  v7.4H
        add             v1.4H,  v5.4H,  v6.4H
        sub             v2.4H,  v5.4H,  v6.4H
        sub             v3.4H,  v4.4H,  v7.4H

        transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7

        // second (column) pass, interleaved with loading the 4 dst rows
        add             v4.4H,  v0.4H,  v2.4H
        ld1             {v18.S}[0], [x0], x2
        sshr            v16.4H,  v3.4H,  #1
        sshr            v17.4H,  v1.4H,  #1
        ld1             {v18.S}[1], [x0], x2
        sub             v5.4H,  v0.4H,  v2.4H
        ld1             {v19.S}[1], [x0], x2
        add             v6.4H,  v16.4H, v1.4H
        ins             v4.D[1], v5.D[0]        // pack rows pairwise into 8H
        sub             v7.4H,  v17.4H, v3.4H
        ld1             {v19.S}[0], [x0], x2
        ins             v6.D[1], v7.D[0]
        sub             x0,  x0,  x2, lsl #2    // rewind dst to first row
        add             v0.8H,  v4.8H,  v6.8H
        sub             v1.8H,  v4.8H,  v6.8H

        srshr           v0.8H,  v0.8H,  #6      // rounding shift by 2^6
        srshr           v1.8H,  v1.8H,  #6

        uaddw           v0.8H,  v0.8H,  v18.8B
        uaddw           v1.8H,  v1.8H,  v19.8B

        sqxtun          v0.8B,  v0.8H           // saturate back to u8
        sqxtun          v1.8B,  v1.8H

        // store order mirrors the load order of v18/v19 lanes above
        st1             {v0.S}[0],  [x0], x2
        st1             {v0.S}[1],  [x0], x2
        st1             {v1.S}[1],  [x0], x2
        st1             {v1.S}[0],  [x0], x2

        sub             x1,  x1,  #32           // restore block pointer
        ret
endfunc

// DC-only 4x4 add: broadcast block[0], round, add to 4 dst rows.
// In:  x0 = dst, x1 = block (block[0] cleared), w2 = stride
function ff_h264_idct_dc_add_neon, export=1
.L_ff_h264_idct_dc_add_neon:
        sxtw            x2,  w2
        mov             w3,       #0
        ld1r            {v2.8H},  [x1]          // splat DC coefficient
        strh            w3,       [x1]          // clear consumed coefficient
        srshr           v2.8H,  v2.8H,  #6
        ld1             {v0.S}[0],  [x0], x2
        ld1             {v0.S}[1],  [x0], x2
        uaddw           v3.8H,  v2.8H,  v0.8B
        ld1             {v1.S}[0],  [x0], x2
        ld1             {v1.S}[1],  [x0], x2
        uaddw           v4.8H,  v2.8H,  v1.8B
        sqxtun          v0.8B,  v3.8H
        sqxtun          v1.8B,  v4.8H
        sub             x0,  x0,  x2, lsl #2
        st1             {v0.S}[0],  [x0], x2
        st1             {v0.S}[1],  [x0], x2
        st1             {v1.S}[0],  [x0], x2
        st1             {v1.S}[1],  [x0], x2
        ret
endfunc

// Loop over 16 4x4 blocks, dispatching to full or DC-only IDCT per block
// based on nnzc[scan8[i]] and block[i*16].
// In:  x0 = dst, x1 = block_offset, x2 = block, w3 = stride, x4 = nnzc
function ff_h264_idct_add16_neon, export=1
        mov             x12, x30                // save LR (blr clobbers x30)
        mov             x6,  x0                 // dest
        mov             x5,  x1                 // block_offset
        mov             x1,  x2                 // block
        mov             w9,  w3                 // stride
        movrel          x7,  scan8
        mov             x10, #16                // 16 blocks
        movrel          x13, .L_ff_h264_idct_dc_add_neon
        movrel          x14, .L_ff_h264_idct_add_neon
1:      mov             w2,  w9
        ldrb            w3,  [x7], #1
        ldrsw           x0,  [x5], #4
        ldrb            w3,  [x4, w3, uxtw]     // nnzc[ scan8[i] ]
        subs            w3,  w3,  #1
        b.lt            2f                      // nnzc == 0: skip block
        ldrsh           w3,  [x1]               // block[i*16] (DC coeff)
        add             x0,  x0,  x6
        ccmp            w3,  #0,  #4, eq        // nnzc==1 && DC!=0 ?
        csel            x15, x13, x14, ne       // pick DC-only vs full IDCT
        blr             x15
2:      subs            x10, x10, #1
        add             x1,  x1,  #32           // next 4x4 coeff block
        b.ne            1b
        ret             x12
endfunc

// As add16, but for intra blocks: the DC-only path is taken whenever
// nnzc[scan8[i]] == 0 and block[i*16] != 0; both zero skips the block.
function ff_h264_idct_add16intra_neon, export=1
        mov             x12, x30
        mov             x6,  x0                 // dest
        mov             x5,  x1                 // block_offset
        mov             x1,  x2                 // block
        mov             w9,  w3                 // stride
        movrel          x7,  scan8
        mov             x10, #16
        movrel          x13, .L_ff_h264_idct_dc_add_neon
        movrel          x14, .L_ff_h264_idct_add_neon
1:      mov             w2,  w9
        ldrb            w3,  [x7], #1
        ldrsw           x0,  [x5], #4
        ldrb            w3,  [x4, w3, uxtw]
        add             x0,  x0,  x6
        cmp             w3,  #0
        ldrsh           w3,  [x1]
        csel            x15, x13, x14, eq
        ccmp            w3,  #0,  #0, eq        // nnzc==0 && block[i*16]==0 ?
        b.eq            2f                      // nothing to add
        blr             x15
2:      subs            x10, x10, #1
        add             x1,  x1,  #32
        b.ne            1b
        ret             x12
endfunc

// Chroma variant: iterates i = 16..19 on dest[0], then 32..35 on dest[1].
// In:  x0 = dest[2] array, x1 = block_offset, x2 = block, w3 = stride,
//      x4 = nnzc
function ff_h264_idct_add8_neon, export=1
        sub             sp,  sp,  #0x40         // keep sp 16-byte aligned
        stp             x19, x20, [sp]          // callee-saved scratch
        mov             x12, x30
        ldp             x6,  x15, [x0]          // dest[0], dest[1]
        add             x5,  x1,  #16*4         // block_offset
        add             x9,  x2,  #16*32        // block
        mov             w19, w3                 // stride
        movrel          x13, .L_ff_h264_idct_dc_add_neon
        movrel          x14, .L_ff_h264_idct_add_neon
        movrel          x7,  scan8, 16
        mov             x10, #0
        mov             x11, #16
1:      mov             w2,  w19
        ldrb            w3,  [x7, x10]          // scan8[i]
        ldrsw           x0,  [x5, x10, lsl #2]  // block_offset[i]
        ldrb            w3,  [x4, w3, uxtw]     // nnzc[ scan8[i] ]
        add             x0,  x0,  x6            // block_offset[i] + dst[j-1]
        add             x1,  x9,  x10, lsl #5   // block + i * 16
        cmp             w3,  #0
        ldrsh           w3,  [x1]               // block[i*16]
        csel            x20, x13, x14, eq
        ccmp            w3,  #0,  #0,  eq
        b.eq            2f
        blr             x20
2:      add             x10, x10, #1
        cmp             x10, #4                 // finished first chroma plane?
        csel            x10, x11, x10, eq       // mov x10, #16
        csel            x6,  x15, x6,  eq       // switch to dest[1]
        cmp             x10, #20
        b.lt            1b
        ldp             x19, x20, [sp]
        add             sp,  sp,  #0x40
        ret             x12
endfunc

// One 1-D pass of the 8x8 IDCT over all eight columns (v24-v31).
// pass 0 additionally loads the last two coefficient rows and zeroes the
// block in memory; the va/vb aliases swap v18/v30 between the passes so
// the register layout matches before/after the transpose.
.macro  idct8x8_cols pass
  .if \pass == 0
        va      .req    v18
        vb      .req    v30
        sshr            v18.8H, v26.8H, #1
        add             v16.8H, v24.8H, v28.8H
        ld1             {v30.8H, v31.8H}, [x1]
        st1             {v19.8H}, [x1], #16
        st1             {v19.8H}, [x1], #16
        sub             v17.8H, v24.8H, v28.8H
        sshr            v19.8H, v30.8H, #1
        sub             v18.8H, v18.8H, v30.8H
        add             v19.8H, v19.8H, v26.8H
  .else
        va      .req    v30
        vb      .req    v18
        sshr            v30.8H, v26.8H, #1
        sshr            v19.8H, v18.8H, #1
        add             v16.8H, v24.8H, v28.8H
        sub             v17.8H, v24.8H, v28.8H
        sub             v30.8H, v30.8H, v18.8H
        add             v19.8H, v19.8H, v26.8H
  .endif
        // even half: butterfly of rows 0/2/4/6
        add             v26.8H, v17.8H, va.8H
        sub             v28.8H, v17.8H, va.8H
        add             v24.8H, v16.8H, v19.8H
        sub             vb.8H,  v16.8H, v19.8H
        // odd half: rows 1/3/5/7
        sub             v16.8H, v29.8H, v27.8H
        add             v17.8H, v31.8H, v25.8H
        sub             va.8H,  v31.8H, v25.8H
        add             v19.8H, v29.8H, v27.8H
        sub             v16.8H, v16.8H, v31.8H
        sub             v17.8H, v17.8H, v27.8H
        add             va.8H,  va.8H,  v29.8H
        add             v19.8H, v19.8H, v25.8H
        sshr            v25.8H, v25.8H, #1
        sshr            v27.8H, v27.8H, #1
        sshr            v29.8H, v29.8H, #1
        sshr            v31.8H, v31.8H, #1
        sub             v16.8H, v16.8H, v31.8H
        sub             v17.8H, v17.8H, v27.8H
        add             va.8H,  va.8H,  v29.8H
        add             v19.8H, v19.8H, v25.8H
        sshr            v25.8H, v16.8H, #2
        sshr            v27.8H, v17.8H, #2
        sshr            v29.8H, va.8H,  #2
        sshr            v31.8H, v19.8H, #2
        sub             v19.8H, v19.8H, v25.8H
        sub             va.8H,  v27.8H, va.8H
        add             v17.8H, v17.8H, v29.8H
        add             v16.8H, v16.8H, v31.8H
        // combine even and odd halves
  .if \pass == 0
        sub             v31.8H, v24.8H, v19.8H
        add             v24.8H, v24.8H, v19.8H
        add             v25.8H, v26.8H, v18.8H
        sub             v18.8H, v26.8H, v18.8H
        add             v26.8H, v28.8H, v17.8H
        add             v27.8H, v30.8H, v16.8H
        sub             v29.8H, v28.8H, v17.8H
        sub             v28.8H, v30.8H, v16.8H
  .else
        sub             v31.8H, v24.8H, v19.8H
        add             v24.8H, v24.8H, v19.8H
        add             v25.8H, v26.8H, v30.8H
        sub             v30.8H, v26.8H, v30.8H
        add             v26.8H, v28.8H, v17.8H
        sub             v29.8H, v28.8H, v17.8H
        add             v27.8H, v18.8H, v16.8H
        sub             v28.8H, v18.8H, v16.8H
  .endif
        .unreq          va
        .unreq          vb
.endm

// 8x8 IDCT + add to destination.
// In:  x0 = dst, x1 = block (int16_t[64], zeroed on return), w2 = stride
function ff_h264_idct8_add_neon, export=1
.L_ff_h264_idct8_add_neon:
        movi            v19.8H,   #0
        sxtw            x2,       w2
        // load the first six coefficient rows, zeroing as we go;
        // idct8x8_cols 0 loads and clears the remaining two
        ld1             {v24.8H, v25.8H}, [x1]
        st1             {v19.8H},  [x1], #16
        st1             {v19.8H},  [x1], #16
        ld1             {v26.8H, v27.8H}, [x1]
        st1             {v19.8H},  [x1], #16
        st1             {v19.8H},  [x1], #16
        ld1             {v28.8H, v29.8H}, [x1]
        st1             {v19.8H},  [x1], #16
        st1             {v19.8H},  [x1], #16

        idct8x8_cols    0
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
        idct8x8_cols    1

        mov             x3,  x0
        // round, load 8 dst rows, widen-add, narrow-saturate, store
        srshr           v24.8H, v24.8H, #6
        ld1             {v0.8B},  [x0], x2
        srshr           v25.8H, v25.8H, #6
        ld1             {v1.8B},  [x0], x2
        srshr           v26.8H, v26.8H, #6
        ld1             {v2.8B},  [x0], x2
        srshr           v27.8H, v27.8H, #6
        ld1             {v3.8B},  [x0], x2
        srshr           v28.8H, v28.8H, #6
        ld1             {v4.8B},  [x0], x2
        srshr           v29.8H, v29.8H, #6
        ld1             {v5.8B},  [x0], x2
        srshr           v30.8H, v30.8H, #6
        ld1             {v6.8B},  [x0], x2
        srshr           v31.8H, v31.8H, #6
        ld1             {v7.8B},  [x0], x2
        uaddw           v24.8H, v24.8H, v0.8B
        uaddw           v25.8H, v25.8H, v1.8B
        uaddw           v26.8H, v26.8H, v2.8B
        sqxtun          v0.8B,  v24.8H
        uaddw           v27.8H, v27.8H, v3.8B
        sqxtun          v1.8B,  v25.8H
        uaddw           v28.8H, v28.8H, v4.8B
        sqxtun          v2.8B,  v26.8H
        st1             {v0.8B},  [x3], x2
        uaddw           v29.8H, v29.8H, v5.8B
        sqxtun          v3.8B,  v27.8H
        st1             {v1.8B},  [x3], x2
        uaddw           v30.8H, v30.8H, v6.8B
        sqxtun          v4.8B,  v28.8H
        st1             {v2.8B},  [x3], x2
        uaddw           v31.8H, v31.8H, v7.8B
        sqxtun          v5.8B,  v29.8H
        st1             {v3.8B},  [x3], x2
        sqxtun          v6.8B,  v30.8H
        sqxtun          v7.8B,  v31.8H
        st1             {v4.8B},  [x3], x2
        st1             {v5.8B},  [x3], x2
        st1             {v6.8B},  [x3], x2
        st1             {v7.8B},  [x3], x2

        sub             x1,  x1,  #128          // restore block pointer
        ret
endfunc

// DC-only 8x8 add: broadcast block[0], round, add to 8 dst rows.
// In:  x0 = dst, x1 = block (block[0] cleared), w2 = stride
function ff_h264_idct8_dc_add_neon, export=1
.L_ff_h264_idct8_dc_add_neon:
        mov             w3,       #0
        sxtw            x2,       w2
        ld1r            {v31.8H}, [x1]
        strh            w3,       [x1]
        ld1             {v0.8B},  [x0], x2
        srshr           v31.8H, v31.8H, #6
        ld1             {v1.8B},  [x0], x2
        ld1             {v2.8B},  [x0], x2
        uaddw           v24.8H, v31.8H, v0.8B
        ld1             {v3.8B},  [x0], x2
        uaddw           v25.8H, v31.8H, v1.8B
        ld1             {v4.8B},  [x0], x2
        uaddw           v26.8H, v31.8H, v2.8B
        ld1             {v5.8B},  [x0], x2
        uaddw           v27.8H, v31.8H, v3.8B
        ld1             {v6.8B},  [x0], x2
        uaddw           v28.8H, v31.8H, v4.8B
        ld1             {v7.8B},  [x0], x2
        uaddw           v29.8H, v31.8H, v5.8B
        uaddw           v30.8H, v31.8H, v6.8B
        uaddw           v31.8H, v31.8H, v7.8B
        sqxtun          v0.8B,  v24.8H
        sqxtun          v1.8B,  v25.8H
        sqxtun          v2.8B,  v26.8H
        sqxtun          v3.8B,  v27.8H
        sub             x0,  x0,  x2, lsl #3    // rewind dst to first row
        st1             {v0.8B},  [x0], x2
        sqxtun          v4.8B,  v28.8H
        st1             {v1.8B},  [x0], x2
        sqxtun          v5.8B,  v29.8H
        st1             {v2.8B},  [x0], x2
        sqxtun          v6.8B,  v30.8H
        st1             {v3.8B},  [x0], x2
        sqxtun          v7.8B,  v31.8H
        st1             {v4.8B},  [x0], x2
        st1             {v5.8B},  [x0], x2
        st1             {v6.8B},  [x0], x2
        st1             {v7.8B},  [x0], x2
        ret
endfunc

// Loop over four 8x8 blocks, dispatching to full or DC-only 8x8 IDCT.
// In:  x0 = dst, x1 = block_offset, x2 = block, w3 = stride, x4 = nnzc
function ff_h264_idct8_add4_neon, export=1
        mov             x12, x30
        mov             x6,  x0
        mov             x5,  x1
        mov             x1,  x2
        mov             w2,  w3
        movrel          x7,  scan8
        mov             w10, #16
        movrel          x13, .L_ff_h264_idct8_dc_add_neon
        movrel          x14, .L_ff_h264_idct8_add_neon
1:      ldrb            w9,  [x7], #4           // scan8 stride 4: 8x8 blocks
        ldrsw           x0,  [x5], #16
        ldrb            w9,  [x4, w9, UXTW]
        subs            w9,  w9,  #1
        b.lt            2f
        ldrsh           w11, [x1]
        add             x0,  x6,  x0
        ccmp            w11, #0,  #4, eq
        csel            x15, x13, x14, ne
        blr             x15
2:      subs            w10, w10, #4
        add             x1,  x1,  #128          // next 8x8 coeff block
        b.ne            1b
        ret             x12
endfunc

// Raster-to-cache scan table indexing the nnzc[] array (entry i gives the
// nnzc position of the i-th 4x4 block; +16 offset used for chroma).
const   scan8
        .byte           4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
        .byte           6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
        .byte           4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
        .byte           6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
        .byte           4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
        .byte           6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
        .byte           4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
        .byte           6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
        .byte           4+11*8, 5+11*8, 4+12*8, 5+12*8
        .byte           6+11*8, 7+11*8, 6+12*8, 7+12*8
        .byte           4+13*8, 5+13*8, 4+14*8, 5+14*8
        .byte           6+13*8, 7+13*8, 6+14*8, 7+14*8
endconst