/*
 * ARM NEON IDCT
 *
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
 *
 * Based on Simple IDCT
 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// Fixed-point transform constants, Zi = cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z1  22725  // cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z2  21407  // cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z3  19266  // cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z4  16383  // i = 4: the formula evaluates to 16384, the table uses 16383
#define Z5  12873  // cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z6  8867   // cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z7  4520   // cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
// Column-pass rounding term expressed in units of Z4: idct_col4_neon adds it
// to the input before multiplying by Z4, giving about 1 << (COL_SHIFT - 1).
#define Z4c ((1<<(COL_SHIFT-1))/Z4)
#define ROW_SHIFT 11    // right shift applied after the row pass
#define COL_SHIFT 20    // total right shift applied after the column pass

// Lane aliases into v0, which idct_start loads from idct_coeff_neon.
#define z1 v0.H[0]
#define z2 v0.H[1]
#define z3 v0.H[2]
#define z4 v0.H[3]
#define z5 v0.H[4]
#define z6 v0.H[5]
#define z7 v0.H[6]
#define z4c v0.H[7]

// Eight 16-bit coefficients, in the lane order the z* aliases expect.
const   idct_coeff_neon, align=4
        .short          Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z4c
endconst

// Common prologue of the exported IDCT entry points.
//   data: register holding the address to prefetch
// Saves the link register in x10, because the entry points use bl (which
// overwrites x30) to reach the column helpers, and loads the coefficient
// table into v0.
// Clobbers: x3, x10, v0.
.macro idct_start data
        prfm            pldl1keep, [\data]
        mov             x10, x30
        movrel          x3, idct_coeff_neon
        ld1             {v0.2D}, [x3]
.endm

// Common epilogue: return to the address idct_start saved in x10.
// ret is used instead of br so that the branch is architecturally a function
// return: it pairs with the bl of the caller for return prediction, and it
// does not require a BTI landing pad at the return site when branch target
// enforcement is enabled.
.macro idct_end
        ret             x10
.endm

// Spellings of the base widening multiplies with a trailing 1, so that the
// macros below can pick the low-half or the upper-half (suffix 2) instruction
// simply by appending 1 or 2 to the mnemonic.
.macro smull1 dst, src, coef
        smull           \dst, \src, \coef
.endm

.macro smlal1 dst, src, coef
        smlal           \dst, \src, \coef
.endm

.macro smlsl1 dst, src, coef
        smlsl           \dst, \src, \coef
.endm

// First stage of the butterfly shared by the row and the column pass,
// computed for four 32-bit lanes.
//   c0..c3: input vector registers (c0 is accepted but not read here)
//   half:   1 or 2, selects the base or the upper-half multiply forms
//   sfx:    arrangement suffix appended to the inputs (.4h or .8h)
// Expects v23.4s to already hold the biased c0*z4 term.
// Produces:
//   v19 = v23 + c2*z2      v20 = v23 + c2*z6
//   v21 = v23 - c2*z6      v22 = v23 - c2*z2
//   v17 = c1*z1 + c3*z3    v18 = c1*z3 - c3*z7
//   v5  = c1*z5 - c3*z1    v6  = c1*z7 - c3*z5
// v7 and v16 are used as temporaries.
.macro idct_col4_top c0, c1, c2, c3, half, sfx
        smull\half      v7.4s,  \c2\sfx, z2
        smull\half      v16.4s, \c2\sfx, z6
        smull\half      v17.4s, \c1\sfx, z1
        add             v19.4s, v23.4s, v7.4s
        smull\half      v18.4s, \c1\sfx, z3
        add             v20.4s, v23.4s, v16.4s
        smull\half      v5.4s,  \c1\sfx, z5
        sub             v21.4s, v23.4s, v16.4s
        smull\half      v6.4s,  \c1\sfx, z7
        sub             v22.4s, v23.4s, v7.4s

        smlal\half      v17.4s, \c3\sfx, z3
        smlsl\half      v18.4s, \c3\sfx, z7
        smlsl\half      v5.4s,  \c3\sfx, z1
        smlsl\half      v6.4s,  \c3\sfx, z5
.endm

// Row pass over 64 bytes of input.
//   r0..r3: vector register names (no arrangement) that receive the 64 bytes
//           loaded from [x2]; x2 is post-incremented by 64 in total
//   lbl:    numeric local label placed at the point where the shortcut path
//           rejoins
// The low four halfwords of every register are always processed. The terms
// that come from the high four halfwords are skipped when the OR of all four
// high halves is zero.
// The eight 32-bit sums and differences are narrowed with >> ROW_SHIFT and
// left in r0..r3 after a 4x4 halfword transpose inside each 64-bit half.
// NOTE(review): which coefficient of the 8x8 block lands in which lane
// depends on the layout of the input, presumably chosen by the C side -
// confirm there.
// Clobbers: x3, v5-v7, v16-v23, condition flags.
.macro idct_row4_neon r0, r1, r2, r3, lbl
        ld1             {\r0\().2d,\r1\().2d}, [x2], #32
        // rounding bias: (1 << 2) << 8 = 1 << (ROW_SHIFT - 1) in every lane
        movi            v23.4s, #1<<2, lsl #8
        orr             v5.16b, \r0\().16b, \r1\().16b
        ld1             {\r2\().2d,\r3\().2d}, [x2], #32
        orr             v6.16b, \r2\().16b, \r3\().16b
        orr             v5.16b, v5.16b, v6.16b
        // x3 = OR of the high halves; read out before v5 is reused below
        mov             x3, v5.d[1]
        smlal           v23.4s, \r0\().4h, z4

        idct_col4_top   \r0, \r1, \r2, \r3, 1, .4h

        cmp             x3, #0
        b.eq            \lbl\()f

        // contributions of the high four halfwords of each input register
        smull2          v7.4s,  \r0\().8h, z4
        smlal2          v17.4s, \r1\().8h, z5
        smlsl2          v18.4s, \r1\().8h, z1
        smull2          v16.4s, \r2\().8h, z2
        smlal2          v5.4s,  \r1\().8h, z7
        add             v19.4s, v19.4s, v7.4s
        sub             v20.4s, v20.4s, v7.4s
        sub             v21.4s, v21.4s, v7.4s
        add             v22.4s, v22.4s, v7.4s
        smlal2          v6.4s,  \r1\().8h, z3
        smull2          v7.4s,  \r2\().8h, z6
        smlal2          v17.4s, \r3\().8h, z7
        smlsl2          v18.4s, \r3\().8h, z5
        smlal2          v5.4s,  \r3\().8h, z3
        smlsl2          v6.4s,  \r3\().8h, z1
        add             v19.4s, v19.4s, v7.4s
        sub             v20.4s, v20.4s, v16.4s
        add             v21.4s, v21.4s, v16.4s
        sub             v22.4s, v22.4s, v7.4s

\lbl:
        // sums go to the low halves of r0..r3 ...
        add             \r2\().4s, v19.4s, v17.4s
        add             \r3\().4s, v20.4s, v18.4s
        shrn            \r0\().4h, \r2\().4s, #ROW_SHIFT
        shrn            \r1\().4h, \r3\().4s, #ROW_SHIFT
        add             v7.4s,  v21.4s, v5.4s
        add             v16.4s, v22.4s, v6.4s
        shrn            \r2\().4h, v7.4s,  #ROW_SHIFT
        shrn            \r3\().4h, v16.4s, #ROW_SHIFT
        // ... and the differences, in reverse order, to the high halves
        sub             v22.4s, v22.4s, v6.4s
        sub             v19.4s, v19.4s, v17.4s
        sub             v21.4s, v21.4s, v5.4s
        shrn2           \r0\().8h, v22.4s, #ROW_SHIFT
        sub             v20.4s, v20.4s, v18.4s
        shrn2           \r1\().8h, v21.4s, #ROW_SHIFT
        shrn2           \r2\().8h, v20.4s, #ROW_SHIFT
        shrn2           \r3\().8h, v19.4s, #ROW_SHIFT

        // 4x4 halfword transpose within each 64-bit half of r0..r3
        trn1            v16.8h, \r0\().8h, \r1\().8h
        trn2            v17.8h, \r0\().8h, \r1\().8h
        trn1            v18.8h, \r2\().8h, \r3\().8h
        trn2            v19.8h, \r2\().8h, \r3\().8h
        trn1            \r0\().4s, v16.4s, v18.4s
        trn1            \r1\().4s, v17.4s, v19.4s
        trn2            \r2\().4s, v16.4s, v18.4s
        trn2            \r3\().4s, v17.4s, v19.4s
.endm

// Emits the internal function idct_col4_neon1 or idct_col4_neon2: the column
// pass for four columns taken from v24-v31.
//   half: 1 processes the low four halfwords of v24-v31, 2 the high four
//   sfx:  matching arrangement suffix for the inputs (.4h for 1, .8h for 2)
// Each of v28, v29, v30 and v31 contributes only if the selected 64-bit half
// is non-zero.
// Output (high 16 bits of every 32-bit sum or difference):
//   v7  = { v19+v17, v20+v18 }     v16 = { v21+v5,  v22+v6  }
//   v17 = { v22-v6,  v21-v5  }     v18 = { v20-v18, v19-v17 }
// Clobbers: x4, x5, v5, v6, v19-v23, condition flags. Returns through x30.
.macro declare_idct_col4_neon half, sfx
function idct_col4_neon\half
        // v23 = (input + z4c) * z4, the rounding term folded into the multiply
        dup             v23.4h, z4c
.if \half == 1
        add             v23.4h, v23.4h, v24.4h
.else
        mov             v5.d[0], v24.d[1]
        add             v23.4h, v23.4h, v5.4h
.endif
        smull           v23.4s, v23.4h, z4

        idct_col4_top   v24, v25, v26, v27, \half, \sfx

        mov             x4, v28.d[\half - 1]
        mov             x5, v29.d[\half - 1]
        cmp             x4, #0
        b.eq            1f

        // v28 term
        smull\half      v7.4s,  v28\sfx, z4
        add             v19.4s, v19.4s, v7.4s
        sub             v20.4s, v20.4s, v7.4s
        sub             v21.4s, v21.4s, v7.4s
        add             v22.4s, v22.4s, v7.4s

1:
        mov             x4, v30.d[\half - 1]
        cmp             x5, #0
        b.eq            2f

        // v29 term
        smlal\half      v17.4s, v29\sfx, z5
        smlsl\half      v18.4s, v29\sfx, z1
        smlal\half      v5.4s,  v29\sfx, z7
        smlal\half      v6.4s,  v29\sfx, z3

2:
        mov             x5, v31.d[\half - 1]
        cmp             x4, #0
        b.eq            3f

        // v30 term
        smull\half      v7.4s,  v30\sfx, z6
        smull\half      v16.4s, v30\sfx, z2
        add             v19.4s, v19.4s, v7.4s
        sub             v22.4s, v22.4s, v7.4s
        sub             v20.4s, v20.4s, v16.4s
        add             v21.4s, v21.4s, v16.4s

3:
        cmp             x5, #0
        b.eq            4f

        // v31 term
        smlal\half      v17.4s, v31\sfx, z7
        smlsl\half      v18.4s, v31\sfx, z5
        smlal\half      v5.4s,  v31\sfx, z3
        smlsl\half      v6.4s,  v31\sfx, z1

4:
        // keep the high halfword of each result; callers apply the remaining
        // COL_SHIFT-16 bits of shift themselves
        addhn           v7.4h,  v19.4s, v17.4s
        addhn2          v7.8h,  v20.4s, v18.4s
        subhn           v18.4h, v20.4s, v18.4s
        subhn2          v18.8h, v19.4s, v17.4s

        addhn           v16.4h, v21.4s, v5.4s
        addhn2          v16.8h, v22.4s, v6.4s
        subhn           v17.4h, v22.4s, v6.4s
        subhn2          v17.8h, v21.4s, v5.4s

        ret
endfunc
.endm

declare_idct_col4_neon 1, .4h
declare_idct_col4_neon 2, .8h

// Narrows the four column-pass result vectors (v7, v16, v17, v18) to
// unsigned bytes with saturation, after a right shift by COL_SHIFT-16.
//   lo: receives v7 (low 8 bytes) and v16 (high 8 bytes)
//   hi: receives v17 (low 8 bytes) and v18 (high 8 bytes)
.macro idct_col_narrow lo, hi
        sqshrun         \lo\().8b,  v7.8h,  #COL_SHIFT-16
        sqshrun2        \lo\().16b, v16.8h, #COL_SHIFT-16
        sqshrun         \hi\().8b,  v17.8h, #COL_SHIFT-16
        sqshrun2        \hi\().16b, v18.8h, #COL_SHIFT-16
.endm

// Arithmetic right shift by COL_SHIFT-16 of the four column-pass result
// vectors (v7, v16, v17, v18) into d0..d3, in that order.
.macro idct_col_shift d0, d1, d2, d3
        sshr            \d0\().8h, v7.8h,  #COL_SHIFT-16
        sshr            \d1\().8h, v16.8h, #COL_SHIFT-16
        sshr            \d2\().8h, v17.8h, #COL_SHIFT-16
        sshr            \d3\().8h, v18.8h, #COL_SHIFT-16
.endm

// IDCT of the 128 bytes of 16-bit coefficients at x2, written as 8 lines of
// 8 saturated unsigned bytes.
//   x0: destination, advanced by x1 after every 8-byte line
//   x1: distance in bytes between destination lines
//   x2: coefficient block, read by the row pass (left pointing past it)
// NOTE(review): presumably declared in C as
// void ff_simple_idct_put_neon(uint8_t *dest, ptrdiff_t line_size,
// int16_t *data) - confirm against the prototype.
function ff_simple_idct_put_neon, export=1
        idct_start      x2

        idct_row4_neon  v24, v25, v26, v27, 1
        idct_row4_neon  v28, v29, v30, v31, 2

        // first four columns -> v1 (lines 0-3), v3 (lines 4-7)
        bl              idct_col4_neon1
        idct_col_narrow v1, v3

        // last four columns -> v2 (lines 0-3), v4 (lines 4-7)
        bl              idct_col4_neon2
        idct_col_narrow v2, v4

        // pair the 4-byte groups of both halves into complete 8-byte lines
        zip1            v16.4s, v1.4s, v2.4s
        zip2            v17.4s, v1.4s, v2.4s

        st1             {v16.d}[0], [x0], x1
        st1             {v16.d}[1], [x0], x1

        zip1            v18.4s, v3.4s, v4.4s
        zip2            v19.4s, v3.4s, v4.4s

        st1             {v17.d}[0], [x0], x1
        st1             {v17.d}[1], [x0], x1
        st1             {v18.d}[0], [x0], x1
        st1             {v18.d}[1], [x0], x1
        st1             {v19.d}[0], [x0], x1
        st1             {v19.d}[1], [x0], x1

        idct_end
endfunc

// IDCT of the 128 bytes of 16-bit coefficients at x2, added to the 8 lines of
// 8 unsigned bytes at x0 and stored back with unsigned saturation.
//   x0: destination, read first and then rewritten through the copy in x9
//   x1: distance in bytes between destination lines
//   x2: coefficient block, read by the row pass (left pointing past it)
// All eight destination lines are loaded before the first one is stored.
// NOTE(review): presumably declared in C as
// void ff_simple_idct_add_neon(uint8_t *dest, ptrdiff_t line_size,
// int16_t *data) - confirm against the prototype.
function ff_simple_idct_add_neon, export=1
        idct_start      x2

        idct_row4_neon  v24, v25, v26, v27, 1
        idct_row4_neon  v28, v29, v30, v31, 2

        // first four columns, kept in v1-v4 across the second call
        bl              idct_col4_neon1
        idct_col_shift  v1, v2, v3, v4

        // last four columns, shifted in place
        bl              idct_col4_neon2
        idct_col_shift  v7, v16, v17, v18

        // x9 = write pointer; x0 keeps walking the lines as read pointer.
        // Loads of the existing pixels alternate with the zips that build
        // full 8-halfword lines in v23-v30 (presumably to hide load latency).
        mov             x9, x0
        ld1             {v19.d}[0], [x0], x1
        zip1            v23.2d, v1.2d, v7.2d
        zip2            v24.2d, v1.2d, v7.2d
        ld1             {v19.d}[1], [x0], x1
        zip1            v25.2d, v2.2d, v16.2d
        zip2            v26.2d, v2.2d, v16.2d
        ld1             {v20.d}[0], [x0], x1
        zip1            v27.2d, v3.2d, v17.2d
        zip2            v28.2d, v3.2d, v17.2d
        ld1             {v20.d}[1], [x0], x1
        zip1            v29.2d, v4.2d, v18.2d
        zip2            v30.2d, v4.2d, v18.2d
        ld1             {v21.d}[0], [x0], x1
        // lines 0 and 1: widen-add the pixels, then saturate back to bytes
        uaddw           v23.8h, v23.8h, v19.8b
        uaddw2          v24.8h, v24.8h, v19.16b
        ld1             {v21.d}[1], [x0], x1
        sqxtun          v23.8b,  v23.8h
        sqxtun2         v23.16b, v24.8h
        ld1             {v22.d}[0], [x0], x1
        // lines 2 and 3
        uaddw           v24.8h, v25.8h, v20.8b
        uaddw2          v25.8h, v26.8h, v20.16b
        ld1             {v22.d}[1], [x0], x1
        sqxtun          v24.8b,  v24.8h
        sqxtun2         v24.16b, v25.8h
        st1             {v23.d}[0], [x9], x1
        // lines 4 and 5
        uaddw           v25.8h, v27.8h, v21.8b
        uaddw2          v26.8h, v28.8h, v21.16b
        st1             {v23.d}[1], [x9], x1
        sqxtun          v25.8b,  v25.8h
        sqxtun2         v25.16b, v26.8h
        st1             {v24.d}[0], [x9], x1
        // lines 6 and 7
        uaddw           v26.8h, v29.8h, v22.8b
        uaddw2          v27.8h, v30.8h, v22.16b
        st1             {v24.d}[1], [x9], x1
        sqxtun          v26.8b,  v26.8h
        sqxtun2         v26.16b, v27.8h
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x9], x1
        st1             {v26.d}[0], [x9], x1
        st1             {v26.d}[1], [x9], x1

        idct_end
endfunc

// In-place IDCT of the 128 bytes of 16-bit coefficients at x0: the result is
// written back over the input as 16-bit values.
//   x0: coefficient block (copied to x2, which does the actual addressing)
// NOTE(review): presumably declared in C as
// void ff_simple_idct_neon(int16_t *data) - confirm against the prototype.
function ff_simple_idct_neon, export=1
        idct_start      x0

        mov             x2, x0
        idct_row4_neon  v24, v25, v26, v27, 1
        idct_row4_neon  v28, v29, v30, v31, 2
        // the row pass advanced x2 by 2 * 64 bytes; rewind it for the stores
        sub             x2, x2, #128

        // first four columns, kept in v1-v4 across the second call
        bl              idct_col4_neon1
        idct_col_shift  v1, v2, v3, v4

        // last four columns, shifted in place
        bl              idct_col4_neon2
        idct_col_shift  v7, v16, v17, v18

        // join both column halves into 8-halfword lines, stored two at a time
        zip1            v23.2d, v1.2d, v7.2d
        zip2            v24.2d, v1.2d, v7.2d
        st1             {v23.2d,v24.2d}, [x2], #32
        zip1            v25.2d, v2.2d, v16.2d
        zip2            v26.2d, v2.2d, v16.2d
        st1             {v25.2d,v26.2d}, [x2], #32
        zip1            v27.2d, v3.2d, v17.2d
        zip2            v28.2d, v3.2d, v17.2d
        st1             {v27.2d,v28.2d}, [x2], #32
        zip1            v29.2d, v4.2d, v18.2d
        zip2            v30.2d, v4.2d, v18.2d
        st1             {v29.2d,v30.2d}, [x2], #32

        idct_end
endfunc